# Project Analytics Notebook

This notebook performs exploratory data analysis, visualization, and predictive modeling on a synthetic project analytics dataset. The goal is to uncover insights and build models that predict project success based on features such as budget, team size, tasks completed, risk level, and timelines.

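The dataset is stored in `data/project_data.csv`; the loading cell below reads it through the relative path `../data/project_data.csv`, so the notebook must be run from a directory one level below the folder that contains `data/`.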

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Settings for plots
# Apply seaborn's 'whitegrid' style to the figures drawn in the cells below.
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Data

In [None]:
# Location of the CSV, relative to the directory the notebook is run from
file_path = '../data/project_data.csv'

# Load the raw project records into the working DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a sanity check on the load
df.head(5)

## Basic Statistics and Data Info

In [None]:
# General information: column dtypes, non-null counts and memory usage.
# DataFrame.info() prints its report and returns None, so it is called only
# for its printed output; binding the result to a name would just store None.
df.info()

# Summary statistics for every column (numeric and non-numeric alike)
summary_stats = df.describe(include='all')

# Leave the frame as the cell's last expression so it renders as a rich table
summary_stats

## Data Preprocessing

In [None]:
# Parse the three date columns into datetime64 in a single pass
date_columns = ['start_date', 'planned_end_date', 'actual_end_date']
df[date_columns] = df[date_columns].apply(pd.to_datetime)

# Duration features, in whole days counted from the project start date
project_start = df['start_date']
df['planned_duration_days'] = df['planned_end_date'].sub(project_start).dt.days
df['actual_duration_days'] = df['actual_end_date'].sub(project_start).dt.days

# Ordinal encoding of the risk label; labels missing from the mapping become NaN
risk_mapping = {
    'Low': 0,
    'Medium': 1,
    'High': 2,
}
df['risk_level_encoded'] = df['risk_level'].map(risk_mapping)

# Per-column count of missing values after the transformations above
missing_values = df.isna().sum()

missing_values

## Exploratory Visualizations

In [None]:
# Side-by-side histograms of planned vs. actual project duration
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# One entry per panel: (column, title, x-axis label, bar colour)
duration_panels = [
    ('planned_duration_days', 'Distribution of Planned Duration (days)',
     'Planned Duration (days)', 'skyblue'),
    ('actual_duration_days', 'Distribution of Actual Duration (days)',
     'Actual Duration (days)', 'salmon'),
]

for panel, (column, panel_title, x_label, bar_color) in zip(ax, duration_panels):
    sns.histplot(df[column], bins=30, ax=panel, color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)

plt.tight_layout()
plt.show()

In [None]:
# Budget against completed tasks, with points coloured by risk level
plt.figure(figsize=(8, 5))
scatter_ax = sns.scatterplot(
    data=df,
    x='budget',
    y='tasks_completed',
    hue='risk_level',
)

# Label through the Axes that seaborn drew on (the current axes of the figure)
scatter_ax.set_title('Budget vs Tasks Completed by Risk Level')
scatter_ax.set_xlabel('Budget (USD)')
scatter_ax.set_ylabel('Tasks Completed')
plt.show()

In [None]:
# Success rate by risk level
plt.figure(figsize=(6, 4))
success_rate = df.groupby('risk_level')['success'].mean().reset_index()

# groupby sorts the labels alphabetically (High, Low, Medium). Plot them in
# their natural ordinal order instead, and append any label outside the known
# set so that no bar is silently dropped.
known_risk_order = ['Low', 'Medium', 'High']
present_levels = success_rate['risk_level'].tolist()
risk_order = [level for level in known_risk_order if level in present_levels]
risk_order += [level for level in present_levels if level not in known_risk_order]

# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to `hue`. dodge=False keeps one full-width bar
# per level.
bar_ax = sns.barplot(
    x='risk_level',
    y='success',
    hue='risk_level',
    data=success_rate,
    order=risk_order,
    hue_order=risk_order,
    palette='viridis',
    dodge=False,
)

# The hue legend would only repeat the x tick labels; drop it when one is drawn.
# (Done by hand rather than with legend=False, which older seaborn rejects.)
bar_legend = bar_ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()

plt.title('Average Success Rate by Risk Level')
plt.xlabel('Risk Level')
plt.ylabel('Average Success Rate')
plt.show()

In [None]:
# Columns included in the pairwise correlation matrix
num_cols = [
    'team_size',
    'budget',
    'tasks_completed',
    'planned_duration_days',
    'actual_duration_days',
    'client_satisfaction',
    'delayed',
    'success',
    'risk_level_encoded',
]
corr = df[num_cols].corr()

# Annotated heatmap of the correlation coefficients (two decimals per cell)
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(data=corr, annot=True, fmt='.2f', cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

## Predictive Modeling

In [None]:
# Features and target
target = 'success'
# NOTE(review): 'actual_duration_days', 'delayed' and 'client_satisfaction'
# look like information only known after a project finishes. Confirm they are
# available at prediction time, otherwise they leak the outcome into the model.
features = ['team_size', 'budget', 'tasks_completed', 'planned_duration_days', 'actual_duration_days', 'client_satisfaction', 'risk_level_encoded', 'delayed']

# LogisticRegression rejects NaN inputs, so keep only rows where every feature
# and the label are present. With complete data this is a no-op.
model_df = df.dropna(subset=features + [target])
n_dropped = len(df) - len(model_df)
if n_dropped:
    print('Rows dropped before modeling due to missing feature/target values:', n_dropped)

X = model_df[features]
y = model_df[target]

# Train-test split, stratified on the label so that the train and test sets
# keep the same class proportions (this needs at least two rows per class).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression (max_iter raised above the default to give the solver
# more iterations to converge)
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

# Hold-out accuracy and confusion matrix
acc_lr = accuracy_score(y_test, y_pred_lr)
cm_lr = confusion_matrix(y_test, y_pred_lr)

acc_lr, cm_lr

In [None]:
# Random forest of 200 trees with a fixed seed; fit() returns the estimator
# itself, so construction and training are chained.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the held-out split
y_pred_rf = rf_clf.predict(X_test)

# Hold-out accuracy and confusion matrix
acc_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
cm_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

acc_rf, cm_rf

In [None]:
# Show evaluation reports for both models
# (per-class precision, recall and F1 on the held-out test split)
report_lr = classification_report(y_test, y_pred_lr)
report_rf = classification_report(y_test, y_pred_rf)

# The heading and the report are separated with an explicit '\n' escape:
# a single-quoted string cannot span physical lines, and the original
# line-broken literals were a SyntaxError.
print('Logistic Regression Classification Report:\n', report_lr)
print('Random Forest Classification Report:\n', report_rf)