
# Project Management Analytics

This notebook demonstrates an end-to-end data analysis workflow on a synthetic project management dataset. It covers data loading, basic data-quality checks (data types and missing values), exploratory data analysis (EDA), and predictive modeling. The goal is to showcase skills relevant to roles such as Business Analyst, Program Manager, and Data Analyst.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve

# Show every column when a frame is rendered, so wide tables are not truncated
pd.set_option('display.max_columns', None)

# Read the synthetic project dataset (path is relative to the notebook's working directory)
file_path = 'synthetic_project_data.csv'
df = pd.read_csv(file_path)

# Preview the first rows as the cell output
df.head()


In [None]:

# Data types and basic info
# Prints a per-column summary (non-null counts and dtypes) plus memory usage
df.info()


In [None]:

# Count missing entries per column (isna is the pandas alias of isnull)
missing = df.isna().sum()
missing



## Exploratory Data Analysis (EDA)

Let's explore the dataset through summary statistics and visualizations.


In [None]:

# Descriptive statistics
# include='all' summarises every column regardless of dtype; statistics that do
# not apply to a given column are reported as NaN
stats = df.describe(include='all')
stats


In [None]:

# Distribution of each numeric variable, one panel per column in a 2x3 grid
numeric_cols = [
    'Team_Size',
    'Duration_Months',
    'Budget',
    'Expenditure',
    'Risk_Rating',
    'Completion_Percent',
]

fig, axes = plt.subplots(2, 3, figsize=(12, 8))
for ax, col in zip(axes.flat, numeric_cols):
    # Histogram with a kernel density estimate overlaid
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
plt.show()


In [None]:

# Frequency of each level for the categorical / binary variables, in a 2x2 grid
categorical_cols = ['Project_Priority', 'Phase', 'On_Time', 'Success']

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, col in zip(axes.flat, categorical_cols):
    # NOTE(review): recent seaborn releases emit a FutureWarning when `palette`
    # is given without `hue`; confirm the installed version before switching to
    # hue=col / legend=False, since older releases do not accept that form.
    sns.countplot(x=df[col], palette='viridis', ax=ax)
    ax.set_title(f'Count of {col}')
fig.tight_layout()
plt.show()


In [None]:

# Pairwise correlations between the numeric variables and the two outcome flags
corr_cols = [
    'Team_Size',
    'Duration_Months',
    'Budget',
    'Expenditure',
    'Risk_Rating',
    'Completion_Percent',
    'On_Time',
    'Success',
]
corr_matrix = df[corr_cols].corr()

# Annotated heatmap; the diverging colormap separates positive from negative correlations
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()



## Predictive Modeling

We will build classification models to predict whether a project will be successful based on features such as budget, duration, risk rating, and completion percentage.


In [None]:

# Prepare features and target
# NOTE(review): On_Time and Completion_Percent may only be known late in a
# project's life — confirm they are available at prediction time, otherwise
# they leak outcome information into the model.
X = df[['Team_Size', 'Duration_Months', 'Budget', 'Expenditure', 'Risk_Rating', 'Completion_Percent', 'On_Time']]
y = df['Success']

# Train-test split: 30% held out, stratified on the target so both splits keep
# the same class balance; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Feature scaling is not strictly necessary for tree-based models, but beneficial for logistic regression.
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics influence training.
# (StandardScaler is imported with the other dependencies in the first code cell.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Fit logistic regression on the standardized training features
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Hard labels feed accuracy/precision/recall; column 1 of predict_proba
# (the second entry of log_reg.classes_) feeds ROC AUC
y_pred_lr = log_reg.predict(X_test_scaled)
y_proba_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

# Hold-out evaluation
acc_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test, y_proba_lr)

print(f"Logistic Regression Accuracy: {acc_lr:.2f}")
print(f"Precision: {precision_lr:.2f}")
print(f"Recall: {recall_lr:.2f}")
print(f"ROC AUC: {roc_auc_lr:.2f}")


In [None]:

# Fit a 100-tree random forest on the unscaled features (seeded for reproducibility)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Hard labels feed accuracy/precision/recall; column 1 of predict_proba
# (the second entry of rf_clf.classes_) feeds ROC AUC
y_pred_rf = rf_clf.predict(X_test)
y_proba_rf = rf_clf.predict_proba(X_test)[:, 1]

# Hold-out evaluation
acc_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_proba_rf)

print(f"Random Forest Accuracy: {acc_rf:.2f}")
print(f"Precision: {precision_rf:.2f}")
print(f"Recall: {recall_rf:.2f}")
print(f"ROC AUC: {roc_auc_rf:.2f}")


In [None]:

# Plot ROC curves for both models
# Scores are column 1 of predict_proba; logistic regression is scored on the
# standardized test features, the random forest on the unscaled ones, matching
# how each model was trained.
proba_lr = log_reg.predict_proba(X_test_scaled)[:, 1]
proba_rf = rf_clf.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(y_test, proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, proba_rf)

fig, ax = plt.subplots(figsize=(8, 6))
# Legend labels reuse the AUC values computed in the model cells; the closing
# parenthesis after the AUC value was missing in both labels.
ax.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {roc_auc_lr:.2f})')
ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
# Diagonal reference line: the ROC curve of a random classifier
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend()
plt.show()
