
# Project Management Success Analysis

This notebook performs an exploratory and predictive analysis of a synthetic project management dataset.
The goal is to understand which factors influence cost overruns and on-time completion, and to build a logistic regression model that predicts whether a project will be completed on time (`On_Time_Completion`).

Dataset features:
- `Project_ID`: Unique identifier for each project
- `Complexity`: Project complexity (`Low`, `Medium`, `High`)
- `Team_Size`: Number of people in the project team
- `Budget_K`: Project budget in thousands of dollars
- `Duration_Months`: Planned project duration in months
- `Risk_Count`: Number of identified risks for the project
- `Stakeholder_Engagement_Score`: Engagement score from 1 (low) to 10 (high)
- `Methodology`: Project management methodology (`Agile`, `Waterfall`, `Hybrid`)
- `Cost_Overrun_Pct`: Percentage cost overrun at project completion
- `On_Time_Completion`: 1 if project completed on time, 0 otherwise


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the project management CSV (path is relative to the working directory)
file_path = "project_management_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell's output
df.head(5)


In [None]:

# Basic info: column names, dtypes and non-null counts.
# df.info() writes its report directly to stdout, so no print() is needed.
df.info()

# Summary statistics for the numerical columns.
# A notebook only renders the value of a cell's *last* expression, so a bare
# `df.describe()` in the middle of the cell would be silently discarded;
# print it explicitly so the statistics actually appear in the output.
print(df.describe())

# Value counts for the categorical features
print(df['Complexity'].value_counts())
print(df['Methodology'].value_counts())



In [None]:

# Distribution of cost overruns
plt.figure(figsize=(8,4))
sns.histplot(df['Cost_Overrun_Pct'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Cost Overrun (%)')
plt.xlabel('Cost Overrun (%)')
plt.ylabel('Frequency')
plt.show()

# On-time completion by complexity.
# On_Time_Completion is a 0/1 flag, so the bar height (the mean) is the
# on-time completion rate for each complexity level.
# The bar order is fixed explicitly so the x-axis reads Low -> Medium -> High
# instead of whatever order the levels happen to appear in the data.
complexity_order = ['Low', 'Medium', 'High']
plt.figure(figsize=(6,4))
# NOTE(review): newer seaborn releases deprecate `ci=` in favour of
# `errorbar=`; switch once the minimum supported seaborn version is confirmed.
sns.barplot(x='Complexity', y='On_Time_Completion', data=df,
            order=complexity_order, ci=None, palette='viridis')
plt.title('On-Time Completion Rate by Complexity')
plt.xlabel('Complexity')
plt.ylabel('On-Time Completion Rate')
plt.ylim(0,1)
plt.show()

# Correlation heatmap for numerical variables.
# A dedicated name is used for this column list because the modeling cell
# defines its own `numeric_cols` (without Cost_Overrun_Pct); sharing the name
# would make the list's contents depend on cell execution order.
corr_cols = ['Team_Size','Budget_K','Duration_Months','Risk_Count','Stakeholder_Engagement_Score','Cost_Overrun_Pct']
corr = df[corr_cols].corr()
plt.figure(figsize=(8,6))
# Correlations live in [-1, 1]: pin the colour scale to that range and centre
# the diverging colormap at 0 so that colour intensity is comparable between
# positive and negative correlations.
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap (Numeric Features)')
plt.show()


In [None]:

# Prepare data for modeling.
# Project_ID is an identifier, On_Time_Completion is the target, and
# Cost_Overrun_Pct is measured at project completion, so none of them are
# used as predictors.
X = df.drop(['Project_ID','Cost_Overrun_Pct','On_Time_Completion'], axis=1)
y = df['On_Time_Completion']

# Identify categorical and numeric columns
categorical_cols = ['Complexity','Methodology']
numeric_cols = ['Team_Size','Budget_K','Duration_Months','Risk_Count','Stakeholder_Engagement_Score']

# Preprocess features:
# - one-hot encode the categorical columns (first level dropped to avoid a
#   redundant, perfectly collinear dummy column);
# - standardise the numeric columns. LogisticRegression applies L2
#   regularisation by default, which penalises coefficients according to the
#   feature's scale, so leaving e.g. Budget_K (thousands of dollars) next to a
#   1-10 score unscaled distorts the fit and slows solver convergence.
# Because the scaler lives inside the pipeline it is fitted on the training
# split only, so no test-set statistics leak into training.
preprocess = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(drop='first'), categorical_cols),
        ('numeric', StandardScaler(), numeric_cols)
    ]
)

# Split data: 80/20, stratified on the target so both splits keep the same
# on-time / late proportions; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Build model pipeline (preprocessing + classifier fitted as a single unit)
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train model
model.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
# The newlines must be written as \n escapes: a single-quoted string literal
# cannot span several physical lines (that is a SyntaxError).
print('\nClassification Report:\n', classification_report(y_test, y_pred))
