
# Project Performance Analysis

This notebook explores a synthetic dataset of project performance metrics for various programs.
The dataset includes planned vs. actual durations and budgets, team sizes, stakeholder satisfaction, and a binary success outcome.
We'll perform exploratory data analysis and build predictive models, starting with a logistic regression baseline, to estimate the likelihood of project success.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Location of the synthetic project metrics CSV (relative path)
file_path = 'synthetic_project_data.csv'

# Parse the CSV into the DataFrame the later cells work from
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top rows as this cell's output
df.head(n=5)


In [None]:

# Columns that take part in the pairwise correlation table
corr_columns = [
    'planned_duration_days',
    'actual_duration_days',
    'planned_budget',
    'actual_cost',
    'team_size',
    'stakeholder_satisfaction',
    'success',
]

# Descriptive statistics for every column, regardless of dtype
summary = df.describe(include='all')

# Pairwise correlations (pandas' default method) between the selected columns
corr_matrix = df.loc[:, corr_columns].corr()

# Emit both tables as this cell's output
(summary, corr_matrix)


In [None]:

# Plot distributions of planned vs actual durations.
# Both series are drawn on the same bin edges: separate histplot calls would
# otherwise each pick their own bin width, making bar heights incomparable.
duration_columns = ['planned_duration_days', 'actual_duration_days']
duration_values = df[duration_columns].to_numpy(dtype=float).ravel()
# Drop NaN/inf before computing edges; histogram_bin_edges needs a finite range
duration_values = duration_values[np.isfinite(duration_values)]
shared_bins = np.histogram_bin_edges(duration_values, bins='auto')

duration_fig, duration_ax = plt.subplots(figsize=(10, 5))
# alpha=0.5 keeps the first layer visible underneath the second
sns.histplot(df['planned_duration_days'], bins=shared_bins, color='skyblue',
             label='Planned', kde=True, alpha=0.5, ax=duration_ax)
sns.histplot(df['actual_duration_days'], bins=shared_bins, color='salmon',
             label='Actual', kde=True, alpha=0.5, ax=duration_ax)
duration_ax.set_title('Distribution of Planned vs. Actual Duration (days)')
duration_ax.set_xlabel('Days')
duration_ax.legend()
plt.show()

# Planned budget against actual cost, one point per row, coloured by the
# `success` column
budget_fig, budget_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    x='planned_budget',
    y='actual_cost',
    hue='success',
    data=df,
    ax=budget_ax,
)
budget_ax.set_title('Planned Budget vs. Actual Cost')
budget_ax.set_xlabel('Planned Budget')
budget_ax.set_ylabel('Actual Cost')
plt.show()

# Correlation heatmap.
# The colour scale is pinned to [-1, 1] so the neutral midpoint of the
# diverging 'coolwarm' map corresponds to zero correlation; with the default
# data-driven limits the midpoint drifts to wherever the data range is centred.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',  # fixed two decimals so every cell annotation has the same width
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()


In [None]:

# Prepare features and target
# NOTE(review): actual_duration_days, actual_cost and stakeholder_satisfaction
# look like values only known once a project has finished -- confirm that using
# them as predictors is acceptable for the intended use of this model.
feature_columns = [
    'planned_duration_days',
    'actual_duration_days',
    'planned_budget',
    'actual_cost',
    'team_size',
    'stakeholder_satisfaction',
]
X = df[feature_columns]
a = X  # old name for the feature matrix, kept in case other cells still use it
y = df['success']

# Train-test split: 70/30, stratified so both parts keep the class balance of y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize features; the scaler is fitted on the training part only and then
# applied unchanged to the test part, so no test statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression model (library defaults)
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Predictions on the held-out part
y_pred = log_reg.predict(X_test_scaled)

# Evaluation
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
# Fix the label order to the model's classes so rows/columns can be named below
conf_matrix = confusion_matrix(y_test, y_pred, labels=log_reg.classes_)
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=[f'actual {label}' for label in log_reg.classes_],
    columns=[f'predicted {label}' for label in log_reg.classes_],
)

# `report` is a pre-formatted multi-line string: print it rather than showing it
# inside a tuple, where its repr collapses to one line with literal "\n" escapes
print(f'Accuracy: {acc:.3f}')
print(report)
conf_matrix_df
