
# Project Performance Analysis

This Jupyter notebook performs exploratory data analysis (EDA) and predictive modeling on a synthetic project management dataset. The goal is to simulate real-world business and program management scenarios in which we want to understand the factors that contribute to project success.

## Dataset Overview

The dataset (`project_data.csv`) contains information on **300** hypothetical projects managed by a company. Each project has features such as:

- **start_date** and **end_date**: when the project began and ended.
- **duration_days**: length of the project in days.
- **budget_k_usd**: approximate budget in thousands of USD.
- **team_size**: number of team members.
- **complexity**: a score from 1 (simple) to 10 (highly complex).
- **category**: project category (e.g., Digital Transformation, Infrastructure, Human Resources, Marketing, Research).
- **region**: geographical region of the project.
- **manager_experience_years**: years of experience of the project manager.
- **risk_score**: a risk rating between 0 and 100.
- **success**: target variable (1 if the project was successful, 0 otherwise).

We will analyze the relationships between these variables, visualize distributions, and build a logistic regression model to predict project success.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Apply the seaborn theme used by every figure in this notebook.
# set_theme() is the preferred spelling; sns.set() is only a legacy alias for it.
sns.set_theme(style='whitegrid', context='notebook')

# Load dataset; the two date columns are parsed into datetime values on read
file_path = 'data/project_data.csv'
df = pd.read_csv(file_path, parse_dates=['start_date', 'end_date'])

# Display first five rows
df.head()


In [None]:

# Basic info and summary statistics
print("Dataset shape:", df.shape)
# The leading "\n" escape prints a blank line between the shape and the heading
print("\nSummary statistics for numeric variables:")
# Last expression of the cell, so the summary table is rendered as rich output
df.describe()


In [None]:

# Histograms (30 bins, KDE overlay) of project duration and of budgets.
# Each entry is (column, figure title, x-axis label); one figure is drawn per entry.
histogram_specs = [
    ('duration_days', 'Distribution of Project Duration (days)', 'Duration (days)'),
    ('budget_k_usd', 'Distribution of Budgets (k USD)', 'Budget (k USD)'),
]

for column, title, x_label in histogram_specs:
    plt.figure(figsize=(6, 4))
    sns.histplot(df[column], bins=30, kde=True)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.show()


In [None]:

# Scatter plot: budget against duration, one point per project, colored by outcome
plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=df,
    x='duration_days',
    y='budget_k_usd',
    hue='success',
    palette='viridis',
)
plt.title('Budget vs Duration by Success')
plt.xlabel('Duration (days)')
plt.ylabel('Budget (k USD)')
plt.legend(title='Success')
plt.show()

# Bar plot: mean of the 0/1 success flag per category, sorted ascending
plt.figure(figsize=(6, 4))
success_rate_by_cat = (
    df.groupby('category')['success']
    .mean()
    .sort_values()
)
success_rate_by_cat.plot.barh(color='skyblue')
plt.title('Success Rate by Category')
plt.xlabel('Average Success Rate')
plt.ylabel('Category')
plt.show()


In [None]:

# Pairwise correlations between the numeric columns (target included),
# rendered as an annotated heatmap
numeric_cols = [
    'duration_days',
    'budget_k_usd',
    'team_size',
    'complexity',
    'manager_experience_years',
    'risk_score',
    'success',
]
corr = df[numeric_cols].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f', square=True)
plt.title('Correlation Matrix')
plt.show()


In [None]:

# Prepare data for logistic regression
# Drop project_id, the raw date columns, and the target from the feature matrix,
# then one-hot encode the categorical variables (drop_first=True leaves out the
# first level of each so the dummy columns are not redundant)
X = df.drop(columns=['project_id', 'start_date', 'end_date', 'success'])
X = pd.get_dummies(X, columns=['category', 'region'], drop_first=True)
y = df['success']

# Train-test split: 30% held out for testing, seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and fit the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test)

# Evaluate model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")
# The report is printed on its own: passing it as a second print() argument
# would put a separator space before its first line and misalign the header row
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=log_reg.classes_)
cmd.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

# Optional: display feature coefficients, largest absolute value first.
# NOTE(review): the features are not standardized before fitting, so coefficient
# magnitudes depend on each feature's units; treat this ranking as indicative only.
coeff_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': log_reg.coef_[0]
}).sort_values(by='coefficient', key=np.abs, ascending=False)

coeff_df.head(10)
