# Project Success Prediction Analysis

This notebook demonstrates exploratory data analysis and predictive modelling on a synthetic project dataset. Each row describes a hypothetical project through four numeric features (team size, budget in thousands of USD, duration in months, and complexity) together with a binary success label. We first explore how these variables are distributed and how they relate to one another, then train and compare two classifiers: logistic regression and a random forest.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Read the synthetic project dataset; the path is relative to the
# notebook's working directory.
file_path = 'synthetic_project_dataset.csv'
data = pd.read_csv(file_path)

# Preview the first rows as a quick sanity check that the load succeeded.
preview = data.head()
print(preview)


## Summary Statistics

In [None]:

# Report the dataset dimensions as a (rows, columns) tuple.
print("Data shape:", data.shape)

# The "\n" escape emits a blank line before the heading. A raw line break
# inside a single-quoted string literal is a SyntaxError, so the newline
# must be written as an escape sequence.
print("\nSummary statistics:")
# Per-column descriptive statistics produced by DataFrame.describe().
print(data.describe())


## Exploratory Data Visualization

In [None]:

# Apply seaborn's white-grid styling to the figures drawn below
sns.set(style='whitegrid')

# --- Feature distributions ---------------------------------------------
# One histogram (with a KDE curve) per numeric feature on a 2x2 grid.
numeric_cols = ['Team_Size', 'Budget_kUSD', 'Duration_Months', 'Complexity']
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# axes.flat walks the grid row by row, so panels fill left-to-right,
# top-to-bottom in the order the columns are listed.
for ax, col in zip(axes.flat, numeric_cols):
    sns.histplot(data[col], kde=True, ax=ax, color='skyblue')
    ax.set_title(f"Distribution of {col}")

plt.tight_layout()
plt.show()

# --- Budget vs. duration -----------------------------------------------
# Each point is one project, coloured by its Success label.
plt.figure(figsize=(7,5))
scatter_ax = sns.scatterplot(
    data=data,
    x='Budget_kUSD',
    y='Duration_Months',
    hue='Success',
    palette='viridis',
)
scatter_ax.set_title('Budget vs Duration by Success')
plt.show()

# --- Correlation heatmap -----------------------------------------------
# Pairwise correlations between the four features and the Success label.
corr_cols = ['Team_Size','Budget_kUSD','Duration_Months','Complexity','Success']
corr = data[corr_cols].corr()
plt.figure(figsize=(6,5))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='Blues')
heatmap_ax.set_title('Correlation Matrix')
plt.show()


## Predictive Modelling

In [None]:

# Feature matrix (four numeric columns) and binary target
feature_cols = ['Team_Size','Budget_kUSD','Duration_Months','Complexity']
X = data[feature_cols]
y = data['Success']

# Hold out 30% of the rows for evaluation. stratify=y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Logistic Regression model
# max_iter=1000 gives the solver a generous iteration budget to converge.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
# Class predictions on the held-out test rows.
y_pred_lr = log_reg.predict(X_test)

# The trailing "\n" escape leaves a blank line between the heading and the
# report. A raw line break inside a single-quoted string literal is a
# SyntaxError, so the newline must be written as an escape sequence.
print("Logistic Regression Performance:\n")
print(classification_report(y_test, y_pred_lr))

# Random Forest model
# 200 trees; random_state fixes the seed so the fitted forest is reproducible.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
# Class predictions on the held-out test rows.
y_pred_rf = rf_model.predict(X_test)

# The trailing "\n" escape leaves a blank line between the heading and the
# report. A raw line break inside a single-quoted string literal is a
# SyntaxError, so the newline must be written as an escape sequence.
print("Random Forest Performance:\n")
print(classification_report(y_test, y_pred_rf))

# Confusion matrices, drawn side by side for the two classifiers.
# confusion_matrix() places true labels on the rows and predicted labels on
# the columns, so the heatmap axes are labelled accordingly; without labels
# the reader cannot tell which axis is which.
fig, axes = plt.subplots(1, 2, figsize=(10,4))
cm_panels = [
    ('Logistic Regression Confusion Matrix', y_pred_lr, 'Reds'),
    ('Random Forest Confusion Matrix', y_pred_rf, 'Greens'),
]
for cm_ax, (cm_title, cm_preds, cm_cmap) in zip(axes, cm_panels):
    sns.heatmap(
        confusion_matrix(y_test, cm_preds),
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap=cm_cmap,
        ax=cm_ax,
    )
    cm_ax.set_title(cm_title)
    cm_ax.set_xlabel('Predicted label')
    cm_ax.set_ylabel('True label')

# Keep the added axis labels from colliding with the neighbouring panel.
plt.tight_layout()
plt.show()
