
# Synthetic Project Management Dataset

This project demonstrates end-to-end data analysis on a synthetic project management dataset. 
The dataset contains information about fictitious projects, including team size, budget, complexity, client importance, duration in days, and a success label.
The goal is to explore the data, visualize relationships, and build predictive models to understand the factors impacting project success.

We will start with exploratory data analysis (EDA) using descriptive statistics and visualizations, and then progress to predictive modeling. 
We'll implement both regression and classification models to predict project duration and success, respectively.


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report, confusion_matrix

# Read the synthetic project records from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='/home/oai/share/synthetic_project_data.csv')

# Preview the top five rows (head's default) as a quick sanity check
df.head(n=5)


In [None]:

# Descriptive statistics for the loaded DataFrame; the bare name on the
# last line is what the notebook cell displays
summary_stats = df.describe()
summary_stats


In [None]:

# Histograms (with KDE overlays) for the numeric features
sns.set(style="whitegrid")
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

numeric_cols = ['team_size', 'budget', 'complexity', 'client_importance', 'duration_days']

# Flatten the 2x3 grid so panels are visited in row-major order,
# one panel per feature column
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, numeric_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')

# The grid has six panels but only five features are plotted; hide any
# leftover panel so the figure does not show an empty set of axes
for spare_ax in flat_axes[len(numeric_cols):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:

# Correlation heatmap of the numeric columns
plt.figure(figsize=(8, 6))
# numeric_only=True keeps this working on pandas >= 2.0, where corr() no
# longer silently drops non-numeric columns and raises instead.
# NOTE(review): the CSV schema is not visible here; if every column is
# numeric this is equivalent to the plain df.corr() call.
corr = df.corr(numeric_only=True)
# center=0 anchors the diverging colormap at zero correlation
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()


In [None]:

# Regression task: estimate duration_days from the other project attributes

# Predictor matrix and target vector
X_reg = df.loc[:, ['team_size', 'budget', 'complexity', 'client_importance']]
y_reg = df.loc[:, 'duration_days']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain
lin_reg = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = lin_reg.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One print call, one line per item
print(
    "Linear Regression Model Performance:",
    f"  Mean Absolute Error: {mae:.2f}",
    f"  R-squared: {r2:.2f}",
    sep="\n",
)


In [None]:

# Predicting project success (classification)

# Features and target for classification
X_clf = df[['team_size', 'budget', 'complexity', 'client_importance', 'duration_days']]
y_clf = df['success']

# Train-test split (20% held out; fixed seed for a reproducible split).
# Note: this rebinds X_train/X_test/y_train/y_test from the regression cell.
X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

# Initialize and train model; max_iter is raised above the library default
# to give the solver more iterations to converge
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on test data
y_pred = log_reg.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Logistic Regression Model Performance:")
print(f"  Accuracy: {acc:.2f}")
# The headings use "\n" escapes: a raw line break inside a single-quoted
# string literal is a SyntaxError
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", cm)


In [None]:

# Histogram of the model's predicted success probabilities

# Train on every row (no hold-out): this model is used only for the
# visualization below, not for evaluation
log_reg_full = LogisticRegression(max_iter=1000).fit(X_clf, y_clf)

# predict_proba yields one column per class; take column index 1
# (presumably the "success" class -- confirm against log_reg_full.classes_)
class_probs = log_reg_full.predict_proba(X_clf)
probabilities = class_probs[:, 1]

plt.figure(figsize=(8, 5))
# histplot draws on the current figure's axes and returns that Axes
prob_ax = sns.histplot(probabilities, bins=30, kde=True)
prob_ax.set_title('Distribution of Predicted Success Probabilities')
prob_ax.set_xlabel('Probability of Success')
prob_ax.set_ylabel('Frequency')
plt.show()
