<a href="https://colab.research.google.com/github/sridhartroy/AIML/blob/main/MyPMFriend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MyPMFriend

In [2]:
import sys
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, precision_recall_curve, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.utils import shuffle

RANDOM_STATE = 6579
np.random.seed(RANDOM_STATE)


## Step 1 : Connect to JIRA. But for now use synthetic data with non-linear data for key project management fields.


In [4]:
# creating a randomized non-linear synthetic data for typical program management variables.

def sigmoid(z):
  return 1/(1+np.exp(-z))

def synthetic_pm_data(n=5000, random_state=RANDOM_STATE):
    # defining and initializing PM variables.
    rng = np.random.default_rng(random_state) # random generator function
    story_points = rng.choice([1,2,3,5,8,13,21], size=n, p=[0.1,0.2,0.25,0.15,0.15,0.1, 0.05])
    effort = np.clip(story_points + rng.normal(0, 1.2, n), 1, None)
    num_dependencies = rng.poisson(2.0, size=n)
    team_load = np.clip(rng.normal(1.0, 0.4, n), 0.6, 1.8)
    past_delay_rate = np.clip(rng.beta(2, 3, n), 0, 1)
    bug_count_recent = rng.poisson(2.0, size=n)
    risk_tagged = rng.choice([0,1,2], size=n, p=[0.3, 0.4, 0.3])
    assignee_exp = np.clip(rng.normal(3.5, 2.0, n), 0, 12)
    priority = rng.choice([1,2,3,4], size=n, p=[0.15,0.35,0.35,0.15])
    sprint_day = rng.integers(1, 15, size=n)
    component_dependency = rng.choice([0,1,2], size=n, p=[0.2, 0.5,0.3])
    requirement_churn = np.clip(rng.beta(1.5, 3.2, n), 0, 1)

    # create a weight factor for each of the variables that typically impact the delay
    logit = (
        0.5 * (num_dependencies == 2).astype(float)
        + 0.7 * (num_dependencies == 3).astype(float)
        + 0.8 * (num_dependencies  > 3).astype(float)
        + 1.0 * (team_load - 1.0)
        + 1.0 * risk_tagged
        + 1.8 * past_delay_rate
        + 0.9 * np.tanh((bug_count_recent - 2) / 2.0)
        - 0.4 * (assignee_exp / 5.0)
        + 0.7 * (priority == 3).astype(float)
        + 0.3 * (priority == 4).astype(float)
        + 0.7 * component_dependency
        + 1.0 * requirement_churn
        + 0.5 * (effort / np.maximum(story_points, 1) - 1.0)
        + 0.7 * (num_dependencies * past_delay_rate)
        + 0.8 * (team_load * requirement_churn)
    )
    p_delay = sigmoid(logit - 1.2)
    delayed = (rng.random(n) < p_delay).astype(int)

    df = pd.DataFrame({
        "story_points": story_points,
        "est_days": np.round(est_days, 2),
        "num_dependencies": num_dependencies,
        "team_load": np.round(team_load, 3),
        "past_delay_rate": np.round(past_delay_rate, 3),
        "bug_count_recent": bug_count_recent,
        "risk_tagged": risk_tagged,
        "sentiment_score": np.round(sentiment_score, 3),
        "owner_experience": np.round(owner_experience, 2),
        "priority": priority,
        "sprint_day": sprint_day,
        "cross_component": cross_component,
        "requirement_churn": np.round(requirement_churn, 3),
        "delayed": delayed
    })
    return shuffle(df, random_state=random_state).reset_index(drop=True)

data = make_synthetic_pm_data()
csv_path = "/mnt/data/pm_synthetic_tasks_v4.csv"
data.to_csv(csv_path, index=False)
print("Saved dataset:", csv_path)
data.head()


OSError: Cannot save file into a non-existent directory: '/mnt/data'

## 2) Train/Test split

In [4]:

TARGET = "delayed"
FEATURES = [c for c in data.columns if c != TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    data[FEATURES], data[TARGET], test_size=0.25, stratify=data[TARGET], random_state=42
)
X_train.shape, X_test.shape


((1875, 13), (625, 13))

## 3) Model: RandomForest + calibration

In [7]:
base = RandomForestClassifier(
    n_estimators=500, min_samples_split=4, min_samples_leaf=2,
    random_state=42, n_jobs=-1, class_weight="balanced_subsample"
)
# Changed base_estimator to estimator based on TypeError
clf = CalibratedClassifierCV(estimator=base, cv=3, method="sigmoid")
clf.fit(X_train, y_train)
model_path = "c:/pm_delay_rf_calibrated_v4.joblib"
dump(clf, model_path); print("Saved model to:", model_path)

Saved model to: c:/pm_delay_rf_calibrated_v4.joblib


## 4) Evaluation & plots

In [8]:

y_proba = clf.predict_proba(X_test)[:,1]
y_pred = (y_proba >= 0.5).astype(int)
roc_auc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)
print(f"ROC-AUC: {roc_auc:.3f}\nPR-AUC: {pr_auc:.3f}")
print(classification_report(y_test, y_pred, digits=3))


ROC-AUC: 0.745
PR-AUC: 0.876
              precision    recall  f1-score   support

           0      0.588     0.300     0.397       190
           1      0.748     0.908     0.820       435

    accuracy                          0.723       625
   macro avg      0.668     0.604     0.609       625
weighted avg      0.699     0.723     0.692       625



## 5) Feature importance

In [9]:

perm = permutation_importance(clf, X_test, y_test, n_repeats=8, random_state=42, n_jobs=-1)
importances = pd.DataFrame({
    "feature": X_test.columns,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False)
importances.head(10)


Unnamed: 0,feature,importance_mean,importance_std
3,team_load,0.012,0.006597
5,bug_count_recent,0.0114,0.006372
6,risk_tagged,0.0112,0.006597
4,past_delay_rate,0.011,0.007977
2,num_dependencies,0.011,0.009845
11,cross_component,0.009,0.005875
7,sentiment_score,0.0046,0.007085
12,requirement_churn,0.0038,0.006245
0,story_points,0.0034,0.004266
1,est_days,0.0014,0.004113


## 6) Inference helper

In [10]:

def predict_delay(df: pd.DataFrame, threshold=0.6):
    proba = clf.predict_proba(df[FEATURES])[:,1]
    out = df.copy()
    out['delay_prob'] = proba
    out['risk_label'] = (proba >= threshold).astype(int)
    return out

predict_delay(X_test.head(5), threshold=0.6)


Unnamed: 0,story_points,est_days,num_dependencies,team_load,past_delay_rate,bug_count_recent,risk_tagged,sentiment_score,owner_experience,priority,sprint_day,cross_component,requirement_churn,delay_prob,risk_label
2364,8,7.62,0,0.849,0.33,3,0,0.205,0.87,4,5,0,0.092,0.428162,0
1284,5,2.88,2,0.964,0.425,0,0,0.036,2.41,1,5,1,0.444,0.676725,1
1335,2,1.69,0,0.743,0.322,1,0,0.353,2.58,3,2,0,0.451,0.362765,0
1282,3,1.0,2,1.197,0.033,0,0,-0.585,6.82,2,3,1,0.011,0.431232,0
2230,2,1.0,3,0.74,0.307,1,0,-0.852,4.73,4,12,0,0.537,0.76092,1
