# 📅 Schedule Slip Predictor

## ATLAS Capital Delivery - Activity Delay Risk Model

This notebook builds a Random Forest classifier that predicts the probability of schedule slip for each activity. It:
- Analyzes activity characteristics, float, and dependencies
- Incorporates vendor performance history
- Provides calibrated probabilities for reliable alerts
- Uses SHAP values to explain individual predictions

**Business Value**: Identify at-risk activities before they impact critical path.

In [None]:
# Imports
from snowflake.snowpark import Session
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score, brier_score_loss
import shap

# Open a Snowpark session from the named connection "demo", then set the
# active database and warehouse for the queries issued on this session.
connection_params = dict(connection_name="demo")
builder = Session.builder.configs(connection_params)
session = builder.create()
session.use_database("CAPITAL_PROJECTS_DB")
session.use_warehouse("CAPITAL_ML_WH")
print(f"Connected: {session.get_current_account()}")

In [None]:
# Pull the activity and vendor tables from Snowflake into pandas
activities_df = session.table("ATOMIC.PROJECT_ACTIVITY").to_pandas()
vendors_df = session.table("ATOMIC.VENDOR").to_pandas()

# Left-join the vendor metrics onto each activity via its assigned vendor,
# so every activity row is kept even when no vendor matches.
vendor_metrics = vendors_df[['VENDOR_ID', 'RISK_SCORE', 'ONTIME_DELIVERY_RATE']]
df = pd.merge(
    activities_df,
    vendor_metrics,
    how='left',
    left_on='ASSIGNED_VENDOR_ID',
    right_on='VENDOR_ID',
)

# Missing vendor metrics fall back to fixed defaults
vendor_defaults = {'RISK_SCORE': 50, 'ONTIME_DELIVERY_RATE': 0.85}
df = df.fillna(vendor_defaults)

print(f"Activities loaded: {len(df)}")
print(f"Critical path activities: {df['IS_CRITICAL'].sum()}")

In [None]:
# Derive model features; missing float/duration default to 30 and missing
# progress defaults to 0 before being rescaled from percent to a fraction.
df['IS_CRITICAL_NUM'] = df['IS_CRITICAL'].astype(int)
df['FLOAT_DAYS'] = df['TOTAL_FLOAT'].fillna(30)
df['DURATION'] = df['PLANNED_DURATION'].fillna(30)
df['PROGRESS'] = df['PERCENT_COMPLETE'].fillna(0) / 100

# Simulated target: there is no observed slip label here, so one is drawn
# from a probability built out of the activity's characteristics.
np.random.seed(42)
n_rows = len(df)
critical_term = 0.3 * df['IS_CRITICAL_NUM']
low_float_term = 0.2 * (df['FLOAT_DAYS'] < 5).astype(int)
vendor_term = 0.15 * (df['RISK_SCORE'] / 100)
# The noise is drawn before the Bernoulli draw below; this ordering fixes
# the sequence of values taken from the seeded global RNG.
noise = np.random.uniform(-0.1, 0.1, n_rows)
slip_prob = (0.1 + critical_term + low_float_term + vendor_term + noise).clip(0, 1)

# An activity "slipped" when a uniform draw lands below its slip probability
bernoulli_draw = np.random.random(n_rows)
df['SLIPPED'] = (bernoulli_draw < slip_prob).astype(int)

print(f"Slip rate: {df['SLIPPED'].mean()*100:.1f}%")

In [None]:
# Assemble the feature matrix and the binary slip target
feature_cols = [
    'IS_CRITICAL_NUM',
    'FLOAT_DAYS',
    'DURATION',
    'PROGRESS',
    'RISK_SCORE',
    'ONTIME_DELIVERY_RATE',
]

X = df[feature_cols].to_numpy()
y = df['SLIPPED'].to_numpy()

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random Forest wrapped in 3-fold isotonic calibration so that the
# predicted probabilities can be read as probabilities.
base_model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
model = CalibratedClassifierCV(base_model, method='isotonic', cv=3)
model.fit(X_train, y_train)

# Evaluate on the hold-out set: ranking quality (AUC), probability
# accuracy (Brier), and a per-class report at a 0.5 decision threshold.
y_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {auc:.3f}")
brier = brier_score_loss(y_test, y_prob)
print(f"Brier Score: {brier:.4f} (lower is better)")
y_pred = (y_prob > 0.5).astype(int)
print(classification_report(y_test, y_pred))

In [None]:
# Generate predictions for all activities (this includes the rows the model
# was trained on, since X is the full feature matrix).
all_probs = model.predict_proba(X)[:, 1]

# Create prediction dataframe: one row per activity with its identifiers,
# slip probability, a derived day estimate, a risk bucket, and model metadata.
pred_df = df[['ACTIVITY_ID', 'PROJECT_ID']].copy()
pred_df['SLIP_PROBABILITY'] = all_probs
# Rough estimate: scales the probability onto 0-15 days, truncated to an int
pred_df['PREDICTED_SLIP_DAYS'] = (all_probs * 15).astype(int)
# include_lowest=True closes the first bin on the left so that a probability
# of exactly 0.0 is labelled LOW. With pd.cut's default right-closed bins
# the first interval is (0, 0.3], and 0.0 would get a NaN risk level.
pred_df['RISK_LEVEL'] = pd.cut(
    all_probs,
    bins=[0, 0.3, 0.5, 0.7, 1.0],
    labels=['LOW', 'MEDIUM', 'HIGH', 'CRITICAL'],
    include_lowest=True,
)
pred_df['PREDICTION_DATE'] = pd.Timestamp.now().date()
pred_df['MODEL_NAME'] = 'SCHEDULE_SLIP_PREDICTOR'
pred_df['MODEL_VERSION'] = '1.0'

# Save to Snowflake; 'overwrite' mode replaces any existing table contents
sp_preds = session.create_dataframe(pred_df)
sp_preds.write.mode('overwrite').save_as_table('ML.SCHEDULE_RISK_PREDICTIONS')

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
high_risk_count = pred_df['RISK_LEVEL'].isin(['HIGH', 'CRITICAL']).sum()
print("\n📅 Schedule Risk Summary:")
print(f"Total activities: {len(pred_df)}")
print(f"High/Critical risk: {high_risk_count}")
print("\n✅ Predictions saved to ML.SCHEDULE_RISK_PREDICTIONS")