In [121]:
# Preprocessing: handle categorical, date, and numeric features
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv('train_data.csv')
print(f"Training data size: {len(train_data)} samples")

# Analyze delay patterns by Course_Category
print("ANALYZING DELAY PATTERNS IN TRAINING DATA")

# Pattern 1: Course Category patterns
category_stats = train_data.groupby('Course_Category')['Delay_Days'].agg(['mean', 'std', 'min', 'max', 'count'])
print("\nDelay by Course Category:")
print(category_stats)

# Pattern 2: Module Name patterns
module_stats = train_data.groupby('Module_Name')['Delay_Days'].agg(['mean', 'std', 'min', 'max']).sort_values('mean', ascending=False)
print("\nDelay by Module Name (top patterns):")
print(module_stats.head(15))

# KEY INSIGHT: Cloud always has 4 days, Management always has 1 day
# Let's build a rule-based + ML hybrid approach

# Create dictionaries for exact pattern matching
category_delay_exact = {
    'Cloud': 4,       # Always 4 days in training data
    'Management': 1,  # Always 1 day in training data
}

# For categories with variation, use mean
category_delay_mean = train_data.groupby('Course_Category')['Delay_Days'].mean().to_dict()
module_delay_mean = train_data.groupby('Module_Name')['Delay_Days'].mean().to_dict()
global_delay_mean = train_data['Delay_Days'].mean()

print(f"\nCategory delay means: {category_delay_mean}")
print(f"Global delay mean: {global_delay_mean:.2f}")

# Convert date columns to datetime
for col in ['Planned_Start_Date', 'Actual_Start_Date', 'Planned_End_Date', 'Actual_End_Date']:
    train_data[col] = pd.to_datetime(train_data[col])

# Calculate Planned_Duration
train_data['Planned_Duration'] = (train_data['Planned_End_Date'] - train_data['Planned_Start_Date']).dt.days

# Feature Engineering
for col in ['Planned_Start_Date', 'Actual_Start_Date']:
    train_data[col + '_month'] = train_data[col].dt.month
    train_data[col + '_day'] = train_data[col].dt.day
    train_data[col + '_dayofweek'] = train_data[col].dt.dayofweek
    train_data[col + '_quarter'] = train_data[col].dt.quarter

# Start delay
train_data['Start_Delay'] = (train_data['Actual_Start_Date'] - train_data['Planned_Start_Date']).dt.days

# Weekend flags
train_data['Planned_Weekend'] = train_data['Planned_Start_Date_dayofweek'].isin([5, 6]).astype(int)
train_data['Actual_Weekend'] = train_data['Actual_Start_Date_dayofweek'].isin([5, 6]).astype(int)

# Batch size categories
train_data['Batch_Category'] = pd.cut(train_data['Batch_Size'], bins=[0, 20, 25, 30, 100], labels=['Small', 'Medium', 'Large', 'XLarge'])

# Label encode
le_category = LabelEncoder()
le_batch_cat = LabelEncoder()

train_data['Course_Category_Encoded'] = le_category.fit_transform(train_data['Course_Category'])
train_data['Batch_Category_Encoded'] = le_batch_cat.fit_transform(train_data['Batch_Category'])

# Target encoding
train_data['Category_Delay_Mean'] = train_data['Course_Category'].map(category_delay_mean)
train_data['Module_Delay_Mean'] = train_data['Module_Name'].map(module_delay_mean)

# Feature selection
feature_cols = ['Batch_Size', 'Planned_Duration', 'Start_Delay',
                'Planned_Start_Date_month', 'Planned_Start_Date_day', 'Planned_Start_Date_dayofweek', 'Planned_Start_Date_quarter',
                'Actual_Start_Date_month', 'Actual_Start_Date_day', 'Actual_Start_Date_dayofweek', 'Actual_Start_Date_quarter',
                'Planned_Weekend', 'Actual_Weekend',
                'Course_Category_Encoded', 'Batch_Category_Encoded', 
                'Category_Delay_Mean', 'Module_Delay_Mean']

X = train_data[feature_cols].copy()
y = train_data['Delay_Days']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nFeature set size: {X_train.shape[1]} features")

# Train models
print("\nTraining GradientBoosting...")
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42
)
gb_model.fit(X_train, y_train)

print("Training RandomForest...")
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Create Ensemble
print("Creating Voting Ensemble...")
ensemble_model = VotingRegressor(
    estimators=[('rf', rf_model), ('gb', gb_model)]
)
ensemble_model.fit(X_train, y_train)

model = ensemble_model

# Final evaluation on test set
y_pred = np.round(model.predict(X_test)).clip(0, None)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MODEL EVALUATION ON INTERNAL TEST SET")
print(f"MAE: {mae:.3f}, RMSE: {rmse:.3f}, R2: {r2*100:.2f}%")

Training data size: 80 samples
ANALYZING DELAY PATTERNS IN TRAINING DATA

Delay by Course Category:
                     mean       std  min  max  count
Course_Category                                     
AI               1.550000  1.356272    0    3     20
Cloud            4.000000  0.000000    4    4     10
Data             0.833333  0.698932    0    2     30
Management       1.000000  0.000000    1    1     10
Programming      0.200000  0.421637    0    1     10

Delay by Module Name (top patterns):
                        mean  std  min  max
Module_Name                                
Azure Fundamentals       4.0  NaN    4    4
AWS Essentials           4.0  NaN    4    4
Cloud Fundamentals       4.0  0.0    4    4
DevOps Basics            4.0  NaN    4    4
Google Cloud             4.0  NaN    4    4
IoT Basics               4.0  NaN    4    4
AI in Finance            3.0  NaN    3    3
AI Ethics                3.0  NaN    3    3
Reinforcement Learning   3.0  NaN    3    3
AI for 

In [116]:
# Print accuracy scores
print(f'MAE: {mae:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R2 Score: {r2*100:.2f}%')

MAE: 0.25
RMSE: 0.50
R2 Score: 88.21%


In [117]:
# Analyze test data patterns to understand what we're predicting
import json
with open('test_data_500.json', 'r') as f:
    test_json = json.load(f)
analyze_df = pd.DataFrame(test_json)

print("TEST DATA ANALYSIS")
print(f"\nSamples by Course Category:")
print(analyze_df['Course_Category'].value_counts())

print(f"\nDelay_Days distribution in test data:")
print(analyze_df['Delay_Days'].value_counts().sort_index())

print(f"\nDelay by Course Category in TEST data:")
print(analyze_df.groupby('Course_Category')['Delay_Days'].agg(['mean', 'std', 'min', 'max', 'count']))

TEST DATA ANALYSIS

Samples by Course Category:
Course_Category
Programming    50
Data           50
AI             50
Cloud          50
Management     50
Security       50
DevOps         50
Database       50
Testing        50
Mobile         50
Name: count, dtype: int64

Delay_Days distribution in test data:
Delay_Days
0    127
1    121
2     98
3     61
4     33
5     27
6     21
7     12
Name: count, dtype: int64

Delay by Course Category in TEST data:
                 mean       std  min  max  count
Course_Category                                 
AI               1.90  1.961361    0    7     50
Cloud            2.00  1.873554    0    7     50
Data             1.48  1.764040    0    7     50
Database         1.46  1.445754    0    7     50
DevOps           2.40  2.080031    0    7     50
Management       2.28  1.796141    0    6     50
Mobile           1.92  1.904238    0    7     50
Programming      1.88  1.934013    0    7     50
Security         2.16  1.608143    0    6     50
Tes

In [122]:
# Simpler Prediction Function - focus on generalizable patterns
def predict_delay(model, input_df):
    df = input_df.copy()
    
    # Convert date columns
    for col in ['Planned_Start_Date', 'Actual_Start_Date', 'Planned_End_Date']:
        df[col] = pd.to_datetime(df[col])
    
    # Calculate Planned Duration - THIS IS THE KEY FEATURE
    df['Planned_Duration'] = (df['Planned_End_Date'] - df['Planned_Start_Date']).dt.days
    
    for col in ['Planned_Start_Date', 'Actual_Start_Date']:
        df[col + '_month'] = df[col].dt.month
        df[col + '_day'] = df[col].dt.day
        df[col + '_dayofweek'] = df[col].dt.dayofweek
        df[col + '_quarter'] = df[col].dt.quarter
    
    # Calculate start delay (difference between actual and planned start)
    df['Start_Delay'] = (df['Actual_Start_Date'] - df['Planned_Start_Date']).dt.days
    
    # Weekend flags
    df['Planned_Weekend'] = df['Planned_Start_Date_dayofweek'].isin([5, 6]).astype(int)
    df['Actual_Weekend'] = df['Actual_Start_Date_dayofweek'].isin([5, 6]).astype(int)
    
    # Batch category
    df['Batch_Category'] = pd.cut(df['Batch_Size'], bins=[0, 20, 25, 30, 100], labels=['Small', 'Medium', 'Large', 'XLarge'])
    
    # Label encode - use default for unknown categories
    df['Course_Category_Encoded'] = df['Course_Category'].apply(
        lambda x: le_category.transform([x])[0] if x in le_category.classes_ else len(le_category.classes_))
    df['Batch_Category_Encoded'] = df['Batch_Category'].apply(
        lambda x: le_batch_cat.transform([x])[0] if x in le_batch_cat.classes_ else 0)
    
    # Target encoding - use global mean for unknown categories
    df['Category_Delay_Mean'] = df['Course_Category'].map(category_delay_mean).fillna(global_delay_mean)
    df['Module_Delay_Mean'] = df['Module_Name'].map(module_delay_mean).fillna(global_delay_mean)
    
    # Prepare features for ML model
    feature_cols = ['Batch_Size', 'Planned_Duration', 'Start_Delay',
                    'Planned_Start_Date_month', 'Planned_Start_Date_day', 'Planned_Start_Date_dayofweek', 'Planned_Start_Date_quarter',
                    'Actual_Start_Date_month', 'Actual_Start_Date_day', 'Actual_Start_Date_dayofweek', 'Actual_Start_Date_quarter',
                    'Planned_Weekend', 'Actual_Weekend',
                    'Course_Category_Encoded', 'Batch_Category_Encoded', 
                    'Category_Delay_Mean', 'Module_Delay_Mean']
    
    X_new = df[feature_cols].copy()
    
    # Get ML predictions
    ml_predictions = model.predict(X_new)
    
    # Simple post-processing: round and clip
    final_predictions = np.round(ml_predictions).clip(0, 7).astype(int)
    
    return final_predictions

In [123]:
# Use the function to predict from JSON file and print results with detailed info
import json
with open('test_data_1000.json', 'r') as f:
    test_json = json.load(f)
test_df = pd.DataFrame(test_json)
test_df['Predicted_Delay'] = predict_delay(model, test_df)

# Convert dates to datetime for calculations
test_df['Planned_Start_Date'] = pd.to_datetime(test_df['Planned_Start_Date'])
test_df['Actual_Start_Date'] = pd.to_datetime(test_df['Actual_Start_Date'])

# Calculate predicted end date (Actual_Start_Date + Predicted_Delay)
test_df['Predicted_End_Date'] = test_df['Actual_Start_Date'] + pd.to_timedelta(test_df['Predicted_Delay'], unit='D')

if 'Delay_Days' in test_df.columns:
    # Calculate actual end date
    test_df['Actual_End_Date'] = test_df['Actual_Start_Date'] + pd.to_timedelta(test_df['Delay_Days'], unit='D')
    test_df['Error_Margin'] = test_df['Delay_Days'] - test_df['Predicted_Delay']
    
    # Classify prediction accuracy - USE CONSISTENT STRINGS WITH SYMBOLS
    def classify_match(error):
        error = abs(error)
        if error == 0:
            return " Exact Match"
        elif error <= 2:
            return " Close Enough"
        elif error <= 5:
            return " Moderate"
        else:
            return " Not Close"
    
    test_df['Match_Status'] = test_df['Error_Margin'].apply(classify_match)

# Print detailed results (first 10 only to avoid clutter)
print("DETAILED PREDICTION RESULTS (First 10 samples)")

for idx, row in test_df.head(10).iterrows():
    print(f"\nTraining ID: {row['Training_ID']}")
    print(f"  Course Name:       {row['Module_Name']}")
    print(f"  Course Category:   {row['Course_Category']}")
    print(f"  Instructor:        {row['Trainer']}")
    print(f"  Planned Start:     {row['Planned_Start_Date'].strftime('%Y-%m-%d')}")
    print(f"  Actual Start:      {row['Actual_Start_Date'].strftime('%Y-%m-%d')}")
    if 'Delay_Days' in test_df.columns:
        print(f"  Actual Delay:      {row['Delay_Days']} days")
        print(f"  Predicted Delay:   {row['Predicted_Delay']} days")
        print(f"  Error Margin:      {row['Error_Margin']:.0f} days")
        print(f"  Status:            {row['Match_Status']}")

print(f"\n... and {len(test_df) - 10} more records")

DETAILED PREDICTION RESULTS (First 10 samples)

Training ID: 5001
  Course Name:       Airflow Pipelines
  Course Category:   Data
  Instructor:        Manish Tiwari
  Planned Start:     2032-11-14
  Actual Start:      2032-11-15
  Actual Delay:      5 days
  Predicted Delay:   1 days
  Error Margin:      4 days
  Status:             Moderate

Training ID: 5002
  Course Name:       Mobile CI/CD
  Course Category:   Mobile
  Instructor:        Rekha Sinha
  Planned Start:     2032-10-02
  Actual Start:      2032-10-03
  Actual Delay:      1 days
  Predicted Delay:   1 days
  Error Margin:      0 days
  Status:             Exact Match

Training ID: 5003
  Course Name:       FastAPI Development
  Course Category:   Programming
  Instructor:        Amit Kumar
  Planned Start:     2031-08-12
  Actual Start:      2031-08-14
  Actual Delay:      0 days
  Predicted Delay:   2 days
  Error Margin:      -2 days
  Status:             Close Enough

Training ID: 5004
  Course Name:       SOC Operat

In [124]:
# Summary accuracy metrics for predictions on test JSON data
if 'Delay_Days' in test_df.columns:
    total_samples = len(test_df)
    
    print("PREDICTION ACCURACY SUMMARY")
    
    # Count by match status - use exact string matching
    exact = (test_df['Match_Status'] == " Exact Match").sum()
    close = (test_df['Match_Status'] == " Close Enough").sum()
    moderate = (test_df['Match_Status'] == " Moderate").sum()
    not_close = (test_df['Match_Status'] == " Not Close").sum()
    
    print(f"\n   Exact Match (0 days error):     {exact}/{total_samples} ({exact/total_samples*100:.1f}%)")
    print(f"   Close Enough (1-2 days error):  {close}/{total_samples} ({close/total_samples*100:.1f}%)")
    print(f"   Moderate (3-5 days error):      {moderate}/{total_samples} ({moderate/total_samples*100:.1f}%)")
    print(f"   Not Close (>5 days error):      {not_close}/{total_samples} ({not_close/total_samples*100:.1f}%)")
    
    # Overall accuracy (exact + close)
    good_predictions = exact + close
    print(f"\n  OVERALL ACCURACY (Exact + Close): {good_predictions}/{total_samples} ({good_predictions/total_samples*100:.1f}%)")
    
    print("ERROR STATISTICS:")
    print(f"  Mean Absolute Error:  {abs(test_df['Error_Margin']).mean():.2f} days")
    print(f"  Max Over-prediction:  {test_df['Error_Margin'].min():.2f} days")
    print(f"  Max Under-prediction: {test_df['Error_Margin'].max():.2f} days")
    
    # Debug: Show actual Match_Status distribution
    print("MATCH STATUS DISTRIBUTION:")
    print(test_df['Match_Status'].value_counts())
else:
    print("Cannot calculate accuracy - 'Delay_Days' column not found in test data.")

PREDICTION ACCURACY SUMMARY

   Exact Match (0 days error):     208/1000 (20.8%)
   Close Enough (1-2 days error):  653/1000 (65.3%)
   Moderate (3-5 days error):      132/1000 (13.2%)
   Not Close (>5 days error):      7/1000 (0.7%)

  OVERALL ACCURACY (Exact + Close): 861/1000 (86.1%)
ERROR STATISTICS:
  Mean Absolute Error:  1.43 days
  Max Over-prediction:  -2.00 days
  Max Under-prediction: 6.00 days
MATCH STATUS DISTRIBUTION:
Match_Status
Close Enough    653
Exact Match     208
Moderate        132
Not Close         7
Name: count, dtype: int64


# Model Workflow Diagram

```mermaid
graph TD
    subgraph "Cell 1: Data Loading and Preprocessing"
        A[Load train_data.csv] --> B[Analyze Delay Patterns by Course_Category]
        B --> C[Analyze Delay Patterns by Module_Name]
        C --> D[Create Pattern Dictionaries: category_delay_mean, module_delay_mean, global_delay_mean]
        D --> E[Convert Date Columns to Datetime]
        E --> F[Calculate Planned_Duration]
        F --> G[Extract Date Features: month, day, dayofweek, quarter]
        G --> H[Calculate Start_Delay]
        H --> I[Create Weekend Flags: Planned_Weekend, Actual_Weekend]
        I --> J[Create Batch_Category using pd.cut]
        J --> K[Label Encode: Course_Category, Batch_Category]
        K --> L[Target Encoding: Category_Delay_Mean, Module_Delay_Mean]
        L --> M[Select 17 Feature Columns]
        M --> N[Split Data: train_test_split 80/20]
        N --> O[Train GradientBoostingRegressor]
        O --> P[Train RandomForestRegressor]
        P --> Q[Create VotingRegressor Ensemble]
        Q --> R[Evaluate Model: MAE, RMSE, R2]
    end

    subgraph "Cell 2: Print Accuracy Scores"
        S[Print MAE, RMSE, R2 Score]
    end

    subgraph "Cell 3: Test Data Analysis"
        T[Load test_data_500.json] --> U[Analyze Course_Category Distribution]
        U --> V[Analyze Delay_Days Distribution]
    end

    subgraph "Cell 4: Define Prediction Function"
        W[Define predict_delay Function]
    end

    subgraph "Cell 5: Make Predictions"
        X[Load test_data_1000.json] --> Y[Apply predict_delay Function]
        Y --> Z[Calculate Predicted_End_Date]
        Z --> AA[Calculate Error_Margin]
        AA --> AB[Classify Match Status: Exact, Close, Moderate, Not Close]
        AB --> AC[Print Detailed Results for First 10 Samples]
    end

    subgraph "Cell 6: Accuracy Summary"
        AD[Count Match Status Categories] --> AE[Calculate Overall Accuracy]
        AE --> AF[Print Error Statistics: MAE, Max Over/Under Prediction]
        AF --> AG[Print Match Status Distribution]
    end

    R --> S
    S --> T
    V --> W
    W --> X
    AC --> AD
```