In [1]:
# Preprocessing (categorical, date, and numeric features), RandomForest tuning, and hold-out evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

train_data = pd.read_csv('train_data.csv')

# Parse each date column and immediately derive its calendar parts
# (<column>_year, <column>_month, <column>_day) as plain numeric features.
date_columns = ['Planned_Start_Date', 'Actual_Start_Date']
date_parts = ('year', 'month', 'day')
for date_col in date_columns:
    parsed = pd.to_datetime(train_data[date_col])
    train_data[date_col] = parsed
    for part in date_parts:
        train_data[f'{date_col}_{part}'] = getattr(parsed.dt, part)

# Model inputs: the raw descriptive columns followed by the derived date parts.
# NOTE(review): the Actual_Start_Date parts are used as inputs; if Delay_Days is
# derived from actual vs. planned start, this leaks the target -- confirm.
feature_columns = ['Course_Category', 'Module_Name', 'Trainer', 'Batch_Size'] + [
    f'{date_col}_{part}' for date_col in date_columns for part in date_parts
]

# One-hot encode the non-numeric columns; numeric columns pass through unchanged.
X = pd.get_dummies(train_data[feature_columns])
y = train_data['Delay_Days']

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate RandomForest settings: 2 x 3 x 2 = 12 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive grid search: every combination in param_grid is scored with
# 3-fold cross-validation on the training split, selecting the lowest MAE
# (expressed as the highest negative MAE). n_jobs=-1 uses all CPU cores.
gs = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)

# best_estimator_ is the winning configuration (GridSearchCV refits it on the
# whole training split by default).
model = gs.best_estimator_

# Score the tuned model on the held-out rows.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)


In [2]:
# Report the hold-out regression metrics (R2 is shown as a percentage).
print(
    f'MAE: {mae:.2f}',
    f'RMSE: {rmse:.2f}',
    f'R2 Score: {r2*100:.2f}%',
    sep='\n',
)

MAE: 0.37
RMSE: 0.63
R2 Score: 81.56%


In [3]:
# Side-by-side view of true vs. predicted delay on the hold-out rows;
# Error_Margin is the signed residual (actual minus predicted).
results = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred}).assign(
    Error_Margin=lambda frame: frame['Actual'] - frame['Predicted']
)
print(results.head())

   Actual  Predicted  Error_Margin
0       4      3.800         0.200
1       1      0.440         0.560
2       4      3.955         0.045
3       1      1.285        -0.285
4       0      2.050        -2.050


In [4]:
# Function to predict delay days for new data
def predict_delay(model, input_df):
    """Predict delay days for new records using a fitted model.

    Rebuilds the same features the training cell builds (calendar parts of the
    two date columns plus one-hot encoded categoricals) and aligns the result
    to the columns the model was trained on before calling ``model.predict``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. If it carries ``feature_names_in_`` (set by
        scikit-learn when an estimator is fitted on a DataFrame), those names
        define the expected columns, so the function does not depend on
        notebook state. Otherwise it falls back to the notebook-level training
        matrix ``X``.
    input_df : pandas.DataFrame
        Must contain 'Course_Category', 'Module_Name', 'Trainer',
        'Batch_Size', 'Planned_Start_Date' and 'Actual_Start_Date'. Extra
        columns are ignored. The frame is not modified.

    Returns
    -------
    Whatever ``model.predict`` returns for the prepared feature matrix
    (one prediction per input row for scikit-learn regressors).

    Raises
    ------
    KeyError
        If any required column is missing from ``input_df``.
    """
    date_cols = ['Planned_Start_Date', 'Actual_Start_Date']
    date_parts = ('year', 'month', 'day')
    raw_features = ['Course_Category', 'Module_Name', 'Trainer', 'Batch_Size']

    # Fail early with one message listing everything that is missing, instead
    # of a bare KeyError on the first absent column.
    missing = [c for c in raw_features + date_cols if c not in input_df.columns]
    if missing:
        raise KeyError(f'input_df is missing required columns: {missing}')

    # Work on a copy so the caller's frame does not gain helper columns.
    df = input_df.copy()
    for col in date_cols:
        parsed = pd.to_datetime(df[col])
        df[col] = parsed
        for part in date_parts:
            df[f'{col}_{part}'] = getattr(parsed.dt, part)

    features = raw_features + [
        f'{col}_{part}' for col in date_cols for part in date_parts
    ]
    X_new = pd.get_dummies(df[features])

    # Prefer the column list stored on the model; the global X may be stale or
    # absent (fresh kernel, model loaded from disk).
    train_columns = getattr(model, 'feature_names_in_', None)
    if train_columns is None:
        train_columns = X.columns

    # Categories unseen at training time are dropped; training-time categories
    # absent from this batch are added as all-zero columns.
    X_new = X_new.reindex(columns=list(train_columns), fill_value=0)
    return model.predict(X_new)

In [5]:
# Use the function to predict from JSON file and print results with error margin
import json

# JSON text is UTF-8 by specification; pin the encoding so decoding does not
# depend on the operating system's locale default.
with open('test_data.json', 'r', encoding='utf-8') as f:
    test_json = json.load(f)
test_df = pd.DataFrame(test_json)
test_df['Predicted_Delay'] = predict_delay(model, test_df)

# Actual delays are optional in the input file. When they are present, also
# report the signed error (actual minus predicted) next to each prediction.
display_cols = ['Training_ID', 'Module_Name']
if 'Delay_Days' in test_df.columns:
    test_df['Error_Margin'] = test_df['Delay_Days'] - test_df['Predicted_Delay']
    display_cols += ['Delay_Days', 'Predicted_Delay', 'Error_Margin']
else:
    display_cols += ['Predicted_Delay']
print(test_df[display_cols].to_string(index=False))

 Training_ID                Module_Name  Delay_Days  Predicted_Delay  Error_Margin
         201         Scala Fundamentals           1            0.235         0.765
         202         Business Analytics           2            1.035         0.965
         203               AI in Retail           0            0.400        -0.400
         204 Robotic Process Automation           3            2.615         0.385
         205          Data Storytelling           0            0.005        -0.005
         206         MongoDB Essentials           1            1.070        -0.070
         207          Kubernetes Basics           4            3.665         0.335
         208               Scrum Master           1            1.085        -0.085
         209              Ruby on Rails           0            0.000         0.000
         210          BigQuery Advanced           1            1.030        -0.030
