# Enhanced Model Training with Rich Features

This notebook trains models using the feature-engineered datasets that include:
- Priority Level percentages
- Mental Health percentage
- Top 5 Incident Category percentages
- Lag features (previous day/week)
- Temporal features (weekend, holiday, etc.)

## Model Training Strategy

This notebook trains enhanced models using:
- Temporal features
- Priority + Mental Health + Category features  
- Lag features


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import sys
from datetime import datetime
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
import xgboost as xgb
import joblib

sns.set_style("whitegrid")
plt.style.use('default')

# Append-only text log written by log_output().
output_file = 'Final_output_sector.txt'

# Create the results tree up front: the root folder, one folder per algorithm,
# and a folder for persisted models. exist_ok=True makes re-running the cell safe.
for results_dir in (
    'Sector_Model_Comparison_Results',
    'Sector_Model_Comparison_Results/lasso',
    'Sector_Model_Comparison_Results/linear',
    'Sector_Model_Comparison_Results/xgboost',
    'Sector_Model_Comparison_Results/randomforest',
    'Sector_Model_Comparison_Results/models',
):
    os.makedirs(results_dir, exist_ok=True)

def log_output(text):
    """Append ``text`` as one line to the file named by ``output_file`` and echo it to stdout.

    ``text`` is converted with ``str()`` before being written, so any object may be passed.
    """
    with open(output_file, 'a', encoding='utf-8') as log_handle:
        log_handle.write(f"{text!s}\n")
    print(text)

# Stamp the log with the wall-clock start time; the log file is opened in append mode,
# so this banner is what separates one run from the next.
log_output(f"=== Training Started: {datetime.now()} ===\n")


=== Training Started: 2025-12-03 21:40:33.147890 ===



## Load Datasets

Load enhanced datasets.


In [2]:
# Read the three feature-engineered sector datasets, one per aggregation level.
enhanced_daily, enhanced_weekly, enhanced_monthly = map(
    pd.read_csv,
    ('sector_daily_enhanced.csv', 'sector_weekly_enhanced.csv', 'sector_monthly_enhanced.csv'),
)

# Record (rows, columns) of each frame as a quick sanity check.
log_output(f"Dataset shapes - Daily: {enhanced_daily.shape}, Weekly: {enhanced_weekly.shape}, Monthly: {enhanced_monthly.shape}")


Dataset shapes - Daily: (10351, 51), Weekly: (1771, 35), Monthly: (341, 35)


## Prepare Features

Define feature sets for enhanced models.


In [3]:
# Call-mix share columns that appear, in this exact order, in every aggregation level.
_call_mix_features = [
    'pct_priority_1', 'pct_priority_2', 'pct_priority_3', 'pct_priority_4',
    'pct_mental_health',
    'pct_category_1', 'pct_category_2', 'pct_category_3', 'pct_category_4', 'pct_category_5',
]

# NOTE: list order is significant. Later cells pair these names positionally with
# fitted coefficients / feature importances, so do not reorder.
enhanced_features_daily = (
    ['Month', 'Year', 'Day_of_Year', 'Week']
    + _call_mix_features
    + ['lag_previous_day', 'lag_same_day_last_week', 'lag_2days_ago',
       'lag_same_day_last_month', 'lag_previous_week_total']
    + ['is_weekend', 'is_peak_day', 'is_holiday']
    + ['is_peak_hour_period', 'hour_category', 'hours_from_peak']
    + ['pct_priority_1_peak_hour', 'is_high_priority_day', 'priority_1_x_peak_hour']
    + ['days_since_start', 'year_trend', 'month_trend', 'week_trend', 'day_of_year_trend']
    + ['rolling_mean_7d', 'rolling_std_7d', 'rolling_mean_30d']
    + ['Sector_Encoded']
)

enhanced_features_weekly = (
    ['Month', 'Year', 'Week']
    + _call_mix_features
    + ['lag_previous_week']
    + ['is_peak_hour_period', 'hours_from_peak']
    + ['is_high_priority_week']
    + ['days_since_start', 'year_trend', 'month_trend', 'week_trend']
    + ['rolling_mean_4w']
    + ['Sector_Encoded']
)

enhanced_features_monthly = (
    ['Month', 'Year']
    + _call_mix_features
    + ['lag_previous_month', 'lag_same_month_last_year']
    + ['is_peak_hour_period', 'hours_from_peak']
    + ['is_high_priority_month', 'is_peak_month']
    + ['days_since_start', 'year_trend', 'month_trend']
    + ['rolling_mean_3m']
    + ['Sector_Encoded']
)

# Log how many input columns each aggregation level feeds to the models.
log_output(f"Feature counts - Enhanced: Daily={len(enhanced_features_daily)}, Weekly={len(enhanced_features_weekly)}, Monthly={len(enhanced_features_monthly)}")


Feature counts - Enhanced: Daily=37, Weekly=23, Monthly=23


## Encode Categorical Features

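Label-encode the `Sector` column as integers (`Sector_Encoded`) so it can be used as a model feature. The encoder is fit on the daily dataset and reused for the weekly and monthly datasets.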


In [4]:
# Learn the sector -> integer mapping once, from the daily frame, and reuse it for the
# other frames so a given sector gets the same code at every aggregation level.
le_sector = LabelEncoder().fit(enhanced_daily['Sector'])

for sector_frame in (enhanced_daily, enhanced_weekly, enhanced_monthly):
    # transform() raises on a sector label the daily frame did not contain.
    sector_frame['Sector_Encoded'] = le_sector.transform(sector_frame['Sector'])

log_output("Encoding complete")


Encoding complete


## Train-Test Split

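Use an unshuffled 80/20 split (`shuffle=False`): the first 80% of rows are used for training and the last 20% for testing. This avoids leaking future information into training only if the rows of each dataset are already in chronological order.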


In [5]:
def _ordered_split(frame, feature_cols):
    """Return ``(X, y, X_train, X_test, y_train, y_test)`` for an unshuffled 80/20 split.

    ``X`` is ``frame[feature_cols]`` and ``y`` is the ``Call_Count`` column; the first 80%
    of rows become the training partition and the remaining 20% the test partition.
    """
    X = frame[feature_cols]
    y = frame['Call_Count']
    return (X, y, *train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42))


# NOTE(review): with shuffle=False the split follows row order, so it is only a true
# temporal split if each frame is already sorted chronologically — confirm upstream.
(X_enhanced_d, y_enhanced_d,
 X_train_ed, X_test_ed, y_train_ed, y_test_ed) = _ordered_split(enhanced_daily, enhanced_features_daily)

(X_enhanced_w, y_enhanced_w,
 X_train_ew, X_test_ew, y_train_ew, y_test_ew) = _ordered_split(enhanced_weekly, enhanced_features_weekly)

(X_enhanced_m, y_enhanced_m,
 X_train_em, X_test_em, y_train_em, y_test_em) = _ordered_split(enhanced_monthly, enhanced_features_monthly)

log_output("Train-test splits complete")
for level_name, train_part, test_part in (
    ('Daily', X_train_ed, X_test_ed),
    ('Weekly', X_train_ew, X_test_ew),
    ('Monthly', X_train_em, X_test_em),
):
    log_output(f"{level_name} - Train {len(train_part)}, Test {len(test_part)}")


Train-test splits complete
Daily - Train 8280, Test 2071
Weekly - Train 1416, Test 355
Monthly - Train 272, Test 69


## Scale Features

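Standardize features for the linear models (Lasso and Linear Regression). Each scaler is fit on the training split only and then applied to the test split; the tree-based models use the unscaled features.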


In [6]:
# Fit each scaler on the training rows only, then apply it to both partitions, so
# test-set statistics never influence the standardization.
scaler_enhanced_d = StandardScaler().fit(X_train_ed)
X_train_ed_scaled = scaler_enhanced_d.transform(X_train_ed)
X_test_ed_scaled = scaler_enhanced_d.transform(X_test_ed)

scaler_enhanced_w = StandardScaler().fit(X_train_ew)
X_train_ew_scaled = scaler_enhanced_w.transform(X_train_ew)
X_test_ew_scaled = scaler_enhanced_w.transform(X_test_ew)

scaler_enhanced_m = StandardScaler().fit(X_train_em)
X_train_em_scaled = scaler_enhanced_m.transform(X_train_em)
X_test_em_scaled = scaler_enhanced_m.transform(X_test_em)

log_output("Feature scaling complete")


Feature scaling complete


## Train Models

Train Lasso, Linear Regression, Random Forest, and XGBoost using enhanced feature sets.


In [7]:
# Registries filled in by the cells below: `models` maps a model key to its fitted
# estimator, `results` maps the same key to that model's evaluation metrics.
models, results = dict(), dict()

In [8]:


# Regularization strengths to search: 100 log-spaced values from 1e-4 to 1e2.
alphas = np.logspace(-4, 2, 100)


def _fit_lasso_cv(X_scaled, y):
    """Fit a 5-fold ``LassoCV`` over ``alphas`` on standardized features and return it."""
    return LassoCV(alphas=alphas, cv=5, max_iter=10000, random_state=42, n_jobs=-1).fit(X_scaled, y)


# Each fitted model is kept under its own name and registered in `models`.
lasso_enhanced_d = models['lasso_enhanced_daily'] = _fit_lasso_cv(X_train_ed_scaled, y_train_ed)
lasso_enhanced_w = models['lasso_enhanced_weekly'] = _fit_lasso_cv(X_train_ew_scaled, y_train_ew)
lasso_enhanced_m = models['lasso_enhanced_monthly'] = _fit_lasso_cv(X_train_em_scaled, y_train_em)

log_output("Lasso models trained")


Lasso models trained


In [9]:
# Ordinary least squares on the standardized features, one model per aggregation level.
linear_enhanced_d = LinearRegression().fit(X_train_ed_scaled, y_train_ed)
models['linear_enhanced_daily'] = linear_enhanced_d

linear_enhanced_w = LinearRegression().fit(X_train_ew_scaled, y_train_ew)
models['linear_enhanced_weekly'] = linear_enhanced_w

linear_enhanced_m = LinearRegression().fit(X_train_em_scaled, y_train_em)
models['linear_enhanced_monthly'] = linear_enhanced_m

log_output("Linear Regression models trained")


Linear Regression models trained


In [10]:
# NOTE: despite its name this scorer yields the *negated MSE*, not an RMSE:
# greater_is_better=False makes make_scorer flip the sign of mean_squared_error so that
# "higher is better" holds. Reporting code converts back with sqrt(-score).
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Random Forest hyper-parameter search spaces, one per aggregation level. Smaller
# datasets (weekly, monthly) get fewer and shallower trees.
#
# max_features=1.0 ("consider every feature at each split") replaces the legacy "auto"
# string. For regressors "auto" meant exactly that, but it was deprecated in
# scikit-learn 1.1 and removed in 1.3; on current versions every sampled candidate that
# used "auto" fails to fit and is scored NaN, silently shrinking the search.
rf_daily_space = {
    "n_estimators": np.arange(200, 501),
    "max_depth": np.arange(4, 9),
    "max_features": [1.0, "sqrt", 0.7, 0.9]
}

rf_weekly_space = {
    "n_estimators": np.arange(100, 301),
    "max_depth": np.arange(3, 6),
    "max_features": [1.0, "sqrt", 0.5, 0.7]
}

rf_monthly_space = {
    "n_estimators": np.arange(50, 201),
    "max_depth": np.arange(3, 7),
    "max_features": [1.0, "sqrt", 0.5]
}

def train_random_search(model, param_space, X_train, y_train, model_name,
                        n_iter=30, cv=5, random_state=42):
    """Tune ``model`` with a randomized hyper-parameter search and return the best estimator.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    param_space : dict
        Parameter name -> list/array of candidate values (or a distribution).
    X_train, y_train
        Training features and target.
    model_name : str
        Label used as the prefix of the two log lines.
    n_iter : int, default 30
        Number of parameter settings sampled.
    cv : int or cross-validation splitter, default 5
        Passed straight to ``RandomizedSearchCV``.
        NOTE(review): an integer here means plain (unshuffled) K-fold, so a validation
        fold can precede its training data in time; if rows are time-ordered, consider
        passing a time-aware splitter instead — confirm whether that matters here.
    random_state : int, default 42
        Seed for the parameter sampling.

    Returns
    -------
    estimator
        ``search.best_estimator_`` — the best configuration refit on all of ``X_train``.

    The search is scored with scikit-learn's built-in ``"neg_mean_squared_error"``, which is
    the same metric as ``make_scorer(mean_squared_error, greater_is_better=False)`` but does
    not rely on a scorer object having been defined earlier in the notebook. The logged
    "Best RMSE" is ``sqrt(-best_score_)``, i.e. the square root of the mean fold MSE.
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_space,
        scoring="neg_mean_squared_error",
        n_iter=n_iter,
        cv=cv,
        verbose=0,
        random_state=random_state,
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    log_output(f"{model_name} - Best params: {search.best_params_}")
    # best_score_ is a negated MSE, so flip the sign before taking the root.
    log_output(f"{model_name} - Best RMSE: {np.sqrt(-search.best_score_):.2f}")
    return search.best_estimator_

log_output("Training Random Forest models...")

# (registry key, search space, training features, training target, log label).
# Tree ensembles are tuned on the unscaled features.
rf_training_jobs = (
    ("rf_enhanced_daily", rf_daily_space, X_train_ed, y_train_ed, "RF Enhanced Daily"),
    ("rf_enhanced_weekly", rf_weekly_space, X_train_ew, y_train_ew, "RF Enhanced Weekly"),
    ("rf_enhanced_monthly", rf_monthly_space, X_train_em, y_train_em, "RF Enhanced Monthly"),
)

for registry_key, search_space, X_fit, y_fit, log_label in rf_training_jobs:
    # A fresh, identically seeded base estimator is created for every search.
    models[registry_key] = train_random_search(
        RandomForestRegressor(random_state=42, n_jobs=-1),
        search_space,
        X_fit,
        y_fit,
        log_label
    )

log_output("Random Forest training complete")


Training Random Forest models...
RF Enhanced Daily - Best params: {'n_estimators': np.int64(215), 'max_features': 0.9, 'max_depth': np.int64(8)}
RF Enhanced Daily - Best RMSE: 9.60
RF Enhanced Weekly - Best params: {'n_estimators': np.int64(290), 'max_features': 0.5, 'max_depth': np.int64(5)}
RF Enhanced Weekly - Best RMSE: 85.27
RF Enhanced Monthly - Best params: {'n_estimators': np.int64(180), 'max_features': 0.5, 'max_depth': np.int64(6)}
RF Enhanced Monthly - Best RMSE: 342.68
Random Forest training complete


In [11]:
# Recreate the scorer if the Random Forest cell that normally defines it was not run.
if 'rmse_scorer' not in locals():
    rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# XGBoost search spaces, one per aggregation level. The smaller datasets get fewer,
# shallower trees and a lower column-sampling range.
xgb_daily_space = dict(
    n_estimators=np.arange(200, 501),
    max_depth=np.arange(4, 9),
    learning_rate=np.linspace(0.05, 0.15, 20),
    colsample_bytree=np.linspace(0.5, 1.0, 10),
    subsample=np.linspace(0.7, 1.0, 10),
)

xgb_weekly_space = dict(
    n_estimators=np.arange(100, 301),
    max_depth=np.arange(3, 6),
    learning_rate=np.linspace(0.05, 0.10, 20),
    colsample_bytree=np.linspace(0.3, 0.7, 10),
    subsample=np.linspace(0.7, 1.0, 10),
)

xgb_monthly_space = dict(
    n_estimators=np.arange(50, 151),
    max_depth=np.arange(2, 5),
    learning_rate=np.linspace(0.05, 0.10, 20),
    colsample_bytree=np.linspace(0.2, 0.6, 10),
    subsample=np.linspace(0.7, 1.0, 10),
)

# Fallback copy of the search helper: it is only defined when the Random Forest cell
# (which defines the primary train_random_search) has not been run in this kernel.
# On a top-to-bottom run this branch is skipped.
if 'train_random_search' not in locals():
    def train_random_search(model, param_space, X_train, y_train, model_name):
        """Run a 30-candidate, 5-fold randomized search and return the best estimator.

        Logs the winning parameters and ``sqrt(-best_score_)`` (labelled "Best RMSE")
        under the ``model_name`` prefix. Scoring uses the notebook-level ``rmse_scorer``.
        """
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_space,
            scoring=rmse_scorer,
            n_iter=30,
            cv=5,
            verbose=0,
            random_state=42,
            n_jobs=-1
        )
        search.fit(X_train, y_train)
        log_output(f"{model_name} - Best params: {search.best_params_}")
        # The scorer returns a negated MSE, hence the sign flip before the square root.
        log_output(f"{model_name} - Best RMSE: {np.sqrt(-search.best_score_):.2f}")
        return search.best_estimator_

log_output("Training XGBoost models...")

# (registry key, search space, training features, training target, log label).
# Like the Random Forests, XGBoost is tuned on the unscaled features.
xgb_training_jobs = (
    ("xgb_enhanced_daily", xgb_daily_space, X_train_ed, y_train_ed, "XGB Enhanced Daily"),
    ("xgb_enhanced_weekly", xgb_weekly_space, X_train_ew, y_train_ew, "XGB Enhanced Weekly"),
    ("xgb_enhanced_monthly", xgb_monthly_space, X_train_em, y_train_em, "XGB Enhanced Monthly"),
)

for registry_key, search_space, X_fit, y_fit, log_label in xgb_training_jobs:
    models[registry_key] = train_random_search(
        xgb.XGBRegressor(random_state=42, n_jobs=-1),
        search_space,
        X_fit,
        y_fit,
        log_label
    )

# Count everything registered so far (Lasso, Linear, Random Forest and XGBoost).
log_output(f"All models trained. Total: {len(models)}")


Training XGBoost models...
XGB Enhanced Daily - Best params: {'subsample': np.float64(0.7), 'n_estimators': np.int64(451), 'max_depth': np.int64(6), 'learning_rate': np.float64(0.13421052631578947), 'colsample_bytree': np.float64(0.8888888888888888)}
XGB Enhanced Daily - Best RMSE: 6.66
XGB Enhanced Weekly - Best params: {'subsample': np.float64(0.7333333333333333), 'n_estimators': np.int64(256), 'max_depth': np.int64(4), 'learning_rate': np.float64(0.07105263157894737), 'colsample_bytree': np.float64(0.5222222222222221)}
XGB Enhanced Weekly - Best RMSE: 68.97
XGB Enhanced Monthly - Best params: {'subsample': np.float64(1.0), 'n_estimators': np.int64(123), 'max_depth': np.int64(2), 'learning_rate': np.float64(0.1), 'colsample_bytree': np.float64(0.4666666666666666)}
XGB Enhanced Monthly - Best RMSE: 210.98
All models trained. Total: 12


## Evaluate Models

Calculate metrics for all enhanced models.


In [12]:
def evaluate_model(model, X_test, y_test, X_train_scaled=None, use_scaled=False):
    """Score a fitted model on a held-out set.

    Parameters
    ----------
    model : fitted estimator
        Anything exposing ``predict(X)``.
    X_test
        Test features, already in the representation the model was trained on
        (standardized for the linear models, raw for the tree ensembles). This function
        performs no scaling itself.
    y_test
        True target values.
    X_train_scaled, use_scaled
        Unused. Both branches of the former ``if use_scaled`` made the same ``predict``
        call, so the flag never changed the result; the parameters are kept only so
        existing call sites continue to work.

    Returns
    -------
    dict
        ``{'R2', 'MAE', 'RMSE', 'predictions'}`` where ``predictions`` are the model
        outputs floored at zero and the metrics are computed on those floored values.
    """
    # Call counts cannot be negative, so floor the raw predictions at 0 before scoring.
    y_pred = np.maximum(0, model.predict(X_test))

    return {
        'R2': r2_score(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'predictions': y_pred,
    }

# For every registered model: (test features, test target, features-are-standardized flag).
# Linear-family models are scored on the scaled matrices, tree ensembles on the raw frames.
test_sets = {
    'lasso_enhanced_daily': (X_test_ed_scaled, y_test_ed, True),
    'lasso_enhanced_weekly': (X_test_ew_scaled, y_test_ew, True),
    'lasso_enhanced_monthly': (X_test_em_scaled, y_test_em, True),
    'linear_enhanced_daily': (X_test_ed_scaled, y_test_ed, True),
    'linear_enhanced_weekly': (X_test_ew_scaled, y_test_ew, True),
    'linear_enhanced_monthly': (X_test_em_scaled, y_test_em, True),
    'rf_enhanced_daily': (X_test_ed, y_test_ed, False),
    'rf_enhanced_weekly': (X_test_ew, y_test_ew, False),
    'rf_enhanced_monthly': (X_test_em, y_test_em, False),
    'xgb_enhanced_daily': (X_test_ed, y_test_ed, False),
    'xgb_enhanced_weekly': (X_test_ew, y_test_ew, False),
    'xgb_enhanced_monthly': (X_test_em, y_test_em, False)
}

# Score every model on its own test partition.
results = {
    model_key: evaluate_model(models[model_key], X_eval, y_eval, use_scaled=scaled_flag)
    for model_key, (X_eval, y_eval, scaled_flag) in test_sets.items()
}

# One row per model with just the scalar metrics (the prediction arrays stay in `results`).
comparison_data = [
    {'Model': model_key, **{metric: scores[metric] for metric in ('R2', 'MAE', 'RMSE')}}
    for model_key, scores in results.items()
]
comparison_df = pd.DataFrame(comparison_data)


def _aggregation_label(model_key):
    """Map a model key to 'Daily', 'Weekly' or (by default) 'Monthly'."""
    if 'daily' in model_key:
        return 'Daily'
    if 'weekly' in model_key:
        return 'Weekly'
    return 'Monthly'


comparison_df['Aggregation'] = comparison_df['Model'].apply(_aggregation_label)
# The algorithm is the key's first underscore-separated token, upper-cased (e.g. 'XGB').
comparison_df['Algorithm'] = comparison_df['Model'].str.split('_').str[0].str.upper()

log_output("\n=== Model Evaluation Results ===")
log_output(comparison_df.to_string(index=False))
comparison_df.to_csv('Sector_Model_Comparison_Results/model_comparison_sector.csv', index=False)



=== Model Evaluation Results ===
                  Model       R2        MAE       RMSE Aggregation Algorithm
   lasso_enhanced_daily 0.463636   6.039571   7.877014       Daily     LASSO
  lasso_enhanced_weekly 0.804462  55.932445  67.030907      Weekly     LASSO
 lasso_enhanced_monthly 0.922455  69.495427  86.822279     Monthly     LASSO
  linear_enhanced_daily 0.465167   6.000205   7.865762       Daily    LINEAR
 linear_enhanced_weekly 0.797005  57.479819  68.297195      Weekly    LINEAR
linear_enhanced_monthly 0.902814  75.183076  97.198162     Monthly    LINEAR
      rf_enhanced_daily 0.475903   6.146660   7.786417       Daily        RF
     rf_enhanced_weekly 0.816873  51.353031  64.868909      Weekly        RF
    rf_enhanced_monthly 0.715993 127.256417 166.157494     Monthly        RF
     xgb_enhanced_daily 0.704087   4.330860   5.850783       Daily       XGB
    xgb_enhanced_weekly 0.894107  39.572205  49.327964      Weekly       XGB
   xgb_enhanced_monthly 0.480562 186.67802

## Feature Correlation Analysis

Analyze correlations between features and target variable to understand relationships.


In [None]:
# Correlation heatmaps for the enhanced features, one panel per aggregation level.
fig, axes = plt.subplots(1, 3, figsize=(24, 7))


def _draw_correlation_panel(frame, feature_cols, ax, title):
    """Draw the feature+target correlation heatmap on ``ax``.

    Returns ``(subset, matrix)``: a copy of the selected columns (features plus
    ``Call_Count``) and their pairwise correlation matrix.
    """
    subset = frame[feature_cols + ['Call_Count']].copy()
    matrix = subset.corr()
    sns.heatmap(matrix, annot=False, cmap='coolwarm', center=0,
                square=True, linewidths=0.5, cbar_kws={'label': 'Correlation'},
                fmt='.2f', vmin=-1, vmax=1, ax=ax)
    ax.set_title(title, fontsize=13, fontweight='bold')
    return subset, matrix


enhanced_daily_corr, correlation_matrix_daily = _draw_correlation_panel(
    enhanced_daily, enhanced_features_daily, axes[0],
    'Enhanced Sector Daily Features: Correlation Heatmap')

enhanced_weekly_corr, correlation_matrix_weekly = _draw_correlation_panel(
    enhanced_weekly, enhanced_features_weekly, axes[1],
    'Enhanced Sector Weekly Features: Correlation Heatmap')

enhanced_monthly_corr, correlation_matrix_monthly = _draw_correlation_panel(
    enhanced_monthly, enhanced_features_monthly, axes[2],
    'Enhanced Sector Monthly Features: Correlation Heatmap')

plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/feature_correlation_heatmap_all_aggregations.png', dpi=300, bbox_inches='tight')
plt.close()


def _rank_target_correlations(matrix):
    """Return the ``Call_Count`` column of ``matrix`` sorted by descending absolute value."""
    return matrix['Call_Count'].sort_values(key=abs, ascending=False)


# head(11) rather than head(10): Call_Count's own correlation of 1.0 always sorts first,
# so eleven rows are needed to show ten actual features.
log_output("Top 10 Features Correlated with Call_Count (Daily):")
target_correlations_daily = _rank_target_correlations(correlation_matrix_daily)
log_output(str(target_correlations_daily.head(11)))

log_output("\nTop 10 Features Correlated with Call_Count (Weekly):")
target_correlations_weekly = _rank_target_correlations(correlation_matrix_weekly)
log_output(str(target_correlations_weekly.head(11)))

log_output("\nTop 10 Features Correlated with Call_Count (Monthly):")
target_correlations_monthly = _rank_target_correlations(correlation_matrix_monthly)
log_output(str(target_correlations_monthly.head(11)))


Top 10 Features Correlated with Call_Count (Daily):
Call_Count                 1.000000
rolling_mean_7d            0.942893
rolling_mean_30d           0.933603
lag_previous_week_total    0.915746
lag_previous_day           0.884391
lag_2days_ago              0.868679
lag_same_day_last_week     0.865347
lag_same_day_last_month    0.799289
pct_priority_1             0.595985
rolling_std_7d             0.560903
pct_priority_4            -0.489804
Name: Call_Count, dtype: float64

Top 10 Features Correlated with Call_Count (Weekly):
Call_Count               1.000000
rolling_mean_4w          0.727256
pct_priority_4          -0.627086
is_high_priority_week   -0.588084
pct_priority_2          -0.585574
pct_category_4          -0.581119
pct_priority_3          -0.530006
lag_previous_week        0.505479
pct_category_3          -0.428874
pct_category_1          -0.411729
pct_category_2          -0.398358
Name: Call_Count, dtype: float64

Top 10 Features Correlated with Call_Count (Monthly):
Cal

## Actual vs Predicted Scatter Plots

Visualize model predictions against actual values for all models.


In [18]:
# Random Forest only: actual-vs-predicted scatter, one panel per aggregation level.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Tree-based models are scored on the UN-SCALED test features.
rf_model_configs = [
    ('rf_enhanced_daily', X_test_ed, y_test_ed, 'RF Enhanced Daily', axes[0]),
    ('rf_enhanced_weekly', X_test_ew, y_test_ew, 'RF Enhanced Weekly', axes[1]),
    ('rf_enhanced_monthly', X_test_em, y_test_em, 'RF Enhanced Monthly', axes[2])
]

for model_name, X_test, y_test, title, ax in rf_model_configs:
    # Floor predictions at zero: a negative call count is not meaningful.
    y_pred = np.maximum(0, models[model_name].predict(X_test))

    # Metrics shown in the panel title are computed on the floored predictions.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    ax.scatter(y_test, y_pred, alpha=0.5, s=20)

    # y = x reference line spanning the combined range of actual and predicted values.
    diagonal = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
    ax.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

    ax.set_xlabel('Actual Call Count', fontsize=11)
    ax.set_ylabel('Predicted Call Count', fontsize=11)
    ax.set_title(f'{title}\nR² = {r2:.4f}, RMSE = {rmse:.2f}', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/randomforest/rf_actual_vs_predicted_scatter.png', dpi=300, bbox_inches='tight')
plt.close()


In [19]:
# Linear Regression: actual-vs-predicted scatter, one panel per aggregation level.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# The linear models were trained on standardized features, so they are fed the scaled test sets.
model_configs = [
    ('linear_enhanced_daily', X_test_ed_scaled, y_test_ed, 'Linear Enhanced Daily', axes[0], True),
    ('linear_enhanced_weekly', X_test_ew_scaled, y_test_ew, 'Linear Enhanced Weekly', axes[1], True),
    ('linear_enhanced_monthly', X_test_em_scaled, y_test_em, 'Linear Enhanced Monthly', axes[2], True)
]

for model_name, X_test, y_test, title, ax, use_scaled in model_configs:
    # Call counts cannot be negative, so floor the predictions at zero.
    y_pred = np.maximum(0, models[model_name].predict(X_test))

    # Metrics shown in the panel title are computed on the floored predictions.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    ax.scatter(y_test, y_pred, alpha=0.5, s=20)

    # y = x reference line spanning the combined range of actual and predicted values.
    diagonal = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
    ax.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

    ax.set_xlabel('Actual Call Count', fontsize=11)
    ax.set_ylabel('Predicted Call Count', fontsize=11)
    ax.set_title(f'{title}\nR² = {r2:.4f}, RMSE = {rmse:.2f}', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/linear/linear_actual_vs_predicted_scatter.png', dpi=300, bbox_inches='tight')
plt.close()


In [20]:
# Lasso: actual-vs-predicted scatter, one panel per aggregation level.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# The Lasso models were trained on standardized features, so they are fed the scaled test sets.
model_configs = [
    ('lasso_enhanced_daily', X_test_ed_scaled, y_test_ed, 'Lasso Enhanced Daily', axes[0], True),
    ('lasso_enhanced_weekly', X_test_ew_scaled, y_test_ew, 'Lasso Enhanced Weekly', axes[1], True),
    ('lasso_enhanced_monthly', X_test_em_scaled, y_test_em, 'Lasso Enhanced Monthly', axes[2], True)
]

for model_name, X_test, y_test, title, ax, use_scaled in model_configs:
    # Call counts cannot be negative, so floor the predictions at zero.
    y_pred = np.maximum(0, models[model_name].predict(X_test))

    # Metrics shown in the panel title are computed on the floored predictions.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    ax.scatter(y_test, y_pred, alpha=0.5, s=20)

    # y = x reference line spanning the combined range of actual and predicted values.
    diagonal = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
    ax.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

    ax.set_xlabel('Actual Call Count', fontsize=11)
    ax.set_ylabel('Predicted Call Count', fontsize=11)
    ax.set_title(f'{title}\nR² = {r2:.4f}, RMSE = {rmse:.2f}', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/lasso/lasso_actual_vs_predicted_scatter.png', dpi=300, bbox_inches='tight')
plt.close()


## Ensemble Methods: XGBoost Actual vs Predicted

Visualize how the XGBoost ensemble models perform for enhanced features at daily, weekly, and monthly aggregations.


In [21]:
# XGBoost only: actual-vs-predicted scatter, one panel per aggregation level.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Tree-based models are scored on the UN-SCALED test features.
xgb_model_configs = [
    ('xgb_enhanced_daily', X_test_ed, y_test_ed, 'XGB Enhanced Daily', axes[0]),
    ('xgb_enhanced_weekly', X_test_ew, y_test_ew, 'XGB Enhanced Weekly', axes[1]),
    ('xgb_enhanced_monthly', X_test_em, y_test_em, 'XGB Enhanced Monthly', axes[2])
]

for model_name, X_test, y_test, title, ax in xgb_model_configs:
    model = models.get(model_name)
    if model is None:
        # Model was never trained/registered: hide its panel instead of failing.
        ax.set_visible(False)
        continue

    # Floor predictions at zero: a negative call count is not meaningful.
    y_pred = np.maximum(0, model.predict(X_test))

    # Metrics shown in the panel title are computed on the floored predictions.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    ax.scatter(y_test, y_pred, alpha=0.5, s=20)

    # y = x reference line spanning the combined range of actual and predicted values.
    diagonal = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
    ax.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

    ax.set_xlabel('Actual Call Count', fontsize=11)
    ax.set_ylabel('Predicted Call Count', fontsize=11)
    ax.set_title(f'{title}\nR² = {r2:.4f}, RMSE = {rmse:.2f}', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/xgboost/xgb_actual_vs_predicted_scatter.png', dpi=300, bbox_inches='tight')
plt.close()


## Visualize Model Comparison

Compare Enhanced models across different algorithms and aggregations.


In [None]:
# R² comparison: one single-column heatmap (algorithm x R²) per aggregation level.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Slice the comparison table once per aggregation level.
daily_comparison = comparison_df[comparison_df['Aggregation'] == 'Daily']
weekly_comparison = comparison_df[comparison_df['Aggregation'] == 'Weekly']
monthly_comparison = comparison_df[comparison_df['Aggregation'] == 'Monthly']

# pivot_table already returns a frame indexed by Algorithm, which is what heatmap needs.
pivot_daily = daily_comparison.pivot_table(index='Algorithm', values='R2', aggfunc='mean')
pivot_weekly = weekly_comparison.pivot_table(index='Algorithm', values='R2', aggfunc='mean')
pivot_monthly = monthly_comparison.pivot_table(index='Algorithm', values='R2', aggfunc='mean')

r2_panels = zip(axes, ('Daily', 'Weekly', 'Monthly'), (pivot_daily, pivot_weekly, pivot_monthly))
for ax, level_label, level_pivot in r2_panels:
    sns.heatmap(level_pivot, annot=True, fmt='.4f', cmap='RdYlGn', ax=ax, cbar_kws={'label': 'R² Score'})
    ax.set_title(f'{level_label} Models: R² by Algorithm', fontsize=13, fontweight='bold')
    ax.set_xlabel('Model Type')
    ax.set_ylabel('Algorithm')

plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/r2_comparison_all_aggregations.png', dpi=300, bbox_inches='tight')
plt.close()


In [24]:
# RMSE comparison: one single-column heatmap (algorithm x RMSE) per aggregation level.
# Relies on daily_comparison / weekly_comparison / monthly_comparison from the R² cell.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

pivot_daily_rmse, pivot_weekly_rmse, pivot_monthly_rmse = (
    level_rows.pivot_table(index='Algorithm', values='RMSE', aggfunc='mean')
    for level_rows in (daily_comparison, weekly_comparison, monthly_comparison)
)

rmse_panels = zip(
    axes,
    ('Daily', 'Weekly', 'Monthly'),
    (pivot_daily_rmse, pivot_weekly_rmse, pivot_monthly_rmse),
)
for ax, level_label, level_pivot in rmse_panels:
    # Reversed colormap so that lower (better) RMSE maps to the darker end.
    sns.heatmap(level_pivot, annot=True, fmt='.2f', cmap='YlOrRd_r', ax=ax, cbar_kws={'label': 'RMSE'})
    ax.set_title(f'{level_label} Models: RMSE by Algorithm', fontsize=13, fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel('Algorithm')

plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/rmse_comparison_all_aggregations.png', dpi=300, bbox_inches='tight')
plt.close()


## Feature Importance Analysis

Analyze which features contribute most to predictions in enhanced models.


In [25]:
# Lasso coefficients for the enhanced daily model, ranked by absolute magnitude.
# The model was fit on standardized features, and the names in enhanced_features_daily
# are paired positionally with coef_ (same column order as the training matrix).
coef_df = pd.DataFrame({
    'Feature': enhanced_features_daily,
    'Coefficient': models['lasso_enhanced_daily'].coef_
}).sort_values('Coefficient', key=abs, ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(data=coef_df.head(15), x='Coefficient', y='Feature', palette='viridis')
# Zero line separates positive from negative effects.
plt.axvline(x=0, color='black', linestyle='-', linewidth=1)
plt.xlabel('Coefficient Value', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Enhanced Daily Lasso: Top 15 Feature Coefficients', fontsize=13, fontweight='bold')
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/lasso/lasso_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()

# Use log_output (not bare print) so the ranking is also written to the run log,
# matching how the XGBoost importances are reported.
log_output("Top 10 Most Important Features (by absolute coefficient):")
log_output(str(coef_df.head(10)))


Top 10 Most Important Features (by absolute coefficient):
                    Feature  Coefficient
33          rolling_mean_7d    68.271984
18  lag_previous_week_total   -44.932656
15   lag_same_day_last_week    12.725406
3                      Week    -6.743641
35         rolling_mean_30d    -6.588390
32        day_of_year_trend     6.454223
17  lag_same_day_last_month     2.492155
20              is_peak_day     1.518524
19               is_weekend    -0.641736
21               is_holiday    -0.588713


In [26]:
# XGBoost Feature Importance
# Pair each daily feature name with the importance score reported by the fitted
# XGBoost model, then order from most to least important.
xgb_importance_unsorted = pd.DataFrame({
    'Feature': enhanced_features_daily,
    'Importance': models['xgb_enhanced_daily'].feature_importances_
})
xgb_importance = xgb_importance_unsorted.sort_values(by='Importance', ascending=False)

# Draw the 15 highest-ranked features on an explicit Figure/Axes pair.
xgb_fig, xgb_ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=xgb_importance.head(15), x='Importance', y='Feature', palette='magma', ax=xgb_ax)
xgb_ax.set_xlabel('Feature Importance', fontsize=12)
xgb_ax.set_ylabel('Feature', fontsize=12)
xgb_ax.set_title('Enhanced Daily XGBoost: Top 15 Feature Importance', fontsize=13, fontweight='bold')
xgb_ax.grid(True, alpha=0.3, axis='x')
xgb_fig.tight_layout()
xgb_fig.savefig('Sector_Model_Comparison_Results/xgboost/xgb_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close(xgb_fig)

# Record the ten highest-ranked features in both the notebook and the run log.
log_output("Top 10 Most Important Features (XGBoost):")
log_output(str(xgb_importance.head(10)))


Top 10 Most Important Features (XGBoost):
                     Feature  Importance
35          rolling_mean_30d    0.551835
33           rolling_mean_7d    0.322754
20               is_peak_day    0.018897
18   lag_previous_week_total    0.018473
15    lag_same_day_last_week    0.012850
19                is_weekend    0.011312
14          lag_previous_day    0.007954
34            rolling_std_7d    0.005938
29                year_trend    0.004294
25  pct_priority_1_peak_hour    0.003119


In [27]:
# Linear Regression Coefficients for Enhanced Daily Model
# Build a feature/coefficient table and order it by absolute coefficient.
# NOTE(review): the recorded output of this cell shows identical coefficients
# for Year/year_trend and for Day_of_Year/day_of_year_trend, which suggests
# duplicated (perfectly collinear) columns; if so, |coefficient| is not a
# reliable importance ranking for those features — confirm.
linear_coef_unsorted = pd.DataFrame({
    'Feature': enhanced_features_daily,
    'Coefficient': models['linear_enhanced_daily'].coef_
})
linear_coef_df = linear_coef_unsorted.sort_values(by='Coefficient', key=abs, ascending=False)

# Plot the 15 largest coefficients with a zero reference line separating
# positive from negative effects.
linear_fig, linear_ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=linear_coef_df.head(15), x='Coefficient', y='Feature', palette='viridis', ax=linear_ax)
linear_ax.axvline(x=0, color='black', linestyle='-', linewidth=1)
linear_ax.set_xlabel('Coefficient Value', fontsize=12)
linear_ax.set_ylabel('Feature', fontsize=12)
linear_ax.set_title('Enhanced Daily Linear Regression: Top 15 Feature Coefficients', fontsize=13, fontweight='bold')
linear_ax.grid(True, alpha=0.3, axis='x')
linear_fig.tight_layout()
linear_fig.savefig('Sector_Model_Comparison_Results/linear/linear_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close(linear_fig)

# Record the ten largest coefficients in both the notebook and the run log.
log_output("Top 10 Most Important Features (Linear Regression - by absolute coefficient):")
log_output(str(linear_coef_df.head(10)))


Top 10 Most Important Features (Linear Regression - by absolute coefficient):
                    Feature  Coefficient
33          rolling_mean_7d    71.041860
28         days_since_start    57.375364
18  lag_previous_week_total   -47.043068
1                      Year   -28.056087
29               year_trend   -28.056087
15   lag_same_day_last_week    13.297994
35         rolling_mean_30d    -8.012705
2               Day_of_Year    -5.328853
32        day_of_year_trend    -5.328853
31               week_trend    -3.707977


In [28]:
# Random Forest Feature Importance
# Pair each daily feature with the fitted model's feature_importances_ score
# and order from most to least important.
# NOTE(review): assuming this is a scikit-learn RandomForestRegressor, these
# are impurity-based importances — confirm.
rf_importance = pd.DataFrame({
    'Feature': enhanced_features_daily,
    'Importance': models['rf_enhanced_daily'].feature_importances_
}).sort_values('Importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features, saved to disk.
plt.figure(figsize=(10, 8))
sns.barplot(data=rf_importance.head(15), x='Importance', y='Feature', palette='plasma')
plt.xlabel('Feature Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Enhanced Daily Random Forest: Top 15 Feature Importance', fontsize=13, fontweight='bold')
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('Sector_Model_Comparison_Results/randomforest/rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()

# Use log_output (not print) so the ranking is also appended to the run log,
# consistent with the XGBoost and Linear Regression cells.
log_output("Top 10 Most Important Features (Random Forest):")
log_output(str(rf_importance.head(10)))


Top 10 Most Important Features (Random Forest):
                    Feature  Importance
33          rolling_mean_7d    0.802532
35         rolling_mean_30d    0.151132
18  lag_previous_week_total    0.022165
15   lag_same_day_last_week    0.008031
34           rolling_std_7d    0.004401
36           Sector_Encoded    0.002991
20              is_peak_day    0.001997
17  lag_same_day_last_month    0.000715
19               is_weekend    0.000579
16            lag_2days_ago    0.000441


## Save Results and Models

Persist the trained models and their scalers with joblib, then log a final summary of the comparison metrics.


In [30]:
# Persist every trained model, grouped by algorithm and then by aggregation
# level. Keys in `models` and file names share the same
# '<algorithm>_enhanced_<level>' stem.
model_dir = 'Sector_Model_Comparison_Results/models'
aggregation_levels = ('daily', 'weekly', 'monthly')

for algorithm in ('lasso', 'linear', 'rf', 'xgb'):
    for level in aggregation_levels:
        model_key = f'{algorithm}_enhanced_{level}'
        joblib.dump(models[model_key], f'{model_dir}/{model_key}.joblib')

# Persist the scaler for each aggregation level next to the models.
level_scalers = (scaler_enhanced_d, scaler_enhanced_w, scaler_enhanced_m)
for level, level_scaler in zip(aggregation_levels, level_scalers):
    joblib.dump(level_scaler, f'{model_dir}/scaler_enhanced_{level}.joblib')

# Close the run with a short summary: mean R² over all rows of the comparison
# table, the number of rows, and a completion timestamp.
avg_r2 = comparison_df['R2'].mean()
trained_model_count = len(comparison_df)
log_output("\n=== Final Summary ===")
log_output(f"Average R² across all models: {avg_r2:.4f}")
log_output(f"Total models trained: {trained_model_count}")
log_output(f"=== Training Completed: {datetime.now()} ===\n")



=== Final Summary ===
Average R² across all models: 0.7036
Total models trained: 12
=== Training Completed: 2025-12-04 00:29:56.167248 ===

