# Train Model

In [1]:

import warnings, time, os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, log_loss, brier_score_loss, RocCurveDisplay)
from sklearn.calibration import calibration_curve

from utilities.common import FEATURES

# Hide UserWarning messages issued from modules whose name starts with 'xgboost'.
warnings.filterwarnings(action='ignore', category=UserWarning, module='xgboost')
# Reset matplotlib to its stock style for every figure produced below.
plt.style.use('default')


## Config

In [2]:
# CSV read by load_data(); a relative path resolves against the kernel's working directory.
DATA_PATH = '../data/processed/Training_Data.csv'
# Directory that receives the plots, pickled models and the results CSV.
OUT_DIR = 'models'
# Number of TimeSeriesSplit folds in the outer (evaluation) loop.
OUTER_SPLITS = 5
# Number of TimeSeriesSplit folds in the inner (hyper-parameter search) loop.
INNER_SPLITS = 3
# Candidate settings sampled by each RandomizedSearchCV run.
N_RANDOM_ITER = 40
# Seed passed to the estimators and to the randomized searches.
RANDOM_SEED = 42


# parents=True lets a nested OUT_DIR (e.g. 'artifacts/models') be created in one call.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

In [3]:

def load_data(path):
    """Read the training CSV and split it into a feature frame and a target.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    X : pandas.DataFrame
        The columns named in ``FEATURES``, in that order.
    y : pandas.Series
        The ``RESULT`` column cast to ``int``.

    Raises
    ------
    ValueError
        If the file has no ``RESULT`` column.
    KeyError
        Raised by pandas if any column in ``FEATURES`` is absent.
    """
    df = pd.read_csv(path)
    # Check for the target BEFORE selecting columns: selecting a missing label
    # raises KeyError, which previously made this ValueError unreachable.
    if 'RESULT' not in df.columns:
        raise ValueError('RESULT column missing')
    df = df[FEATURES + ['RESULT']]
    X = df.drop(columns=['RESULT'])
    y = df['RESULT'].astype(int)
    return X, y

def build_preprocessor(X):
    """Build the preprocessing step applied to the numeric columns of ``X``.

    Every column of ``X`` with a numeric dtype is median-imputed and then
    standardised. Only those columns are listed in the returned
    ``ColumnTransformer``; with its default ``remainder='drop'`` any other
    column is left out of the transformed output.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose dtypes decide which columns are treated as numeric.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer with a single ``'num'`` branch.
    """
    numeric_features = X.select_dtypes(include=['number']).columns.tolist()

    # Impute first so the scaler never sees missing values.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    numeric_branch = Pipeline(numeric_steps)

    transformers = [('num', numeric_branch, numeric_features)]
    return ColumnTransformer(transformers)

def get_models_and_params():
    """Return the candidate estimators together with their search spaces.

    Returns
    -------
    dict
        Maps a short model name to a ``(estimator, param_grid)`` tuple. The
        estimator is unfitted. The grid keys carry the ``model__`` prefix so
        they address the ``'model'`` step of a ``Pipeline``.
    """
    return {
        'log_reg': (
            # liblinear is used because it supports both the l1 and l2 penalties searched below.
            LogisticRegression(max_iter=300, solver='liblinear', random_state=RANDOM_SEED),
            {
                'model__C'          : np.logspace(-3, 3, 10),
                'model__penalty'    : ['l1', 'l2']
            }
        ),
        'decision_tree': (
            DecisionTreeClassifier(random_state=RANDOM_SEED),
            {
                'model__max_depth'          : [None, 3, 5, 7, 9, 12],
                'model__min_samples_leaf'   : [1, 5, 10, 30]
            }
        ),
        'random_forest': (
            RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1),
            {
                'model__n_estimators'       : [200, 400, 600, 800],
                'model__max_depth'          : [None, 5, 10, 15],
                'model__min_samples_leaf'   : [1, 2, 4],
                'model__max_features'       : ['sqrt', 'log2', 0.8]
            }
        ),
        'xgboost': (
            # `use_label_encoder` is no longer passed: XGBoost deprecated it and
            # current releases only report it as an unused parameter.
            XGBClassifier(
                objective='binary:logistic',
                eval_metric='logloss',
                tree_method='hist',
                n_jobs=-1,
                random_state=RANDOM_SEED
            ),
            {
                'model__n_estimators'       : [300, 500, 800],
                'model__learning_rate'      : [0.01, 0.05, 0.1],
                'model__max_depth'          : [3, 5, 7, 9],
                'model__subsample'          : [0.6, 0.8, 1.0],
                'model__colsample_bytree'   : [0.6, 0.8, 1.0],
                'model__gamma'              : [0, 0.1, 0.2, 0.3],
                # `lambda` is XGBoost's native name for the L2 term (alias of `reg_lambda`).
                'model__lambda'             : [1, 1.5, 2.0]
            }
        )
    }

def evaluate_model(name, model, X_test, y_test, out_dir):
    """Score a fitted classifier on held-out data and save diagnostic plots.

    Parameters
    ----------
    name : str
        Label used in the plot titles and as the file-name prefix.
    model : estimator
        Fitted object exposing ``predict_proba``; column 1 of its output is
        taken as the positive-class probability.
    X_test, y_test
        Held-out features and true labels.
    out_dir : str
        Existing directory that receives ``{name}_roc.png`` and
        ``{name}_calibration.png``.

    Returns
    -------
    dict
        Keys ``accuracy``, ``roc_auc``, ``log_loss`` and ``brier``.
    """
    positive_proba = model.predict_proba(X_test)[:, 1]
    # Hard labels come from a fixed 0.5 cut-off on the positive-class probability.
    hard_labels = (positive_proba >= 0.5).astype(int)

    metrics = {}
    metrics['accuracy'] = accuracy_score(y_test, hard_labels)
    metrics['roc_auc'] = roc_auc_score(y_test, positive_proba)
    metrics['log_loss'] = log_loss(y_test, positive_proba)
    metrics['brier'] = brier_score_loss(y_test, positive_proba)

    # ROC curve: the display object owns the figure it draws on.
    roc_display = RocCurveDisplay.from_predictions(y_test, positive_proba)
    roc_display.ax_.set_title(f'ROC – {name}')
    roc_display.figure_.tight_layout()
    roc_display.figure_.savefig(os.path.join(out_dir, f'{name}_roc.png'))
    plt.close(roc_display.figure_)

    # Reliability diagram: observed frequency against mean predicted probability per bin.
    observed_freq, mean_predicted = calibration_curve(y_test, positive_proba, n_bins=10)
    cal_fig, cal_ax = plt.subplots()
    cal_ax.plot(mean_predicted, observed_freq, marker='o')
    cal_ax.plot([0, 1], [0, 1], '--')  # diagonal = perfect calibration
    cal_ax.set_xlabel('Predicted probability')
    cal_ax.set_ylabel('True probability')
    cal_ax.set_title(f'Calibration – {name}')
    cal_fig.tight_layout()
    cal_fig.savefig(os.path.join(out_dir, f'{name}_calibration.png'))
    plt.close(cal_fig)

    return metrics


In [4]:

def _effective_n_iter(param_grid, n_iter):
    """Cap ``n_iter`` at the number of combinations in an all-list grid."""
    sizes = []
    for values in param_grid.values():
        if hasattr(values, 'rvs'):
            # A continuous distribution has no finite size; leave n_iter alone.
            return n_iter
        sizes.append(len(values))
    grid_size = int(np.prod(sizes)) if sizes else 1
    return min(n_iter, grid_size)


def _json_default(obj):
    """Convert numpy scalars to native Python values for ``json.dumps``."""
    return obj.item() if hasattr(obj, 'item') else str(obj)


def nested_cv_training(X, y, 
                       outer_splits=OUTER_SPLITS,
                       inner_splits=INNER_SPLITS,
                       n_iter=N_RANDOM_ITER,
                       out_dir=OUT_DIR):
    """Run nested time-series cross-validation for every candidate model.

    For each outer ``TimeSeriesSplit`` fold, every model returned by
    ``get_models_and_params`` is tuned on the training part with an inner
    ``TimeSeriesSplit`` search (scored by ``neg_log_loss``). The best
    estimator is then scored on the fold's test part with ``evaluate_model``.
    The tuned pipeline is pickled per model and fold, and all fold metrics
    are written to ``nested_cv_results.csv``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
        NOTE(review): TimeSeriesSplit assumes rows are in chronological order - confirm upstream.
    y : pandas.Series
        Target aligned row-for-row with ``X``.
    outer_splits : int
        Number of outer evaluation folds.
    inner_splits : int
        Number of inner folds used during hyper-parameter search.
    n_iter : int
        Candidates sampled by each randomized search. It is capped at the
        size of the grid when the grid is smaller.
    out_dir : str
        Output directory for plots, pickles and the CSV; created if missing.

    Returns
    -------
    None
        The per-model mean metrics are shown with ``display``.
    """
    # The config cell only creates the default OUT_DIR. Ensure a caller-supplied
    # directory exists up front instead of failing after the first long fit.
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    outer_cv = TimeSeriesSplit(n_splits=outer_splits)
    models_params = get_models_and_params()
    all_results = []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
        print(f'\n=== Outer Fold {fold+1}/{outer_splits} ===')
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # The searches clone the pipeline, so this unfitted object can be shared by all models.
        preprocessor = build_preprocessor(X_train)

        for mdl_name, (mdl_obj, param_grid) in models_params.items():
            start = time.time()
            print(f'Training {mdl_name}')

            pipe = Pipeline([('preprocess', preprocessor),
                             ('model', mdl_obj)])

            if mdl_name == 'log_reg':
                # Small grid: search it exhaustively.
                search = GridSearchCV(
                    pipe, param_grid=param_grid,
                    cv=TimeSeriesSplit(n_splits=inner_splits),
                    scoring='neg_log_loss',
                    n_jobs=-1, verbose=0
                )
            else:
                search = RandomizedSearchCV(
                    pipe, param_distributions=param_grid,
                    # Asking for more draws than the grid holds makes sklearn warn
                    # and run the whole grid anyway; capping avoids the warning.
                    n_iter=_effective_n_iter(param_grid, n_iter),
                    cv=TimeSeriesSplit(n_splits=inner_splits),
                    scoring='neg_log_loss',
                    n_jobs=-1, verbose=0,
                    random_state=RANDOM_SEED
                )

            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            dur = time.time() - start  # covers the whole search, including the final refit

            metrics = evaluate_model(f'{mdl_name}_fold{fold+1}', best_model, X_test, y_test, out_dir)
            metrics.update({
                'model'         : mdl_name,
                'outer_fold'    : fold+1,
                'train_samples' : len(train_idx),
                'test_samples'  : len(test_idx),
                'best_params'   : search.best_params_,
                'fit_time_sec'  : round(dur, 2)
            })

            # default= keeps a non-native numpy scalar in best_params from aborting the run.
            print(json.dumps(metrics, indent=2, default=_json_default))
            all_results.append(metrics)

            joblib.dump(best_model, os.path.join(out_dir, f'{mdl_name}_fold{fold+1}.pkl'))

    res_df = pd.DataFrame(all_results)
    res_df.to_csv(os.path.join(out_dir, 'nested_cv_results.csv'), index=False)
    display(res_df.groupby('model')[['accuracy','roc_auc','log_loss','brier']].mean())


## Start Training

In [5]:

# Read the processed training set, then run the nested cross-validation on it.
X, y = load_data(path=DATA_PATH)
nested_cv_training(X=X, y=y)



=== Outer Fold 1/5 ===
Training log_reg
{
  "accuracy": 0.6662374245472837,
  "roc_auc": 0.7283198181657823,
  "log_loss": 0.6072991304698281,
  "brier": 0.21032647942952495,
  "model": "log_reg",
  "outer_fold": 1,
  "train_samples": 12428,
  "test_samples": 12425,
  "best_params": {
    "model__C": 0.004641588833612777,
    "model__penalty": "l2"
  },
  "fit_time_sec": 3.65
}
Training decision_tree




{
  "accuracy": 0.6461971830985915,
  "roc_auc": 0.7092960577747114,
  "log_loss": 0.6217800792542353,
  "brier": 0.21610371493872776,
  "model": "decision_tree",
  "outer_fold": 1,
  "train_samples": 12428,
  "test_samples": 12425,
  "best_params": {
    "model__min_samples_leaf": 30,
    "model__max_depth": 3
  },
  "fit_time_sec": 1.71
}
Training random_forest
{
  "accuracy": 0.6692957746478874,
  "roc_auc": 0.7286834130169082,
  "log_loss": 0.6108120116332484,
  "brier": 0.2112961843398816,
  "model": "random_forest",
  "outer_fold": 1,
  "train_samples": 12428,
  "test_samples": 12425,
  "best_params": {
    "model__n_estimators": 600,
    "model__min_samples_leaf": 4,
    "model__max_features": "log2",
    "model__max_depth": 10
  },
  "fit_time_sec": 211.16
}
Training xgboost
{
  "accuracy": 0.670261569416499,
  "roc_auc": 0.7333681719021943,
  "log_loss": 0.6058534743014297,
  "brier": 0.20922230747998907,
  "model": "xgboost",
  "outer_fold": 1,
  "train_samples": 12428,
  "te



{
  "accuracy": 0.688933601609658,
  "roc_auc": 0.7573124326435325,
  "log_loss": 0.5845285000578558,
  "brier": 0.20015279689774498,
  "model": "log_reg",
  "outer_fold": 2,
  "train_samples": 24853,
  "test_samples": 12425,
  "best_params": {
    "model__C": 0.1,
    "model__penalty": "l1"
  },
  "fit_time_sec": 5.73
}
Training decision_tree




{
  "accuracy": 0.6707444668008048,
  "roc_auc": 0.7332835214183979,
  "log_loss": 0.6029508490471458,
  "brier": 0.20761722384543244,
  "model": "decision_tree",
  "outer_fold": 2,
  "train_samples": 24853,
  "test_samples": 12425,
  "best_params": {
    "model__min_samples_leaf": 1,
    "model__max_depth": 3
  },
  "fit_time_sec": 3.57
}
Training random_forest




{
  "accuracy": 0.6867605633802817,
  "roc_auc": 0.7535456009114151,
  "log_loss": 0.5887979178897134,
  "brier": 0.20183108173265718,
  "model": "random_forest",
  "outer_fold": 2,
  "train_samples": 24853,
  "test_samples": 12425,
  "best_params": {
    "model__n_estimators": 600,
    "model__min_samples_leaf": 2,
    "model__max_features": 0.8,
    "model__max_depth": 5
  },
  "fit_time_sec": 490.13
}
Training xgboost




{
  "accuracy": 0.6885311871227364,
  "roc_auc": 0.7578007573152311,
  "log_loss": 0.583501585016769,
  "brier": 0.1998858231225601,
  "model": "xgboost",
  "outer_fold": 2,
  "train_samples": 24853,
  "test_samples": 12425,
  "best_params": {
    "model__subsample": 0.8,
    "model__n_estimators": 500,
    "model__max_depth": 5,
    "model__learning_rate": 0.01,
    "model__lambda": 1,
    "model__gamma": 0.2,
    "model__colsample_bytree": 0.8
  },
  "fit_time_sec": 117.48
}

=== Outer Fold 3/5 ===
Training log_reg
{
  "accuracy": 0.6936016096579477,
  "roc_auc": 0.7677087151877068,
  "log_loss": 0.5748475103532709,
  "brier": 0.196070248631826,
  "model": "log_reg",
  "outer_fold": 3,
  "train_samples": 37278,
  "test_samples": 12425,
  "best_params": {
    "model__C": 0.004641588833612777,
    "model__penalty": "l2"
  },
  "fit_time_sec": 12.4
}
Training decision_tree




{
  "accuracy": 0.6811267605633803,
  "roc_auc": 0.7441047125921076,
  "log_loss": 0.5949175655596691,
  "brier": 0.20437565883463216,
  "model": "decision_tree",
  "outer_fold": 3,
  "train_samples": 37278,
  "test_samples": 12425,
  "best_params": {
    "model__min_samples_leaf": 1,
    "model__max_depth": 3
  },
  "fit_time_sec": 5.68
}
Training random_forest




{
  "accuracy": 0.6938430583501006,
  "roc_auc": 0.7656839218463868,
  "log_loss": 0.5759716592667796,
  "brier": 0.19679861141585103,
  "model": "random_forest",
  "outer_fold": 3,
  "train_samples": 37278,
  "test_samples": 12425,
  "best_params": {
    "model__n_estimators": 600,
    "model__min_samples_leaf": 4,
    "model__max_features": "sqrt",
    "model__max_depth": 10
  },
  "fit_time_sec": 795.77
}
Training xgboost




{
  "accuracy": 0.6941649899396378,
  "roc_auc": 0.7680334760166693,
  "log_loss": 0.5732921422829148,
  "brier": 0.1957729283934396,
  "model": "xgboost",
  "outer_fold": 3,
  "train_samples": 37278,
  "test_samples": 12425,
  "best_params": {
    "model__subsample": 0.8,
    "model__n_estimators": 500,
    "model__max_depth": 5,
    "model__learning_rate": 0.01,
    "model__lambda": 1,
    "model__gamma": 0.2,
    "model__colsample_bytree": 0.8
  },
  "fit_time_sec": 139.59
}

=== Outer Fold 4/5 ===
Training log_reg
{
  "accuracy": 0.6711468812877264,
  "roc_auc": 0.7395373571299723,
  "log_loss": 0.6006806260045934,
  "brier": 0.2069290541590735,
  "model": "log_reg",
  "outer_fold": 4,
  "train_samples": 49703,
  "test_samples": 12425,
  "best_params": {
    "model__C": 0.021544346900318832,
    "model__penalty": "l1"
  },
  "fit_time_sec": 21.29
}
Training decision_tree




{
  "accuracy": 0.6647082494969819,
  "roc_auc": 0.7242311080378043,
  "log_loss": 0.61519457978959,
  "brier": 0.2128036517925937,
  "model": "decision_tree",
  "outer_fold": 4,
  "train_samples": 49703,
  "test_samples": 12425,
  "best_params": {
    "model__min_samples_leaf": 30,
    "model__max_depth": 5
  },
  "fit_time_sec": 8.04
}
Training random_forest




{
  "accuracy": 0.6696177062374246,
  "roc_auc": 0.735177871859795,
  "log_loss": 0.6027372054136446,
  "brier": 0.2081733255658836,
  "model": "random_forest",
  "outer_fold": 4,
  "train_samples": 49703,
  "test_samples": 12425,
  "best_params": {
    "model__n_estimators": 600,
    "model__min_samples_leaf": 4,
    "model__max_features": "sqrt",
    "model__max_depth": 10
  },
  "fit_time_sec": 1088.9
}
Training xgboost




{
  "accuracy": 0.6746076458752515,
  "roc_auc": 0.7405564868580121,
  "log_loss": 0.5988048683617336,
  "brier": 0.20640087569692556,
  "model": "xgboost",
  "outer_fold": 4,
  "train_samples": 49703,
  "test_samples": 12425,
  "best_params": {
    "model__subsample": 0.8,
    "model__n_estimators": 500,
    "model__max_depth": 5,
    "model__learning_rate": 0.01,
    "model__lambda": 1,
    "model__gamma": 0.2,
    "model__colsample_bytree": 0.8
  },
  "fit_time_sec": 155.97
}

=== Outer Fold 5/5 ===
Training log_reg




{
  "accuracy": 0.6580281690140845,
  "roc_auc": 0.7272563514301422,
  "log_loss": 0.6065555353643128,
  "brier": 0.2103056385314856,
  "model": "log_reg",
  "outer_fold": 5,
  "train_samples": 62128,
  "test_samples": 12425,
  "best_params": {
    "model__C": 0.021544346900318832,
    "model__penalty": "l1"
  },
  "fit_time_sec": 25.63
}
Training decision_tree




{
  "accuracy": 0.6426559356136821,
  "roc_auc": 0.705883182097763,
  "log_loss": 0.6240987815925528,
  "brier": 0.21759648620501257,
  "model": "decision_tree",
  "outer_fold": 5,
  "train_samples": 62128,
  "test_samples": 12425,
  "best_params": {
    "model__min_samples_leaf": 30,
    "model__max_depth": 5
  },
  "fit_time_sec": 9.72
}
Training random_forest




{
  "accuracy": 0.6542454728370222,
  "roc_auc": 0.7177635708706306,
  "log_loss": 0.6143357430835067,
  "brier": 0.213590957213991,
  "model": "random_forest",
  "outer_fold": 5,
  "train_samples": 62128,
  "test_samples": 12425,
  "best_params": {
    "model__n_estimators": 600,
    "model__min_samples_leaf": 4,
    "model__max_features": "sqrt",
    "model__max_depth": 10
  },
  "fit_time_sec": 1399.3
}
Training xgboost




{
  "accuracy": 0.6560160965794769,
  "roc_auc": 0.7242380530325444,
  "log_loss": 0.6096806240572814,
  "brier": 0.21159497322120047,
  "model": "xgboost",
  "outer_fold": 5,
  "train_samples": 62128,
  "test_samples": 12425,
  "best_params": {
    "model__subsample": 0.8,
    "model__n_estimators": 500,
    "model__max_depth": 5,
    "model__learning_rate": 0.01,
    "model__lambda": 1,
    "model__gamma": 0.2,
    "model__colsample_bytree": 0.8
  },
  "fit_time_sec": 172.6
}


model,accuracy,roc_auc,log_loss,brier
decision_tree,0.661087,0.72336,0.611788,0.211699
log_reg,0.67559,0.744027,0.594782,0.204757
random_forest,0.674753,0.740171,0.598531,0.206338
xgboost,0.676716,0.744799,0.594227,0.204575
