In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import torch
import joblib
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from functions import rm

# Single seed passed to every sampler, CV splitter and model in this notebook.
random_state = 101
# Directory the training CSVs are read from; it sits under the Colab Drive mount point.
path_csv = "/content/drive/MyDrive/Thesis Data/processed_data/Scaled/"

In [None]:
# Mount Google Drive so the files under `path_csv` are reachable (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

# Features go through the project helper `rm` (imported from `functions`).
# NOTE(review): what `rm` drops or changes is not visible in this notebook — confirm.
X_train = rm(pd.read_csv(path_csv + "ontime_reporting_X_train.csv"))
# Labels are flattened to a 1-D NumPy array; later cells index it positionally.
y_train = np.ravel(pd.read_csv(path_csv + "ontime_reporting_y_train.csv"))

## Tuned Logistic Regression model on SMOTE-RUS data

In [None]:
def objective_lr(trial):
    """Optuna objective for logistic regression trained on SMOTE + RUS resampled folds.

    Args:
        trial: Optuna trial from which ``C``, ``solver`` and ``penalty`` are drawn.

    Returns:
        Mean ROC AUC over a seeded, shuffled 3-fold stratified split of the
        module-level ``X_train`` / ``y_train``.
    """
    # The draw order C -> solver -> penalty is kept so a seeded sampler replays the same trials.
    reg_strength = trial.suggest_float("C", 0.001, 1000.0, log=True)  # log scale spans the wide range in few trials
    solver_name = trial.suggest_categorical("solver", ["sag", "saga", "newton-cholesky"])  # sag/saga want equally scaled features
    penalty_kind = trial.suggest_categorical("penalty", ["l2", None])

    classifier = LogisticRegression(
        C=reg_strength,
        solver=solver_name,
        penalty=penalty_kind,
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )

    # Samplers are pipeline steps, so they are refitted together with the model on every CV split.
    lr_pipeline = Pipeline(
        steps=[
            ("smote", SMOTE(random_state=random_state)),
            ("rus", RandomUnderSampler(random_state=random_state)),
            ("logreg", classifier),
        ]
    )

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    fold_aucs = cross_val_score(
        lr_pipeline, X_train, y_train, cv=folds, scoring="roc_auc", n_jobs=-1
    )
    return np.mean(fold_aucs)

In [None]:
# New maximisation study for logistic regression; the seeded TPE sampler keeps the trial sequence repeatable.
study = optuna.create_study(
    study_name="Logistic Regression Tuned SMOTE+RUS",
    direction="maximize",
    sampler=TPESampler(seed=random_state),
)
study.optimize(
    objective_lr,
    n_trials=5,
    show_progress_bar=True,
)

In [1]:
# Keep the winning hyper-parameters under their own name: `study` is rebound by the next model's section.
logreg_best_params = study.best_params # parameter dict of the best trial
print(logreg_best_params)

{'C': 0.012341115473953784, 'solver': 'saga', 'penalty': None}


In [None]:
# All four diagnostics use the matplotlib backend so each figure is rendered inline.
# The plotly `visualization.plot_slice` call only returned a Figure object; because it
# was neither the last expression of the cell nor shown explicitly, it was never displayed.
visualization.matplotlib.plot_optimization_history(study)  # Optuna trial optimization history.png
visualization.matplotlib.plot_param_importances(study)  # Optuna hyperparameter importance.png
visualization.matplotlib.plot_slice(study)  # Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study)  # Optuna trial timeline.png

<img src="./Images/Tuned models on SMOTE-ENN data/Logistic Regression/Optuna trial optimization history.png">

In [None]:
# Resample the full training set with the same SMOTE -> RUS chain used inside the objective.
pipeline = Pipeline(
    steps=[
        ("smote", SMOTE(random_state=random_state)),
        ("rus", RandomUnderSampler(random_state=random_state)),
    ]
)
X_smoterus, y_smoterus = pipeline.fit_resample(X_train, y_train)

# Refit logistic regression on the resampled data with the best parameters from the study.
tuned_lr = LogisticRegression(
    **logreg_best_params,
    random_state=random_state,
    n_jobs=-1,
    verbose=2,
)
tuned_lr.fit(X_smoterus, y_smoterus)

In [None]:
# Persist the Optuna study and the refitted model; the relative paths resolve against the working directory.
joblib.dump(study, 'study_logreg_tuned_SMOTE-RUS.pkl')
joblib.dump(tuned_lr, 'logreg_tuned_SMOTE-RUS.joblib')

## Tuned XGBoost model on SMOTE-RUS data

In [None]:
#https://stackoverflow.com/questions/55591063/how-to-perform-smote-with-cross-validation-in-sklearn-in-python

def objective_xgboost(trial):
    """Optuna objective for XGBoost trained on SMOTE + RUS resampled folds.

    The samplers are steps of the imblearn pipeline, so they are refitted
    together with the model on every cross-validation split.

    Args:
        trial: Optuna trial from which the XGBoost hyper-parameters are drawn.

    Returns:
        Mean ROC AUC over a seeded, shuffled 3-fold stratified split of the
        module-level ``X_train`` / ``y_train``.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 150, 350, step=50),
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-2, 1e-2, 15e-2, 1e-1, 2e-1]),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        # XGBoost documents subsample as (0, 1]. The previous lower bound of 0.0 let the
        # sampler draw values at or near zero, i.e. trees grown on (almost) no rows.
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 20),
    }

    steps = [
        ("smote", SMOTE(random_state=random_state)),
        ("rus", RandomUnderSampler(random_state=random_state)),
        ("xgb", xgb.XGBClassifier(**params, random_state=random_state, n_jobs=-1, verbosity=1, device="cuda")),
    ]

    xgb_pipeline = Pipeline(steps=steps)

    skfold = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)

    # No n_jobs here: the folds run sequentially and each fit targets the CUDA device.
    scores = cross_val_score(xgb_pipeline, X_train, y_train, cv=skfold, scoring="roc_auc")

    return np.mean(scores)

In [None]:
# New maximisation study for XGBoost; the seeded TPE sampler keeps the trial sequence repeatable.
study = optuna.create_study(
    study_name="XGBoost Tuned SMOTE-RUS",
    direction="maximize",
    sampler=TPESampler(seed=random_state),
)
study.optimize(
    objective_xgboost,
    n_trials=5,
    show_progress_bar=True,
)

In [None]:
# Keep the winning hyper-parameters under their own name: `study` is rebound by the next model's section.
xgboost_best_params = study.best_params # parameter dict of the best trial
print(xgboost_best_params)

In [None]:
# All four diagnostics use the matplotlib backend so each figure is rendered inline.
# The plotly `visualization.plot_slice` call only returned a Figure object; because it
# was neither the last expression of the cell nor shown explicitly, it was never displayed.
visualization.matplotlib.plot_optimization_history(study)  # Optuna trial optimization history.png
visualization.matplotlib.plot_param_importances(study)  # Optuna hyperparameter importance.png
visualization.matplotlib.plot_slice(study)  # Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study)  # Optuna trial timeline.png

<img src="./Images/Tuned models on SMOTE-ENN data/XGBoost/Optuna trial optimization history.png">

In [None]:
# Resample the full training set with the same SMOTE -> RUS chain used inside the objective.
pipeline = Pipeline(
    steps=[
        ("smote", SMOTE(random_state=random_state)),
        ("rus", RandomUnderSampler(random_state=random_state)),
    ]
)
X_smoterus, y_smoterus = pipeline.fit_resample(X_train, y_train)

# Refit XGBoost on the resampled data with the best parameters from the study.
tuned_xgb = xgb.XGBClassifier(
    **xgboost_best_params,
    random_state=random_state,
    n_jobs=-1,
    verbosity=2,
    device="cuda",
)
tuned_xgb.fit(X_smoterus, y_smoterus)

In [None]:
# Persist the Optuna study and the refitted model; the relative paths resolve against the working directory.
joblib.dump(study, 'study_XGBoost_tuned_SMOTE-RUS.pkl')
joblib.dump(tuned_xgb, 'XGBoost_tuned_SMOTE-RUS.joblib')

## Tuned Random Forests model on SMOTE-RUS data

In [None]:
def objective_rf(trial):
    """Optuna objective for a random forest trained on SMOTE + RUS resampled folds.

    Args:
        trial: Optuna trial from which the forest hyper-parameters are drawn.

    Returns:
        Mean ROC AUC over a seeded, shuffled 3-fold stratified split of the
        module-level ``X_train`` / ``y_train``.
    """
    # Keyword arguments are evaluated left to right, so the draw order stays
    # max_depth -> min_samples_leaf -> min_samples_split -> n_estimators.
    forest = RandomForestClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 5, 9),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        n_estimators=trial.suggest_int("n_estimators", 150, 350, step=50),
        random_state=random_state,
        verbose=1,
        n_jobs=-1,
    )

    # Samplers are pipeline steps, so they are refitted together with the model on every CV split.
    rf_pipeline = Pipeline(
        steps=[
            ("smote", SMOTE(random_state=random_state)),
            ("rus", RandomUnderSampler(random_state=random_state)),
            ("rf", forest),
        ]
    )

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    fold_aucs = cross_val_score(rf_pipeline, X_train, y_train, cv=folds, scoring="roc_auc")
    return np.mean(fold_aucs)

In [None]:
# New maximisation study for the random forest; the seeded TPE sampler keeps the trial sequence repeatable.
study = optuna.create_study(
    study_name="Random Forests Tuned SMOTE-RUS",
    direction="maximize",
    sampler=TPESampler(seed=random_state),
)
study.optimize(
    objective_rf,
    n_trials=5,
    show_progress_bar=True,
)

In [None]:
# Keep the winning hyper-parameters under their own name: `study` is rebound by the next model's section.
rf_best_params = study.best_params # parameter dict of the best trial
print(rf_best_params)

In [None]:
# All four diagnostics use the matplotlib backend so each figure is rendered inline.
# The plotly `visualization.plot_slice` call only returned a Figure object; because it
# was neither the last expression of the cell nor shown explicitly, it was never displayed.
visualization.matplotlib.plot_optimization_history(study)  # Optuna trial optimization history.png
visualization.matplotlib.plot_param_importances(study)  # Optuna hyperparameter importance.png
visualization.matplotlib.plot_slice(study)  # Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study)  # Optuna trial timeline.png

<img src="./Images/Tuned models on SMOTE-ENN data/Random Forests/Optuna trial optimization history.png">

In [None]:
# Resample the full training set with the same SMOTE -> RUS chain used inside the objective.
pipeline = Pipeline(
    steps=[
        ("smote", SMOTE(random_state=random_state)),
        ("rus", RandomUnderSampler(random_state=random_state)),
    ]
)
X_smoterus, y_smoterus = pipeline.fit_resample(X_train, y_train)

# Refit the random forest on the resampled data with the best parameters from the study.
tuned_rf = RandomForestClassifier(
    **rf_best_params,
    random_state=random_state,
    verbose=1,
    n_jobs=-1,
)
tuned_rf.fit(X_smoterus, y_smoterus)

In [None]:
# Persist the Optuna study and the refitted model; the relative paths resolve against the working directory.
joblib.dump(study, 'study_rf_tuned_SMOTE-RUS.pkl')
joblib.dump(tuned_rf, 'rf_tuned_SMOTE-RUS.joblib')

## Tuned TabNet model on SMOTE-RUS data

In [None]:
def objective_tabnet(trial):
    """Optuna objective for TabNet trained on SMOTE + RUS resampled folds.

    One ``TabNetClassifier`` instance is built per trial and fitted once per
    fold. Each held-out fold is passed to ``fit`` as ``eval_set`` (with
    ``patience=2``) and is also the data the fold's ROC AUC is computed on.

    Args:
        trial: Optuna trial from which the TabNet hyper-parameters are drawn.

    Returns:
        Mean ROC AUC over a seeded, shuffled 3-fold stratified split of the
        module-level ``X_train`` / ``y_train``.
    """
    # The draw order n_d -> n_steps -> gamma -> n_shared -> lambda_sparse is unchanged.
    width = trial.suggest_int("n_d", 8, 64, step=4)  # limited ram usage
    n_decision_steps = trial.suggest_int("n_steps", 2, 10)  # limited ram usage
    relaxation = trial.suggest_float("gamma", 1.0, 2.0, step=0.01)
    shared_blocks = trial.suggest_int("n_shared", 1, 5)
    sparsity_weight = trial.suggest_categorical("lambda_sparse", [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1])
    # "mask_type" (["sparsemax", "entmax"]) is intentionally left out of the search.

    classifier = TabNetClassifier(
        n_d=width,
        n_a=width,  # attention width is tied to the decision width
        n_steps=n_decision_steps,
        gamma=relaxation,
        n_shared=shared_blocks,
        lambda_sparse=sparsity_weight,
        seed=random_state,
        verbose=2,
        optimizer_params={"lr": 2e-2, "weight_decay": 1e-5},
        device_name="cuda",
    )

    def _fold_auc(fit_idx, held_idx):
        """Resample one training fold, fit the shared classifier, return the held-out ROC AUC."""
        X_fit, y_fit = X_train.iloc[fit_idx], y_train[fit_idx]
        X_held, y_held = X_train.iloc[held_idx].values, y_train[held_idx]

        # Oversample the minority class first, then undersample, on the training fold only.
        X_over, y_over = SMOTE(random_state=random_state).fit_resample(X_fit, y_fit)
        X_bal, y_bal = RandomUnderSampler(random_state=random_state).fit_resample(X_over, y_over)

        classifier.fit(
            X_train=X_bal.values,
            y_train=y_bal,
            eval_set=[(X_held, y_held)],
            eval_metric=["auc"],
            max_epochs=4,
            patience=2,
            batch_size=44000,
            virtual_batch_size=44000,
        )

        positive_proba = classifier.predict_proba(X_held)[:, 1]
        return roc_auc_score(y_held, positive_proba)

    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    return np.mean([_fold_auc(fit_idx, held_idx) for fit_idx, held_idx in splitter.split(X_train, y_train)])

In [None]:
# New maximisation study for TabNet; the seeded TPE sampler keeps the trial sequence repeatable.
study = optuna.create_study(
    study_name="TabNet Tuned SMOTE-RUS",
    direction="maximize",
    sampler=TPESampler(seed=random_state),
)
# gc_after_trial asks Optuna to run garbage collection after each trial.
study.optimize(
    objective_tabnet,
    n_trials=5,
    show_progress_bar=True,
    gc_after_trial=True,
)

In [None]:
# Hyper-parameters of the best TabNet trial; they are unpacked into the final model further down.
tabnet_best_params = study.best_params # parameter dict of the best trial
print(tabnet_best_params)

In [None]:
# All four diagnostics use the matplotlib backend so each figure is rendered inline.
# The plotly `visualization.plot_slice` call only returned a Figure object; because it
# was neither the last expression of the cell nor shown explicitly, it was never displayed.
visualization.matplotlib.plot_optimization_history(study)  # Optuna trial optimization history.png
visualization.matplotlib.plot_param_importances(study)  # Optuna hyperparameter importance.png
visualization.matplotlib.plot_slice(study)  # Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study)  # Optuna trial timeline.png

<img src="./Images/Tuned models on SMOTE-ENN data/TabNet/Optuna trial optimization history.png">

In [None]:
# Resample the full training set with the same SMOTE -> RUS chain used inside the objective.
pipeline = Pipeline(
    steps=[
        ("smote", SMOTE(random_state=random_state)),
        ("rus", RandomUnderSampler(random_state=random_state)),
    ]
)
X_smoterus, y_smoterus = pipeline.fit_resample(X_train, y_train)

# Rebuild TabNet from the best trial; n_a mirrors n_d exactly as in the objective.
tabnet_tuned = TabNetClassifier(
    **tabnet_best_params,
    n_a=tabnet_best_params["n_d"],
    seed=random_state,
    verbose=2,
    optimizer_params={"lr": 2e-2, "weight_decay": 1e-5},
    device_name="cuda",
)
# No eval_set is passed for the final fit.
tabnet_tuned.fit(
    X_smoterus.values,
    y_smoterus,
    max_epochs=4,
    batch_size=44000,
    virtual_batch_size=44000,
)

In [None]:
# Persist the Optuna study with joblib; the relative paths resolve against the working directory.
joblib.dump(study, 'study_tabnet_tuned_SMOTE-RUS.pkl')
# The whole fitted TabNetClassifier object is written with torch.save.
# NOTE(review): presumably the loader needs pytorch_tabnet importable to restore it — confirm.
torch.save(tabnet_tuned, 'TabNet_tuned_SMOTE-RUS.pt')