In [None]:
! pip install optuna
! pip install

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import joblib
import optuna
import torch
from optuna import Trial, visualization
from optuna.samplers import TPESampler

random_state = 101
path_csv = "/content/drive/MyDrive/Thesis Data/processed_data/Scaled/"

In [None]:
#https://www.kaggle.com/code/yus002/logistic-regression-optuna-tuning
#Code used from the above author
#Helps to reduce size in memory of the data so that models can run faster

def rm(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
from google.colab import drive
drive.mount('/content/drive')

X_train = pd.read_csv(path_csv + "ontime_reporting_X_train.csv")
y_train = pd.read_csv(path_csv + "ontime_reporting_y_train.csv")

X_train = rm(X_train)
y_train = np.ravel(y_train)

## Tuned Logistic Regression model on imbalanced data

In [None]:
#https://emerginginvestigators.org/articles/comparison-of-the-ease-of-use-and-accuracy-of-two-machine-learning-algorithms-forestry-case-study/pdf#:~:text=The%20key%20difference%20between%20newton,to%20newton%2Dcg%20and%20lbfgs.
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#Also paper by Dahir

def objective_lr(trial):
    params = {
        "C": trial.suggest_float("C", 0.001, 1000.0, log=True), #log=True helps to search a large range efficiently with fewer trials
        "solver": trial.suggest_categorical("solver", ["sag","saga","newton-cholesky"]), #"sag" and "saga" works on features with the same scale
        "penalty": trial.suggest_categorical("penalty", ["l2", None])
    }

    model_lrClass = LogisticRegression(**params, random_state=random_state, n_jobs=-1, verbose=2)

    skfold = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)

    scores = cross_val_score(model_lrClass, X_train, y_train, scoring="roc_auc", n_jobs=-1, cv=skfold)

    return np.mean(scores)

In [None]:
study = optuna.create_study(direction='maximize', study_name='Logistic Regression Tuned Imbalanced', sampler=TPESampler(seed=random_state))
study.optimize(objective_lr, n_trials=20, show_progress_bar=True)

In [1]:
best_params = study.best_params #{'C': 0.012341115473953784, 'solver': 'saga', 'penalty': None}
print(best_params)

{'C': 0.012341115473953784, 'solver': 'saga', 'penalty': None}


In [None]:
visualization.matplotlib.plot_optimization_history(study) #Optuna trial optimaization history.png
visualization.matplotlib.plot_param_importances(study) #Optuna hyperparameter importance.png
visualization.plot_slice(study) #Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study) #Optuna trial timeline.png

<img src="./Images/Tuned models on imbalanced data/Logistic Regression/Optuna trial optimization history.png">

In [None]:
tuned_lr_model = LogisticRegression(**best_params, random_state=random_state, n_jobs=-1, verbose=2)
tuned_lr_model.fit(X_train, y_train)

In [None]:
joblib.dump(tuned_lr_model, 'logreg_tuned_imbalance.joblib')

## Tuned XGBoost model on imbalanced data

In [None]:
def objective_xgboost(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 150, 350, step=50),
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-2, 1e-2, 15e-2, 1e-1, 2e-1]),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "subsample": trial.suggest_float("subsample", 0.0, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 20),
    }

    model_xgbClass = xgb.XGBClassifier(**params, random_state=random_state, n_jobs=-1, verbosity=2, device="cuda")
    
    skfold = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)
    
    scores = cross_val_score(model_xgbClass, X_train, y_train, cv=skfold, scoring="roc_auc")  

    return np.mean(scores)

In [None]:
study = optuna.create_study(direction='maximize', study_name='XGBoost Tuned Imbalanced', sampler=TPESampler(seed=random_state))
study.optimize(objective_xgboost, n_trials=20, show_progress_bar=True) 

In [1]:
xgboost_best_params = study.best_params #{'n_estimators': 350, 'learning_rate': 0.05, 'max_depth': 16, 'subsample': 0.9951754722614875, 'colsample_bytree': 0.8668977611468552, 'min_child_weight': 8}
print(xgboost_best_params)

{'n_estimators': 350, 'learning_rate': 0.05, 'max_depth': 16, 'subsample': 0.9951754722614875, 'colsample_bytree': 0.8668977611468552, 'min_child_weight': 8}


In [None]:
visualization.matplotlib.plot_optimization_history(study) #Optuna trial optimaization history.png
visualization.matplotlib.plot_param_importances(study) #Optuna hyperparameter importance.png
visualization.plot_slice(study) #Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study) #Optuna trial timeline.png

<img src="./Images/Tuned models on imbalanced data/XGBoost/Optuna trial optimaization history.png">

In [None]:
tuned_xgb_imbalanced = xgb.XGBClassifier(**xgboost_best_params, random_state=random_state, n_jobs=-1, verbosity=2, device="cuda") 
tuned_xgb_imbalanced.fit(X_train, y_train)

In [None]:
joblib.dump(tuned_xgb_imbalanced, 'XGBoost_tuned_imbalance.joblib')

## Tuned Random Forests model on imbalanced data

In [None]:
def objective_rf(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 9),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "n_estimators": trial.suggest_int("n_estimators", 150, 350, step=50)
    }

    model_rfClass = RandomForestClassifier(**params, random_state=random_state, verbose=1, n_jobs=-1)
        
    skfold = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)
    
    scores = cross_val_score(model_rfClass, X_train, y_train, cv=skfold, scoring="roc_auc")  

    return np.mean(scores)

In [None]:
study = optuna.create_study(direction='maximize', study_name='Random Forests Tuned Imbalanced', sampler=TPESampler(seed=random_state))
study.optimize(objective_rf, n_trials=20, show_progress_bar=True)

In [9]:
rf_best_params = study.best_params #{'max_depth': 20, 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 250}
print(rf_best_params)

{'max_depth': 20, 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 250}


In [None]:
visualization.matplotlib.plot_optimization_history(study) #Optuna trial optimaization history.png
visualization.matplotlib.plot_param_importances(study) #Optuna hyperparameter importance.png
visualization.plot_slice(study) #Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study) #Optuna trial timeline.png

<img src="./Images/Tuned models on imbalanced data/Random Forests/Optuna trial optimization history.png">

In [None]:
tuned_rf_imbalanced = RandomForestClassifier(**rf_best_params, random_state=random_state, verbose=1, n_jobs=-1)
tuned_rf_imbalanced.fit(X_train, y_train)

In [None]:
joblib.dump(tuned_rf_imbalanced, 'rf_tuned_imbalance.joblib')

## Tuned TabNet model on imbalanced data

In [None]:
def objective_tabnet(trial):

    params = {
        "n_d": trial.suggest_int("n_d", 8,  32, step=4),
        "n_steps": trial.suggest_int("n_steps", 2, 10),
        "gamma": trial.suggest_float("gamma", 1.0, 2.0, step=0.01),
        "n_shared": trial.suggest_int("n_shared", 1, 5),
        "lambda_sparse": trial.suggest_categorical("lambda_sparse", [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]),
    }

    model_tabnetClass = TabNetClassifier(**params, seed=random_state, verbose=2, optimizer_params=dict(lr=2e-2, weight_decay=1e-5), n_a=params["n_d"], device_name="cuda")

    skfold = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)

    auc_scores = []

    for train_index, val_index in skfold.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]



        model_tabnetClass.fit(X_train = X_train_fold.values,
                              y_train = y_train_fold,
                              eval_set = [(X_val_fold.values, y_val_fold)],
                              patience=2,
                              max_epochs=4,
                              batch_size=14000,
                              virtual_batch_size=14000,
                              eval_metric=["auc"]
                              )

        y_pred = model_tabnetClass.predict_proba(X_val_fold.values)

        auc_scores.append(roc_auc_score(y_val_fold, y_pred[:,1]))

    return np.mean(auc_scores)

In [None]:
study = optuna.create_study(direction="maximize", study_name='TabNet Tuned Imbalanced', sampler=TPESampler(seed=random_state))
study.optimize(objective_tabnet, n_trials=20, show_progress_bar=True, gc_after_trial=True)

In [8]:
tabnet_best_params = study.best_params #{'n_d': 64, 'n_steps': 4, 'gamma': 1.78, 'n_shared': 1, 'lambda_sparse': 0.001}
print(tabnet_best_params)

{'n_d': 64, 'n_steps': 4, 'gamma': 1.78, 'n_shared': 1, 'lambda_sparse': 0.001}


In [None]:
visualization.matplotlib.plot_optimization_history(study) #Optuna trial optimaization history.png
visualization.matplotlib.plot_param_importances(study) #Optuna hyperparameter importance.png
visualization.plot_slice(study) #Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study) #Optuna trial timeline.png

<img src="./Images/Tuned models on imbalanced data/TabNet/Optuna trial optimization history.png">

In [None]:
tabnet_tuned_imbalance = TabNetClassifier(**tabnet_best_params, seed=random_state, verbose=2, optimizer_params=dict(lr=2e-2, weight_decay=1e-5), n_a=tabnet_best_params["n_d"], device_name="cuda")
tabnet_tuned_imbalance.fit(X_train.values, y_train, max_epochs=4, batch_size=14000, virtual_batch_size=14000)

In [None]:
torch.save(tabnet_tuned_imbalance, 'TabNet_tuned_imbalance.pt')