In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import pandas as pd
import numpy as np
import joblib
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

# Seed passed to the oversampler, CV splitter, estimators and Optuna sampler below.
random_state = 101
# Directory the training CSVs are read from. It lives under the Google Drive
# mount point, so Drive has to be mounted before any file is read.
path_csv = "/content/drive/MyDrive/Thesis Data/processed_data/Scaled/"

In [None]:
#https://www.kaggle.com/code/yus002/logistic-regression-optuna-tuning
#Code used from the above author
#Helps to reduce size in memory of the data so that models can run faster

def rm(df, allow_float16=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that holds their range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose min/max fit
        are stored as float16. float16 keeps only about three significant
        decimal digits, so pass False to stop at float32 when precision of
        the features matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    * Only NumPy signed-integer, unsigned-integer and float columns are
      considered. Bool, object, datetime, categorical and pandas extension
      dtypes are left untouched.
    * Range checks are inclusive, so a column spanning exactly
      [-128, 127] is stored as int8.
    * The decision is based on the column's min/max only; float downcasts can
      therefore round values.
    * A column is never widened: candidates that are not narrower than the
      current dtype are ignored.
    * Prints the memory usage before and after, plus the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes keyed by NumPy kind code, narrowest first.
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)
    targets_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32)),
        'f': (np.finfo, float_targets),
    }

    for col in df.columns:
        col_type = df[col].dtype
        # Skip anything that is not a plain NumPy int/uint/float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in targets_by_kind:
            continue

        limits_of, targets = targets_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for target in targets:
            if np.dtype(target).itemsize >= col_type.itemsize:
                # Remaining candidates are no narrower than the current dtype.
                break
            limits = limits_of(target)
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # which leaves the column unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
from google.colab import drive
# Mount Google Drive; path_csv points inside this mount point.
drive.mount('/content/drive')

# Read the training features and the training labels from CSV.
X_train = pd.read_csv(path_csv + "ontime_reporting_X_train.csv")
y_train = pd.read_csv(path_csv + "ontime_reporting_y_train.csv")

# Downcast the feature columns with rm() to cut memory use.
X_train = rm(X_train)
# Flatten the label frame into a 1-D NumPy array.
y_train = np.ravel(y_train)

## Tuned Logistic Regression model on ROS data

In [None]:
def objective_lr(trial):
    """Optuna objective for logistic regression with random oversampling.

    Draws ``C`` (log-uniform in [0.001, 1000]), ``solver`` and ``penalty``
    from ``trial``, builds an oversample -> LogisticRegression pipeline and
    scores it with shuffled 3-fold stratified cross-validation on the global
    ``X_train`` / ``y_train``.

    Returns the mean ROC AUC over the folds, which the study maximises.

    NOTE(review): when ``penalty`` is None the sampled ``C`` is presumably
    ignored by scikit-learn -- confirm against the LogisticRegression docs.
    """
    # Suggestions are requested in the order C -> solver -> penalty.
    # The log scale lets a wide C range be explored with few trials.
    c_value = trial.suggest_float("C", 0.001, 1000.0, log=True)
    # "sag" and "saga" rely on the features sharing a common scale.
    solver_name = trial.suggest_categorical("solver", ["sag","saga","newton-cholesky"])
    penalty_kind = trial.suggest_categorical("penalty", ["l2", None])

    oversampler = RandomOverSampler(sampling_strategy="minority", random_state=random_state)
    classifier = LogisticRegression(
        C=c_value,
        solver=solver_name,
        penalty=penalty_kind,
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )

    # The sampler sits inside the pipeline, so it is part of what gets
    # fitted on each cross-validation split.
    lr_pipeline = Pipeline(steps=[("ros", oversampler), ("logreg", classifier)])

    folds = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)
    auc_per_fold = cross_val_score(
        lr_pipeline, X_train, y_train, scoring="roc_auc", n_jobs=-1, cv=folds
    )

    return np.mean(auc_per_fold)

In [None]:
# Create a maximising study with a seeded TPE sampler, then run 20 trials of
# objective_lr (the objective returns the mean cross-validated ROC AUC).
study = optuna.create_study(direction='maximize', study_name='Logistic Regression Tuned ROS', sampler=TPESampler(seed=random_state))
study.optimize(objective_lr, n_trials=20, show_progress_bar=True)

In [None]:
# Best hyperparameters of the study currently bound to `study`; run this cell
# right after the logistic-regression study, because `study` is reassigned
# further down for XGBoost.
logreg_best_params = study.best_params
# Values recorded from a previous run:
#{'C': 0.012341115473953784, 'solver': 'saga', 'penalty': None}
print(logreg_best_params)

In [None]:
# All four diagnostics use Optuna's matplotlib backend. The Plotly variant
# (visualization.plot_slice) only returns a figure object, which is not shown
# when the call is not the last expression of the cell, so the slice plot
# would silently go missing.
visualization.matplotlib.plot_optimization_history(study) #Optuna trial optimaization history.png
visualization.matplotlib.plot_param_importances(study) #Optuna hyperparameter importance.png
visualization.matplotlib.plot_slice(study) #Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study) #Optuna trial timeline.png

In [None]:
# The hyperparameters were tuned with random oversampling applied inside the
# CV pipeline, so the final model has to be trained on oversampled data too;
# fitting on the raw X_train would produce a model for the imbalanced set.
# The sampler settings mirror the ones used in objective_lr.
X_train_ros, y_train_ros = RandomOverSampler(
    sampling_strategy="minority", random_state=random_state
).fit_resample(X_train, y_train)

tuned_lr_model = LogisticRegression(**logreg_best_params, random_state=random_state, n_jobs=-1, verbose=2)
tuned_lr_model.fit(X_train_ros, y_train_ros)

In [None]:
# Persist the fitted logistic-regression model; the relative path resolves
# against the current working directory.
joblib.dump(tuned_lr_model, 'lr_tuned_ROS.joblib')

## Tuned XGBoost model on ROS data

In [None]:
#https://stackoverflow.com/questions/55591063/how-to-perform-smote-with-cross-validation-in-sklearn-in-python

def objective_xgboost(trial):
    """Optuna objective for an XGBoost classifier with random oversampling.

    Draws six booster hyperparameters from ``trial``, builds an
    oversample -> XGBClassifier pipeline and scores it with shuffled 3-fold
    stratified cross-validation on the global ``X_train`` / ``y_train``.

    Returns the mean ROC AUC over the folds, which the study maximises.

    NOTE(review): ``subsample`` is searched from 0.0, while XGBoost documents
    its valid range as (0, 1] -- consider raising the lower bound.
    """
    # Suggestions are requested in the same order on every trial.
    n_estimators = trial.suggest_int("n_estimators", 150, 350, step=50)
    learning_rate = trial.suggest_categorical("learning_rate", [5e-2, 1e-2, 15e-2, 1e-1, 2e-1])
    max_depth = trial.suggest_int("max_depth", 3, 20)
    subsample = trial.suggest_float("subsample", 0.0, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.8, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 0, 20)

    oversampler = RandomOverSampler(sampling_strategy="minority", random_state=random_state)
    booster = xgb.XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        random_state=random_state,
        n_jobs=-1,
        verbosity=1,
        device="cuda",
    )

    # The sampler sits inside the pipeline, so it is part of what gets
    # fitted on each cross-validation split.
    xgb_pipeline = Pipeline(steps=[("ros", oversampler), ("xgb", booster)])

    folds = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)
    auc_per_fold = cross_val_score(xgb_pipeline, X_train, y_train, cv=folds, scoring="roc_auc")

    return np.mean(auc_per_fold)

In [None]:
# Create a maximising study with a seeded TPE sampler, then run 20 trials of
# objective_xgboost. This rebinds `study`, replacing the logistic-regression
# study created earlier in the notebook.
study = optuna.create_study(direction='maximize', study_name='XGBoost Tuned ROS', sampler=TPESampler(seed=random_state))
study.optimize(objective_xgboost, n_trials=20, show_progress_bar=True)

In [None]:
# Best hyperparameters of the study currently bound to `study` (expected to be
# the XGBoost study created in the preceding cell).
xgboost_best_params = study.best_params
# Values recorded from a previous run:
#{'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 16, 'subsample': 0.8180174838276538, 'colsample_bytree': 0.8949014373055646, 'min_child_weight': 17}
print(xgboost_best_params)

In [None]:
# All four diagnostics use Optuna's matplotlib backend. The Plotly variant
# (visualization.plot_slice) only returns a figure object, which is not shown
# when the call is not the last expression of the cell, so the slice plot
# would silently go missing.
visualization.matplotlib.plot_optimization_history(study) #Optuna trial optimaization history.png
visualization.matplotlib.plot_param_importances(study) #Optuna hyperparameter importance.png
visualization.matplotlib.plot_slice(study) #Optuna hyperparameter slice plot.png
visualization.matplotlib.plot_timeline(study) #Optuna trial timeline.png

In [None]:
# The hyperparameters were tuned with random oversampling applied inside the
# CV pipeline, so the final model has to be trained on oversampled data too;
# fitting on the raw X_train would produce a model for the imbalanced set.
# The sampler settings mirror the ones used in objective_xgboost.
X_train_ros, y_train_ros = RandomOverSampler(
    sampling_strategy="minority", random_state=random_state
).fit_resample(X_train, y_train)

# The variable keeps its existing name because the save cell refers to it.
tuned_xgb_imbalanced = xgb.XGBClassifier(**xgboost_best_params, random_state=random_state, n_jobs=-1, verbosity=2, device="cuda")
tuned_xgb_imbalanced.fit(X_train_ros, y_train_ros)

In [None]:
# Persist the fitted XGBoost model; the relative path resolves against the
# current working directory.
joblib.dump(tuned_xgb_imbalanced, 'xgb_tuned_ROS.joblib')

## Tuned Random Forests model on ROS data

## Tuned TabNet model on ROS data