In [2]:
import pandas as pd
import numpy as np
import pickle
import joblib
import torch
from sklearn.linear_model import LogisticRegression
from pytorch_tabnet.tab_model import TabNetClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# Seed passed to every baseline model in this notebook.
random_state = 101

# Relative directory the training CSV files are read from.
path_csv = "../../../Thesis_data/processed_data/"
# Relative directory the fitted baseline models are written to.
path_model = "../../../Thesis_data/Models/"

In [1]:
# Source: https://www.kaggle.com/code/yus002/logistic-regression-optuna-tuning
# The function below is adapted from the notebook linked above.
# It reduces the in-memory size of the data so that the models can run faster.

def rm(df, allow_float16=True):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Each signed-integer, unsigned-integer and floating-point column is cast to
    the narrowest dtype of the same family whose range contains the column's
    minimum and maximum. Columns are never widened, and columns of any other
    dtype (object, bool, datetime, categorical, extension dtypes, ...) are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default True
        When True, float columns whose values fit the float16 range are cast
        to float16. float16 keeps only about 3 significant decimal digits, so
        pass False to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy numeric dtypes are handled; bool, datetime, object
        # and pandas extension dtypes must not be pushed through a float cast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        else:
            candidates, limits = float_types, np.finfo

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered narrow -> wide; stop before widening.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # NaN extremes (empty / all-NaN column) fail both comparisons, so
            # such columns keep their current dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte starting frame to avoid ZeroDivisionError.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [3]:
# Read the training features and the training labels from the processed-data folder.
X_train, y_train = (
    pd.read_csv(path_csv + csv_name)
    for csv_name in ("ontime_reporting_X_train.csv", "ontime_reporting_y_train.csv")
)

In [4]:
# Flatten the labels into a 1-D array and shrink the feature frame's dtypes
# to cut memory usage.
y_train = np.asarray(y_train).ravel()
X_train = rm(X_train)

Memory usage of dataframe is 1256.91 MB
Memory usage after optimization is: 242.04 MB
Decreased by 80.7%


## Logistic Regression Baseline

In [5]:
# Fit a logistic regression with library-default hyperparameters as a baseline.
logreg_baseline = LogisticRegression(random_state=random_state, n_jobs=-1)
logreg_baseline.fit(X_train, y_train)

# Persist the fitted model. The context manager closes (and flushes) the file
# handle even if pickling raises, instead of leaving it open.
with open(path_model + "logreg_baseline.pkl", 'wb') as model_file:
    pickle.dump(logreg_baseline, model_file)

## XGBoost Baseline

In [6]:
# Fit an XGBoost classifier with library-default hyperparameters as a baseline.
# Author's note kept from the original cell: device_name="cuda",
# pip install xgboost==1.7.5 --user
xgb_class_baseline = xgb.XGBClassifier(random_state=random_state, n_jobs=-1)
xgb_class_baseline.fit(X_train, y_train)

# Persist the fitted model. The context manager closes (and flushes) the file
# handle even if pickling raises, instead of leaving it open.
with open(path_model + "XGBoost_baseline.pkl", 'wb') as model_file:
    pickle.dump(xgb_class_baseline, model_file)

## TabNet Baseline

In [None]:
# Train the TabNet baseline on the CUDA device; TabNet is given the raw NumPy
# array (X_train.values) rather than the DataFrame.
tabnet_class_baseline = TabNetClassifier(seed=random_state, verbose=2, device_name="cuda")
tabnet_class_baseline.fit(X_train.values, y_train, max_epochs=50, batch_size=44000)

# Keep this save location in a cell-local variable instead of overwriting the
# notebook-wide `path_model`: reassigning the shared constant here would make
# later cells save to a different directory depending on whether this cell ran.
# NOTE(review): the path looks like a Google Colab Drive mount - confirm.
tabnet_model_dir = "/content/drive/MyDrive/Thesis Data/Models/"
torch.save(tabnet_class_baseline, tabnet_model_dir + 'TabNet_baseline.pt')

## Random Forests Baseline

In [None]:
# Fit a random forest with library-default hyperparameters as a baseline,
# then write the fitted estimator to the model directory with joblib.
rf_baseline = RandomForestClassifier(n_jobs=-1, random_state=random_state).fit(X_train, y_train)
joblib.dump(rf_baseline, f"{path_model}rf_baseline.joblib")