In [None]:
#!pip install pytorch-tabnet

In [None]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import cross_validate, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [None]:
# Column dtypes applied when reading the cleaned on-time reporting CSV.
# Keys must match the CSV header exactly (no surrounding whitespace); a key
# that matches no column is silently ignored by pandas.
dtypes = {
    "MONTH": "int8",
    "DAY_OF_MONTH": "int8",
    "DAY_OF_WEEK": "int8",
    "OP_UNIQUE_CARRIER": "object",
    "CRS_DEP_TIME": "int16",
    "CRS_ARR_TIME": "int16",
    "DEP_DEL15": "bool",
    "DISTANCE_GROUP": "category",
    "NUMBER_OF_SEATS": "int16",
    "AWND": "float64",
    "PRCP": "float64",
    "SNOW": "float64",
    "SNWD": "float64",
    "TMAX": "float64",
    "MEDIAN_AGE": "float64",
    "TOT_POP": "int64",
    "AVG_HOUSEHOLD_SIZE": "float64",
    "ORIGIN_LAT": "float64",  # was "ORIGIN_LAT " (trailing space), which never matched the column
    "ORIGIN_LONG": "float64",
    "DEST_LAT": "float64",
    "DEST_LONG": "float64",
    "PREV_FLIGHT_DELAY": "bool",
    "PLANE_AGE": "int16",
}

In [None]:
# Load the cleaned on-time reporting export, applying the per-column dtypes
# declared in `dtypes`.
ontime_reporting = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Thesis Data/processed_data/ontime_reporting_clean_export.csv',
    dtype=dtypes,
)

In [None]:
# Seed shared by the train/test split, the CV folds and the model. It was used
# but never defined, so a fresh "Restart & Run All" failed with NameError.
# The value itself is arbitrary; it only has to stay fixed between runs.
random_state = 42

# Drop the carrier code column. errors="ignore" keeps the cell re-runnable:
# on a second execution the column is already gone.
ontime_reporting = ontime_reporting.drop(columns=["OP_UNIQUE_CARRIER"], errors="ignore")

# Features are every remaining column except the target; the target is the
# DEP_DEL15 flag.
X = ontime_reporting.drop(columns=["DEP_DEL15"])
y = ontime_reporting["DEP_DEL15"]

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

In [None]:
def objective_tabnet(trial):
    """Optuna objective: mean cross-validated ROC AUC of a TabNet classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_d``, ``n_a``, ``n_steps``, ``gamma`` and
        ``n_shared``.

    Returns
    -------
    float
        Mean validation ROC AUC over three stratified folds (higher is
        better, so the study should maximise this value).

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train`` and ``random_state``,
    which must be defined before the study is run.
    """
    params = {
        "n_d": trial.suggest_int("n_d", 8, 64, step=8),
        "n_a": trial.suggest_int("n_a", 8, 64, step=8),
        "n_steps": trial.suggest_int("n_steps", 3, 10),
        # Each value is registered under the TabNet argument it feeds, so
        # study.best_params can be passed straight back to TabNetClassifier.
        "gamma": trial.suggest_float("gamma", 1.0, 2.0, step=0.01),
        "n_shared": trial.suggest_int("n_shared", 1, 5),
    }

    skfold = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)
    auc_scores = []

    for train_index, val_index in skfold.split(X_train, y_train):
        # TabNet expects NumPy arrays rather than DataFrames. Casting to
        # float32 also flattens the bool/category columns into numbers.
        # NOTE(review): assumes DISTANCE_GROUP holds numeric codes - confirm.
        X_train_fold = X_train.iloc[train_index].to_numpy(dtype=np.float32)
        X_val_fold = X_train.iloc[val_index].to_numpy(dtype=np.float32)
        y_train_fold = y_train.iloc[train_index].to_numpy(dtype=np.int64)
        y_val_fold = y_train.iloc[val_index].to_numpy(dtype=np.int64)

        # A fresh model per fold keeps the folds independent of each other.
        # TabNetClassifier has no n_jobs argument, so it is not passed.
        model_tabnetClass = TabNetClassifier(**params, seed=random_state, verbose=1)
        model_tabnetClass.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric=["auc"],
            patience=5,
            max_epochs=10,
        )

        # ROC AUC needs a ranking score: use the positive-class probability,
        # not the hard 0/1 labels returned by predict().
        y_score = model_tabnetClass.predict_proba(X_val_fold)[:, 1]
        auc_scores.append(roc_auc_score(y_val_fold, y_score))

    return float(np.mean(auc_scores))

In [None]:
# The objective returns a mean ROC AUC, where higher is better, so the study
# must maximise it; "minimize" would keep the worst trial as the best one.
# Seeding the TPE sampler makes the sequence of sampled trials reproducible.
study = optuna.create_study(
    direction="maximize",
    study_name='TabNet',
    sampler=TPESampler(seed=random_state),
)

# Original note read "5 hours -> timeout=6*60". Optuna's `timeout` is given in
# seconds, so 6*60 would stop after six minutes; use hours*60*60 for an
# hour-scale budget.
study.optimize(objective_tabnet, n_trials=1)

In [None]:
# Keep the hyperparameters of the best trial: a dict keyed by the names passed
# to the trial.suggest_* calls in the objective.
TabNet_params = study.best_params