In [None]:
#https://mikulskibartosz.name/how-to-reduce-memory-usage-in-pandas

def rm(df, use_float16=False):
    """Reduce the memory footprint of ``df`` by downcasting its columns in place.

    Per-column rules:

    * object / string columns are converted to ``category``;
    * numeric columns holding only the values 0 and 1 (both present, nothing
      missing) are converted to ``bool``;
    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose (inclusive) range contains the column's
      min and max;
    * float columns are cast to ``float32`` when their finite range allows it.
      ``float16`` is only considered when ``use_float16`` is true, because it
      keeps roughly three significant digits and silently rounds most data;
    * every other dtype (bool, datetime, timedelta, category, pandas nullable
      extension types, ...) is left untouched.

    A column is never widened: a cast happens only when the target type is
    strictly smaller than the current one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default False
        Allow half precision as a downcast target for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # deep=True also counts the Python strings behind object columns, so the
    # before/after numbers reflect the real saving of the category conversion.
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
    }
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = series.astype('category')
            continue

        # Only plain NumPy integer / unsigned / float columns are downcast;
        # comparing e.g. timestamps against numeric limits would raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = series.min(), series.max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no range to reason about.
            continue

        # min == 0 and max == 1 is not enough for floats (0.5 would become
        # True), so additionally require every value to be exactly 0 or 1.
        if c_min == 0 and c_max == 1 and series.isin([0, 1]).all():
            df[col] = series.astype('bool')
            continue

        # Work with plain Python numbers so the range checks are not affected
        # by NumPy scalar promotion rules.
        if col_type.kind == 'f':
            if not (np.isfinite(c_min) and np.isfinite(c_max)):
                continue
            low, high = float(c_min), float(c_max)
            candidates = [
                (target, float(np.finfo(target).min), float(np.finfo(target).max))
                for target in float_candidates
            ]
        else:
            low, high = int(c_min), int(c_max)
            candidates = [
                (target, int(np.iinfo(target).min), int(np.iinfo(target).max))
                for target in int_candidates[col_type.kind]
            ]

        for target, target_min, target_max in candidates:
            # Candidates are ordered by size; once one is not smaller than the
            # current dtype, no later one can save memory either.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            if target_min <= low and high <= target_max:
                df[col] = series.astype(target)
                break

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [11]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from pytorch_tabnet.tab_model import TabNetClassifier
import xgboost as xgb

# Directories (relative to this notebook) for the processed CSV inputs and
# for the pickled model outputs.
path_csv = "../../../../Thesis_data/processed_data/"
path_model = "../../../../Thesis_data/Models/"

# Seed handed to the estimators so that training runs are repeatable.
random_state = 101

In [12]:
# Read the training features and labels from the processed-data directory.
x_train_csv = path_csv + "ontime_reporting_X_train.csv"
y_train_csv = path_csv + "ontime_reporting_y_train.csv"
X_train = pd.read_csv(x_train_csv)
y_train = pd.read_csv(y_train_csv)

In [None]:
# Downcast the feature columns to save memory, then flatten the labels that
# were loaded as a DataFrame into a 1-D array.
X_train = rm(X_train)
y_train = np.asanyarray(y_train).ravel()

## Logistic Regression Baseline

In [15]:
# Fit the logistic-regression baseline on the training data. The shared
# `random_state` seed is used so all baselines are configured consistently.
logreg_baseline = LogisticRegression(random_state=random_state, n_jobs=-1, max_iter=1000)
logreg_baseline.fit(X_train, y_train)

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open(path_model + "logreg_baseline.sav", 'wb') as model_file:
    pickle.dump(logreg_baseline, model_file)

  y = column_or_1d(y, warn=True)


## XGBoost Baseline

In [None]:
# Fit the XGBoost baseline with default hyper-parameters on the training data.
xgb_class_baseline = xgb.XGBClassifier(random_state=random_state, n_jobs=-1) #tree_method="gpu_hist"
xgb_class_baseline.fit(X_train, y_train)

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open(path_model + "XGBoost_baseline.sav", 'wb') as model_file:
    pickle.dump(xgb_class_baseline, model_file)

## TabNet Baseline

In [None]:
# Fit the TabNet baseline (10 epochs at most, early-stopping patience of 5).
TabNet_class_baseline = TabNetClassifier(seed=random_state, verbose=2, max_epochs=10, patience=5) #device_name="cuda"
# TabNet is given the raw NumPy array rather than the DataFrame.
# NOTE(review): after rm() the frame can hold mixed dtypes (bool / ints /
# floats / category), in which case `.values` yields an object array;
# confirm this is what TabNet should receive.
TabNet_class_baseline.fit(X_train.values, y_train)

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open(path_model + "TabNet_baseline.sav", 'wb') as model_file:
    pickle.dump(TabNet_class_baseline, model_file)