# Deep learning models' performance on tabular data compared to GBDT and TabNet models
This is part of my experiment to compare the performance of MLP, GBDT and TabNet models. This notebook trains a TabNet model with pre-training to predict home insurance lapse on a home insurance dataset. The main notebook can be found [here](https://www.kaggle.com/kyosukemorita/deep-learning-vs-gbdt-model-on-tabular-data).

In [None]:
# TabNet
!pip install pytorch-tabnet

In [None]:
import os
import numpy as np
import pandas as pd
import warnings
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [None]:
# Show every row and column when a DataFrame is displayed (no truncation).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
def timer(myFunction):
    """Decorate a function so that every call reports its wall-clock duration.

    Args:
        myFunction: the callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``myFunction``, prints the function's name and the elapsed time in
        seconds, and returns the wrapped function's result unchanged.
    """
    # Imported locally because functools is not among the file-level imports.
    import functools

    # wraps() keeps __name__ / __doc__ of the decorated function, so
    # introspection and help() still describe the real function.
    @functools.wraps(myFunction)
    def functionTimer(*args, **kwargs):
        # perf_counter() is monotonic: the measured duration cannot be
        # distorted by system clock adjustments during the call.
        start_time = time.perf_counter()
        result = myFunction(*args, **kwargs)
        end_time = time.perf_counter()
        computation_time = round(end_time - start_time, 2)
        print("{} is excuted".format(myFunction.__name__))
        print('Computation took: {:.2f} seconds'.format(computation_time))
        return result
    return functionTimer

In [None]:
@timer
def prepareInputs(df: "pd.DataFrame") -> "pd.DataFrame":
    """Prepare the input for training

    Args:
        df (pd.DataFrame): raw data

    Process:
        1. Exclude rows whose POL_STATUS is missing
        2. Clean the target variable (drop "Unknown", add binary ``lapse``)
        3. Create dummy variables for categorical variables
        4. Create age features
        5. Impute missing value

    Return:
        pd.DataFrame: a new frame holding the original columns plus the
        ``lapse`` target, the dummy columns, the age features and the
        ``*_imputed`` columns. The frame passed in is not modified.
    """

    # 1. Exclude missing values
    # 2. Clean the target variable
    # Work on an explicit copy so the column assignments below write to this
    # frame rather than to a slice of the caller's data.
    df = df[df["POL_STATUS"].notnull()]
    df = df[df["POL_STATUS"] != "Unknown"].copy()
    df["lapse"] = np.where(df["POL_STATUS"] == "Lapsed", 1, 0)

    # 3. Create dummy variables for categorical variables
    categorical_cols = ["CLAIM3YEARS", "BUS_USE", "AD_BUILDINGS",
                        "APPR_ALARM", "CONTENTS_COVER", "P1_SEX",
                        "BUILDINGS_COVER", "P1_POLICY_REFUSED",
                        "APPR_LOCKS", "FLOODING",
                        "NEIGH_WATCH", "SAFE_INSTALLED", "SEC_DISC_REQ",
                        "SUBSIDENCE", "LEGAL_ADDON_POST_REN",
                        "HOME_EM_ADDON_PRE_REN", "HOME_EM_ADDON_POST_REN",
                        "GARDEN_ADDON_PRE_REN", "GARDEN_ADDON_POST_REN",
                        "KEYCARE_ADDON_PRE_REN", "KEYCARE_ADDON_POST_REN",
                        "HP1_ADDON_PRE_REN", "HP1_ADDON_POST_REN",
                        "HP2_ADDON_PRE_REN", "HP2_ADDON_POST_REN",
                        "HP3_ADDON_PRE_REN", "HP3_ADDON_POST_REN",
                        "MTA_FLAG", "OCC_STATUS", "OWNERSHIP_TYPE",
                        "PROP_TYPE", "PAYMENT_METHOD", "P1_EMP_STATUS",
                        "P1_MAR_STATUS"
                        ]

    # dtype is pinned to uint8 so the indicators are numeric regardless of the
    # installed pandas version (the default dummy dtype became bool in 2.0).
    dummy_frames = [
        pd.get_dummies(df[col],
                       drop_first=True,
                       prefix=col,
                       dtype=np.uint8
                       )
        for col in categorical_cols
    ]
    # Single concat instead of one per column; ``axis`` must be passed by
    # keyword because pandas 2.x no longer accepts it positionally.
    df = pd.concat([df, *dummy_frames], axis=1)

    # 4. Create age features
    # All ages are measured relative to the fixed reference date 2013-01-01.
    df["age"] = (datetime.strptime("2013-01-01", "%Y-%m-%d") - pd.to_datetime(df["P1_DOB"])).dt.days // 365
    df["property_age"] = 2013 - df["YEARBUILT"]
    df["cover_length"] = 2013 - pd.to_datetime(df["COVER_START"]).dt.year

    # 5. Impute missing value
    # Risk-rated areas use the column mean of *this* frame; MTA amounts use 0.
    df["RISK_RATED_AREA_B_imputed"] = df["RISK_RATED_AREA_B"].fillna(df["RISK_RATED_AREA_B"].mean())
    df["RISK_RATED_AREA_C_imputed"] = df["RISK_RATED_AREA_C"].fillna(df["RISK_RATED_AREA_C"].mean())
    df["MTA_FAP_imputed"] = df["MTA_FAP"].fillna(0)
    df["MTA_APRP_imputed"] = df["MTA_APRP"].fillna(0)

    return df

In [None]:
# Split train and test
@timer
def splitData(df: "pd.DataFrame", FEATS: "list"):
    """Split the raw dataframe 70/30 and preprocess each part.

    Args:
        df: raw dataframe; it is split first and each part is then passed
            through ``prepareInputs`` on its own
        FEATS: names of the feature columns to keep

    Returns:
        X_train, y_train, X_test, y_test
        (the targets are the ``lapse`` column of each preprocessed part)

    NOTE(review): because preprocessing runs per split, dummy columns and
    imputation means are derived separately for train and test; a category
    absent from one split would make the ``FEATS`` selection fail - confirm
    this is acceptable for the dataset.
    """

    raw_train, raw_test = train_test_split(df, test_size=0.3, random_state=42)

    train_ready = prepareInputs(raw_train)
    test_ready = prepareInputs(raw_test)

    X_tr = train_ready[FEATS]
    y_tr = train_ready["lapse"]
    X_te = test_ready[FEATS]
    y_te = test_ready["lapse"]

    return X_tr, y_tr, X_te, y_te

In [None]:
# Standardise the data sets
@timer
def standardiseNumericalFeats(X_train, X_test):
    """Standardise the numerical features column by column.

    For every numerical column a fresh ``StandardScaler`` is fitted on the
    training data and then applied to both the training and the test data.
    The columns are overwritten in place on the frames passed in.

    Returns:
        Standardised X_train and X_test (the same objects that were passed in)
    """

    numerical_cols = (
        "age",
        "property_age",
        "cover_length",
        "RISK_RATED_AREA_B_imputed",
        "RISK_RATED_AREA_C_imputed",
        "MTA_FAP_imputed",
        "MTA_APRP_imputed",
        "SUM_INSURED_BUILDINGS",
        "NCD_GRANTED_YEARS_B",
        "SUM_INSURED_CONTENTS",
        "NCD_GRANTED_YEARS_C",
        "SPEC_SUM_INSURED",
        "SPEC_ITEM_PREM",
        "UNSPEC_HRP_PREM",
        "BEDROOMS",
        "MAX_DAYS_UNOCC",
        "LAST_ANN_PREM_GROSS",
    )

    for feature in numerical_cols:
        column_scaler = StandardScaler()

        # Double brackets keep the selection two-dimensional for the scaler.
        train_column = X_train[[feature]]
        X_train[feature] = column_scaler.fit_transform(train_column)

        # The test split reuses the statistics learned from the train split.
        test_column = X_test[[feature]]
        X_test[feature] = column_scaler.transform(test_column)

    return X_train, X_test

## TabNet

In [None]:
@timer
def tabNetPretrain(X_train):
    """Pretrain a TabNet model on the (unlabelled) training features.

    Args:
        X_train: training features; converted with ``to_numpy()`` before
            fitting. The same array is also used as the evaluation set.

    Return:
        TabNet pretrainer obj
    """
    batch_size = 256

    adam_settings = dict(lr=2e-2, weight_decay=1e-5)

    # OneCycleLR is stepped per batch, so it needs the number of full batches
    # per epoch (partial batches are dropped via drop_last below).
    one_cycle_settings = dict(
        max_lr=0.05,
        steps_per_epoch=int(X_train.shape[0] / batch_size),
        epochs=200,
        is_batch_level=True,
    )

    pretrainer = TabNetPretrainer(
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2,
        seed=42,
        lambda_sparse=1e-3,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=adam_settings,
        mask_type="entmax",
        scheduler_params=one_cycle_settings,
        scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
        verbose=10,
    )

    features = X_train.to_numpy()

    pretrainer.fit(
        X_train=features,
        eval_set=[features],
        max_epochs=100,
        patience=10,
        batch_size=batch_size,
        virtual_batch_size=128,
        num_workers=1,
        drop_last=True,
    )

    return pretrainer

In [None]:
@timer
def trainTabNetModel(X_train, y_train, pretrainer, X_valid=None, y_valid=None):
    """Train TabNet model

    Args:
        X_train: training features (converted with ``to_numpy()``)
        y_train: training target (converted with ``to_numpy()``)
        pretrainer: pretrained model. If not using this, use None
        X_valid: optional hold-out features monitored during training
        y_valid: optional hold-out target; must be given together with
            ``X_valid``

    Return:
        TabNet model obj

    Raises:
        ValueError: if only one of ``X_valid`` / ``y_valid`` is supplied

    When no hold-out set is passed, the module-level ``X_test`` / ``y_test``
    are used, which is what this function always did before the parameters
    existed.

    NOTE(review): the hold-out set is the last ``eval_set`` entry, which
    pytorch-tabnet presumably uses for early stopping - confirm; if so,
    passing a dedicated validation split avoids tuning on the test data.
    """

    if (X_valid is None) != (y_valid is None):
        raise ValueError("X_valid and y_valid must be provided together")
    if X_valid is None:
        # Backward-compatible fallback to the notebook's global test split.
        X_valid, y_valid = X_test, y_test

    tabNet_model = TabNetClassifier(
                                   n_d=16,
                                   n_a=16,
                                   n_steps=4,
                                   gamma=1.9,
                                   n_independent=4,
                                   n_shared=5,
                                   seed=42,
                                   optimizer_fn = torch.optim.Adam,
                                   scheduler_params = {"milestones": [150,250,300,350,400,450],'gamma':0.2},
                                   scheduler_fn=torch.optim.lr_scheduler.MultiStepLR
                                  )

    train_features = X_train.to_numpy()
    train_target = y_train.to_numpy()

    tabNet_model.fit(
        X_train = train_features,
        y_train = train_target,
        eval_set=[(train_features, train_target),
                  (X_valid.to_numpy(), y_valid.to_numpy())],
        max_epochs = 100,
        batch_size = 256,
        patience = 10,
        from_unsupervised = pretrainer
        )

    return tabNet_model

In [None]:
# Make predictions
def makePredictions(X_test, tabNet_model) -> "np.ndarray":
    """Make predictions

    Args:
        X_test: features to score; converted with ``to_numpy()``
        tabNet_model: fitted model exposing ``predict_proba``

    Return:
        One-dimensional array holding column 1 of the ``predict_proba``
        output, i.e. one score per input row.
    """

    class_probabilities = tabNet_model.predict_proba(X_test.to_numpy())
    # Column 1 is kept; presumably the probability of the positive
    # ("lapse") class - verify against the model's class ordering.
    return class_probabilities[:, 1]

In [None]:
# Evaluation
def evaluate(y_tabNet_pred, y_true=None, threshold=0.27) -> None:
    """Evaluate the predictions

    Args:
        y_tabNet_pred: predicted scores, one per observation
        y_true: true binary labels; when omitted, the module-level
            ``y_test`` is used (the function's historical behaviour)
        threshold: scores strictly above this value are counted as the
            positive class when computing the F1 score

    Process:
        Print ROC AUC and F1 score
    """

    if y_true is None:
        # Backward-compatible fallback to the notebook's global test labels.
        y_true = y_test

    print("The ROC AUC score of TabNet model is " +
          str(round(roc_auc_score(y_true, y_tabNet_pred), 4))
         )

    predicted_labels = np.where(y_tabNet_pred > threshold, 1, 0)
    print("The F1 score of TabNet model at threshold = " + str(threshold) + " is " +
          str(round(f1_score(y_true, predicted_labels), 4))
         )

In [None]:
# Columns fed to the model, in this order: first the dummy-encoded
# categorical columns (named "<source column>_<category>"), then the derived
# age features, the imputed columns and the remaining numerical columns.
# Every name listed here must exist in the preprocessed frame, because
# splitData selects the features with ``frame[FEATS]``.
FEATS = [
         "CLAIM3YEARS_Y", "BUS_USE_Y", "AD_BUILDINGS_Y",
         "CONTENTS_COVER_Y", "P1_SEX_M", "P1_SEX_N", "BUILDINGS_COVER_Y", 
         "P1_POLICY_REFUSED_Y", "APPR_ALARM_Y", "APPR_LOCKS_Y", "FLOODING_Y", 
         "NEIGH_WATCH_Y", "SAFE_INSTALLED_Y", "SEC_DISC_REQ_Y", "SUBSIDENCE_Y", 
         "LEGAL_ADDON_POST_REN_Y", "HOME_EM_ADDON_PRE_REN_Y", 
         "HOME_EM_ADDON_POST_REN_Y", "GARDEN_ADDON_PRE_REN_Y",
         "GARDEN_ADDON_POST_REN_Y", "KEYCARE_ADDON_PRE_REN_Y", 
         "KEYCARE_ADDON_POST_REN_Y", "HP1_ADDON_PRE_REN_Y", "HP1_ADDON_POST_REN_Y", 
         "HP2_ADDON_PRE_REN_Y", "HP2_ADDON_POST_REN_Y", "HP3_ADDON_PRE_REN_Y", 
         "HP3_ADDON_POST_REN_Y", "MTA_FLAG_Y", "OCC_STATUS_LP",
         "OCC_STATUS_PH", "OCC_STATUS_UN", "OCC_STATUS_WD",
         "OWNERSHIP_TYPE_2.0", "OWNERSHIP_TYPE_3.0", "OWNERSHIP_TYPE_6.0", 
         "OWNERSHIP_TYPE_7.0", "OWNERSHIP_TYPE_8.0", "OWNERSHIP_TYPE_11.0", 
         "OWNERSHIP_TYPE_12.0", "OWNERSHIP_TYPE_13.0", "OWNERSHIP_TYPE_14.0", 
         "OWNERSHIP_TYPE_16.0", "OWNERSHIP_TYPE_17.0", 
         "OWNERSHIP_TYPE_18.0", "PROP_TYPE_2.0", "PROP_TYPE_3.0", "PROP_TYPE_4.0", 
         "PROP_TYPE_7.0", "PROP_TYPE_9.0", "PROP_TYPE_10.0", 
         "PROP_TYPE_16.0", "PROP_TYPE_17.0", "PROP_TYPE_18.0", "PROP_TYPE_19.0", 
         "PROP_TYPE_20.0", "PROP_TYPE_21.0", "PROP_TYPE_22.0", "PROP_TYPE_23.0", 
         "PROP_TYPE_24.0", "PROP_TYPE_25.0", "PROP_TYPE_26.0", "PROP_TYPE_27.0", 
         "PROP_TYPE_29.0", "PROP_TYPE_30.0", "PROP_TYPE_31.0", 
         "PROP_TYPE_32.0", "PROP_TYPE_37.0", "PROP_TYPE_39.0", 
         "PROP_TYPE_40.0", "PROP_TYPE_44.0", "PROP_TYPE_45.0", "PROP_TYPE_47.0", 
         "PROP_TYPE_48.0", "PROP_TYPE_51.0", "PROP_TYPE_52.0", "PROP_TYPE_53.0", 
         "PAYMENT_METHOD_NonDD", "PAYMENT_METHOD_PureDD", "P1_EMP_STATUS_C", 
         "P1_EMP_STATUS_E", "P1_EMP_STATUS_F", "P1_EMP_STATUS_H", "P1_EMP_STATUS_I", 
         "P1_EMP_STATUS_N", "P1_EMP_STATUS_R", "P1_EMP_STATUS_S", "P1_EMP_STATUS_U", 
         "P1_EMP_STATUS_V", "P1_MAR_STATUS_B", "P1_MAR_STATUS_C", "P1_MAR_STATUS_D", 
         "P1_MAR_STATUS_M", "P1_MAR_STATUS_N", "P1_MAR_STATUS_O", "P1_MAR_STATUS_P", 
         "P1_MAR_STATUS_S", "P1_MAR_STATUS_W", 
         "age", "property_age", "cover_length", "RISK_RATED_AREA_B_imputed", 
         "RISK_RATED_AREA_C_imputed", "MTA_FAP_imputed", "MTA_APRP_imputed",
         "SUM_INSURED_BUILDINGS", "NCD_GRANTED_YEARS_B", "SUM_INSURED_CONTENTS", 
         "NCD_GRANTED_YEARS_C", "SPEC_SUM_INSURED", "SPEC_ITEM_PREM", 
         "UNSPEC_HRP_PREM", "BEDROOMS", "MAX_DAYS_UNOCC", "LAST_ANN_PREM_GROSS"
        ]


# End-to-end run: load -> preprocess -> pretrain -> train -> predict -> evaluate.
print("Reading the data")
df = pd.read_csv("../input/home-insurance/home_insurance.csv")

print("Preprocessing the data")
X_train, y_train, X_test, y_test = splitData(df, FEATS)
X_train, X_test = standardiseNumericalFeats(X_train, X_test)

# Share of the positive (lapse) class in each split, as a percentage.
train_lapse_pct = round(y_train.sum() / len(y_train) * 100, 2)
print("The ratio of lapse class in training set is " + str(train_lapse_pct) + "%")

test_lapse_pct = round(y_test.sum() / len(y_test) * 100, 2)
print("The ratio of lapse class in test set is " + str(test_lapse_pct) + "%")

print("Pretrain TabNet model")
pretrainer = tabNetPretrain(X_train)

print("Training TabNet model")
tabNet_model = trainTabNetModel(X_train, y_train, pretrainer)

print("Making predictions")
y_tabNet_pred = makePredictions(X_test, tabNet_model)

print("Evaluation of the model")
evaluate(y_tabNet_pred)

## Feature importance

In [None]:
# TabNet model
# One row per feature, ordered from most to least important.
importance_tabNet = pd.DataFrame(
    {"importance": tabNet_model.feature_importances_},
    index=X_train.columns,
).sort_values("importance", ascending=False)
importance_tabNet

## Prediction distribution

In [None]:
# Histogram of the predicted scores, using 100 bins.
fig, ax = plt.subplots()
ax.hist(y_tabNet_pred, bins=100)
ax.set_title("Prediction distribution of pretrained TabNet")
plt.show()