In [None]:
!pip install pytorch-tabnet

In [None]:
import os
import numpy as np
import pandas as pd
import warnings
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [None]:
# Lift pandas' display caps: None means "show every row / every column".
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
def splitData(df: "pd.DataFrame", FEATS: "List", target: str = "Survived",
              test_size: float = .3, random_state: int = 42):
    """Split the dataframe into train and test partitions.

    Args:
        df: preprocessed dataframe holding the feature columns and the target column
        FEATS: feature list
        target: name of the label column
        test_size: fraction of the rows held out for the test partition
        random_state: seed forwarded to ``train_test_split``

    Returns:
        X_train, y_train, X_test, y_test

    Raises:
        KeyError: if a feature column or the target column is missing from ``df``
    """

    # Fail before splitting when a requested column does not exist.
    missing = [col for col in [*FEATS, target] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from dataframe: {missing}")

    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    # Return independent copies so callers can add or overwrite columns without
    # writing into a slice of the original dataframe.
    return (train[FEATS].copy(), train[target].copy(),
            test[FEATS].copy(), test[target].copy())

In [None]:
def prepareInputs(df: "pd.DataFrame"):
    """Preprocess the raw features: impute, one-hot encode and flag ticket prefixes.

    Args:
        df: raw dataframe with the columns "Pclass", "Sex", "Age", "SibSp",
            "Parch", "Ticket", "Fare", "Cabin" and "Embarked"

    Return:
        df: processed dataframe (a new object; the input is not modified)
    """

    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is never written to.
    df = df.copy()

    # 1: Impute missing values with the column mean
    for col in ("Age", "Fare"):
        df[col] = df[col].fillna(df[col].mean())

    # 2: One hot encoding
    # Cabin category = first character of the cabin value; a NaN cabin becomes
    # "n" because str(nan) == "nan".
    df["Cabin_category"] = [str(cabin)[:1] for cabin in df["Cabin"]]

    categorical_cols = [
        "Pclass", "Sex", "SibSp",
        "Parch", "Embarked",
        "Cabin_category"
    ]

    # One call replaces every categorical column by its indicator columns
    # (appended at the end, prefixed "<column>_"). dtype=np.uint8 keeps the
    # indicators numeric so DataFrame.to_numpy() yields a numeric matrix.
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=np.uint8)

    # 3: Ticket prefix flags, derived from the Ticket column. The prefix is the
    # first whitespace-separated token, so purely numeric tickets never match.
    ticket_prefix = df["Ticket"].astype(str).str.split().str[0]
    df["Ticket_A/5"] = np.where(ticket_prefix == "A/5", 1, 0)
    df["Ticket_C.A"] = np.where(ticket_prefix == "C.A.", 1, 0)
    df["Ticket_SC/PARIS"] = np.where(ticket_prefix == "SC/PARIS", 1, 0)

    return df.drop(columns=["Cabin", "Ticket"])

In [None]:
# Standardise the data sets
def standardiseNumericalFeats(df, reference=None):
    """Standardise the numerical features ("Age" and "Fare").

    Args:
        df: dataframe whose numerical columns are scaled
        reference: optional dataframe the scalers are fitted on (e.g. the
            training set, so other splits reuse its statistics). When omitted
            the scalers are fitted on ``df`` itself.

    Returns:
        Standardised dataframe (a copy; ``df`` is not modified)
    """

    numerical_cols = [
        "Age", "Fare"
    ]

    fit_source = df if reference is None else reference
    standardised = df.copy()

    for col in numerical_cols:
        scaler = StandardScaler().fit(fit_source[[col]])
        # transform() is given a single-column frame; flatten its result so it
        # can be assigned back as one column.
        standardised[col] = np.ravel(scaler.transform(df[[col]]))

    return standardised

In [None]:
def tabNetPretrain(X_train, batch_size=256, max_epochs=100):
    """Pretrain TabNet model

    Args:
        X_train: dataframe of training features (converted with ``to_numpy()``)
        batch_size: mini-batch size; also used to derive the scheduler's
            steps per epoch
        max_epochs: maximum number of pretraining epochs; also the number of
            epochs the one-cycle learning-rate schedule is laid out for

    Return:
        TabNet pretrainer obj
    """
    train_matrix = X_train.to_numpy()

    # fit() is called with drop_last=True, so one epoch has
    # floor(n_rows / batch_size) full batches. Keep the value at 1 or more so
    # the scheduler parameters stay positive for very small inputs.
    steps_per_epoch = max(1, X_train.shape[0] // batch_size)

    tabnet_params = dict(n_d=8, n_a=8, n_steps=3, gamma=1.3,
                         n_independent=2, n_shared=2,
                         seed=42, lambda_sparse=1e-3,
                         optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2,
                                               weight_decay=1e-5
                                               ),
                         mask_type="entmax",
                         # The schedule length is tied to the training length:
                         # epochs * steps_per_epoch is the total number of
                         # scheduler steps the cycle is spread over.
                         scheduler_params=dict(max_lr=0.05,
                                               steps_per_epoch=steps_per_epoch,
                                               epochs=max_epochs,
                                               is_batch_level=True
                                               ),
                         scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                         verbose=10
                         )

    pretrainer = TabNetPretrainer(**tabnet_params)

    # NOTE(review): the training matrix is also the only evaluation set, so
    # `patience` presumably reacts to the loss on the training data - confirm
    # this is intended.
    pretrainer.fit(
        X_train=train_matrix,
        eval_set=[train_matrix],
        max_epochs=max_epochs,
        patience=15,
        batch_size=batch_size,
        # 128 as before, capped so it never exceeds the batch size
        virtual_batch_size=min(128, batch_size),
        num_workers=1,
        drop_last=True)

    return pretrainer

In [None]:
def trainTabNetModel(X_train, y_train, X_test, y_test, pretrainer):
    """Fit a TabNet classifier on the training data

    Args:
        X_train: training features (dataframe)
        y_train: training labels
        X_test: hold-out features, passed as the second evaluation set
        y_test: hold-out labels, passed as the second evaluation set
        pretrainer: pretrained model. If not using this, use None

    Return:
        TabNet model obj
    """

    lr_schedule = {"milestones": [150, 250, 300, 350, 400, 450], "gamma": 0.2}
    architecture = dict(n_d=8, n_a=8, n_steps=4, gamma=1.3,
                        n_independent=4, n_shared=5)

    classifier = TabNetClassifier(
        seed=42,
        optimizer_fn=torch.optim.Adam,
        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
        scheduler_params=lr_schedule,
        **architecture,
    )

    # Convert each pandas object once and reuse the arrays below.
    train_pair = (X_train.to_numpy(), y_train.to_numpy())
    holdout_pair = (X_test.to_numpy(), y_test.to_numpy())

    classifier.fit(
        X_train=train_pair[0],
        y_train=train_pair[1],
        # evaluation sets in order: training data first, hold-out data second
        eval_set=[train_pair, holdout_pair],
        eval_metric=["accuracy"],
        max_epochs=100,
        batch_size=256,
        patience=15,
        from_unsupervised=pretrainer,
    )

    return classifier

In [None]:
# Make predictions
def makePredictions(X_test, tabNet_model) -> "np.ndarray":
    """Make predictions

    Args:
        X_test: dataframe of features (converted with ``to_numpy()``)
        tabNet_model: fitted model exposing ``predict_proba``

    Return:
        Column 1 of the ``predict_proba`` output, one value per input row
        (presumably the probability of the class labelled 1)
    """

    class_probabilities = tabNet_model.predict_proba(X_test.to_numpy())

    return class_probabilities[:, 1]

In [None]:
# Evaluation
def evaluate(y_test, y_tabNet_pred, threshold: float = .42) -> None:
    """Evaluate the predictions

    Args:
        y_test: true labels
        y_tabNet_pred: predicted scores, one per label
        threshold: scores strictly above this value are counted as class 1,
            everything else as class 0

    Process:
        Print accuracy score
    """

    predicted_labels = np.where(y_tabNet_pred > threshold, 1, 0)
    score = accuracy_score(y_test, predicted_labels)

    print("The accuracy score of TabNet model is " + str(round(score, 4)))

In [None]:
FEATS = [
        "Pclass", "Sex", "Age",
        "SibSp", "Parch", "Ticket",
        "Fare", "Cabin", "Embarked"
    ]

# Directory holding the competition CSV files.
DATA_DIR = "../input/tabular-playground-series-apr-2021"
# Score cut-off for predicting Survived = 1; same value evaluate() applies.
THRESHOLD = .42

print("Reading the data")
df = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

print("Preprocessing the data")
X_train, y_train, X_validation, y_validation = splitData(df, FEATS)
# Pass copies so preprocessing never writes into a slice of `df`.
X_train, X_validation = prepareInputs(X_train.copy()), prepareInputs(X_validation.copy())
# One-hot encoding runs per split, so a category missing from one split gives a
# different column set. Align to the training columns (absent indicators become
# 0, unseen ones are dropped) so every matrix fed to the model has the same layout.
X_validation = X_validation.reindex(columns=X_train.columns, fill_value=0)
# NOTE(review): each split is standardised separately below; consider scaling
# validation/test with the training statistics instead.
X_train, X_validation = standardiseNumericalFeats(X_train), standardiseNumericalFeats(X_validation)

print("The ratio of survived class in training set is " +
      str(round(y_train.sum()/len(y_train) * 100, 2)) +
      "%"
     )

print("The ratio of survived class in validation set is " +
      str(round(y_validation.sum()/len(y_validation) * 100, 2)) +
      "%"
     )

print("Pretrain TabNet model")
pretrainer = tabNetPretrain(X_train)

print("Training TabNet model")
tabNet_model = trainTabNetModel(X_train, y_train, X_validation, y_validation, pretrainer)

print("Making validation predictions")
y_tabNet_pred = makePredictions(X_validation, tabNet_model)

print("Evaluation of the model")
evaluate(y_validation, y_tabNet_pred)

print("Making predictions")
test = prepareInputs(test[FEATS].copy())
# Same column alignment as for the validation split.
test = test.reindex(columns=X_train.columns, fill_value=0)
test = standardiseNumericalFeats(test)

# Turn the predicted scores into 0/1 labels for the submission file.
submission["Survived"] = np.where(makePredictions(test, tabNet_model) > THRESHOLD, 1, 0)

submission.to_csv("submission.csv", index = False)

## Feature importance

In [None]:
# TabNet model: one importance value per training column, largest first
importance_tabNet = pd.DataFrame(
    {"importance": tabNet_model.feature_importances_},
    index=X_train.columns,
).sort_values("importance", ascending=False)
importance_tabNet

## Prediction distribution

In [None]:
# Histogram of the validation-set prediction scores, 100 bins
fig, ax = plt.subplots()
ax.hist(y_tabNet_pred, bins=100)
ax.set_title("Prediction distribution of pretrained TabNet")
plt.show()