In [None]:
%%capture
pip install pytorch-tabnet

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_curve, roc_auc_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier

# from tqdm import tqdm
from tqdm.notebook import tqdm
import string
import random
import time
import os
import gc

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [None]:
class CFG:
    """Notebook-wide settings: data directory, CV options and column groups."""

    # Directory the competition CSV files are read from.
    input = "../input/tabular-playground-series-may-2022"

    # Fold count and seed used by the cross-validation splitter and seeding.
    n_splits = 10
    seed = 42
    n_bins = 50

    # Column names for the label and for prediction outputs.
    target = 'target'
    tab_pred = 'tab_pred'
    pred = 'pred'

    # Columns grouped as "int" features: f_07..f_18, then f_29 and f_30.
    int1_features = [f'f_{idx:02d}' for idx in range(7, 19)]
    int2_features = [f'f_{idx:02d}' for idx in range(29, 31)]
    int_features = int1_features + int2_features

    # Columns grouped as "float" features: f_00..f_06, f_19..f_26 and f_28.
    float1_features = [f'f_{idx:02d}' for idx in range(0, 7)]
    float2_features = [f'f_{idx:02d}' for idx in range(19, 27)]
    float3_features = ['f_28']
    float_features = float1_features + float2_features + float3_features

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used here and request determinism.

    Args:
        seed: Integer seed applied to Python's ``random`` module, the
            ``PYTHONHASHSEED`` environment variable, NumPy and PyTorch
            (CPU and all CUDA devices).

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    # Only influences interpreters started after this point; the hash seed of
    # the running process was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
    # use_deterministic_algorithms is a function and has to be *called*;
    # assigning to it would replace the API with a bool and enable nothing.
    # warn_only keeps ops without a deterministic kernel usable.
    try:
        torch.use_deterministic_algorithms(True, warn_only=True)
    except (AttributeError, TypeError):
        # Older torch builds lack the function or the warn_only keyword;
        # fall back to the cuDNN flags set above.
        pass
    
# Seed all RNGs with the configured seed before any data or model code runs.
seed_everything(CFG.seed)

# [Reference](https://www.kaggle.com/code/aboriginal3153/tps-mar-22-neural-network-by-pytorch)

In [None]:
# train_df = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv')
# test_df  = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv')
# print(train_df.shape)
# print(test_df.shape)
# train_df.head()

# Read the three competition files from the directory configured in CFG.input.
train = pd.read_csv(f"{CFG.input}/train.csv")
test = pd.read_csv(f"{CFG.input}/test.csv")
submission = pd.read_csv(f"{CFG.input}/sample_submission.csv")

# One (rows, columns) tuple per line: train, test, submission.
print(train.shape, test.shape, submission.shape, sep="\n")

# Preprocessing
## Feature Engineering

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so feature engineering
# is applied to both in one pass.
all_df = pd.concat([train, test], ignore_index=True)

In [None]:
# Sanity check: size of the stacked frame and a look at its first rows.
print(all_df.shape)
all_df.head()

In [None]:
class feature_engineering:
    """Build, encode and scale feature columns on a single DataFrame.

    The frame handed to the constructor is kept by reference in ``self.df``.
    ``get_features``, ``scaling`` and ``label_encoding`` add or overwrite
    columns on that frame in place; ``onehot_encoding`` rebinds ``self.df`` to
    the new frame returned by ``pd.get_dummies``. Every method returns the
    current ``self.df`` (some together with a list of column names).
    """

    def __init__(self, df):
        """Store ``df`` and record the length of its first ``f_27`` string.

        Args:
            df: DataFrame with a string column ``f_27`` and at least one row.
        """
        self.df = df
        # Positional access: works whatever labels the index carries.
        self.f_27_len = len(self.df['f_27'].iloc[0])
        self.alphabet_upper = list(string.ascii_uppercase)

    def get_features(self):
        """Add the engineered columns to ``self.df`` and return it.

        Columns written:
            ch0 .. ch{f_27_len - 1}: ordinal of each character of ``f_27``
                relative to ``'A'`` (``'A'`` -> 0, ``'B'`` -> 1, ...).
            unique_characters: number of distinct characters in ``f_27``.
            i_02_21, i_05_22, i_00_01_26: +1 / 0 / -1 flags for the sum of the
                named float columns lying above / between / below the fixed
                thresholds used in the code.

        Returns:
            The same DataFrame object held in ``self.df``.
        """
        # Only the per-position columns depend on the loop index.
        for i in range(self.f_27_len):
            self.df[f'ch{i}'] = self.df.f_27.str.get(i).apply(ord) - ord('A')

        # Everything below is independent of the character position, so it is
        # computed exactly once.
        self.df["unique_characters"] = self.df.f_27.apply(lambda s: len(set(s)))

        sum_02_21 = self.df.f_21 + self.df.f_02
        self.df['i_02_21'] = (sum_02_21 > 5.2).astype(int) - (sum_02_21 < -5.3).astype(int)

        sum_05_22 = self.df.f_22 + self.df.f_05
        self.df['i_05_22'] = (sum_05_22 > 5.1).astype(int) - (sum_05_22 < -5.4).astype(int)

        i_00_01_26 = self.df.f_00 + self.df.f_01 + self.df.f_26
        self.df['i_00_01_26'] = (i_00_01_26 > 5.0).astype(int) - (i_00_01_26 < -5.0).astype(int)

        return self.df

    # Alternative (disabled) feature set kept for reference:
#     def get_features(self):
#         for i in range(self.f_27_len):
#             self.df[f'f_27_{i}'] = self.df['f_27'].apply(lambda x: x[i])

#         for letter in tqdm(self.alphabet_upper):
#             self.df[f'f_27_{letter}_count'] = self.df['f_27'].str.count(letter)

#         self.df['f_27_nunique'] = self.df['f_27'].apply(lambda x: len(set(x)))

#         return self.df

    def scaling(self, features):
        """Standardise ``features`` in place with a freshly fitted scaler.

        Args:
            features: List of column names to overwrite with their
                ``StandardScaler`` transform. The scaler is fitted on every
                row currently in ``self.df``.

        Returns:
            ``self.df`` with the listed columns replaced by scaled values.
        """
        sc = StandardScaler()
        self.df[features] = sc.fit_transform(self.df[features])

        return self.df

    def label_encoding(self, features):
        """Label-encode the object-dtype columns among ``features``.

        Each object column ``c`` gets a new integer column ``c_enc``; columns
        of any other dtype are left untouched.

        Args:
            features: List of column names to inspect.

        Returns:
            Tuple ``(self.df, new_features)`` where ``new_features`` mirrors
            ``features`` with encoded columns replaced by their ``*_enc`` name.
        """
        new_features = []

        for feature in features:
            if self.df[feature].dtype == 'O':
                le = LabelEncoder()
                self.df[f'{feature}_enc'] = le.fit_transform(self.df[feature])
                new_features.append(f'{feature}_enc')
            else:
                new_features.append(feature)

        return self.df, new_features

    def onehot_encoding(self, features):
        """One-hot encode ``features`` and report the indicator columns.

        Args:
            features: List of column names passed to ``pd.get_dummies``.

        Returns:
            Tuple ``(self.df, new_features)`` where ``new_features`` lists
            every ``uint8`` column whose name does not contain ``CFG.target``.
        """
        new_features = []
        # Pin the dummy dtype: newer pandas versions emit bool indicators by
        # default, which the uint8 check below would silently miss.
        self.df = pd.get_dummies(self.df, columns=features, dtype='uint8')

        feats = [col for col in self.df.columns if CFG.target not in col]
        for feat in feats:
            if self.df[feat].dtype == 'uint8':
                new_features.append(feat)

        return self.df, new_features

In [None]:
%%time

# Wrap the stacked frame and add the engineered columns. get_features writes
# into the frame it was given and returns that same object.
fe     = feature_engineering(all_df)
all_df = fe.get_features()

In [None]:
# Check the frame's size and first rows after feature engineering.
print(all_df.shape)
all_df.head()

In [None]:
# Every column whose name does not contain the target name is a candidate input.
features = [col for col in all_df.columns if CFG.target not in col]

# float64 columns are treated as numeric, everything else as categorical.
num_features = [name for name in features if all_df[name].dtype == float]
cat_features = [name for name in features if all_df[name].dtype != float]

# The row identifier and the raw f_27 string are not model inputs.
for excluded in ('id', 'f_27'):
    cat_features.remove(excluded)

# Scaling and encoding

In [None]:
# Label-encode any object-dtype columns among the categorical features; the
# returned list holds the column names to use as inputs afterwards.
all_df, cat_features = fe.label_encoding(cat_features)

# Standardise the numeric columns in place. The scaler is fitted on every row
# held by fe, i.e. train and test rows together.
all_df       = fe.scaling(num_features)
# Final model input order: categorical columns first, then numeric ones.
all_features = cat_features + num_features

In [None]:
# Split the stacked frame back into its two parts; the train rows come first.
train_len = train.shape[0]
# .copy() gives train its own data. Prediction columns are written into it
# with .loc later on, and doing that on a bare slice of all_df is a
# chained-assignment hazard (SettingWithCopyWarning).
train     = all_df[:train_len].copy()
# reset_index already returns a new frame, so no extra copy is needed here.
test      = all_df[train_len:].reset_index(drop=True)

In [None]:
# Preview the final model input columns for the train and test splits.
display(train[all_features])
display(test[all_features])

# TabNet

In [None]:
# Stratified K-fold training of TabNet.
# Each fold fits a fresh model, stores its out-of-fold probabilities in
# train[CFG.tab_pred] and its test probabilities in test[f"{CFG.tab_pred}_{fold}"].
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)

# The feature and label arrays are the same for every fold, so build them once
# instead of re-materialising the full matrices inside the loop.
X_full = train[all_features].to_numpy()
y_full = train[CFG.target].to_numpy()
X_test = test[all_features].to_numpy()

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=X_full, y=y_full)):
    X_train, y_train = X_full[trn_idx], y_full[trn_idx]
    X_valid, y_valid = X_full[val_idx], y_full[val_idx]

    print(f"===== FOLD {fold} =====")

    # Rebuilt per fold so every model receives its own optimizer/scheduler
    # parameter dicts.
    tabnet_params = dict(
        n_d=64,
        n_steps=5,
        gamma=1.3,
        n_independent=3,
        n_shared=3,
        seed=CFG.seed,
        momentum=2e-2,
        lambda_sparse=1e-6,

        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(
            lr=1e-2,
            weight_decay=1e-7
        ),

        # mode='max': the monitored quantity is expected to be the validation
        # AUC (higher is better) - NOTE(review): confirm against the installed
        # pytorch-tabnet version.
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        scheduler_params=dict(
            mode='max',
            factor=0.9,
            patience=3,
            min_lr=1e-6,
        ),
        verbose=10,
        device_name='auto',
        mask_type='sparsemax',
    )

    # Defining TabNet model
    model = TabNetClassifier(**tabnet_params)

    model.fit(
        X_train=X_train,
        y_train=y_train,
        from_unsupervised=None,
        # The last eval set ("valid") is listed last on purpose: presumably it
        # is the one early stopping monitors - verify against the library docs.
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=["train", "valid"],
        eval_metric=["auc"],
        batch_size=2048,
        virtual_batch_size=2048,
        max_epochs=200,
        drop_last=True,
        pin_memory=True,
        patience=20,
        num_workers=4,
    )

    # Out-of-fold probabilities (last predict_proba column), predicted once
    # and reused for both storage and the fold score.
    valid_pred = model.predict_proba(X_valid)[:, -1]
    train.loc[val_idx, CFG.tab_pred] = valid_pred
    print(f"auc score: {roc_auc_score(y_true=y_valid, y_score=valid_pred):.6f}\n")

    test[f'{CFG.tab_pred}_{fold}'] = model.predict_proba(X_test)[:, -1]

# Overall out-of-fold AUC across all folds.
print(f"auc score : {roc_auc_score(y_true=train[CFG.target], y_score=train[CFG.tab_pred]):.6f}")

In [None]:
# Collect the per-fold test prediction columns (names containing CFG.tab_pred).
cols = [name for name in test.columns if CFG.tab_pred in name]

# The submitted probability is the row-wise mean over the folds.
submission[CFG.target] = test.loc[:, cols].mean(axis="columns")
submission.to_csv("submission.csv", index=False)
submission

In [None]:
# # Function to obtain the activation function
# def get_activation(activation_name):
#     if activation_name == 'Relu':
#         activation = F.relu
#     elif activation_name == 'ELU':
#         activation = F.elu
#     else:
#         activation = F.leaky_relu
#     return activation

# # Function to get optimize method
# def get_optimizer(model, optimizer_name, lr, weight_decay):
#     if optimizer_name == 'MomentumSGD': 
#         optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
#     elif optimizer_name == 'Adam':
#         optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
#     elif optimizer_name == 'Adagrad':
#         optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, weight_decay=weight_decay)      
#     else:
#         optimizer = torch.optim.RMSprop(model.parameters())
#     return optimizer

# # Function to train Neural Network
# def train(model, train_dataloader, optimizer):
#     # check whether GPU is available
#     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     # Define the error function
#     criterion = nn.MSELoss()
#     # Model in training mode
#     model.train()
#     # If the network is somewhat fixed, make it faster
#     torch.backends.cudnn.benchark = True
#     # epoch loss
#     epoch_loss = 0
#     iteration = 0
#     for batch, (data, target) in enumerate(train_dataloader):
#         data, target = data.to(device), target.to(device)
#         optimizer.zero_grad()
#         output = model(data)
#         output = output.view(1, -1)[0]
#         # print(output.shape, target.shape)
#         target = target.to(torch.float32)
#         loss   = criterion(output, target)
#         epoch_loss += loss.item()
#         loss.backward()
#         optimizer.step()
#         iteration += 1
#     epoch_loss /= iteration
#     return epoch_loss

# # Function for prediction
# def predict(model, dataloader):
#     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#     model.eval()
#     model.to(device)
#     y_pred = np.array([])
#     with torch.no_grad():
#         for data in dataloader:
#             data = data[0]
#             #output = model(data)
#             output = model(data.to(device))
#             output = output.view(1, -1)
#             output = output.to('cpu').detach().numpy().copy()
#             #output = output.to(device)
#             y_pred = np.append(y_pred, output[0])
#         y_pred = np.array(y_pred)
#     return y_pred

# # Function for plot loss function of each epoch
# def loss_plot(logs_train, logs_valid):
#     plt.plot(logs_train[0][1:], logs_train[1][1:], '-b', label='train')
#     plt.plot(logs_valid[0][1:], logs_valid[1][1:], '-r', label='test')
#     plt.xlabel('epoch')
#     plt.ylabel('loss')
#     plt.legend()
#     plt.show()

In [None]:
# params = {'num_layer': 2, 
#           'num_nodes_0': 24, 
#           'num_nodes_1': 12, 
#           'dropout_rate': 0.5, 
#           'activation': 'leaky_relu', 
#           'optimizer': 'Adam', 
#           'weight_decay': 1e-10, 
#           'Adam_lr': 0.001}

In [None]:
# from tqdm import tqdm
# ### parameters
# k_split = 10
# num_epochs = 100
# batch_size = 64
# ###

# # k-fold cross-validation
# kfold = StratifiedKFold(n_splits=k_split,random_state=1, shuffle=True).split(X_train_std, y_train)
# #### get parameter
# num_layer       = params['num_layer']
# num_nodes       = [int(params[s]) for s in params.keys() if 'num_nodes' in s]
# dropout_rate    = params['dropout_rate']
# activation_name = params['activation']
# optimizer_name  = params['optimizer']
# lr              = params[optimizer_name+'_lr']
# weight_decay    = params['weight_decay']
# ######

# scores = []   # list to save score 
# models = []   # list to save model
# for k, (train_id, test_id) in enumerate(kfold):
#     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#     # Instantiate Model
#     model = Net(input_size=X_train_std.shape[1],
#                 num_layer=num_layer, 
#                 num_nodes=num_nodes, 
#                 dropout_rate=dropout_rate, 
#                 activation_name=activation_name)
#     # model to GPU
#     model.to(device)
#     optimizer = get_optimizer(model, optimizer_name, lr, weight_decay)
#     # data to dataloader
#     dataset          = torch.utils.data.TensorDataset(torch.Tensor(X_train_std[train_id]), 
#                                              torch.tensor(y_train[train_id]))
#     train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
#     valid_dataset    = torch.utils.data.TensorDataset(torch.Tensor(X_train_std[test_id]))
#     valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)
#     # training each epoch
#     logs_train = [[0], [np.inf]]
#     logs_valid = [[0], [np.inf]]
#     for epoch in tqdm(range(num_epochs)):
#         epoch_loss = train(model, train_dataloader, optimizer)
#         valid_pred = predict(model, valid_dataloader)
#         valid_loss = mean_squared_error(y_train[test_id], valid_pred)
#         if epoch_loss < min(logs_valid[1]):
#             torch.save(model.state_dict(), './models'+str(k))
#         logs_train[0].append(epoch+1)
#         logs_train[1].append(epoch_loss)
#         logs_valid[0].append(epoch+1)
#         logs_valid[1].append(valid_loss)   
#     # valid
#     model.load_state_dict(torch.load('./models'+str(k)))
#     pred_y_k = predict(model, valid_dataloader)
#     # score
#     score = roc_auc_score(y_train[test_id], pred_y_k)
#     print('Fold: %2d, AUC: %.3f' % (k+1, score))
#     scores.append(score)
#     models.append(model)
#     loss_plot(logs_train, logs_valid)
# print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

# Evaluation with test data
* Define a function that predicts with each of the models created by the k-fold cross-validation and averages their results.

In [None]:
# def predict_kfold(models, X_test):
#     # Create array for storing test data
#     y_pred = np.zeros((len(X_test), len(models)))
#     # Create dataloader
#     test_dataset = torch.utils.data.TensorDataset(torch.Tensor(X_test))
#     test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=512)
#     for fold_, model_ in enumerate(models):
#         model_.load_state_dict(torch.load('./models'+str(fold_)))
#         # predict
#         pred_ = predict(model_, test_dataloader)
#         # store
#         y_pred[:, fold_] = pred_ 
#     y_pred = y_pred.mean(axis=1)
#     return y_pred
# y_pred = predict_kfold(models, X_test_std)

In [None]:
# # calculate AUC and ROC curves and evaluate performance on test data
# roc = roc_curve(y_test, y_pred)
# print("roc", roc_auc_score(y_test, y_pred))
# fpr, tpr, thresholds = roc
# plt.plot(fpr, tpr, marker='o')
# plt.xlabel('FPR: False positive rate')
# plt.ylabel('TPR: True positive rate')
# plt.grid()

In [None]:
# # Try the same calculations on training data
# y_pred_train = predict_kfold(models, X_train_std)
# roc = roc_curve(y_train, y_pred_train)
# print("roc", roc_auc_score(y_train, y_pred_train))
# fpr, tpr, thresholds = roc
# plt.plot(fpr, tpr, marker='o')
# plt.xlabel('FPR: False positive rate')
# plt.ylabel('TPR: True positive rate')
# plt.grid()

# Submit data
* Apply the models to `test_df` and create the submission data

In [None]:
# X_submit = test_df.values
# X_submit_std = stdsc.transform(X_submit)
# y_submit = predict_kfold(models, X_submit_std)
# print(y_submit)
# print(y_submit.shape)
# plt.hist(y_submit, bins=30, density=True)
# plt.show()

In [None]:
# submission_df = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')
# print(submission_df.shape)
# submission_df.head()

In [None]:
# submission_df['target'] = pd.DataFrame(y_submit)
# submission_df.head()

In [None]:
# submission_df.to_csv("submission.csv", index=False, header=True)

# Hyperparameter Tuning
## Hyperparameter tuning with the `optuna` library

* First, split the training data into a set used for training and a set used for tuning, and then **standardize** both.

In [None]:
# valid_size = 0.1
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=valid_size,
#                                                       stratify=y_train)
# print(X_train.shape, X_valid.shape)
# print(y_train.shape, y_valid.shape)

# stdsc = StandardScaler()
# X_train_std   = stdsc.fit_transform(X_train)
# X_valid_std   = stdsc.transform(X_valid)
# train_dataset = torch.utils.data.TensorDataset(torch.Tensor(X_train_std), torch.tensor(y_train))
# valid_dataset = torch.utils.data.TensorDataset(torch.Tensor(X_valid_std), torch.tensor(y_valid))
# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64)
# valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=64)

**Tuning is performed below using optuna. Here, we define trial and explore each parameter.**

In [None]:
# # define the class
# class Net(nn.Module):
#     def __init__(self, trial, input_size, num_layer, num_nodes, dropout_rate):
#         super(Net, self).__init__()
#         self.activation = get_activation(trial)
#         self.linears = nn.ModuleList([nn.Linear(input_size, num_nodes[0])])
#         self.batchnorms = nn.ModuleList([nn.BatchNorm1d(num_nodes[0])])
#         for i in range(1, num_layer):
#             self.linears.append(nn.Linear(num_nodes[i-1], num_nodes[i]))
#             self.batchnorms.append(nn.BatchNorm1d(num_nodes[i]))
#         self.fcl = nn.Linear(num_nodes[-1], 1)
#         self.dropout = nn.Dropout(dropout_rate)

#     def forward(self, x):
#         for i, d in enumerate(zip(self.linears, self.batchnorms)):
#             l, b = d[0], d[1]
#             x = b(self.activation(l(x)))
#             x = self.dropout(x)
#         x = torch.sigmoid(self.fcl(x))
#         return x

In [None]:
# def train(model, device, train_dataloader, optimizer):
#     model.train()
#     criterion = nn.MSELoss()
#     for batch, (data, target) in enumerate(train_dataloader):
#         data, target = data.to(device), target.to(device)
#         optimizer.zero_grad()
#         output = model(data)
#         output = output.view(1, -1)[0]
#         # print(output.shape, target.shape)
#         target = target.to(torch.float32)
#         loss = criterion(output, target)
#         loss.backward()
#         optimizer.step()
# def test(model, device, valid_dataloader):
#     model.eval()
#     criterion = nn.MSELoss()
#     loss = 0
#     iteration = 0
#     with torch.no_grad():
#         for data, target in valid_dataloader:
#             data, target = data.to(device), target.to(device)
#             output = model(data)
#             output = output.view(1, -1)[0]
#             loss += criterion(output, target)
#             iteration += 1
#     loss /= iteration
#     return loss

In [None]:
# def get_optimizer(trial, model):
#     optimizer_names = ['MomentumSGD', 'Adam', 'Adagrad']
#     optimizer_name = trial.suggest_categorical('optimizer', optimizer_names)
#     weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
#     if optimizer_name == optimizer_names[0]: 
#         momentum_sgd_lr = trial.suggest_loguniform('Momentum_SGD_lr', 1e-5, 1e-1)
#         optimizer = torch.optim.SGD(model.parameters(), lr=momentum_sgd_lr, momentum=0.9, weight_decay=weight_decay)
#     elif optimizer_name == optimizer_names[1]:
#         adam_lr = trial.suggest_loguniform('Adam_lr', 1e-5, 1e-1)
#         optimizer = torch.optim.Adam(model.parameters(), lr=adam_lr, weight_decay=weight_decay)
#     elif optimizer_name == optimizer_names[2]:
#         adagrad_lr = trial.suggest_loguniform('Adagrad_lr', 1e-5, 1e-1)
#         optimizer = torch.optim.Adagrad(model.parameters(), lr=adagrad_lr, weight_decay=weight_decay)      
#     return optimizer

In [None]:
# def get_activation(trial):
#     activation_names = ['ReLU', 'ELU', 'leaky_relu']
#     activation_name = trial.suggest_categorical('activation', activation_names)
#     if activation_name == activation_names[0]:
#         activation = F.relu
#     elif activation_name == activation_names[1]:
#         activation = F.elu
#     else:
#         activation = F.leaky_relu
#     return activation

In [None]:
# epochs = 30
# def objective(trial):
#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
#     # hidden layer
#     num_layer = trial.suggest_int('num_layer', 2, 7)
#     # the number of nodes
#     num_nodes = [int(trial.suggest_discrete_uniform('num_nodes_'+str(i), 16, 128, 16)) for i in range(num_layer)]
#     # dropout ratio
#     dropout_rate = trial.suggest_float('dropout_rate', 0.0, 1.0)

#     model = Net(trial, X_train.shape[1],num_layer, num_nodes, dropout_rate).to(device)
#     optimizer = get_optimizer(trial, model)
#     error_rate = 0
#     for epoch in range(epochs):
#         train(model, device, train_dataloader, optimizer)
#     error_rate = test(model, device, valid_dataloader)
#     return error_rate

In [None]:
# import optuna
# TRIAL_SIZE = 100
# study = optuna.create_study()
# study.optimize(objective, n_trials=TRIAL_SIZE)
# best_params = study.best_params
# print(best_params)

In [None]:
# # params = best_params
# params={'num_layer': 4, 'num_nodes_0': 128.0, 'num_nodes_1': 112.0, 'num_nodes_2': 96.0,
#         'num_nodes_3': 96.0, 'dropout_rate': 0.08387843261849516, 'activation': 'ReLU',
#         'optimizer': 'Adam', 'weight_decay': 2.227219890291524e-09, 'Adam_lr': 0.0019802197708342255}