# Installing necessary requirements

In [None]:
# notebook shell command: installs the pytorch-tabnet package, which provides the
# pytorch_tabnet modules imported further down
!pip install pytorch-tabnet

# TabNet Architecture

<img src= "https://miro.medium.com/max/788/1*twB1nZHPN5Cuxu2h_jpEPg.png" alt ="TabNet" style='width: 1000px;'>

# Loading packages and modules

In [None]:
# to read and manipulate data
import pandas as pd
import numpy as np
from collections import defaultdict
pd.options.display.max_columns = None

# to preprocess the data
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# to model the data
from sklearn.metrics import log_loss, accuracy_score, roc_curve
from sklearn.model_selection import StratifiedKFold
import torch
from torch import nn
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer

# to set the environment
import os
import sys
import random
from tqdm.notebook import tqdm

# for hyperparameter optimization
import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances

# Utility functions

In [None]:
# function to unpack train hyperparameters
def unpack_train_hyperpars(hyperparameters):
    """Split a nested search-space dict into TabNet constructor arguments.

    Reads hyperparameters['hyp_shared'] and hyperparameters['hyp_supervised'],
    plus the module-level cat_indexes, cat_dimensions and
    cat_embedding_dimensions.

    Returns a tuple (shared_hyperparameters, train_hyperparameters): the first
    dict holds the arguments shared by the pretrainer and the classifier, the
    second one the arguments that are specific to the classifier.
    """
    shared_cfg = hyperparameters['hyp_shared']

    # architecture settings that are copied over under the same name
    copied_keys = ('n_d', 'n_a', 'n_steps', 'n_independent', 'n_shared', 'mask_type', 'lambda_sparse')
    shared_hyperparameters = {key: shared_cfg[key] for key in copied_keys}

    # categorical embeddings are doubled in size when cat_dim_big is set
    if shared_cfg['cat_dim_big']:
        embedding_dims = [dim * 2 for dim in cat_embedding_dimensions]
    else:
        embedding_dims = cat_embedding_dimensions

    # the optimizer is chosen by the use_adam flag; the scheduler is fixed
    if shared_cfg['use_adam']:
        optimizer_class = torch.optim.Adam
    else:
        optimizer_class = torch.optim.RMSprop

    shared_hyperparameters.update(cat_idxs = cat_indexes,
                                  cat_dims = cat_dimensions,
                                  cat_emb_dim = embedding_dims,
                                  optimizer_fn = optimizer_class,
                                  scheduler_fn = torch.optim.lr_scheduler.ReduceLROnPlateau,
                                  verbose = 1)

    # classifier-only settings
    supervised_cfg = hyperparameters['hyp_supervised']
    train_hyperparameters = {
        'gamma': supervised_cfg['gamma'],
        'momentum': supervised_cfg['momentum'],
        'optimizer_params': {'lr': supervised_cfg['lr'],
                             'weight_decay': supervised_cfg['weight_decay']},
        'scheduler_params': {'mode': 'min', 'factor': 0.5, 'patience': 5, 'min_lr': 1e-8},
    }

    return shared_hyperparameters, train_hyperparameters
        
                                             
# function to unpack the hyperparameters for the pretrainer       
def unpack_unsup_hyperpars(hyperparameters, shared_hyperparameters):
    """Build the constructor arguments for the TabNet pretrainer.

    Reads the learning rate and weight decay from
    hyperparameters['hyp_pretrainer'] and merges in shared_hyperparameters.
    On a key clash the shared value wins. The inputs are not modified.
    """
    pretrainer_cfg = hyperparameters['hyp_pretrainer']

    optimizer_settings = {'lr': pretrainer_cfg['lr_pretrainer'],
                          'weight_decay': pretrainer_cfg['weight_decay_pretrainer']}
    scheduler_settings = {'mode': "min", 'factor': 0.5, 'patience': 5, 'min_lr': 1e-8}

    # pretrainer-specific entries first, shared entries appended afterwards
    return {'optimizer_params': optimizer_settings,
            'scheduler_params': scheduler_settings,
            **shared_hyperparameters}
            

# function to instantiate the pretrainer with a set of hyperparameters
def instantiate_pretrainer(pretrainer_hyperparameters):
    """Return a new TabNetPretrainer built from the given keyword arguments."""
    return TabNetPretrainer(**pretrainer_hyperparameters)


# function to train the pretrainer
def fit_pretrainer(model, X, val_set, pretrain_ratio, bs, vbs):
    """Train the unsupervised TabNet pretrainer and return it.

    Parameters:
        model: object exposing a fit method that accepts the keyword arguments used below.
        X: training inputs, passed on as X_train.
        val_set: validation inputs; wrapped in a one-element list for eval_set.
        pretrain_ratio: value passed on as pretraining_ratio.
        bs: batch size.
        vbs: divisor applied to bs to obtain the virtual batch size.

    Returns:
        The same model instance after fit has been called on it.
    """
    # the builtin int replaces np.int, an alias that was removed in NumPy 1.24
    virtual_batch_size = int(bs / vbs)

    # training runs for at most 100 epochs with an early-stopping patience of 5
    model.fit(X_train = X,
              eval_set = [val_set],
              pretraining_ratio = pretrain_ratio,
              max_epochs = 100,
              patience = 5,
              batch_size = bs,
              virtual_batch_size = virtual_batch_size,
              drop_last = True)
    # returning the trained pretrainer
    return model

# function to instantiate the supervised classifier with a set of hyperparameters
def instantiate_supervised(supervised_hyperparameters):
    """Return a new TabNetClassifier built from the given keyword arguments."""
    return TabNetClassifier(**supervised_hyperparameters)


def fit_supervised(model, X, y, val_set, val_name, bs, vbs, use_pretrain):
    """Train the supervised TabNet classifier and return it.

    Parameters:
        model: object exposing a fit method that accepts the keyword arguments used below.
        X: training inputs, passed on as X_train.
        y: training targets, passed on as y_train.
        val_set: value passed on unchanged as eval_set.
        val_name: value passed on unchanged as eval_name.
        bs: batch size.
        vbs: divisor applied to bs to obtain the virtual batch size.
        use_pretrain: value passed on as from_unsupervised (the callers in this
            file pass a trained pretrainer or None).

    Returns:
        The same model instance after fit has been called on it.
    """
    # the builtin int replaces np.int, an alias that was removed in NumPy 1.24
    virtual_batch_size = int(bs / vbs)

    # training runs for at most 100 epochs with an early-stopping patience of 5,
    # monitoring the log loss on the evaluation sets
    model.fit(X_train = X,
              y_train = y,
              eval_set = val_set,
              eval_name = val_name,
              eval_metric = ['logloss'],
              max_epochs = 100,
              patience = 5,
              batch_size = bs,
              virtual_batch_size = virtual_batch_size,
              from_unsupervised = use_pretrain
             )
    # returning the trained classifier
    return model


# function to extract the best probability threshold according to the roc curve
def extract_roc_threshold(y_obs, probs):
    """Return the ROC threshold with the highest geometric mean of TPR and (1 - FPR)."""
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true = y_obs, y_score = probs)
    # geometric mean of sensitivity and specificity at every candidate threshold
    geometric_means = np.sqrt(true_pos_rate * (1 - false_pos_rate))
    # position of the best trade-off, used to look up the matching threshold
    return cutoffs[np.argmax(geometric_means)]


# function to encode categories for a selected probability threshold
def move_threshold(probabilities, threshold):
    """Return 1 where probabilities >= threshold and 0 elsewhere, as integers."""
    at_or_above_cutoff = probabilities >= threshold
    return at_or_above_cutoff.astype('int')


# function to evaluate a trained model
def score_model(model, X_eval, y_eval, scoring_method):
    """Score a trained model and return (accuracy, threshold used).

    With scoring_method == 'traditional' the labels come from model.predict
    and the reported threshold is 0.5. Any other value derives the threshold
    from the ROC curve and applies it to the positive-class probabilities.
    Log loss is not computed.
    """
    # predicted probability of the positive class (second column)
    positive_probabilities = model.predict_proba(X_eval)[:, 1]

    if scoring_method != 'traditional':
        # threshold taken from the ROC curve, then applied to the probabilities
        cutoff = extract_roc_threshold(y_obs = y_eval, probs = positive_probabilities)
        labels = move_threshold(probabilities = positive_probabilities, threshold = cutoff)
    else:
        # default decision rule of the model
        cutoff = 0.5
        labels = model.predict(X_eval)

    accuracy = accuracy_score(y_true = y_eval, y_pred = labels)
    return accuracy, cutoff


# function to run the stratified kfold with the selected hyperparameters
def evaluate_stratifield_kfold(X, y, kfolds, hyperparameters, should_pretrain):
    """Cross-validate one hyperparameter combination with a stratified k-fold.

    For every fold the preprocessing pipeline is fitted on the training part,
    an optional TabNet pretrainer is trained, and a TabNet classifier is fitted
    and scored on both the training and the validation part.

    Parameters:
        X: inputs; must support .iloc (the callers in this file pass a DataFrame).
        y: targets, indexable with integer index arrays.
        kfolds: number of folds.
        hyperparameters: nested dict with the keys 'hyp_shared', 'hyp_supervised',
            'scoring', 'which_submission' and, when should_pretrain is true,
            'hyp_pretrainer'.
        should_pretrain: whether the unsupervised pretrainer is trained first.

    Returns:
        DataFrame with the columns 'train_accuracy' and 'val_accuracy', one row per fold.

    Reads the module-level X_test_df, y_lr_submission and y_tabnet_submission
    when 'which_submission' asks for pseudo-labelled test rows.
    """
    print(f'\n------------- Initializing the Stratified {kfolds}-fold evaluation -------------')
    # list of (train accuracy, validation accuracy) tuples, one per fold
    fold_scores = []

    # printing the hyperparameters under evaluation
    print('Evaluating the following hyperparameters:')
    print(hyperparameters)

    # batch settings do not change between folds; the builtin int replaces
    # np.int, an alias that was removed in NumPy 1.24
    batch_size = int(hyperparameters['hyp_shared']['batch_size'])
    virtual_bs_ratio = int(hyperparameters['hyp_shared']['virtual_bs_ratio'])

    # deciding whether past submissions should be used and which ones:
    # 40% of the test rows are sampled without replacement and labelled with
    # the predictions of the selected earlier submission
    if hyperparameters['which_submission'] == 'lr': # logistic regression submission labels
        test_idx_leak = np.random.choice(a = range(X_test_df.shape[0]), size = int(X_test_df.shape[0] * 0.4), replace = False)
        X_leak, y_leak = X_test_df.iloc[test_idx_leak], y_lr_submission[test_idx_leak]
    elif hyperparameters['which_submission'] == 'tabnet': # tabnet submission labels
        test_idx_leak = np.random.choice(a = range(X_test_df.shape[0]), size = int(X_test_df.shape[0] * 0.4), replace = False)
        X_leak, y_leak = X_test_df.iloc[test_idx_leak], y_tabnet_submission[test_idx_leak]

    # instantiating the stratified k-fold
    skf = StratifiedKFold(n_splits = kfolds, random_state = 42, shuffle = True)

    # unpacking the stratified k-fold
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):

        print(f'\n------------------ Starting fold {fold + 1} ------------------')

        # separating the train from the validation fold
        X_train_fold, X_val_fold, y_train_fold, y_val_fold = X.iloc[train_idx], X.iloc[test_idx], y[train_idx], y[test_idx]

        # merging the sampled test rows into the train set if this approach should be used
        if hyperparameters['which_submission'] is not None:
            # merging and replacing the fold objects
            X_train_fold, y_train_fold = pd.concat([X_train_fold, X_leak]), np.hstack((y_train_fold, y_leak))
            # shuffling the sequence of observations
            shuffled_indices = np.random.choice(a = range(X_train_fold.shape[0]), size = X_train_fold.shape[0], replace = False)
            # rearranging x and y
            X_train_fold, y_train_fold = X_train_fold.iloc[shuffled_indices], y_train_fold[shuffled_indices]

        # instantiating a pipeline to transform the data: mean-impute column 0,
        # median-impute column 1, then min-max scale columns 0 to 4
        # NOTE(review): the column positions refer to the output of the previous
        # step, not to the original frame - confirm they target the intended features
        pipeline = Pipeline(steps = [('age_imputation', ColumnTransformer([('impute_age', SimpleImputer(strategy = 'mean'), [0])], remainder = 'passthrough')),
                                     ('fare_imputation', ColumnTransformer([('impute_fare', SimpleImputer(strategy = 'median'), [1])], remainder = 'passthrough')),
                                     ('scaler', ColumnTransformer([('mmscaler', MinMaxScaler(), [0, 1, 2, 3, 4])], remainder = 'passthrough'))
                                    ])

        # fitting the pipeline on the training part only
        pipeline.fit(X_train_fold)

        # applying the fitted pipeline to both parts
        X_train_fold = pipeline.transform(X_train_fold)
        X_val_fold = pipeline.transform(X_val_fold)

        # unpacking the hyperparameters used for training
        shared_hyper, train_hyper = unpack_train_hyperpars(hyperparameters)

        # setting the hyperparameters for training
        train_hyper.update(shared_hyper)

        if should_pretrain:
            # unpacking the hyperparameters used for the pretrainer
            unsup_hyperparameters = unpack_unsup_hyperpars(hyperparameters, shared_hyper)

            # instantiating the pretrainer
            pretrainer = instantiate_pretrainer(pretrainer_hyperparameters = unsup_hyperparameters)

            # 'pretrain_on_test' stacks the training and validation parts of this fold
            X_unsup_test = np.vstack((X_train_fold, X_val_fold))
            pretrain_inputs = X_unsup_test if hyperparameters['hyp_pretrainer']['pretrain_on_test'] else X_train_fold

            # training the unsupervised model
            # NOTE(review): the evaluation set is the same data the pretrainer is
            # trained on, so early stopping does not see held-out rows - confirm intent
            pretrainer = fit_pretrainer(model = pretrainer,
                                        X = pretrain_inputs,
                                        val_set = pretrain_inputs,
                                        pretrain_ratio = hyperparameters['hyp_pretrainer']['pretraining_ratio'],
                                        bs = batch_size,
                                        vbs = virtual_bs_ratio)

        # instantiate the supervised classifier
        clf = instantiate_supervised(supervised_hyperparameters = train_hyper)

        # fit the supervised classifier
        clf = fit_supervised(model = clf,
                             X = X_train_fold, y = y_train_fold,
                             val_set = [(X_val_fold, y_val_fold)],
                             val_name = ['validation'],
                             bs = batch_size,
                             vbs = virtual_bs_ratio,
                             use_pretrain = pretrainer if should_pretrain else None)

        # extracting the train score
        train_acc, _ = score_model(model = clf, X_eval = X_train_fold, y_eval = y_train_fold, scoring_method = hyperparameters['scoring'])

        # extracting the validation score
        val_acc, _ = score_model(model = clf, X_eval = X_val_fold, y_eval = y_val_fold, scoring_method = hyperparameters['scoring'])

        # appending both accuracies to the list
        fold_scores.append((train_acc, val_acc))

        # printing the results of this fold
        print(f"> Fold {fold + 1}: Training accuracy: {train_acc} | Validation accuracy: {val_acc}")

    # structuring the training and validation scores in a dataframe
    scores = pd.DataFrame(fold_scores, columns=['train_accuracy', 'val_accuracy'])

    # printing average scores for this run
    print(f"End of Run!\nTraining loss: Training accuracy: {scores['train_accuracy'].mean()} | Validation accuracy: {scores['val_accuracy'].mean()}")

    # returning the scores
    return scores


 
# objective function for optuna to evaluate
def objective(trial):
    """Optuna objective: mean validation accuracy of one sampled configuration.

    Samples the classifier, shared and (optionally) pretrainer hyperparameters
    from the trial, evaluates them with evaluate_stratifield_kfold on the
    module-level X, y and kfolds, and returns the mean of 'val_accuracy'.
    """
    # classifier-only hyperparameters
    supervised_space = {'lr': trial.suggest_float('lr', 0.001, 2e-2, log = True),
                        'weight_decay': trial.suggest_float('weight_decay', 0.0, 1e-3),
                        'gamma': trial.suggest_float('gamma', 1.3, 1.6),
                        'momentum': trial.suggest_float('momentum', 0.02, 0.1)}

    # hyperparameters shared by the pretrainer and the classifier
    shared_space = {'n_d': trial.suggest_int('n_d', 3, 14),
                    'n_a': trial.suggest_int('n_a', 3, 14),
                    'n_steps': trial.suggest_int('n_steps', 2, 6),
                    'n_independent': trial.suggest_int('n_independent', 1, 4),
                    'n_shared': trial.suggest_int('n_shared', 1, 4),
                    'mask_type': trial.suggest_categorical('mask_type', ['entmax', 'sparsemax']),
                    'lambda_sparse': trial.suggest_float('lambda_sparse', 0.0001, 0.01, log = True),
                    'batch_size': trial.suggest_categorical('batch_size', [1024, 2048, 4096]),
                    'virtual_bs_ratio': trial.suggest_categorical('virtual_bs_ratio', [2, 4, 8]),
                    'cat_dim_big': trial.suggest_categorical('cat_dim_big', [True, False]),
                    'use_adam': trial.suggest_categorical('use_adam', [True, False])}

    # full search space, including the scoring rule and the pseudo-label source
    search_space = {'hyp_supervised': supervised_space,
                    'hyp_shared': shared_space,
                    'scoring': trial.suggest_categorical('scoring', ['traditional', 'auc']),
                    'which_submission': trial.suggest_categorical('which_submission', [None, 'lr', 'tabnet'])}

    # the pretrainer settings are only sampled when pretraining is switched on
    pretrain_tabnet = trial.suggest_categorical('pretrain_tabnet', [True, False])
    if pretrain_tabnet:
        search_space['hyp_pretrainer'] = {
            'pretraining_ratio': trial.suggest_float('pretraining_ratio', 0.3, 0.7),
            'lr_pretrainer': trial.suggest_float('lr_pretrainer', 0.001, 2e-2, log = True),
            'weight_decay_pretrainer': trial.suggest_float('weight_decay_pretrainer', 0.0, 1e-3),
            'pretrain_on_test': trial.suggest_categorical('pretrain_on_test', [True, False]),
        }

    # running the stratified k-fold with the sampled configuration
    fold_results = evaluate_stratifield_kfold(X, y, kfolds = kfolds,
                                              hyperparameters = search_space,
                                              should_pretrain = pretrain_tabnet)

    # the value optuna optimizes is the mean validation accuracy
    return fold_results['val_accuracy'].mean()

In [None]:
# function to unpack the final hyperparameters for training
def unpack_final_hyperpars(hyperparameters):
    """Split a flat hyperparameter dict into TabNet constructor arguments.

    Same output layout as unpack_train_hyperpars, but every value is read from
    the top level of hyperparameters. Also reads the module-level cat_indexes,
    cat_dimensions and cat_embedding_dimensions.

    Returns a tuple (shared_hyperparameters, train_hyperparameters).
    """
    # architecture settings that are copied over under the same name
    copied_keys = ('n_d', 'n_a', 'n_steps', 'n_independent', 'n_shared', 'mask_type', 'lambda_sparse')
    shared_hyperparameters = {key: hyperparameters[key] for key in copied_keys}

    # categorical embeddings are doubled in size when cat_dim_big is set
    if hyperparameters['cat_dim_big']:
        embedding_dims = [dim * 2 for dim in cat_embedding_dimensions]
    else:
        embedding_dims = cat_embedding_dimensions

    # the optimizer is chosen by the use_adam flag; the scheduler is fixed
    if hyperparameters['use_adam']:
        optimizer_class = torch.optim.Adam
    else:
        optimizer_class = torch.optim.RMSprop

    shared_hyperparameters.update(cat_idxs = cat_indexes,
                                  cat_dims = cat_dimensions,
                                  cat_emb_dim = embedding_dims,
                                  optimizer_fn = optimizer_class,
                                  scheduler_fn = torch.optim.lr_scheduler.ReduceLROnPlateau,
                                  verbose = 1)

    # classifier-only settings
    train_hyperparameters = {
        'gamma': hyperparameters['gamma'],
        'momentum': hyperparameters['momentum'],
        'optimizer_params': {'lr': hyperparameters['lr'],
                             'weight_decay': hyperparameters['weight_decay']},
        'scheduler_params': {'mode': 'min', 'factor': 0.5, 'patience': 5, 'min_lr': 1e-8},
    }

    return shared_hyperparameters, train_hyperparameters


# function to unpack the final hyperparameters for the pretrainer
def unpack_final_unsup(hyperparameters, shared_hyperparameters):
    """Build the pretrainer constructor arguments from a flat hyperparameter dict.

    Reads 'lr_pretrainer' and 'weight_decay_pretrainer' from hyperparameters
    and merges in shared_hyperparameters. On a key clash the shared value wins.
    The inputs are not modified.
    """
    optimizer_settings = {'lr': hyperparameters['lr_pretrainer'],
                          'weight_decay': hyperparameters['weight_decay_pretrainer']}
    scheduler_settings = {'mode': "min", 'factor': 0.5, 'patience': 5, 'min_lr': 1e-8}

    # pretrainer-specific entries first, shared entries appended afterwards
    return {'optimizer_params': optimizer_settings,
            'scheduler_params': scheduler_settings,
            **shared_hyperparameters}
            
# function to run the model with the best hyperparameters and submit it
def submit_stratifield_kfold(X, y, kfolds, hyperparameters, should_pretrain):
    """Refit the model per fold with the chosen hyperparameters and predict the test set.

    Works like evaluate_stratifield_kfold, but reads a flat hyperparameter dict
    and additionally accumulates the positive-class probabilities predicted for
    the module-level X_test_array.

    Parameters:
        X: inputs; must support .iloc (the callers in this file pass a DataFrame).
        y: targets, indexable with integer index arrays.
        kfolds: number of folds.
        hyperparameters: flat dict of hyperparameters (e.g. the params of a trial).
        should_pretrain: whether the unsupervised pretrainer is trained first.

    Returns:
        Tuple (scores, probas, threshold_list): a DataFrame with 'train_accuracy'
        and 'val_accuracy' per fold, the SUM over folds of the test probabilities
        (the caller divides by the number of folds), and the validation threshold
        of every fold.

    Reads the module-level X_test_df, X_test_array, y_lr_submission and
    y_tabnet_submission.
    """
    print(f'\n------------- Initializing the Stratified {kfolds}-fold evaluation -------------')
    # list of (train accuracy, validation accuracy) tuples, one per fold
    fold_scores = []

    # creating an empty list to store the threshold of each fold
    threshold_list = []

    # creating a numpy array that accumulates the test probabilities over the folds
    probas = np.zeros(len(X_test_array))

    # printing the hyperparameters under evaluation
    print('Evaluating the following hyperparameters:')
    print(hyperparameters)

    # batch settings do not change between folds; the builtin int replaces
    # np.int, an alias that was removed in NumPy 1.24
    batch_size = int(hyperparameters['batch_size'])
    virtual_bs_ratio = int(hyperparameters['virtual_bs_ratio'])

    # instantiating the stratified k-fold
    skf = StratifiedKFold(n_splits = kfolds, random_state = 42, shuffle = True)

    # deciding whether past submissions should be used and which ones:
    # 40% of the test rows are sampled without replacement and labelled with
    # the predictions of the selected earlier submission
    if hyperparameters['which_submission'] == 'lr': # logistic regression submission labels
        test_idx_leak = np.random.choice(a = range(X_test_df.shape[0]), size = int(X_test_df.shape[0] * 0.4), replace = False)
        X_leak, y_leak = X_test_df.iloc[test_idx_leak], y_lr_submission[test_idx_leak]
    elif hyperparameters['which_submission'] == 'tabnet': # tabnet submission labels
        test_idx_leak = np.random.choice(a = range(X_test_df.shape[0]), size = int(X_test_df.shape[0] * 0.4), replace = False)
        X_leak, y_leak = X_test_df.iloc[test_idx_leak], y_tabnet_submission[test_idx_leak]

    # unpacking the stratified k-fold
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):

        print(f'\n------------------ Starting fold {fold + 1} ------------------')

        # separating the train from the validation fold
        X_train_fold, X_val_fold, y_train_fold, y_val_fold = X.iloc[train_idx], X.iloc[test_idx], y[train_idx], y[test_idx]

        # merging the sampled test rows into the train set if this approach should be used
        if hyperparameters['which_submission'] is not None:
            # merging and replacing the fold objects
            X_train_fold, y_train_fold = pd.concat([X_train_fold, X_leak]), np.hstack((y_train_fold, y_leak))
            # shuffling the sequence of observations
            shuffled_indices = np.random.choice(a = range(X_train_fold.shape[0]), size = X_train_fold.shape[0], replace = False)
            # rearranging x and y
            X_train_fold, y_train_fold = X_train_fold.iloc[shuffled_indices], y_train_fold[shuffled_indices]

        # instantiating a pipeline to transform the data: mean-impute column 0,
        # median-impute column 1, then min-max scale columns 0 to 4
        # NOTE(review): the column positions refer to the output of the previous
        # step, not to the original frame - confirm they target the intended features
        pipeline = Pipeline(steps = [('age_imputation', ColumnTransformer([('impute_age', SimpleImputer(strategy = 'mean'), [0])], remainder = 'passthrough')),
                                     ('fare_imputation', ColumnTransformer([('impute_fare', SimpleImputer(strategy = 'median'), [1])], remainder = 'passthrough')),
                                     ('scaler', ColumnTransformer([('mmscaler', MinMaxScaler(), [0, 1, 2, 3, 4])], remainder = 'passthrough'))
                                    ])

        # fitting the pipeline on the training part only
        pipeline.fit(X_train_fold)

        # applying the fitted pipeline to the train, validation and test data
        X_train_fold = pipeline.transform(X_train_fold)
        X_val_fold = pipeline.transform(X_val_fold)
        X_test_fold = pipeline.transform(X_test_array)

        # unpacking the hyperparameters used for training
        shared_hyper, train_hyper = unpack_final_hyperpars(hyperparameters)

        # setting the hyperparameters for training
        train_hyper.update(shared_hyper)

        if should_pretrain:
            # unpacking the hyperparameters used for the pretrainer
            unsup_hyperparameters = unpack_final_unsup(hyperparameters, shared_hyper)

            # instantiating the pretrainer
            pretrainer = instantiate_pretrainer(pretrainer_hyperparameters = unsup_hyperparameters)

            # 'pretrain_on_test' stacks the training and validation parts of this fold
            X_unsup_test = np.vstack((X_train_fold, X_val_fold))
            pretrain_inputs = X_unsup_test if hyperparameters['pretrain_on_test'] else X_train_fold

            # training the unsupervised model
            # NOTE(review): the evaluation set is the same data the pretrainer is
            # trained on, so early stopping does not see held-out rows - confirm intent
            pretrainer = fit_pretrainer(model = pretrainer,
                                        X = pretrain_inputs,
                                        val_set = pretrain_inputs,
                                        pretrain_ratio = hyperparameters['pretraining_ratio'],
                                        bs = batch_size,
                                        vbs = virtual_bs_ratio)

        # instantiate the supervised classifier
        clf = instantiate_supervised(supervised_hyperparameters = train_hyper)

        # fitting the model to the data
        clf = fit_supervised(model = clf,
                             X = X_train_fold, y = y_train_fold,
                             val_set = [(X_val_fold, y_val_fold)],
                             val_name = ['validation'],
                             bs = batch_size,
                             vbs = virtual_bs_ratio,
                             use_pretrain = pretrainer if should_pretrain else None)

        # extracting the train score
        train_acc, _ = score_model(model = clf, X_eval = X_train_fold, y_eval = y_train_fold, scoring_method = hyperparameters['scoring'])

        # extracting the validation score and the threshold it was obtained with
        val_acc, val_threshold = score_model(model = clf, X_eval = X_val_fold, y_eval = y_val_fold, scoring_method = hyperparameters['scoring'])

        # appending both accuracies to the list
        fold_scores.append((train_acc, val_acc))

        # appending the threshold to the list
        threshold_list.append(val_threshold)

        # printing the results of this fold
        print(f"> Fold {fold + 1}: Training accuracy: {train_acc} | Validation accuracy: {val_acc}")

        # adding this fold's positive-class probabilities for the test set to the running sum
        probas += clf.predict_proba(X_test_fold)[:, 1]

    # structuring the training and validation scores in a dataframe
    scores = pd.DataFrame(fold_scores, columns=['train_accuracy', 'val_accuracy'])

    # printing average scores for this run
    print(f"End of Run!\nTraining loss: Training accuracy: {scores['train_accuracy'].mean()} | Validation accuracy: {scores['val_accuracy'].mean()}")

    # returning the scores, the summed test probabilities and the per-fold thresholds
    return scores, probas, threshold_list

# Loading the data

In [None]:
# train data, read from the Kaggle input directory
train = pd.read_csv(filepath_or_buffer = '../input/tabular-playground-series-apr-2021/train.csv')
# (rows, columns) of the train data, shown as the cell output
train.shape

In [None]:
# test data, read from the Kaggle input directory
test = pd.read_csv(filepath_or_buffer = '../input/tabular-playground-series-apr-2021/test.csv')
# (rows, columns) of the test data, shown as the cell output
test.shape

In [None]:
# sample submission file; its 'Survived' column is overwritten with the final predictions below
submission = pd.read_csv(filepath_or_buffer = '../input/tabular-playground-series-apr-2021/sample_submission.csv')
# (rows, columns) of the submission file, shown as the cell output
submission.shape

In [None]:
# submitted predictions from my optimized logistic regression
# (used as labels for test rows when which_submission == 'lr')
lr_submission = pd.read_csv(filepath_or_buffer = '../input/lr-submission-tps202104/lr_submission.csv')
# (rows, columns) of the logistic regression submission, shown as the cell output
lr_submission.shape

In [None]:
# submitted predictions from a previous TabNet model
# (used as labels for test rows when which_submission == 'tabnet')
tabnet_submission = pd.read_csv(filepath_or_buffer = '../input/tabnetsubmissiontps202104/tabnet_submission-tps202104.csv')
# (rows, columns) of the TabNet submission, shown as the cell output
tabnet_submission.shape

# Merging the datasets

In [None]:
# putting the train on top of the test set in order to create some general features
# NOTE(review): no ignore_index is passed, so the row labels of both frames are kept
# as they are; later cells split the frame again by row position (train.shape[0])
df = pd.concat([train, test])
# (rows, columns) of the combined frame, shown as the cell output
df.shape

# Feature Engineering

General features.

In [None]:
## adding number of missing values in the rows
# the target column is left out of the count: it is missing for every test row
# after the concat, which would add 1 to the feature for test rows only
df['nMissing'] = df.drop(columns = ['Survived'], errors = 'ignore').isnull().sum(axis = 1)

# encoding the deck to which the cabin belongs (leading capital letter of the cabin)
df['Deck'] = df.Cabin.str.extract(pat = r'(^[A-Z])')
# filling missing values for deck with the placeholder 'U'
df['Deck'] = df.Deck.fillna('U')

# encoding whether the passenger travels alone
df['TravelsAlone'] = ((df.SibSp == 0) & (df.Parch == 0)).astype('int').astype('object')

# encoding whether the passenger travels accompanied by exactly one relative
df['TravelsTwo'] = (((df.SibSp == 1) & (df.Parch == 0)) | ((df.SibSp == 0) & (df.Parch == 1))).astype('int').astype('object')

# calculating the size of the family (relatives plus the passenger)
df['FamilySize'] = df.SibSp + df.Parch + 1

# adding the family name (the word right before the comma in the name)
df['FamilyName'] = df.Name.str.extract(pat=r'(\w+)(?=,)')

# creating flags for the other columns that will be imputed
df['AgeImputed'] = (df.Age.isnull()).astype('int').astype('object')
df['FareImputed'] = (df.Fare.isnull()).astype('int').astype('object')
df['EmbarkedImputed'] = (df.Embarked.isnull()).astype('int').astype('object')

# parsing Pclass to category
df['Pclass'] = df.Pclass.astype('object')

Ticket letters representation.

In [None]:
# normalising the spelling 'STON' to 'SOTON' in the ticket column
df['Ticket'] = df['Ticket'].str.replace(pat=r'\bSTON\b', repl='SOTON', regex=True)

# known spelling variants of the same ticket prefix, mapped to a single form
ticket_letter_aliases = {'C A': 'CA', 'W E P': 'WE P', 'SOTON O Q': 'SOTON OQ', 'S W PP': 'SW PP',
                         'S O C': 'SO C', 'C A SOTON': 'SOTON CA', 'S C PARIS': 'SC PARIS', 'P PP': 'PP',
                         'S C A': 'SC A', 'S O P P': 'S O P'}

# building the letters representation in one chain:
# all letter groups of the ticket -> '' for missing tickets -> groups joined by
# a space -> 'N' when there are no letters -> upper case -> variants unified
df['TicketLetters'] = (
    df['Ticket']
    .str.findall(pat = r'([A-Za-z]+)')
    .fillna('')
    .apply(lambda groups: ' '.join(groups))
    .apply(lambda joined: joined if joined != '' else 'N')
    .str.upper()
    .replace(to_replace = ticket_letter_aliases)
)

Ticket numbers representation.

In [None]:
# extracting the run of digits at the end of the ticket as a float
df['TicketNumber'] = df.Ticket.str.extract(pat=r'(?<=\s)?([0-9]+)$').astype('float')

# filling missing values in the ticket number with 0
df['TicketNumber'] = df.TicketNumber.fillna(0)

# number of digits of the ticket number
df['TicketDigits'] = df.TicketNumber.astype('int').astype('str').str.count(pat='[0-9]')

# coarsening the ticket number: dividing by (number of digits * 1000) and truncating to an integer
df['TicketNumber'] = (df.TicketNumber / (df.TicketDigits * 1000)).astype('int')

# scaling the ticket back, which leaves a multiple of (number of digits * 1000)
df['TicketNumber'] = df.TicketNumber * (df.TicketDigits * 1000)

# calculating the frequency (absolute count) of the coarsened ticket numbers
frequencies_tnb = df.TicketNumber.value_counts(normalize=False)

# mapping every row to the frequency of its ticket number
mask_TcktNmbFreq = df.TicketNumber.map(frequencies_tnb)

# replacing ticket numbers that occur fewer than 10 times by the fixed placeholder 999999
df['TicketNumber'] = df.TicketNumber.mask(mask_TcktNmbFreq < 10, 999999)

Cabin number representation.

In [None]:
# extracting the first run of digits of the cabin (missing cabins are treated as '0')
df['CabinNb'] = df.Cabin.fillna('0').str.extract(pat=r'([0-9]+)').astype('int')

# extracting the count of digits in the cabin number
df['CabinDigits'] = df.CabinNb.astype('str').str.count(pat='[0-9]').astype('int')

# coarsening the cabin number: dividing by (number of digits * 10) and truncating to an integer
df['CabinNb'] = (df.CabinNb / (df.CabinDigits * 10)).astype('int')

# scaling the cabin number back, which leaves a multiple of (number of digits * 10)
df['CabinNb'] = df.CabinNb * (df.CabinDigits * 10)

# calculating the frequency (absolute count) of the coarsened cabin numbers
frequencies_cbn = df.CabinNb.value_counts(normalize=False)

# mapping every row to the frequency of its cabin number
mask_CabinNbFreq = df.CabinNb.map(frequencies_cbn)

# replacing cabin numbers that occur fewer than 10 times by the fixed placeholder 99999
df['CabinNb'] = df.CabinNb.mask(mask_CabinNbFreq < 10, 99999)

# Encoding categorical columns

In [None]:
## numeric features, in the order in which they are placed in the dataframe
num_columns = [
    'Age',
    'Fare',
    'SibSp',
    'Parch',
    'FamilySize',
    'nMissing',
]

## categorical features, placed right after the numeric ones
cat_columns = [
    'Pclass',
    'Sex',
    'Embarked',
    'Deck',
    'TravelsAlone',
    'TravelsTwo',
    'TicketLetters',
    'TicketNumber',
    'CabinNb',
]

In [None]:
# creating a defaultdict that builds one LabelEncoder per column name on first access,
# so the fitted encoder of every column stays available afterwards
dict_label_encoding = defaultdict(LabelEncoder)

# applying the label encoder to each of the categorical columns
# (fitted on the combined train + test frame)
df[cat_columns] = df[cat_columns].apply(lambda x: dict_label_encoding[x.name].fit_transform(x))

# Splitting the data

In [None]:
# arranging the columns: numeric features first, then categorical ones, then the target;
# every other column is dropped
df = df[num_columns + cat_columns + ['Survived']]

In [None]:
# positions of the categorical features within the column order of the dataframe
cat_indexes = [position for position, name in enumerate(df.columns) if name in cat_columns]

In [None]:
# dimensions of each categorical feature: number of distinct values per column,
# in the order of cat_columns
cat_dimensions = df[cat_columns].nunique().tolist()

In [None]:
# defining the embedding dimensions as the rounded-up natural log of each cardinality
# (the builtin int replaces np.int, an alias that was removed in NumPy 1.24)
cat_embedding_dimensions = np.ceil(np.log(cat_dimensions)).astype(int)

# ensuring the embedding dimensions are at least 1 and at most 50
cat_embedding_dimensions = np.clip(cat_embedding_dimensions, a_min = 1, a_max = 50).tolist()

In [None]:
# separating the train and test data: the first train.shape[0] rows are the train set
train_df, test_df = df[:train.shape[0]], df[train.shape[0]:]

# separating inputs and outputs for the training data
X_train_df, y_train = train_df.drop(columns='Survived'), train_df.Survived

# parsing y_train to a numpy array of encoded labels
y_train = LabelEncoder().fit_transform(y_train)

# keeping only the inputs of the test data
X_test_df = test_df.drop(columns='Survived')

# parsing the test dataframe to a numpy array
X_test_array = X_test_df.to_numpy()

# separating the targets from the lr and tabnet submission
y_lr_submission = LabelEncoder().fit_transform(lr_submission.Survived)
y_tabnet_submission = LabelEncoder().fit_transform(tabnet_submission.Survived)

# Hyperparameter Optimization

In [None]:
# data and number of folds read by the optuna objective and by the final fit
X, y = X_train_df, y_train
kfolds = 5

In [None]:
# creating a study that maximizes the value returned by the objective
# NOTE(review): the objective in this file never reports intermediate values to the
# trial, so the median pruner presumably never stops a trial early - confirm
study = optuna.create_study(directions = ['maximize'], pruner = optuna.pruners.MedianPruner())

In [None]:
# running the study with n_trials = 100 and a timeout of 7 hours (given in seconds)
study.optimize(func = objective, n_trials = 100, timeout = 60 * 60 * 7)

# Exploring hyperparameter combinations

In [None]:
# plotting the objective value over the trials of the study
plot_optimization_history(study)

In [None]:
# plotting the estimated importance of every hyperparameter
plot_param_importances(study)

In [None]:
# plotting the evaluated hyperparameter combinations as parallel coordinates
plot_parallel_coordinate(study)

In [None]:
# study results as a dataframe, best objective value first
study.trials_dataframe().sort_values('value', ascending = False)

# Saving study results

In [None]:
# writing all trials of the study to a csv file without the row index
study.trials_dataframe().to_csv('study_trials.csv', index=False)

# Fitting the best model

In [None]:
# extracting the best hyperparameters: the flat parameter dict of the first best trial,
# keyed by the names given to trial.suggest_* in the objective
best_hyperparameters = study.best_trials[0].params
print(f'Best hyperparameter combination evaluated: {best_hyperparameters}')

In [None]:
# fitting the model on the best combination of hyperparameters; returns the fold scores,
# the test probabilities summed over the folds and the threshold of every fold
last_scores, probas, thrs = submit_stratifield_kfold(X, y, kfolds = kfolds, hyperparameters = best_hyperparameters, should_pretrain = best_hyperparameters['pretrain_tabnet'])

# Adding predictions to the submission file

In [None]:
# adding predictions to the submission dataframe: the summed probabilities are averaged
# over the folds and compared against the mean of the per-fold thresholds
submission['Survived'] = ((probas / kfolds) >= np.mean(thrs)).astype('int')

# Saving predictions

In [None]:
# writing the submission file without the row index
submission.to_csv('submission.csv', index=False)