# Acknowledgements

- Built upon Juan Pablo Margni's wonderful starter: https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-optuna

# Imports

In [None]:
!pip install -q pytorch-tabnet

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
import optuna
import joblib
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Read the competition's train, test and sample-submission tables in one pass.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

In [None]:
def label_encoder(c):
    """Return integer codes for the values in ``c``.

    A new ``LabelEncoder`` is fitted on ``c`` and used to transform that same
    data. The fitted encoder is not returned, so the mapping cannot be reused
    on other data.
    """
    encoder = LabelEncoder()
    encoder.fit(c)
    return encoder.transform(c)

In [None]:
def preprocess(df):
    """Turn the raw passenger table into an all-numeric feature frame.

    Steps:
      * ``Age``: missing values are replaced by the mean age of the row's ``Pclass``.
      * ``Cabin``: reduced to its first character; missing becomes ``'X'``.
      * ``Ticket``: reduced to its first whitespace-separated token when the
        ticket has more than one token, otherwise ``'X'``.
      * ``Fare``: missing values are replaced by the column mean.
      * ``Embarked``: missing becomes ``'X'``.
      * ``Name``: reduced to the text before the first comma.

    ``Name`` and ``Ticket`` are then label-encoded, the columns listed in
    ``onehot_cols`` are passed through ``pd.get_dummies``, and the numeric
    columns are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``label_cols``, ``onehot_cols`` and
        ``numerical_cols`` below (including ``Survived``). The frame is copied
        first, so the caller's object is left untouched.

    Returns
    -------
    pandas.DataFrame
        Numeric columns, then label-encoded columns, then indicator columns,
        on the same index as ``df``.
    """
    label_cols = ['Name', 'Ticket']
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked']
    numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'Survived']

    def _ticket_prefix(ticket):
        # Split once; single-token tickets carry no prefix and collapse to 'X'.
        tokens = str(ticket).split()
        return tokens[0] if len(tokens) > 1 else 'X'

    # Work on a copy so the column assignments below never leak into the
    # DataFrame the caller passed in.
    df = df.copy()

    class_mean_age = df.groupby('Pclass')['Age'].mean()
    df['Age'] = df['Age'].fillna(df['Pclass'].map(class_mean_age))
    df['Cabin'] = df['Cabin'].fillna('X').map(lambda x: x[0].strip())
    df['Ticket'] = df['Ticket'].fillna('X').map(_ticket_prefix)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Embarked'] = df['Embarked'].fillna('X')
    df['Name'] = df['Name'].map(lambda x: x.split(',')[0])

    # Pin the indicator dtype: newer pandas defaults to bool, which turns the
    # concatenated frame's ``.values`` into an object array.
    # NOTE(review): get_dummies only expands object/category columns by
    # default, so if Pclass is numeric it passes through as one column rather
    # than being one-hot encoded - confirm that is intended.
    onehot_encoded_df = pd.get_dummies(df[onehot_cols], dtype=np.uint8)
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df, label_encoded_df, onehot_encoded_df], axis=1)

In [None]:
# Stack train and test rows so a single preprocessing pass (imputation and
# encoders) is applied to both.
all_df = preprocess(pd.concat([train_df, test_df]))

In [None]:
# Undo the train/test stacking: the first len(train_df) rows are the training
# rows, everything after them is the test set. The target is separated from
# the training features and removed from the test features.
X = all_df.iloc[:len(train_df)].drop(columns=['Survived'])
y = all_df['Survived'].iloc[:len(train_df)]
X_ = all_df.iloc[len(train_df):].drop(columns=['Survived'])

# Inference with Best Params

In [None]:
# 16-way K-fold splitter, plus zero-initialised buffers: one slot per training
# row for out-of-fold predictions and one per test row for averaged predictions.
folds = KFold(n_splits=16)
oof = np.zeros(len(X))
predictions = np.zeros(len(X_))

In [None]:
# Hyper-parameters used for every fold's TabNet model: the n_* entries and the
# seed configure the classifier, the batch sizes are passed to ``fit``.
BEST_PARAMS = dict(
    n_a=16,
    n_d=16,
    n_steps=3,
    n_independent=2,
    batch_size=1024,
    virtual_batch_size=128,
    seed=42,
)

In [None]:
# Convert the frames to plain float arrays once, outside the fold loop. The
# explicit dtype guards against an object-dtype matrix when the feature frame
# mixes bool indicator columns with numeric ones.
train_features = X.to_numpy(dtype=float)
train_target = y.to_numpy()
test_features = X_.to_numpy(dtype=float)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_features, train_target)):
    print(f"Fold {fold_}")

    # KFold yields positions, so features and target are both indexed
    # positionally on the arrays (never by pandas label).
    X_train, y_train = train_features[trn_idx], train_target[trn_idx]
    X_test, y_test = train_features[val_idx], train_target[val_idx]

    clf = TabNetClassifier(
        n_a=BEST_PARAMS['n_a'],
        n_d=BEST_PARAMS['n_d'],
        n_steps=BEST_PARAMS['n_steps'],
        n_independent=BEST_PARAMS['n_independent'],
        seed=BEST_PARAMS['seed'],
    )
    clf.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_name=['train', 'valid'],
        eval_metric=['accuracy'],
        max_epochs=50,
        patience=10,
        batch_size=BEST_PARAMS['batch_size'],
        virtual_batch_size=BEST_PARAMS['virtual_batch_size'],
    )

    # Record this fold's held-out predictions so the run yields a
    # cross-validated score for the whole training set.
    oof[val_idx] = clf.predict(X_test)

    # Each fold contributes 1/n_splits of its predicted label, so the final
    # value is the share of folds that predicted the positive class.
    predictions += clf.predict(test_features) / folds.n_splits

print("OOF accuracy: {:.5f}".format(accuracy_score(train_target, oof)))

In [None]:
# Threshold the fold-averaged predictions at 0.5 with a vectorised comparison
# (no per-element Python lambda needed).
prediction_binarized = (predictions >= .5).astype(int)

# Start from the sample submission's id column, attach the predicted labels,
# and write the file in the expected two-column layout.
submission = sample_submission.drop(columns=['Survived'])
submission['Survived'] = prediction_binarized
submission.columns = ['PassengerId', 'Survived']
submission.to_csv('submission.csv', index=False)