## Installation

In [None]:
# Install pytorch-tabnet (provides the `pytorch_tabnet` package imported later in this notebook).
!pip install pytorch-tabnet

## Import Libraries

In [None]:
# Flags used to switch behaviour depending on the runtime environment.
import sys
import os

# Each flag is True when the corresponding module name is present in sys.modules.
IN_COLAB = 'google.colab' in sys.modules
IN_KAGGLE = 'kaggle_web_client' in sys.modules
# Neither hosted-runtime flag set -> treat the session as a local machine.
LOCAL = not IN_KAGGLE and not IN_COLAB
print(f'IN_COLAB:{IN_COLAB}, IN_KAGGLE:{IN_KAGGLE}, LOCAL:{LOCAL}')

In [None]:
# Silence the warning categories that would otherwise clutter the notebook output.
import warnings

# Registered in the same order as before: Deprecation, Future, then User warnings.
for _category in (DeprecationWarning, FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_category)

# Python Libraries
import os
import math
import random
import glob
import pickle
import gc
from pathlib import Path

# Third party
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn import preprocessing
from sklearn.model_selection import GroupKFold,StratifiedKFold
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error,roc_auc_score,accuracy_score

import category_encoders as ce
import optuna

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor, TabNetClassifier


def set_seed(seed: int=29):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus every CUDA device when CUDA is available), and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Seed value applied to every generator.
    """
    print(f'set_seed{seed}')
    random.seed(seed)
    np.random.seed(seed)
    # Hash randomization of the running interpreter is fixed at start-up, so
    # this only influences processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    # PyTorch keeps its own RNG state; without this, torch-side randomness
    # is not controlled by this helper.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed()

In [None]:
class CFG:
    """Notebook-wide configuration: competition metadata, TabNet parameters and feature lists."""
    competition = 'spaceship_titanic'
    # Experiment name; interpolated into OUTPUT_DIR for Colab / local runs.
    exp_name = 'tabnet_pretrain'
    seed = 29
    target_col = 'Transported'
    n_fold = 5
    # Folds that are actually trained; the training loop skips any fold not listed here.
    trn_fold = [0,1,2,3,4]
    
    # Training
    # ==============================================================
    # 'model' is unpacked into TabNetPretrainer(...), 'fit' into its .fit(...).
    pretrain_param = {
        'model':{
            'n_d': 8,
            'n_a': 8,
            'n_steps': 3,
            'gamma': 1.3,
            'n_independent': 2,
            'n_shared': 2,
            'seed': seed,
            'lambda_sparse': 1e-3,
            'optimizer_fn':torch.optim.Adam,
            'optimizer_params':{'lr':2e-2},
            'mask_type': 'entmax',
            'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
            'scheduler_params': {
                'mode':'min',
                'patience':5,
                'min_lr': 1e-5,
                'factor': 0.9,
            },
            'verbose': 10
        },
        'fit':{
            'max_epochs': 200,
            'patience': 20,
            'batch_size': 256,
            'virtual_batch_size': 128,
            'num_workers': 1,
            'drop_last': True
        }
    }
    # 'model' is unpacked into TabNetClassifier(...), 'fit' into its .fit(...).
    train_params = {
        'model':{
            'n_d': 8,
            'n_a': 8,
            'n_steps': 3,
            'gamma': 1.3,
            'n_independent': 2,
            'n_shared': 2,
            'seed': seed,
            'lambda_sparse': 1e-3,
            'optimizer_fn': torch.optim.Adam,
            'optimizer_params': {'lr': 2e-2, 'weight_decay': 1e-5},
            'mask_type': 'entmax',
            'scheduler_fn': torch.optim.lr_scheduler.OneCycleLR,
            'scheduler_params': {
                'max_lr': 0.05,
                # Placeholder: the training cell overwrites this before fitting.
                'steps_per_epoch': 5,
                'epochs': 200,
                'is_batch_level': True,
            },
            'verbose': 10
        },
        'fit':{
            'eval_metric': ['accuracy',"auc"],
            'max_epochs': 200,
            'patience': 20,
            'batch_size': 256,
            'virtual_batch_size': 128,
            'num_workers': 0,
            'drop_last': False
        }
    }
    
    # Feature
    # ==============================================================
    # Per-column replacement values handed to DataFrame.fillna.
    fillna_value= {
        'Age': -1,
        'RoomService': -1,
        'FoodCourt': -1,
        'ShoppingMall': -1,
        'Spa': -1,
        'VRDeck': -1,
    }
    # Column lists; populated later in the notebook.
    cate_cols = []
    cont_cols = []
    feature_cols = []

In [None]:
# Resolve input/output locations for the detected runtime, then load the competition CSVs.
# The experiment name is looked up defensively: a config without `exp_name`
# falls back to the competition name instead of raising AttributeError.
_exp_name = getattr(CFG, 'exp_name', CFG.competition)
if IN_KAGGLE:
    INPUT_DIR = Path('../input/spaceship-titanic')
    OUTPUT_DIR = './'
elif IN_COLAB:
    INPUT_DIR = Path('/content/input/')
    OUTPUT_DIR = f'/content/drive/MyDrive/kaggle/spaceship-titanic/{_exp_name}/'
else:
    # LOCAL: neither Kaggle nor Colab was detected.
    INPUT_DIR = Path("F:/Kaggle/spaceship-titanic/data/input/")
    OUTPUT_DIR = f'F:/Kaggle/spaceship-titanic/data/output/{_exp_name}/'
    
df_train = pd.read_csv(INPUT_DIR / "train.csv")
df_test = pd.read_csv(INPUT_DIR / "test.csv")
df_sub = pd.read_csv(INPUT_DIR / "sample_submission.csv")
# Separate copy of the raw training frame that later receives the out-of-fold predictions.
df_oof = df_train.copy()
display(df_train.head())
display(df_test.head())
display(df_sub.head())

In [None]:
# Column groups used by the rest of the notebook.
CFG.cate_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
CFG.cont_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Copy the list: feature_cols is extended later, and sharing one list object
# with cont_cols would silently add the encoded columns to cont_cols as well.
CFG.feature_cols = list(CFG.cont_cols)  # + CFG.cate_cols

## Feature Engineering

In [None]:
# Replace missing values with the per-column defaults from CFG.fillna_value.
df_train, df_test = (
    frame.fillna(CFG.fillna_value) for frame in (df_train, df_test)
)
# Sanity check: remaining null counts, then summary statistics, for the continuous columns.
_cont_view = df_train[CFG.cont_cols]
print(_cont_view.isnull().sum())
_cont_view.describe()

In [None]:
# Category encoding (ordinal).
# The OE_* columns are added to both frames but are not registered in CFG.feature_cols.
_OE_COLS = ['HomePlanet', 'CryoSleep', 'Destination','VIP']
OE_COLS = ['OE_' + col for col in _OE_COLS]
ce_oe = ce.OrdinalEncoder(cols=_OE_COLS,handle_missing='return_nan')
df_train[OE_COLS] = ce_oe.fit_transform(df_train[_OE_COLS])
df_test[OE_COLS] = ce_oe.transform(df_test[_OE_COLS])
# CFG.feature_cols += OE_COLS 
print(OE_COLS)

# One-hot encoding: fitted on the training frame, then applied to the test frame.
_OHE_COLS = ['HomePlanet', 'CryoSleep', 'Destination','VIP']
ce_ohe = ce.OneHotEncoder(cols=_OHE_COLS, handle_unknown='impute')
_df_ohe = ce_ohe.fit_transform(df_train[_OHE_COLS])
OHE_COLS = _df_ohe.columns.to_list()
df_train[OHE_COLS] = _df_ohe[OHE_COLS]
df_test[OHE_COLS] = ce_ohe.transform(df_test[_OHE_COLS])
# Rebind to a new list instead of extending in place: this leaves any list that
# shares the old object untouched, and skipping names that are already present
# keeps the cell safe to re-run without duplicating feature columns.
CFG.feature_cols = CFG.feature_cols + [col for col in OHE_COLS if col not in CFG.feature_cols]
print(OHE_COLS)

## CV Split

In [None]:
# Assign every training row to one of CFG.n_fold stratified folds.
print(df_train.Transported.value_counts())
df_train["fold"] = -1

# StratifiedKFold on the target column.
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
_fold_col = df_train.columns.get_loc('fold')
for n, (_train_index, val_index) in enumerate(Fold.split(df_train, df_train[CFG.target_col])):
    # split() yields positional row indices, so write by position; label-based
    # .loc would only hit the right rows when the index is a default RangeIndex.
    df_train.iloc[val_index, _fold_col] = int(n)

df_train['fold'] = df_train['fold'].astype(int)
# Mirror the fold assignment onto the out-of-fold frame (aligned on the index).
df_oof['fold'] = df_train['fold']
print(df_train.groupby(['fold', CFG.target_col]).size())

## Function

In [None]:
# ==============================
# Pretrain
# ==============================
# Fold 0 is held out as the evaluation set; all remaining folds form the training data.
_is_holdout = df_train['fold'] == 0
_pretrain_X = df_train.loc[~_is_holdout, CFG.feature_cols].to_numpy()
_holdout_X = df_train.loc[_is_holdout, CFG.feature_cols].to_numpy()

# Constructor kwargs and fit kwargs both come from CFG.pretrain_param.
unsupervised_model = TabNetPretrainer(**CFG.pretrain_param["model"])
unsupervised_model.fit(
    X_train=_pretrain_X,
    eval_set=[_holdout_X],
    **CFG.pretrain_param["fit"],
)

In [None]:
def fit_tabnet(cfg, X_train, y_train, X_valid, y_valid, unsupervised_model=None):
    """Train one TabNetClassifier and score it on the validation split.

    Args:
        cfg: Config object exposing ``train_params`` with a ``"model"`` entry
            (constructor kwargs) and a ``"fit"`` entry (fit kwargs).
        X_train: Training features, passed to ``TabNetClassifier.fit``.
        y_train: Training labels, passed to ``TabNetClassifier.fit``.
        X_valid: Validation features; used as the single eval set ("valid").
        y_valid: Validation labels.
        unsupervised_model: Optional pretrained model forwarded as
            ``from_unsupervised``.

    Returns:
        Tuple ``(oof_pred, clf, score)``: column 1 of ``predict_proba`` on
        ``X_valid``, the fitted classifier, and the ROC AUC of those scores.
    """
    import copy

    # Hand the library private copies of the config dicts. pytorch-tabnet pops
    # `is_batch_level` out of the scheduler_params dict it is given while
    # setting up callbacks; with the shared CFG dict, every call after the
    # first would lose that key and fall back to epoch-level scheduling.
    model_params = copy.deepcopy(cfg.train_params["model"])
    fit_params = copy.deepcopy(cfg.train_params["fit"])

    clf = TabNetClassifier(**model_params)
    clf.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        eval_name=["valid"],
        from_unsupervised=unsupervised_model,
        **fit_params,
    )
    oof_pred = clf.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, oof_pred)
    return oof_pred, clf, score

## Training

In [None]:
# Train one model per fold listed in CFG.trn_fold and collect out-of-fold predictions.
_sched_params = CFG.train_params["model"]["scheduler_params"]
_batch_size = CFG.train_params["fit"]["batch_size"]

models = []
# Create the OOF column up front so skipped folds stay NaN and the per-fold
# assignment below always targets an existing column.
df_oof['pred'] = np.nan
for fold in tqdm(range(CFG.n_fold)):
    if fold not in CFG.trn_fold:
        continue
    print(f"{'='*38} Fold: {fold} {'='*38}")

    is_valid = df_train['fold'] == fold
    # set params: number of batches per epoch for this fold's training rows
    # (previously derived from fold 0 only and reused for every fold).
    _sched_params["steps_per_epoch"] = int(math.ceil((~is_valid).sum() / _batch_size))

    oof_pred, model, score = fit_tabnet(
        CFG,
        df_train.loc[~is_valid, CFG.feature_cols].to_numpy(),
        df_train.loc[~is_valid, CFG.target_col].to_numpy(),
        df_train.loc[is_valid, CFG.feature_cols].to_numpy(),
        df_train.loc[is_valid, CFG.target_col].to_numpy(),
        unsupervised_model,
    )
    print(f'fold:{fold} roc_auc:{score}')
    # oof
    df_oof.loc[is_valid, 'pred'] = oof_pred
    models.append(model)

In [None]:
# Histogram of the out-of-fold predictions for rows whose Transported value is True.
_transported_rows = df_oof['Transported'] == True
df_oof.loc[_transported_rows, 'pred'].hist()

In [None]:
# Out-of-fold metrics: ROC AUC on the raw scores, accuracy after thresholding at 0.5.
_oof_auc = roc_auc_score(df_oof['Transported'], df_oof['pred'])
print(f'roc_auc:{_oof_auc}')
df_oof['pred_bool'] = df_oof['pred'] > 0.5
_oof_acc = accuracy_score(df_oof['Transported'], df_oof['pred_bool'])
print(f'accuracy:{_oof_acc}')

In [None]:
# Average column 1 of predict_proba over every trained fold model.
_X_test = df_test[CFG.feature_cols].to_numpy()
y_preds = [fold_model.predict_proba(_X_test)[:, 1] for fold_model in models]
pred = np.mean(y_preds, axis=0)

In [None]:
# Threshold the averaged predictions at 0.5 to get the boolean target column,
# then write the submission file into the current working directory.
df_sub['Transported'] = pred > 0.5
df_sub.to_csv('submission.csv', index=False)

In [None]:
# Show the feature importances of `model`, i.e. whichever model that name was
# last bound to by the loops above — not an average over all entries of `models`.
model.feature_importances_

## reference

- https://www.kaggle.com/code/sinchir0/selfsupervisedtabnet-titanic-comparing-lgbm-nn

## To-Do

- wandb対応
- 正規化が必要か調査
- GPUで動かせるようにする
- カテゴリ変数の扱い

cat_idxs=cat_idxs, # comment out when Unsupervised
                         cat_dims=cat_dims, # comment out when Unsupervised