# TPS Mar 2021 Stacking

* version2: late submission
* version1: 3rd solution

## Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import os

from scipy.stats import rankdata
from pathlib import Path

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import Ridge

import matplotlib.pyplot as plt
import seaborn as sns

## Define parameters

In [None]:
# Number of stratified CV folds used when fitting the stacking model.
n_folds = 10
# One full stacking run is performed per seed (2000..2021 inclusive, 22 runs);
# the first entry is also used to seed the global RNGs.
seed_list = list(range(2000, 2022))

In [None]:
def set_seed(seed=42):
    """Seed the process-wide random number generators.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both NumPy's global RNG and the standard-library ``random`` module
    with the same value.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    
# Seed the global RNGs once, using the first entry of seed_list.
set_seed(seed_list[0])

## Load data

In [None]:
# Directories holding the saved per-model prediction arrays (.npy):
# TRAIN_PATH for predictions on the training rows, TEST_PATH for the test rows.
TRAIN_PATH, TEST_PATH = (
    Path("../input/tps-mar-2021/train"),
    Path("../input/tps-mar-2021/test"),
)

# Directory with the competition CSVs (train, test, sample submission).
INPUT_PATH = Path("../input/tabular-playground-series-mar-2021")

In [None]:
# Read the three competition tables from INPUT_PATH, in the same order as
# before: training data, test data, then the submission template.
train_df, test_df, sub_df = (
    pd.read_csv(INPUT_PATH / csv_name)
    for csv_name in ("train.csv", "test.csv", 'sample_submission.csv')
)

In [None]:
# Split the feature columns by naming convention: any column whose name
# contains 'cat' is categorical, any containing 'cont' is continuous.
cat_features = [name for name in train_df.columns if 'cat' in name]
cont_features = [name for name in train_df.columns if 'cont' in name]
all_features = cat_features + cont_features

# Remember where the training rows end so the stacked frame can be split back.
n_train = train_df.shape[0]
all_df = pd.concat([train_df, test_df]).reset_index(drop=True)

# Encode each categorical column on train+test together so both parts share
# one integer mapping per column. The single encoder is refit for every column.
le = LabelEncoder()
for col in cat_features:
    all_df[col] = le.fit_transform(all_df[col])

# Restore the original train/test partition (test index restarts at 0).
train_df = all_df.iloc[:n_train].copy()
test_df = all_df.iloc[n_train:].reset_index(drop=True).copy()

In [None]:
# Model label -> file name of that model's saved predictions for the training
# rows (out-of-fold, per the `oof` naming). Each file is loaded from TRAIN_PATH
# and becomes one column of oof_df, named by the label.
# Keep the labels (and their order) in sync with test_pred_dict so that the
# train and test meta-feature columns line up.
train_oof_dict = {
    'trans_rgr_1': 'train_rgr_epoch2000_probas8_params0_batch512.npy',
    'trans_rgr_2': 'train_rgr_epoch2000_probas8_params1_batch512.npy',
    'trans_rgr_3': 'train_rgr_epoch2000_probas8_params2_batch512.npy',
    'trans_rgr_4': 'train_rgr_epoch2000_probas8_params3_batch512.npy',
    'trans_rgr_5': 'train_rgr_epoch2000_probas8_params4_batch512.npy',
    'trans_rgr_6': 'train_rgr_epoch2000_probas8_params5_batch512.npy',
    'trans_rgr_7': 'train_rgr_epoch2000_probas8_params6_batch512.npy',
    'trans_rgr_15': 'train_rgr_epoch2000_probas8_params14_batch512.npy',
    'trans_rgr_19': 'train_rgr_epoch2000_probas8_params18_batch512.npy',
    'trans_rgr_23': 'train_rgr_epoch2000_probas8_params22_batch512.npy',
    'trans_rgr_24': 'train_rgr_epoch2000_probas8_params23_batch512.npy',
    'trans_rgr_27': 'train_rgr_epoch1000_probas8_params26_batch512.npy',
    'trans_rdg_1': 'train_rdg_epoch2000_probas8_params0_batch512.npy',
    'trans_rdg_2': 'train_rdg_epoch2000_probas8_params1_batch512.npy',
    'trans_rdg_3': 'train_rdg_epoch2000_probas8_params2_batch512.npy',
    'trans_rdg_4': 'train_rdg_epoch2000_probas8_params3_batch512.npy',
    'trans_rdg_5': 'train_rdg_epoch2000_probas8_params4_batch512.npy',
    'trans_rdg_6': 'train_rdg_epoch2000_probas8_params5_batch512.npy',
    'trans_rdg_7': 'train_rdg_epoch2000_probas8_params6_batch512.npy',
    'trans_rdg_15': 'train_rdg_epoch2000_probas8_params14_batch512.npy',
    'trans_rdg_19': 'train_rdg_epoch2000_probas8_params18_batch512.npy',
    'trans_rdg_23': 'train_rdg_epoch2000_probas8_params22_batch512.npy',
    'trans_rdg_24': 'train_rdg_epoch2000_probas8_params23_batch512.npy',
    'trans_rdg_27': 'train_rdg_epoch1000_probas8_params26_batch512.npy',
    'trans_bayrdg_1': 'train_bayrdg_epoch2000_probas8_params0_batch512.npy',
    'trans_bayrdg_2': 'train_bayrdg_epoch2000_probas8_params1_batch512.npy',
    'trans_hgb_2': 'train_hgb_epoch2000_probas8_params1_batch512.npy',
    'lightgbm1': 'train_oof_lgbm_0.npy',
    'lightgbm2': 'train_oof_lgbm_1.npy',
    'lightgbm3': 'train_oof_lgbm_2.npy',
    'lightgbm4': 'train_oof_lgbm_3.npy',
    'lightgbm5': 'train_oof_lgbm_4.npy',
    'lightgbm6': 'train_oof_lgbm_5.npy',
    'xgboost1': 'train_oof_xgb_0.npy',
    'xgboost2': 'train_oof_xgb_1.npy',
    'catboost1': 'train_oof_cbt_0.npy',
    'catboost2': 'train_oof_cbt_1.npy',
    'logistic_regression1': 'train_oof_lr_0.npy',
    'logistic_regression2': 'train_oof_lr_1.npy',
    'random_forest1': 'train_oof_rf_0.npy',
    'tabnet1': 'train_oof_tabnet_0.npy',
    'tabnet2': 'train_oof_tabnet_1.npy',
    'histgradient1': 'train_oof_hgb_0.npy',
    'histgradient2': 'train_oof_hgb_1.npy',
    'keras1': 'train_oof_keras_0.npy',
    'keras2': 'train_oof_keras_1.npy',
    'keras3': 'train_oof_keras_2.npy',
    'keras4': 'train_oof_keras_3.npy',
    'dae1': 'train_oof_dae_0.npy',
    'dae2': 'train_oof_dae_1.npy',
    'dae3': 'train_oof_dae_2.npy',
    'dae4': 'train_oof_dae_3.npy',
    'dae5': 'train_oof_MLP_TPSMAR_FINAL.npy',
    'dtables1': 'train_oof_dtables_0.npy',
    'dtables2': 'train_oof_dtables_1.npy'
}

# Model label -> file name of that model's saved predictions for the test rows.
# Each file is loaded from TEST_PATH and becomes one column of preds_df, named
# by the label.
# Keep the labels (and their order) in sync with train_oof_dict so that the
# train and test meta-feature columns line up.
test_pred_dict = {
    'trans_rgr_1': 'test_rgr_epoch2000_probas8_params0_batch512.npy',
    'trans_rgr_2': 'test_rgr_epoch2000_probas8_params1_batch512.npy',
    'trans_rgr_3': 'test_rgr_epoch2000_probas8_params2_batch512.npy',
    'trans_rgr_4': 'test_rgr_epoch2000_probas8_params3_batch512.npy',
    'trans_rgr_5': 'test_rgr_epoch2000_probas8_params4_batch512.npy',
    'trans_rgr_6': 'test_rgr_epoch2000_probas8_params5_batch512.npy',
    'trans_rgr_7': 'test_rgr_epoch2000_probas8_params6_batch512.npy',
    'trans_rgr_15': 'test_rgr_epoch2000_probas8_params14_batch512.npy',
    'trans_rgr_19': 'test_rgr_epoch2000_probas8_params18_batch512.npy',
    'trans_rgr_23': 'test_rgr_epoch2000_probas8_params22_batch512.npy',
    'trans_rgr_24': 'test_rgr_epoch2000_probas8_params23_batch512.npy',
    'trans_rgr_27': 'test_rgr_epoch1000_probas8_params26_batch512.npy',
    'trans_rdg_1': 'test_rdg_epoch2000_probas8_params0_batch512.npy',
    'trans_rdg_2': 'test_rdg_epoch2000_probas8_params1_batch512.npy',
    'trans_rdg_3': 'test_rdg_epoch2000_probas8_params2_batch512.npy',
    'trans_rdg_4': 'test_rdg_epoch2000_probas8_params3_batch512.npy',
    'trans_rdg_5': 'test_rdg_epoch2000_probas8_params4_batch512.npy',
    'trans_rdg_6': 'test_rdg_epoch2000_probas8_params5_batch512.npy',
    'trans_rdg_7': 'test_rdg_epoch2000_probas8_params6_batch512.npy',
    'trans_rdg_15': 'test_rdg_epoch2000_probas8_params14_batch512.npy',
    'trans_rdg_19': 'test_rdg_epoch2000_probas8_params18_batch512.npy',
    'trans_rdg_23': 'test_rdg_epoch2000_probas8_params22_batch512.npy',
    'trans_rdg_24': 'test_rdg_epoch2000_probas8_params23_batch512.npy',
    'trans_rdg_27': 'test_rdg_epoch1000_probas8_params26_batch512.npy',
    'trans_bayrdg_1': 'test_bayrdg_epoch2000_probas8_params0_batch512.npy',
    'trans_bayrdg_2': 'test_bayrdg_epoch2000_probas8_params1_batch512.npy',
    'trans_hgb_2': 'test_hgb_epoch2000_probas8_params1_batch512.npy',
    'lightgbm1': 'test_preds_lgbm_0.npy',
    'lightgbm2': 'test_preds_lgbm_1.npy',
    'lightgbm3': 'test_preds_lgbm_2.npy',
    'lightgbm4': 'test_preds_lgbm_3.npy',
    'lightgbm5': 'test_preds_lgbm_4.npy',
    'lightgbm6': 'test_preds_lgbm_5.npy',
    'xgboost1': 'test_preds_xgb_0.npy',
    'xgboost2': 'test_preds_xgb_1.npy',
    'catboost1': 'test_preds_cbt_0.npy',
    'catboost2': 'test_preds_cbt_1.npy',
    'logistic_regression1': 'test_preds_lr_0.npy',
    'logistic_regression2': 'test_preds_lr_1.npy',
    'random_forest1': 'test_preds_rf_0.npy',
    'tabnet1': 'test_preds_tabnet_0.npy',
    'tabnet2': 'test_preds_tabnet_1.npy',
    'histgradient1': 'test_preds_hgb_0.npy',
    'histgradient2': 'test_preds_hgb_1.npy',
    'keras1': 'test_preds_keras_0.npy',
    'keras2': 'test_preds_keras_1.npy',
    'keras3': 'test_preds_keras_2.npy',
    'keras4': 'test_preds_keras_3.npy',
    'dae1': 'test_preds_dae_0.npy',
    'dae2': 'test_preds_dae_1.npy',
    'dae3': 'test_preds_dae_2.npy',
    'dae4': 'test_preds_dae_3.npy',
    'dae5': 'test_preds_MLP_TPSMAR_FINAL.npy',
    'dtables1': 'test_preds_dtables_0.npy',
    'dtables2': 'test_preds_dtables_1.npy'
}

In [None]:
# Assemble the level-1 prediction matrices, one column per base model
# (column name = model label, column order = dict order).
# Each frame is built with a single concat: growing a DataFrame with
# pd.concat inside a loop re-copies every previously added column on each
# iteration.
oof_df = pd.concat(
    [
        pd.Series(np.load(TRAIN_PATH / train_oof), name=name)
        for name, train_oof in train_oof_dict.items()
    ],
    axis=1,
)

preds_df = pd.concat(
    [
        pd.Series(np.load(TEST_PATH / test_pred), name=name)
        for name, test_pred in test_pred_dict.items()
    ],
    axis=1,
)

## Check data

In [None]:
# Bare expression: the notebook renders the frame of per-model training-row
# predictions as the cell output.
oof_df

In [None]:
# Bare expression: the notebook renders the frame of per-model test-row
# predictions as the cell output.
preds_df

## Check correlation

In [None]:
# Pairwise correlation between the base models' training-row predictions,
# drawn as an annotated heatmap (two decimals per cell).
oof_corr = oof_df.corr()
plt.figure(figsize=(32, 16))
sns.heatmap(oof_corr, annot=True, fmt='.2f')

In [None]:
# Pairwise correlation between the base models' test-row predictions,
# drawn as an annotated heatmap (two decimals per cell).
preds_corr = preds_df.corr()
plt.figure(figsize=(32, 16))
sns.heatmap(preds_corr, annot=True, fmt='.2f')

## Stacking

In [None]:
# Level-2 stacking: fit a Ridge meta-model on the base models' predictions.
# For every seed a fresh stratified K-fold split is drawn; each fold's model
# fills the held-out rows of `oof` and contributes 1/n_folds of the test
# prediction. Results of all seeds are kept for rank averaging.

# Convert once, outside the loops (these are loop-invariant).
X = oof_df.to_numpy()
y = train_df['target'].to_numpy()
# Select the test columns by the training column labels so the meta-features
# are aligned by name rather than by dict order (raises KeyError if a model is
# missing from preds_df). Predicting on a plain array also matches how the
# model is fitted, avoiding scikit-learn's feature-name mismatch warning.
X_test = preds_df[oof_df.columns].to_numpy()

oof_list = list()
preds_list = list()

for seed in seed_list:
    oof = np.zeros(X.shape[0])
    preds = np.zeros(X_test.shape[0])
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for train_idx, valid_idx in skf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        # NOTE(review): per the scikit-learn docs, Ridge's random_state only
        # matters for the 'sag'/'saga' solvers, so with the default solver the
        # seed likely affects results only through the fold split - confirm.
        model = Ridge(random_state=seed)
        model.fit(X_train, y_train)

        oof[valid_idx] = model.predict(X_valid)
        preds += model.predict(X_test) / n_folds

    auc = roc_auc_score(y, oof)
    print(f"SEED {seed}: AUC {auc:.6f}")

    oof_list.append(oof)
    preds_list.append(preds)

# Seed averaging: rank each seed's OOF predictions across rows (axis=1 of the
# seeds x rows array), average the ranks over seeds, and scale by the row count.
auc = roc_auc_score(y, np.mean(rankdata(oof_list, axis=1), axis=0) / X.shape[0])
print(f"SEED AVERAGING AUC {auc:.6f}")

In [None]:
# Final prediction: rank each seed's test predictions across rows, average the
# ranks over seeds, and scale by the number of test rows.
test_ranks = rankdata(preds_list, axis=1)
sub_df['target'] = np.mean(test_ranks, axis=0) / preds_df.shape[0]

# Plain string literal: the file name has no placeholders, so no f-string.
sub_df.to_csv("submission.csv", index=False)
display(sub_df.head(), sub_df.tail())