# TPS Mar 2021 Rank Averaging and Stacking

In [None]:
import pandas as pd
import numpy as np
import random
import os

from scipy.stats import rankdata
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import BayesianRidge, Ridge

In [None]:
# Number of stratified CV folds used by the stacking meta-model.
n_folds = 10
# One stacking run per seed: the 22 consecutive integers 2000..2021.
seed_list = list(range(2000, 2022))

In [None]:
def set_seed(seed=42):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed`` and
            written (as a string) to the ``PYTHONHASHSEED`` environment
            variable. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, using the first entry of the seed list.
set_seed(seed=seed_list[0])

In [None]:
# Directory the competition CSVs (train / test / sample submission) are read from.
INPUT_PATH = Path("..") / "input" / "tabular-playground-series-mar-2021"

# Directories the saved base-model .npy prediction arrays are loaded from.
_PREPROCESSED_DIR = Path("..") / "input" / "tps-mar-2021-preprocessed-data" / "preprocessed-data"
TRAIN_PATH = _PREPROCESSED_DIR / "train"
TEST_PATH = _PREPROCESSED_DIR / "test"

In [None]:
# Load the competition's train, test and sample-submission CSVs from INPUT_PATH.
train_df, test_df, sub_df = (
    pd.read_csv(INPUT_PATH / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# One row per base model: (column name, train OOF file, test prediction file).
# Both lookup dicts below are derived from this single table, so they always
# share the same keys in the same order.
_BASE_MODEL_FILES = [
    ('trans_1', 'train_rgr_epoch2000_probas8_params0_batch512.npy', 'test_rgr_epoch2000_probas8_params0_batch512.npy'),
    ('trans_2', 'train_rgr_epoch2000_probas8_params1_batch512.npy', 'test_rgr_epoch2000_probas8_params1_batch512.npy'),
    ('trans_3', 'train_rgr_epoch2000_probas8_params2_batch512.npy', 'test_rgr_epoch2000_probas8_params2_batch512.npy'),
    ('trans_4', 'train_rgr_epoch2000_probas8_params3_batch512.npy', 'test_rgr_epoch2000_probas8_params3_batch512.npy'),
    ('trans_5', 'train_rgr_epoch2000_probas8_params4_batch512.npy', 'test_rgr_epoch2000_probas8_params4_batch512.npy'),
    ('trans_6', 'train_rgr_epoch2000_probas8_params5_batch512.npy', 'test_rgr_epoch2000_probas8_params5_batch512.npy'),
    ('trans_7', 'train_rgr_epoch2000_probas8_params6_batch512.npy', 'test_rgr_epoch2000_probas8_params6_batch512.npy'),
    ('lightgbm1', 'train_lgb.npy', 'test_lgb.npy'),
    ('lightgbm2', 'train_oof_lgbm_0.npy', 'test_preds_lgbm_0.npy'),
    ('lightgbm3', 'train_oof_lgbm_1.npy', 'test_preds_lgbm_1.npy'),
    ('xgboost', 'train_xgb.npy', 'test_xgb.npy'),
    ('catboost', 'train_cbt.npy', 'test_cbt.npy'),
    ('logistic_regression1', 'train_lr.npy', 'test_lr.npy'),
    ('logistic_regression2', 'train_oof_lr_0.npy', 'test_preds_lr_0.npy'),
    ('random_forest', 'train_rf.npy', 'test_rf.npy'),
    ('tabnet1', 'train_tabnet_0.npy', 'test_tabnet_0.npy'),
    ('tabnet2', 'train_tabnet_1.npy', 'test_tabnet_1.npy'),
    ('histgradient1', 'train_oof_hgb_0.npy', 'test_preds_hgb_0.npy'),
    ('histgradient2', 'train_oof_hgb_1.npy', 'test_preds_hgb_1.npy'),
    ('keras1', 'train_keras_0.npy', 'test_keras_0.npy'),
    ('keras2', 'train_keras_1.npy', 'test_keras_1.npy'),
]

# Model name -> file with that model's out-of-fold predictions on the train set.
train_oof_dict = {model: oof_file for model, oof_file, _ in _BASE_MODEL_FILES}

# Model name -> file with that model's predictions on the test set.
test_pred_dict = {model: pred_file for model, _, pred_file in _BASE_MODEL_FILES}


In [None]:
# Assemble one column per base model, in dict order.
# Each frame is built in a single constructor call instead of growing it with
# pd.concat inside a loop, which re-copies every existing column per iteration.
# NOTE: arrays of unequal length now raise instead of being padded with NaN.

# Out-of-fold predictions on the training rows (inputs of the meta-model).
oof_df = pd.DataFrame({
    name: np.load(TRAIN_PATH / train_oof)
    for name, train_oof in train_oof_dict.items()
})

# Matching predictions on the test rows, with the same column names.
preds_df = pd.DataFrame({
    name: np.load(TEST_PATH / test_pred)
    for name, test_pred in test_pred_dict.items()
})

## Rank Averaging

In [None]:
# Rank averaging: rank every model's scores within its own column, average
# those ranks across models for each row, then divide by the number of rows.
oof_rank = rankdata(oof_df, axis=0).mean(axis=1) / len(oof_df)
preds_rank = rankdata(preds_df, axis=0).mean(axis=1) / len(preds_df)

In [None]:
# AUC of the rank-averaged out-of-fold blend against the training target.
auc = roc_auc_score(
    y_true=train_df['target'],
    y_score=rankdata(oof_rank),
    average='micro',
)
print(f'auc {auc:.6f}')

In [None]:
# Save the rank-averaged test predictions; the file name carries the AUC
# currently held in `auc`.
rank_avg_csv = f"submission_rank_averaging_cv{auc:.6f}.csv"
sub_df['target'] = preds_rank
sub_df.to_csv(rank_avg_csv, index=False)
display(sub_df.head(), sub_df.tail())

## Stacking

In [None]:
# Stacking: fit a Ridge meta-model on the base models' out-of-fold predictions
# using stratified K-fold CV, repeated once for every seed in `seed_list`.
oof_list = []
preds_list = []
y_true = train_df['target']
y_values = y_true.to_numpy()

for seed in seed_list:
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    oof = np.zeros(len(oof_df))  # meta-model OOF predictions for this seed
    preds = 0                    # running average of per-fold test predictions

    for fold, (train_idx, valid_idx) in enumerate(skf.split(oof_df, y_true)):
        model = Ridge(random_state=seed)
        model.fit(oof_df.iloc[train_idx], y_values[train_idx])

        # Score the held-out rows and add this fold's share of the test average.
        oof[valid_idx] = model.predict(oof_df.iloc[valid_idx])
        preds += model.predict(preds_df) / n_folds

    auc = roc_auc_score(y_true, oof)
    print(f"SEED {seed}: AUC {auc:.6f}")

    oof_list.append(oof)
    preds_list.append(preds)

# Blend the seeds by ranking each seed's OOF vector across rows and averaging
# those ranks over seeds.
seed_avg_rank = np.mean(rankdata(oof_list, axis=1), axis=0) / oof_df.shape[0]
seed_avg_auc = roc_auc_score(y_true, seed_avg_rank)
print(f"SEED AVERAGING AUC {seed_avg_auc:.6f}")

In [None]:
# Rank-average the per-seed stacked test predictions into the submission column.
sub_df['target'] = np.mean(rankdata(preds_list, axis=1), axis=0) / preds_df.shape[0]

# Tag the file with the CV AUC of the seed-averaged OOF blend, i.e. the score
# of the ensemble actually being submitted. The `auc` variable left over from
# the seed loop holds only the last seed's AUC, so it is not used here.
stacking_auc = roc_auc_score(
    train_df['target'],
    np.mean(rankdata(oof_list, axis=1), axis=0) / oof_df.shape[0],
)
sub_df.to_csv(f"submission_stacking_cv{stacking_auc:.6f}.csv", index=False)
display(sub_df.head(), sub_df.tail())