In [None]:
!pip uninstall -y lightgbm
!apt-get install -y libboost-all-dev
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
%%bash
# Configure and compile LightGBM with USE_GPU=1 against the OpenCL library/headers shipped with CUDA.
# Abort on the first failing command: without this, a failed `cd` would let the
# following `rm`/`mkdir`/`make` run in whatever directory the shell happens to be in.
set -e
cd LightGBM
# -f / -p keep the cell re-runnable whether or not a build directory already exists.
rm -rf build
mkdir -p build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

In [None]:
!cd LightGBM/python-package/;python setup.py install --precompile

In [None]:
# Write an OpenCL ICD entry naming the NVIDIA OpenCL library
# (presumably so the OpenCL loader used by the GPU build can find the driver - confirm).
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Delete the source checkout; the package was installed by the previous cell.
!rm -r LightGBM

# Imports

In [None]:
# Aesthetics
import warnings
import sklearn.exceptions
# Newer pandas releases expose SettingWithCopyWarning publicly under pandas.errors;
# the private pandas.core.common path is kept only as a fallback for older releases.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    from pandas.core.common import SettingWithCopyWarning
# Silence the warning categories that would otherwise flood the cell outputs.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=sklearn.exceptions.UndefinedMetricWarning)

# General
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os
import random
import gc
import itertools
gc.enable()

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid")

# Machine Learning
## Utils
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_validate
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn import preprocessing
import category_encoders as ce
## Feature Selection
from sklearn.feature_selection import chi2, f_classif, f_regression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile, VarianceThreshold
## Classification Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
import lightgbm as lgb

# Metrics
from sklearn.metrics import roc_auc_score

# Deep Learning
import torch

# Fixing Seed
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator imported by this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy's global RNG
            and PyTorch (CPU generator and all CUDA devices).
    """
    # Exported for interpreters started after this point; the hash seed of the
    # already-running kernel cannot be changed from inside the process.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported by this notebook but was previously left unseeded.
    torch.manual_seed(seed)
    # Safe without a GPU: the call is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

seed_everything()

# Device selection: record whether CUDA is available; the model cells
# branch on this flag to choose GPU or CPU parameter sets.
GPU = torch.cuda.is_available()

print(f'GPU Available: {GPU}')

# Read Data

In [None]:
# Input locations: the competition directory and the training file read by the next cell.
data_dir = '../input/tabular-playground-series-sep-2021'
train_file_path = '../input/tpssep2021folds/train_10_fold_nulls.csv'

test_file_path, sample_sub_file_path = (
    os.path.join(data_dir, file_name)
    for file_name in ('test.csv', 'sample_solution.csv')
)

print(f'Train file: {train_file_path}')
print(f'Test file: {test_file_path}')
print(f'Sample Sub file: {sample_sub_file_path}')

In [None]:
# Load the training data, the test data and the sample submission.
train_df, test_df, sub_df = map(
    pd.read_csv,
    (train_file_path, test_file_path, sample_sub_file_path),
)

In [None]:
# Column bookkeeping: every training column that is not an id, fold label or target is a feature.
target = ['claim']
not_features = ['id', 'kfold', 'claim']
cols = train_df.columns.tolist()
features = list(filter(lambda col: col not in not_features, cols))

In [None]:
# From https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm
# Row-wise summary features: count of missing values and standard deviation across the raw features.
engineered = ['n_missing', 'std']

# Strip the engineered names first so that re-running this cell neither feeds them
# into their own computation nor appends them to `features` a second time.
raw_features = [feat for feat in features if feat not in engineered]

for frame in (train_df, test_df):
    frame['n_missing'] = frame[raw_features].isna().sum(axis=1)
    frame['std'] = frame[raw_features].std(axis=1)

features = raw_features + engineered

# L0 Models

## 1. XGBoost - 1

In [None]:
# Level-0 model 1 (XGBoost): collects out-of-fold predictions keyed by row id ('pred_1')
# and the fold-averaged test predictions, and writes both to CSV.
n_folds = train_df['kfold'].nunique()

# The GPU and CPU runs use different hyper-parameter sets; the choice does not depend on the fold.
if GPU:
    xgb_params = dict(
        n_estimators=18382,
        learning_rate=0.010019309106542829,
        reg_lambda=0.06489247352086885,
        reg_alpha=38.607681064365444,
        max_depth=5,
        subsample=0.7338887825604986,
        colsample_bytree=0.28819599059198336,
        booster='gbtree',
        random_state=RANDOM_SEED,
        verbosity=0,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
    )
else:
    xgb_params = dict(
        n_estimators=1000,
        learning_rate=0.011159116340385172,
        reg_lambda=0.07868674215849121,
        reg_alpha=26.263704018098185,
        max_depth=5,
        subsample=0.7612480691247493,
        colsample_bytree=0.2695349533886053,
        booster='gbtree',
        random_state=RANDOM_SEED,
        verbosity=0,
        n_jobs=4,
    )

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out].copy()
    valid = train_df[held_out].copy()
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    clf = XGBClassifier(**xgb_params)
    clf.fit(
        train[features].values,
        train[target].values,
        eval_set=[(valid[features].values, valid[target].values)],
        eval_metric='auc',
        early_stopping_rounds=300,
        verbose=False,
    )

    valid_pred = clf.predict_proba(valid[features].values)[:, 1]
    test_pred = clf.predict_proba(test[features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC-AUC: {round(roc, 4)}')

print('')
print(f'Average ROC-AUC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_1']
valid_pred_all.to_csv('train_pred_1.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_1': test_pred_all})
sub_2.to_csv('test_pred_1.csv', index=False)

## 2. LGBM - 1

In [None]:
# Level-0 model 2 (LightGBM, native API): out-of-fold predictions ('pred_2') and
# fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

# Settings shared by both devices; the device-specific entries are added below.
lgbm_params = {
    'learning_rate': 0.1,
    'early_stopping_round': 300,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'random_state': RANDOM_SEED,
}
if GPU:
    lgbm_params.update({
        'n_estimators': 10000,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    })
else:
    lgbm_params.update({
        'n_estimators': 1000,
        'n_jobs': 4,
    })

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out].copy()
    valid = train_df[held_out].copy()
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_valid = lgb.Dataset(valid[features], valid[target], reference=lgb_train)

    # Each fold receives its own copy of the parameter dict.
    clf = lgb.train(dict(lgbm_params), lgb_train, valid_sets=[lgb_valid], verbose_eval=False)

    valid_pred = clf.predict(valid[features].values)
    test_pred = clf.predict(test[features].values)
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC-AUC: {round(roc, 4)}')

print('')
print(f'Average ROC-AUC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_2']
valid_pred_all.to_csv('train_pred_2.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_2': test_pred_all})
sub_2.to_csv('test_pred_2.csv', index=False)

## 3. CatBoost - 1

In [None]:
# Level-0 model 3 (CatBoost, symmetric trees): out-of-fold predictions ('pred_3') and
# fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

# Settings shared by both devices; iteration budget, patience and task type differ per device.
cat_params = {
    'depth' : 5,
    'grow_policy' : 'SymmetricTree',
    'l2_leaf_reg' : 3.0,
    'random_strength' : 1.0,
    'learning_rate' : 0.1,
    'loss_function' : 'CrossEntropy',
    'eval_metric' : 'AUC',
    'use_best_model' : True,
    'verbose' : False,
}
if GPU:
    cat_params.update({
        'iterations' : 10000,
        'early_stopping_rounds' : 300,
        'task_type' : 'GPU',
    })
else:
    cat_params.update({
        'iterations' : 1000,
        'early_stopping_rounds' : 100,
        'task_type' : 'CPU',
        'thread_count' : 4,
    })

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out].copy()
    valid = train_df[held_out].copy()
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    clf = CatBoostClassifier(**cat_params)
    clf.fit(
        train[features].values,
        train[target].values,
        eval_set=[(valid[features].values, valid[target].values)],
    )

    valid_pred = clf.predict_proba(valid[features].values)[:, 1]
    test_pred = clf.predict_proba(test[features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC-AUC: {round(roc, 4)}')

print('')
print(f'Average ROC-AUC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_3']
valid_pred_all.to_csv('train_pred_3.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_3': test_pred_all})
sub_2.to_csv('test_pred_3.csv', index=False)

## 4. LGBM - 2

In [None]:
# Level-0 model 4 (LightGBM on mean-imputed, standardised features): out-of-fold
# predictions ('pred_4') and fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

# Settings shared by both devices; only the device/threading entries differ.
lgbm_params = {
    'n_estimators': 20000,
    'learning_rate': 5e-3,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
    'early_stopping_round': 200,
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'random_state': 2021,
}
if GPU:
    lgbm_params.update({
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    })
else:
    lgbm_params.update({'n_jobs': 4})

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    # Copies are required: the frames are modified by the preprocessing below.
    train = train_df[~held_out].copy()
    valid = train_df[held_out].copy()
    test = test_df.copy()

    # Impute with means learned on the training fold only. Previously the validation and
    # test frames were filled with their own means, which is inconsistent with the scaler
    # (fitted on the training fold) and lets validation statistics leak into preprocessing.
    train_means = train[features].mean()
    for part in (train, valid, test):
        part[features] = part[features].fillna(train_means)

    # Standardise with statistics from the training fold.
    scaler = preprocessing.StandardScaler()
    train[features] = scaler.fit_transform(train[features])
    valid[features] = scaler.transform(valid[features])
    test[features] = scaler.transform(test[features])

    valid_ids = valid['id'].tolist()

    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_valid = lgb.Dataset(valid[features], valid[target], reference=lgb_train)

    # Each fold receives its own copy of the parameter dict.
    clf = lgb.train(dict(lgbm_params), lgb_train, valid_sets=[lgb_valid], verbose_eval=False)

    valid_pred = clf.predict(valid[features].values)
    test_pred = clf.predict(test[features].values)
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC-AUC: {round(roc, 4)}')

print('')
print(f'Average ROC-AUC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_4']
valid_pred_all.to_csv('train_pred_4.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_4': test_pred_all})
sub_2.to_csv('test_pred_4.csv', index=False)

## 5. CatBoost - 2

In [None]:
# Level-0 model 5 (CatBoost, Lossguide growth): out-of-fold predictions ('pred_5') and
# fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

# Settings shared by both devices; only the task type and thread count differ.
cat_params = {
    'depth': 3,
    'grow_policy': 'Lossguide',
    'l2_leaf_reg': 5.0682077505381e-06,
    'random_strength': 0.0001395042015032738,
    'learning_rate': 0.022430394051758566,
    'iterations': 14805,
    'loss_function': 'Logloss',
    'bagging_temperature': 0.6867277487647423,
    'border_count': 210,
    'eval_metric' : 'AUC',
    'use_best_model' : True,
    'early_stopping_rounds' : 300,
    'verbose' : False,
}
if GPU:
    cat_params.update({'task_type' : 'GPU'})
else:
    cat_params.update({'task_type' : 'CPU', 'thread_count' : 4})

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out].copy()
    valid = train_df[held_out].copy()
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    clf = CatBoostClassifier(**cat_params)
    clf.fit(
        train[features].values,
        train[target].values,
        eval_set=[(valid[features].values, valid[target].values)],
    )

    valid_pred = clf.predict_proba(valid[features].values)[:, 1]
    test_pred = clf.predict_proba(test[features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC-AUC: {round(roc, 4)}')

print('')
print(f'Average ROC-AUC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_5']
valid_pred_all.to_csv('train_pred_5.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_5': test_pred_all})
sub_2.to_csv('test_pred_5.csv', index=False)

# Blending (L1 Models)

In [None]:
# Re-read the input files for the blending stage.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, test_file_path, sample_sub_file_path)
)

In [None]:
# Column names of the training frame at this point in the notebook.
prev_features = train_df.columns.tolist()

In [None]:
# Level-0 prediction files for the training rows, one per model.
df1, df2, df3, df4, df5 = map(pd.read_csv, (
    'train_pred_1.csv',
    'train_pred_2.csv',
    'train_pred_3.csv',
    'train_pred_4.csv',
    'train_pred_5.csv',
))

# Matching level-0 prediction files for the test rows.
df_test1, df_test2, df_test3, df_test4, df_test5 = map(pd.read_csv, (
    'test_pred_1.csv',
    'test_pred_2.csv',
    'test_pred_3.csv',
    'test_pred_4.csv',
    'test_pred_5.csv',
))

In [None]:
# Left-join every level-0 prediction frame onto the train and test frames by row id.
for oof_frame in (df1, df2, df3, df4, df5):
    train_df = train_df.merge(oof_frame, on='id', how='left')

for test_frame in (df_test1, df_test2, df_test3, df_test4, df_test5):
    test_df = test_df.merge(test_frame, on='id', how='left')

In [None]:
# Inputs of the level-1 models: every column whose name starts with 'pred'.
cols = train_df.columns.tolist()
blend_features = list(filter(lambda col: str(col).startswith('pred'), cols))
print(blend_features)

## 1. Logistic Regression

In [None]:
# Level-1 blender 1 (logistic regression on the level-0 prediction columns):
# out-of-fold predictions ('pred_L1_1') and fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    model = LogisticRegression(solver='liblinear')
    model.fit(train[blend_features].values, train[target].values)

    valid_pred = model.predict_proba(valid[blend_features].values)[:, 1]
    test_pred = model.predict_proba(test[blend_features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_1']
valid_pred_all.to_csv('L1_train_pred_1.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_L1_1': test_pred_all})
sub_2.to_csv('L1_test_pred_1.csv', index=False)

## 2. Naive Bayes

In [None]:
# Level-1 blender 2 (Gaussian naive Bayes on the level-0 prediction columns):
# out-of-fold predictions ('pred_L1_2') and fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    model = GaussianNB()
    model.fit(train[blend_features].values, train[target].values)

    valid_pred = model.predict_proba(valid[blend_features].values)[:, 1]
    test_pred = model.predict_proba(test[blend_features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_2']
valid_pred_all.to_csv('L1_train_pred_2.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_L1_2': test_pred_all})
sub_2.to_csv('L1_test_pred_2.csv', index=False)

## 3. Linear Regression

In [None]:
# Level-1 blender 3 (ordinary least squares on the level-0 prediction columns):
# out-of-fold predictions ('pred_L1_3') and fold-averaged test predictions, both written to CSV.
# The outputs are regression scores, not probabilities, and may fall outside [0, 1];
# that is acceptable here because ROC-AUC only depends on their ranking.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    # Fit on a flat 1-D target. With the (n, 1) column vector used before, `predict`
    # returned (n, 1) arrays, so the out-of-fold dict stored length-1 arrays instead of
    # scalars and a 2-D array was later assigned as a DataFrame column.
    y_train = train[target].values.ravel()

    model = LinearRegression()
    model.fit(train[blend_features].values, y_train)

    valid_pred = model.predict(valid[blend_features].values)
    test_pred = model.predict(test[blend_features].values)
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_3']
valid_pred_all.to_csv('L1_train_pred_3.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_L1_3': test_pred_all})
sub_2.to_csv('L1_test_pred_3.csv', index=False)

## 4. QDA

In [None]:
# Level-1 blender 4 (quadratic discriminant analysis on the level-0 prediction columns):
# out-of-fold predictions ('pred_L1_4') and fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    model = QuadraticDiscriminantAnalysis()
    model.fit(train[blend_features].values, train[target].values)

    valid_pred = model.predict_proba(valid[blend_features].values)[:, 1]
    test_pred = model.predict_proba(test[blend_features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_4']
valid_pred_all.to_csv('L1_train_pred_4.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_L1_4': test_pred_all})
sub_2.to_csv('L1_test_pred_4.csv', index=False)

## 5. XGBoost

In [None]:
# Level-1 blender 5 (XGBoost on the level-0 prediction columns):
# out-of-fold predictions ('pred_L1_5') and fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

# Settings shared by both devices; tree budget and device entries are added below.
xgb_params = {
    'booster': 'gbtree',
    'random_state': RANDOM_SEED,
    'verbosity': 0,
}
if GPU:
    xgb_params.update({
        'n_estimators': 10000,
        'tree_method':'gpu_hist',
        'gpu_id': 0,
        'predictor': 'gpu_predictor',
    })
else:
    xgb_params.update({
        'n_estimators': 1000,
        'n_jobs': 4,
    })

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    model = XGBClassifier(**xgb_params)
    model.fit(
        train[blend_features].values,
        train[target].values,
        eval_set=[(valid[blend_features].values, valid[target].values)],
        eval_metric='auc',
        early_stopping_rounds=300,
        verbose=False,
    )

    valid_pred = model.predict_proba(valid[blend_features].values)[:, 1]
    test_pred = model.predict_proba(test[blend_features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_5']
valid_pred_all.to_csv('L1_train_pred_5.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_L1_5': test_pred_all})
sub_2.to_csv('L1_test_pred_5.csv', index=False)

## 6. LGBM

In [None]:
# Level-1 blender 6 (LightGBM, scikit-learn API, on the level-0 prediction columns):
# out-of-fold predictions ('pred_L1_6') and fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

# One parameter set for every fold; this blender has no GPU branch.
lgbm_params = dict(
    learning_rate=0.1,
    n_estimators=10000,
    max_depth=2,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    verbose=-1,
    n_jobs=4,
    random_state=RANDOM_SEED,
)

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    model = LGBMClassifier(**lgbm_params)
    model.fit(
        train[blend_features].values,
        train[target].values,
        eval_set=[(valid[blend_features].values, valid[target].values)],
        early_stopping_rounds=300,
        verbose=False,
    )

    valid_pred = model.predict_proba(valid[blend_features].values)[:, 1]
    test_pred = model.predict_proba(test[blend_features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_6']
valid_pred_all.to_csv('L1_train_pred_6.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_L1_6': test_pred_all})
sub_2.to_csv('L1_test_pred_6.csv', index=False)

## 7. CatBoost

In [None]:
# Level-1 blender 7 (CatBoost on the level-0 prediction columns):
# out-of-fold predictions ('pred_L1_7') and fold-averaged test predictions, both written to CSV.
n_folds = train_df['kfold'].nunique()

# Settings shared by both devices; iteration budget, patience and task type differ per device.
cat_params = {
    'depth' : 3,
    'grow_policy' : 'SymmetricTree',
    'learning_rate' : 0.1,
    'loss_function' : 'CrossEntropy',
    'eval_metric' : 'AUC',
    'use_best_model' : True,
    'verbose' : False,
}
if GPU:
    cat_params.update({
        'iterations' : 10000,
        'early_stopping_rounds' : 300,
        'task_type' : 'GPU',
    })
else:
    cat_params.update({
        'iterations' : 1000,
        'early_stopping_rounds' : 100,
        'task_type' : 'CPU',
        'thread_count' : 4,
    })

test_pred_all = None
valid_pred_all = {}
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    valid_ids = valid['id'].tolist()

    model = CatBoostClassifier(**cat_params)
    model.fit(
        train[blend_features].values,
        train[target].values,
        eval_set=[(valid[blend_features].values, valid[target].values)],
    )

    valid_pred = model.predict_proba(valid[blend_features].values)[:, 1]
    test_pred = model.predict_proba(test[blend_features].values)[:, 1]
    valid_pred_all.update(zip(valid_ids, valid_pred))

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

# One row per training id with its out-of-fold prediction.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_7']
valid_pred_all.to_csv('L1_train_pred_7.csv', index=False)

sub_2 = pd.DataFrame({'id': test_df['id'], 'pred_L1_7': test_pred_all})
sub_2.to_csv('L1_test_pred_7.csv', index=False)

# Stacking (L2 Model)

In [None]:
# Re-read the input files for the stacking stage.
train_df, test_df, sub_df = [
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, test_file_path, sample_sub_file_path)
]

In [None]:
# Column names of the training frame at this point in the notebook.
prev_features = train_df.columns.tolist()

In [None]:
# Level-1 prediction files for the training rows, one per blender.
df1, df2, df3, df4, df5, df6, df7 = map(pd.read_csv, (
    'L1_train_pred_1.csv',
    'L1_train_pred_2.csv',
    'L1_train_pred_3.csv',
    'L1_train_pred_4.csv',
    'L1_train_pred_5.csv',
    'L1_train_pred_6.csv',
    'L1_train_pred_7.csv',
))

# Matching level-1 prediction files for the test rows.
df_test1, df_test2, df_test3, df_test4, df_test5, df_test6, df_test7 = map(pd.read_csv, (
    'L1_test_pred_1.csv',
    'L1_test_pred_2.csv',
    'L1_test_pred_3.csv',
    'L1_test_pred_4.csv',
    'L1_test_pred_5.csv',
    'L1_test_pred_6.csv',
    'L1_test_pred_7.csv',
))

In [None]:
# Left-join every level-1 prediction frame onto the train and test frames by row id.
for oof_frame in (df1, df2, df3, df4, df5, df6, df7):
    train_df = train_df.merge(oof_frame, on='id', how='left')

for test_frame in (df_test1, df_test2, df_test3, df_test4, df_test5, df_test6, df_test7):
    test_df = test_df.merge(test_frame, on='id', how='left')

In [None]:
# Inputs of the level-2 model: every column whose name starts with 'pred'.
cols = train_df.columns.tolist()
stack_features = list(filter(lambda col: str(col).startswith('pred'), cols))
print(stack_features)

In [None]:
# Level-2 stacker (logistic regression on the level-1 prediction columns): reports the
# per-fold ROC-AUC and writes the fold-averaged test predictions as the submission file.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
all_roc = []

for i in tqdm(range(n_folds)):
    held_out = train_df['kfold'] == i
    train = train_df[~held_out]
    valid = train_df[held_out]
    test = test_df.copy()

    model = LogisticRegression(solver='liblinear')
    model.fit(train[stack_features].values, train[target].values)

    valid_pred = model.predict_proba(valid[stack_features].values)[:, 1]
    test_pred = model.predict_proba(test[stack_features].values)[:, 1]

    roc = roc_auc_score(valid[target].values, valid_pred)
    all_roc.append(roc)

    # Running sum over folds; turned into a mean after the loop.
    test_pred_all = test_pred if test_pred_all is None else test_pred_all + test_pred

    print(f'Fold {i+1} ROC: {round(roc, 4)}')

print('')
print(f'Average ROC: {round(np.mean(all_roc), 4)} Std: {round(np.std(all_roc), 4)}')
test_pred_all /= n_folds

sub_2 = pd.DataFrame({'id': test_df['id'], 'claim': test_pred_all})
sub_2.to_csv('Stacked_Submission_1.csv', index=False)