# Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from catboost import CatBoostClassifier

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
import plotly.figure_factory as ff
import plotly.express as px

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Seed handed to seed_everything(), both StratifiedKFold splitters and the
# LightGBM parameter dict.
SEED = 2021

# Number of stratified cross-validation folds; test predictions are averaged
# over this many models.
N_SPLITS = 5

# Value used for 'n_estimators' in the LightGBM parameter dict.
N_ESTIMATORS = 5_000

# Forwarded to both models' fit() calls as early_stopping_rounds= and verbose=.
EARLY_STOPPING_ROUNDS = 200
VERBOSE = 500

In [None]:
def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    # startup, so this export would affect child processes rather than the
    # running kernel -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

# Apply the notebook-wide SEED before any data is loaded or any model is fit.
seed_everything(SEED)

# Datasets

In [None]:
# Directory holding the competition files.
INPUT = "../input/tabular-playground-series-oct-2021/"

# Read the three competition CSVs from INPUT, in the same order as before.
train, test, submission = (
    pd.read_csv(INPUT + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Name of the label column. The labels are kept in their own Series and the
# column is removed from `train`.
TARGET = 'target'
target = train[TARGET].copy()
train = train.drop(columns=[TARGET])

# Model inputs: every test column whose name contains the letter 'f'.
features = [name for name in test.columns if 'f' in name]

In [None]:
# Show the dimensions of both frames as the cell output.
train.shape, test.shape

# CatBoostClassifier

In [None]:
# Keyword arguments unpacked into CatBoostClassifier(**cat_params) for each fold.
cat_params = {
    # Boosting schedule.
    'iterations': 2866,
    'od_wait': 3385,
    'learning_rate': 0.04280810491488757,
    # Regularisation and sampling.
    'reg_lambda': 0.32139709692279206,
    'subsample': 0.8442605943226449,
    'random_strength': 22.468752639603235,
    # Tree and leaf settings.
    'depth': 4,
    'min_data_in_leaf': 31,
    'leaf_estimation_iterations': 15,
    # Training runs on the GPU task type with Poisson bootstrap.
    'task_type': "GPU",
    'bootstrap_type': 'Poisson',
}

In [None]:
# Stratified K-fold training of CatBoost.
#   cat_oof  - out-of-fold probability for every training row
#   cat_pred - test probability averaged over the N_SPLITS fold models
cat_oof = np.zeros(train.shape[0])
cat_pred = np.zeros(test.shape[0])

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The test matrix is identical for every fold, so select its columns once.
# The training matrix is still sliced per fold so that no extra full copy of
# it stays alive while the models are being fit.
X_test = test[features]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=target)):
    print(f"===== CatBoostClassifier fold {fold} =====")
    X_train = train[features].iloc[trn_idx]
    y_train = target.iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = target.iloc[val_idx]

    start = time.time()
    model = CatBoostClassifier(**cat_params)
    # NOTE(review): cat_params also sets 'od_wait'; confirm which of the two
    # patience settings CatBoost honours when early_stopping_rounds is passed.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )

    # The last predict_proba column is used as the score for both the
    # held-out rows and the test rows.
    cat_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    cat_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, cat_oof[val_idx])
    print(f"fold {fold} - cat auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

print(f"oof cat roc = {roc_auc_score(target, cat_oof)}")

# Persist both prediction vectors as .npy files.
np.save("cat_oof.npy", cat_oof)
np.save("cat_pred.npy", cat_pred)

# LGBMClassifier

In [None]:
# Keyword arguments unpacked into lgb.LGBMClassifier(**lgb_params) for each fold.
lgb_params = {
    # Task and booster.
    'objective': 'binary',
    'boosting_type': 'gbdt',
    # Tree size and boosting schedule.
    'num_leaves': 62,
    'max_depth': 512,
    'learning_rate': 0.05,
    'n_estimators': N_ESTIMATORS,
    # Regularisation penalties.
    'reg_alpha': 29.5,
    'reg_lambda': 94.1,
    # Every seed is tied to the notebook-wide SEED.
    'random_state': SEED,
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,
    'n_jobs': 4,
    # Row and column subsampling.
    'subsample': 0.5,
    'subsample_freq': 2,
    'colsample_bytree': 0.41,
    # Minimum child size and weight.
    'min_child_samples': 117,
    'min_child_weight': 426,
}

In [None]:
# Stratified K-fold training of LightGBM.
#   lgb_oof         - out-of-fold probability for every training row
#   lgb_pred        - test probability averaged over the N_SPLITS fold models
#   lgb_importances - one row per (feature, fold) with that fold's importance
lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])

# Per-fold importance tables are collected here and joined once after the
# loop: DataFrame.append no longer exists in pandas >= 2.0 and it re-copied
# every accumulated row on each call.
fold_importances = []

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The test matrix is identical for every fold, so select its columns once.
X_test = test[features]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=target)):
    print(f"===== LGBMClassifier fold {fold} =====")
    X_train = train[features].iloc[trn_idx]
    y_train = target.iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = target.iloc[val_idx]

    start = time.time()
    model = lgb.LGBMClassifier(**lgb_params)
    # NOTE(review): newer LightGBM releases are reported to drop the
    # early_stopping_rounds/verbose fit arguments in favour of callbacks --
    # confirm against the installed version before upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )

    # Record this fold's importances, tagged with the fold number and seed.
    fold_importances.append(
        pd.DataFrame(
            {
                'feature': model.feature_name_,
                'importance': model.feature_importances_,
                'fold': fold,
                'seed': SEED,
            }
        )
    )

    # The last predict_proba column is used as the score for both the
    # held-out rows and the test rows.
    lgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    lgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, lgb_oof[val_idx])
    print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# Stack the per-fold tables; each keeps its own 0..n-1 index, as the
# repeated append calls did.
lgb_importances = pd.concat(fold_importances)

print(f"oof lgb roc = {roc_auc_score(target, lgb_oof)}")

# Persist both prediction vectors as .npy files.
np.save("lgb_oof.npy", lgb_oof)
np.save("lgb_pred.npy", lgb_pred)

# Feature importance

In [None]:
# Average every numeric column per feature across the folds once, then reuse
# the result for both the bar order and the plotted data.
mean_importances = lgb_importances.groupby('feature').mean()
order = mean_importances.sort_values('importance', ascending=False).index.tolist()

fig = plt.figure(figsize=(16, 16), tight_layout=True)
sns.barplot(
    x="importance",
    y="feature",
    data=mean_importances.reset_index(),
    order=order,  # bars sorted from highest to lowest mean importance
)
plt.title("LightGBM feature importances")

# Ensembling 

In [None]:
# Separate copies of the sample-submission frame, one per model.
cat_submission = submission.copy()
lgb_submission = submission.copy()

In [None]:
# Store each model's fold-averaged test predictions in the TARGET column of
# its own frame.
cat_submission[TARGET] = cat_pred
lgb_submission[TARGET] = lgb_pred

In [None]:
# Legend labels, listed in the same order as the prediction series below.
group_labels = ['catboost', 'lgbm']

# One series of test predictions per model.
hist_data = [frame.target for frame in (cat_submission, lgb_submission)]

# Distribution plot with the histogram bars and the rug plot switched off.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.3,
    show_hist=False,
    show_rug=False,
)
fig.show()

In [None]:
# Correlation-coefficient matrix between the two models' test predictions.
data = np.corrcoef(cat_submission.target, lgb_submission.target)

# Heatmap of that matrix with both axes labelled via group_labels.
fig = px.imshow(data, x=group_labels, y=group_labels)
fig.show()

In [None]:
# Equal-weight blend of the two models' predictions, written into the
# 'target' column of the final submission frame.
blend = 0.5 * cat_submission.target + 0.5 * lgb_submission.target
submission.loc[:, 'target'] = blend

# Submission

In [None]:
# Write the blended submission frame to disk without the index column, then
# display the frame as the cell output.
submission.to_csv("submission.csv", index=False)
submission