# Libraries
---

In [None]:
import pandas as pd
import numpy as np
import random
import time
import sys
import os
import gc

import cudf

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

import warnings
warnings.simplefilter('ignore')

# Parameters
---

In [None]:
# --- Run-mode switches ---
DEBUG = True        # when True, training runs on a 40% sample of train (set False for a full run)
EXTRA_DATA = False  # when True, each CV fold is replicated with noise-scaled copies

# --- Cross-validation / boosting budget ---
N_SPLITS = 5                 # number of stratified folds
N_ESTIMATORS = 20_000        # upper bound on boosting rounds
EARLY_STOPPING_ROUNDS = 200  # early-stopping patience handed to model.fit
VERBOSE = 1000               # `verbose` value handed to model.fit

# --- Reproducibility ---
SEED = 42

In [None]:
def seed_everything(seed=42):
    """Seed the random sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 42
        Value used for every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the RNGs once, up front, with the notebook-wide SEED.
seed_everything(SEED)

# Datasets
---

In [None]:
# Name of the label column in the training frame.
TARGET = 'target'

# Read both competition splits with cuDF.
train = cudf.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
test = cudf.read_csv('../input/tabular-playground-series-oct-2021/test.csv')

# Drop the leading column of each frame by position
# (presumably the row id -- TODO confirm against the CSV header).
train, test = train.iloc[:, 1:], test.iloc[:, 1:]

# Debug runs keep only a seeded 40% sample of the training rows.
if DEBUG:
    train = train.sample(frac=0.4, random_state=SEED)

In [None]:
# Candidate feature columns: every column whose name contains the letter 'f'.
features = [name for name in train.columns if 'f' in name]

# float64 columns are treated as continuous; every other dtype as discrete.
is_float64 = {name: train[name].dtype == 'float64' for name in features}

cont_features = [name for name in features if is_float64[name]]
disc_features = [name for name in features if not is_float64[name]]

In [None]:
# Row-wise sum over the discrete columns (presumably 0/1 flags, so this is a
# count of active flags -- TODO confirm).
#
# The derived column is excluded from its own inputs and only registered once,
# so re-running this cell neither double-counts 'bin_count' nor adds a
# duplicate entry to `disc_features`.
binary_features = [col for col in disc_features if col != 'bin_count']

train['bin_count'] = train[binary_features].sum(axis=1)
test['bin_count'] = test[binary_features].sum(axis=1)

if 'bin_count' not in disc_features:
    disc_features.append('bin_count')

In [None]:
# Cast both frames to narrower dtypes: float32 for the continuous columns,
# uint8 for the discrete ones (including the derived 'bin_count').
for frame in (train, test):
    frame[cont_features] = frame[cont_features].astype('float32')
    frame[disc_features] = frame[disc_features].astype('uint8')

In [None]:
# Convert both cuDF frames to pandas DataFrames for the modelling code below.
train, test = train.to_pandas(), test.to_pandas()

# Final model inputs: discrete columns first, then continuous ones.
features = [*disc_features, *cont_features]

In [None]:
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
train.info()
display(train[features].head())

In [None]:
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
test.info()
display(test[features].head())

# XGBoost
---

In [None]:
# Keyword arguments forwarded verbatim to xgb.XGBClassifier(**xgb_params).
xgb_params = {
    # task and reproducibility
    'objective': 'binary:logistic',
    'seed': SEED,

    # boosting schedule
    'n_estimators': N_ESTIMATORS,
    'learning_rate': 0.008,

    # tree shape
    'max_depth': 8,
    'min_child_weight': 256,

    # row / column sampling
    'subsample': 0.6,
    'colsample_bytree': 0.4,

    # regularisation terms
    'alpha': 10,
    'lambda': 0.1,

    # feature-importance flavour reported by the fitted model
    'importance_type': 'gain',

    # GPU training / inference
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
}

## Cross validation

In [None]:
# Stratified K-fold training of XGBoost.
#
# Produces:
#   xgb_oof          out-of-fold predictions (one per training row, or one per
#                    replicated validation row when EXTRA_DATA is on)
#   xgb_pred         test predictions averaged over the folds
#   xgb_importances  per-fold feature importances (long format)
# and saves xgb_oof / xgb_pred as .npy files.

# EXTRA_DATA only: total copies of each fold (1 untouched + N_EXTRADATA - 1
# copies whose continuous features are scaled by N(1, 0.01) noise).
N_EXTRADATA = 5

if not EXTRA_DATA:
    # One slot per training row, filled fold by fold via positional indices.
    xgb_oof = np.zeros(train.shape[0])
xgb_pred = np.zeros(test.shape[0])

# Per-fold pieces are collected in lists and concatenated once after the loop.
# DataFrame.append (used previously) no longer exists in pandas >= 2.0 and
# re-copied the accumulated frame on every fold.
fold_importances = []
oof_chunks = []      # EXTRA_DATA only: validation predictions per fold
target_chunks = []   # EXTRA_DATA only: matching replicated validation labels

# The test matrix is identical for every fold, so it is built once.
X_test = test[features]

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=train[TARGET])):
    print(f"===== fold {fold} =====")
    X_train, y_train = train[features].iloc[trn_idx], train[TARGET].iloc[trn_idx]
    X_valid, y_valid = train[features].iloc[val_idx], train[TARGET].iloc[val_idx]

    if EXTRA_DATA:
        n_trn_rows = X_train.shape[0]
        n_val_rows = X_valid.shape[0]

        # Stack N_EXTRADATA identical copies; the reset index makes the
        # label-based slices below line up with whole copies.
        X_train = pd.concat([X_train] * N_EXTRADATA).reset_index(drop=True)
        y_train = pd.concat([y_train] * N_EXTRADATA).reset_index(drop=True)
        X_valid = pd.concat([X_valid] * N_EXTRADATA).reset_index(drop=True)
        y_valid = pd.concat([y_valid] * N_EXTRADATA).reset_index(drop=True)

        # Copy 0 stays untouched; copies 1..N_EXTRADATA-1 get multiplicative
        # noise on the continuous features. The draw order (train, then valid,
        # per copy) is kept so seeded runs reproduce the same noise.
        for i in tqdm(range(N_EXTRADATA - 1)):
            trn_noise = np.random.normal(loc=1.0, scale=0.01, size=(n_trn_rows, len(cont_features)))
            val_noise = np.random.normal(loc=1.0, scale=0.01, size=(n_val_rows, len(cont_features)))
            # .loc slices are inclusive on both ends, hence the "-1".
            X_train.loc[n_trn_rows*(i+1):n_trn_rows*(i+2)-1, cont_features] *= trn_noise
            X_valid.loc[n_val_rows*(i+1):n_val_rows*(i+2)-1, cont_features] *= val_noise
            # Released inside the loop so nothing is left to delete (and no
            # NameError is possible) when the loop body never runs.
            del trn_noise, val_noise

        gc.collect()

    start = time.time()
    model = xgb.XGBClassifier(**xgb_params)
    # NOTE(review): eval_metric / early_stopping_rounds are passed to fit();
    # newer XGBoost releases expect them on the estimator constructor --
    # confirm against the installed version before upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE
    )

    fold_importances.append(pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_,
        'fold': fold,
        'seed': SEED,
    }))

    # Probability of the last class for every validation row.
    y_hat = model.predict_proba(X_valid)[:, -1]
    if EXTRA_DATA:
        oof_chunks.append(y_hat)
        target_chunks.append(y_valid)
    else:
        xgb_oof[val_idx] = y_hat

    # Each fold contributes an equal share to the averaged test prediction.
    xgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, y_hat)
    print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    del X_train, y_train, X_valid, y_valid, y_hat
    gc.collect()

# Same content and index layout as appending fold by fold, built in one pass.
xgb_importances = pd.concat(fold_importances)

if EXTRA_DATA:
    xgb_oof = np.concatenate(oof_chunks)
    target = np.concatenate(target_chunks)
    print(f"oof xgb auc = {roc_auc_score(target, xgb_oof)}")
else:
    print(f"oof xgb auc = {roc_auc_score(train[TARGET], xgb_oof)}")

np.save("xgb_oof.npy", xgb_oof)
np.save("xgb_pred.npy", xgb_pred)

## Feature importance

In [None]:
# Average each feature's importance over the folds (computed once, reused for
# both the bar order and the plotted values).
mean_importances = xgb_importances.groupby('feature').mean()

# Features sorted from most to least important.
order = list(mean_importances.sort_values('importance', ascending=False).index)

fig = plt.figure(figsize=(16, 32), tight_layout=True)
sns.barplot(
    x="importance",
    y="feature",
    data=mean_importances.reset_index(),
    order=order,
)
plt.title("XGBoost feature importances")

# Submission
---

In [None]:
# Start from the sample submission and overwrite its target column with the
# fold-averaged test predictions (assumes the sample file's row order matches
# test.csv -- TODO confirm).
submission = (
    pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
    .assign(**{TARGET: xgb_pred})
)

submission.to_csv("submission.csv", index=False)
submission