# Libraries
---

In [None]:
import cudf

import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters
---

In [None]:
# Cross-validation / training settings shared by the cells below.
N_SPLITS = 5                  # number of stratified folds
N_ESTIMATORS = 20000          # passed to XGBoost as 'n_estimators'
EARLY_STOPPING_ROUNDS = 200   # passed to model.fit(early_stopping_rounds=...)
VERBOSE = 1000                # passed to model.fit(verbose=...)
SEED = 42                     # seed for the RNGs, the fold splitter and XGBoost

# Name of the current working directory; used as the prefix of the submission
# file. os.path.basename honours the platform's path separator, whereas
# splitting on "/" returns the whole path on systems that use a different one.
CURRENT_PATH = os.path.basename(os.getcwd())

In [None]:
def seed_everything(seed=42):
    """Seed the random number sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every seeding call.
    """
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this would affect child processes only -- confirm.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)
    
# Apply the notebook-wide seed before any data is loaded or split into folds.
seed_everything(SEED)

# Datasets
---

In [None]:
%%time

# Column used as the label throughout the rest of the notebook.
TARGET = 'target'

# Read both competition files into cuDF dataframes.
train = cudf.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
test = cudf.read_csv('../input/tabular-playground-series-oct-2021/test.csv')

# Keep every column except the first one of each frame.
train, test = (frame[frame.columns[1:]] for frame in (train, test))

In [None]:
# Candidate feature columns: every column whose name contains the letter 'f'.
features = [name for name in train.columns if 'f' in name]

# float64 columns are treated as continuous, everything else as discrete.
cont_features = [name for name in features if train[name].dtype == 'float64']
_cont_lookup = set(cont_features)
disc_features = [name for name in features if name not in _cont_lookup]

In [None]:
# Downcast both frames the same way: continuous columns to float32,
# discrete columns to uint8.
for frame in (train, test):
    frame[cont_features] = frame[cont_features].astype('float32')
    frame[disc_features] = frame[disc_features].astype('uint8')

In [None]:
# Row-wise sum of the discrete features, leaving out 'f22' and 'f43'.
cols = list(disc_features)
for excluded in ('f22', 'f43'):
    cols.remove(excluded)

for frame in (train, test):
    frame['disc_sum'] = frame[cols].sum(axis=1)

# The new column is tracked alongside the other discrete features.
disc_features.append('disc_sum')

In [None]:
# One binary "over the mean" flag per continuous feature, named '<feature>_ovr'.
cols_ovr = [f'{col}_ovr' for col in cont_features]

# The thresholds are the TRAIN column means and are reused for the test set, so
# a given raw value maps to the same flag in both frames. Thresholding the test
# set on its own means would define the feature differently at fit time and at
# prediction time.
cont_means = train[cont_features].mean()
train[cols_ovr] = (train[cont_features] > cont_means).astype('uint8')
test[cols_ovr] = (test[cont_features] > cont_means).astype('uint8')

# Register the flags as discrete features; names already present are skipped so
# re-running this cell does not add duplicates.
disc_features += [col for col in cols_ovr if col not in disc_features]

In [None]:
# Convert both cuDF frames to pandas for the modelling cells below.
train, test = train.to_pandas(), test.to_pandas()

# Final model inputs: discrete columns first, then continuous ones.
features = [*disc_features, *cont_features]

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
train.info()
display(train[features].head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
test.info()
display(test[features].head())

# XGBoost
---

In [None]:
# Keyword arguments forwarded to xgb.XGBClassifier(**xgb_params).
xgb_params = {
    # task and reproducibility
    'objective': 'binary:logistic',
    'seed': SEED,

    # boosting schedule
    'n_estimators': N_ESTIMATORS,
    'learning_rate': 8e-3,

    # tree shape
    'max_depth': 8,
    'min_child_weight': 8,

    # row / column sampling
    'subsample': 0.6,
    'colsample_bytree': 0.4,
    'colsample_bylevel': 0.9,

    # regularisation
    'alpha': 64,
    'lambda': 32,

    # GPU execution
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',

    # how model.feature_importances_ is computed
    'importance_type': 'total_gain',
}

## Cross validation

In [None]:
# Stratified K-fold training of XGBoost.
#
# Produces:
#   xgb_oof          out-of-fold positive-class probability for every train row
#   xgb_pred         test-set probability averaged over the N_SPLITS fold models
#   xgb_importances  one row per (feature, fold) with that fold's importance
xgb_oof = np.zeros(train.shape[0])
xgb_pred = np.zeros(test.shape[0])

# Per-fold importance frames are collected here and concatenated once after the
# loop. DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop re-copies it on every iteration.
fold_importances = []

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The test matrix is the same for every fold, so it is built once.
X_test = test[features]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=train[TARGET])):
    print(f"===== fold {fold} =====")
    X_train, y_train = train[features].iloc[trn_idx], train[TARGET].iloc[trn_idx]
    X_valid, y_valid = train[features].iloc[val_idx], train[TARGET].iloc[val_idx]

    start = time.time()
    model = xgb.XGBClassifier(**xgb_params)
    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than on fit() -- confirm
    # against the installed version before upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE
    )

    fold_importances.append(pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_,
        'fold': fold,
        'seed': SEED,
    }))

    # The last column of predict_proba is the positive-class probability.
    xgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    # Each fold contributes 1/N_SPLITS, so the sum is the mean over folds.
    xgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, xgb_oof[val_idx])
    print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# Like the repeated append it replaces, concat keeps each fold's own 0..n-1 index.
xgb_importances = pd.concat(fold_importances)

print(f"oof xgb auc = {roc_auc_score(train[TARGET], xgb_oof)}")

# Persist both prediction vectors for later reuse.
np.save("xgb_oof.npy", xgb_oof)
np.save("xgb_pred.npy", xgb_pred)

## Feature importance

In [None]:
# Average each feature's rows across folds, then list the features from most to
# least important so the bars are drawn in that order.
mean_importances = xgb_importances.groupby('feature').mean()
order = mean_importances.sort_values('importance', ascending=False).index.tolist()

fig = plt.figure(figsize=(16, 32), tight_layout=True)
sns.barplot(
    data=mean_importances.reset_index(),
    x="importance",
    y="feature",
    order=order,
)
plt.title("XGBoost feature importances")

# Submission
---

In [None]:
# Start from the sample submission and fill in the fold-averaged predictions.
sample_path = '../input/tabular-playground-series-oct-2021/sample_submission.csv'
submission = pd.read_csv(sample_path)
submission[TARGET] = xgb_pred

# The output file name is prefixed with the working-directory name.
output_name = f"{CURRENT_PATH}_submission.csv"
submission.to_csv(output_name, index=False)
submission