# Libraries
---

In [None]:
import datatable as dt

import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters
---

In [None]:
# Run-wide configuration, gathered in one place so it can be tuned without
# touching the modelling cells.
SEED = 42                      # shared seed: global RNGs, fold shuffling, LightGBM
N_SPLITS = 5                   # number of stratified cross-validation folds
N_ESTIMATORS = 20_000          # `n_estimators` handed to LightGBM
EARLY_STOPPING_ROUNDS = 200    # `early_stopping_rounds` handed to model.fit
VERBOSE = 1_000                # `verbose` handed to model.fit

In [None]:
def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG, and writes the
    seed (as a string) to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value applied to every generator handled here.
    """
    # NOTE(review): PYTHONHASHSEED is only stored in os.environ here; it
    # presumably does not change hashing in the already-running kernel --
    # confirm if hash determinism matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, with the notebook-wide SEED.
seed_everything(seed=SEED)

# Datasets
---

In [None]:
# Name of the label column in the training frame.
TARGET = 'target'

# Parse both CSVs with datatable, then hand them over to pandas.
train = dt.fread('../input/tabular-playground-series-oct-2021/train.csv').to_pandas()
test = dt.fread('../input/tabular-playground-series-oct-2021/test.csv').to_pandas()

# Keep every column except the first one of each frame
# (presumably the row id -- confirm against the CSV header).
train = train.loc[:, train.columns[1:]]
test = test.loc[:, test.columns[1:]]

# Store the label as an unsigned 8-bit integer.
train[TARGET] = train[TARGET].astype('uint8')

In [None]:
# Candidate feature columns: every column whose name contains 'f'.
features = [col for col in train.columns if 'f' in col]

# Partition the candidates by dtype: floating-point columns are continuous,
# everything else is treated as discrete.
#
# Any float dtype counts as continuous (not only float64), so that re-running
# this cell after the float32 downcast further down still classifies the
# columns correctly instead of pushing all of them into `disc_features`.
cont_features = []
disc_features = []

for col in features:
    if pd.api.types.is_float_dtype(train[col].dtype):
        cont_features.append(col)
    else:
        disc_features.append(col)

In [None]:
# Downcast both frames the same way: continuous columns to float32 and
# discrete columns to uint8.
for frame in (train, test):
    frame[cont_features] = frame[cont_features].astype('float32')
    frame[disc_features] = frame[disc_features].astype('uint8')

In [None]:
# Row-wise sum of the discrete columns, added as an extra discrete feature.
#
# The derived column is excluded from its own inputs and only appended to
# `disc_features` once, so re-running this cell neither sums the previous
# 'bin_count' into the new one nor duplicates the name in the feature list.
bin_count_inputs = [col for col in disc_features if col != 'bin_count']

train['bin_count'] = train[bin_count_inputs].sum(axis=1)
test['bin_count'] = test[bin_count_inputs].sum(axis=1)

if 'bin_count' not in disc_features:
    disc_features.append('bin_count')

In [None]:
# Learn the scaling statistics from the training frame only, then apply the
# same fitted scaler to the continuous columns of both frames.
scaler = RobustScaler().fit(train[cont_features])
train[cont_features] = scaler.transform(train[cont_features])
test[cont_features] = scaler.transform(test[cont_features])

In [None]:
# Final model input order: discrete columns first, continuous columns after.
features = [*disc_features, *cont_features]

In [None]:
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() only adds a stray "None" to the output.
train.info()
display(train[features].head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() only adds a stray "None" to the output.
test.info()
display(test[features].head())

# LightGBM
---

In [None]:
# Keyword arguments unpacked into lgb.LGBMClassifier for every fold.
lgb_params = {
    'objective': 'binary',
    'n_estimators': N_ESTIMATORS,
    'random_state': SEED,
    'learning_rate': 8e-3,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
    # LightGBM interprets integers here as column *indices*. The previous value,
    # `len(disc_features)`, was a count and therefore flagged only the single
    # column at that position -- a continuous one, because the model input is
    # ordered as `disc_features + cont_features`. List the positions of all
    # discrete columns instead (this relies on that same column ordering).
    'categorical_feature': list(range(len(disc_features))),
}

## Cross validation

In [None]:
# Stratified K-fold training of LightGBM.
#   lgb_oof  - out-of-fold probability for every training row
#   lgb_pred - test probabilities averaged over the N_SPLITS fold models
lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# These column selections do not depend on the fold, so build them once
# instead of re-copying the full feature matrices on every iteration.
X_all, y_all = train[features], train[TARGET]
X_test = test[features]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=train[TARGET])):
    print(f"===== fold {fold} =====")
    X_train, y_train = X_all.iloc[trn_idx], y_all.iloc[trn_idx]
    X_valid, y_valid = X_all.iloc[val_idx], y_all.iloc[val_idx]

    start = time.time()
    model = lgb.LGBMClassifier(**lgb_params)
    # NOTE(review): `early_stopping_rounds` / `verbose` are accepted as fit()
    # keywords by the LightGBM release this notebook was written for; newer
    # releases are expected to want callbacks instead -- confirm against the
    # installed version before upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )

    # Last column of predict_proba is the probability of the positive class.
    lgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    # Each fold contributes 1/N_SPLITS of the final test prediction.
    lgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    # Elapsed time covers fitting plus both prediction calls.
    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, lgb_oof[val_idx])
    print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# AUC over all out-of-fold predictions.
print(f"oof lgb roc = {roc_auc_score(train[TARGET], lgb_oof)}")

# Persist both prediction vectors to .npy files.
np.save("lgb_oof.npy", lgb_oof)
np.save("lgb_pred.npy", lgb_pred)

# Submission
---

In [None]:
# Take the sample submission as the template and set its target column to
# the fold-averaged test predictions.
submission = (
    pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
    .assign(**{TARGET: lgb_pred})
)
submission.to_csv("submission.csv", index=False)
submission