# TPS September 2021 - LGBM Optuna Baseline

In [None]:
import math
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgbm
import optuna.integration.lightgbm as lgbo
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.metrics import roc_auc_score
import catboost
import optuna
import tensorflow as tf

sns.set_theme()
sns.set_palette(palette = "rainbow")

gpu_available = tf.test.is_gpu_available()
%matplotlib inline

## Load, Preprocess, Setup

In [None]:
# Read the competition files and index both frames by `id`. The `claim`
# column is split off as the target so `train` holds features only.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv').set_index('id')
target = train['claim']
train = train.drop(columns='claim')

test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv').set_index('id')

In [None]:
# Impute missing values with the per-column mean of the training set. The
# training means are applied to the test set as well, so both frames follow
# the same imputation rule.
#
# Whole-frame `fillna` with reassignment is used on purpose:
# `frame[col].fillna(value, inplace=True)` acts on a column selected from the
# frame, and under pandas Copy-on-Write that no longer updates the frame.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [None]:
# Fold count reported below; 1 / FOLDS is also the hold-out fraction of the
# calibration split used for the hyperparameter search.
FOLDS = 20

print(
    "Using {} Folds, Calibration Train Size of {}, Calibration Test Size of {}".format(
        FOLDS, 1 - 1 / FOLDS, 1 / FOLDS
    )
)

In [None]:
# Stratified hold-out split used only for the hyperparameter search:
# a 1 / FOLDS share of the rows becomes the validation part.
cal_X_train, cal_X_val, cal_y_train, cal_y_val = train_test_split(
    train,
    target,
    test_size=1 / FOLDS,
    stratify=target,
    random_state=0,
)

In [None]:
def score_model(mod, X, y):
    """Return the ROC AUC of ``mod``'s predictions on ``X`` against ``y``.

    Parameters
    ----------
    mod
        Fitted model exposing ``predict(X)``; its output is used as the
        ranking score passed to ``roc_auc_score``.
    X
        Features handed to ``mod.predict``.
    y
        True labels for the rows of ``X``.

    Returns
    -------
    float
        The ROC AUC as computed by ``roc_auc_score``.
    """
    y_pred = mod.predict(X)
    # Report the AUC itself. Taking a square root (as one would to turn MSE
    # into RMSE) inflates any score in (0, 1) and is not a meaningful metric.
    return roc_auc_score(y, y_pred)

## Search Hyperparameters

In [None]:
# Set to False to skip the search and reuse the stored parameters in the
# else-branch below.
RUN_PARAM_SEARCH = True

# Both branches use the same device choice, so the stored parameters also
# work on a machine without a GPU.
lgbm_device = "gpu" if gpu_available else "cpu"

if RUN_PARAM_SEARCH:
    search_params = {"objective": "binary", "metric": "auc", "device": lgbm_device}

    lgb_train = lgbm.Dataset(cal_X_train, cal_y_train)
    lgb_valid = lgbm.Dataset(cal_X_val,   cal_y_val)

    # `lgbo.train` is optuna's LightGBM integration; the parameters used from
    # here on are read back from the booster it returns.
    model = lgbo.train(
        search_params,
        lgb_train,
        valid_sets=[lgb_valid],
        verbose_eval=False,
        num_boost_round=100,
        early_stopping_rounds=5,
    )

    params = model.params

    # Persist the parameters so a later run can skip the search.
    with open("lgbm.json", "w") as file:
        json.dump(params, file, indent=4)
else:
    params = {
        "objective": "binary",
        "metric": "auc",
        "device": lgbm_device,
        "feature_pre_filter": False,
        "lambda_l1": 1.5524815747440739,
        "lambda_l2": 3.0890640417182456e-7,
        "num_leaves": 132,
        "feature_fraction": 0.4,
        "bagging_fraction": 1,
        "bagging_freq": 0,
        "min_child_samples": 5,
        "num_iterations": 100,
        "early_stopping_round": 5
    }

# Final training settings: a small learning rate with a large iteration
# budget, overriding whatever either branch above put in `params`.
params["learning_rate"] = 0.006
params["num_iterations"] = 80000
params

## K-Fold-Prediction

In [None]:
# Train one model per stratified fold, keeping each model's test-set
# predictions and its score on the fold's held-out rows.
test_preds = []
fold_score = []

# `params` can already contain `early_stopping_round` (the stored parameter
# set has it at 5). LightGBM 3.x uses a value found in the params dict in
# place of the `early_stopping_rounds` keyword, so the patience wanted for the
# final fit is set in the params themselves rather than passed as a keyword.
fold_params = dict(params, early_stopping_round=100)

# FOLDS is the notebook-wide fold count (20).
splitter = StratifiedKFold(n_splits=FOLDS, random_state=42, shuffle=True)

for fold, (trn_idx, val_idx) in enumerate(splitter.split(train, target)):
    print("Fold :", fold + 1)

    fold_X_train, fold_y_train = train.iloc[trn_idx], target.iloc[trn_idx]
    fold_X_test,  fold_y_test  = train.iloc[val_idx], target.iloc[val_idx]

    lgb_train = lgbm.Dataset(fold_X_train, fold_y_train)
    lgb_valid = lgbm.Dataset(fold_X_test,  fold_y_test)

    # The held-out fold doubles as the early-stopping validation set.
    model = lgbm.train(
        fold_params,
        lgb_train,
        valid_sets=[lgb_valid],
        verbose_eval=False,
    )

    test_preds.append(model.predict(test))
    score = score_model(model, fold_X_test, fold_y_test)
    fold_score.append(score)

    print("#### fold #########", score)

## Model Prediction Evaluation

In [None]:
# Bar chart of the per-fold validation scores (x is the 0-based fold index,
# matching the numbering of the per-fold submission files).
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.barplot(x=list(range(len(fold_score))), y=fold_score, ax=ax)

# Zoom the y-axis onto the observed range, with a margin on both sides:
# without it the lowest-scoring fold is drawn as a zero-height bar, and the
# limits collapse to a single value when all folds score the same.
lowest, highest = min(fold_score), max(fold_score)
margin = 0.1 * (highest - lowest) or 1e-3
g.set_ylim(lowest - margin, highest + margin)

g.set_title("Model Performance")
g.set_xlabel("Fold")
g.set_ylabel("Validation score")

None

In [None]:
# Heatmap of the pairwise correlation between the folds' test predictions.
# The upper triangle, diagonal included, is masked, so each pair of folds is
# shown once.
fig, ax = plt.subplots(figsize=(11, 9))

corr = np.corrcoef(test_preds)
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

g = sns.heatmap(corr, mask=upper_triangle, ax=ax)
g.set_title("Test Prediction Correlation")

None

In [None]:
# The sample file supplies the submission layout. It is read once and reused
# for every output file instead of being re-read on each loop iteration.
# NOTE(review): this assumes the rows of the sample file are in the same
# order as `test` — confirm.
submission_template = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# One submission file per fold model.
for i, pred in enumerate(test_preds):
    submission_template.assign(claim=pred).to_csv("submission_{}.csv".format(i), index=False)

# The final submission is the plain average of all fold models' predictions.
total_preds = np.mean(test_preds, axis=0)

preds = submission_template.assign(claim=total_preds)
preds.to_csv("submission.csv", index=False)