In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer
import lightgbm
import optuna

# Seed shared by the fold splitter, the Optuna sampler and the LightGBM models
RANDOM_STATE = 42
# Number of cross-validation folds
N_FOLD = 5
# Number of bins used to discretize the target before the stratified split
N_BINS = 10


In [None]:
# Load the competition tables.
# The train frame gets a placeholder "fold" column (-1 = not assigned yet) and
# loses its "id" column, which is never used as a model feature.
df_train = (
    pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/train.csv")
    .assign(fold=-1)
    .drop(columns="id")
)
df_test = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/test.csv")
df_train.head()

# Split dataset for CV

Split the dataset into stratified folds, using a discretized version of the target as the stratification label.

In [None]:
# Bin the continuous target so it can serve as a stratification label.
# Other binning strategies worth trying: "quantile", "kmeans".
kbins = KBinsDiscretizer(strategy="uniform", encode="ordinal", n_bins=N_BINS)

# The discretizer expects a 2-D input, hence the single-column reshape.
target = df_train["loss"].to_numpy().reshape(-1, 1)
target_discrete = kbins.fit_transform(target)

In [None]:
# Create folds
skf = StratifiedKFold(N_FOLD, shuffle=True, random_state=RANDOM_STATE)

# split() yields *positional* row indices. Collect the assignment in a plain
# array rather than writing through label-based .loc, which only agrees with
# positions while df_train keeps its default RangeIndex.
fold_id = np.full(len(df_train), -1, dtype=np.int32)
for k, (_, val_index) in enumerate(
    skf.split(np.zeros(len(target)), target_discrete.ravel())
):
    fold_id[val_index] = k

# Each row must land in exactly one validation fold.
assert (fold_id >= 0).all(), "some rows were not assigned to a fold"

# Stored as int32, replacing the -1 placeholder column.
df_train["fold"] = fold_id
df_train.head()

# Train LGBM

In [None]:
def objective(trial, write_submission=False):
    """Return the mean cross-validated RMSE of a LightGBM regressor for one trial.

    Reads the module-level ``df_train`` (must contain the ``loss`` target and a
    ``fold`` column with values ``0 .. N_FOLD-1``), ``df_test``, ``N_FOLD`` and
    ``RANDOM_STATE``.

    Parameters
    ----------
    trial : optuna trial object
        Source of the hyper-parameter values (anything exposing
        ``suggest_float`` / ``suggest_int``).
    write_submission : bool, default False
        When True, print the per-fold RMSE, average the test predictions of the
        fold models and write them to ``submission.csv``.

    Returns
    -------
    float
        Mean over the folds of the best validation RMSE reached by each model.
    """
    # Ordered *list* of feature columns: a set is rejected as a .loc indexer by
    # recent pandas versions, and its iteration order is not stable across
    # interpreter sessions, which would make the feature order (and therefore
    # the seeded column subsampling) non-reproducible.
    excluded_cols = {"id", "loss", "fold"}
    feature_cols = [col for col in df_train.columns if col not in excluded_cols]

    # NOTE(review): the search space is kept as-is, but two ranges look inert:
    # with max_depth <= 4 a tree cannot have more than 16 leaves, so num_leaves
    # in [40, 200] should never bind; and LightGBM documents that bagging
    # (`subsample`) is only enabled when `subsample_freq` > 0 - confirm intent.
    params = {
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 40, 200),
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 4),
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-5, 1, log=True),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 0.1),
        "random_state": RANDOM_STATE,
    }

    rmse_list = []
    fold_models = []  # one fitted regressor per fold, reused for the submission

    for k in range(N_FOLD):
        # Fold k is held out for validation, all other folds are used to train
        val_mask = df_train["fold"] == k
        train_mask = ~val_mask

        X_train = df_train.loc[train_mask, feature_cols].values
        y_train = df_train.loc[train_mask, "loss"].values
        X_val = df_train.loc[val_mask, feature_cols].values
        y_val = df_train.loc[val_mask, "loss"].values

        lgbm = lightgbm.LGBMRegressor(**params)
        # Early stopping goes through the callback API: the
        # `early_stopping_rounds` / `verbose` keyword arguments of fit() are
        # no longer accepted by recent LightGBM releases.
        lgbm.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="rmse",
            callbacks=[lightgbm.early_stopping(stopping_rounds=100, verbose=False)],
        )

        # Validation RMSE at the best iteration found by early stopping
        best_rmse = lgbm.best_score_["valid_0"]["rmse"]
        rmse_list.append(best_rmse)
        fold_models.append(lgbm)

        if write_submission:
            print(f"fold {k}: rmse {best_rmse}")

    rmse_cv = np.mean(rmse_list)

    if write_submission:
        print(f"Writing submission with rmse cv: {rmse_cv}")

        # Same column order and array type as used for training; built once
        # instead of once per fold model.
        X_test = df_test.loc[:, feature_cols].values

        # Blend the fold models by averaging their predictions
        y_test = np.mean([model.predict(X_test) for model in fold_models], axis=0)

        df_sub = pd.DataFrame({"id": df_test["id"], "loss": y_test})
        df_sub.to_csv("submission.csv", float_format="%.12f", index=False)

    return rmse_cv

In [None]:
# Hyper-parameter search: minimize the CV RMSE returned by `objective`
# using a seeded TPE sampler.
sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=20)

# Report the hyper-parameters of the best trial
best_trial = study.best_trial
print("Best trial:", best_trial.params, sep="\n")

In [None]:
# Re-run the objective on the best trial, this time writing submission.csv;
# the returned CV RMSE is displayed as the cell output.
objective(trial=best_trial, write_submission=True)