# TPS September 2021 - Optuna + LGBM

## Set up GPU support for LightGBM

The preinstalled LightGBM package is removed and LightGBM is rebuilt from source so that training can use GPU acceleration.

In [None]:
# Remove the preinstalled lightgbm package and install the Boost libraries
# before building LightGBM from source.
!pip uninstall -y lightgbm
!apt-get install -y libboost-all-dev
# Pin the checkout to a release tag instead of the moving development branch:
# the install cell relies on python-package/setup.py (--precompile) and the
# training cell on lgb.train(early_stopping_rounds=..., verbose_eval=...),
# which are only guaranteed for the 3.x release line.
!git clone --recursive --branch v3.2.1 https://github.com/Microsoft/LightGBM

In [None]:
%%bash
# Build LightGBM with -DUSE_GPU=1, pointing CMake at the OpenCL library and
# headers under /usr/local/cuda.
# Abort on the first failing step so that a failed `cd` or `cmake` cannot let
# the remaining commands run in the wrong directory or on a broken configuration.
set -euo pipefail
cd LightGBM
# -f: a fresh checkout may have no build directory yet; that is not an error.
rm -rf build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j"$(nproc)"

In [None]:
# Install the Python package from the source checkout (--precompile is passed
# so the library built in the previous cell is used).
# `&&` instead of `;`: do not run `setup.py install` in the wrong directory if
# the `cd` fails.
!cd LightGBM/python-package/ && python setup.py install --precompile

In [None]:
# Write an OpenCL vendor file (nvidia.icd) that names libnvidia-opencl.so.1 --
# presumably so the OpenCL loader can find the NVIDIA driver; confirm on the
# target image.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Cleanup: delete the LightGBM source checkout now that the package is installed.
!rm -r LightGBM

## Import libraries

In [None]:
import warnings
warnings.simplefilter("ignore")
import logging
import sys

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
import optuna
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

## Load dataset

In [None]:
# Input locations.
folds_dir = "../input/tps-september-2021-strat-kfolds/"
data_dir = "../input/tabular-playground-series-sep-2021/"

# Training rows come from the file that carries the `kfold` column; the test
# rows and the sample submission come from the competition data directory.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{folds_dir}train_folds.csv",
        f"{data_dir}test.csv",
        f"{data_dir}sample_solution.csv",
    )
)

# Feature columns are selected by a substring match on the test columns.
# NOTE(review): any test column whose name contains "f" is treated as a feature.
features = list(filter(lambda name: "f" in name, df_test.columns))

# Name of the label column, plus a detached copy of the labels.
TARGET = "claim"
target = df_train.loc[:, TARGET].copy()

## Preprocessing

In [None]:
# Handling missing values: replace NaNs in the feature columns with the column mean.
# Only the feature columns are imputed. Running the imputer over the whole frame
# would also touch the id, `kfold` and target columns, cast them to float and
# replace any missing label with the mean of the labels.
# NOTE(review): the means are computed over all folds, including the fold that
# is later held out for validation.
my_imputer = SimpleImputer(strategy="mean")
imputed_df = df_train.copy()
# Assigning back into the named columns keeps the column names and row index intact.
imputed_df[features] = my_imputer.fit_transform(df_train[features])

df_train = imputed_df

## Training

In [None]:
def new_objective(seed=1, n_estimators=4500, fold=0):
    """Create an Optuna objective that tunes LightGBM on one validation fold.

    Rows of the module-level ``df_train`` whose ``kfold`` equals ``fold`` are
    held out for validation; all other rows are used for training. The split
    and the standardization are identical for every trial, so they are done
    once when this factory is called rather than inside each trial.

    Args:
        seed: Value passed to LightGBM as ``random_state``.
        n_estimators: Value passed to LightGBM as ``n_estimators``; training
            may stop earlier because of early stopping.
        fold: Value of the ``kfold`` column to hold out for validation.
            Defaults to 0, the fold that used to be hard-coded.

    Returns:
        A function ``objective(trial)`` returning the ROC AUC on the held-out
        fold for a model trained with the hyperparameters suggested by ``trial``.
    """
    train_rows = df_train[df_train.kfold != fold].reset_index(drop=True)
    valid_rows = df_train[df_train.kfold == fold].reset_index(drop=True)

    y_train = train_rows[TARGET]
    y_valid = valid_rows[TARGET]

    # Standardize: statistics are fitted on the training rows only and then
    # applied unchanged to the validation rows.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(train_rows[features])
    x_valid = scaler.transform(valid_rows[features])

    def objective(trial):
        """Train one LightGBM model for ``trial`` and return its validation AUC."""
        param = {
            "random_state": seed,
            "n_estimators": n_estimators,
            "objective": "binary",
            "metric": "AUC",
            "verbosity": -1,

            # Search space. suggest_float(..., log=True) samples on a log scale
            # and replaces the deprecated suggest_loguniform.
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
            "reg_alpha": trial.suggest_categorical("reg_alpha", [1, 10.0]),
            "reg_lambda": trial.suggest_categorical("reg_lambda", [1e-1, 1e-2]),
            "colsample_bytree": trial.suggest_categorical("colsample_bytree", [0.4, 0.6, 0.8]),
            "subsample": trial.suggest_categorical("subsample", [0.4, 0.6, 0.8]),
            "subsample_freq": trial.suggest_categorical("subsample_freq", [1, 2]),
            "max_depth": -1,
            "num_leaves": trial.suggest_categorical("num_leaves", [128, 512]),
            "min_child_weight": trial.suggest_categorical("min_child_weight", [128, 256]),
            "min_child_samples": trial.suggest_categorical("min_child_samples", [20, 100]),

            # optional: enable gpu
            "device": "gpu",
            "gpu_platform_id": 0,
            "gpu_device_id": 0,
        }

        # The Dataset objects are rebuilt for every trial on purpose; only the
        # trial-independent arrays above are shared between trials.
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_valid, y_valid, reference=lgb_train)
        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of lgb.train in LightGBM 3.x; newer releases expect
        # callbacks instead -- confirm against the installed version.
        model = lgb.train(param, lgb_train, valid_sets=[lgb_valid],
                          early_stopping_rounds=300, verbose_eval=500)
        valid_preds = model.predict(x_valid)

        return roc_auc_score(y_valid, valid_preds)

    return objective

In [None]:
n_trials = 50
seed = 42
print(f"Training with seed {seed}...")

# objective function with custom seed
obj_func = new_objective(seed=seed)

# The study is persisted in a local SQLite file named after the study.
study_name = "optuna_lgbm"
storage_name = f"sqlite:///{study_name}.db"
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    # Seed the sampler as well: without it only LightGBM is seeded and the
    # sequence of suggested hyperparameters changes from run to run.
    sampler=optuna.samplers.TPESampler(seed=seed),
    # Re-running this cell against an existing .db file resumes the stored
    # study (adding n_trials more trials) instead of raising because a study
    # with this name already exists.
    load_if_exists=True,
)
study.optimize(obj_func, n_trials=n_trials)

print("Done.")

In [None]:
# Display the hyperparameters of the study's best trial.
study.best_params

In [None]:
# Display the best objective value found (the objective returns validation ROC AUC).
study.best_value

In [None]:
# Export the study's trials to a DataFrame and display it.
trials_df = study.trials_dataframe()
trials_df

Done.