In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna
from optuna.pruners import SuccessiveHalvingPruner, MedianPruner
from optuna.integration import XGBoostPruningCallback, LightGBMPruningCallback
from optuna.distributions import *

## Find optimal hyperparameters for XGB, LGB using Optuna + GPU

In [None]:
# Load the competition data and build the feature list.
# `test` is read only to derive the feature column names in this cell; every
# transformation of `test` below is commented out.

train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

# Feature columns are the test columns whose name contains the letter 'f'.
features = [col for col in test.columns if 'f' in col]
TARGET = 'claim'

# Per-row count of missing values, taken before any imputation happens.
train['n_missing'] = train[features].isna().sum(axis=1)
#test['n_missing'] = test[features].isna().sum(axis=1)

# Per-row standard deviation over the original feature columns only:
# `features` is extended with the engineered columns after this line.
train['std'] = train[features].std(axis=1)
#test['std'] = test[features].std(axis=1)

# From here on the two engineered columns are treated as ordinary features.
features += ['n_missing', 'std']

In [None]:
# Replace missing values with the per-column mean computed over all of `train`.
train[features] = train[features].fillna(train[features].mean())
#test[features] = test[features].fillna(test[features].mean())

# Standardise every feature column; `scaler` is kept so the same transform
# could be applied to `test` (that call is commented out below).
# NOTE(review): the means above and this scaler are fitted on the full training
# set before `objective` splits it into folds, so validation rows contribute to
# the preprocessing statistics - confirm this is acceptable for the CV score.
scaler = StandardScaler()
train[features] = scaler.fit_transform(train[features])
#test[features] = scaler.transform(test[features])

In [None]:
# Split into target and model inputs without mutating `train`.
# `train.pop(TARGET)` removed the column in place, so running this cell a second
# time raised KeyError; selecting and dropping keeps the cell re-runnable.
y_cv = train[TARGET]
X_cv = train.drop(columns = ['id', TARGET])

In [None]:
# Tuning budget in seconds (seven hours) and the model family to optimise.
TIMEOUT = 7 * 60 * 60
MODEL_NAME = 'lgb'

In [None]:
def get_params(trial, model_name):
    '''
    Build the hyperparameters for one Optuna trial.

    Args:
        trial: an optuna trial (any object exposing `suggest_int` and
            `suggest_float`)
        model_name: 'xgb' or 'lgb' select a search space; 'cb' and any other
            value have no search space defined yet

    Returns:
        (trial_params, user_params): the parameters sampled from the trial and
        the fixed parameters. Both are empty dicts when no search space exists
        for `model_name`.
    '''

    # Defaults cover 'cb' and unknown names. Previously `user_params` was never
    # bound on those paths, so the return statement raised UnboundLocalError.
    trial_params = {}
    user_params = {}

    if model_name == 'xgb':
        # NOTE(review): 'subsample' and 'colsample_bytree' start at 0.0, so the
        # sampler may propose near-zero fractions - confirm this is intended.
        trial_params = {
            'max_depth': trial.suggest_int('max_depth', 2, 16),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 100, 200),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 512),
            'subsample': trial.suggest_float('subsample', 0.0, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.0, 1.0),
            'alpha': trial.suggest_int('alpha', 0, 100),
            'lambda': trial.suggest_int('lambda', 1, 100),
            'gamma': trial.suggest_float('gamma', 0.0, 0.1)}

        # Fixed settings, not tuned.
        user_params = {
            'use_label_encoder': False,
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'random_state': 0,
            'verbosity': 0,
            'n_jobs':4,
            'objective': 'binary:logistic'}

    elif model_name == 'lgb':
        trial_params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 20000),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 20, 3000),
            'max_depth': trial.suggest_int('max_depth', 3, 16),
            # `step` is passed by keyword like the other stepped parameters;
            # it is keyword-only in newer Optuna releases.
            'min_child_samples': trial.suggest_int('min_child_samples', 200, 10000, step=100),
            'reg_alpha': trial.suggest_int('reg_alpha', 0, 100, step=5),
            'reg_lambda': trial.suggest_int('reg_lambda', 0, 100, step=5),
            'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 10.0),
            'subsample': trial.suggest_float('subsample', 0.1, 1.0),
            'subsample_freq': trial.suggest_int('subsample_freq', 1, 5),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)}

        # Fixed settings, not tuned.
        user_params = {
            'n_jobs': 4,
            'device': 'gpu',
            'objective': 'binary'}

    return trial_params, user_params


def select_model(model_name):
    '''
    Map a model key to its estimator class.

    Args:
        model_name: 'xgb', 'lgb' or 'cb'

    Returns:
        The matching classifier class (not an instance), or None when the key
        is not recognised.
    '''
    if model_name == 'xgb':
        return XGBClassifier
    if model_name == 'lgb':
        return LGBMClassifier
    if model_name == 'cb':
        return CatBoostClassifier
    return None


def generate_fit_params(trial, model_name, X_train, y_train, X_val, y_val):
    '''
    Assemble the keyword arguments passed to `model.fit` for one fold.

    Args:
        trial: an optuna trial, handed to the pruning callback
        model_name: 'xgb' or 'lgb'; any other value yields an empty dict
        X_train, y_train: training fold
        X_val, y_val: validation fold, used as the single eval set

    Returns:
        Dict of fit arguments: data, eval set, log-loss metric, early stopping
        after 100 rounds, silent output and the Optuna pruning callback.
    '''
    # 'cb' and unknown models have no fit configuration.
    if model_name not in ('xgb', 'lgb'):
        return {}

    # Only the metric name and the pruning callback differ between libraries.
    if model_name == 'xgb':
        metric_name = 'logloss'
        pruning_callback = XGBoostPruningCallback(trial, 'validation_0-logloss')
    else:
        metric_name = 'binary_logloss'
        pruning_callback = LightGBMPruningCallback(trial, 'binary_logloss')

    return {'X': X_train,
            'y': y_train,
            'eval_set': [(X_val, y_val)],
            'eval_metric': metric_name,
            'early_stopping_rounds': 100,
            'verbose': 0,
            'callbacks': [pruning_callback]}


def _take_rows(values, positions):
    '''Select rows by position: `.iloc` for pandas objects, plain indexing otherwise.'''
    if hasattr(values, 'iloc'):
        return values.iloc[positions]
    return values[positions]


def objective(trial, X, y, model_name, cv = None):

    '''
    Score one Optuna trial by cross-validated log loss (lower is better).

    Args:
        trial: an optuna trial
        X: input features; rows are selected with `.iloc`
        y: target, aligned row-for-row with X
        model_name: key understood by `get_params`, `select_model` and
            `generate_fit_params`
        cv: cross validation strategy exposing `split(X, y)`; defaults to a
            shuffled 5-fold StratifiedKFold with random_state 0

    Returns:
        Mean log loss over the validation folds.
    '''

    if cv is None:
        cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

    trial_params, user_params = get_params(trial, model_name)

    if user_params is not None:
        # Store the fixed parameters on the trial so they can be read back
        # together with the sampled ones.
        for k, v in user_params.items():
            trial.set_user_attr(k, v)
        params = {**trial_params, **user_params}
    else:
        params = trial_params

    cv_scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        # The split yields positions, so y must be indexed by position like X.
        # `y[train_idx]` looked rows up by label on a Series, which misaligns
        # or raises KeyError when the index is not the default 0..n-1 range.
        y_train, y_val = _take_rows(y, train_idx), _take_rows(y, val_idx)

        # A fresh model per fold, fitted with the fold's validation set as eval set.
        model = select_model(model_name)(**params)
        model.fit(**generate_fit_params(trial = trial,
                                        model_name = model_name,
                                        X_train = X_train,
                                        y_train = y_train,
                                        X_val = X_val,
                                        y_val = y_val))

        probas = model.predict_proba(X_val)
        cv_scores.append(log_loss(y_val, probas))

    return np.mean(cv_scores)


In [None]:
def _run_trial(trial):
    '''Evaluate one trial of the configured model on the full cross-validation data.'''
    return objective(trial, X_cv, y_cv, model_name = MODEL_NAME)


# The study is named after the model and stored in a local SQLite file;
# load_if_exists=True is set so a same-named study in that file is reused.
study = optuna.create_study(direction = 'minimize',
                            study_name = MODEL_NAME,
                            storage = 'sqlite:///optuna.db',
                            load_if_exists = True)

# Keep running trials until the time budget is used up.
study.optimize(_run_trial, timeout = TIMEOUT)

In [None]:
# Report the winning trial: its number, its score, and the sampled parameters
# merged with the fixed ones that `objective` stored as user attributes.
best_trial = study.best_trial
print(f'Best Trial Number: {best_trial.number}',
      f'Best Trial Score: {best_trial.value}',
      'Best Trial Params:',
      {**best_trial.params, **best_trial.user_attrs},
      sep = '\n')