# Load Dependencies

In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.integration import LightGBMPruningCallback

import warnings
# Ignore two specific warnings; each pattern is matched against the start of the
# warning's message text.
for _ignored_message in (
    "categorical_column in param dict is overridden.",
    'Overriding the parameters from Reference Dataset.',
):
    warnings.filterwarnings("ignore", message=_ignored_message)


In [None]:
# Read the competition's training set, test set and sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

In [None]:
# Encode the string class labels as integer codes: "Class_1" -> 0, ..., "Class_9" -> 8.
target_codes = {f"Class_{i}": i - 1 for i in range(1, 10)}

# Only encode while the column still holds the raw labels. Re-running this cell on
# an already-encoded column would compare integers against the "Class_*" strings,
# match nothing, and collapse every row to a single fallback value.
if not train["target"].isin(list(target_codes.values())).all():
    encoded_target = train["target"].map(target_codes)
    unmapped = encoded_target.isna()
    if unmapped.any():
        # Fail loudly instead of silently filing unknown labels under class 0.
        unknown_labels = sorted(train.loc[unmapped, "target"].astype(str).unique())
        raise ValueError(f"Unexpected target labels: {unknown_labels}")
    train["target"] = encoded_target.astype("int64")

# Feature matrices (id column removed) and the encoded target vector.
X_test = test.drop(columns=['id'])
X = train.drop(columns=['id', 'target'])
y = train.target

In [None]:
# Cast every feature column of both frames to pandas' categorical dtype.
# The conversion is done column by column, so each column keeps its own category set.
X = X.astype('category')
X_test = X_test.astype('category')

In [None]:
# Constants shared by the tuning and final-training cells.
K = 5        # number of cross-validation folds
SEED = 314   # seed for the model and the fold splitter
ESR = 100    # early-stopping patience handed to LGBMClassifier.fit

# LightGBM settings that stay constant across all Optuna trials.
fixed_params = dict(
    random_state=SEED,
    n_estimators=100000,
    # boosting_type='goss',
    learning_rate=0.01,
    metric='multi_logloss',
)

# Stratified K-fold splitter; shuffling is seeded so the folds are repeatable.
kf = StratifiedKFold(
    n_splits=K,
    random_state=SEED,
    shuffle=True,
)

The LightAutoML out-of-fold and test predictions loaded in the next cell come from this notebook: https://www.kaggle.com/gomes555/tps-jun2021-lightautoml

In [None]:
oof_lightautoml = pd.read_csv('../input/tps-jun2021-lightautoml/oof_lightautoml.csv')
sub_lightautoml = pd.read_csv('../input/tps-jun2021-lightautoml/sub_lightautoml.csv')

# The predictions are attached to X / X_test purely by row position below, so first
# make sure each file lists exactly the same ids, in the same order, as the frame
# it is going to extend. A mismatch would otherwise pair features with the wrong rows.
for _label, _preds, _frame in (
    ("oof_lightautoml", oof_lightautoml, train),
    ("sub_lightautoml", sub_lightautoml, test),
):
    if len(_preds) != len(_frame) or not (_preds['id'].to_numpy() == _frame['id'].to_numpy()).all():
        raise ValueError(f"{_label} rows are not aligned with the ids of the frame they extend")

# One stacked-feature column per class probability.
lightautoml_columns = ['pred_lightautoml' + str(i) for i in range(1, 10)]

oof_lightautoml = oof_lightautoml.drop('id', axis=1)
oof_lightautoml.columns = lightautoml_columns

sub_lightautoml = sub_lightautoml.drop('id', axis=1)
sub_lightautoml.columns = list(lightautoml_columns)

# Drop any copies left by an earlier run of this cell, so re-running it does not
# append the same nine columns a second time.
X = pd.concat([X.drop(columns=lightautoml_columns, errors='ignore'), oof_lightautoml], axis=1)
X_test = pd.concat([X_test.drop(columns=lightautoml_columns, errors='ignore'), sub_lightautoml], axis=1)

# Baseline

In [None]:
#lgb_oof = np.zeros((X.shape[0], 9))
#lgb_pred = 0
#
#for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
#    print(f"➜ FOLD :{fold}")
#    X_train = X.iloc[train_idx]
#    y_train = y.iloc[train_idx]
#    X_val = X.iloc[val_idx]
#    y_val = y.iloc[val_idx]
#
#    start = time.time()
#    
#    model = LGBMClassifier(**fixed_params)
#    
#    model.fit(X_train, y_train,
#              eval_set=(X_val, y_val),
#              early_stopping_rounds=ESR,
#              verbose=0,
#              eval_metric="multi_logloss" 
#             )
#    
#    lgb_oof[val_idx,:] = model.predict_proba(X_val)
#    lgb_pred += model.predict_proba(X_test) / K
#    
#    lgb_logloss = log_loss(y_val, lgb_oof[val_idx])
#    print(f"score: {lgb_logloss:.6f} ")
#    print(f"elapsed: {time.time()-start:.2f} sec\n")
#    
#    del model
#
#lgb_logloss = log_loss(y, lgb_oof)
#print(f"Final logloss score: {lgb_logloss} ✔️ ")

# Optuna

In [None]:
def objective(trial):
    """Optuna objective: cross-validated multi-class log-loss of a LightGBM model.

    Samples one hyper-parameter set from ``trial``, merges it with the
    module-level ``fixed_params``, trains an ``LGBMClassifier`` on every split
    produced by the module-level ``kf`` (early stopping on the held-out fold),
    and scores the stitched out-of-fold probabilities against ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to receive intermediate
        validation scores for pruning.

    Returns
    -------
    float
        Log-loss of the out-of-fold predictions over all rows of ``X``.

    Notes
    -----
    The pruning callback is attached to the first fold only. Optuna stores
    only the first value reported for a given step, so re-attaching it on
    later folds would report at already-used steps and base prune decisions
    on a mix of folds. The callback may abort the trial early (presumably by
    raising ``optuna.TrialPruned`` -- confirm against the Optuna version in use).
    """

    max_depth = trial.suggest_int('max_depth', 3, 12)
    # Upper bound for num_leaves, tied to the sampled depth.
    max_num_leaves = (2 ** max_depth) - 1

    hyperparams = {
        #'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': max_depth,
        'num_leaves': trial.suggest_int('num_leaves', 7, max_num_leaves),
        'min_split_gain' : trial.suggest_float('min_split_gain', 1e-8, 5, log=True), # gamma
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True), # l1
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True), # l2
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8), # feature_fraction
        'subsample': trial.suggest_float('subsample', 0.1, 0.8), # bagging_fraction
        'subsample_freq': trial.suggest_int("subsample_freq", 1, 7), # bagging_freq
        'min_child_samples': trial.suggest_int("min_child_samples", 5, 100), # min_data_in_leaf
        'cat_smooth': trial.suggest_float('cat_smooth', 10, 50),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20)
        #'extra_trees': trial.suggest_categorical("extra_trees", [True, False])
    }

    # Dict-unpacking merge: a tuned key that also exists in fixed_params overrides
    # it, whereas dict(**a, **b) raises TypeError on any shared key.
    params = {**fixed_params, **hyperparams}

    # One probability column per distinct class label in y.
    n_classes = y.nunique()
    lgb_oof = np.zeros((X.shape[0], n_classes))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):

        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        # Report intermediate scores (and allow pruning) on the first fold only;
        # see the Notes section of the docstring.
        fit_callbacks = None
        if fold == 0:
            fit_callbacks = [LightGBMPruningCallback(trial, 'multi_logloss', valid_name="valid_0")]

        model = LGBMClassifier(**params)

        model.fit(X_train, y_train,
                  eval_set=(X_val, y_val),
                  early_stopping_rounds=ESR,
                  verbose=0,
                  eval_metric="multi_logloss",
                  callbacks=fit_callbacks
                 )

        lgb_oof[val_idx,:] = model.predict_proba(X_val)

    return log_loss(y, lgb_oof)

In [None]:
# Wall-clock budget for the search, in seconds (7.5 hours).
TUNING_TIMEOUT_SECONDS = 60*60*7.5

# Seed the sampler with the notebook-wide SEED so the parameter suggestions do not
# depend on an unseeded RNG. The run is still bounded by wall-clock time rather
# than a trial count, so the number of completed trials can differ between runs.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=SEED),
                            pruner=optuna.pruners.HyperbandPruner(),
                            #pruner=optuna.pruners.HyperbandPruner(min_resource=100,  reduction_factor=4),
                            #sampler=optuna.samplers.TPESampler(n_startup_trials=50, multivariate=True, seed=123)
                           )

study.optimize(objective,
               timeout=TUNING_TIMEOUT_SECONDS,
               #timeout=60*3,
               n_trials=None,
               gc_after_trial=False)

In [None]:
# Best (lowest) objective value found by the study, i.e. the out-of-fold log-loss.
study.best_value

In [None]:
# Plot the study's optimization history.
plot_optimization_history(study)

In [None]:
# Parallel-coordinate plot of the study's trials.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Plot Optuna's estimate of each hyper-parameter's importance for this study.
plot_param_importances(study)

In [None]:
# Merge the fixed settings with the best trial's tuned values. Dict unpacking lets a
# tuned key override a fixed one of the same name; dict(**a, **b) would instead raise
# TypeError as soon as both dicts share a key (e.g. if learning_rate is tuned again).
final_params = {**fixed_params, **study.best_params}

# without catfeatures
#final_params = {'random_state': 314,
# 'n_estimators': 100000,
# 'metric': 'multi_logloss',
# 'max_depth': 7,
# 'learning_rate': 0.03320472235360897,
# 'num_leaves': 9,
# 'min_split_gain': 0.0001542021952793842,
# 'reg_alpha': 6.740022575250581,
# 'reg_lambda': 1.2597688856185852e-08,
# 'colsample_bytree': 0.4631454737475204,
# 'subsample': 0.29923683243864324,
# 'subsample_freq': 6,
# 'min_child_samples': 77,
# 'cat_smooth': 26.06212029378543,
# 'cat_l2': 16,
# 'extra_trees': False}

# with catfeatures
#final_params = {'random_state': 314,
# 'n_estimators': 100000,
# 'learning_rate': 0.01,
# 'metric': 'multi_logloss',
# 'max_depth': 12,
# 'num_leaves': 463,
# 'min_split_gain': 1.3140865283434602,
# 'reg_alpha': 0.032315272094481116,
# 'reg_lambda': 3.117239474534205e-05,
# 'colsample_bytree': 0.5280756068150421,
# 'subsample': 0.14886257378346607,
# 'subsample_freq': 1,
# 'min_child_samples': 12,
# 'cat_smooth': 43.10632572020626,
# 'cat_l2': 9}

In [None]:
# Display the parameter set that the final model below is trained with.
final_params

# Final Model

In [None]:
# Train one model per fold with the tuned parameters, collecting
#   lgb_oof  - out-of-fold class probabilities for every training row
#   lgb_pred - test-set class probabilities averaged over the K fold models
lgb_oof = np.zeros((X.shape[0], 9))
lgb_pred = 0

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Timer covers model construction, fitting and both predictions.
    start = time.time()

    model = LGBMClassifier(**final_params)
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        eval_metric="multi_logloss",
        early_stopping_rounds=ESR,
        verbose=0,
    )

    # Held-out probabilities fill this fold's rows of the OOF matrix.
    val_proba = model.predict_proba(X_val)
    lgb_oof[val_idx, :] = val_proba
    # Each fold contributes 1/K of the final test prediction.
    lgb_pred += model.predict_proba(X_test) / K

    lgb_logloss = log_loss(y_val, val_proba)
    print(f"score: {lgb_logloss:.6f} ")
    print(f"elapsed: {time.time()-start:.2f} sec\n")

    del model

# Overall score across all out-of-fold predictions.
lgb_logloss = log_loss(y, lgb_oof)
print(f"Final logloss score: {lgb_logloss} ✔️ ")

# Submission

In [None]:
# Overwrite every column after the first with the fold-averaged test
# probabilities, then write the submission file.
probability_columns = slice(1, None)
sub.iloc[:, probability_columns] = lgb_pred
sub.to_csv("sub_lgb_optuned.csv", index=False)

In [None]:
# Save the out-of-fold probabilities next to the training ids, one column per class.
oof_class_names = [f"Class_{i}" for i in range(1, 10)]
oof_probabilities = pd.DataFrame(lgb_oof, columns=oof_class_names)
oof_lgb = pd.concat([train.id, oof_probabilities], axis=1)
oof_lgb.to_csv("oof_lgb_optuned.csv", index=False)