# Problem definition

From description:

"The dataset is used for this competition is synthetic, but based on a real dataset and generated using a CTGAN. The original dataset deals with predicting the category on an eCommerce product given various attributes about the listing. Although the features are anonymized, they have properties relating to real-world features."


See notebooks using R:

1. [Finding the best pre-processing configuration and predictive models based on the original data](https://www.kaggle.com/gomes555/tps-may2021-r-eda-tidymodels-workflowsets/)
2. [Create DAE dataset and fit models in DAE data](https://www.kaggle.com/gomes555/tps-may2021-r-dae-keras) 
3. [Stacking all](https://www.kaggle.com/gomes555/tps-may2021-r-tidymodels-stacks/)

Notebooks using Python:

1. [LightGBM sequential tuning with Optuna Step-wise by LightGBM Tuner](https://www.kaggle.com/gomes555/tps-may2021-optuna-lightgbm-tuner)
2. **LightGBM tuning with Optuna TPE (Tree-structured Parzen Estimator)**
3. [LightGBM tuning one vs rest with Optuna Step-wise by LightGBM Tuner](https://www.kaggle.com/gomes555/tps-may2021-optuna-tuner-one-x-rest)
4. [LightGBM tuning pseudo label with Optuna Tuner](https://www.kaggle.com/gomes555/tps-may2021-lightgbm-pseudolabel/)
5. [Stacking All](https://www.kaggle.com/gomes555/tps-may2021-stacking)

All notebooks will be public and suggestions and criticism are very welcome!


<br>

<p align="right"><span style="color:firebrick">Don't forget to upvote if you liked the notebook! <i class="fas fa-hand-peace"></i></span> </p>

# Dependencies

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score

from lightgbm import LGBMClassifier

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.integration import LightGBMPruningCallback

from tqdm import tqdm

In [None]:
# Load the competition's training set, test set and sample submission.
train, test, sub = map(pd.read_csv, (
    '../input/tabular-playground-series-may-2021/train.csv',
    '../input/tabular-playground-series-may-2021/test.csv',
    '../input/tabular-playground-series-may-2021/sample_submission.csv',
))

# Prepare data

In [None]:
# Encode the string labels as integers: "Class_1" -> 1, ..., "Class_4" -> 4.
# np.select falls back to its default of 0 for any row matching no condition.
conditions = [
    train["target"] == label
    for label in ("Class_1", "Class_2", "Class_3", "Class_4")
]
choices = list(range(1, len(conditions) + 1))
train["target"] = np.select(conditions, choices)

In [None]:
# Separate the label from the features; the `id` column is not used as a feature.
y = train['target']
X = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

In [None]:
def kfold_prediction(X, y, X_test, k, hyperparams, fixed_paramns, early_stopping_rounds):
    """Fit a LightGBM classifier on each of ``k`` stratified folds and average the results.

    For every fold the model is fitted on the training part, early-stopped on
    the held-out part, and then used to predict both the held-out part (for
    the validation loss) and ``X_test`` (for the averaged test prediction).

    Parameters
    ----------
    X, y : training features and labels; both are indexed with ``.iloc``.
    X_test : features to predict on.
    k : number of stratified folds.
    hyperparams : dict of tuned ``LGBMClassifier`` keyword arguments.
    fixed_paramns : dict of ``LGBMClassifier`` keyword arguments kept constant.
        Its keys must not overlap with ``hyperparams``.
    early_stopping_rounds : forwarded to ``LGBMClassifier.fit``.

    Returns
    -------
    loss_val : mean of the per-fold validation log loss.
    yp_test : array of shape ``(len(X_test), n_classes)`` holding the
        fold-averaged class probabilities.

    NOTE(review): the ``early_stopping_rounds`` / ``verbose`` fit keywords were
    removed in LightGBM 4.0 -- confirm the installed version still accepts them.
    """
    # One probability column per distinct label instead of a hard-coded 4.
    n_classes = len(np.unique(y))
    yp_test = np.zeros([len(X_test), n_classes])
    loss_val = 0

    kf = StratifiedKFold(n_splits=k, random_state=42, shuffle=True)
    model = LGBMClassifier(**fixed_paramns, **hyperparams)

    # Size the bar from the `k` argument (not a global) and close it on exit.
    with tqdm(total=k) as pbar:
        for train_idx, val_idx in kf.split(X, y):
            X_train = X.iloc[train_idx]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx]
            y_val = y.iloc[val_idx]

            model.fit(X_train, y_train,
                      eval_set=(X_val, y_val),
                      early_stopping_rounds=early_stopping_rounds,
                      verbose=0,
                      eval_metric="multi_logloss"
                     )

            # Predict the test set once per fold and accumulate the fold average.
            yp_test += model.predict_proba(X_test) / k

            yp_val = model.predict_proba(X_val)
            # Pass the label set explicitly so the columns of `yp_val` are
            # matched to the classes the model was trained on.
            loss_val += log_loss(y_val, yp_val, labels=model.classes_) / k

            pbar.update(n=1)

    return loss_val, yp_test

# Baseline

In [None]:
# Settings held constant for every model in this notebook. `n_estimators` is
# only an upper bound: each fit is cut short by early stopping.
fixed_paramns = {
    'random_state': 314,
    'n_estimators': 100000, 
    'learning_rate': 0.02,
    #'boosting_type':'goss',
    'metric':'multi_logloss'
}

# Number of cross-validation folds, used by the baseline and by `objective`.
K = 8

# Baseline: no tuned hyperparameters, only the fixed settings above.
loss, y_pred = kfold_prediction(X, y, X_test, K, {}, fixed_paramns, early_stopping_rounds = 70)

print('\nvalidation loss:', loss)

In [None]:
def model_instance(hyperparams, fixedparams):
    """Build an ``LGBMClassifier`` from tuned and fixed keyword arguments.

    ``hyperparams`` is a dict whose ``'clf'`` entry holds the tuned classifier
    arguments; ``fixedparams`` holds the constant ones. The two sets of keys
    must not overlap.
    """
    tuned = hyperparams['clf']
    return LGBMClassifier(**tuned, **fixedparams)

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated log loss of one sampled configuration.

    Reads the module-level ``X``, ``y``, ``K`` and ``fixed_paramns``. A 10%
    hold-out is split off and passed to every fit as the early-stopping
    ``eval_set``; the remaining 90% is scored with ``K``-fold stratified
    cross-validation. ``cross_val_score`` reports ``neg_log_loss``, so the
    mean is negated to return a value to minimise.
    """
    # Depth is sampled first because it bounds the range offered for num_leaves.
    depth = trial.suggest_int('max_depth', 2, 12)
    leaves_cap = (2 ** depth) - 1

    # Left out of the search for now: learning_rate, max_bin,
    # min_data_in_leaf, min_data_in_bin and the early-stopping patience.
    clf_params = {
        'max_depth': depth,
        'num_leaves': trial.suggest_int('num_leaves', 2, leaves_cap),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 5, log=True),
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.04, 1.0),
        'subsample': trial.suggest_float('subsample', 0.04, 1.0),
        'subsample_freq': trial.suggest_int("subsample_freq", 1, 7),
        'min_child_samples': trial.suggest_int("min_child_samples", 5, 100),
    }

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.10, random_state=42
    )

    estimator = model_instance({'clf': clf_params}, fixed_paramns)
    folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)

    fold_scores = cross_val_score(
        estimator,
        X_train, y_train,
        cv=folds,
        fit_params={
            'eval_set': (X_val, y_val),
            'early_stopping_rounds': 70,
            'verbose': 0,
            'eval_metric': "multi_logloss",
            'callbacks': [LightGBMPruningCallback(trial, 'multi_logloss')],
        },
        n_jobs=-1,
        verbose=0,
        error_score='raise',
        scoring='neg_log_loss',
    )

    return -np.mean(fold_scores)

In [None]:
# The objective returns a log loss, so the study minimises it;
# pruning decisions are delegated to a HyperbandPruner.
study = optuna.create_study(
    pruner=optuna.pruners.HyperbandPruner(),
    direction='minimize',
)

In [None]:
%%time

# No cap on the number of trials: the search is bounded by the timeout only.
study.optimize(
    objective,
    n_trials=None,
    timeout=15 * 60,
    gc_after_trial=False,
)

In [None]:
# Lowest objective value (mean CV log loss) found by the study.
study.best_value

In [None]:
# Objective value of each trial over the course of the search.
plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters against the objective.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Estimated importance of each hyperparameter for the objective.
plot_param_importances(study)

In [None]:
# Hyperparameters of the best trial found in this run.
study.best_params

In [None]:
# Hyperparameters used for the final model. These values come from a longer
# tuning run; to reuse the result of the study above instead, set
# final_params['clf'] = dict(study.best_params).
final_params = {
    'clf': {
        'max_depth': 3,
        'num_leaves': 6,
        'min_split_gain': 0.17865452483871047,
        'reg_alpha': 9.540720621520459,
        'reg_lambda': 4.5781292529661375,
        'colsample_bytree': 0.0644950794287173,
        'subsample': 0.9314592865852914,
        'subsample_freq': 7,
        'min_child_samples': 57,
    },
}

In [None]:
# Re-run the K-fold procedure with the tuned hyperparameters, using the same
# fold count K as the baseline so the two validation losses are comparable.
loss, y_pred = kfold_prediction(
    X, y, X_test, K,
    final_params['clf'],
    fixed_paramns,
    early_stopping_rounds=70,
)

In [None]:
# Mean validation log loss across folds for the tuned model.
print('\nvalidation loss:', loss)

# Submission

In [None]:
# Assemble the submission: the test ids next to one probability column per class.
sub = pd.concat(
    [
        test['id'],
        pd.DataFrame(y_pred, columns=['Class_1', 'Class_2', 'Class_3', 'Class_4']),
    ],
    axis='columns',
)

sub.to_csv('lgbm_optuna_tpe.csv', index=False)