## Tabular Playground Series June 2021

<img src="https://i.imgur.com/uHVJtv0.png">
<img src="https://lightgbm.readthedocs.io/en/latest/_images/LightGBM_logo_black_text.svg">

<br><br>

### Notebook Contents:

<div id="toc_container" style="background: #f9f9f9; border: 1px solid #aaa; display: table; font-size: 95%;
                               margin-bottom: 1em; padding: 20px; width: auto;">
<p class="toc_title" style="font-weight: 700; text-align: center">Notebook Contents</p>
<ul class="toc_list">
  <li><a href="#loading">0. Imports, Data Loading and Preprocessing</a></li>
  <li><a href="#eda">1. Exploratory Data Analysis</a></li>
  <li><a href="#optuna">2. Optuna Hyperparameter Optimization</a>
      <br>
      <ul>
    <li><a href="#optuna_objective">2.0 Define Objective</a></li>
    <li><a href="#optuna_study">2.1 Start Optimization</a></li>
    <li><a href="#optuna_plots">2.2 Check Optimization Plots</a></li>
  </ul>
</li>
<li><a href="#submission">3. Submission</a></li>
</ul>
</div>

##### Props

Props to [corochann](https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization): I believe the linked notebook is the best Optuna tutorial you can find.

--- 

<h5> Disclaimer </h5>

Code mainly taken from my [TPS May notebook](https://www.kaggle.com/tomwarrens/tps-may-2021-lightgbm-optuna).

<a id="loading"></a>

##### 0. Imports, Data Loading and Preprocessing

In [None]:
import joblib
import torch
device = 'gpu' if torch.cuda.is_available() else 'cpu'
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import QuantileTransformer, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest
import warnings
warnings.filterwarnings('ignore')

import optuna
import tqdm
import gc
import os
root_path = '/kaggle/input/tabular-playground-series-jun-2021/'

def robust_pow(num_base, num_pow):
    """Raise ``num_base`` to ``num_pow`` while preserving the sign of the base.

    numpy does not permit negative numbers raised to a fractional power, so
    the power is applied to the magnitude and the sign is multiplied back in.
    """
    magnitude = np.abs(num_base) ** num_pow
    return np.sign(num_base) * magnitude

def focal_binary_object(pred, dtrain):
    """Return the gradient and hessian of a focal-style binary objective.

    Parameters
    ----------
    pred : raw scores; they are passed through a sigmoid below.
    dtrain : object exposing ``get_label()`` (presumably a LightGBM/XGBoost
        dataset — confirm against the caller).

    Returns
    -------
    (grad, hess) : first- and second-order terms computed element-wise.

    NOTE(review): the ``(-1) ** label`` terms only behave as a sign switch for
    labels in {0, 1}; this function is not called in the visible part of the
    notebook.
    """
    # exponent used throughout the expressions below
    # (presumably the focal-loss focusing parameter gamma — confirm)
    gamma_indct = 2.5
    # retrieve data from dtrain matrix
    label = dtrain.get_label()
    # compute the prediction with sigmoid
    sigmoid_pred = 1.0 / (1.0 + np.exp(-pred))
    # gradient
    # complex gradient with different parts
    # g1: derivative of the sigmoid, p * (1 - p)
    g1 = sigmoid_pred * (1 - sigmoid_pred)
    # g2: p when label == 0, 1 - p when label == 1
    g2 = label + ((-1) ** label) * sigmoid_pred
    # g3: p - 1 when label == 0, p when label == 1
    g3 = sigmoid_pred + label - 1
    # g4: 1 - p when label == 0, p when label == 1
    g4 = 1 - label - ((-1) ** label) * sigmoid_pred
    # g5 is the same expression as g2
    g5 = label + ((-1) ** label) * sigmoid_pred
    # combine the gradient
    # (1e-9 is added inside the log to avoid log(0))
    grad = gamma_indct * g3 * robust_pow(g2, gamma_indct) * np.log(g4 + 1e-9) + \
           ((-1) ** label) * robust_pow(g5, (gamma_indct + 1))
    # combine the gradient parts to get hessian components
    hess_1 = robust_pow(g2, gamma_indct) + \
             gamma_indct * ((-1) ** label) * g3 * robust_pow(g2, (gamma_indct - 1))
    # NOTE(review): unlike the log terms, this division by g4 has no epsilon guard
    hess_2 = ((-1) ** label) * g3 * robust_pow(g2, gamma_indct) / g4
    # get the final 2nd order derivative
    hess = ((hess_1 * np.log(g4 + 1e-9) - hess_2) * gamma_indct +
            (gamma_indct + 1) * robust_pow(g5, gamma_indct)) * g1

    return grad, hess

In [None]:
# Read the competition CSVs (train, test, sample submission) from root_path.
train, test, sample_submission = (
    pd.read_csv(os.path.join(root_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Label mapping: each target name is mapped to (its trailing number - 1) so the
# classes become zero-based integers.
unique_targets = train['target'].unique().tolist()
# Parse the whole run of trailing digits instead of only the last character,
# so a class id with more than one digit does not collide with another class.
label_mapping = {
    name: int(name[len(name.rstrip('0123456789')):]) - 1
    for name in unique_targets
}

train['target'] = train['target'].map(label_mapping)
# Stack train and test so that any encoding is fitted on both at once;
# train_len remembers where to split them again.
dataset = pd.concat([train, test], axis=0, ignore_index=True)
train_len = len(train)

features = dataset.drop(['id', 'target'], axis=1).columns.tolist()
# Columns with fewer than 10 distinct values are treated as categorical.
n_unique = dataset[features].nunique()
categorical_feature_columns = n_unique[n_unique < 10].index.tolist()

label = LabelEncoder()
# Not needed: in other challenges there were proper categorical string cols, so I did this and kept it here

for column in categorical_feature_columns:
    dataset[column] = label.fit_transform(dataset[column])

categorical_features = list(range(len(categorical_feature_columns)))

# Split the combined frame back into its train and test parts.
train_preprocessed = dataset[:train_len]
test_preprocessed = dataset[train_len:]

assert train_preprocessed.shape[1] == test_preprocessed.shape[1]

# Positions of the categorical columns inside `features`.
cat_indices = [features.index(i) for i in categorical_feature_columns]

<a id = "eda"></a>
<h4> Exploratory Data Analysis </h4>

In [None]:
# Per-class row counts, sorted by class id. The index and the values are named
# explicitly before reset_index so the columns are always 'class' and 'count',
# independent of how the installed pandas names the value_counts() output.
train_target_counts = (train['target'].value_counts()
                       .rename_axis('class')
                       .rename('count')
                       .reset_index()
                       .sort_values('class', ignore_index=True))

# NOTE(review): colors/levels/cmap_plot/norm are not used by the plotting code
# in this cell; kept in case another cell relies on them.
colors = sns.color_palette('rocket', 9)
levels = np.linspace(-1, 1, 9)
cmap_plot, norm = matplotlib.colors.from_levels_and_colors(levels, colors, extend="max")

# Left: bar chart of the class counts. Right: the same numbers as a table.
fig, ax = plt.subplots(1, 2, figsize=(18, 6), gridspec_kw={'width_ratios': [2, 1]})

sns.barplot(data=train_target_counts, x='class', y='count', palette='rocket', ax=ax[0])
plt.style.use('fivethirtyeight')
plt.setp(ax[0].patches, linewidth=15)

# Annotate every bar with its count.
for index, row in train_target_counts.iterrows():
    value = row['count']
    ax[0].text(index, value + 1, value, color='black', ha="center",
               fontsize=13, fontweight='bold')

ax[0].grid(True)
# No legend call: the bar plot has no labelled artists, so a legend would be
# empty and only trigger a warning.
ax[0].set_title('Label Balance', fontsize=18, fontweight='bold')
ax[0].tick_params(axis='both', which='major', labelsize=14)
ax[0].tick_params(axis='both', which='minor', labelsize=14)
ax[0].set_xlabel('')
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=35, fontsize=13, color='black')
# The y axis shows the number of rows per class.
ax[0].set_ylabel('count', fontsize=18, color='black')
plt.subplots_adjust(hspace=0.3)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Helvetica'

bbox = [-0.2, 0, 1.2, 0.9]
ax[1].axis('off')
ax[1].title.set_text('')
# One light header colour per table column.
ccolors = plt.cm.BuPu(np.full(len(train_target_counts.columns), 0.1))

mpl_table = ax[1].table(cellText=train_target_counts.values, bbox=bbox,
                        colLabels=train_target_counts.columns, colColours=ccolors)
mpl_table.auto_set_font_size(False)
mpl_table.auto_set_column_width(col=list(range(len(train_target_counts.columns))))
mpl_table.set_fontsize(18)

In [None]:
# Number of distinct values of every feature in the train set,
# as a two-column frame: 'feature', 'distinct_values'.
unique_values = (train.drop(columns=['id', 'target'])
                 .nunique()
                 .rename_axis('feature')
                 .rename('distinct_values')
                 .reset_index())

# Summary statistics shown as text on the plot.
percentiles_asked = [0.25, 0.5, 0.75, 0.9]
percentiles = unique_values['distinct_values'].quantile(percentiles_asked).tolist()
mean = np.round(unique_values['distinct_values'].mean(), 2)
median = np.round(unique_values['distinct_values'].median(), 2)
st_dev = np.round(unique_values['distinct_values'].std(), 2)

plt.style.use('ggplot')
fig, ax = plt.subplots(1, 1, figsize=(16, 6))

# Density histogram with a KDE curve drawn on top of it.
sns.histplot(data=unique_values, x='distinct_values', ax=ax, kde=False, bins=20,
             stat='density', alpha=0.5, fill=True, linewidth=3,
             edgecolor='black', color='red')
sns.kdeplot(data=unique_values, x='distinct_values', ax=ax, alpha=0.01, fill=True,
            linewidth=5, color='blue')

# Dotted vertical line plus a label for each requested percentile.
for asked, percentile in zip(percentiles_asked, percentiles):
    ax.axvline(percentile, alpha=0.35, ymin=0, ymax=1, linestyle=":", color='blue')
    ax.text(percentile - 0.16, 0.037, f"{asked}", size=12, alpha=1)

stat_lines = [(0.029, f"mean: {mean}"),
              (0.026, f"median: {median}"),
              (0.023, f"std deviation: {st_dev}")]
for y_pos, caption in stat_lines:
    ax.text(-10, y_pos, caption, size=12, alpha=1)

# https://stackoverflow.com/questions/49926147/how-to-modify-edge-color-of-violinplot-using-seaborn/55131881
# (to change the colour of the outer line)

ax.set_ylabel('Density', fontsize=15)
ax.set_xlabel('distinct_values', fontsize=15)
ax.set_title('hist-kde plot', fontsize=16)
ax.set_ylim(0, 0.04)

for which in ('major', 'minor'):
    ax.tick_params(axis='both', which=which, labelsize=14)

fig.suptitle('Distribution of Number of distinct values per feature (train)',
             fontsize=20, fontweight='bold')


In [None]:
# Number of distinct values of every feature in the test set,
# as a two-column frame: 'feature', 'distinct_values'.
unique_values = (test.drop(columns=['id'])
                 .nunique()
                 .rename_axis('feature')
                 .rename('distinct_values')
                 .reset_index())

# Summary statistics shown as text on the plot.
percentiles_asked = [0.25, 0.5, 0.75, 0.9]
percentiles = unique_values['distinct_values'].quantile(percentiles_asked).tolist()
mean = np.round(unique_values['distinct_values'].mean(), 2)
median = np.round(unique_values['distinct_values'].median(), 2)
st_dev = np.round(unique_values['distinct_values'].std(), 2)

plt.style.use('ggplot')
fig, ax = plt.subplots(1, 1, figsize=(16, 6))

# Density histogram with a KDE curve drawn on top of it.
sns.histplot(data=unique_values, x='distinct_values', ax=ax, kde=False, bins=20,
             stat='density', alpha=0.5, fill=True, linewidth=3,
             edgecolor='black', color='red')
sns.kdeplot(data=unique_values, x='distinct_values', ax=ax, alpha=0.01, fill=True,
            linewidth=5, color='blue')

# Dotted vertical line plus a label for each requested percentile.
for asked, percentile in zip(percentiles_asked, percentiles):
    ax.axvline(percentile, alpha=0.35, ymin=0, ymax=1, linestyle=":", color='blue')
    ax.text(percentile - 0.16, 0.037, f"{asked}", size=12, alpha=1)

stat_lines = [(0.029, f"mean: {mean}"),
              (0.026, f"median: {median}"),
              (0.023, f"std deviation: {st_dev}")]
for y_pos, caption in stat_lines:
    ax.text(-10, y_pos, caption, size=12, alpha=1)

# https://stackoverflow.com/questions/49926147/how-to-modify-edge-color-of-violinplot-using-seaborn/55131881
# (to change the colour of the outer line)

ax.set_ylabel('Density', fontsize=15)
ax.set_xlabel('distinct_values', fontsize=15)
ax.set_title('hist-kde plot', fontsize=16)
ax.set_ylim(0, 0.04)

for which in ('major', 'minor'):
    ax.tick_params(axis='both', which=which, labelsize=14)

fig.suptitle('Distribution of Number of distinct values per feature (test)',
             fontsize=20, fontweight='bold')


In [None]:
# The raw frames are no longer referenced by name after this point;
# drop the names and ask the garbage collector to run.
del train
del test
gc.collect()

<a id="optuna"></a>

### Optuna

See the [Optuna tutorial](https://optuna.readthedocs.io/en/stable/tutorial/) for an introduction to the library.

See the [LGBMClassifier documentation](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html) for the list of LightGBM classifier hyperparameters.

In [None]:
# Set OPTUNA_OPTIMIZATION to False if you want to skip the hyperparameter search.
OPTUNA_OPTIMIZATION = True
N_SPLITS = 5             # number of folds for validation
N_TRIALS = 3             # number of trials to find best hyperparameters
TIME = 5 * 60 * 60       # time to run optimization, alternative to N_TRIALS
FOLD_RANDOM_SEED = 42
REPEATED_FOLD = True     # use RepeatedStratifiedKFold instead of StratifiedKFold

# Parameters that are never searched; they are merged into every parameter set.
FIXED_PARAMS = dict(
    random_state=42,
    num_classes=len(unique_targets),
    categorical_feature=cat_indices,
    verbosity=-1,
    n_jobs=-1,
)

# Two parameter sets that are enqueued as starting trials for the study.
best_params_v5 = dict(
    objective="multiclass",
    boosting_type="gbdt",
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=22,
    max_depth=10,
    reg_alpha=15.457800377673841,
    reg_lambda=8.958320766791369,
    colsample_bytree=0.5508279412251145,
    subsample=0.100790119987575,
    cat_smooth=32.20219126721756,
)

best_params_v8 = dict(
    objective="multiclass",
    boosting_type="gbdt",
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=40,
    max_depth=13,
    reg_alpha=16.79089587101574,
    reg_lambda=13.510484215533271,
    colsample_bytree=0.3075172117556715,
    subsample=0.08854097263254436,
    cat_smooth=37.0724124612169,
)

<a id = "optuna_objective"></a>
<h5> Define Objective </h5>

1. Possibility to use [RepeatedStratifiedKFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedStratifiedKFold.html) for cross validation

2. Define starting parameters for Optuna (read [here](https://github.com/optuna/optuna/issues/417)), using `enqueue_trial`

3. Optimizing more than one loss: I've included [focal loss](https://paperswithcode.com/method/focal-loss), which in other contexts has done well with class imbalance.

4. Saving your Optuna study

_Objective_

In [None]:
# Cross-validation splitter used by the objective below and by the final fit.
if REPEATED_FOLD:
    skfold = RepeatedStratifiedKFold(N_SPLITS, n_repeats=2, random_state=FOLD_RANDOM_SEED)
else:
    skfold = StratifiedKFold(N_SPLITS, shuffle=True, random_state=FOLD_RANDOM_SEED)


def objective(trial, cv=skfold):
    """Optuna objective: average validation log loss of LightGBM over the folds.

    Parameters
    ----------
    trial : the Optuna trial used to sample the hyperparameters.
    cv : splitter exposing ``split(X, y)``; defaults to the module-level ``skfold``.

    Returns
    -------
    The mean of the per-fold ``log_loss`` values on the validation parts.
    """
    # Searched hyperparameters; the suggest calls are kept in this order.
    search_space = {
        "objective": trial.suggest_categorical('objective', ['multiclass', 'multiclassova']),
        "boosting_type": trial.suggest_categorical('boosting_type', ['gbdt', 'goss', 'dart']),
        "n_estimators": trial.suggest_categorical('n_estimators', [100]),
        "learning_rate": trial.suggest_categorical('learning_rate', [0.001, 0.005, 0.01, 0.05, 0.1]),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'max_depth': trial.suggest_int('max_depth', -1, 16),
        'reg_alpha': trial.suggest_float('reg_alpha', 1E-16, 25),
        'reg_lambda': trial.suggest_float('reg_lambda', 1E-16, 25),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1E-16, 0.7),
        'subsample': trial.suggest_float('subsample', 1E-16, 0.3),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 50.0),
    }
    # Fixed parameters are applied last, so they win on any shared key.
    params = {**search_space, **FIXED_PARAMS}

    # One callback instance is shared by all folds; it watches 'multi_logloss'
    # on the dataset named 'valid_1' (presumably the second entry of valid_sets).
    pruning_callback = optuna.integration.LightGBMPruningCallback(
        trial, 'multi_logloss', valid_name='valid_1')

    all_X = train_preprocessed[features]
    all_y = train_preprocessed['target']
    fold_losses = []

    for kfold, (train_idx, val_idx) in tqdm.tqdm(enumerate(cv.split(all_X.values, all_y.values))):
        X_valid = all_X.loc[val_idx]
        y_valid = all_y.loc[val_idx]

        train_set = lgb.Dataset(all_X.loc[train_idx], label=all_y.loc[train_idx])
        valid_set = lgb.Dataset(X_valid, label=y_valid)

        booster = lgb.train(params,
                            train_set=train_set,
                            valid_sets=[train_set, valid_set],
                            verbose_eval=0,
                            early_stopping_rounds=100,
                            callbacks=[pruning_callback])

        fold_losses.append(log_loss(y_valid, booster.predict(X_valid)))

    return np.average(fold_losses)

<a id = "optuna_study"></a>
<h6> Start Optimization </h6>

In [None]:
# Either run the Optuna search and build `best_params` from its best trial,
# or fall back to a hard-coded parameter set.
if OPTUNA_OPTIMIZATION:
    study = optuna.create_study(study_name='lgbm_parameter_opt', direction='minimize',
                                pruner=optuna.pruners.MedianPruner(n_warmup_steps=25))

    # Start the search from the two known parameter sets.
    study.enqueue_trial(best_params_v5)
    study.enqueue_trial(best_params_v8)
    # The search is bounded by the TIME budget, not by a number of trials.
    study.optimize(objective, timeout=TIME, show_progress_bar=True)

    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Searched values override the fixed ones on shared keys; the final model
    # is then given more boosting rounds than were used during the search.
    best_params = {**FIXED_PARAMS, **trial.params}
    best_params['n_estimators'] = 1000

else:
    # `trial` is kept as a plain dict because the submission cell unpacks it.
    trial = {
        "random_state": 42,
        # "auc" is a binary metric and is rejected by LightGBM together with a
        # multiclass objective; use the multiclass log loss instead.
        "metric": "multi_logloss",
        "categorical_feature": cat_indices,
        "verbosity": -1,
        "n_estimators": 20000,
        'learning_rate': 0.1,
        'num_leaves': 98,
        'max_depth': 24,
        'reg_alpha': 0.6013328384502188,
        'reg_lambda': 8.864402629739141,
        'colsample_bytree': 0.8295666531935949,
        'subsample': 0.5621932264483348,
        'cat_smooth': 31.788282544015413,
    }
    best_params = trial

In [None]:
# If you wish to save the study: it only exists when the optimization was run,
# so skip the dump otherwise instead of failing with a NameError.
import joblib
if OPTUNA_OPTIMIZATION:
    joblib.dump(study, 'study.pkl')

<a id = "optuna_plots"></a>
<h6> Check Optimization plots </h6>

In [None]:
# Show the intermediate values recorded for the study's trials.
if OPTUNA_OPTIMIZATION:
    intermediate_values_fig = optuna.visualization.plot_intermediate_values(study)
    display(intermediate_values_fig)

In [None]:
# NOTE(review): display() is called without any object here, so nothing is
# passed to it; this looks like a placeholder for a removed plot — confirm.
if OPTUNA_OPTIMIZATION:
    display()

In [None]:
# Show the optimization history of the study.
if OPTUNA_OPTIMIZATION:
    history_fig = optuna.visualization.plot_optimization_history(
        study, target_name='Average Validation LogLoss')
    display(history_fig)

In [None]:
# Show the slice plot of the study.
if OPTUNA_OPTIMIZATION:
    slice_fig = optuna.visualization.plot_slice(
        study, target_name='Average Validation LogLoss')
    display(slice_fig)

In [None]:
# Show the parallel-coordinate plot of the study.
if OPTUNA_OPTIMIZATION:
    parallel_fig = optuna.visualization.plot_parallel_coordinate(
        study, target_name='Average Validation LogLoss')
    display(parallel_fig)

In [None]:
# Show all trials of the study as a DataFrame.
if OPTUNA_OPTIMIZATION:
    trials_df = study.trials_dataframe()
    display(trials_df)

<a id = "submission"></a>

### Submission

In [None]:
# Build the final classifier from the tuned parameters when the search ran,
# otherwise from the hard-coded fallback dict stored in `trial`.
final_model_kwargs = best_params if OPTUNA_OPTIMIZATION else trial
final_model = LGBMClassifier(**final_model_kwargs)

In [None]:
# Refit the final model on every fold, score it on the held-out part and
# collect its class probabilities for the test set.
test_preds, accuracies, loglosses = [], [], []

fold_splits = skfold.split(train_preprocessed[features].values,
                           train_preprocessed['target'].values)

for kfold, (train_idx, val_idx) in enumerate(fold_splits):
    final_model.fit(train_preprocessed.loc[train_idx, features],
                    train_preprocessed.loc[train_idx, 'target'])
    print(f'Fitted {type(final_model).__name__}')

    val_features = train_preprocessed.loc[val_idx, features]
    val_true = train_preprocessed.loc[val_idx, 'target'].values

    preds = final_model.predict(val_features)
    probs = final_model.predict_proba(val_features)

    accuracy = accuracy_score(val_true, preds)
    accuracies.append(accuracy)
    print(f'Fold: {kfold}\t Validation Accuracy: {accuracy}\n')

    logloss = log_loss(val_true, probs)
    loglosses.append(logloss)
    print(f'Fold: {kfold}\t Validation logloss: {logloss}\n')

    test_preds.append(final_model.predict_proba(test_preprocessed[features]))

print(f"Best Parameters mean Accuracy: {np.mean(accuracies)}")
print(f"Best Parameters mean logloss: {np.mean(loglosses)}")

In [None]:
# Average the per-fold test probabilities element-wise across folds.
test_predictions = np.stack(test_preds).mean(axis=0)
assert len(test_predictions) == len(test_preprocessed)

In [None]:
# One probability column per original class label (sorted), followed by the
# 'id' column taken from the sample submission.
class_columns = sorted(unique_targets)
predictions_df = (pd.DataFrame(test_predictions, columns=class_columns)
                  .assign(id=sample_submission['id']))

In [None]:
# Write the submission file without the DataFrame index.
predictions_df.to_csv(path_or_buf="submission.csv", index=False)