Hello everyone! Today I would like to introduce you to a great open-source hyperparameter optimization framework called [OPTUNA](https://github.com/optuna/optuna).
<center><img src="https://raw.githubusercontent.com/optuna/optuna/master/docs/image/optuna-logo.png"></center>

### 1. Load libs

In [None]:
import os
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [None]:
# Root directory of the competition data; the label file and the
# `train/` and `test/` segment folders are resolved relative to it.
PATH = "/kaggle/input/predict-volcanic-eruptions-ingv-oe"

In [None]:
# Load the label table from `train.csv` (it is merged onto the features by
# `segment_id` further down) and preview its first rows.
y = pd.read_csv(os.path.join(PATH, 'train.csv'))
y.head()

### 2. Load train and simple aggregation

In [None]:
# Build one row of summary statistics per training segment file.
# Rows are collected in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0 and, where it still exists,
# copies the whole accumulated frame on every call.
segment_rows = []

for s_id in tqdm(os.listdir(os.path.join(PATH, 'train'))):
    temp = pd.read_csv(os.path.join(PATH, 'train', s_id))
    # A constant group key puts every row in one group, so each statistic is
    # computed over the whole file and the result is a single row.
    temp = temp.groupby(lambda x: True).aggregate(['min','max','sum','mean','std','median'])
    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>".
    temp.columns = [f'{col1}_{col2}' for col1, col2 in temp.columns]
    # The file name without its extension is the numeric segment id.
    temp["segment_id"] = int(s_id.split('.')[0])
    segment_rows.append(temp.reset_index(drop=True))

# Keep the original behaviour (an empty frame) when the directory has no files;
# `pd.concat` would raise on an empty list.
train = pd.concat(segment_rows, ignore_index=True) if segment_rows else pd.DataFrame()

In [None]:
# Attach the label columns from `y` to the aggregated features by `segment_id`;
# the left join keeps every feature row.
train = train.merge(y, how='left', on='segment_id')

### 3. Let Optuna do its job

#### Thanks to [Toshihiko Yanase](https://stackoverflow.com/a/62164601) for the idea of implementing the objective as a class

#### 3.1 Define the objective class with a callback that keeps the best LightGBM models (5-fold CV)

In [None]:
class Objective(object):
    """Optuna objective that cross-validates LightGBM and keeps the best models.

    Calling the instance with a trial trains one LightGBM model per fold of
    ``kf`` using hyperparameters suggested by the trial, and returns the mean
    validation MAE over the folds. ``callback`` is meant to be passed to
    ``study.optimize(..., callbacks=[...])``; it copies the models of the
    current trial into ``best_models`` whenever that trial is the study's best.

    Args:
        kf: Splitter exposing ``split(X, y)`` that yields positional
            ``(train_idx, val_idx)`` index arrays.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target values, convertible to a one-dimensional NumPy array.
    """

    # Class-level defaults so both attributes can be read safely before any
    # trial has completed.
    best_models = None
    best_params = None

    def __init__(self, kf, X, y):
        self.kf = kf
        self.X = X
        self.y = y
        # Models trained during the most recent call; replaced on every trial.
        self.models = []

    def __call__(self, trial):
        """Train one model per fold and return the mean validation MAE."""
        mae_list = []
        param = {
            "objective": "l1",
            "metric": "mae",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        }

        # The splitter yields positional indices, so the target is indexed as a
        # plain array and the features with `.iloc`. Label-based `.loc` is only
        # correct when X happens to carry a default 0..n-1 index.
        target = np.asarray(self.y)

        self.models = []
        for train_idx, val_idx in self.kf.split(self.X, target):

            train_x, valid_x = self.X.iloc[train_idx], self.X.iloc[val_idx]
            train_y, valid_y = target[train_idx], target[val_idx]

            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)

            # Lets the study's pruner stop an unpromising trial based on the
            # validation "l1" metric reported during boosting.
            pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "l1")
            # NOTE(review): `verbose_eval` was dropped from `lgb.train` in
            # LightGBM 4.0; on newer versions replace it with
            # `lgb.log_evaluation(period=0)` in `callbacks` - confirm against
            # the installed version.
            gbm = lgb.train(
                param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback]
            )

            preds = gbm.predict(valid_x)
            mae_list.append(mean_absolute_error(valid_y, preds))
            self.models.append(gbm)

        return np.mean(mae_list)

    def callback(self, study, trial):
        """Remember the models and params of ``trial`` if it is the study's best."""
        try:
            best_trial = study.best_trial
        except ValueError:
            # Optuna raises ValueError while no trial has completed yet
            # (e.g. the first trials were pruned or failed); nothing to keep.
            return
        if best_trial == trial:
            self.best_models = self.models
            self.best_params = trial.params

#### 3.2 Main func for "study"

In [None]:
def get_best_models(df, n_trials):
    """Run an Optuna study on ``df`` and return the models of its best trial.

    ``df`` must contain the ``segment_id`` and ``time_to_eruption`` columns;
    every other column is used as a feature. The study minimises the value
    returned by ``Objective`` using a median pruner, prints a short summary
    of the best trial and returns ``objective.best_models``.
    """
    features = df.drop(['segment_id', 'time_to_eruption'], axis=1)
    target = df.time_to_eruption.values
    splitter = KFold(n_splits=5, shuffle=True, random_state=17)

    objective = Objective(splitter, features, target)

    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    study = optuna.create_study(direction="minimize", pruner=median_pruner)
    # The callback runs after each trial and snapshots the best trial's models.
    study.optimize(objective, n_trials=n_trials, callbacks=[objective.callback])

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    best = study.best_trial

    print("  Value: {}".format(best.value))

    print("  Params: ")
    best_params = best.params
    for name in best_params:
        print("    {}: {}".format(name, best_params[name]))

    return objective.best_models

#### 3.3 Run study with 10 trials

In [None]:
# Run the study for 10 trials and keep the per-fold models of the best trial.
models = get_best_models(train, n_trials=10)

### 4. Make a submission

In [None]:
# Predict every test segment with the average of the per-fold models.
# Rows are collected in a list and turned into a frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and, where it still exists,
# copies the whole accumulated frame on every call.
submission_rows = []

for s_id in tqdm(os.listdir(os.path.join(PATH, 'test'))):
    temp = pd.read_csv(os.path.join(PATH, 'test', s_id))
    # Same single-group aggregation and column naming as for the training
    # segments, so the feature columns line up with what the models saw.
    temp = temp.groupby(lambda x: True).aggregate(['min','max','sum','mean','std','median'])
    temp.columns = [f'{col1}_{col2}' for col1, col2 in temp.columns]
    temp = temp.reset_index(drop=True)
    # Each model contributes an equal share of the final prediction.
    preds = 0
    for model in models:
        preds += model.predict(temp) / len(models)
    # `temp` has exactly one row, hence the single prediction at index 0.
    submission_rows.append({"segment_id": s_id.split('.')[0], "time_to_eruption": preds[0]})

# Explicit columns keep the header stable even when there are no test files.
submission = pd.DataFrame(submission_rows, columns=["segment_id", "time_to_eruption"])

In [None]:
# Write the predictions to `submission.csv` without the row index column.
submission.to_csv('submission.csv', index=False)