<h3>LightGBM hyperparameter optimization with Optuna</h3>

Hyperparameter search notebook for: https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-cv


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
import optuna
import joblib

In [None]:
# Read the competition's train, test and sample-submission CSV files.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

In [None]:
def label_encoder(c):
    """Return the values of ``c`` encoded as integer class labels.

    A fresh ``LabelEncoder`` is fitted on ``c`` on every call, so the codes
    are only meaningful within that single column.
    """
    return LabelEncoder().fit_transform(c)

In [None]:
def preprocess(df):
    """Turn raw passenger rows into the feature frame used for training.

    Feature engineering performed (on a copy; the caller's frame is untouched):

    * ``Age``: missing values filled with the mean age of the row's ``Pclass``;
      ``BucketAge`` is ``Age // 35``.
    * ``Fare``: missing values filled with the overall mean fare;
      ``BucketFare`` is ``Fare // 2``.
    * ``SibSpParch``: ``SibSp + Parch``.
    * ``Cabin``: first character of the cabin string, ``'X'`` when missing.
    * ``Ticket``: first whitespace-separated token when the ticket has more
      than one token, otherwise ``'X'``.
    * ``Embarked``: missing values replaced by ``'X'``.
    * ``Name``: the text before the first comma.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column referenced above plus ``Sex`` and
        ``Survived`` (``Survived`` is selected as part of the numerical
        columns, so a frame without it raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the numerical columns (including
        ``Survived``), the label-encoded ``Name``/``Ticket`` columns and
        ``pd.get_dummies`` applied to the remaining categorical columns.
    """
    label_cols = ['Name', 'Ticket']
    # NOTE: pd.get_dummies only expands object/category/string columns by
    # default, so numeric columns in this list (BucketAge, and Pclass if it is
    # numeric in the input) pass through unchanged.
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked', 'BucketAge']
    numerical_cols = ['SibSp', 'Parch', 'SibSpParch', 'BucketFare', 'Survived']

    # Work on a copy so the frame passed in by the caller is not mutated.
    df = df.copy()

    # Impute Age with the mean age of the passenger's class, then bucket it.
    age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    df['Age'] = df['Age'].fillna(df['Pclass'].map(age_map['Age']))
    df['BucketAge'] = df['Age'] // 35

    # Impute Fare *before* bucketing, and bucket the frame's own Fare column.
    # (Bucketing a different, global frame would align on index labels and
    # hand rows the fares of unrelated passengers.)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['BucketFare'] = df['Fare'] // 2

    df['SibSpParch'] = df['SibSp'] + df['Parch']
    df['Cabin'] = df['Cabin'].fillna('X').map(lambda x: x[0].strip())

    def _ticket_prefix(ticket):
        # Keep the leading token only for multi-token tickets; everything
        # else collapses to the placeholder 'X'.
        tokens = str(ticket).split()
        return tokens[0] if len(tokens) > 1 else 'X'

    df['Ticket'] = df['Ticket'].fillna('X').map(_ticket_prefix)
    df['Embarked'] = df['Embarked'].fillna('X')
    df['Name'] = df['Name'].map(lambda x: x.split(',')[0])

    onehot_encoded_df = pd.get_dummies(df[onehot_cols])
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df, label_encoded_df, onehot_encoded_df], axis=1)

In [None]:
# Preprocess train and test rows together (train first) so both parts go
# through the same imputation and encoding in a single pass.
all_df = preprocess(
    df=pd.concat([train_df, test_df]),
)

In [None]:
# Split the jointly preprocessed frame back into its two parts: the first
# len(train_df) rows are the training rows, the remainder are the test rows.
X = all_df.iloc[:len(train_df)]
y = X.pop('Survived')  # target column; pop also removes it from X
X_ = all_df.iloc[len(train_df):].drop(columns=['Survived'])

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold accuracy of a LightGBM binary classifier.

    Hyperparameters are sampled from ``trial``; for each of the 5 ``KFold``
    splits of the module-level ``X`` / ``y`` a booster is trained with early
    stopping on the held-out fold and scored on that same fold.

    Note that early stopping monitors AUC (``'metric': 'auc'``) while the
    value returned to Optuna is accuracy, and that the fold used for early
    stopping is also the fold used for scoring.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the folds (probabilities are rounded with
        ``np.rint`` to obtain class labels).
    """
    param = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'metric': 'auc',
        'verbose': -1,
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 2000, step=50),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'max_bin': trial.suggest_int('max_bin', 10, 300, step=10),
    }

    folds = KFold(n_splits=5)
    accuracies = []
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print("Fold {}".format(fold_))
        # KFold yields positional indices, so select with .iloc on both the
        # features and the target (label-based y[...] only works by accident
        # when the index happens to be 0..n-1).
        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_valid, label=y_valid)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of lgb.train in LightGBM < 4.0; newer releases expect
        # callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)] instead.
        gbm = lgb.train(param, dtrain, valid_sets=[dval], num_boost_round=10000, early_stopping_rounds=100, verbose_eval=-1)
        preds = gbm.predict(X_valid)
        pred_labels = np.rint(preds)
        accuracies.append(accuracy_score(y_valid, pred_labels))
    return np.mean(accuracies)

In [None]:
def main(n_trials=10, study_path="tabular_apr.optuna"):
    """Run ``n_trials`` more Optuna trials and checkpoint the study to disk.

    If ``study_path`` exists, the saved study is loaded and the new trials
    are added to it; otherwise a new maximising study is created. The study
    is written back to ``study_path`` after optimisation finishes.

    Parameters
    ----------
    n_trials : int, default 10
        Number of trials to run in this call.
    study_path : str, default "tabular_apr.optuna"
        File used to persist the study between calls.

    Notes
    -----
    Only a missing checkpoint triggers creation of a new study. Any other
    load failure (e.g. a corrupt file) propagates, so an existing checkpoint
    is never silently replaced by an empty study.
    ``joblib.load`` unpickles the file: only load checkpoints you created.
    """
    try:
        study = joblib.load(study_path)
        print("Found saved Study! Loading...")
    except FileNotFoundError:
        # No checkpoint yet: start a fresh search.
        study = optuna.create_study(direction='maximize')

    study.optimize(objective, n_trials=n_trials)

    joblib.dump(study, study_path)

<h4>The study state is saved to disk at the end of each loop.</h4>
<h4>Set <b>loops</b> (how many times the study is saved) and <b>trials_x_loop</b> (trials run between saves); the total number of trials is loops × trials_x_loop.</h4>

In [None]:
# loops: number of main() calls (the study is saved at the end of each one).
# trials_x_loop: trials executed per call.
loops, trials_x_loop = 4, 50

for i in range(loops):
    main(n_trials=trials_x_loop)

In [None]:
# Reload the study checkpoint written by main() so the results can be
# inspected at module level.
study = joblib.load("tabular_apr.optuna")

In [None]:
# Best objective value found so far and the hyperparameters that produced it.
study.best_value, study.best_params 

In [None]:
# Show Optuna's hyperparameter-importance plot for the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Show Optuna's optimization-history plot for the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Show Optuna's parallel-coordinate plot of the sampled hyperparameters.
optuna.visualization.plot_parallel_coordinate(study)