# Kaggle MoA - Hyperparameter Tuning Using Hyperopt

Competition: https://www.kaggle.com/c/lish-moa

This is a sample script to showcase the use of hyperopt for hyperparameter tuning of a neural network.

Please go through the documentation of hyperopt here: http://hyperopt.github.io/hyperopt/

Please make sure the following libraries are installed:


1. iterative-stratification (imported as `iterstrat`) - for multi-label stratified k-fold
2. hyperopt - for hyperparameter tuning of the NN
3. tensorflow-addons - for adding a weight normalization wrapper to the layers

In [None]:
!pip install hyperas
!pip install iterative-stratification
!pip install tensorflow-addons


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os
import sys
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import log_loss, make_scorer
from sklearn.decomposition import PCA
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from functools import partial
from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK
from hyperopt.pyll.base import scope
import hyperopt
import pickle
from tensorflow_addons.layers import WeightNormalization
import tensorflow as tf
import tensorflow_addons as tfa

# Dark theme for every matplotlib figure drawn in this notebook
plt.style.use('dark_background')


# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
        

In [None]:
# --------------------------------------------------
# book-keeping
# --------------------------------------------------
# Input CSV locations (Kaggle "lish-moa" competition data)
train_features_file = '/kaggle/input/lish-moa/train_features.csv'
train_target_file = '/kaggle/input/lish-moa/train_targets_scored.csv'
test_features_file = '/kaggle/input/lish-moa/test_features.csv'

# Fixed run configuration (these values are not part of the search space)
N_FOLDS = 5      # number of cross-validation folds
MAX_EVALS = 30   # number of hyperopt evaluations (passed to fmin as max_evals)

# Switches for the experiment
run_settings = {
    "b_remove_control": False,   # drop control rows from the training data
    "b_hyperopt": True,          # run the hyperopt search
    "b_run_submission": True,    # write submission.csv
}

# Base name of the pickled hyperopt results log
experiment_name = "experiment_name"


In [None]:
# ====================================================
# utils
# ====================================================

# ==========================
# [data pre-processing]
# ==========================


# Load
def load_dataset():
    """Read the three competition CSV files into DataFrames.

    The paths come from the module-level ``train_features_file``,
    ``train_target_file`` and ``test_features_file`` settings.

    Returns:
        Tuple ``(df_train_feat, df_train_target, df_test_feat)``.
    """
    csv_paths = (train_features_file, train_target_file, test_features_file)
    df_train_feat, df_train_target, df_test_feat = (
        pd.read_csv(csv_path) for csv_path in csv_paths
    )
    return df_train_feat, df_train_target, df_test_feat


# Shuffle
def shuffle_data(data, targets, random_state=None):
    """Shuffle feature rows and target rows together, keeping them aligned.

    The two frames are paired row-by-row by position, concatenated, shuffled
    as one frame and split again by their original column names.

    Args:
        data: Feature DataFrame; must contain a ``sig_id`` column.
        targets: Target DataFrame with the same number of rows as ``data``;
            must contain a ``sig_id`` column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (default) keeps the original
            unseeded behaviour.

    Returns:
        Tuple ``(data, targets)`` of shuffled frames with a fresh 0..n-1
        index. The ``sig_id`` column of the returned targets is taken from
        ``data``. The input frames are left untouched.
    """
    # Work on re-indexed copies so the caller's frames are not mutated
    data = data.reset_index(drop=True)
    targets = targets.reset_index(drop=True)

    # Remember which columns belong to which frame so they can be split again
    cols_data = data.columns
    cols_target = targets.columns

    # sig_id already comes with `data`; drop the duplicate before joining
    df_agg = pd.concat([data, targets.drop('sig_id', axis=1)], axis=1)

    # Shuffle all rows at once so features and targets stay paired
    df_agg = df_agg.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df_agg.loc[:, cols_data], df_agg.loc[:, cols_target]


def onehot(df, col):
    """Replace column ``col`` of ``df`` with one-hot indicator columns.

    The indicator columns are appended at the end and named ``<col>_<i>``,
    where ``i`` is the position of the category in the column order produced
    by ``pd.get_dummies``.

    Returns:
        A new DataFrame without ``col`` and with the indicator columns added.
    """
    indicators = pd.get_dummies(df[col])

    # Name indicators by position rather than by category value
    indicators.columns = [
        "{}_{}".format(col, position) for position in range(indicators.shape[1])
    ]

    return pd.concat([df, indicators], axis=1).drop(col, axis=1)


# Do pre-processing
def prepare_data(df_train_feat, df_train_target, df_test_feat):
    """One-hot encode the categorical columns of the train and test features.

    When ``run_settings["b_remove_control"]`` is set, only the training rows
    whose ``cp_type`` is ``"trt_cp"`` are kept (features and targets alike)
    and ``cp_type`` is dropped from the training features; the test features
    are not filtered and keep their ``cp_type`` column. Otherwise ``cp_type``
    is one-hot encoded together with ``cp_time`` and ``cp_dose``.

    Returns:
        Tuple ``(df_train_feat, df_train_target, df_test_feat)``.
    """
    # Columns that are always one-hot encoded
    cols_onehot = ['cp_time', 'cp_dose']

    if run_settings["b_remove_control"]:
        is_treated = df_train_feat["cp_type"] == "trt_cp"

        df_train_feat = (
            df_train_feat.loc[is_treated, :]
            .reset_index(drop=True)
            .drop(["cp_type"], axis=1)
        )
        df_train_target = df_train_target.loc[is_treated, :].reset_index(drop=True)
    else:
        # Control rows are kept, so cp_type is encoded as a feature too
        cols_onehot = ['cp_type'] + cols_onehot

    for column in cols_onehot:
        df_train_feat = onehot(df_train_feat, column)
        df_test_feat = onehot(df_test_feat, column)

    return df_train_feat, df_train_target, df_test_feat


# Folds for CV
def get_folds(df_feat, df_target, n_splits=None, random_state=None):
    """Assign every row of ``df_feat`` to a multilabel-stratified CV fold.

    Adds (or overwrites) a ``k_fold`` column on ``df_feat`` in place; each row
    receives the number of the fold in which it is part of the validation
    split.

    Args:
        df_feat: Feature DataFrame, row-aligned by position with ``df_target``.
        df_target: Target DataFrame; a ``sig_id`` column is removed if present.
        n_splits: Number of folds. Defaults to the module-level ``N_FOLDS``.
        random_state: Optional seed. When given, rows are shuffled before
            splitting; when ``None`` the split is not shuffled.

    Returns:
        ``df_feat`` (the same object) with the ``k_fold`` column filled in.
    """
    if n_splits is None:
        n_splits = N_FOLDS

    # -1 marks rows that have not been assigned to a fold yet
    df_feat["k_fold"] = -1

    # Only the label columns are used for stratification
    df_target = df_target.drop("sig_id", axis=1, errors="ignore")

    # random_state is only passed together with shuffle=True
    cv_method = MultilabelStratifiedKFold(
        n_splits=n_splits,
        shuffle=random_state is not None,
        random_state=random_state,
    )

    # split() yields positional indices, so write positionally; this does not
    # depend on df_feat having a 0..n-1 index
    fold_col = df_feat.columns.get_loc("k_fold")
    for fold_n, (_, val_idx_) in enumerate(cv_method.split(df_feat, df_target)):
        df_feat.iloc[val_idx_, fold_col] = fold_n

    return df_feat


In [None]:
# ====================================================
# [model]
# ====================================================
class ModelLibrary:
    """Collection of Keras model builders used by the hyperparameter search."""

    def __init__(self):
        pass

    @staticmethod
    def _dense_block(z, n_units, dropout, weight_norm=False):
        """Append Dense(relu) -> BatchNormalization -> Dropout to tensor ``z``.

        When ``weight_norm`` is true the Dense layer is wrapped in
        ``WeightNormalization``.
        """
        dense = Dense(n_units, activation="relu")
        if weight_norm:
            dense = WeightNormalization(dense)

        z = dense(z)
        z = BatchNormalization()(z)
        return Dropout(dropout)(z)

    @staticmethod
    def model_1_hyper(n_feat, n_targets, params):
        """Build an (uncompiled) feed-forward multi-label classifier.

        Layout: one input block of width ``n_hidden_1`` (never weight
        normalised), ``num_block1`` blocks of width ``n_hidden_1``,
        ``num_block2`` blocks of width ``n_hidden_2``, then a sigmoid output
        layer with ``n_targets`` units.

        Args:
            n_feat: Number of input features.
            n_targets: Number of output units.
            params: Dict with keys ``n_hidden_1``, ``n_hidden_2``,
                ``num_block1``, ``num_block2``, ``dropout`` and
                ``weight_norm``.

        Returns:
            A ``tensorflow.keras`` ``Model``.
        """
        # Keras documents `shape` as a tuple, so do not pass a bare int
        in_layer = Input(shape=(n_feat,))

        z = ModelLibrary._dense_block(in_layer, params["n_hidden_1"], params["dropout"])

        for _ in range(params["num_block1"]):
            z = ModelLibrary._dense_block(
                z, params["n_hidden_1"], params["dropout"], params["weight_norm"])

        for _ in range(params["num_block2"]):
            z = ModelLibrary._dense_block(
                z, params["n_hidden_2"], params["dropout"], params["weight_norm"])

        out_layer = Dense(n_targets, activation="sigmoid")(z)

        return Model(inputs=[in_layer], outputs=[out_layer])
    

In [None]:
# ====================================================
# [training]
# ====================================================

# Train a single fold - with hyperopt
def train_fold_hyperopt(df_train_feat, df_train_target, fold_n, params):
    """Train one model on every fold except ``fold_n`` and validate on ``fold_n``.

    Feature and target columns are selected with the module-level
    ``feat_names`` and ``target_names`` lists.

    Args:
        df_train_feat: Training features including a ``k_fold`` column.
        df_train_target: Training targets, index-aligned with ``df_train_feat``.
        fold_n: Number of the fold held out for validation.
        params: Hyperparameter dict. Keys read here: ``learn_rate``,
            ``use_cb``, ``batch_size``, ``epochs``; the dict is also passed to
            ``ModelLibrary.model_1_hyper``.

    Returns:
        Tuple ``(val_loss, model)``: the validation loss of the returned
        model and the fitted Keras model.
    """
    # Split rows into validation (this fold) and training (all other folds)
    is_val = df_train_feat["k_fold"] == fold_n

    X_train = df_train_feat.loc[~is_val, feat_names].values
    y_train = df_train_target.loc[~is_val, target_names].values

    X_val = df_train_feat.loc[is_val, feat_names].values
    y_val = df_train_target.loc[is_val, target_names].values

    # Build and compile the model for this fold
    model = ModelLibrary.model_1_hyper(
        n_feat=len(feat_names), n_targets=len(target_names), params=params)

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    model.compile(Adam(learning_rate=params["learn_rate"]), loss=BinaryCrossentropy())

    # Optional callbacks: LR decay on plateau and early stopping
    if params["use_cb"]:
        cb_reducelr = ReduceLROnPlateau(
            monitor='val_loss', factor=0.3, patience=5, verbose=0, mode='auto',
            cooldown=0, min_lr=1e-5)

        cb_es = EarlyStopping(
            monitor='val_loss', min_delta=1e-5, patience=15, verbose=0, mode='auto',
            baseline=None, restore_best_weights=True)

        cb_list = [cb_reducelr, cb_es]
    else:
        cb_list = []

    model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=params["batch_size"],
              epochs=params["epochs"], callbacks=cb_list)

    # Score the model that is actually returned. With restore_best_weights the
    # final weights can differ from those of the last epoch, so the last
    # history entry may not describe this model.
    val_loss = model.evaluate(X_val, y_val, batch_size=params["batch_size"], verbose=0)

    return val_loss, model


# Main optimization function - should just return the mean cv val loss - a single number!!
def optimize(params, df_train_feat, df_train_target, df_test_feat):
    """Hyperopt objective: cross-validate one hyperparameter setting.

    Trains one model per fold (``N_FOLDS`` folds) and averages both the
    validation losses and the test-set predictions over the folds.

    Args:
        params: Hyperparameter dict sampled by hyperopt.
        df_train_feat: Training features including the ``k_fold`` column.
        df_train_target: Training targets.
        df_test_feat: Test features; columns are selected with the
            module-level ``feat_names``.

    Returns:
        Dict with ``loss`` (mean validation loss over folds), ``status``
        (``STATUS_OK``) and ``test_pred`` (fold-averaged test predictions).
    """
    val_loss = []
    test_pred = np.zeros((df_test_feat.shape[0], len(target_names)))

    # The test matrix is the same for every fold, so extract it once
    X_test = df_test_feat[feat_names].values

    for fold_n in range(N_FOLDS):
        fold_val_loss, model = train_fold_hyperopt(df_train_feat, df_train_target, fold_n, params)
        val_loss.append(fold_val_loss)

        # Predict on test set
        test_pred += model.predict(X_test)

        # The fold model is no longer needed; release the Keras state it
        # holds so memory does not grow over N_FOLDS x MAX_EVALS builds
        del model
        tf.keras.backend.clear_session()

    test_pred /= N_FOLDS

    return {"loss": np.mean(val_loss), "status": STATUS_OK, "test_pred": test_pred}


In [None]:
# ====================================================
# main
# ====================================================

# ==========================
# [data pre-processing]
# ==========================
# Load dataset
df_train_feat, df_train_target, df_test_feat = load_dataset()

# Shuffle data (features and targets are shuffled together)
df_train_feat, df_train_target = shuffle_data(df_train_feat, df_train_target)

# Print the class balance of the categorical columns.
# The cp_type values used by this notebook are "trt_cp" and "ctl_vehicle".
print("compound (trt_cp) vs control perturbation (ctl_vehicle)\n{}".format(
    df_train_feat['cp_type'].value_counts()))
print("\n\ncp_dose\n{}".format(df_train_feat['cp_dose'].value_counts()))
print("\n\ncp_time\n{}".format(df_train_feat['cp_time'].value_counts()))

# Pre-process data (add dummies, etc)
df_train_feat, df_train_target, df_test_feat = prepare_data(df_train_feat, df_train_target, df_test_feat)


# Feature / target column names: every column except the row identifier
feat_names = list(df_train_feat.drop(["sig_id"], axis=1).columns)
target_names = list(df_train_target.drop(["sig_id"], axis=1).columns)

# ==========================
# [training]
# ==========================

# --------------------------
# [hyperopt]
# --------------------------
if run_settings["b_hyperopt"]:

    # Assign every training row to a CV fold (-1 = not assigned yet)
    df_train_feat["k_fold"] = -1

    # Only label columns are used for stratification. errors="ignore" keeps
    # this cell re-runnable after sig_id has already been dropped.
    df_train_target = df_train_target.drop("sig_id", axis=1, errors="ignore")

    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn versions raise a ValueError if it is set without shuffling
    cv_method = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=20)

    for fold_n, (train_idx_, val_idx_) in enumerate(cv_method.split(df_train_feat, df_train_target)):
        df_train_feat.loc[val_idx_, "k_fold"] = fold_n

    # Define param space to run experiments
    # NOTE(review): hp.loguniform bounds are natural-log exponents, so
    # learn_rate is drawn from [exp(-5), exp(-2)] - confirm this is intended.
    # "epochs" is fixed to 1, which keeps this sample run short.
    param_search_space = {
        "n_hidden_1": scope.int(hp.quniform("n_hidden_1", 128, 2048, 64)),
        "n_hidden_2": scope.int(hp.quniform("n_hidden_2", 128, 2048, 64)),
        "num_block1": hp.choice("num_block1", [1, 2, 3, 4, 5]),
        "num_block2": hp.choice("num_block2", [1, 2, 3, 4, 5]),
        "learn_rate": hp.loguniform("learn_rate", -5, -2),
        "dropout": hp.choice("dropout", [0.1, 0.2, 0.3, 0.4, 0.5]),
        "batch_size": scope.int(hp.quniform("batch_size", 32, 256, 64)),
        "epochs": hp.choice("epochs", [1]),
        "use_cb": hp.choice("use_cb", [True, False]),
        "weight_norm": hp.choice("weight_norm", [True, False])
    }

    # Bind the data arguments so that only `params` is left free - hyperopt
    # calls the objective with the sampled parameters only
    opt_func = partial(
        optimize,
        df_train_feat=df_train_feat,
        df_train_target=df_train_target,
        df_test_feat=df_test_feat
    )

    print("sample params:\n")
    print(hyperopt.pyll.stochastic.sample(param_search_space))

    # Init trials for logging
    trials = Trials()

    # Run optimization
    gold_digger = fmin(
        fn=opt_func,
        space=param_search_space,
        algo=tpe.suggest,
        max_evals=MAX_EVALS,
        trials=trials,
        verbose=False
    )

    # Losses for trials
    fnvals = [t['result']['loss'] for t in trials.trials]

    # Best params (space_eval maps the raw fmin result back to actual values)
    print("best params: {}".format(space_eval(param_search_space, gold_digger)))
    print("best loss: {}".format(trials.best_trial['result']['loss']))

    # Persist the full trials log, including each trial's test predictions
    with open(experiment_name + ".pkl", "wb") as output:
        pickle.dump(trials, output, pickle.HIGHEST_PROTOCOL)
        

In [None]:
# ==========================
# [test pred]
# ==========================
if run_settings["b_run_submission"]:

    if run_settings["b_hyperopt"]:
        # Take the fold-averaged test predictions of the best trial. Copy them
        # so that zeroing rows below does not modify the stored trials log.
        test_pred = np.array(trials.best_trial['result']['test_pred'], copy=True)
    elif "test_pred" not in globals():
        # Without a hyperopt run nothing in this notebook produces predictions
        raise RuntimeError(
            "No test predictions available: enable run_settings['b_hyperopt'] "
            "or define `test_pred` before building the submission.")

    # Force the predictions of control rows to zero.
    # If control is removed, you won't have dummy vars in test set
    if run_settings["b_remove_control"]:
        is_control = (df_test_feat["cp_type"] == "ctl_vehicle").values
    else:
        # cp_type_0 is the first cp_type category in get_dummies column order
        is_control = (df_test_feat['cp_type_0'] == 1).values
    test_pred[is_control, :] = 0

    # Create submission: sig_id followed by one column per target.
    # Both parts get a 0..n-1 index so they are joined row by row.
    submission = pd.concat(
        [df_test_feat["sig_id"].reset_index(drop=True),
         pd.DataFrame(data=test_pred, columns=target_names)],
        axis=1)

    submission.to_csv("submission.csv", index=False)
    

In [None]:
# Show the first rows of the submission frame as the cell output
submission.head()