 <font size="4"> **In this kernel, I show how I tuned the hyperparameters for the tutorial kernel <a href="https://www.kaggle.com/sinamhd9/mechanisms-of-action-moa-tutorial?scriptVersionId=46332122">[MoA Tutorial]</a> using Keras Tuner and its Bayesian optimization method.** </font>

In [None]:
# Importing useful libraries
import warnings
warnings.filterwarnings("ignore")

# Adding iterative-stratification 
# Select add data from the right menu and search for iterative-stratification, then add it to your kernel.
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


from time import time
import gc

import numpy as np
import pandas as pd 

# ML tools 
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf 
import tensorflow.keras.backend as K
import kerastuner as kt

# Setting random seeds
# Seed NumPy's and TensorFlow's global random generators with a fixed value so
# that random draws made later in the notebook start from a known state.
np.random.seed(42)
tf.random.set_seed(42)

## Loading data

In [None]:
# Read the five competition CSV files, in this order: train features,
# non-scored targets, scored targets, test features, sample submission.
df_train, df_target_ns, df_target_s, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/train_targets_nonscored.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
        '/kaggle/input/lish-moa/test_features.csv',
        '/kaggle/input/lish-moa/sample_submission.csv',
    )
)


## Preprocess

For an explanation of the preprocessing steps below, please see the tutorial notebook: <a href="https://www.kaggle.com/sinamhd9/mechanisms-of-action-moa-tutorial?scriptVersionId=46332122">[MoA Tutorial]</a>

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
# Module-level quantile transformer used by preprocess(): 100 quantiles, normal
# output distribution, fixed random_state. preprocess() re-fits this same object
# on every column it transforms.
transformer = QuantileTransformer(n_quantiles=100,random_state=42, output_distribution="normal")

def preprocess(df):
    """Return a cleaned copy of a feature table.

    Steps, in order:
      * drop the ``sig_id`` column;
      * recode ``cp_time`` (24/48/72 -> 1/2/3) and ``cp_dose`` (D1/D2 -> 0/1);
      * pass every column whose name starts with ``g-`` or ``c-`` through the
        module-level ``transformer``, one column at a time.

    Note: the transformer is re-fit on each column of whichever frame is
    passed in, so calling this separately on two frames fits separate
    mappings for each of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature table containing ``sig_id``, ``cp_time``, ``cp_dose`` and
        the ``g-*`` / ``c-*`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument itself is not modified.
    """
    frame = df.drop(columns='sig_id')

    time_codes = {24: 1, 48: 2, 72: 3}
    dose_codes = {'D1': 0, 'D2': 1}
    frame['cp_time'] = frame['cp_time'].map(time_codes)
    frame['cp_dose'] = frame['cp_dose'].map(dose_codes)

    g_cols = [name for name in frame.columns if name.startswith('g-')]
    c_cols = [name for name in frame.columns if name.startswith('c-')]

    for name in g_cols + c_cols:
        # The transformer expects a 2-D (n_rows, 1) array; flatten the result
        # back to 1-D before storing it in the column.
        column_2d = frame[name].values.reshape(-1, 1)
        frame[name] = transformer.fit(column_2d).transform(column_2d).ravel()

    return frame

# Run the shared preprocessing on the train table first, then the test table.
X, X_test = (preprocess(frame) for frame in (df_train, df_test))

print('Train data size', X.shape)
print('Test data size', X_test.shape)

# Scored (y) and non-scored (y0) targets without the sig_id identifier column.
y = df_target_s.drop(columns='sig_id')
print('target size', y.shape)
y0 = df_target_ns.drop(columns='sig_id')

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Passed to PCA as n_components: a float in (0, 1) asks PCA to keep as many
# components as needed to explain that fraction of the variance.
n_comp = 0.95
# Cut-off passed to VarianceThreshold below.
variance_cutoff = 0.8
# Non-numeric / categorical bookkeeping columns that are carried through
# unchanged and never fed to the variance filter.
meta_cols = ['cp_type', 'cp_time', 'cp_dose']


def _add_pca_features(train, test, features, kind, n_components, seed=42):
    """Fit PCA on train+test stacked and append the components to both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the columns listed in ``features``.
    features : list of str
        Columns the PCA is fitted on.
    kind : str
        Tag used in the new column names, ``pca_<kind>-<i>``.
    n_components : int or float
        Forwarded to ``PCA`` as its first argument.
    seed : int
        Forwarded to ``PCA`` as ``random_state``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with the component columns appended.
    """
    stacked = pd.concat([train[features], test[features]])
    components = PCA(n_components, random_state=seed).fit_transform(stacked)
    names = [f'pca_{kind}-{i}' for i in range(components.shape[1])]

    # Split by the number of train rows; slicing the tail with a negative
    # length would return the whole array when the test frame is empty.
    n_train = train.shape[0]
    train_pca = pd.DataFrame(components[:n_train], columns=names, index=train.index)
    test_pca = pd.DataFrame(components[n_train:], columns=names, index=test.index)

    return (pd.concat((train, train_pca), axis=1),
            pd.concat((test, test_pca), axis=1))


# PCA features for the g- columns, then for the c- columns.
g_features = [cols for cols in X.columns if cols.startswith('g-')]
X, X_test = _add_pca_features(X, X_test, g_features, 'g', n_comp)

c_features = [cols for cols in X.columns if cols.startswith('c-')]
X, X_test = _add_pca_features(X, X_test, c_features, 'c', n_comp)

# Variance filter fitted on train+test stacked.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
var_thresh = VarianceThreshold(variance_cutoff)
data = pd.concat([X, X_test])
# The metadata columns are excluded by name. The previous positional slice
# (iloc[:, 2:]) also passed cp_dose to the filter; being a 0/1 column its
# variance cannot exceed 0.25, so it was always discarded and the resulting
# matrix is the same.
data_transformed = var_thresh.fit_transform(data.drop(columns=meta_cols))

n_train_rows = X.shape[0]
train_features_transformed = data_transformed[:n_train_rows]
test_features_transformed = data_transformed[n_train_rows:]

# Rebuild both frames as: metadata columns + surviving features. The surviving
# features get default integer column labels (0, 1, 2, ...).
X = pd.concat([pd.DataFrame(X[meta_cols].values, columns=meta_cols),
               pd.DataFrame(train_features_transformed)], axis=1)

X_test = pd.concat([pd.DataFrame(X_test[meta_cols].values, columns=meta_cols),
                    pd.DataFrame(test_features_transformed)], axis=1)

print(X.shape)
print(X_test.shape)

In [None]:
from sklearn.cluster import KMeans
def fe_cluster(train, test, n_clusters_g = 35, n_clusters_c = 5, SEED = 239):
    """Add one-hot KMeans cluster-membership columns to train and test.

    Two clusterings are fitted, each on train and test stacked together: one
    on the columns at positions 4..775 of ``train`` (kind ``g``) and one on
    the columns at positions 776..875 (kind ``c``). Each produces dummy
    columns named ``clusters_<kind>_<label>``.

    NOTE(review): the feature groups are chosen purely by column position.
    Whether positions 4..775 / 776..875 really hold the intended g-/c-
    features depends on how the caller built ``train`` -- confirm against the
    preceding cells.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames sharing the same column layout. They are not modified.
    n_clusters_g, n_clusters_c : int
        Number of clusters for the ``g`` and ``c`` clusterings.
    SEED : int
        ``random_state`` passed to ``KMeans``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with identical dummy columns appended.
    """
    features_g = list(train.columns[4:776])
    features_c = list(train.columns[776:876])

    def create_cluster(train, test, features, kind = 'g', n_clusters = n_clusters_g):
        """Cluster train+test on ``features`` and one-hot encode the labels."""
        data = pd.concat([train[features], test[features]], axis = 0)
        kmeans = KMeans(n_clusters = n_clusters, random_state = SEED).fit(data)

        n_train = train.shape[0]
        column = f'clusters_{kind}'
        # Declare every possible label as a category so that get_dummies emits
        # one column per cluster in BOTH frames, even when a cluster has no
        # members in one of them; otherwise train and test could end up with
        # different columns.
        all_labels = list(range(n_clusters))
        train_labels = pd.Categorical(kmeans.labels_[:n_train], categories = all_labels)
        test_labels = pd.Categorical(kmeans.labels_[n_train:], categories = all_labels)

        # assign() returns new frames, so the caller's objects are never
        # given the temporary label column.
        train = train.assign(**{column: train_labels})
        test = test.assign(**{column: test_labels})

        train = pd.get_dummies(train, columns = [column])
        test = pd.get_dummies(test, columns = [column])
        return train, test

    train, test = create_cluster(train, test, features_g, kind = 'g', n_clusters = n_clusters_g)
    train, test = create_cluster(train, test, features_c, kind = 'c', n_clusters = n_clusters_c)
    return train, test

# Add the cluster features, then preview each frame followed by its shape.
X, X_test = fe_cluster(X, X_test)
for frame in (X, X_test):
    display(frame.head(2))
    print(frame.shape)

In [None]:
def fe_stats(train, test):
    """Append row-wise summary statistics to both frames, in place.

    Three column groups are taken by position from ``train``: ``g`` (columns
    4..775), ``c`` (columns 776..875) and ``gc`` (both together). For each
    group, five columns are added to ``train`` and to ``test``:
    ``<group>_sum``, ``<group>_mean``, ``<group>_std``, ``<group>_kurt`` and
    ``<group>_skew``, each computed across the group's columns for every row.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames sharing the same column layout; both are modified in place.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The same two objects that were passed in.
    """
    g_cols = list(train.columns[4:776])
    c_cols = list(train.columns[776:876])

    # (name prefix, columns) in the order the new columns are created.
    groups = (
        ('g', g_cols),
        ('c', c_cols),
        ('gc', g_cols + c_cols),
    )
    # (name suffix, DataFrame method) in the order the new columns are created.
    reducers = (
        ('sum', 'sum'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('kurt', 'kurtosis'),
        ('skew', 'skew'),
    )

    for frame in (train, test):
        for prefix, cols in groups:
            for suffix, method in reducers:
                frame[f'{prefix}_{suffix}'] = getattr(frame[cols], method)(axis = 1)

    return train, test

# Add the row-wise statistics, then preview each frame followed by its shape.
X, X_test = fe_stats(X, X_test)
for frame in (X, X_test):
    display(frame.head(2))
    print(frame.shape)

In [None]:
# Keep only the training rows whose cp_type is 'trt_cp'; the same row mask is
# applied to both target tables and to the features so they stay aligned.
treated_rows = X['cp_type'] == 'trt_cp'
y0 = y0.loc[treated_rows].reset_index(drop = True)
y = y.loc[treated_rows].reset_index(drop = True)
X = X.loc[treated_rows].reset_index(drop = True)

# cp_type is no longer needed as a feature in either frame.
for frame in (X, X_test):
    frame.drop(columns=['cp_type'], inplace=True)

print(X.shape)

In [None]:
# Predictions are clipped into [p_min, p_max] before the log is taken.
p_min = 0.001
p_max = 0.999
def logloss(y_true, y_pred):
    """Mean binary cross-entropy computed on clipped predictions.

    ``y_pred`` is first clipped to ``[p_min, p_max]``; the result is the
    negated mean, over all elements, of
    ``y_true * log(p) + (1 - y_true) * log(1 - p)``.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * K.log(clipped)
    negative_term = (1 - y_true) * K.log(1 - clipped)
    return -K.mean(positive_term + negative_term)


## Hyperparameter tuning

In [None]:

def create_model(hp):
    """Build and compile a dense network whose architecture is drawn from ``hp``.

    Layout: input -> batch norm -> ``num_dense`` repetitions of
    (Dense -> Dropout -> batch norm) -> Dense(206, sigmoid).

    Hyperparameters registered on ``hp``:
      * ``num_dense``: number of hidden blocks, 0 to 3;
      * ``units_<i>``: width of block ``i``, 128 to 4096 in steps of 128;
      * ``dp_<i>``: dropout rate of block ``i``;
      * ``dense_activation_<i>``: activation of block ``i``.

    The input width is read from the global ``X``. The model is compiled with
    Adam at a fixed learning rate of 1e-3, label-smoothed binary cross-entropy
    as the loss and ``logloss`` as the metric.

    Parameters
    ----------
    hp
        Hyperparameter container supplied by the tuner.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    layers = tf.keras.layers

    n_features = X.shape[1]
    model_input = layers.Input(shape = (n_features, ))
    hidden = layers.BatchNormalization()(model_input)

    depth = hp.Int('num_dense', min_value=0, max_value=3, step=1)
    for i in range(depth):
        width = hp.Int('units_{i}'.format(i=i), min_value=128, max_value=4096, step=128)
        drop_rate = hp.Choice('dp_{i}'.format(i=i), values=[0.25,0.3,0.35,0.4,0.45,0.5,0.55, 0.6, 0.65,0.7])
        activation = hp.Choice('dense_activation_{i}'.format(i=i),values=['relu', 'selu', 'elu', 'swish'])
        hidden = layers.Dense(units=width, activation=activation)(hidden)
        hidden = layers.Dropout(drop_rate)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    predictions = layers.Dense(206, activation='sigmoid')(hidden)
    model = tf.keras.Model(model_input, predictions)

    # The learning rate is kept fixed here; it is not part of the search space.
    optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001)
    model.compile(optimizer = optimizer, loss=loss, metrics=logloss)

    return model

Increase `max_trials` to get better results. For example, you can run 150 trials and then blend the top 5 models (`n_top`).

In [None]:
# --- Search configuration -------------------------------------------------
feats = np.arange(0,X.shape[1],1)
inp_size = int(np.ceil(1* len(feats)))   # number of feature columns used per seed
n_split = 5                              # folds per seed
n_top = 5                                # best hyperparameter sets kept per fold
max_trials = 10                          # tuner trials per fold
seeds = [0, 1]
n_round = len(seeds)

# One entry per (seed, fold), appended seed by seed and fold by fold; each
# entry is the list returned by tuner.get_best_hyperparameters(n_top).
bests=[]

# Convert once instead of on every fold: the conversion does not depend on the
# seed or the fold.
X_values = X.astype('float64').values
y_values = y.astype(float).values

for seed in seeds:
    # Draw inp_size distinct feature indices from NumPy's global generator.
    split_cols = np.random.choice(feats, inp_size, replace=False)
    folds = MultilabelStratifiedKFold(n_splits = n_split, random_state = seed, shuffle = True)
    for n, (tr, te) in enumerate(folds.split(X, y)):
        # A fresh tuner per fold; overwrite=True discards results of any
        # previous search stored under the same project directory.
        tuner = kt.tuners.BayesianOptimization(create_model,
                     kt.Objective("val_logloss", direction="min"),
                     max_trials = max_trials, overwrite=True)

        x_tr = X_values[tr][:, split_cols]
        x_val = X_values[te][:, split_cols]
        y_tr, y_val = y_values[tr], y_values[te]

        callbacks=[EarlyStopping(monitor='val_logloss', mode='min', patience=5)]

        start_time = time()
        tuner.search(x_tr, y_tr,validation_data=(x_val, y_val),
                            epochs = 150, batch_size = 128,
                            verbose = 0, callbacks = callbacks)
        best_hps = tuner.get_best_hyperparameters(n_top)
        end_time = time()

        bests.append(best_hps)
        # Iterate over what was actually returned: indexing range(n_top) would
        # raise IndexError if fewer than n_top sets are available.
        for hps in best_hps:
            print(hps.values)
        print('Seed', seed, 'Fold', n, 'Time elapsed:', "{:.2f}".format((end_time-start_time)/60), 'minutes')

        # Release the tuner before the next fold builds a new one.
        del tuner
        gc.collect()

In [None]:
# `bests` has one entry per (seed, fold) pair, appended seed by seed and, within
# a seed, fold by fold. Each entry is the list of hyperparameter sets returned
# by the tuner for that fold.
print(len(bests)) # expected: len(seeds) * n_split
print(bests[0][0].values) # Best for fold 0 seed 0 

In [None]:
# First index selects the (seed, fold) entry, second index selects the rank
# within that entry's list of hyperparameter sets.
print (bests[0][1].values) # Second best for fold 0 seed 0 
print(bests[1][0].values) # Best for fold 1 seed 0 
# and so on...


# References
<a href="https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html" > [1] Hyperparameter tuning with Keras Tuner
</a> <br>
<a href="https://www.kaggle.com/fchollet/moa-keras-kerastuner-best-practices" > [2] MoA: Keras + KerasTuner best practices</a>

 <font size="6"> **Please upvote if you liked it :) Thanks!** </font>