## Search for Hyperparameters Using Random Search
Here, we use Random Search in Keras Tuner to find the best hyperparameters for RNN (Recurrent Neural Network) model later. The parameters include the number of neurons, the number of internal states in LSTM, dropout rate and learning rate.

In [4]:
!pip install keras-tuner



In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Load data from Google Colab or local computer.

In [7]:
# Run from Google Colab
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('drive/MyDrive/Colab/Hate/tweets_malay.csv')

Mounted at /content/drive


In [8]:
df.columns = ["text", "label"]
df

Unnamed: 0,text,label
0,haha babi dia punya tidak menyabar macam ada 1...,1
1,ini namanya pns kontol banyak gaya emosi aing ...,1
2,pukimak punya jantan trick baru dia guna bud...,1
3,pantat apa eh jual karipap inti basi nak menia...,1
4,ini warga emas ke oku le frontliner apa kepent...,1
...,...,...
1855,temen gw banget sudah tau doi nya toxic banget...,0
1856,kau komen lebai la mende la vid lucah pn kau l...,1
1857,pastu kau tahu kain dalam eh babi aku dah paka...,1
1858,sekarang ramai babi dah pandai drive kete atas...,1


In [9]:
df['label'].value_counts()

1    1188
0     672
Name: label, dtype: int64

Load the word embeddings from Malaya social media corpus.

In [10]:
import joblib

def load_word_vector(file = 'drive/MyDrive/Colab/Hate/word_vector.pkl'):
  from os import path

  if path.exists(file):    
    word_vector = joblib.load(file)
  else:
    !pip install malaya
    import malaya

    vocab, embedded = malaya.wordvector.load(model = 'socialmedia')
    wv = malaya.wordvector.WordVector(embedded, vocab)
    word_vector = {word: wv.get_vector_by_name(word) for word in wv.words}
    
  return word_vector

word_vector = load_word_vector()

Tokenize all words and build the word embedding matrix.

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size = 0.2, random_state = 123)

In [12]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(df['text'])
vocab_size = len(t.word_index) + 1

In [13]:
X_train_seq = t.texts_to_sequences(X_train)
X_test_seq = t.texts_to_sequences(X_test)

# Pad the sequences so each sequence is the same length
X_train_seq_padded = pad_sequences(X_train_seq, 50)
X_test_seq_padded = pad_sequences(X_test_seq, 50)

In [14]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 256))
for word, i in t.word_index.items():
	embedding_vector = word_vector.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

Define range of hyperparameters and the RNN model.

In [15]:
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.models import Sequential

from numpy.random import seed
from tensorflow.random import set_seed

In [16]:
def model_builder(hp):
  model = keras.Sequential()

  lstm = hp.Int('lstm', min_value=32, max_value=256, step=32, default=64)
  # lstm = 64

  hp_units = hp.Int('units', min_value=4, max_value=32, step=4, default=8)
  # hp_units = 8

  dropout = hp.Float('dropout', min_value=0.1, max_value=0.9, step=0.1, default=0.5)
  # dropout = 0.5
  
  # hp_learning_rate = hp.Choice('learning_rate', values=[0.01, 0.003, 0.001, 0.0003, 0.0001, 0.00003])
  hp_learning_rate = 0.001

  model.add(Embedding(vocab_size, 256, weights=[embedding_matrix], input_length=50, trainable=False))

  model.add(LSTM(lstm, dropout = dropout, recurrent_dropout = dropout))
  model.add(Dense(hp_units, activation = 'relu'))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate), loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

  return model

In [17]:
from kerastuner.tuners import RandomSearch

tuner = RandomSearch(
    model_builder,
    objective='val_accuracy',
    max_trials=25,
    executions_per_trial=1,
    directory='drive/MyDrive/Colab/Hate/randomsearch',
    overwrite = True,
    project_name='Hate Speech')



In [18]:
tuner.search_space_summary()

Search space summary
Default search space size: 3
lstm (Int)
{'default': 64, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
units (Int)
{'default': 8, 'conditions': [], 'min_value': 4, 'max_value': 32, 'step': 4, 'sampling': None}
dropout (Float)
{'default': 0.5, 'conditions': [], 'min_value': 0.1, 'max_value': 0.9, 'step': 0.1, 'sampling': None}


Search for best hyperparameters.

In [19]:
seed(123)
set_seed(234)

In [20]:
# tuner.search(X_train_seq_padded, y_train, epochs=10, validation_split=0.2)
tuner.search(X_train_seq_padded, y_train, epochs=10, validation_data = (X_test_seq_padded, y_test))

Trial 25 Complete [00h 02m 38s]
val_accuracy: 0.7419354915618896

Best val_accuracy So Far: 0.7715053558349609
Total elapsed time: 01h 16m 30s
INFO:tensorflow:Oracle triggered exit


In [21]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

In [22]:
best_hps.values

{'dropout': 0.4, 'lstm': 128, 'units': 32}

Show the search results.

In [23]:
tuner.results_summary()

Results summary
Results in drive/MyDrive/Colab/Hate/randomsearch/Hate Speech
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
lstm: 128
units: 32
dropout: 0.4
Score: 0.7715053558349609
Trial summary
Hyperparameters:
lstm: 224
units: 28
dropout: 0.30000000000000004
Score: 0.7580645084381104
Trial summary
Hyperparameters:
lstm: 64
units: 28
dropout: 0.2
Score: 0.7553763389587402
Trial summary
Hyperparameters:
lstm: 160
units: 8
dropout: 0.30000000000000004
Score: 0.7553763389587402
Trial summary
Hyperparameters:
lstm: 64
units: 20
dropout: 0.6
Score: 0.7526881694793701
Trial summary
Hyperparameters:
lstm: 64
units: 24
dropout: 0.1
Score: 0.7526881694793701
Trial summary
Hyperparameters:
lstm: 96
units: 24
dropout: 0.4
Score: 0.75
Trial summary
Hyperparameters:
lstm: 160
units: 12
dropout: 0.4
Score: 0.7446236610412598
Trial summary
Hyperparameters:
lstm: 96
units: 24
dropout: 0.30000000000000004
Score: 0.7419354915618896
Trial summary


Rebuild RNN model using the best hyperparameters.

In [24]:
# Build the model with the optimal hyperparameters and train it on the data for 20 epochs
model = tuner.hypermodel.build(best_hps)
# history = model.fit(X_train_seq_padded, y_train, batch_size = 32, epochs=20, validation_split=0.2)
history = model.fit(X_train_seq_padded, y_train, batch_size = 32, epochs=20, validation_data = (X_test_seq_padded, y_test))

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Best epoch: 5


In [25]:
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(X_train_seq_padded, y_train, epochs=best_epoch, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc802b9ccd0>

In [26]:
eval_result = hypermodel.evaluate(X_test_seq_padded, y_test)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.5287325978279114, 0.7123655676841736]
