# Tuning the CNN neural classifier

Hyperparameter sweep for the **convolutional neural network (CNN) classifier** used to produce results under various **degrees of dataset imbalance**. The hyperparameters are optimized for the worst-case imbalance configuration. The optimization is conducted by means of the tree-structured Parzen estimator (TPE).

## Import useful packages

In [None]:
# Generic packages
import numpy as np
import pandas as pd
import copy

In [None]:
# scikit-learn for machine learning tasks
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, roc_curve, auc, f1_score, make_scorer

In [None]:
# TensorFlow and Keras for neural networks
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras import layers
from keras.regularizers import l2
from keras import metrics
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

In [None]:
# NLTK for natural language processing
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# HyperOpt for Bayesian optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import sys

In [None]:
# Custom helper-functions script (supplied to Colab manually)
import utils as uu

## Load and process data

In [None]:
# Read the training set from its CSV file into a DataFrame
train_set_file = "train_set_imb_0.csv"
train_set = pd.read_csv(filepath_or_buffer=train_set_file)

In [None]:
# Cast every entry of the text column to str, then run the project's
# preprocessing helper over each entry
train_set['text'] = train_set['text'].apply(str).apply(uu.preprocess_text)

In [None]:
# Pull texts and labels out of the frame as independent deep copies, so that
# later changes to them cannot modify the original dataset by accident
texts_train, labels_train = (
    copy.deepcopy(train_set[column]) for column in ('text', 'is_about_cc')
)

In [None]:
# Turn the texts into fixed-length integer sequences
max_words, max_len = 5000, 55  # Tokenizer `num_words` limit / padded sequence length

# NOTE(review): the tokenizer is fit on all of texts_train, i.e. also on the
# rows that `objective` later holds out for validation.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts_train)
sequences_train = tokenizer.texts_to_sequences(texts_train)

# Pad and truncate at the end ('post') so every row has exactly max_len entries
features_train = sequence.pad_sequences(
    sequences_train,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

## Define hyperparameter search space and objective function

In [None]:
# Define the search space for the hyperparameters
def _int_range(label, low, high):
    """Dimension sampled with hp.quniform (step q=1) and cast to int."""
    return scope.int(hp.quniform(label, low, high, q=1))


def _log_range(label, low, high):
    """Dimension sampled with hp.loguniform between `low` and `high`."""
    return hp.loguniform(label, np.log(low), np.log(high))


hyperparam_space = {
    'n_filters': _int_range('n_filters', 4, 128),
    'kernel_size': _int_range('kernel_size', 2, 16),
    'dropout_rate': hp.uniform('dropout_rate', 0.25, 0.75),
    'l2_val': _log_range('l2_val', 1e-6, 1e-3),
    'embedding_dim': _int_range('embedding_dim', 64, 256),
    # Fixed entries (not sampled): reuse the values chosen for tokenization
    'max_words': max_words,
    'max_len': max_len,
    'batch_size': _int_range('batch_size', 16, 128),
    'epochs': _int_range('epochs', 16, 128),
    'learning_rate': _log_range('learning_rate', 1e-7, 1e-4),
}

In [None]:
# Define the objective function for optimization
def objective(params):
    """Train one CNN with the given hyperparameters and score it for hyperopt.

    The module-level `features_train` / `labels_train` are split into a
    training part and a stratified 15% validation part (fixed random_state, so
    every trial sees the same split). The model is fit on the training part
    with balanced class weights and evaluated on the validation part.

    Parameters
    ----------
    params : dict
        One sample of the search space. Keys read here: 'learning_rate',
        'max_words', 'embedding_dim', 'max_len', 'dropout_rate', 'n_filters',
        'kernel_size', 'l2_val', 'epochs' and 'batch_size'.

    Returns
    -------
    dict
        {'loss': -F1, 'status': STATUS_OK}, where F1 is computed on the
        validation part with predictions thresholded at 0.5. The sign is
        flipped because the value is reported as a loss.
    """
    print('Params testing: ', params)

    # A fresh model is built on every trial. Keras keeps global state for each
    # model it creates, so reset it here; otherwise the models of all previous
    # trials accumulate in memory over the course of the sweep.
    K.clear_session()

    # Split dataset into training and validation sets
    inp_train, inp_valid, lbl_train, lbl_valid = train_test_split(
        features_train,
        labels_train,
        test_size=0.15,
        random_state=12345,
        stratify=labels_train,
    )

    # Compute class weights to compensate for the imbalance between classes
    classes = np.unique(lbl_train)
    weights = class_weight.compute_class_weight(
        class_weight="balanced",
        classes=classes,
        y=lbl_train,
    )
    class_weights = dict(zip(classes, weights))

    # Define optimizer
    opt = Adam(
        learning_rate=params['learning_rate'],
        beta_1=0.9,
        beta_2=0.99,
        epsilon=1e-07,
        amsgrad=False,
    )

    # Define sequential neural architecture:
    # embedding -> dropout -> 1D convolution -> max pooling -> flatten -> sigmoid unit
    model = Sequential()
    model.add(layers.Embedding(params['max_words'], params['embedding_dim'], input_length=params['max_len']))
    model.add(layers.Dropout(params['dropout_rate']))
    model.add(layers.Conv1D(
        filters=params['n_filters'],
        kernel_size=params['kernel_size'],
        kernel_initializer='glorot_uniform',
        padding='same',
        activation='relu',
        kernel_regularizer=l2(params['l2_val']),
        bias_regularizer=l2(params['l2_val']),
    ))
    model.add(layers.MaxPooling1D())
    model.add(layers.Flatten())
    model.add(layers.Dense(
        1,
        kernel_initializer='glorot_uniform',
        activation='sigmoid',
        kernel_regularizer=l2(params['l2_val']),
        bias_regularizer=l2(params['l2_val']),
    ))

    # Build model
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[metrics.AUC(name='auc_pr', curve="PR")])

    # Fit model
    model.fit(
        inp_train,
        lbl_train,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        verbose=0,
        class_weight=class_weights,
    )

    # Compute performance of the trial: threshold the sigmoid outputs at 0.5
    # and flatten the (n, 1) prediction array to 1-D before scoring
    pred_labels = (model.predict(inp_valid) > 0.5).astype("int32").ravel()
    f1 = f1_score(lbl_valid, pred_labels)
    print('DONE!      F1 = ' + str(f1))
    sys.stdout.flush()
    return {'loss': -f1, 'status': STATUS_OK}


## Run Bayesian optimization with the Tree-structured Parzen Estimator

In [None]:
# Minimize the objective (negative F1) over the search space with TPE;
# the Trials object records the result of every evaluation
trials = Trials()
best = fmin(
    fn=objective,
    space=hyperparam_space,
    algo=tpe.suggest,
    max_evals=500,
    trials=trials,
)
print('Best hyperparams: ', best)

Params testing: 
{'batch_size': 109, 'dropout_rate': 0.4783174874796243, 'embedding_dim': 201, 'epochs': 58, 'kernel_size': 14, 'l2_val': 9.90822931775982e-05, 'learning_rate': 6.210853630482823e-06, 'max_len': 55, 'max_words': 5000, 'n_filters': 127}
DONE!      F1 = 0.24309392265193372
Params testing: 
{'batch_size': 61, 'dropout_rate': 0.6194478376751777, 'embedding_dim': 223, 'epochs': 73, 'kernel_size': 11, 'l2_val': 6.727534731707914e-05, 'learning_rate': 3.488530716431959e-05, 'max_len': 55, 'max_words': 5000, 'n_filters': 65}
DONE!      F1 = 0.607142857142857
Params testing: 
{'batch_size': 60, 'dropout_rate': 0.34962518775820844, 'embedding_dim': 91, 'epochs': 25, 'kernel_size': 14, 'l2_val': 0.000508332274656428, 'learning_rate': 1.6663577810464724e-05, 'max_len': 55, 'max_words': 5000, 'n_filters': 47}
DONE!      F1 = 0.16666666666666669
Params testing: 
{'batch_size': 70, 'dropout_rate': 0.6893764616579192, 'embedding_dim': 82, 'epochs': 62, 'kernel_size': 12, 'l2_val': 2.76