# Hyperparameter Tuning of FAQ Classification

Hyperparameter tuning of KNN Model

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# Loading data

Importing necessary modules

In [None]:
import pandas as pd

Reading the training and testing data

In [None]:
# Read the training and testing spreadsheets into DataFrames.
faq_train, faq_test = (
    pd.read_excel(workbook) for workbook in ("faq.xlsx", "faq_test.xlsx")
)

Analyzing the label distribution

In [None]:
# Relative frequency of each value in the 'Remarks' column of the training data.
faq_train["Remarks"].value_counts(normalize=True)

In [None]:
# Relative frequency of each value in the 'Remarks' column of the testing data.
faq_test["Remarks"].value_counts(normalize=True)

# Data preprocessing

1. Remove numbers

In [None]:
# Replace every digit in the question title with a space and store the result
# in a new 'cleanQT' column. `regex=True` is passed explicitly: pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the
# digits untouched.
faq_train['cleanQT'] = faq_train['Question Title'].str.replace("[0-9]", " ", regex=True)
faq_test['cleanQT'] = faq_test['Question Title'].str.replace("[0-9]", " ", regex=True)

2. Remove special symbols

In [None]:
# Characters stripped from the cleaned titles. The backslash is escaped
# explicitly ('\\'): the previous '\{' was an invalid escape sequence that only
# worked because Python keeps the backslash while emitting a warning.
special_symbols = '!@#$%^&*()_-+=[]\\{}|;",.<>/?~:"'

# Build the deletion table once instead of re-creating a set per character.
symbol_removal_table = str.maketrans('', '', special_symbols)

faq_train['cleanQT'] = faq_train['cleanQT'].apply(lambda rss: rss.translate(symbol_removal_table))
faq_test['cleanQT'] = faq_test['cleanQT'].apply(lambda rss: rss.translate(symbol_removal_table))

3. Convert all characters to lowercase

In [None]:
# Lower-case the cleaned titles of both datasets.
faq_train['cleanQT'], faq_test['cleanQT'] = (
    frame['cleanQT'].str.lower() for frame in (faq_train, faq_test)
)

4. Collapse extra whitespace

In [None]:
# Collapse runs of whitespace into single spaces and trim both ends.
# Applied to the training data as well as the testing data so that both sets
# go through identical preprocessing (previously only the test set was cleaned).
faq_train['cleanQT'] = faq_train['cleanQT'].apply(lambda rws: ' '.join(rws.split()))
faq_test['cleanQT'] = faq_test['cleanQT'].apply(lambda rws: ' '.join(rws.split()))

Analyzing differences before/after preprocessing data

In [None]:
# Show five random rows from each dataset, each preceded by a heading line.
print("Training data", faq_train.sample(5), sep="\n")
print("Testing data", faq_test.sample(5), sep="\n")

# Preparing ELMo vectors

Importing necessary modules

In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import time
import pickle

Loading the ELMo module

In [None]:
# Eager execution is switched off before the hub module is created.
# NOTE(review): presumably required because hub.Module is the TF1 graph-mode
# API — confirm against the installed tensorflow / tensorflow_hub versions.
tf.compat.v1.disable_eager_execution()
elmo = hub.Module(
    "https://tfhub.dev/google/elmo/2",
    trainable=True,
)

Defining function for creating ELMo vectors

In [None]:
def elmo_vectors(text):
    """Return one averaged ELMo vector per entry of ``text``.

    Parameters
    ----------
    text
        Object exposing ``tolist()`` (the notebook passes a DataFrame column
        of cleaned question titles).

    Returns
    -------
    The evaluated mean of the module's "elmo" output along axis 1.
    NOTE(review): presumably (batch, tokens, dim) -> (batch, dim) — confirm
    against the module documentation.
    """
    sentences = text.tolist()
    token_embeddings = elmo(sentences, signature="default", as_dict=True)["elmo"]
    sentence_vectors = tf.reduce_mean(token_embeddings, 1)

    # A fresh session is opened per call; variables and lookup tables are
    # initialised before the averaged embeddings are evaluated.
    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())
        session.run(tf.compat.v1.tables_initializer())
        return session.run(sentence_vectors)

Splitting dataset into batches for better computation

In [None]:
# Start the overall ELMo timer, then cut each dataset into consecutive
# slices of at most 100 rows.
elmo_start_time = time.time()
faq_train_list = [
    faq_train[start:start + 100] for start in range(0, len(faq_train), 100)
]
faq_test_list = [
    faq_test[start:start + 100] for start in range(0, len(faq_test), 100)
]

Extracting ELMo vectors

In [None]:
# Run every batch of cleaned titles through elmo_vectors and time the whole pass.
elmo_extraction_start_time = time.time()
faq_elmo_train = [elmo_vectors(batch['cleanQT']) for batch in faq_train_list]
faq_elmo_test = [elmo_vectors(batch['cleanQT']) for batch in faq_test_list]
elmo_extraction_end_time = time.time()
print(
    "Total extraction time for ELMo vectors: {} seconds".format(
        elmo_extraction_end_time - elmo_extraction_start_time
    )
)

Checking the number of ELMo vector batches

In [None]:
# Number of per-batch result arrays produced for each dataset.
print("Training: ", len(faq_elmo_train))
print("Testing: ", len(faq_elmo_test))

Concatenating all batches

In [None]:
# Stack the per-batch arrays along axis 0 into one array per dataset, then
# report both the concatenation time and the total time since batching began.
elmo_concat_start_time = time.time()
elmo_faq_train = np.concatenate(faq_elmo_train, axis=0)
elmo_faq_test = np.concatenate(faq_elmo_test, axis=0)
elmo_concat_end_time = time.time()
elmo_end_time = elmo_concat_end_time
print(
    "Total concatenation time: {} seconds".format(
        elmo_concat_end_time - elmo_concat_start_time
    )
)
print(
    "Total time for ELMo vector extraction: {} seconds".format(
        elmo_end_time - elmo_start_time
    )
)

Saving output to pickle file

In [None]:
# Persist the concatenated ELMo vectors. `with` guarantees each file is flushed
# and closed even if pickle.dump raises part-way through.
with open("elmo_faq_train_04062019.pickle", "wb") as pickle_out_train:
    pickle.dump(elmo_faq_train, pickle_out_train)
with open("elmo_faq_test_04062019.pickle", "wb") as pickle_out_test:
    pickle.dump(elmo_faq_test, pickle_out_test)

Loading ELMo vectors pickle file

In [None]:
# Reload the cached ELMo vectors. The files are opened with `with` so the
# handles are closed after loading (they were previously left open).
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by this notebook or another trusted source.
with open("elmo_faq_train_04062019.pickle", "rb") as pickle_in_train:
    elmo_faq_train = pickle.load(pickle_in_train)
with open("elmo_faq_test_04062019.pickle", "rb") as pickle_in_test:
    elmo_faq_test = pickle.load(pickle_in_test)

# Building the model

Assigning the training set and using the test set as the validation set

In [None]:
# Features are the ELMo vectors wrapped in DataFrames; targets are the
# 'Remarks' columns. The test data serves as the validation set.
xtrain, xvalid = pd.DataFrame(elmo_faq_train), pd.DataFrame(elmo_faq_test)
ytrain, yvalid = faq_train['Remarks'], faq_test['Remarks']

Importing metrics

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef

Building a K Nearest Neighbours Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline model: default KNeighborsClassifier settings. The timer started
# here is stopped in the prediction cell.
knc_start_time = time.time()
knc = KNeighborsClassifier()
knc.fit(xtrain, ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels with the baseline model and report the elapsed
# time since knc_start_time was recorded.
pred_val_knc = knc.predict(xvalid)
knc_end_time = time.time()
print(
    "Total time spent on KNC: {} seconds".format(knc_end_time - knc_start_time)
)

Evaluation of K Nearest Neighbours Classifier

In [None]:
# Baseline metrics on the validation set; 'FAQ' is treated as the positive
# class for precision, recall and F1.
print("KNC")
print("Precision: ", precision_score(yvalid, pred_val_knc, pos_label='FAQ'))
print("Recall: ", recall_score(yvalid, pred_val_knc, pos_label='FAQ'))
print("F1 Score: ", f1_score(yvalid, pred_val_knc, pos_label='FAQ'))
print("MCC: ", matthews_corrcoef(yvalid, pred_val_knc))

# Tuning the KNN model

Iterating through parameters

In [None]:
from itertools import product

# Candidate values for each KNeighborsClassifier argument.
nn_list = [3, 5, 10]                                   # n_neighbors
w_list = ['uniform', 'distance']                       # weights
a_list = ['auto', 'ball_tree', 'kd_tree', 'brute']     # algorithm
ls_list = [20, 30, 40]                                 # leaf_size
p_list = [1, 2]                                        # p
nj_list = [1, 2]                                       # n_jobs

# One entry per evaluated configuration, in iteration order.
pr = []
r = []
f1 = []
mcc = []

# product() varies its right-most argument fastest, which reproduces the
# iteration order of the original nested loops (nj outermost, nn innermost).
for nj, p, ls, a, w, nn in product(nj_list, p_list, ls_list, a_list, w_list, nn_list):
    knnclassifier = KNeighborsClassifier(
        n_neighbors=nn, weights=w, algorithm=a, leaf_size=ls, p=p, n_jobs=nj
    )
    knnclassifier.fit(xtrain, ytrain)

    # Predict with the classifier that was just fitted. The earlier code called
    # the baseline `knc` here, so every configuration reported identical
    # metrics. A separate variable keeps the baseline `pred_val_knc` intact.
    pred_val_tuned = knnclassifier.predict(xvalid)

    # Compute each metric once and reuse it for printing and recording.
    precision = precision_score(yvalid, pred_val_tuned, pos_label='FAQ')
    recall = recall_score(yvalid, pred_val_tuned, pos_label='FAQ')
    f1_value = f1_score(yvalid, pred_val_tuned, pos_label='FAQ')
    mcc_value = matthews_corrcoef(yvalid, pred_val_tuned)

    print("KNC with nn - {}, w = {}, a - {}, ls = {}, p = {}, nj = {}".format(nn, w, a, ls, p, nj))
    print("Precision: ", precision)
    pr.append(precision)
    print("Recall: ", recall)
    r.append(recall)
    print("F1 Score: ", f1_value)
    f1.append(f1_value)
    print("MCC: ", mcc_value)
    mcc.append(mcc_value)
    print()

Getting tuning results

In [None]:
# Best value of each metric across all evaluated configurations. Each maximum
# is taken independently, so they may come from different configurations.
print("Max precision: ", max(pr))
print("Max recall: ", max(r))
print("Max F1: ", max(f1))
print("Max MCC: ", max(mcc))