# ELMo Vectorization for FAQ Model

This notebook creates ELMo vectors for the training and testing data. Using these vectors, different models are compared on four performance metrics: precision, recall, F1 score and the Matthews correlation coefficient (MCC).

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd

# Loading data

Importing necessary modules

In [None]:
import pandas as pd

Reading the training and testing data

In [None]:
# Load the training and testing spreadsheets into DataFrames (training first).
faq_train, faq_test = (
    pd.read_excel(workbook)
    for workbook in ("faq_train_updated.xlsx", "faq_test.xlsx")
)

Analyzing the label distribution

In [None]:
# Share of each 'Remarks' label in the training data (fractions, not counts).
faq_train["Remarks"].value_counts(normalize=True)

In [None]:
# Share of each 'Remarks' label in the testing data (fractions, not counts).
faq_test["Remarks"].value_counts(normalize=True)

# Data preprocessing

1. Remove numbers

In [None]:
# Replace every digit in the question title with a single space and store the
# result in a new 'cleanQT' column. `regex=True` is passed explicitly: newer
# pandas versions treat the pattern as a literal string by default, which
# would leave all digits in place.
faq_train['cleanQT'] = faq_train['Question Title'].str.replace(r"[0-9]", " ", regex=True)
faq_test['cleanQT'] = faq_test['Question Title'].str.replace(r"[0-9]", " ", regex=True)

2. Remove special symbols

In [None]:
# Characters that are stripped from the cleaned titles. The backslash is
# spelled as an explicit "\\" escape; the previous "\{" was an invalid escape
# sequence that only worked by accident and triggers a warning on newer Python.
special_symbols = '!@#$%^&*()_-+=[]\\{}|;",.<>/?~:"'
# Build the lookup set once instead of once per character of every title.
_special_symbol_set = set(special_symbols)
faq_train['cleanQT'] = faq_train['cleanQT'].apply(lambda rss: ''.join(ch for ch in rss if ch not in _special_symbol_set))
faq_test['cleanQT'] = faq_test['cleanQT'].apply(lambda rss: ''.join(ch for ch in rss if ch not in _special_symbol_set))

3. Convert all characters to lowercase

In [None]:
# Lower-case the cleaned titles of both data sets.
for frame in (faq_train, faq_test):
    frame['cleanQT'] = frame['cleanQT'].str.lower()

4. Remove extra white spaces (collapse repeated whitespace and trim both ends)

In [None]:
# Collapse every run of whitespace into a single space and trim both ends.
# The training data is normalised as well as the testing data, so that both
# sets go through exactly the same preprocessing before vectorisation.
faq_train['cleanQT'] = faq_train['cleanQT'].apply(lambda rws: ' '.join(rws.split()))
faq_test['cleanQT'] = faq_test['cleanQT'].apply(lambda rws: ' '.join(rws.split()))

Analyzing differences before/after preprocessing data

In [None]:
# Print five randomly sampled rows from each data set, training first.
for heading, frame in (("Training data", faq_train), ("Testing data", faq_test)):
    print(heading)
    print(frame.sample(5))

# Preparing ELMo vectors

Importing necessary modules

In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import time
import numpy as np
import pickle

Loading the ELMo module

In [None]:
# Switch TensorFlow to graph mode before the hub module is created; the
# vectors are evaluated later inside an explicit tf.compat.v1.Session.
tf.compat.v1.disable_eager_execution()
elmo = hub.Module(
    "https://tfhub.dev/google/elmo/2",
    trainable=True,
)

Defining function for creating ELMo vectors

In [None]:
def elmo_vectors(text):
    """Compute one mean-pooled ELMo vector for each entry of ``text``.

    Args:
        text: Object exposing ``tolist()``; the notebook passes the
            ``cleanQT`` column of a batch of rows.

    Returns:
        The evaluated result of averaging the module's ``"elmo"`` output
        over axis 1 (presumably the token axis, giving one row per input
        string -- confirm against the TF Hub ELMo documentation).
    """
    sentences = text.tolist()
    token_embeddings = elmo(sentences, signature="default", as_dict=True)["elmo"]
    pooled = tf.reduce_mean(token_embeddings, 1)
    # A new session is opened on every call, so variables and lookup tables
    # are initialised each time before the pooled tensor is evaluated.
    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())
        session.run(tf.compat.v1.tables_initializer())
        return session.run(pooled)

Splitting dataset into batches for better computation

In [None]:
# Start of the overall ELMo timing (stopped after the batches are concatenated).
elmo_start_time = time.time()
# Cut the training frame into consecutive slices of at most 100 rows.
batch_starts = range(0, faq_train.shape[0], 100)
faq_train_list = [faq_train[start:start + 100] for start in batch_starts]

Extracting ELMo vectors

In [None]:
elmo_extraction_start_time = time.time()
# Vectorise the cleaned titles one batch at a time, keeping batch order.
faq_elmo_train = []
for batch in faq_train_list:
    faq_elmo_train.append(elmo_vectors(batch['cleanQT']))
elmo_extraction_end_time = time.time()
print(
    "Total extraction time for ELMo vectors: {} seconds".format(
        elmo_extraction_end_time - elmo_extraction_start_time
    )
)

Checking the number of ELMo vector batches

In [None]:
# len() of the list gives the number of batches, not the number of rows.
print("Training: ", len(faq_elmo_train))

Concatenating all batches

In [None]:
elmo_concat_start_time = time.time()
# Stack the per-batch arrays along the first axis into one array.
elmo_faq_train = np.concatenate(faq_elmo_train, axis=0)
elmo_concat_end_time = time.time()
# The overall ELMo timer stops at the same instant as the concatenation timer.
elmo_end_time = elmo_concat_end_time
print("Total concatenation time: {} seconds".format(elmo_concat_end_time - elmo_concat_start_time))
print("Total time for ELMo vector extraction: {} seconds".format(elmo_end_time - elmo_start_time))

Saving output to pickle file

In [None]:
# Persist the concatenated training vectors. The `with` block closes the file
# even if pickle.dump raises.
with open("elmo_faq_train_updated_11072019.pickle", "wb") as pickle_out_train:
    pickle.dump(elmo_faq_train, pickle_out_train)

Loading ELMo vectors pickle file

In [None]:
# Load the stored ELMo vectors. The training file name matches the one written
# by the save cell ("updated"; the previous "upadated" pointed at a file this
# notebook never creates). `with` closes both handles after loading.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by a trusted run of this notebook.
with open("elmo_faq_train_updated_11072019.pickle", "rb") as pickle_in_train:
    elmo_faq_train = pickle.load(pickle_in_train)
with open("elmo_faq_test_072019.pickle", "rb") as pickle_in_test:
    elmo_faq_test = pickle.load(pickle_in_test)

# Building Different Models

Preparing the training and validation sets (the testing data serves as the validation set)

In [None]:
# Features: the ELMo matrices wrapped in DataFrames.
x, xvalid = pd.DataFrame(elmo_faq_train), pd.DataFrame(elmo_faq_test)
# Targets: the 'Remarks' label column of the matching data set.
y, yvalid = faq_train['Remarks'], faq_test['Remarks']

Applying SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training data with SMOTE, seeded with random_state=1.
smt = SMOTE(random_state = 1)
# `fit_resample` replaces `fit_sample`, which has been removed from
# imbalanced-learn.
xtrain, ytrain = smt.fit_resample(x, y)

Loading metrics

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef

# Logistic Regression

Building a Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# The timer started here is stopped after prediction in the following cell.
lr_start_time = time.time()
# Default hyper-parameters, fitted on the SMOTE-resampled training data.
regressor = LogisticRegression()
regressor.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_lr = regressor.predict(X=xvalid)
lr_end_time = time.time()
print(
    "Total time spent on LR: {} seconds".format(lr_end_time - lr_start_time)
)

Evaluation of Logistic Regression model

In [None]:
print("Logistic Regression")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_lr, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_lr))

# Naive Bayes

Building a Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# The timer started here is stopped after prediction in the following cell.
nb_start_time = time.time()
# Default hyper-parameters, fitted on the SMOTE-resampled training data.
nbclassifier = GaussianNB()
nbclassifier.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_nb = nbclassifier.predict(X=xvalid)
nb_end_time = time.time()
print(
    "Total time spent on NB: {} seconds".format(nb_end_time - nb_start_time)
)

Evaluation of Naive Bayes Classifier

In [None]:
print("Naive Bayes")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_nb, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_nb))

# Linear SVM

Creating a Linear SVM (SGD) Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# The timer started here is stopped after prediction in the following cell.
sgd_start_time = time.time()
# Seeded with random_state=1; every other hyper-parameter is left at its
# default.
sgdclassifier = SGDClassifier(random_state=1)
sgdclassifier.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_sgd = sgdclassifier.predict(X=xvalid)
sgd_end_time = time.time()
print(
    "Total time spent on SVM (SGD): {} seconds".format(sgd_end_time - sgd_start_time)
)

Evaluation of Linear SVM (SGD) Classifier

In [None]:
print("Linear SVM (SGD)")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_sgd, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_sgd))

# Random Forest

Building a Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The timer started here is stopped after prediction in the following cell.
rfc_start_time = time.time()
# Seeded with random_state=1; every other hyper-parameter is left at its
# default.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_rfc = rfc.predict(X=xvalid)
rfc_end_time = time.time()
print(
    "Total time spent on RFC: {} seconds".format(rfc_end_time - rfc_start_time)
)

Evaluation of Random Forest Classifier

In [None]:
print("RFC")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_rfc, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_rfc))

# K-Nearest Neighbors

Building a K Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# The timer started here is stopped after prediction in the following cell.
knc_start_time = time.time()
# Default hyper-parameters, fitted on the SMOTE-resampled training data.
knc = KNeighborsClassifier()
knc.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_knc = knc.predict(X=xvalid)
knc_end_time = time.time()
print(
    "Total time spent on KNC: {} seconds".format(knc_end_time - knc_start_time)
)

Evaluation of K Nearest Neighbours Classifier

In [None]:
print("KNC")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_knc, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_knc))

# Decision Tree

Building a Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# The timer started here is stopped after prediction in the following cell.
dtc_start_time = time.time()
# Seeded with random_state=1; every other hyper-parameter is left at its
# default.
dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_dtc = dtc.predict(X=xvalid)
dtc_end_time = time.time()
print(
    "Total time spent on DTC: {} seconds".format(dtc_end_time - dtc_start_time)
)

Evaluation of Decision Tree Classifier

In [None]:
print("DTC")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_dtc, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_dtc))

# MLP

Building an MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# The timer started here is stopped after prediction in the following cell.
mlp_start_time = time.time()
# Seeded with random_state=1; every other hyper-parameter is left at its
# default.
mlp = MLPClassifier(random_state=1)
mlp.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_mlp = mlp.predict(X=xvalid)
mlp_end_time = time.time()
print(
    "Total time spent on MLP: {} seconds".format(mlp_end_time - mlp_start_time)
)

Evaluation of MLP Classifier

In [None]:
print("MLP")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_mlp, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_mlp))

# AdaBoost

Building an AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# The timer started here is stopped after prediction in the following cell.
adab_start_time = time.time()
# Seeded with random_state=1; every other hyper-parameter is left at its
# default.
adab = AdaBoostClassifier(random_state=1)
adab.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_adab = adab.predict(X=xvalid)
adab_end_time = time.time()
print(
    "Total time spent on AdaBoost: {} seconds".format(adab_end_time - adab_start_time)
)

Evaluation of AdaBoost Classifier

In [None]:
print("AdaBoost")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_adab, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_adab))

# QDA

Building a QDA Classifier

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# The timer started here is stopped after prediction in the following cell.
qda_start_time = time.time()
# QuadraticDiscriminantAnalysis has no `random_state` parameter; passing one
# raises a TypeError, so the estimator is built with its defaults.
qda = QuadraticDiscriminantAnalysis()
qda.fit(xtrain,ytrain)

Predicting on the validation set

In [None]:
# Predict validation labels, then stop the timer started before fitting.
pred_val_qda = qda.predict(X=xvalid)
qda_end_time = time.time()
print(
    "Total time spent on QDA: {} seconds".format(qda_end_time - qda_start_time)
)

Evaluation of QDA Classifier

In [None]:
print("QDA")
# Precision, recall and F1 are computed with 'FAQ' as the positive class.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(yvalid, pred_val_qda, pos_label='FAQ'))
# MCC is called without a positive label.
print("MCC: ", matthews_corrcoef(yvalid, pred_val_qda))