# ELMo Vectorization for Banned Products

Creating ELMo vectors for the total unique modelling dataset and then building different models to classify products as rejected or approved

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# Loading data

Importing necessary modules

In [None]:
import pandas as pd

Reading the training data

In [None]:
# Load the raw training file into the two-column frame used by the rest of the
# notebook. Rows the parser cannot fit into the two named columns are dropped
# (with a warning) rather than aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError; `on_bad_lines` is its replacement, so try that first and
# fall back to the old keyword only on installs that predate it.
# NOTE(review): the default "," delimiter is still in effect, so text after the
# first comma of a line ends up in "Label" — confirm this is acceptable for
# product names that contain commas.
try:
    tumd = pd.read_csv(
        "totaluniquemodeldata.txt",
        names=["Product", "Label"],
        engine="python",
        on_bad_lines="warn",
    )
except TypeError:
    # Older pandas (< 1.3) does not know `on_bad_lines`.
    tumd = pd.read_csv(
        "totaluniquemodeldata.txt",
        names=["Product", "Label"],
        engine="python",
        error_bad_lines=False,
    )

Creating table format from txt

In [None]:
# Each raw line looks like "__label__<LABEL> <product text>". Split it into the
# bare label and the remaining product text.
# The raw column is pulled out once: calling `tumd.values` for every row
# re-materialises the whole frame each time, which is quadratic in row count.
_raw_lines = tumd['Product'].tolist()
# First whitespace-delimited token of every line, i.e. the "__label__..." tag.
_label_tokens = [line.split()[0] for line in _raw_lines]
tumd['Label'] = [token.replace("__label__", "", 1) for token in _label_tokens]
# count=1 removes only the leading tag; without it, any later occurrence of
# the same "<tag> " sequence inside the product text would be deleted as well.
tumd['Product'] = [
    line.replace(token + " ", "", 1)
    for line, token in zip(_raw_lines, _label_tokens)
]

Analyzing the label distribution

In [None]:
# Relative frequency of every label (normalize=True gives fractions, not counts).
tumd.Label.value_counts(normalize=True)

# Data preprocessing

1. Remove special symbols

In [None]:
# Characters stripped from every product name. The backslash is written as an
# explicit "\\" escape: the previous "\{" is an invalid escape sequence that
# newer Python versions warn about. The resulting character set is unchanged.
special_symbols = '!@#$%^&*()_-+=[]\\{}|;",.<>/?~:"'
# Build the deletion table once instead of constructing a set for every single
# character of every row; str.translate then removes the symbols in one pass.
_symbol_deletion_table = str.maketrans('', '', special_symbols)
tumd['cleanProduct'] = tumd['Product'].apply(lambda rss: rss.translate(_symbol_deletion_table))

2. Convert all characters to lowercase

In [None]:
# Fold the cleaned text to lowercase, overwriting the column in place.
tumd['cleanProduct'] = tumd.cleanProduct.str.lower()

3. Collapse repeated whitespace and trim leading/trailing spaces

In [None]:
# Split on runs of whitespace and re-join with single spaces: this trims both
# ends and collapses repeated whitespace inside the text.
tumd['cleanProduct'] = tumd['cleanProduct'].str.split().str.join(' ')

Analyzing differences before/after preprocessing data

In [None]:
# Show 10 randomly chosen rows to compare 'Product' with 'cleanProduct'.
tumd.sample(n=10)

# Preparing ELMo vectors

Importing necessary modules

In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import time
import pickle

Loading the ELMo module

In [None]:
# Eager execution is switched off before the module is created; the vectors are
# later evaluated through explicit tf.compat.v1 sessions.
tf.compat.v1.disable_eager_execution()
elmo = hub.Module(
    "https://tfhub.dev/google/elmo/2",
    trainable=True,
)

Defining function for creating ELMo vectors

In [None]:
def elmo_vectors(text):
    """Return mean-pooled ELMo vectors for a batch of strings.

    Args:
        text: Object exposing ``tolist()`` that yields the strings to embed
            (the notebook passes a pandas Series of cleaned product names).

    Returns:
        The array produced by ``session.run`` for the axis-1 mean of the
        module's ``"elmo"`` output — presumably one row per input string.

    The placeholder, pooling op and initializer ops are created once and
    reused on later calls. Creating them inside every call adds a fresh copy
    of the ELMo ops to the default graph for each batch, so the graph keeps
    growing for the whole extraction run.
    """
    graph = tf.compat.v1.get_default_graph()
    cached = getattr(elmo_vectors, "_ops", None)
    # Rebuild on the first call, or when the default graph or the module object
    # has been replaced (e.g. the module-loading cell was executed again).
    if cached is None or cached[0] is not graph or cached[1] is not elmo:
        sentences = tf.compat.v1.placeholder(tf.string, shape=[None])
        embeddings = elmo(sentences, signature="default", as_dict=True)["elmo"]
        # NOTE(review): the mean over axis 1 is taken across every position of
        # the batch, which may include padding for shorter inputs — confirm
        # whether the module's pooled "default" output would be preferable.
        pooled = tf.reduce_mean(embeddings, 1)
        # Initializers are created after the module has been applied so that
        # they cover everything the application registered.
        init_ops = (
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer(),
        )
        cached = (graph, elmo, sentences, pooled, init_ops)
        elmo_vectors._ops = cached
    _, _, sentences, pooled, init_ops = cached

    # A fresh session per call, as before; the context manager releases it.
    with tf.compat.v1.Session() as session:
        for init_op in init_ops:
            session.run(init_op)
        return session.run(pooled, feed_dict={sentences: text.tolist()})

Splitting dataset into batches for better computation

In [None]:
# Wall-clock reference point reused by the timing reports further down.
elmo_start_time = time.time()
# Consecutive row slices of at most 100 rows each; the last may be shorter.
tumd_list = [
    tumd.iloc[start:start + 100]
    for start in range(0, len(tumd), 100)
]

Extracting ELMo vectors

In [None]:
elmo_extraction_start_time = time.time()
# One array of vectors per batch, in batch order.
tumd_elmo_train = list(
    map(elmo_vectors, (batch['cleanProduct'] for batch in tumd_list))
)
elmo_extraction_end_time = time.time()
print(
    "Total extraction time for ELMo vectors: {} seconds".format(
        elmo_extraction_end_time - elmo_extraction_start_time
    )
)

Concatenating all batches

In [None]:
elmo_concat_start_time = time.time()
# Stack the per-batch arrays along the row axis into a single matrix.
elmo_tumd_train = np.concatenate(tumd_elmo_train, axis=0)
elmo_concat_end_time = time.time()
# The end of concatenation is also the end of the whole ELMo stage.
elmo_end_time = elmo_concat_end_time
print(
    "Total concatenation time: {} seconds".format(
        elmo_concat_end_time - elmo_concat_start_time
    )
)
print(
    "Total time for ELMo vector extraction: {} seconds".format(
        elmo_end_time - elmo_start_time
    )
)

Saving output to pickle file

In [None]:
# Persist the concatenated ELMo matrix computed above.
# The object to save is `elmo_tumd_train`; `tumd_faq_train` only comes into
# existence when a pickle is loaded in the next cell, so dumping it here
# raises NameError on a fresh run.
# The `with` block guarantees the file is flushed and closed even if the dump
# fails part-way.
with open("tumd_train_05062019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_tumd_train, pickle_out)

Loading ELMo vectors pickle file

In [None]:
# Load previously saved ELMo vectors into `tumd_faq_train`, which the modelling
# cells below use as the feature matrix.
# NOTE(review): this reads "tumd_train_03062019.pickle" while the save cell
# writes "tumd_train_05062019.pickle" — confirm which file is intended; its
# rows must line up with `tumd['Label']` for the split below.
# NOTE(security): pickle.load executes arbitrary code from the file; only load
# pickles produced by a trusted run of this notebook.
# The `with` block closes the handle, which was previously left open.
with open("tumd_train_03062019.pickle", "rb") as pickle_in:
    tumd_faq_train = pickle.load(pickle_in)

# Building different models

Splitting the data into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    tumd_faq_train,
    tumd['Label'],
    test_size=0.2,
    random_state=42,
)

Importing metrics

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef

# Logistic Regression

Building a Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression
# Start the LR stopwatch just before the estimator is created and trained.
lr_start_time = time.time()
regressor = LogisticRegression()  # library defaults
regressor.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
pred_val_lr = regressor.predict(X=xvalid)
lr_end_time = time.time()
# Fit + predict time for LR alone.
print(
    "Total time spent on LR: {} seconds".format(
        lr_end_time - lr_start_time
    )
)
# Everything from the start of ELMo batching up to the end of LR prediction.
print(
    "Individual time for LR inclusive of ELMo extraction: {} seconds".format(
        lr_end_time - elmo_start_time
    )
)

Evaluation of Logistic Regression model

In [None]:
print("Logistic Regression")
# The three binary metrics share one call shape, so report them in a loop.
# NOTE(review): pos_label='FAQ' must be one of the values in yvalid — confirm
# it matches the labels of this dataset.
for metric_title, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_title, metric_fn(yvalid, pred_val_lr, pos_label='FAQ'))
print("MCC: ", matthews_corrcoef(yvalid, pred_val_lr))

# Naive Bayes

Building a Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
# Start the NB stopwatch just before the estimator is created and trained.
nb_start_time = time.time()
nbclassifier = GaussianNB()  # library defaults
nbclassifier.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
pred_val_nb = nbclassifier.predict(X=xvalid)
nb_end_time = time.time()
# Fit + predict time for NB alone.
print(
    "Total time spent on NB: {} seconds".format(
        nb_end_time - nb_start_time
    )
)
# Time since the start of ELMo batching, minus the span between the LR start
# and the NB start, so the earlier model's time is not counted against NB.
print(
    "Individual time for NB inclusive of ELMo extraction: {} seconds".format(
        nb_end_time - elmo_start_time - (nb_start_time - lr_start_time)
    )
)

Evaluation of Naive Bayes Classifier

In [None]:
print("Naive Bayes")
# The three binary metrics share one call shape, so report them in a loop.
# NOTE(review): pos_label='FAQ' must be one of the values in yvalid — confirm
# it matches the labels of this dataset.
for metric_title, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_title, metric_fn(yvalid, pred_val_nb, pos_label='FAQ'))
print("MCC: ", matthews_corrcoef(yvalid, pred_val_nb))

# Linear SVM

Creating a Linear SVM (SGD) Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# Start the SGD stopwatch just before the estimator is created and trained.
sgd_start_time = time.time()
sgdclassifier = SGDClassifier()  # library defaults
sgdclassifier.fit(X=xtrain, y=ytrain)

Predicting on the validation set

In [None]:
pred_val_sgd = sgdclassifier.predict(X=xvalid)
sgd_end_time = time.time()
# Fit + predict time for the SGD classifier alone.
print(
    "Total time spent on SVM (SGD): {} seconds".format(
        sgd_end_time - sgd_start_time
    )
)
# Time since the start of ELMo batching, minus the span between the LR start
# and the SGD start, so the earlier models' time is not counted against SGD.
print(
    "Individual time for SVM (SGD) inclusive of ELMo extraction: {} seconds".format(
        sgd_end_time - elmo_start_time - (sgd_start_time - lr_start_time)
    )
)

Evaluation of Linear SVM (SGD) Classifier

In [None]:
print("Linear SVM (SGD)")
# The three binary metrics share one call shape, so report them in a loop.
# NOTE(review): pos_label='FAQ' must be one of the values in yvalid — confirm
# it matches the labels of this dataset.
for metric_title, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_title, metric_fn(yvalid, pred_val_sgd, pos_label='FAQ'))
print("MCC: ", matthews_corrcoef(yvalid, pred_val_sgd))