# Bidirectional LSTM and CNN Model

A model that classifies user-generated content (UGC) as FAQ or not FAQ.

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# Loading Embeddings

Importing necessary modules

In [None]:
import codecs
from tqdm import tqdm
import numpy as np

Loading the embeddings into a dictionary that maps each word to its vector

In [None]:
# Build a word -> vector lookup from the text embedding file, where each line
# is a token followed by its space-separated vector components.
embeddings_index = {}
# The context manager guarantees the file is closed even if a line fails to parse.
with codecs.open('faqtrain.vec', encoding='utf-8') as vec:
    for line_no, line in enumerate(tqdm(vec)):
        values = line.rstrip().rsplit(' ')
        # NOTE(review): fastText-style .vec files begin with a "<vocab_size> <dim>"
        # header line. If present it is not a word vector, and storing it would
        # create a bogus 1-element "embedding", so a two-field first line is skipped.
        if line_no == 0 and len(values) == 2:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print("Word vectors: ", len(embeddings_index))

# Loading training and testing data

Importing necessary modules

In [None]:
import pandas as pd

Creating data frames for training and test data

In [None]:
# Read the training workbook first, then the test workbook, each into its own frame.
faq_train, faq_test = (
    pd.read_excel(path) for path in ("faqtrain.xlsx", "faqtest.xlsx")
)

In [None]:
# Display (rows, columns) of the test frame as a quick check on the load.
faq_test.shape

# Tokenizing the input data

Importing necessary modules

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

Setting the maximum sequence length to the mean question length (in words) plus one standard deviation

In [None]:
# Store the number of space-separated words of every training question on the frame.
faq_train["Doc Length"] = [
    len(title.split(" ")) for title in faq_train["Question Title"]
]
# Padded length = mean + one standard deviation of the word counts, rounded to an int.
doc_lengths = faq_train["Doc Length"]
MAX_SEQ_LEN = np.round(doc_lengths.mean() + doc_lengths.std()).astype(int)
print("Max sequence length = ", MAX_SEQ_LEN)

Loading train and test lists

In [None]:
# Upper bound on the vocabulary size handed to the tokenizer.
MAX_NB_WORDS = 1500000
# Name(s) of the target column(s) in the spreadsheets.
label_names = ['Remarks']
# Raw question texts of both splits as plain Python lists.
processed_docs_train, processed_docs_test = (
    frame['Question Title'].tolist() for frame in (faq_train, faq_test)
)
# Training targets as a 2-D array (one column per entry of label_names).
y_train = faq_train[label_names].values

Tokenizing input data

In [None]:
# Word-level, lower-casing tokenizer whose vocabulary is fitted on the
# questions of both splits.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)
# Convert each question into its list of integer word ids.
word_seq_train, word_seq_test = (
    tokenizer.texts_to_sequences(docs)
    for docs in (processed_docs_train, processed_docs_test)
)
word_index = tokenizer.word_index
print("Dictionary size: ", len(word_index))

Padding sequences to same length

In [None]:
# Bring every id sequence of both splits to exactly MAX_SEQ_LEN entries.
word_seq_train, word_seq_test = (
    sequence.pad_sequences(seqs, maxlen=MAX_SEQ_LEN)
    for seqs in (word_seq_train, word_seq_test)
)

# Preparing embedding matrix

Creating the embedding matrix

In [None]:
# Row r of the matrix holds the pre-trained vector of the word whose tokenizer
# id is r; rows of words without a pre-trained vector stay all-zero.
embed_dim = 300
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_dim))
words_not_found = []
for token, row in word_index.items():
    # Ids at or beyond nb_words have no row in the matrix.
    if row < nb_words:
        vector = embeddings_index.get(token)
        if vector is None or len(vector) == 0:
            words_not_found.append(token)
        else:
            embedding_matrix[row] = vector

Printing embedding results

In [None]:
# Rows whose components sum to zero are counted as "null" embeddings.
print("Number of null word embeddings: ", (embedding_matrix.sum(axis=1) == 0).sum())
print("Words not found: ", len(words_not_found))

# Training the core model

Importing necessary modules

In [None]:
import keras
from keras import backend as K
from keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

Defining model parameters

In [None]:
# --- Architecture ---
# Width of each embedding vector; must match the column count of embedding_matrix.
embed_dim = 300
# Number of filters in each Conv1D layer.
num_filters = 512
# One output unit per label column.
num_classes = len(label_names)

# --- Regularisation ---
# L2 factor applied to the kernel of the intermediate Dense layer.
weight_decay = 1e-4

# --- Training schedule ---
batch_size = 1024
num_epochs = 5

Building the sequential model

In [None]:
# Layer stack, in order: frozen pre-trained embeddings -> bidirectional LSTM
# (full sequence output) -> two Conv1D + MaxPooling1D stages -> Dropout ->
# Dense(32) applied before flattening -> Dropout -> Flatten -> sigmoid output.
model = Sequential([
    Embedding(
        nb_words,
        embed_dim,
        weights=[embedding_matrix],
        input_length=MAX_SEQ_LEN,
        trainable=False,
    ),
    Bidirectional(
        LSTM(100, return_sequences=True, dropout=0.25, recurrent_dropout=0.1)
    ),
    Conv1D(num_filters, 5, activation='relu', padding='same'),
    MaxPooling1D(2),
    Conv1D(num_filters, 7, activation='relu', padding='same'),
    MaxPooling1D(2),
    Dropout(0.2),
    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
    Dropout(0.6),
    Flatten(),
    Dense(num_classes, activation='sigmoid'),
])

Configuring the optimizer and compiling the model

In [None]:
# Adam optimizer with explicit hyper-parameters. `learning_rate` replaces the
# deprecated `lr` keyword, and the no-op `decay = 0.0` is dropped because newer
# tf.keras optimizers reject the `decay` argument.
adam = Adam(learning_rate = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-08)
# Binary cross-entropy matches the sigmoid output layer; accuracy is tracked as a metric.
model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

Generating a summary

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

Encoding training labels

In [None]:
# Collapse the label array to one dimension (one entry per training row).
y_train = y_train.ravel()

In [None]:
# pd.factorize returns a (codes, uniques) tuple: y[0] holds one integer code per
# training label, numbered in order of first appearance, and y[1] holds the
# distinct label values.
y = pd.factorize(y_train)

Training the model

In [None]:
# Fit on the padded training sequences and their integer label codes, with 10%
# of the training rows held out for validation.
model.fit(
    word_seq_train,
    y[0],
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
    verbose=2,
)

# Score analysis

Encoding test labels

In [None]:
# Encode the test labels with the SAME label -> integer mapping as training.
# pd.factorize numbers labels by order of first appearance, so factorizing the
# test labels on their own could give the two classes the opposite codes and
# silently invert the test metrics.
y_test = faq_test[label_names].values
y_test = y_test.reshape(-1,)
# Distinct training labels; a label's position here is its training code.
train_classes = y[1]
# get_indexer returns each test label's position in train_classes, or -1 for a
# label never seen in training. y_t keeps the (codes, uniques) layout, so
# y_t[0] is still the code array.
y_t = (pd.Index(train_classes).get_indexer(y_test), train_classes)

Printing score

In [None]:
# Loss and accuracy (the metric configured at compile time) on the test split.
score = model.evaluate(x=word_seq_test, y=y_t[0], batch_size=batch_size)
score

In [None]:
# Same evaluation on the training split, for comparison with the test score.
score = model.evaluate(x=word_seq_train, y=y[0], batch_size=batch_size)
score

# Prediction of model

Predicting on test data

In [None]:
# Run the model on the padded test sequences; the final layer is
# Dense(num_classes, sigmoid), so each row holds one score per label column.
y_pred = model.predict(word_seq_test)

Saving predictions

In [None]:
# Assemble the output table: the question text in 'QT' plus one column per
# label name holding the model's predicted score.
submission_df = pd.DataFrame(columns = ['QT'] + label_names)
# Original test questions, in the same row order as the predictions.
submission_df['QT'] = faq_test['Question Title'].values 
# Predicted scores fill the label column(s).
submission_df[label_names] = y_pred 
# Write the table to an Excel workbook in the working directory.
submission_df.to_excel("faq_BiLSTM_CNN_pred.xlsx")