# Bidirectional LSTM and CNN Model

Model to classify user generated content (UGC) as FAQ or not FAQ

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# Loading Embeddings

Importing necessary modules

In [1]:
import codecs
from tqdm import tqdm
import numpy as np

Loading the embeddings into a set

In [2]:
embeddings_index = {}
vec = codecs.open('faqtrain.vec', encoding='utf-8')
for line in tqdm(vec):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
vec.close()
print("Word vectors: ", len(embeddings_index))

2838it [00:00, 8308.33it/s]


Word vectors:  2838


# Loading training and testing data

Importing necessary modules

In [3]:
import pandas as pd

Creating data frames for training and test data

In [4]:
faq_train = pd.read_excel("faqtrain.xlsx")
faq_test = pd.read_excel("faqtest.xlsx")

In [5]:
faq_test.shape

(424, 4)

# Tokenizing the input data

Importing necessary modules

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

Using TensorFlow backend.


Assigning max sequence length

In [7]:
faq_train["Doc Length"] = faq_train["Question Title"].apply(lambda words: len(words.split(" ")))
MAX_SEQ_LEN = np.round(faq_train["Doc Length"].mean() + faq_train["Doc Length"].std()).astype(int)
print("Max sequence length = ", MAX_SEQ_LEN)

Max sequence length =  12


Loading train and test lists

In [8]:
MAX_NB_WORDS = 1500000
label_names = ['Remarks']
y_train = faq_train[label_names].values
processed_docs_train = faq_train['Question Title'].tolist()
processed_docs_test = faq_test['Question Title'].tolist()

Tokenizing input data

In [9]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS, lower = True, char_level = False)
tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("Dictionary size: ", len(word_index))

Dictionary size:  13024


Padding sequences to same length

In [10]:
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen = MAX_SEQ_LEN)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen = MAX_SEQ_LEN)

# Preparing embedding matrix

Creating the embedding matrix

In [11]:
embed_dim = 300
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)

Printing embedding results

In [12]:
print("Number of null word embeddings: ",np.sum(np.sum(embedding_matrix, axis=1) == 0))
print("Words not found: ", len(words_not_found))

Number of null word embeddings:  11893
Words not found:  11892


# Training the core model

Importing necessary modules

In [13]:
import keras
from keras import backend as K
from keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

Defining model parameters

In [14]:
num_filters = 512
weight_decay = 1e-4
num_classes = len(label_names)
batch_size = 1024
num_epochs = 5
embed_dim = 300 

Building the sequential model

In [15]:
model = Sequential()
model.add(Embedding(nb_words, embed_dim, weights = [embedding_matrix], input_length = MAX_SEQ_LEN, trainable=False))
model.add(Bidirectional(LSTM(100, return_sequences = True, dropout = 0.25, recurrent_dropout = 0.1)))
model.add(Conv1D(num_filters, 5, activation = 'relu', padding = 'same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation = 'relu', padding = 'same'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.2))
model.add(Dense(32, activation = 'relu', kernel_regularizer = regularizers.l2(weight_decay)))
model.add(Dropout(0.6))
model.add(Flatten())
model.add(Dense(num_classes, activation = 'sigmoid'))

W0618 15:32:15.135451  3272 deprecation.py:506] From C:\Users\Pooja\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py:4081: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Optimizing the model

In [16]:
adam = Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-08, decay = 0.0)
model.compile(loss  ='binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

Generating a summary

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 12, 300)           3907500   
_________________________________________________________________
bidirectional (Bidirectional (None, 12, 200)           320800    
_________________________________________________________________
conv1d (Conv1D)              (None, 12, 512)           512512    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 6, 512)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 6, 512)            1835520   
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 3, 512)            0         
_________________________________________________________________
dropout (Dropout)            (None, 3, 512)            0

Encoding training labels

In [18]:
y_train = y_train.reshape(-1,)

In [19]:
y = pd.factorize(y_train)

Training the model

In [20]:
model.fit(word_seq_train, y[0], batch_size = batch_size, epochs = num_epochs, validation_split = 0.1, shuffle = True, verbose = 2)

Train on 18258 samples, validate on 2029 samples
Epoch 1/5
18258/18258 - 87s - loss: 0.3575 - accuracy: 0.8343 - val_loss: 0.0646 - val_accuracy: 0.9832
Epoch 2/5
18258/18258 - 84s - loss: 0.1768 - accuracy: 0.9306 - val_loss: 0.0539 - val_accuracy: 0.9857
Epoch 3/5
18258/18258 - 87s - loss: 0.1461 - accuracy: 0.9441 - val_loss: 0.0503 - val_accuracy: 0.9862
Epoch 4/5
18258/18258 - 91s - loss: 0.1327 - accuracy: 0.9516 - val_loss: 0.0504 - val_accuracy: 0.9872
Epoch 5/5
18258/18258 - 86s - loss: 0.1248 - accuracy: 0.9534 - val_loss: 0.0481 - val_accuracy: 0.9887


<tensorflow.python.keras.callbacks.History at 0x13e59e50f0>

# Score analysis

Encoding test labels

In [21]:
y_test = faq_test[label_names].values
y_test = y_test.reshape(-1,)
y_t = pd.factorize(y_test)

Printing score

In [22]:
score = model.evaluate(word_seq_test, y_t[0], batch_size = batch_size)
score



[2.5613081455230713, 0.125]

In [23]:
score = model.evaluate(word_seq_train, y[0], batch_size = batch_size)
score



[0.09576678113549963, 0.9653473]

# Prediction of model

Predicting on test data

In [24]:
y_pred = model.predict(word_seq_test)

Saving predictions

In [25]:
submission_df = pd.DataFrame(columns = ['QT'] + label_names)
submission_df['QT'] = faq_test['Question Title'].values 
submission_df[label_names] = y_pred 
submission_df.to_excel("faq_BiLSTM_CNN_pred.xlsx")