## Setup

In [1]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.layers import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
import os
import csv
from random import random, sample, seed

## Import data

In [3]:
df = pd.read_csv('new_faq.csv')
df.head(10)

Unnamed: 0,Query,Answer,Score
0,ace savings account platinum debit card annual...,20 per txn,0.134826
1,ace savings account platinum debit card annual...,Nil Charges,1.0
2,ace savings account platinum debit card annual...,Nil charges upto 5 transactions; thereafter Rs...,0.355285
3,ace savings account platinum debit card annual...,Rs.25 per transaction,0.64148
4,ace savings account platinum debit card annual...,Rs. 750,0.80336
5,ace savings account platinum debit card annual...,Rs. 250,0.851215
6,ace savings account platinum debit card annual...,Rs. 500,0.061913
7,ace savings account platinum debit card annual...,All txns are Chargeable,-0.033971
8,ace savings account platinum debit card annual...,8.50 per txn (All the charges are exclusive of...,0.026407
9,ace savings account platinum debit card annual...,150 per txn,0.171966


In [4]:
queries = list(df['Query'])
answers = list(df['Answer'])
prob = np.array(df['Score'], dtype=float)
prob.shape

(9972,)

In [6]:
print(len(queries))
print(len(answers))
print('Unique questions:',len(set(queries)))
print('Unique answers:',len(set(answers)))

9972
9972
Unique questions: 545
Unique answers: 18


## Pre-processing

In [7]:
max_features = 40000
vocab_size=2000
seq_maxlen=35

In [8]:
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence, Tokenizer

word_tokenizer = Tokenizer(max_features)
word_tokenizer.fit_on_texts(queries+answers)

queries_tf = word_tokenizer.texts_to_sequences(queries)
answers_tf = word_tokenizer.texts_to_sequences(answers)
print(queries_tf[52])

queries_tf = sequence.pad_sequences(queries_tf, maxlen=seq_maxlen)
answers_tf = sequence.pad_sequences(answers_tf, maxlen=seq_maxlen)
print(queries_tf[52])

[58, 5, 4, 84, 1, 2, 11, 3]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0 58  5  4 84  1  2 11  3]


## Load pre-trained embeddings

In [9]:
WORD2VEC_EMBED_SIZE = 100
QA_EMBED_SIZE = 64

In [10]:
embeddings_path = 'glove.6B/glove.6B.100d.txt'

In [11]:
embedding_vectors = {}

with open(embeddings_path, 'r') as f:
    for line in f:
        line_split = line.strip().split(" ")
        vec = np.array(line_split[1:], dtype=float)
        word = line_split[0]
        embedding_vectors[word] = vec

In [12]:
weights_matrix = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))

for word, i in word_tokenizer.word_index.items():

    embedding_vector = embedding_vectors.get(word)
    if (embedding_vector is not None) and i <= vocab_size:
        weights_matrix[i] = embedding_vector

#index 0 vector should be all zeroes, index 1 vector should be the same one as above
print(weights_matrix[2,:])

[ 1.6292e-01 -3.1798e-01  4.2328e-01 -8.6767e-01  4.5101e-01  5.7857e-01
  2.6645e-02 -1.2648e-01  3.3465e-01 -4.2047e-02 -4.0596e-02  1.6478e-01
 -6.7344e-01 -3.3751e-01  3.5913e-01  5.7383e-01  8.4620e-01  3.6374e-01
  3.0630e-01 -6.8050e-02 -6.7610e-01 -1.9147e-01 -1.4594e-01  3.2621e-03
  6.6949e-01 -3.3588e-01  1.7868e-01 -3.9360e-01  1.7700e-01 -3.3642e-01
  1.9288e-01  1.0030e+00 -2.1794e-01  2.4271e-01  1.0935e+00 -1.0303e-01
 -7.9197e-01 -1.3506e-01  1.2156e-01 -9.8377e-01  1.0300e+00 -1.0242e+00
  6.0269e-01 -1.5986e-01 -2.6773e-01 -5.5630e-01  2.5834e-01 -8.5021e-02
 -1.5221e-01 -3.3717e-01  2.6358e-02  2.3171e-01 -1.8056e-01  5.7107e-01
  3.8556e-01 -1.5732e+00 -1.4902e-01  3.7826e-02  1.8485e+00  7.0210e-01
 -1.1697e-01  7.7822e-02  7.4620e-02  9.9570e-02 -2.1427e-01 -6.0061e-01
  9.4903e-02  8.0589e-01  5.5333e-01 -3.1359e-01 -9.0991e-01  5.3645e-02
 -1.4494e-01 -4.8532e-01  1.0335e-01  1.2182e+00 -2.2199e-01 -1.4934e-02
 -1.1355e+00  3.2790e-01  1.1733e+00 -5.2838e-01 -6

## Build model

In [13]:
BATCH_SIZE = 32
NBR_EPOCHS = 20

In [22]:
# output: (None, QA_EMBED_SIZE, seq_maxlen)
qin = Input(shape=(seq_maxlen,), dtype="int32")
qenc = Embedding(input_dim=vocab_size,
                 output_dim=WORD2VEC_EMBED_SIZE,
                 input_length=seq_maxlen,
                 weights=[weights_matrix])(qin)
qenc = LSTM(QA_EMBED_SIZE, return_sequences=True)(qenc)
qenc = Dropout(0.3)(qenc)

# output: (None, QA_EMBED_SIZE, seq_maxlen)
ain = Input(shape=(seq_maxlen,), dtype="int32")
aenc = Embedding(input_dim=vocab_size,
                 output_dim=WORD2VEC_EMBED_SIZE,
                 input_length=seq_maxlen,
                 weights=[weights_matrix])(ain)
aenc = LSTM(QA_EMBED_SIZE, return_sequences=True)(aenc)
aenc = Dropout(0.3)(aenc)

# attention model
attn = merge([qenc, aenc], mode="dot", dot_axes=[1, 1])
attn = Flatten()(attn)
attn = Dense(seq_maxlen * QA_EMBED_SIZE)(attn)
attn = Reshape((seq_maxlen, QA_EMBED_SIZE))(attn)

qenc_attn = merge([qenc, attn], mode="sum")
qenc_attn = Flatten()(qenc_attn)

output = Dense(2, activation="softmax")(qenc_attn)

model = Model(input=[qin, ain], output=[output])

  name=name)


In [23]:
print("Compiling model...")
model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

Compiling model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 35, 100)      200000      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 35, 100)      200000      input_4[0][0]                    
__________________________________________________________________________________________

In [19]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))

from keras.utils import plot_model
plot_model(model, to_file='lstm_attn.png', show_shapes=True)

## Train the model

In [15]:
from keras.utils.np_utils import to_categorical
prob=to_categorical(prob)

In [16]:
from random import random, sample, seed

seed(123)
split = 0.3
idx = sample(range(queries_tf.shape[0]), queries_tf.shape[0])

#shuffle
queries_tf = queries_tf[idx, :]
answers_tf = answers_tf[idx, :]
prob = prob[idx, ]

In [17]:
print(queries_tf.shape)
print(answers_tf.shape)
print(prob.shape)

(9972, 35)
(9972, 35)
(9972, 2)


In [18]:
from keras.callbacks import CSVLogger
csv_logger = CSVLogger('lstm_atten_training.csv')

In [24]:
model.fit([queries_tf, answers_tf], 
          prob,
          batch_size=BATCH_SIZE,
          epochs=NBR_EPOCHS,
          validation_split=split, 
          callbacks=[csv_logger])

Train on 6980 samples, validate on 2992 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1321a98d0>