## Setup

In [1]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.layers import *

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
import os
import csv
from random import random, sample, seed

## Import data

In [4]:
df = pd.read_csv('new_faq_abs.csv')
df.head()

Unnamed: 0,Query,Answer,Score
0,ace savings account platinum debit card annual...,20 per txn,0.231745
1,ace savings account platinum debit card annual...,Nil Charges,1.0
2,ace savings account platinum debit card annual...,Nil charges upto 5 transactions; thereafter Rs...,0.464369
3,ace savings account platinum debit card annual...,Rs.25 per transaction,0.309488
4,ace savings account platinum debit card annual...,Rs. 750,0.797618


In [5]:
queries = list(df['Query'])
answers = list(df['Answer'])
prob = np.array(df['Score'], dtype=float)

In [6]:
print(len(queries))
print(len(answers))
print(prob.shape)
print('Unique questions:',len(set(queries)))
print('Unique answers:',len(set(answers)))

## Pre-processing

In [7]:
max_features = 40000
vocab_size=2000
seq_maxlen=35

In [8]:
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence, Tokenizer

word_tokenizer = Tokenizer(max_features)
word_tokenizer.fit_on_texts(queries+answers)

queries_tf = word_tokenizer.texts_to_sequences(queries)
answers_tf = word_tokenizer.texts_to_sequences(answers)
print(queries_tf[52])

queries_tf = sequence.pad_sequences(queries_tf, maxlen=seq_maxlen)
answers_tf = sequence.pad_sequences(answers_tf, maxlen=seq_maxlen)
print(queries_tf[52])

[58, 5, 4, 84, 1, 2, 11, 3]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0 58  5  4 84  1  2 11  3]


### Load pre-trained embeddings

In [9]:
WORD2VEC_EMBED_SIZE = 100
QA_EMBED_SIZE = 64

In [10]:
embeddings_path = '/Users/ardellelee/Downloads/glove.6B/glove.6B.100d.txt'

In [11]:
embedding_vectors = {}

with open(embeddings_path, 'r') as f:
    for line in f:
        line_split = line.strip().split(" ")
        vec = np.array(line_split[1:], dtype=float)
        word = line_split[0]
        embedding_vectors[word] = vec

In [12]:
weights_matrix = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))

for word, i in word_tokenizer.word_index.items():

    embedding_vector = embedding_vectors.get(word)
    if (embedding_vector is not None) and i <= vocab_size:
        weights_matrix[i] = embedding_vector

#index 0 vector should be all zeroes, index 1 vector should be the same one as above
print(weights_matrix[2,:])

[  1.62920000e-01  -3.17980000e-01   4.23280000e-01  -8.67670000e-01
   4.51010000e-01   5.78570000e-01   2.66450000e-02  -1.26480000e-01
   3.34650000e-01  -4.20470000e-02  -4.05960000e-02   1.64780000e-01
  -6.73440000e-01  -3.37510000e-01   3.59130000e-01   5.73830000e-01
   8.46200000e-01   3.63740000e-01   3.06300000e-01  -6.80500000e-02
  -6.76100000e-01  -1.91470000e-01  -1.45940000e-01   3.26210000e-03
   6.69490000e-01  -3.35880000e-01   1.78680000e-01  -3.93600000e-01
   1.77000000e-01  -3.36420000e-01   1.92880000e-01   1.00300000e+00
  -2.17940000e-01   2.42710000e-01   1.09350000e+00  -1.03030000e-01
  -7.91970000e-01  -1.35060000e-01   1.21560000e-01  -9.83770000e-01
   1.03000000e+00  -1.02420000e+00   6.02690000e-01  -1.59860000e-01
  -2.67730000e-01  -5.56300000e-01   2.58340000e-01  -8.50210000e-02
  -1.52210000e-01  -3.37170000e-01   2.63580000e-02   2.31710000e-01
  -1.80560000e-01   5.71070000e-01   3.85560000e-01  -1.57320000e+00
  -1.49020000e-01   3.78260000e-02

## Build model

In [13]:
BATCH_SIZE = 32
NBR_EPOCHS = 20

In [14]:
qenc = Sequential()
qenc.add(Embedding(output_dim=WORD2VEC_EMBED_SIZE, input_dim=vocab_size,
                   input_length=seq_maxlen,
                   weights=[weights_matrix]))
qenc.add(LSTM(QA_EMBED_SIZE, return_sequences=True))
qenc.add(Dropout(0.3))
qenc.add(Convolution1D(QA_EMBED_SIZE // 2, 5, border_mode="valid"))
qenc.add(MaxPooling1D(pool_length=2, border_mode="valid"))
qenc.add(Dropout(0.3))
qenc.add(Flatten())

aenc = Sequential()
aenc.add(Embedding(output_dim=WORD2VEC_EMBED_SIZE, input_dim=vocab_size,
                   input_length=seq_maxlen,
                   weights=[weights_matrix]))
aenc.add(LSTM(QA_EMBED_SIZE, return_sequences=True))
aenc.add(Dropout(0.3))
aenc.add(Convolution1D(QA_EMBED_SIZE // 2, 3, border_mode="valid"))
aenc.add(MaxPooling1D(pool_length=2, border_mode="valid"))
aenc.add(Dropout(0.3))
aenc.add(Flatten())

model= Sequential()
model.add(Merge([qenc, aenc], mode="concat", concat_axis=-1))
model.add(Dense(1, activation="softmax"))

  import sys
  


In [15]:
print("Compiling model...")
#model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.compile(optimizer="adam", loss="mse", metrics=["mse"])
model.summary()

Compiling model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 992)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 993       
Total params: 501,921
Trainable params: 501,921
Non-trainable params: 0
_________________________________________________________________


In [17]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))

from keras.utils import plot_model
plot_model(model, to_file='lstm_cnn1.png', show_shapes=True)

## Train the model

In [15]:
#from keras.utils.np_utils import to_categorical
#prob=to_categorical(prob)

In [27]:
#prob = np.array(df['Score'], dtype=float)

In [18]:
from random import random, sample, seed

seed(123)
split = 0.3
idx = sample(range(queries_tf.shape[0]), queries_tf.shape[0])

#shuffle
queries_tf = queries_tf[idx, :]
answers_tf = answers_tf[idx, :]
prob = prob[idx, ]

print(queries_tf.shape)
print(answers_tf.shape)
print(prob.shape)

(9972, 35)
(9972, 35)
(9972,)


In [21]:
from keras.callbacks import CSVLogger
csv_logger = CSVLogger('lstm_attn_training.csv')

In [19]:
model.fit([queries_tf, answers_tf], 
          prob,
          batch_size=100,
          epochs=10,
          validation_split=split, 
          #callbacks=[csv_logger]
         )

Train on 6980 samples, validate on 2992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 