## Identification of Quora Duplicates using Embed, Encode, Attend & Predict
(based on methods described by Matthew Honnibal at https://explosion.ai/blog/deep-learning-formula-nlp#entailment)

In [None]:
#! pip install keras
#! pip install spacy
#! python -m spacy download en_vectors_web_lg

#### import spacy, numpy and other utility libraries

In [2]:
import spacy
import numpy as np

In [3]:
import pandas as pd
import numpy as np
import pickle

In [4]:
data_folder = '/media/siri/78C6823EC681FD1E/minio/data/dq-data/dl/'
input_folder = '/media/siri/78C6823EC681FD1E/minio/data/dq-data/'

#### load previously split data using train_test_split

In [5]:
X_train = pickle.load(open(input_folder+'X_train.p', 'rb'))
X_test = pickle.load(open(input_folder+'X_test.p', 'rb'))
y_train = pickle.load(open(input_folder+'y_train.p', 'rb'))
y_test = pickle.load(open(input_folder+'y_test.p', 'rb'))

In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270872 entries, 186150 to 121959
Data columns (total 4 columns):
qid1         270872 non-null int64
qid2         270872 non-null int64
question1    270872 non-null object
question2    270872 non-null object
dtypes: int64(2), object(2)
memory usage: 10.3+ MB


#### load spacy glove vectors

In [47]:
nlp = spacy.load('en_vectors_web_lg')

#### function to retrieve the word vectors 
Used to convert the train and test datasets (text to IDs) based on the glove vectors. 
Accounts for OOV tokens by adding a set of OOV vectors and assigning them randomly to OOV tokens. 
This function has been taken from https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment and adapted to this problem

In [48]:
def create_dataset(nlp, X, num_oov, max_length, norm_vectors = True):
    len_q1 = X['question1'].size
    sents = pd.concat([X['question1'], X['question2']]).values
    
    # the extra +1 is for a zero vector represting NULL for padding
    num_vectors = max(lex.rank for lex in nlp.vocab) + 2 
    
    # create random vectors for OOV tokens
    oov = np.random.normal(size=(num_oov, nlp.vocab.vectors_length))
    oov = oov / oov.sum(axis=1, keepdims=True)
    
    vectors = np.zeros((num_vectors + num_oov, nlp.vocab.vectors_length), dtype='float32')
    vectors[num_vectors:, ] = oov
    for lex in nlp.vocab:
        if lex.has_vector and lex.vector_norm > 0:
            vectors[lex.rank + 1] = lex.vector / lex.vector_norm if norm_vectors == True else lex.vector
            
    sents_as_ids = []
    for sent in sents:
        doc = nlp(sent)
        word_ids = []
        
        for i, token in enumerate(doc):
            # skip odd spaces from tokenizer
            if token.has_vector and token.vector_norm == 0:
                continue
                
            if i > max_length:
                break
                
            if token.has_vector:
                word_ids.append(token.rank + 1)
            else:
                # if we don't have a vector, pick an OOV entry
                word_ids.append(token.rank % num_oov + num_vectors) 
                
        # there must be a simpler way of generating padded arrays from lists...
        word_id_vec = np.zeros((max_length), dtype='int')
        clipped_len = min(max_length, len(word_ids))
        word_id_vec[:clipped_len] = word_ids[:clipped_len]
        sents_as_ids.append(word_id_vec)        
        
    return vectors, np.array(sents_as_ids[:len_q1]), np.array(sents_as_ids[len_q1:])

#### convert train dataset (text to IDs) based on the glove vectors

In [None]:
w2v, q1_train_w2v, q2_train_w2v = create_dataset(nlp, X_train, 100, 50, True)

#### convert test dataset (text to IDs) based on the glove vectors

In [None]:
_, q1_test_w2v, q2_test_w2v = create_dataset(nlp, X_test, 100, 50, True)

In [None]:
q1_train_w2v.shape

#### pickle the word vectors and train and test token IDs for later use

In [None]:
pickle.dump(w2v, open(data_folder+'w2v.p', 'wb'))
pickle.dump(q1_train_w2v, open(data_folder+'q1_train_w2v.p', 'wb'))
pickle.dump(q2_train_w2v, open(data_folder+'q2_train_w2v.p', 'wb'))
pickle.dump(q1_test_w2v, open(data_folder+'q1_test_w2v.p', 'wb'))
pickle.dump(q2_test_w2v, open(data_folder+'q2_test_w2v.p', 'wb'))

## Modeling

#### load previously pickled word vectors and train and test token IDs

In [5]:
w2v = pickle.load(open(data_folder+'w2v.p', 'rb'))
q1_train_w2v = pickle.load(open(data_folder+'q1_train_w2v.p', 'rb'))
q2_train_w2v = pickle.load(open(data_folder+'q2_train_w2v.p', 'rb'))
q1_test_w2v = pickle.load(open(data_folder+'q1_test_w2v.p', 'rb'))
q2_test_w2v = pickle.load(open(data_folder+'q2_test_w2v.p', 'rb'))

#### import keras and scikit-learn

In [11]:
from keras import layers, Model, models, initializers
from keras.engine.topology import Layer
from keras import backend as K
from keras.optimizers import Adam,RMSprop, Nadam, SGD
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Using TensorFlow backend.


In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

#### configure tensorflow backend to dynamically grow memory (prevent OOM)

In [9]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
                                    # (nothing gets printed in Jupyter, only if you run it standalone)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default session for Keras

### Method 1 - Decomposable Attention Model
(based on Example 1. A Decomposable Attention Model for Natural Language Inference from https://explosion.ai/blog/deep-learning-formula-nlp. Method 1 functions have been taken from https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment and adapted to this problem)

#### function to create embedding (word vector embeddings) for the NN
(used by both models)

In [10]:
def create_embedding(vectors, max_length, projected_dim):
    return models.Sequential([
        layers.Embedding(
            vectors.shape[0],
            vectors.shape[1],
            input_length=max_length,
            weights=[vectors],
            trainable=False),
        
        layers.TimeDistributed(
            layers.Dense(projected_dim,
                         activation=None,
                         use_bias=False))
    ])

#### function to create feed-forward layer for the NN

In [11]:
def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):
    return models.Sequential([
        layers.Dense(num_units, activation=activation, use_bias=True),
        layers.Dropout(dropout_rate),
        layers.Dense(num_units, activation=activation, use_bias=True),
        layers.Dropout(dropout_rate)
    ])

#### function to normalize attention weights
(as described in 3.1 of https://arxiv.org/pdf/1606.01933v1.pdf)

In [12]:
def normalizer(axis):
    def _normalize(att_weights):
        exp_weights = K.exp(att_weights)
        sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)
        return exp_weights/sum_weights
    return _normalize

#### function to sum the comparison vectors
(as described in 3.3 of https://arxiv.org/pdf/1606.01933v1.pdf)

In [13]:
def sum_word(x):
    return K.sum(x, axis=1)

#### function to build the NN
(as described in 3.1 Attend, 3.2 Compare and 3.3 Aggregate of https://arxiv.org/pdf/1606.01933v1.pdf)

In [14]:
def build_model(vectors, max_length, num_hidden, num_classes, projected_dim,  
                dropout_rate0=0.2, dropout_rate1=0.2, dropout_rate2=0.2, dropout_rate3=0.2):
    input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')
    input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')
    
    # embeddings (projected)
    embed = create_embedding(vectors, max_length, projected_dim)
    a = embed(input1)
    b = embed(input2)     
    
    # step 1: attend
    F = create_feedforward(num_hidden, dropout_rate=dropout_rate1)
    att_weights = layers.dot([F(a), F(b)], axes=-1, normalize=True)
    
    G = create_feedforward(num_hidden, dropout_rate=dropout_rate2)    
    
    norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
    norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
    alpha = layers.dot([norm_weights_a, a], axes=1)
    beta  = layers.dot([norm_weights_b, b], axes=1)

    # step 2: compare
    comp1 = layers.concatenate([a, beta])
    comp2 = layers.concatenate([b, alpha])
    v1 = layers.TimeDistributed(G)(comp1)
    v2 = layers.TimeDistributed(G)(comp2)

    # step 3: aggregate
    v1_sum = layers.Lambda(sum_word)(v1)
    v2_sum = layers.Lambda(sum_word)(v2)
    concat = layers.concatenate([v1_sum, v2_sum])
        
    H = create_feedforward(num_hidden, dropout_rate=dropout_rate3)
    out = H(concat)
    out = layers.Dense(num_classes, activation='sigmoid', use_bias=True)(out)
    
    model = Model([input1, input2], out)
    
    model.compile(optimizer=Nadam(lr=0.00001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

#### Build NN and display model  summary
(TODO - dropout rates to be optimized further using hyperparameter optimization methods)

In [15]:
K.clear_session()
m1 = build_model(w2v, 50, 200, 1, 200, dropout_rate1=0.2, dropout_rate2=0.2, dropout_rate3=0.2)
m1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words1 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
words2 (InputLayer)             (None, 50)           0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 50, 200)      321381600   words1[0][0]                     
                                                                 words2[0][0]                     
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 50, 200)      80400       sequential_1[1][0]               
          

#### Fit NN model  with the train data
(TODO - batch_size and epochs to be optimized further using hyperparameter optimization methods)

In [16]:
m1.fit([q1_train_w2v, q2_train_w2v], y_train, batch_size=150, epochs=50,
      validation_data=([q1_test_w2v, q2_test_w2v], y_test))

Train on 270872 samples, validate on 133415 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f2b57df89e8>

#### Model Evaluation

In [17]:
target_names = ['not duplicate', 'duplicate']

In [18]:
convert_binary = lambda x: 1 if x[0] >= .5 else 0

In [19]:
#m.evaluate(test_sents,y_test)

In [21]:
y_pred_dl1 = m1.predict([q1_test_w2v, q2_test_w2v], batch_size=150)

In [22]:
y_pred_dl1_classes = np.array([convert_binary(y) for y in y_pred_dl1])

In [23]:
score_dl1 = accuracy_score(y_test.values, y_pred_dl1_classes)
rscore_dl1 = recall_score(y_test.values, y_pred_dl1_classes)
pscore_dl1 = precision_score(y_test.values, y_pred_dl1_classes)
print('Accuracy score for DL method 1 ', score_dl1)
print('Recall score for DL method 1  ', rscore_dl1)
print('Precision score for DL method 1  ', pscore_dl1)

Accuracy score for DL method 1  0.7804669639845595
Recall score for DL method 1   0.6092414747293888
Precision score for DL method 1   0.7480700527144177


In [24]:
print(classification_report(y_test.values, y_pred_dl1_classes, target_names=target_names))

               precision    recall  f1-score   support

not duplicate       0.79      0.88      0.84     84267
    duplicate       0.75      0.61      0.67     49148

    micro avg       0.78      0.78      0.78    133415
    macro avg       0.77      0.74      0.75    133415
 weighted avg       0.78      0.78      0.77    133415



### Method 2 - Hierarchical Attention Networks
(Code implemented based on algorithm described at Example 2: Hierarchical Attention Networks for Document Classification from https://explosion.ai/blog/deep-learning-formula-nlp). 

#### custom keras layer to implement the attention mechanism (with trainable weights) for the NN
(implementation based on word and sentence attention layers described in https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf and keras custom layer example https://keras.io/layers/writing-your-own-keras-layers/)

In [13]:
from keras.layers import Layer, RNN

class Attention_Layer(Layer):

    def __init__(self, output_dim):
        self.output_dim = output_dim
        super(Attention_Layer, self).__init__()

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='W', 
                                      shape=(input_shape[-1], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)
        self.b = self.add_weight(name='b', 
                                      shape=(self.output_dim,),
                                      initializer='uniform',
                                      trainable=True)
        self.u = self.add_weight(name='u', 
                                      shape=(self.output_dim,1),
                                      initializer='uniform',
                                      trainable=True)
        super(Attention_Layer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, h_it):        
        u_it = K.tanh(K.bias_add(K.dot(h_it, self.W), self.b))
        att_weights = K.dot(u_it, self.u)
        exp_weights = K.exp(att_weights)
        sum_weights = K.sum(exp_weights, axis=1, keepdims=True)
        alpha_it = exp_weights/sum_weights
        return K.sum(h_it*alpha_it, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

#### function to build the hierarchical NN
(implementation based on GRU-based word and sentence encoders and word and sentence attention layers described in https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf)

In [19]:
def build_hier_model(vectors=w2v, max_length=50, num_hidden=200, num_classes=1, 
                projected_dim=200, learn_rate=0.0001, dropout_rate1=0.2, dropout_rate2=0.2):
    
    K.clear_session()
    # input    
    model_input = layers.Input(shape=(2, max_length), dtype='int32')
    
    # embeddings (projected)
    embed = create_embedding(vectors, max_length, projected_dim)
    
    # step 1: word encoder
    word_sequence_input = layers.Input(shape=(max_length,), dtype='int32')
    h_w = layers.Bidirectional(layers.GRU(num_hidden, dropout=dropout_rate1, return_sequences=True))(embed(word_sequence_input))
    
    # step 2: word attention
    s_w = Attention_Layer(num_hidden, 1)(h_w)
    word_encode_attend = Model(word_seq_input, s_w)
    
    # step 3: sentence encoder
    sent_encode_attend = layers.TimeDistributed(word_encode_attend)(model_input)
    h = layers.Bidirectional(layers.GRU(num_hidden, dropout=dropout_rate2, return_sequences=True))(sent_encode_attend)
    
    # step 4: sentence attention
    v = Attention_Layer(num_hidden, 1)(h)
    
    # step 5: final classification
    out = layers.Dense(num_classes, activation='sigmoid', use_bias=True)(v)
    
    model = Model(model_input, out)
    
    model.compile(optimizer=Nadam(lr=learn_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

#### convert the train and test into format suitable for input into the NN 
(add a new dimension and concatenate along this dimension)

In [20]:
train_sents = np.concatenate([np.expand_dims(q1_train_w2v, axis=1),np.expand_dims(q2_train_w2v, axis=1)], axis=1)
test_sents = np.concatenate([np.expand_dims(q1_test_w2v, axis=1),np.expand_dims(q2_test_w2v, axis=1)], axis=1)

#### Hyperparameter optimization for the NN using GridSearchCV / RandomizedSearchCV 
(needs modification or alternate method like hyperas/hyperopt - as the current version takes a very long time to search)

In [None]:
model = KerasClassifier(build_fn=build_hier_model, verbose=3)
# define the grid search parameters
learn_rate = np.logspace(-6, -1, 4)
#momentum = np.linspace(0, 0.9, 4)
#optimizer = ['RMSprop', 'Adam', 'Nadam']
epochs = [10, 50, 100]
batch_size= [50, 150]
param_grid = dict(learn_rate=learn_rate, epochs=epochs, batch_size=batch_size)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_jobs=1, cv=5)
grid_result = grid.fit(train_sents, y_train)

In [None]:
grid_result.best_params_

#### Build NN and display model  summary
(TODO - dropout rates to be optimized further using hyperparameter optimization methods)

In [21]:
K.clear_session()
m2 = build_hier_model(w2v, 50, 200, 1, 200)
#m = build_hier_model(learn_rate=grid_result.best_params_['learn_rate'],
#                    momentum=grid_result.best_params_['momentum'])
m2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 2, 50)             0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 2, 400)            321943200 
_________________________________________________________________
bidirectional_2 (Bidirection (None, 2, 400)            721200    
_________________________________________________________________
attention__layer_2 (Attentio (None, 400)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 401       
Total params: 322,745,201
Trainable params: 1,423,601
Non-trainable params: 321,321,600
_________________________________________________________________


#### Fit NN model  with the train data
(TODO - batch_size and epochs to be optimized further using hyperparameter optimization methods)

In [22]:
m2.fit(train_sents, y_train, batch_size=150, epochs=50,
      validation_data=(test_sents, y_test))

Train on 270872 samples, validate on 133415 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0c8da70400>

#### Model Evaluation

In [23]:
y_pred_dl2 = m.predict(test_sents, batch_size=150)

In [28]:
y_pred_dl2_classes = np.array([convert_binary(y) for y in y_pred_dl2])

In [29]:
score_dl2 = accuracy_score(y_test.values, y_pred_dl2_classes)
rscore_dl2 = recall_score(y_test.values, y_pred_dl2_classes)
pscore_dl2 = precision_score(y_test.values, y_pred_dl2_classes)
print('Accuracy score for DL method 2 ', score_dl2)
print('Recall score for DL method 2 ', rscore_dl2)
print('Precision score for DL method 2 ', pscore_dl2)

Accuracy score for DL method 2  0.8237379604991942
Recall score for DL method 2  0.803776348986734
Precision score for DL method 2  0.7401079136690647


In [30]:
print(classification_report(y_test.values, y_pred_dl2_classes, target_names=target_names))

               precision    recall  f1-score   support

not duplicate       0.88      0.84      0.86     84267
    duplicate       0.74      0.80      0.77     49148

    micro avg       0.82      0.82      0.82    133415
    macro avg       0.81      0.82      0.81    133415
 weighted avg       0.83      0.82      0.83    133415

