# Detecting Duplicate Quora Questions
## Dataset 
1. __Question Pairs Dataset from Quora__
    
    __Link:__ https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs

    The dataset consists of over 400,000 lines of potential question duplicate pairs. Each line contains IDs for each question in the pair, the full text for each question, and a binary value that indicates whether the line truly contains a duplicate pair.
    
    
2. __GloVe Embeddings: Global Vectors for Word Representation__
   
   __Link:__ http://nlp.stanford.edu/data/glove.840B.300d.zip

In [1]:
import pandas as pd
import numpy as np

## Preprocessing

In [2]:
# Read the tab-separated question pairs and discard the identifier columns
# (`id`, `qid1`, `qid2`) in a single chained expression.
data = (
    pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t')
    .drop(['id', 'qid1', 'qid2'], axis=1)
)

In [3]:
# ---- feature set 1 (fs_1): simple length and overlap statistics ----
# Stringify each question once; a missing value becomes the text 'nan',
# exactly as a per-cell str() call would produce.
q1_text = data.question1.apply(str)
q2_text = data.question2.apply(str)

# raw character counts and their difference
data['len_q1'] = q1_text.map(len)
data['len_q2'] = q2_text.map(len)
data['diff_len'] = data['len_q1'] - data['len_q2']

# number of distinct non-space characters in each question
data['len_char_q1'] = q1_text.map(lambda text: len(set(text.replace(' ', ''))))
data['len_char_q2'] = q2_text.map(lambda text: len(set(text.replace(' ', ''))))

# number of whitespace-separated words in each question
data['len_word_q1'] = q1_text.map(lambda text: len(text.split()))
data['len_word_q2'] = q2_text.map(lambda text: len(text.split()))

# number of distinct lower-cased words shared by both questions
data['common_words'] = [
    len(set(first.lower().split()) & set(second.lower().split()))
    for first, second in zip(q1_text, q2_text)
]

# names of all columns created above
fs_1 = [
    'len_q1', 'len_q2', 'diff_len',
    'len_char_q1', 'len_char_q2',
    'len_word_q1', 'len_word_q2',
    'common_words',
]

## Examine fuzzywuzzy 
The next set of features are based on __fuzzy string matching.__

__Package required:__ __fuzzywuzzy__ and __python-levenshtein__ (an important dependency of __fuzzywuzzy__ for faster processing)

In [4]:
# examine fuzz.QRatio (a higher QRatio value means higher similarity between the two questions)
from fuzzywuzzy import fuzz

fuzz.QRatio("Why did Trump win the Presidency?", 
           "How did Donald Trump win the 2016 Presidential Election")

67

In [5]:
fuzz.QRatio("How can I start an online shopping (e-commerce) website?", 
            "Which web technology is best suitable for building a big E-Commerce website?")

60

In [6]:
# examine fuzz.partial_ratio (note the notable difference between the above two cases when using partial_ratio)
fuzz.partial_ratio("Why did Trump win the Presidency?", 
   "How did Donald Trump win the 2016 Presidential Election")

73

In [7]:
fuzz.partial_ratio("How can I start an online shopping (e-commerce) website?", 
                   "Which web technology is best suitable for building a big E-Commerce website?")

57

## Create another set of features using fuzzywuzzy

In [8]:
# Feature set 2: one column per fuzzywuzzy scorer, each applied row by row to
# the stringified question pair. The list order fixes the column order.
fuzzy_scorers = [
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
]

for feature_name, scorer in fuzzy_scorers:
    # `scorer` is bound as a default argument so the lambda keeps its own function
    data[feature_name] = data.apply(
        lambda row, scorer=scorer: scorer(str(row['question1']),
                                          str(row['question2'])),
        axis=1)

In [9]:
# mark all the above set of features as feature set-2 (fs_2)
fs_2 = ['fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio', 
       'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
       'fuzz_token_set_ratio', 'fuzz_token_sort_ratio']

## TF-IDF and SVD features
The 3rd set of features is a combination of TF-IDF and SVD.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from copy import deepcopy

In [11]:
# These TfidfVectorizer settings are a common starting point for NLP tasks,
# especially text classification; stop_words is the one most likely to need
# changing for another corpus.
# The on/off switches are passed as real booleans: newer scikit-learn releases
# validate parameter types and no longer accept 1 in place of True.
tfv_q1 = TfidfVectorizer(min_df=3,
                        max_features=None,
                        strip_accents='unicode',
                        analyzer='word',
                        token_pattern=r'\w{1,}',
                        ngram_range=(1, 2),
                        use_idf=True,
                        smooth_idf=True,
                        sublinear_tf=True,
                        stop_words='english')

# question2 gets its own, identically configured (still unfitted) vectorizer
tfv_q2 = deepcopy(tfv_q1)

In [12]:
q1_tfidf = tfv_q1.fit_transform(data.question1.fillna(""))
q2_tfidf = tfv_q2.fit_transform(data.question2.fillna(""))

In [13]:
from sklearn.decomposition import TruncatedSVD

svd_q1 = TruncatedSVD(n_components=180)
svd_q2 = TruncatedSVD(n_components=180)

In [14]:
question1_vectors = svd_q1.fit_transform(q1_tfidf)
question2_vectors = svd_q2.fit_transform(q2_tfidf)

In [15]:
from scipy.stats import skew, kurtosis


def _row_stat(stat, matrix):
    """Apply *stat* to every row of *matrix* after replacing NaN/inf values."""
    return [stat(row) for row in np.nan_to_num(matrix)]


# per-question skewness and kurtosis of the SVD component vectors
data['skew_q1vec'] = _row_stat(skew, question1_vectors)
data['skew_q2vec'] = _row_stat(skew, question2_vectors)

data['kur_q1vec'] = _row_stat(kurtosis, question1_vectors)
data['kur_q2vec'] = _row_stat(kurtosis, question2_vectors)

In [16]:
from scipy import sparse

# stack two different TF-IDFs for the 2 questions horizontally and feed to a machine learning model
# for text columns, sparse.hstack might be a good choice
fs3_1 = sparse.hstack((q1_tfidf, q2_tfidf))

In [17]:
# combine the two questions into one text per row and calculate TF-IDF on it
# Boolean switches are passed as True rather than 1: newer scikit-learn
# releases validate parameter types and no longer accept integers here.
tfv = TfidfVectorizer(min_df=3,
                      max_features=None,
                      strip_accents='unicode',
                      analyzer='word',
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 2),
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True,
                      stop_words='english')

# missing questions are treated as empty text before concatenation
q1q2 = data.question1.fillna("")
q1q2 = q1q2 + " " + data.question2.fillna("")
fs3_2 = tfv.fit_transform(q1q2)

In [18]:
# obtain the third set of features by stacking the matrices question1_vectors, question2_vectors together
fs3_3 = np.hstack((question1_vectors, question2_vectors))

In [19]:
fs3_4 = ['skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']

In [20]:
# Drop the names of the intermediate TF-IDF / SVD objects that are no longer needed.
del tfv_q1, tfv_q2, tfv, q1q2
del question1_vectors, question2_vectors
del svd_q1, svd_q2, q1_tfidf, q2_tfidf

In [21]:
# check garbage collector
# force the garbage collector to release unreferenced memory
import gc
gc.collect()

87

## Word2Vec embeddings
We're going to use a pretrained Word2vec model trained on the Google News corpus.

__Link:__ https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

__Package required__:
  1. Gensim to load the Word2vec features
  2. pyemd to relate two Word2vec vectors

In [22]:
import gensim

model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [23]:
import nltk

# Fetch the tokenizer model and the stop-word corpus. The download stays
# best-effort (the resources may already be installed), but only ordinary
# errors are caught, so Ctrl-C still interrupts, and the failure is reported
# instead of being silently discarded.
try:
    nltk.download('punkt')
    nltk.download('stopwords')
except Exception as exc:
    print("NLTK resource download failed: {}".format(exc))

from nltk.corpus import stopwords # stopwords such as a, an, the, ... will be omitted
from nltk import word_tokenize

# a set gives constant-time membership tests when filtering tokens
stop_words = set(stopwords.words('english'))

# convert sentences to vectors
def sent2vec(sent, model):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The sentence is stringified, lower-cased and tokenised with
    ``word_tokenize``. A token contributes only if it is not in
    ``stop_words``, is purely alphabetic, and is present in ``model``.

    Args:
        sent: The sentence; any value is accepted and passed through ``str()``.
        model: Word-vector lookup supporting ``word in model``, ``model[word]``
            and ``model.vector_size``.

    Returns:
        A 1-D array: the unit-length sum of the kept word vectors, or an
        all-zero vector of length ``model.vector_size`` when no token
        qualifies. The previous fallback returned the embedding of the
        literal word "null", which is neither a null vector nor normalised.
        Distances that divide by a vector norm may yield NaN for the zero
        vector.
    """
    tokens = word_tokenize(str(sent).lower())
    vectors = [
        model[token]
        for token in tokens
        if token not in stop_words and token.isalpha() and token in model
    ]

    if not vectors:
        # nothing usable in the sentence: return a genuine null vector
        return np.zeros(model.vector_size, dtype=np.float32)

    v = np.array(vectors).sum(axis=0)
    return v / np.sqrt((v**2).sum()) # scale to unit length

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peiya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peiya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [24]:
# create Word2Vec vectors for question 1 and 2
w2v_q1 = np.array([sent2vec(q, model) for q in data.question1])
w2v_q2 = np.array([sent2vec(q, model) for q in data.question2])

In [25]:
# Distance measures between the two sentence vectors of every question pair.
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

# (column name, metric) pairs; the list order fixes the column order
distance_metrics = [
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    ('minkowski_distance', lambda u, v: minkowski(u, v, 3)),  # order-3 Minkowski
    ('braycurtis_distance', braycurtis),
]

for column, metric in distance_metrics:
    data[column] = [metric(u, v) for u, v in zip(w2v_q1, w2v_q2)]

In [26]:
# all the feature names related to distances are gathered under the list fs4_1
fs4_1 = ['cosine_distance', 'cityblock_distance', 
         'jaccard_distance', 'canberra_distance', 
         'euclidean_distance', 'minkowski_distance',
         'braycurtis_distance']

In [27]:
# Word2vec matrices for the two questions are horizontally stacked and stored away in w2v
w2v = np.hstack((w2v_q1, w2v_q2))

In [28]:
# The per-question Word2vec matrices are no longer needed on their own:
# drop both names and run a collection pass.
del w2v_q1, w2v_q2
gc.collect()

11460

In [29]:
# implement Word Mover's Distance returning the distance between two questions
def wmd(s1, s2, model):
    """Return the Word Mover's Distance between two questions.

    Both inputs are stringified, lower-cased and split on whitespace; English
    stop words are removed before the distance is computed.

    Stop words are looked up in the module-level ``stop_words`` set instead of
    reloading the NLTK list on every call: this function is applied once per
    row, and the list gave a fresh corpus read plus linear-time membership
    tests each time.

    Args:
        s1: First question (any value; passed through ``str()``).
        s2: Second question (any value; passed through ``str()``).
        model: Object providing ``wmdistance(tokens1, tokens2)``.

    Returns:
        Whatever ``model.wmdistance`` returns for the two filtered token lists.
    """
    tokens1 = [w for w in str(s1).lower().split() if w not in stop_words]
    tokens2 = [w for w in str(s2).lower().split() if w not in stop_words]

    return model.wmdistance(tokens1, tokens2)

In [31]:
# apply wmd function
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2'], model), axis=1)

In [32]:
model.init_sims(replace=True) # normalizing word2vec vectors 
data['norm_wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2'], model), axis=1)

In [33]:
# all feature names related to wmd are gathered under the list fs4_2
fs4_2 = ['wmd', 'norm_wmd']

In [35]:
# The embedding model is no longer needed: drop the name and run a collection pass.
del model
gc.collect()

107

## Machine Learning Models

In [36]:
# check the memory again
import psutil
psutil.virtual_memory()

svmem(total=16981487616, available=5024292864, percent=70.4, used=11957194752, free=5024292864)

In [38]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [39]:
# standardizing the data
scaler = StandardScaler()

In [44]:
# Assemble the model input X and the target y.
# Hand-crafted columns: fs_1, fs_2, fs3_4, fs4_1 and fs4_2.
handcrafted_columns = fs_1 + fs_2 + fs3_4 + fs4_1 + fs4_2
X = (
    data[handcrafted_columns]
    .replace([np.inf, -np.inf], np.nan)  # treat +/-inf like a missing value
    .fillna(0)                           # then zero-fill everything missing
    .values
)
X = scaler.fit_transform(X)              # standardize the hand-crafted columns
X = np.hstack((X, fs3_3))                # append the stacked SVD vectors (a dense array)

# duplicate label as a float32 column vector
y = data.is_duplicate.values.astype('float32').reshape(-1, 1)

In [45]:
# Reproducible hold-out split: shuffle the row indices under a fixed seed and
# keep the first tenth for validation, the rest for training.
seed = 42
np.random.seed(seed)

n_all = y.shape[0]
idx = np.arange(n_all)
np.random.shuffle(idx)

n_split = n_all // 10
idx_val, idx_train = idx[:n_split], idx[n_split:]

x_train, x_val = X[idx_train], X[idx_val]

# labels are flattened from column vectors to 1-D arrays
y_train = y[idx_train].ravel()
y_val = y[idx_val].ravel()

In [47]:
# Logistic Regression baseline: fit on the training split, score on validation.
logres = linear_model.LogisticRegression(C=0.1, solver='sag', max_iter=1000)
logres.fit(x_train, y_train)

lr_preds = logres.predict(x_val)
n_correct = np.sum(lr_preds == y_val)  # count of matching predictions
log_res_accuracy = n_correct / len(y_val)
print("Logistic regression accuracy: {:.3f}".format(log_res_accuracy))

Logistic regression accuracy: 0.744


In [48]:
# XGBoost: binary logistic objective, monitored on both splits.
params = {
    'objective': 'binary:logistic',
    'eval_metric': ['logloss', 'error'],
    'eta': 0.02,       # learning rate
    'max_depth': 4,    # shallow trees to limit overfitting
}

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_val, label=y_val)

# both splits are evaluated each round; the validation entry comes last
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# up to 5000 boosting rounds, stopping once the monitored validation metric
# has not improved for 50 rounds; progress is printed every 100 rounds
bst = xgb.train(
    params,
    d_train,
    num_boost_round=5000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=100,
)

[0]	train-logloss:0.687437	train-error:0.297336	valid-logloss:0.687544	valid-error:0.297583
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
[100]	train-logloss:0.501917	train-error:0.261259	valid-logloss:0.503814	valid-error:0.263252
[200]	train-logloss:0.468034	train-error:0.244923	valid-logloss:0.470746	valid-error:0.246209
[300]	train-logloss:0.451674	train-error:0.234383	valid-logloss:0.454916	valid-error:0.236637
[400]	train-logloss:0.441258	train-error:0.227559	valid-logloss:0.445179	valid-error:0.230874
[500]	train-logloss:0.434009	train-error:0.222786	valid-logloss:0.43852	valid-error:0.226793
[600]	train-logloss:0.428219	train-error:0.218922	valid-logloss:0.43332	valid-error:0.223033
[700]	train-logloss:0.423202	train-error:0.215582	valid-logloss:0.428955	valid-error:0.220584
[800]	train-logloss:0.418775	train-error:0.212545	valid-logloss:0.425191	valid-error:0.218729
[900]	train

In [49]:
# Threshold the predicted probabilities at 0.5 and measure validation accuracy.
# NOTE(review): depending on the xgboost version, predict() may use every
# trained round rather than the best early-stopped one - confirm.
valid_proba = bst.predict(d_valid)
xgb_preds = (valid_proba >= 0.5).astype(int)
xgb_accuracy = np.sum(xgb_preds == y_val) / len(y_val)

print(xgb_accuracy)

0.8055356303643424


## Detecting Quora's Duplicates

In [51]:
import zipfile

from tqdm import tqdm_notebook as tqdm
tqdm.monitor_interval = 0

import tensorflow as tf

print("TensorFlow version: {}".format(tf.__version__))

TensorFlow version: 1.13.1


In [53]:
# create a dataframe from the Quora dataset
# Reuse the in-memory `data` frame when it exists; fall back to reloading the
# file only when `data` is undefined (NameError) or lacks the needed columns
# (KeyError). Any other error is allowed to surface instead of being hidden
# by a bare `except`.
try:
    df = data[['question1', 'question2', 'is_duplicate']]
except (NameError, KeyError):
    df = pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t')
    df = df.drop(['id', 'qid1', 'qid2'], axis=1)

# missing questions become empty strings
df = df.fillna('')
# duplicate label as a float32 column vector
y = df.is_duplicate.values
y = y.astype('float32').reshape(-1, 1)

In [54]:
# tokenize the data and convert the data to sequences
Tokenizer = tf.keras.preprocessing.text.Tokenizer
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

# set the maximum number of words to 200,000
tk = Tokenizer(num_words=200000) 

# set maximum sequence length to 40, 
# if sentence has more than 40 words, it will be cutoff to 40 words only
max_len = 40 

In [55]:
# Fit the tokenizer on the first and second questions together so that its
# vocabulary covers every term in the corpus.
all_questions = list(df.question1) + list(df.question2)
tk.fit_on_texts(all_questions)

# turn each question into a sequence of word indices, padded to max_len
x1 = pad_sequences(tk.texts_to_sequences(df.question1), maxlen=max_len)
x2 = pad_sequences(tk.texts_to_sequences(df.question2), maxlen=max_len)

# word_index maps every tokenized word to its assigned integer index
word_index = tk.word_index

In [59]:
# load GloVe embeddings
# Row i of embedding_matrix holds the GloVe vector of the word whose tokenizer
# index is i; words absent from the GloVe file keep an all-zero row.
embedding_dim = 300
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim), dtype='float32')

# Context managers close the archive and the member stream even when parsing
# raises part-way through the file.
with zipfile.ZipFile('data/glove.840B.300d.zip') as glove_zip:
    glove_file = glove_zip.filelist[0]  # first member of the archive

    with glove_zip.open(glove_file) as f_in:
        for line in tqdm(f_in):
            values = line.rstrip().split(b' ')

            # Take the coefficients from the END of the line and treat whatever
            # precedes them as the token. For a regular line this matches
            # values[0] / values[1:], and it also copes with a token that itself
            # contains spaces, which would otherwise break the float conversion.
            word = b' '.join(values[:-embedding_dim]).decode()

            if word not in word_index:
                continue
            i = word_index[word]
            coefs = np.asarray(values[-embedding_dim:], dtype='float32')
            embedding_matrix[i, :] = coefs

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [69]:
# Split a sequence into consecutive batches to feed into the DNN.
def prepare_batches(seq, step):
    """Return consecutive slices of *seq*, each at most *step* items long.

    Args:
        seq: Any sliceable sequence with a length.
        step: Batch size; the final batch may be shorter.

    Returns:
        A list of slices of ``seq`` in their original order.
    """
    return [seq[start:start + step] for start in range(0, len(seq), step)]

# convolutional layer
def conv1d(inputs, num_filters, filter_size, padding='same'):
    """Apply a ReLU-activated 1-D convolution to *inputs*.

    The kernel is drawn from a normal distribution whose standard deviation
    is ``sqrt(2 / (filter_size * num_filters))``.

    Args:
        inputs: Tensor passed to ``tf.layers.conv1d``.
        num_filters: Number of output filters.
        filter_size: Kernel width.
        padding: Padding mode forwarded to the layer (default ``'same'``).

    Returns:
        The output tensor of the convolution.
    """
    init_std = np.sqrt(2 / (filter_size * num_filters))
    kernel_init = tf.random_normal_initializer(stddev=init_std)
    return tf.layers.conv1d(
        inputs=inputs,
        filters=num_filters,
        kernel_size=filter_size,
        padding=padding,
        activation=tf.nn.relu,
        kernel_initializer=kernel_init,
    )


# maxpooling layer
def maxpool1d_global(X):
    """Return the maximum of *X* taken along axis 1."""
    return tf.reduce_max(X, axis=1)


# dense layer
def dense(X, size, activation=None):
    """Apply a fully connected layer with *size* units to *X*.

    The kernel is drawn from a normal distribution with standard deviation
    ``sqrt(2 / fan_in)``, where ``fan_in`` is the static size of axis 1 of *X*.

    Args:
        X: Input tensor; its second dimension must be statically known.
        size: Number of output units.
        activation: Optional activation function (default: none).

    Returns:
        The output tensor of the dense layer.
    """
    fan_in = int(X.shape[1])
    kernel_init = tf.random_normal_initializer(stddev=np.sqrt(2 / fan_in))
    return tf.layers.dense(
        X,
        units=size,
        activation=activation,
        kernel_initializer=kernel_init,
    )


# time distributed dense layer
def time_distributed_dense(X, dense_size):
    """Apply the same ReLU dense layer to every position along axis 1 of *X*.

    *X* must be rank 3. It is flattened to two dimensions, passed through
    ``dense`` with ``dense_size`` units, and reshaped back so that axis 1
    keeps its original length.

    Args:
        X: Rank-3 input tensor.
        dense_size: Number of units of the dense layer.

    Returns:
        Tensor of shape ``[-1, X.shape[1], dense_size]``.
    """
    dims = X.shape.as_list()
    assert len(dims) == 3
    n_steps, n_features = dims[1], dims[2]

    flat = tf.reshape(X, [-1, n_features])
    hidden = dense(flat, dense_size, tf.nn.relu)

    return tf.reshape(hidden, [-1, n_steps, dense_size])


# lstm layer
def lstm(X, size_hidden, size_out):
    """Run an LSTM over *X* and linearly project its last output.

    *X* is unstacked along axis 1 into a list of per-step tensors, fed through
    a ``BasicLSTMCell`` with ``static_rnn``, and the final step's output is
    mapped to ``size_out`` units with a weight matrix and bias created here.

    The variable scope is created with ``default_name`` so that TensorFlow
    assigns each call a unique scope. The previous scope name was drawn from
    ``np.random.randint(0, 100)``, so two calls could pick the same name, and
    graph naming depended on the NumPy RNG state.

    Args:
        X: Input tensor whose axis-1 length is statically known.
        size_hidden: Number of units in the LSTM cell.
        size_out: Number of output units of the final projection.

    Returns:
        Tensor produced by ``matmul(last_output, W) + b``.
    """
    with tf.variable_scope(None, default_name='lstm'):
        he_std = np.sqrt(2 / (size_hidden * size_out))
        W = tf.Variable(tf.random_normal([size_hidden, size_out], stddev=he_std))
        b = tf.Variable(tf.zeros([size_out]))

        # static_rnn expects a Python list with one tensor per step
        size_time = int(X.shape[1])
        X = tf.unstack(X, size_time, axis=1)

        lstm_cell = tf.contrib.rnn.BasicLSTMCell(size_hidden, forget_bias=1.0)
        outputs, states = tf.contrib.rnn.static_rnn(lstm_cell, X, dtype='float32')
        # only the output of the final step is projected
        out = tf.matmul(outputs[-1], W) + b

        return out

In [70]:
# training params
max_features = 200000
filter_length = 5
nb_filter = 64
pool_length = 4
learning_rate = 0.001

In [71]:
# initialize tensorflow graph
# The network is built inside its own tf.Graph: six branches (model1..model6)
# are computed from the two questions, concatenated, and passed through a
# stack of dense layers ending in a single sigmoid unit.
graph = tf.Graph()
graph.seed = 1  # graph-level random seed

with graph.as_default():
    # inputs: padded word-index sequences of both questions, the label, and a
    # boolean that switches dropout / batch normalization into training mode
    place_q1 = tf.placeholder(tf.int32, shape=(None, max_len))
    place_q2 = tf.placeholder(tf.int32, shape=(None, max_len))
    place_y = tf.placeholder(tf.float32, shape=(None, 1))
    place_training = tf.placeholder(tf.bool, shape=())
 
    # GloVe embedding table, excluded from training (trainable=False)
    glove = tf.Variable(embedding_matrix, trainable=False)
    q1_glove_lookup = tf.nn.embedding_lookup(glove, place_q1)
    q2_glove_lookup = tf.nn.embedding_lookup(glove, place_q2)
 
    # second embedding table of the same size, initialized uniformly at random
    # in [-emb_std, emb_std] (not marked trainable=False, unlike `glove`)
    emb_size = len(word_index) + 1
    emb_dim = 300
    emb_std = np.sqrt(2 / emb_dim)
    emb = tf.Variable(tf.random_uniform([emb_size, emb_dim], -emb_std, emb_std))
    q1_emb_lookup = tf.nn.embedding_lookup(emb, place_q1)
    q2_emb_lookup = tf.nn.embedding_lookup(emb, place_q2)
   
    # model1 / model2: time-distributed dense layer over the GloVe lookup,
    # summed along axis 1 (the sequence positions)
    model1 = q1_glove_lookup
    model1 = time_distributed_dense(model1, 300)
    model1 = tf.reduce_sum(model1, axis=1)
 
    model2 = q2_glove_lookup
    model2 = time_distributed_dense(model2, 300)
    model2 = tf.reduce_sum(model2, axis=1)
 
    # model3 / model4: two 'valid'-padded convolutions with dropout in between,
    # global max over axis 1, then dense -> dropout -> batch normalization
    model3 = q1_glove_lookup
    model3 = conv1d(model3, nb_filter, filter_length, padding='valid')
    model3 = tf.layers.dropout(model3, rate=0.2, training=place_training)
    model3 = conv1d(model3, nb_filter, filter_length, padding='valid')
    model3 = maxpool1d_global(model3)
    model3 = tf.layers.dropout(model3, rate=0.2, training=place_training)
    model3 = dense(model3, 300)
    model3 = tf.layers.dropout(model3, rate=0.2, training=place_training)
    model3 = tf.layers.batch_normalization(model3, training=place_training)
 
    model4 = q2_glove_lookup
    model4 = conv1d(model4, nb_filter, filter_length, padding='valid')
    model4 = tf.layers.dropout(model4, rate=0.2, training=place_training)
    model4 = conv1d(model4, nb_filter, filter_length, padding='valid')
    model4 = maxpool1d_global(model4)
    model4 = tf.layers.dropout(model4, rate=0.2, training=place_training)
    model4 = dense(model4, 300)
    model4 = tf.layers.dropout(model4, rate=0.2, training=place_training)
    model4 = tf.layers.batch_normalization(model4, training=place_training)
 
    # model5 / model6: dropout on the randomly initialized embedding lookup,
    # followed by the lstm helper
    model5 = q1_emb_lookup
    model5 = tf.layers.dropout(model5, rate=0.2, training=place_training)
    model5 = lstm(model5, size_hidden=300, size_out=300)
 
    model6 = q2_emb_lookup
    model6 = tf.layers.dropout(model6, rate=0.2, training=place_training)
    model6 = lstm(model6, size_hidden=300, size_out=300)
 
    # join all six branch outputs along the feature axis
    merged = tf.concat([model1, model2, model3, model4, model5, model6], axis=1)
    #merged = tf.concat([model1, model2], axis=1)
    merged = tf.layers.batch_normalization(merged, training=place_training)
 
    # five blocks of ReLU dense (300 units) -> dropout -> batch normalization
    for i in range(5):
        merged = dense(merged, 300, activation=tf.nn.relu)
        merged = tf.layers.dropout(merged, rate=0.2, training=place_training)
        merged = tf.layers.batch_normalization(merged, training=place_training)
 
    # single sigmoid output unit
    merged = dense(merged, 1, activation=tf.nn.sigmoid)
   
    # log loss between the labels and the sigmoid output
    loss = tf.losses.log_loss(place_y, merged)
 
    # rounding the sigmoid output gives the 0/1 prediction; accuracy is the
    # fraction of predictions equal to the label
    prediction = tf.round(merged)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(place_y, prediction), 'float32'))
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
 
    # for batchnorm
    # the ops collected under UPDATE_OPS are made to run before each training step
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(extra_update_ops):
        step = opt.minimize(loss)
 
    init = tf.global_variables_initializer()
 
# open a session on this graph and initialize all of its variables
session = tf.Session(config=None, graph=graph)
session.run(init)

Instructions for updating:
Use keras.layers.conv1d instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.batch_normalization instead.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [72]:
# Reproducible hold-out split for the neural network: shuffle the row indices
# under a fixed seed, keep the first tenth for validation and train on the rest.
seed = 1
np.random.seed(seed)

n_all = y.shape[0]
idx = np.arange(n_all)
np.random.shuffle(idx)

n_split = n_all // 10
idx_val, idx_train = idx[:n_split], idx[n_split:]

# training part (9/10)
x1_train, x2_train, y_train = x1[idx_train], x2[idx_train], y[idx_train]

# validation part (1/10)
x1_val, x2_val, y_val = x1[idx_val], x2[idx_val], y[idx_val]

In [74]:
# Train for a fixed number of epochs and report validation accuracy after each.
# Validation rows are scored in fixed batches of 5000.
val_idx = np.arange(y_val.shape[0])
val_batches = prepare_batches(val_idx, 5000)

no_epochs = 10
tqdm.monitor_interval = 0

for i in range(no_epochs):
    # reseed with the epoch number so each epoch's shuffle is reproducible
    np.random.seed(i)

    train_idx_shuffle = np.arange(y_train.shape[0])
    np.random.shuffle(train_idx_shuffle)
    batches = prepare_batches(train_idx_shuffle, 384)

    # The context manager closes the progress bar at the end of the epoch
    # (also when a step raises), instead of leaving one open bar per epoch.
    with tqdm(total=len(batches)) as progress:
        # `batch_idx` (not `idx`) so the split's index array is not overwritten
        for batch_idx in batches:
            feed_dict = {
                place_q1: x1_train[batch_idx],
                place_q2: x2_train[batch_idx],
                place_y: y_train[batch_idx],
                place_training: True,
            }
            _, acc, batch_loss = session.run([step, accuracy, loss], feed_dict)
            progress.update(1)
            # show the latest batch accuracy / loss on the bar
            progress.set_description(('{:.3f} / {:.3f}').format(acc, batch_loss))

    # score the validation set with dropout / batch-norm in inference mode
    y_pred = np.zeros_like(y_val)
    for batch_idx in val_batches:
        feed_dict = {
            place_q1: x1_val[batch_idx],
            place_q2: x2_val[batch_idx],
            place_y: y_val[batch_idx],
            place_training: False,
        }
        y_pred[batch_idx, :] = session.run(prediction, feed_dict)

    print('Epoch {}, accuracy: {:.3f}'.format(i, np.mean(y_val == y_pred)))

HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 0, accuracy: 0.797


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 1, accuracy: 0.803


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 2, accuracy: 0.810


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 3, accuracy: 0.810


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 4, accuracy: 0.808


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 5, accuracy: 0.810


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 6, accuracy: 0.817


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 7, accuracy: 0.816


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 8, accuracy: 0.816


HBox(children=(IntProgress(value=0, max=948), HTML(value='')))

Epoch 9, accuracy: 0.816


In [81]:
# evaluate model on a sample
def convert_text(txt, tokenizer, padder):
    """Turn texts into padded index sequences of length ``max_len``.

    Args:
        txt: Texts accepted by ``tokenizer.texts_to_sequences``.
        tokenizer: Object providing ``texts_to_sequences``.
        padder: Callable invoked as ``padder(sequences, maxlen=max_len)``.

    Returns:
        Whatever ``padder`` returns for the tokenized texts.
    """
    return padder(tokenizer.texts_to_sequences(txt), maxlen=max_len)

def evaluate_questions(a, b, tokenizer, padder, pred):
    """Run the tensor *pred* for a single pair of questions.

    The questions are converted with the ``tokenizer`` and ``padder`` passed
    in. Previously these two arguments were ignored in favour of the globals
    ``tk`` and ``pad_sequences``.

    Args:
        a: Text of the first question.
        b: Text of the second question.
        tokenizer: Tokenizer forwarded to ``convert_text``.
        padder: Padding function forwarded to ``convert_text``.
        pred: Tensor (or fetch) to evaluate in the session.

    Returns:
        The result of ``session.run(pred, ...)`` for this pair.
    """
    feed_dict = {
            place_q1: convert_text([a], tokenizer, padder),
            place_q2: convert_text([b], tokenizer, padder),
            place_y: np.zeros((1,1)),  # dummy label fed to the label placeholder
            place_training: False,     # inference mode
        }
    return session.run(pred, feed_dict)
    
isduplicated = lambda a, b: evaluate_questions(a, b, tk, pad_sequences, prediction)

a = "Why are there so many duplicated questions on Quora?"
b = "Why do people ask similar questions on Quora multiple times?"

print("Answer: %0.2f" % isduplicated(a, b))

Answer: 1.00
