In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
train_df = pd.read_csv('shuffledTraining4K.csv')
print(train_df.shape)

(4000, 6)


In [0]:
from sklearn.utils import shuffle
train_df = shuffle(train_df)
test_df = train_df[3500:]
train_df = train_df[:3500]
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]
print(train_df.shape)
print(test_df.shape)

(3500, 8)
(500, 6)


In [0]:
embedding_dim = 300
max_seq_length = 256
use_w2v = True

In [0]:
from time import time
import pandas as pd
import nltk
nltk.download('stopwords')
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Input, Embedding, LSTM, GRU, Conv1D, Conv2D, GlobalMaxPool1D, Dense, Dropout
import re

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords
from gensim.models import KeyedVectors

import gensim

import numpy as np

import itertools


def text_to_word_list(text):
    # Pre process and convert texts to a list of words
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text


def make_w2v_embeddings(df, embedding_dim=300, empty_w2v=False):
    vocabs = {}
    vocabs_cnt = 0

    vocabs_not_w2v = {}
    vocabs_not_w2v_cnt = 0

    # Stopwords
    stops = set(stopwords.words('english'))

    # Load word2vec
    print("Loading word2vec model(it may takes 2-3 mins) ...")

    if empty_w2v:
        word2vec = EmptyWord2Vec
    else:
        word2vec = KeyedVectors.load_word2vec_format("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", binary=True)
        # word2vec = gensim.models.word2vec.Word2Vec.load("./data/Quora-Question-Pairs.w2v").wv

    for index, row in df.iterrows():
        # print(index)
        # Print the number of embedded sentences.
        if index != 0 and index % 1000 == 0:
            print("{:,} sentences embedded.".format(index), flush=True)

        # Iterate through the text of both questions of the row
        for question in ['question1', 'question2']:

            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):
                # Check for unwanted words
                if word in stops:
                    continue

                # If a word is missing from word2vec model.
                if word not in word2vec.vocab:
                    if word not in vocabs_not_w2v:
                        vocabs_not_w2v_cnt += 1
                        vocabs_not_w2v[word] = 1

                # If you have never seen a word, append it to vocab dictionary.
                if word not in vocabs:
                    vocabs_cnt += 1
                    vocabs[word] = vocabs_cnt
                    q2n.append(vocabs_cnt)
                else:
                    q2n.append(vocabs[word])

            # Append question as number representation
            df.at[index, question + '_n'] = q2n

    embeddings = 1 * np.random.randn(len(vocabs) + 1, embedding_dim)  # This will be the embedding matrix
    embeddings[0] = 0  # So that the padding will be ignored

    # Build the embedding matrix
    for word, index in vocabs.items():
        if word in word2vec.vocab:
            embeddings[index] = word2vec.word_vec(word)
    del word2vec

    return df, embeddings


def split_and_zero_padding(df, max_seq_length):
    # Split to dicts
    X = {'left': df['question1_n'], 'right': df['question2_n']}

    # Zero padding
    for dataset, side in itertools.product([X], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post', maxlen=max_seq_length)

    return dataset


#  --

class ManDist(Layer):
    """
    Keras Custom Layer that calculates Manhattan Distance.
    """

    # initialize the layer, No need to include inputs parameter!
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape will automatic collect input shapes to build layer
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    # return output shape
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)


class EmptyWord2Vec:
    """
    Just for test use.
    """
    vocab = {}
    word_vec = {}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
train_df, embeddings = make_w2v_embeddings(train_df, embedding_dim=embedding_dim, empty_w2v=not use_w2v)

Loading word2vec model(it may takes 2-3 mins) ...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


1,000 sentences embedded.
3,000 sentences embedded.


In [0]:
validation_size = int(len(train_df) * 0.1)
training_size = len(train_df) - validation_size

X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']

In [0]:
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)
# X_train = X
# Y_train = Y

In [0]:
validation_df = pd.read_csv('drive/My Drive/Colab Notebooks/stackOverflowTestSet.csv')
print(validation_df.shape)

(3000, 6)


In [0]:
validation_df = shuffle(validation_df)
for q in ['question1', 'question2']:
    validation_df[q + '_n'] = validation_df[q]

In [0]:
validation_df, valid_embeddings = make_w2v_embeddings(validation_df, embedding_dim=embedding_dim, empty_w2v=not use_w2v)

Loading word2vec model(it may takes 2-3 mins) ...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


1,000 sentences embedded.
2,000 sentences embedded.


In [0]:
X_validation = validation_df[['question1_n', 'question2_n']]
Y_validation = validation_df['is_duplicate']

In [0]:
X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)
Y_train = Y_train.values
Y_validation = Y_validation.values

In [0]:
gpus = 2
batch_size = 1024 * gpus
n_epoch = 50
n_hidden = 50

In [0]:
x = Sequential()
x.add(Embedding(len(embeddings), embedding_dim,
                weights=[embeddings], input_shape=(max_seq_length,), trainable=False))

In [0]:
x.add(LSTM(n_hidden))

shared_model = x

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

In [0]:
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

In [0]:
# model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy', 'mean_squared_error'])
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mean_squared_error', metrics=['acc',f1_m,precision_m, recall_m])

model.summary()
shared_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 50)           2450700     input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
man_dist_2 (ManDist)            (None, 1)            0           sequential_2[1][0]         

In [0]:
training_start_time = time()
malstm_trained = model.fit([X_train['left'], X_train['right']], Y_train,
                           batch_size=batch_size, epochs=n_epoch,
                           validation_data=([X_validation['left'], X_validation['right']], Y_validation))
training_end_time = time()
print("Training time finished.\n%d epochs in %12.2f" % (n_epoch,
                                                        training_end_time - training_start_time))

Train on 3150 samples, validate on 350 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training time finished.
50 epochs in      1133.41


In [0]:
model.save('lstm.final1')
model_file = drive.CreateFile({'title' : 'lstm.final1'})
model_file.SetContentFile('lstm.final1')
model_file.Upload()

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

success!


In [0]:
drive.CreateFile({'id': model_file.get('id')})

GoogleDriveFile({'id': '1JPVP08X8DbhQj33PdhEDKuYsVcI2rPHe'})

In [0]:
plt.plot(malstm_trained.history['mean_squared_error'])
# plt.plot(malstm_trained.history['accuracy'])

# plt.plot(malstm_trained.history['mean_absolute_error'])
# plt.plot(malstm_trained.history['mean_absolute_percentage_error'])
# plt.plot(malstm_trained.history['cosine_proximity'])
plt.show()

In [0]:
plt.subplot(211)
plt.plot(malstm_trained.history['acc'])
plt.plot(malstm_trained.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

# Plot loss
plt.subplot(212)
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout(h_pad=1.0)
# plt.show()
plt.savefig('final1.png')
print(str(malstm_trained.history['val_acc'][-1])[:6] +
      "(max: " + str(max(malstm_trained.history['val_acc']))[:6] + ")")
print("Done.")

0.6657(max: 0.6685)
Done.


  """Entry point for launching an IPython kernel.
  # This is added back by InteractiveShellApp.init_path()


In [0]:
# test_df = pd.read_csv('drive/My Drive/stackOverflowFinal.csv')
# test_df = pd.read_csv('drive/My Drive/Colab Notebooks/stackOverflowTestSet.csv')
test_df = pd.read_csv('shuffledTraining4K.csv')
test_df = train_df[3500:]
print(test_df.shape)

(0, 8)


In [0]:
for q in ['question1', 'question2']:
    test_df[q + '_n'] = test_df[q]

In [0]:
print(test_df)

        id  ...                                        question2_n
2737   296  ...  How do concentric and eccentric contraction co...
3336  2847  ...  What is the best farewell sample letter from a...
3925  2451  ...  Why do we often judge people by their appearance?
906    502  ...                         Is World War III imminent?
590   2573  ...  What are the biggest blunders in the history o...
...    ...  ...                                                ...
2692  5230  ...  What are macromolecules, and what are some exa...
2402  2430  ...  How can I move to Canada legally with a studen...
1756  2812  ...  Which is the best and worst bank in India to o...
412    880  ...  Do we have telescopes powerful enough nowadays...
2121  1406  ...               How do I get rid of severe dandruff?

[500 rows x 8 columns]


In [0]:
test_df, test_embeddings = make_w2v_embeddings(test_df, embedding_dim=embedding_dim, empty_w2v=not use_w2v)

Loading word2vec model(it may takes 2-3 mins) ...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2,000 sentences embedded.


In [0]:
test_X = test_df[['question1_n', 'question2_n']]
test_Y = test_df['is_duplicate']


In [0]:
test_X = split_and_zero_padding(test_X, max_seq_length)
test_Y = test_Y.values

In [0]:
# print(test_X)
# print(test_Y)
from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# loss, accuracy, f1_score, precision, recall = model.evaluate([test_X['left'], test_X['right']], test_Y, verbose=0)



Using TensorFlow backend.


In [0]:
print("Loss: ", loss)
print("accuracy: ", accuracy)
print("f1_score: ", f1_score)
print("precision: ", precision)
print("recall: ", recall)

Loss:  0.3521001856649915
accuracy:  0.5143333
f1_score:  0.026159678
precision:  0.25531915
recall:  0.013962766


In [0]:
prediction = model.predict([test_X['left'], test_X['right']])

In [0]:
correct = 0;
wrong = 0;
correctPrint = 0;
wrongPrint = 0;
for i in range(0, len(prediction)):
    if prediction[i] > 0.5:
      similarity = 1
    else:
      similarity = 0
    if similarity == test_Y[i]:
      if (correctPrint < 10 and similarity == 1):
        print("Correct")
        print("Sentence 1: ", test_df['question1'][i])
        print("Sentence 2: ", test_df['question2'][i])
        print("Predicted: ", similarity);
        print("Correct: ", test_Y[i]);
        correctPrint = correctPrint + 1
      correct = correct + 1;
    else:
      if (wrongPrint < 10):
        print("Wrong")
        print("Sentence 1: ", test_df['question1'][i])
        print("Sentence 2: ", test_df['question2'][i])
        print("Predicted: ", similarity);
        print("Correct: ", test_Y[i]);
        wrongPrint = wrongPrint + 1
      wrong = wrong + 1;

Wrong
Sentence 1:  What are the differences between a "traditional" IRA and a Roth IRA?
Sentence 2:  Tax on money withdrawn from Roth 401(k) and Roth IRA when living outside the United States and over 59.5-year-old
Predicted:  0
Correct:  1
Wrong
Sentence 1:  Should I put money in both a ROTH and Traditional IRA?
Sentence 2:  Tax on money withdrawn from Roth 401(k) and Roth IRA when living outside the United States and over 59.5-year-old
Predicted:  0
Correct:  1
Wrong
Sentence 1:  Pros, cons, & differences in investing in 401k vs. IRA?
Sentence 2:  Tax on money withdrawn from Roth 401(k) and Roth IRA when living outside the United States and over 59.5-year-old
Predicted:  0
Correct:  1
Wrong
Sentence 1:  What is the difference between a Rollover IRA and a Roth IRA?
Sentence 2:  Tax on money withdrawn from Roth 401(k) and Roth IRA when living outside the United States and over 59.5-year-old
Predicted:  0
Correct:  1
Wrong
Sentence 1:  18 year old making $60k a year; how should I invest

In [0]:
samp = """donald trump will be on the ballot in california as the nominee of the republican party and the american independent party. 
so is trump elected as a republican president or this particular party\'s president. why can a candidate be on the ballot for 2 parties. 


why is this possible.  aren\'t there laws regarding this"""

In [0]:
print(samp.replace("\r","").replace("\n",""))

donald trump will be on the ballot in california as the nominee of the republican party and the american independent party. so is trump elected as a republican president or this particular party's president. why can a candidate be on the ballot for 2 parties. why is this possible.  aren't there laws regarding this


In [0]:
# prediction = model.predict(["What is the story of Kohinoor (Koh-i-Noor) Diamond?", "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?"])

AttributeError: ignored

In [0]:
print(test_X['left'])

[[   0    0    0 ...    3    4    3]
 [   0    0    0 ...    4    2    3]
 [   0    0    0 ...   23   24    3]
 ...
 [   0    0    0 ... 4817 5400  372]
 [   0    0    0 ...   34 4612  532]
 [   0    0    0 ... 1474 5773 4085]]


In [0]:
example_test = pd.read_csv('drive/My Drive/Colab Notebooks/exampleTest.csv')
for q in ['question1', 'question2']:
    example_test[q + '_n'] = example_test[q]

print(example_test)
example_test, example_test_embeddings = make_w2v_embeddings(example_test, embedding_dim=embedding_dim, empty_w2v=not use_w2v)
example_test_X = example_test[['question1_n', 'question2_n']]
example_test_X = split_and_zero_padding(example_test_X, max_seq_length)
prediction = model.predict([example_test_X['left'], example_test_X['right']])
print(prediction)

   id  ...                                        question2_n
0   1  ...  What would happen if the Indian government sto...

[1 rows x 8 columns]
Loading word2vec model(it may takes 2-3 mins) ...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[[0.5993961]]
