<a href="https://colab.research.google.com/github/samsenko/quora-duplicates/blob/master/quora_duplicates_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---



**To do:** Reduce the number of epochs to 60; reduce the dropout rate; figure out how to load the saved model in less time than it takes to train it.

---

In [1]:
import spacy
import tensorflow as tf
import numpy as np
import zlib
import glob
import os

from contextlib import suppress
from collections import defaultdict, Counter

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from tensorflow.python.keras.layers import Embedding, Input, Activation, Masking, Dense, Dropout, GRU, Bidirectional, BatchNormalization, Lambda, Flatten
from tensorflow.python.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.backend import abs

# Set TensorFlow's logging verbosity threshold to WARN.
tf.logging.set_verbosity('WARN')
# TPU_WORKER is the gRPC URL built from the COLAB_TPU_ADDR environment variable,
# or None when that variable is not set.
_colab_tpu_addr = os.environ.get('COLAB_TPU_ADDR')
TPU_WORKER = None if _colab_tpu_addr is None else 'grpc://' + _colab_tpu_addr

Using TensorFlow backend.


In [2]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then give PyDrive the application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Look the archive up by its Drive file ID and copy its content to a local file.
downloaded = drive.CreateFile(dict(id='1VHycjtP6NcpmPFyXJxMd-XykMG0dTpzj'))
downloaded.GetContentFile('quora_embedded.npz')

[?25l[K    1% |▎                               | 10kB 8.6MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.8MB/s eta 0:00:01[K    3% |█                               | 30kB 2.6MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.9MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.3MB/s eta 0:00:01[K    6% |██                              | 61kB 2.8MB/s eta 0:00:01[K    7% |██▎                             | 71kB 3.2MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.6MB/s eta 0:00:01[K    9% |███                             | 92kB 4.1MB/s eta 0:00:01[K    10% |███▎                            | 102kB 3.2MB/s eta 0:00:01[K    11% |███▋                            | 112kB 3.3MB/s eta 0:00:01[K    12% |████                            | 122kB 4.9MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.9MB/s eta 0:00:01[K    14% |████▋                           | 143kB 9.1MB/s eta 0:00:01[K

In [0]:
# Open the .npz archive fetched from Drive in the previous cell; the arrays
# are pulled out of it by key in the next cell.
loaded = np.load('quora_embedded.npz')

In [0]:
# Pull the three arrays out of the archive by their stored keys:
# 'a' and 'b' are the two embedded inputs, 'c' the training targets.
emb1 = loaded['a']
emb2 = loaded['b']
targets = loaded['c']

In [9]:
# Sanity-check the array shapes. Note that `targets` is sliced to its first
# 50000 entries before its shape is printed, so this line does not reveal the
# full length of `targets`.
print(emb1.shape, emb2.shape, targets[:50000].shape)


(50000, 30, 300) (50000, 30, 300) (50000,)


In [0]:
def get_siamese_model(input_shape, gru_units=256, dense_units=1024,
                      dropout=0.2, recurrent_dropout=0.2, learning_rate=0.0001):
    """
    Build and compile the siamese network that scores a pair of embedded inputs.

    Both inputs go through the same encoder (shared weights): a bidirectional
    GRU returning the full sequence, batch normalization, flatten, and a linear
    dense layer. The element-wise absolute difference of the two encodings is
    fed to a single sigmoid unit.

    Args:
        input_shape: shape of one input without the batch dimension; the
            notebook calls this with (30, 300).
        gru_units: number of units in each direction of the GRU.
        dense_units: width of the encoder's final dense layer.
        dropout: dropout rate applied to the GRU inputs.
        recurrent_dropout: dropout rate applied to the GRU recurrent state.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        A compiled tf.keras.Model taking [left, right] inputs and producing one
        sigmoid score per pair (binary cross-entropy loss, accuracy metric).

    Side effects:
        Prints the model summary.
    """

    # Define the tensors for the two input questions
    left_input = Input(shape=input_shape)
    right_input = Input(shape=input_shape)

    # Shared encoder: the very same Sequential instance is applied to both
    # inputs, so both branches use one set of weights.
    encoder = Sequential()
    encoder.add(
        Bidirectional(
            GRU(
                gru_units,
                dropout=dropout,
                recurrent_dropout=recurrent_dropout,
                return_sequences=True,
            )
        )
    )
    encoder.add(BatchNormalization())
    # return_sequences=True yields one vector per timestep; flatten them into a
    # single feature vector before the dense projection.
    encoder.add(Flatten())
    encoder.add(Dense(dense_units))

    # Generate the encodings (feature vectors) for the two questions
    encoded_l = encoder(left_input)
    encoded_r = encoder(right_input)

    # Element-wise absolute difference between the encodings. tf.abs is used
    # explicitly so the layer does not depend on a module-level import that
    # shadows the builtin `abs`.
    L1_layer = Lambda(lambda tensors: tf.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    # Single sigmoid unit turns the distance vector into a similarity score
    prediction = Dense(1, activation='sigmoid')(L1_distance)

    # Connect the inputs with the outputs
    siamese_net = tf.keras.Model(inputs=[left_input, right_input], outputs=prediction)

    # Print the architecture, then compile
    siamese_net.summary()
    siamese_net.compile(
        tf.train.AdamOptimizer(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return siamese_net

In [11]:
model = get_siamese_model((30, 300))

# TPU_WORKER is None when COLAB_TPU_ADDR is not set (see the setup cell). In
# that case there is no TPU address to resolve, so keep the plain Keras model
# instead of attempting the TPU conversion.
if TPU_WORKER is not None:
  model = tf.contrib.tpu.keras_to_tpu_model(
    model,
    strategy=tf.contrib.tpu.TPUDistributionStrategy(
      tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30, 300)      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 30, 300)      0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 1024)         16587264    input_1[0][0]                    
                                                                 input_2[0][0]                    
_____________________

In [12]:
# Training settings: hold out 10% of the samples for validation, 60 epochs,
# batches of 64 pairs.
fit_options = dict(validation_split=0.1, epochs=60, batch_size=64)

# The History object returned by fit() is left as the cell's output.
model.fit([emb1, emb2], targets, **fit_options)

Train on 50000 samples, validate on 5000 samples
Epoch 1/60
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<tensorflow.python.keras.callbacks.History at 0x7fe522e84ef0>

In [13]:
# Save the trained model to a local .h5 file; a later cell uploads this same
# path to Google Drive.
model.save('quora_duplicates_feb20.h5')



In [0]:
from google.colab import auth
from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build

In [15]:
# Google Drive v3 API client; save_file_to_drive below reads it as a global.
drive_service = build('drive', 'v3')

def save_file_to_drive(name, path):
    """
    Upload a local file to Google Drive using the module-level `drive_service`.

    Args:
        name: name the file is given on Drive.
        path: local path of the file to upload.

    Returns:
        The response of the Drive `files().create` request; only the `id`
        field is requested. The ID is also printed.
    """
    octet_stream = 'application/octet-stream'

    # Resumable upload of the raw bytes at `path`.
    upload = MediaFileUpload(path, mimetype=octet_stream, resumable=True)

    request = drive_service.files().create(
        body={'name': name, 'mimeType': octet_stream},
        media_body=upload,
        fields='id',
    )
    created = request.execute()

    print('File ID: {}'.format(created.get('id')))
    return created

# Upload the saved model file; the returned response is shown as the cell output.
save_file_to_drive(
    name="quora_duplicates_feb20.h5",
    path="./quora_duplicates_feb20.h5",
)

File ID: 1iX4OMiRN5gzhnau6leZ1Q0uvkxjc9ooS


{'id': '1iX4OMiRN5gzhnau6leZ1Q0uvkxjc9ooS'}

In [74]:
# One question pair, repeated 8 times to form a batch of 8 identical pairs.
inputs1 = np.array(["Why is triclosan used in toothpaste?"]*8)
inputs2 = np.array(["What common toothpaste has triclosan?"]*8)

# spaCy pipeline that embed_question calls to get per-token vectors. It has to
# be loaded here: with this line commented out, `embedding` is never defined
# and the cell fails with a NameError on a fresh kernel.
# If the model package is not installed, run the following once first:
#!python -m spacy download en_core_web_md
embedding = spacy.load('en_core_web_md')

# Dimensions of one embedded question: rows (tokens kept) x token-vector size.
embedding_length = 300
max_question_length = 30

def embed_question(inputs, max_datapoints):
  """
  Turn questions into fixed-size arrays of token vectors.

  Each question is passed through the module-level `embedding` callable; the
  `.vector` of every resulting token is stacked row by row into a zero-padded
  (max_question_length, embedding_length) array. Questions with more than
  max_question_length tokens are truncated and counted as "errors".

  Args:
    inputs: iterable of questions; each item is converted with str().
    max_datapoints: stop after this many questions have been embedded.

  Returns:
    Array of shape (n_questions, max_question_length, embedding_length).

  Side effects:
    Prints a progress line every 1000 questions and the final count of
    truncated questions.
  """
  embedded_inputs = []
  errors = 0
  for i, question in enumerate(inputs):
    if i % 1000 == 0:
      print("i = ", i)
    if i == max_datapoints:
      break
    doc = embedding(str(question))
    padded = np.zeros((max_question_length, embedding_length))
    # Collect the vectors in a list and stack once, instead of re-stacking the
    # growing array for every token.
    vectors = [word.vector for word in doc]
    if len(vectors) > max_question_length:
      # Too long to fit: keep only the leading tokens and count the truncation.
      errors += 1
      vectors = vectors[:max_question_length]
    if vectors:
      # np.vstack always gives a 2-D (n_tokens, embedding_length) array, so a
      # one-token question fills only row 0 rather than being broadcast into
      # every row; an empty question simply stays all zeros.
      padded[:len(vectors)] = np.vstack(vectors)
    embedded_inputs.append(padded)
  embedded_inputs = np.array(embedded_inputs)
  print("errors: ", errors)
  return embedded_inputs

# Embed the 8 repeated question pairs defined above.
# NOTE(review): this rebinds emb1/emb2, which earlier held the arrays loaded
# from quora_embedded.npz; re-running the training cell after this one would
# pass these 8-row arrays to model.fit.
emb1 = embed_question(inputs1, 8)
emb2 = embed_question(inputs2, 8)

i =  0
errors:  0
i =  0
errors:  0


In [75]:
# Score for the first pair in the batch (all 8 pairs are copies of the same
# questions); the sigmoid output is read as the probability the questions are duplicates.
model.predict([emb1, emb2])[0][0] #probability the questions are duplicates

0.00048187375