<a href="https://colab.research.google.com/github/samsenko/quora-duplicates/blob/master/quora_duplicates_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---



**To do:** reduce the number of epochs to 60; reduce the dropout rate; figure out how to load the model in less time than it takes to train it.

---

In [0]:
import spacy
import tensorflow as tf
import numpy as np
import zlib
import glob
import os

from contextlib import suppress
from collections import defaultdict, Counter

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from tensorflow.python.keras.layers import Embedding, Input, Activation, Masking, Dense, Dropout, GRU, Bidirectional, BatchNormalization, Lambda, Flatten
from tensorflow.python.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.backend import abs

# Limit TensorFlow logging to the 'WARN' verbosity level.
tf.logging.set_verbosity('WARN')

# gRPC address built from the COLAB_TPU_ADDR environment variable,
# or None when that variable is not set.
TPU_WORKER = (
  'grpc://' + os.environ['COLAB_TPU_ADDR']
  if 'COLAB_TPU_ADDR' in os.environ
  else None
)

Using TensorFlow backend.


In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then give PyDrive the application-default
# credentials so the GoogleDrive client below can make authorized requests.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Reference the Drive file by its ID and write its contents to
# 'quora_embedded.npz' (a relative path, so it lands in the working directory).
downloaded = drive.CreateFile({'id':'1VHycjtP6NcpmPFyXJxMd-XykMG0dTpzj'}) 
downloaded.GetContentFile('quora_embedded.npz') 

[?25l[K    1% |▎                               | 10kB 19.1MB/s eta 0:00:01[K    2% |▋                               | 20kB 2.3MB/s eta 0:00:01[K    3% |█                               | 30kB 3.4MB/s eta 0:00:01[K    4% |█▎                              | 40kB 2.2MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.7MB/s eta 0:00:01[K    6% |██                              | 61kB 3.2MB/s eta 0:00:01[K    7% |██▎                             | 71kB 3.6MB/s eta 0:00:01[K    8% |██▋                             | 81kB 4.1MB/s eta 0:00:01[K    9% |███                             | 92kB 4.6MB/s eta 0:00:01[K    10% |███▎                            | 102kB 3.5MB/s eta 0:00:01[K    11% |███▋                            | 112kB 3.5MB/s eta 0:00:01[K    12% |████                            | 122kB 4.9MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.9MB/s eta 0:00:01[K    14% |████▋                           | 143kB 9.0MB/s eta 0:00:01[

In [0]:
# Open the downloaded .npz archive; its arrays are read out by key in the next cell.
loaded = np.load('quora_embedded.npz')

In [0]:
# Pull the three arrays stored under keys 'a', 'b' and 'c', in that order.
# Presumably: embeddings of the first question, embeddings of the second
# question, and the duplicate labels -- TODO confirm against the script
# that produced the archive.
emb1, emb2, targets = (loaded[key] for key in ('a', 'b', 'c'))

In [0]:
# Show the shapes of both embedding arrays and of the first 50000 targets.
print(*(arr.shape for arr in (emb1, emb2, targets[:50000])))

(50000, 30, 300) (50000, 30, 300) (50000,)


In [0]:
def get_siamese_model(
    input_shape,
    gru_units=256,
    dropout=0.5,
    recurrent_dropout=0.5,
    dense_units=1024,
    learning_rate=0.0001,
):
    """
    Build and compile a siamese network that scores a pair of inputs.

    Both inputs go through the same encoder instance, so its weights are
    shared: bidirectional GRU (returning the full sequence) -> batch
    normalization -> flatten -> dense. The element-wise absolute difference
    of the two encodings is fed to a single sigmoid unit.

    The model summary is printed as a side effect.

    Args:
        input_shape: shape of one input without the batch dimension
            (this notebook calls it with (30, 300)).
        gru_units: number of units in each direction of the GRU.
        dropout: dropout rate passed to the GRU as ``dropout``.
        recurrent_dropout: dropout rate passed to the GRU as
            ``recurrent_dropout``.
        dense_units: width of the dense layer that ends the encoder.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` that takes ``[left, right]`` inputs and
        outputs one sigmoid score per pair. It is compiled with binary
        cross-entropy loss and the accuracy metric.
    """

    # Define the tensors for the two input questions
    left_input = Input(shape=input_shape)
    right_input = Input(shape=input_shape)

    # Shared encoder: the same Sequential instance is applied to both inputs
    encoder = Sequential()
    encoder.add(
        Bidirectional(
            GRU(
                gru_units,
                dropout=dropout,
                recurrent_dropout=recurrent_dropout,
                return_sequences=True,
            )
        )
    )
    encoder.add(BatchNormalization())
    encoder.add(Flatten())
    encoder.add(Dense(dense_units))

    # Generate the encodings (feature vectors) for the two questions
    encoded_l = encoder(left_input)
    encoded_r = encoder(right_input)

    # Element-wise absolute difference between the two encodings.
    # `abs` here is the Keras backend function imported at the top of the
    # notebook, which shadows the builtin of the same name.
    L1_layer = Lambda(lambda tensors: abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    # Single sigmoid unit that turns the difference into a similarity score
    prediction = Dense(1, activation='sigmoid')(L1_distance)

    # Connect the inputs with the output
    siamese_net = tf.keras.Model(
        inputs=[left_input, right_input],
        outputs=prediction,
    )

    # Print the architecture, then compile
    siamese_net.summary()
    siamese_net.compile(
        tf.train.AdamOptimizer(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return siamese_net

In [0]:
# Build the siamese network for inputs of shape (30, 300).
model = get_siamese_model((30, 300))

# TPU_WORKER is None when no Colab TPU address was found. In that case keep
# the plain Keras model instead of handing None to the TPU cluster resolver,
# so the notebook still runs without a TPU.
if TPU_WORKER is not None:
  model = tf.contrib.tpu.keras_to_tpu_model(
    model,
    strategy=tf.contrib.tpu.TPUDistributionStrategy(
      tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_35 (InputLayer)           (None, 30, 300)      0                                            
__________________________________________________________________________________________________
input_36 (InputLayer)           (None, 30, 300)      0                                            
__________________________________________________________________________________________________
sequential_16 (Sequential)      (None, 1024)         16587264    input_35[0][0]                   
                                                                 input_36[0][0]                   
__________________________________________________________________________________________________
lambda_5 (Lambda)               (None, 1024)         0           sequential_16[0][0]              
          

In [0]:
# Train on the paired embeddings, holding out 10% of the samples for validation.
# targets is trimmed to the number of embedded pairs so the inputs and the
# labels always have the same sample count; the slice is a no-op when the
# lengths already match.
model.fit(
  [emb1, emb2], targets[:len(emb1)],
  validation_split=0.1,
  epochs=100,
  batch_size=64,
)

Train on 50000 samples, validate on 5000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

<tensorflow.python.keras.callbacks.History at 0x7f403816edd8>

In [0]:
# Write the trained model to 'quora_duplicates_feb19.h5' (relative path, so it
# lands in the working directory); the upload cell further down reads this file.
model.save('quora_duplicates_feb19.h5')



In [0]:
from google.colab import auth
from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build

In [0]:
# Client for version 3 of the Drive API; save_file_to_drive below uses it for the upload.
drive_service = build('drive', 'v3')

def save_file_to_drive(name, path):
    """
    Upload a local file to Google Drive as generic binary data.

    Relies on the module-level ``drive_service`` client and prints the ID of
    the file that was created.

    Args:
        name: name to give the file on Drive.
        path: path of the local file to upload.

    Returns:
        The response of the create request; only the 'id' field is requested.
    """
    binary_type = 'application/octet-stream'

    # Resumable upload of the file found at `path`
    upload = MediaFileUpload(path, mimetype=binary_type, resumable=True)

    response = drive_service.files().create(
        body={'name': name, 'mimeType': binary_type},
        media_body=upload,
        fields='id',
    ).execute()

    print('File ID: {}'.format(response.get('id')))

    return response

# Upload the saved model file; the returned response is shown as the cell output.
save_file_to_drive("quora_duplicates_feb19.h5", "./quora_duplicates_feb19.h5")

File ID: 1v1fTQgqcjM8uUfMkc3ZzxvaMga9S9uy6


{'id': '1v1fTQgqcjM8uUfMkc3ZzxvaMga9S9uy6'}