In [1]:
# Colab library to upload files to notebook
from google.colab import files

# Install Kaggle library
!pip install -q kaggle

In [2]:
!python -m spacy download en


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [3]:
# Upload kaggle API key file
uploaded = files.upload()

Saving kaggle.json to kaggle.json


!cat kaggle.json

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [5]:
#Download data
!kaggle competitions download -c quora-question-pairs

Downloading sample_submission.csv.zip to /content
  0% 0.00/4.95M [00:00<?, ?B/s]
100% 4.95M/4.95M [00:00<00:00, 45.6MB/s]
Downloading test.csv.zip to /content
 89% 102M/114M [00:01<00:00, 67.2MB/s] 
100% 114M/114M [00:01<00:00, 90.9MB/s]
test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
Downloading train.csv.zip to /content
 43% 9.00M/21.2M [00:00<00:00, 27.8MB/s]
100% 21.2M/21.2M [00:00<00:00, 50.7MB/s]


In [6]:
!unzip -q train.csv.zip -d .


In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os
import gc

import re
from nltk.corpus import stopwords
#import distance
from nltk.stem import PorterStemmer
#from bs4 import BeautifulSoup

import spacy
import random

import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator


In [8]:
df = pd.read_csv("/content/train.csv",encoding='latin-1')
df = df.fillna('')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
df.shape

(404290, 6)

In [10]:
spacy_text = spacy.load('en')


In [11]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


In [12]:
def tokenize_text(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_text.tokenizer(text)]

In [13]:

Q1 = Field(tokenize = tokenize_text,
           init_token = '<sos>',
           eos_token = '<eos>',
           lower = True,
           batch_first = True)

Q2 = Field(tokenize = tokenize_text,
           init_token = '<sos>',
           eos_token = '<eos>',
           lower = True,
           batch_first = True)

In [14]:
LABEL = data.LabelField(dtype = torch.float)

In [15]:
fields = [('q1', Q1), ('q2', Q2), ('label', LABEL)]

In [16]:
example = [torchtext.legacy.data.Example.fromlist([df.question1[i], df.question2[i], df.is_duplicate[i]], fields) for i in range(df.shape[0])]


In [17]:
Dataset = torchtext.legacy.data.Dataset(example, fields)


In [18]:
(train_data, valid_data) = Dataset.split(split_ratio= [0.25, 0.75], random_state = random.seed(SEED))
len(train_data), len(valid_data)

(101072, 303218)

In [19]:
MAX_VOCAB_SIZE = 25_000

Q1.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
Q2.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

we have to build vocabulary on both

In [20]:

print(f"Unique tokens in Question 1 vocabulary: {len(Q1.vocab)}")
print(f"Unique tokens in Question 2 vocabulary: {len(Q2.vocab)}")
print(f"Unique tokens in Label vocabulary: {len(LABEL.vocab)}")

Unique tokens in Question 1 vocabulary: 25004
Unique tokens in Question 2 vocabulary: 25004
Unique tokens in Label vocabulary: 2


In [21]:
print(vars(train_data[10]))

{'q1': ['is', 'nothing', 'free', 'in', 'this', 'world', '?'], 'q2': ['why', 'is', 'nothing', 'free', 'in', 'this', 'world', '?'], 'label': 0}


In [22]:
BATCH_SIZE = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, valid_data), 
    batch_size = BATCH_SIZE,
    device = device)

In [23]:
import torch.nn as nn

class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx ):
    
    super().__init__()
    
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)

    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, text):
    embedded = self.dropout(self.embedding(text))
    #packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
    outputs, (hidden, cell) = self.lstm(embedded)
    #output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

    hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
    return self.fc(hidden)
    #return hidden



In [24]:
class SiameseLSTM(nn.Module):
  def __init__(self, encoder, device):
    super().__init__()
    self.encoder1 = encoder
    self.encoder2 = encoder
    self.device = device
  def forward(self, q1, q2):
    output1 = self.encoder1(q1)
    output2 = self.encoder2(q2)

    #print(output1.shape, output2.shape)

    #distance
    dist = torch.exp(- torch.sum(torch.abs(output1 - output2), dim=1, keepdim=True))
    #dist = torch.dist(output1, output2, p=2)
    return dist


In [26]:
INPUT_DIM = len(Q1.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 128
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = Q1.vocab.stoi[Q1.pad_token]

enc = Encoder(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM,
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)



In [27]:
model = SiameseLSTM(enc, device)

In [28]:
model.to(device)

SiameseLSTM(
  (encoder1): Encoder(
    (embedding): Embedding(25004, 100, padding_idx=1)
    (lstm): LSTM(100, 128, num_layers=3, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (encoder2): Encoder(
    (embedding): Embedding(25004, 100, padding_idx=1)
    (lstm): LSTM(100, 128, num_layers=3, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [29]:

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

SiameseLSTM(
  (encoder1): Encoder(
    (embedding): Embedding(25004, 100, padding_idx=1)
    (lstm): LSTM(100, 128, num_layers=3, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (encoder2): Encoder(
    (embedding): Embedding(25004, 100, padding_idx=1)
    (lstm): LSTM(100, 128, num_layers=3, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [30]:

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,559,344 trainable parameters


In [31]:

LEARNING_RATE = 0.003

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [32]:
import torch.nn.functional as F

class ContrastiveLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, dist, label):

        #if label=0, maximize dist and if label=1, minimize the distance
        loss_contrastive = torch.mean(1/2*(label) * torch.pow(dist, 2) +
                                      1/2*(1-label) * torch.pow(F.relu(self.margin - dist), 2)
                                      )
        return loss_contrastive

In [33]:
criterion = ContrastiveLoss()

In [34]:
model = model.to(device)
criterion = criterion.to(device)

In [35]:
def train(model, iterator, optimizer, criterion): 
  model.train()
  epoch_loss = 0

  for i, batch in enumerate(iterator):
    q1 = batch.q1
    q2 = batch.q2
    label = batch.label
    #print(q1, q2, label)

    maxsize = max(q1[0].shape, q2[0].shape)
    #print(maxsize)

    
    if q1.shape[1] < maxsize[0]:
      #print("padding q1")
      to_be_padded_shape = ( q1.shape[0], maxsize[0] - q1.shape[1])
      padding = torch.zeros(to_be_padded_shape, dtype=torch.int64, device=device)
      q1 = torch.cat((q1, padding), dim=1)
    else:
      #print("padding q2")
      to_be_padded_shape = ( q2.shape[0], maxsize[0] - q2.shape[1])
      padding = torch.zeros(to_be_padded_shape, dtype=torch.int64, device=device)
      q2 = torch.cat((q2, padding), dim=1)

    #q1 = q1.to(device)
    #
    q2 = q2.to(device)
    optimizer.zero_grad()

    output = model(q1, q2)

    

    loss = criterion(output, label)

    loss.backward()

    optimizer.step()

    epoch_loss += loss.item()
    if (i%25 == 0):
      print(output[0].item(), label[0].item())
    if (i%200 == 0):
      print("Loss", loss.item())
  
  return epoch_loss / len(iterator)


In [None]:
a = torch.randn((4,4))
a[0]

tensor([-0.9661, -1.1189,  0.7667,  1.9898])

In [37]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
  train_loss = train(model, train_iterator, optimizer, criterion)
  print(f'\tEpoch : {epoch} --- Train Loss: {train_loss:.3f}')

0.6722813844680786 0.0
Loss 0.11917537450790405
0.593150794506073 0.0
0.5140672922134399 0.0
0.6633709669113159 0.0
0.5850601196289062 1.0
0.6540926694869995 1.0
0.6086536645889282 0.0
0.6347788572311401 0.0
0.5939784049987793 0.0
Loss 0.11503319442272186
0.5712102651596069 0.0
0.6216328144073486 0.0
0.6278300881385803 1.0
0.6403073668479919 0.0
0.6027401685714722 0.0
0.5822219848632812 0.0
0.5774683952331543 1.0
	Epoch : 0 --- Train Loss: 0.117
0.6700252890586853 0.0
Loss 0.11067039519548416
0.5885022878646851 0.0
0.6223042607307434 0.0
0.611527681350708 0.0
0.6356885433197021 1.0
0.6825144290924072 1.0
0.6343337893486023 1.0
0.616695761680603 1.0
0.5980194807052612 1.0
Loss 0.11394061893224716
0.6224979162216187 1.0
0.6257814168930054 0.0
0.6610662341117859 0.0
0.6571629643440247 1.0
0.5777915120124817 0.0
0.6523122787475586 0.0
0.7085046172142029 0.0
	Epoch : 1 --- Train Loss: 0.117
0.592287003993988 1.0
Loss 0.11996766924858093
0.6009474396705627 1.0
0.6304569244384766 1.0
0.631825

In [None]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
if T.q1.shape[1] < maxsize[0]:
  print("padding q1")
  to_be_padded_shape = ( T.q1.shape[0], maxsize[0] - T.q1.shape[1])
  padding = torch.zeros(to_be_padded_shape, dtype=torch.int64)
  T.q1 = torch.cat((T.q1, padding), dim=1)
else:
  print("padding q2")
  to_be_padded_shape = ( T.q2.shape[0], maxsize[0] - T.q2.shape[1])
  padding = torch.zeros(to_be_padded_shape, dtype=torch.int64)
  T.q2 = torch.cat((T.q2, padding), dim=1)


In [None]:
T.q1.shape, T.q2.shape, padding.shape

In [None]:
T.q2[0].dtype

In [None]:
outcome = model(T.q1, T.q2)

In [None]:
outcome

In [None]:
T.label