# In this notebook we use the same siamese LSTM method as described in 

https://github.com/rapat82/QQP/blob/master/QQP_LSTM.ipynb

# instead of training embeddings we extract embeddings from BERT model

# $\color{blue}{\text{Summary of main results}}$

# - Embeddings from BERT model can enhance performance when coupled with non linear correlators (see the link above)

# - Obtaining sentence embeddings by just using the [CLS] token output from the last BERT layer typically does not constitute a good sentence representation

# - Below BERT model is only used in evaluation mode bert_model.eval()

# - Fine tuning a BERT model along the lines of 
https://github.com/rapat82/ReOrNot/blob/master/realornot-bertbase.ipynb

# would give much better results

# - Since this dataset is quite large, training BERT for this task could take a long time on a GPU (upto 25-30 hrs)

# - To run this notebook locally, change the path of data files and files for pretrained bert model accordingly 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-cased-vocab.txt
/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-multilingual-cased-vocab.txt
/kaggle/input/pretrained-bert-models-for-pytorch/bert-large-uncased-vocab.txt
/kaggle/input/pretrained-bert-models-for-pytorch/bert-large-cased-vocab.txt
/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-uncased-vocab.txt
/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-multilingual-uncased-vocab.txt
/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-chinese-vocab.txt
/kaggle/input/pretrained-bert-models-for-pytorch/bert-large-uncased/bert_config.json
/kaggle/input/pretrained-bert-models-for-pytorch/bert-large-uncased/pytorch_model.bin
/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-uncased/bert_config.json
/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-uncased/pytorch_model.bin
/kaggle/input/pretrained-bert-models-for-pytorch/bert-base-cased/bert_config.json
/kaggle/input/pre

In [2]:
train_df_data = pd.read_csv('/kaggle/input/quora-question-pairs/train.csv.zip').fillna('')

In [3]:
q1 = train_df_data['question1'].values
q2 = train_df_data['question2'].values
y = train_df_data['is_duplicate'].values

In [4]:
from transformers import BertTokenizer, BertModel, BertConfig
import torch
tokenizer = BertTokenizer.from_pretrained('../input/pretrained-bert-models-for-pytorch/bert-base-uncased-vocab.txt',
                                         do_lower_case=True)
def numericalize(q1_list, m_len):
    output_list = []
    output_list_token_type_ids = []
    output_list_attention_mask = []
    for i in range(len(q1_list)):
        temp = tokenizer.encode_plus(q1_list[i], max_length=m_len, 
                                            truncation_strategy='longest_first', 
                                            pad_to_max_length=True, return_tensors='pt')
        output_list.append(temp['input_ids'])
        output_list_token_type_ids.append(temp['token_type_ids'])
        output_list_attention_mask.append(temp['attention_mask'])
    output_tensor = torch.stack(output_list).squeeze()
    output_tensor_ids = torch.stack(output_list_token_type_ids).squeeze()
    output_tensor_mask = torch.stack(output_list_attention_mask).squeeze()
    return output_tensor, output_tensor_ids, output_tensor_mask

In [5]:
m_len = 100
q1_tokens, q1_ids, q1_masks = numericalize(q1, m_len)
q2_tokens, q2_ids, q2_masks = numericalize(q2, m_len)

In [6]:
seed=33
np.random.seed(seed)
perm = np.random.permutation(q1_tokens.shape[0])
q1t_s = np.zeros_like(q1_tokens)
q1id_s = np.zeros_like(q1_ids)
q1m_s = np.zeros_like(q1_masks)
q2t_s = np.zeros_like(q2_tokens)
q2id_s = np.zeros_like(q2_ids)
q2m_s = np.zeros_like(q2_masks)
y_s = np.zeros_like(y)
np.take(q1_tokens,perm,axis=0,out=q1t_s)
np.take(q1_ids,perm,axis=0,out=q1id_s)
np.take(q1_masks,perm,axis=0,out=q1m_s)
np.take(q2_tokens,perm,axis=0,out=q2t_s)
np.take(q2_ids,perm,axis=0,out=q2id_s)
np.take(q2_masks,perm,axis=0,out=q2m_s)
np.take(y,perm,axis=0,out=y_s)
split_frac=0.9
iindex = int(len(q1_tokens)*split_frac)
q1_tok_train, q1_ids_train, q1_masks_train = q1t_s[:iindex], q1id_s[:iindex], q1m_s[:iindex]
q1_tok_val, q1_ids_val, q1_masks_val = q1t_s[iindex:], q1id_s[iindex:], q1m_s[iindex:]
q2_tok_train, q2_ids_train, q2_masks_train = q2t_s[:iindex], q2id_s[:iindex], q2m_s[:iindex]
q2_tok_val, q2_ids_val, q2_masks_val = q2t_s[iindex:], q2id_s[iindex:], q2m_s[iindex:]
train_y, val_y = y_s[:iindex], y_s[iindex:] 

In [7]:
from torch.utils.data import TensorDataset, DataLoader

In [8]:
train_data = TensorDataset(torch.from_numpy(q1_tok_train), torch.from_numpy(q1_ids_train), 
                         torch.from_numpy(q1_masks_train), torch.from_numpy(q2_tok_train), 
                         torch.from_numpy(q2_ids_train), torch.from_numpy(q2_masks_train), 
                         torch.from_numpy(train_y))
val_data = TensorDataset(torch.from_numpy(q1_tok_val), torch.from_numpy(q1_ids_val), 
                         torch.from_numpy(q1_masks_val), torch.from_numpy(q2_tok_val), 
                         torch.from_numpy(q2_ids_val), torch.from_numpy(q2_masks_val), 
                         torch.from_numpy(val_y))

In [9]:
train_bs = 128
train_loader = DataLoader(train_data, shuffle = True, batch_size=train_bs)
valid_loader = DataLoader(val_data, shuffle = True, batch_size=train_bs)

In [10]:
train_on_gpu=torch.cuda.is_available()

In [11]:
bert_model_config = '../input/pretrained-bert-models-for-pytorch/bert-base-uncased/bert_config.json'
bert_config = BertConfig.from_json_file(bert_model_config)
#bert_config.output_hidden_states=True
bert_model = BertModel.from_pretrained('../input/pretrained-bert-models-for-pytorch/bert-base-uncased/', config = bert_config)
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [12]:
import torch.nn as nn

class QQPLSTM(nn.Module):
    # The model below will calculate the similarity of two questions
    #def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim,
    #            n_layers, drop_prob = 0.5):
    def __init__(self, seq_len, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        
        super(QQPLSTM, self).__init__()
        
        self.seq_len = seq_len
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim =embedding_dim
        self.drop_prob = drop_prob
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 
                           n_layers, dropout = drop_prob,
                           batch_first = True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(7*hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward_half(self, x, hidden):
        lstm_output, hidden = self.lstm(x, hidden) 
        lstm_output = lstm_output.contiguous().view(-1,self.hidden_dim)
        
        return lstm_output, hidden
    def forward(self, x1, x2, hidden1, hidden2):
        batch_size = x1.size(0)
        lstm_output1, hidden1 = self.forward_half(x1, hidden1)
        lstm_output2, hidden2 = self.forward_half(x2, hidden2)
        lstm_output = torch.cat((torch.max(lstm_output1,lstm_output2), 
                                 torch.abs(lstm_output1-lstm_output2),
                                 lstm_output1*lstm_output2,
                                 torch.abs(lstm_output1**2-lstm_output2**2),
                                torch.abs(lstm_output1**3-lstm_output2**3),
                                torch.abs(lstm_output1**4-lstm_output2**4),
                                torch.abs(lstm_output1**3-lstm_output2**3)*(lstm_output1+lstm_output2)/2), 
                                dim=1)
        out = self.dropout(lstm_output)
        out = self.fc(out)
        #out = self.dropout2(out)
        #out = self.fc2(out)
        #out = self.dropout3(out)
        #out = self.fc3(out)
        # out is now of dimension rows X output_size
        sigmoid_out = self.sigmoid(out)
        sigmoid_out = sigmoid_out.view(batch_size, -1)
        # sigmoid_out is now of dimension batch_size X columns
        sigmoid_out = sigmoid_out[:,-1]
        # This was the step where we took the last batch of 'labels'
        return sigmoid_out, hidden1, hidden2

    def init_hidden(self, batch_size):
        
        # Let's create new tensors initialized to zero for the hidden state and cell state of the LSTM
        # these should be two tensors of size n_layers X batch_size X hidden
        # There should be n_layers*2 for bidirectional 
        weight = next(self.parameters()).data
        
        if train_on_gpu:
            hidden=(weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden=(weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
            
        return hidden

In [13]:
output_size = 1
embedding_dim = 768
hidden_dim = 512
n_layers = 2
seq_len = 200
net = QQPLSTM(seq_len, output_size, embedding_dim, hidden_dim, n_layers)

In [14]:
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [15]:
def accuracy(y_actual, y_pred):
#    y_ = y_pred > 0
#    return np.sum(y_actual == y_).astype('int') / y_actual.shape[0]
    y_ = np.round(np.array(y_pred))
    return np.sum(y_actual == y_) / y_actual.shape[0]

In [16]:
import time
epochs = 6
clip = 5 # Gradient clipping

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
net = net.to(device)

net.train()
# Now our network is in training mode, lets train for some epochs
#accuracy_old_old =0.0
loss_vs_epoch = []
valloss_vs_epoch = []
valid_loss_old = 100.0
bert_model.to('cuda')
bert_bs = train_bs
for e in range(0,epochs):
    t1=time.ctime()
    print(t1)
    # create hidden state
    # do this for every batch
    for q1t, q1id, q1m, q2t, q2id, q2m, labels in train_loader:
        with torch.no_grad():
            q1_embed = []
            q2_embed = []
            q1t = q1t.long().to(device)
            q1id = q1id.long().to(device)
            q1m = q1m.long().to(device)
            q2t = q2t.long().to(device)
            q2id = q2id.long().to(device)
            q2m = q2m.long().to(device)
            labels = labels.float().to(device)
            q1_bert = bert_model(q1t, q1id, q1m)[0]
            q1_embed.append(q1_bert)        
            q2_bert = bert_model(q2t, q2id, q2m)[0]
            q2_embed.append(q2_bert)        
            q1_tensor = torch.cat(q1_embed)
            q2_tensor = torch.cat(q2_embed)
        batch_size=q1t.shape[0]
        #print(batch_size)
        h1 = net.init_hidden(batch_size)
        h2 = net.init_hidden(batch_size)
        # Create new variables for the hidden state, so we don't backpropagate through entire training history
        h1 = tuple([each.data for each in h1])
        h2 = tuple([each.data for each in h2])
        # zero out the accumulated gradients
        net.zero_grad()
        #x1=x1.float()
        output, h1, h2 = net(q1_tensor, q2_tensor, h1, h2)
        loss = criterion(output, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
    loss_vs_epoch.append([e+1, loss.item()])
    val_losses = []
    net.eval()
    avg_acc = 0
    preds = []
    original = []
    for q1t, q1id, q1m, q2t, q2id, q2m, labels in valid_loader:
        with torch.no_grad():
            q1_embed = []
            q2_embed = []
            q1t = q1t.long().to(device)
            q1id = q1id.long().to(device)
            q1m = q1m.long().to(device)
            q2t = q2t.long().to(device)
            q2id = q2id.long().to(device)
            q2m = q2m.long().to(device)
            labels = labels.float().to(device)
            q1_bert = bert_model(q1t, q1id, q1m)[0]
            q1_embed.append(q1_bert)        
            q2_bert = bert_model(q2t, q2id, q2m)[0]
            q2_embed.append(q2_bert)        
            q1_tensor = torch.cat(q1_embed)
            q2_tensor = torch.cat(q2_embed)
        batch_size=q1t.shape[0]
        #print(batch_size)
        val_h1 = net.init_hidden(batch_size)
        val_h2 = net.init_hidden(batch_size)
        # Create new variables for the hidden state, so we don't backpropagate through entire training history
        val_h1 = tuple([each.data for each in val_h1])
        val_h2 = tuple([each.data for each in val_h2])
        output, val_h1, val_h2 = net(q1_tensor, q2_tensor, val_h1, val_h2)
        val_loss = criterion(output, labels)
        acc = accuracy(labels.cpu().numpy(), output.detach().cpu().numpy().squeeze())
        val_losses.append(val_loss.item())
        preds.append(output.cpu().detach().numpy())
        original.append(labels.float().cpu().detach().numpy())
        avg_acc += acc
    net.train()
    print( "Epoch: {}/{}---finished---accuracy:{}".format(e+1,epochs, avg_acc / len(valid_loader)))

Mon Mar  2 22:56:51 2020
Epoch: 1/6---finished---accuracy:0.7389140706654279
Mon Mar  2 23:44:11 2020
Epoch: 2/6---finished---accuracy:0.7756160815381489
Tue Mar  3 00:31:33 2020
Epoch: 3/6---finished---accuracy:0.7797341790297294
Tue Mar  3 01:18:53 2020
Epoch: 4/6---finished---accuracy:0.8153590338665659
Tue Mar  3 02:06:13 2020
Epoch: 5/6---finished---accuracy:0.8031222328271977
Tue Mar  3 02:53:38 2020
Epoch: 6/6---finished---accuracy:0.8099662223464174
