In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pytorch-transformers
!pip install pytorch-pretrained-bert pytorch-nlp
import logging
logging.basicConfig(level=logging.INFO)
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the Quora question-pairs training file and discard rows with missing values
data = pd.read_csv('/kaggle/input/quora-question-pairs/train.csv.zip').dropna()

# Class balance: share of rows labelled 0 (not duplicate) and 1 (duplicate)
total_rows = len(data.index)
non_similar_rows = len(data[data['is_duplicate']==0].index)
similar_rows = len(data[data['is_duplicate']==1].index)

print("The percentage of non similar question pairs is : ")
print(non_similar_rows*100/total_rows)
print("The percentage of similar question pairs is : ")
print(similar_rows*100/total_rows)

In [None]:
# Continue with a random 7000-row subset of the cleaned frame.
# random_state pins the draw so a fresh "Restart & Run All" trains and
# validates on the same rows (same seed value as the train/validation split).
data = data.sample(n=7000, random_state=2018)

print("The percentage of non similar question pairs after sampling is : ")
print(len(data[data['is_duplicate']==0].index)*100/len(data.index))
print("The percentage of similar question pairs after sampling is : ")
print(len(data[data['is_duplicate']==1].index)*100/len(data.index))

# store the labels (0 = not duplicate, 1 = duplicate), in the sampled row order
labels = data.is_duplicate.values

In [None]:
# Print a summary of the sampled frame to confirm the row count and that no nulls remain
data.info()

In [None]:
# Load the pretrained tokenizer for 'bert-base-uncased'; prep_data reads it as a module-level global
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prep_data(ques1, ques2, return_segment_ids=False):
  """Tokenize question pairs and convert them to BERT input ids.

  Each pair is laid out as ``[CLS] q1 [SEP] q2 [SEP]`` before being tokenized
  with the module-level ``tokenizer``.

  Args:
    ques1: iterable of first-question strings.
    ques2: iterable of second-question strings, paired positionally with
      ``ques1`` (pairing stops at the shorter of the two).
    return_segment_ids: when True, also return the per-token segment ids
      (0 for tokens of the first question incl. [CLS] and its [SEP],
      1 for tokens of the second question and its [SEP]). Defaults to
      False, which keeps the original single-list return value.

  Returns:
    A list with one list of input ids per question pair; or, when
    ``return_segment_ids`` is True, a tuple ``(all_input_ids, all_segment_ids)``
    whose inner lists have matching lengths.
  """
  all_input_ids = []
  all_segment_ids = []

  for (q1, q2) in zip(ques1, ques2):

    # first sentence is wrapped with [CLS] at the start and [SEP] at the end
    tokens_q1 = tokenizer.tokenize('[CLS] ' + q1 + ' [SEP] ')

    # second sentence only needs the closing [SEP]
    tokens_q2 = tokenizer.tokenize(q2 + ' [SEP] ')

    # input ids are generated for the concatenated tokens of the pair
    all_input_ids.append(tokenizer.convert_tokens_to_ids(tokens_q1 + tokens_q2))

    # segment ids were previously computed and thrown away; they are only
    # built when the caller asks for them
    if return_segment_ids:
      all_segment_ids.append([0] * len(tokens_q1) + [1] * len(tokens_q2))

  if return_segment_ids:
    return all_input_ids, all_segment_ids
  return all_input_ids


# Encode every (question1, question2) pair of the sampled frame into a list of input-id lists
all_input_ids = prep_data(data['question1'].values, data['question2'].values)

In [None]:
# Fixed sequence length every encoded pair is padded or truncated to
MAX_LEN = 128

# Bring every id sequence to exactly MAX_LEN entries; both padding and
# truncation happen at the end of the sequence.
# NOTE(review): truncating at the end can cut off the closing [SEP] of a pair
# longer than MAX_LEN tokens — confirm this is acceptable.
pad_input_ids = pad_sequences(
    all_input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

In [None]:
# One mask per padded sequence: 1.0 where the id is greater than 0 (a real
# token), 0.0 elsewhere (padding)
attention_masks = [
  [float(token_id > 0) for token_id in padded_seq]
  for padded_seq in pad_input_ids
]

In [None]:
# Load pretrained 'bert-base-uncased' into a sequence-classification model with 2 output labels (is_duplicate is 0 or 1)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
# Split inputs, labels and attention masks in ONE call so the three stay
# row-aligned by construction, instead of relying on two separate calls that
# happen to share the same random_state. 80% train / 20% validation.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    pad_input_ids, labels, attention_masks,
    random_state=2018, test_size=0.2)

In [None]:
# Number of examples per batch for both data loaders
batch_size = 32

# Convert each train/validation split into a torch tensor
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [None]:
# Datasets: every item is an (input_ids, attention_mask, label) triple
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation keeps dataset order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# Loaders yield batches of `batch_size` triples
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [None]:
# (name, parameter) pairs of the model, split below into two optimizer groups
param_optimizer = list(model.named_parameters())

# Name fragments of parameters that must NOT be weight-decayed: biases and
# layer-norm scale/shift. 'gamma'/'beta' are kept for checkpoints that still
# use the old layer-norm names; 'LayerNorm.weight' covers the current name
# ('LayerNorm.bias' is already matched by 'bias').
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']

# BertAdam reads the per-group option under the key 'weight_decay'. The
# previous key 'weight_decay_rate' was silently ignored, so every parameter
# (including the no_decay ones) fell back to the optimizer's default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# Optimizer over the two parameter groups built above.
# NOTE(review): no t_total is passed. Per the pytorch_pretrained_bert BertAdam
# documentation the default t_total=-1 means a constant learning rate, so
# warmup=.1 likely has no effect as written — confirm, and pass
# t_total=<batches per epoch * number of epochs> if warmup/decay is intended.
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=3e-5,
                     warmup=.1)

# accuracy() is defined once, in its own cell right after this one; it is not
# redefined here so there is a single definition to maintain.

In [None]:
def accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D numpy array of per-class scores, one row per example.
        labels: numpy array of integer class labels; it is flattened before
            comparison, so any shape with one entry per example works.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [None]:
# Loss of every training batch, across all epochs, in the order it was seen
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---------------- Training ----------------

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for batch in train_dataloader:

    # No device transfer is performed here: batches are used on whatever
    # device the data loader yields them.
    b_input_ids, b_input_mask, b_labels = batch

    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # With labels supplied, the model call returns the loss for the batch
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    # Read the scalar once and reuse it for both the history and the running sum
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)

    # Backward pass
    loss.backward()

    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # ---------------- Validation ----------------

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()

  # Tracking variables: correct predictions and examples seen so far
  nb_eval_correct = 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:

    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():

      # Without labels, the model call returns the logits for the batch
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Convert logits and labels to numpy for the accuracy helper
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Weight each batch by its size: the last batch can be smaller than
    # batch_size, so averaging per-batch accuracies would over-weight it.
    n_batch = len(label_ids)
    nb_eval_correct += accuracy(logits, label_ids) * n_batch
    nb_eval_examples += n_batch
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(nb_eval_correct/nb_eval_examples))