# Quora Duplicate Questions Detection


In [None]:
from sklearn import model_selection

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

DIR = '/kaggle/input'

In [None]:
!unzip /kaggle/input/quora-question-pairs/train.csv.zip

In [None]:
!ls

In [None]:
# Load the raw pairs, add a placeholder fold column, and shuffle the rows once
# with a fixed seed so the fold assignment below is repeatable.
df = (
    pd.read_csv("train.csv")
    .assign(kfold=-1)
    .sample(frac=1.0, random_state=2021)
    .reset_index(drop=True)
)

# The rows are already shuffled above, so the splitter itself does not shuffle.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False)
targets = df["is_duplicate"].values

# Tag every row with the index of the fold in which it is validation data.
for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=targets)):
    print(len(train_idx), len(val_idx))
    df.loc[val_idx, "kfold"] = fold

# Fix `nan` in `train`

In [None]:
# Drop every row that has a missing value in any column (mutates `df` in place).
df.dropna(inplace=True)

In [None]:
df.question1.isna().sum(), df.question2.isna().sum(), df.question1.isnull().sum(), df.question2.isnull().sum()

In [None]:
# Persist the fold-tagged frame; index=False keeps the row index out of the file.
df.to_csv("train_folds.csv", index=False)

In [None]:
# Reload the saved folds from disk; `df_fold` is the frame used from here on.
df_fold = pd.read_csv("train_folds.csv")

# Check sentence length distribution

In [None]:
def sent_len(input_str: str):
    """Return the number of whitespace-separated words in ``input_str``.

    The argument is converted with ``str()`` first, so a non-string value
    (for example a float ``nan`` coming out of pandas) is counted by its
    string form instead of raising.

    ``str.split()`` is called without a separator on purpose: it collapses
    runs of whitespace and yields an empty list for blank input, whereas
    ``split(" ")`` reports one "word" for an empty string and an extra one
    for every doubled space.
    """
    return len(str(input_str).split())

In [None]:
# Word count of every first question, stored as a new column
df_fold["question1_len"] = [sent_len(question) for question in df_fold.question1.values.tolist()]

In [None]:
# Word count of every second question, stored as a new column
df_fold["question2_len"] = [sent_len(question) for question in df_fold.question2.values.tolist()]

In [None]:
df_fold.head()

In [None]:
df_fold.question1_len.plot.hist(bins=20);

In [None]:
df_fold.question2_len.plot.hist(bins=20);

# Load Universal Sentence Encoder

In [None]:
import tensorflow_hub as hub

In [None]:
# Load Universal Sentence Encoder v4 from TF Hub; `embed` is later called on lists of strings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
# Smoke test: run the loaded encoder on two sample sentences.
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])



In [None]:
import torch

## Ensure reproducibility

In [None]:
#Reproducing same results
import random

SEED = 2021

# Seed every random number generator that can influence a run, not only
# PyTorch's: Python's `random` and NumPy keep their own independent state.
random.seed(SEED)
np.random.seed(SEED)

#Torch
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

#Cuda algorithms
# Deterministic kernels alone are not enough: the cuDNN auto-tuner may pick a
# different algorithm on each run, so it is switched off as well.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
import torch.nn as nn

In [None]:
BATCH_SIZE = 256

# Design Train Dataloader

In [None]:
from torch.utils.data import DataLoader, Dataset

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
class QuoraTrainData(Dataset):
    """Map-style dataset yielding one question pair and its label per index.

    The wrapped frame is read through its ``question1``, ``question2`` and
    ``is_duplicate`` columns; rows are addressed by position (``iloc``),
    not by index label.
    """

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to ``df``; no copy is made."""
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with keys ``q1``, ``q2`` and ``label`` for row ``idx``."""
        # Fetch the row once: each ``iloc[idx]`` builds a new row object, so
        # three separate lookups did the same work three times per sample.
        row = self.df.iloc[idx]
        return {"q1": row.question1, "q2": row.question2, "label": row.is_duplicate}

In [None]:

# For each validation fold id, the ids of the remaining folds used for training.
FOLD_MAPPPING = {
    held_out: [fold_id for fold_id in range(5) if fold_id != held_out]
    for held_out in range(5)
}

In [None]:
FOLD = 0

In [None]:
# Training rows come from the folds mapped to FOLD; validation rows are fold FOLD itself.
train_df = df_fold.loc[df_fold["kfold"].isin(FOLD_MAPPPING.get(FOLD))].reset_index(drop=True)
valid_df = df_fold.loc[df_fold["kfold"] == FOLD].reset_index(drop=True)

In [None]:
train_df.shape, valid_df.shape

In [None]:
valid_df.head()

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_dataset = QuoraTrainData(df=train_df)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)

In [None]:
# Validation data is only scored, never trained on, so keep a fixed order:
# the batches (and the per-batch averages computed from them) are then the
# same on every pass instead of depending on a fresh shuffle.
valid_dataset = QuoraTrainData(valid_df)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Check train dataset

In [None]:
# Training example at position 5
train_dataset[5]

In [None]:
# Validation example at position 5
valid_dataset[5]

## Check train dataloader

In [None]:
# Pull a single batch from the loader to inspect what the model will receive.
# The built-in next() is used because loader iterators in current PyTorch
# releases implement only the standard iterator protocol (no public .next()).
train_iter = iter(train_loader)
res = next(train_iter)

# Design Model

## Simple multilayer perceptron - no nonlinearity 

In [None]:
class IsDuplicate(nn.Module):
    def __init__(self, output_dim: int, emb_dim: int, hid_dim=512):
        """Two stacked linear layers over the concatenated pair embedding.

        No activation sits between the two layers, so the network is linear
        up to the final sigmoid.
        """
        super().__init__()
        # the input is both sentence embeddings side by side
        self.fc1 = nn.Linear(emb_dim * 2, hid_dim)
        self.fc2 = nn.Linear(hid_dim, output_dim)
        # applied element-wise to the output units
        self.act = nn.Sigmoid()

    def forward(self, text1:[str], text2:[str]):
        """
        text1: list of strings from question1, len: batch_size
        text2: list of strings from question2, len: batch_size
        """
        # encode each side with the module-level `embed` and move it to `device`
        encoded = [
            torch.from_numpy(embed(sentences).numpy()).to(device)
            for sentences in (text1, text2)
        ]

        # join the two embeddings along the feature axis
        merged = torch.cat(encoded, dim=1)

        return self.act(self.fc2(self.fc1(merged)))

## Adding deeper non-linear model

- The architecture idea comes from [here](https://www.linkedin.com/pulse/duplicate-quora-question-abhishek-thakur/). The original design is heavily simplified to the structure below by using transfer learning with the `Universal Sentence Encoder`.

<center>
<img src='https://raw.githubusercontent.com/msank00/Kaggle_202101_Quora_Duplicate_Questions/main/images/NN_Architecture.jpg' width='400'>    
</center>

In [None]:
class IsDuplicateAdv(nn.Module):
    def __init__(self, output_dim: int, emb_dim: int, hid_dim=512):
        """Pair classifier: batch norm, one hidden layer with PReLU and dropout,
        a second batch norm, and a sigmoid output layer.
        """
        super().__init__()
        pair_dim = emb_dim * 2  # both sentence embeddings concatenated

        self.batchnorm1 = nn.BatchNorm1d(pair_dim)
        self.dropout = nn.Dropout(p=0.2)
        self.nonlinear = nn.PReLU()

        self.fc1 = nn.Linear(pair_dim, hid_dim)
        self.batchnorm2 = nn.BatchNorm1d(hid_dim)
        self.fc2 = nn.Linear(hid_dim, output_dim)

        # applied element-wise to the output units
        self.act = nn.Sigmoid()

    def forward(self, text1:[str], text2:[str]):
        """
        text1: list of strings from question1, len: batch_size
        text2: list of strings from question2, len: batch_size
        """
        # encode both sides with the module-level `embed` and move them to `device`
        first = torch.from_numpy(embed(text1).numpy()).to(device)
        second = torch.from_numpy(embed(text2).numpy()).to(device)

        # normalise the merged pair representation
        features = self.batchnorm1(torch.cat((first, second), dim=1))

        # hidden layer: linear -> PReLU -> dropout -> batch norm
        hidden = self.batchnorm2(self.dropout(self.nonlinear(self.fc1(features))))

        return self.act(self.fc2(hidden))

## Test model

In [None]:
# Instantiate the non-linear model on `device` and run it once on the sample
# batch `res` to check that the forward pass works.
model = IsDuplicateAdv(output_dim=2, emb_dim=512).to(device)

prob = model(text1= res['q1'], text2 = res['q2'])

In [None]:
print(model)

## Model parameters

In [None]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a per-tensor table of trainable parameter counts and return their total."""
    table = PrettyTable(["Modules", "Parameters"])

    # (name, element count) for every parameter that takes part in training
    trainable = [
        (name, tensor.numel())
        for name, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

    total_params = 0
    for name, size in trainable:
        table.add_row([name, size])
        total_params += size

    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(model)

In [None]:
import torch.optim as optim

# Binary cross-entropy on probabilities, optimised with Adam over all model parameters
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

# Why is `BCEWithLogitsLoss()` more stable than `Sigmoid()` + `BCELoss()`?

> `Sigmoid()` + `BCELoss()` = `BCEWithLogitsLoss()`



- [Ans](https://discuss.pytorch.org/t/bce-loss-vs-cross-entropy/97437/2)



In [None]:
#define metric
def binary_accuracy(preds, y):
    """Return the fraction of examples whose thresholded probability equals ``y``.

    preds: probabilities, shape [batch] or [batch, n_outputs]. For 2-D input
        the column at index 1 is taken as the positive-class probability,
        which is the column the loss is computed on in ``train``/``evaluate``.
    y: 0/1 targets, shape [batch].

    An ``argmax`` over the columns is deliberately not used: the output units
    are independent sigmoids and only column 1 receives a loss signal, so
    comparing it against column 0 does not measure the model's decision.
    For properly normalised two-class probabilities the result is the same
    as ``argmax``.
    """
    positive_prob = preds[:, 1] if preds.dim() > 1 else preds

    # strict ">" sends an exact 0.5 to class 0, matching argmax's tie-breaking
    predicted = (positive_prob > 0.5).to(y.dtype)

    correct = (predicted == y).float()
    acc = correct.sum() / len(correct)
    return acc
    
#push to cuda if available
# `device` is the torch.device selected earlier in the notebook.
model = model.to(device)
criterion = criterion.to(device)

## Train

In [None]:
def train(model, train_data_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_data_loader``.

    Returns a pair: the mean of the per-batch losses and the mean of the
    per-batch accuracies.
    """
    # put the model in training mode
    model.train()

    running_loss = 0
    running_acc = 0
    n_batch = len(train_data_loader)

    for i, batch in enumerate(train_data_loader):

        # gradients accumulate across backward calls, so clear them per batch
        optimizer.zero_grad()

        target = batch['label'].to(device)
        predictions = model(batch['q1'], batch['q2'])

        # the loss is taken on output column 1 only
        loss = criterion(predictions[:, 1], target.float())
        acc = binary_accuracy(predictions, target.float())

        # backpropagate, then apply the parameter update
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_acc = acc.item()
        running_loss += batch_loss
        running_acc += batch_acc

        # progress line every 100 batches
        if i % 100 == 0:
            print(f"\t\t\t > trn batch_no: {i}/{n_batch}, batch_loss: {np.round(batch_loss, 4)}, batch_acc: {np.round(batch_acc, 4)}")

    return running_loss / n_batch, running_acc / n_batch

## Evaluate

In [None]:
def evaluate(model, valid_data_loader, criterion):
    """Score ``model`` on ``valid_data_loader`` without updating it.

    Returns a pair: the mean of the per-batch losses and the mean of the
    per-batch accuracies.
    """
    # put the model in evaluation mode (e.g. dropout is inactive)
    model.eval()

    running_loss = 0
    running_acc = 0
    n_batch = len(valid_data_loader)

    # no gradients are needed for scoring
    with torch.no_grad():
        for i, batch in enumerate(valid_data_loader):

            target = batch['label'].to(device)
            predictions = model(batch['q1'], batch['q2'])

            # the loss is taken on output column 1 only
            batch_loss = criterion(predictions[:, 1], target.float()).item()
            batch_acc = binary_accuracy(predictions, target.float()).item()

            running_loss += batch_loss
            running_acc += batch_acc

            # progress line every 50 batches
            if i % 50 == 0:
                print(f"\t\t\t > val batch_no: {i}/{n_batch}, batch_loss: {np.round(batch_loss,4)}, batch_acc: {np.round(batch_acc, 4)}")

    return running_loss / n_batch, running_acc / n_batch


## Train Loop

In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

# one list per tracked curve, appended to once per epoch
history = {
    metric_name: []
    for metric_name in ("train_loss", "train_acc", "valid_loss", "valid_acc")
}

for epoch in range(N_EPOCHS):

    # one optimisation pass, then one scoring pass on the held-out fold
    raw_train = train(model, train_loader, optimizer, criterion)
    raw_valid = evaluate(model, valid_loader, criterion)

    # keep four decimals; the rounded values are what is stored and compared
    train_loss, train_acc = (np.round(value, 4) for value in raw_train)
    valid_loss, valid_acc = (np.round(value, 4) for value in raw_valid)

    history["train_loss"].append(train_loss)
    history["train_acc"].append(train_acc)
    history["valid_loss"].append(valid_loss)
    history["valid_acc"].append(valid_acc)

    # checkpoint whenever the validation loss improves on the best seen so far
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    print(f'Epoch: {epoch+1}/{N_EPOCHS} \tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
history

# Plot training performance

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Loss curves for both splits, one point per epoch
for curve_label, history_key in (("train", "train_loss"), ("val", "valid_loss")):
    plt.plot(history[history_key], label=curve_label)
plt.title("Loss vs Epoch")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.grid(alpha=0.3)
plt.legend()
plt.show()

In [None]:
# Accuracy curves for both splits, one point per epoch
for curve_label, history_key in (("train", "train_acc"), ("val", "valid_acc")):
    plt.plot(history[history_key], label=curve_label)
plt.title("Accuracy vs Epochs")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.grid(alpha=0.3)
plt.legend()
plt.show()