# Quora Duplicate Questions Detection

- This notebook is a refactored version of [this original kernel](https://www.kaggle.com/sankarshan7/quora-duplicate-question?scriptVersionId=51988650).
- The [original](https://www.kaggle.com/sankarshan7/quora-duplicate-question?scriptVersionId=51988650) `pytorch` kernel is refactored to integrate with `PyTorchLightning`.
- Some of the `PyTorchLightning` refactoring style has been taken from this kernel: [Lish-moa baseline approach by Andrew Lukyanenko](https://www.kaggle.com/artgor/lish-moa-baseline-approach/notebook#Data-exploration)

In [None]:
from sklearn import model_selection

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

DIR = '/kaggle/input'

## Data Cleaning and preparation

In [None]:
!ls -lh /kaggle/input/quora-question-pairs/

In [None]:
!unzip /kaggle/input/quora-question-pairs/sample_submission.csv.zip

In [None]:
!unzip /kaggle/input/quora-question-pairs/train.csv.zip

In [None]:
!unzip /kaggle/input/quora-question-pairs/test.csv.zip

In [None]:
!ls -lh

In [None]:
df_sub = pd.read_csv("sample_submission.csv")

In [None]:
df_sub.shape

In [None]:
#df_sub[df_sub.test_id.isin([1046690, 1461432, 379205, 817520, 943911, 1270024,  2345796])]

In [None]:
df_test = pd.read_csv("test.csv")

In [None]:
df_sub.shape, df_test.shape

## Fix `nan` issue in Test data

In [None]:
import numpy as np
# Fill missing values with the literal string 'nan' instead of dropping rows,
# so the test frame keeps its original number of rows.
# `fillna` is the direct way to do this; no regex matching is involved.
df_test = df_test.fillna('nan')

In [None]:
df_test.question1.isna().sum(), df_test.question2.isna().sum()

## Create validation dataset

In [None]:
# Load the training pairs and add a fold-id column (-1 = not assigned yet).
df = pd.read_csv("train.csv")
df['kfold'] = -1

# Shuffle all rows once with a fixed seed and renumber the index from 0, so the
# positional indices returned by the splitter can be used as labels in `.loc`.
df = df.sample(frac=1., random_state=2021)
df = df.reset_index(drop=True)

# The rows are already shuffled above, hence shuffle=False here.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

# Stratify on the target; each row's `kfold` is the fold in which it is held out.
for fold, (train_idx, val_idx) in enumerate(kf.split(df, df.is_duplicate.values)):
    print(len(train_idx), len(val_idx))
    df.loc[val_idx, 'kfold'] = fold

In [None]:
df.shape

In [None]:
df[df.question1.isna()]

In [None]:
df[df.question2.isna()]

## Fix `nan` issue in `train` data

In [None]:
df.dropna(inplace=True)

In [None]:
df.question1.isna().sum(), df.question2.isna().sum(), df.question1.isnull().sum(), df.question2.isnull().sum()

In [None]:
df.to_csv("train_folds.csv", index=False)

In [None]:
df_fold = pd.read_csv("train_folds.csv")

# Load Universal Sentence Encoder

In [None]:
import tensorflow_hub as hub

In [None]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

embeddings

In [None]:
import torch

In [None]:
# Reproducibility: seed every random number generator the notebook can touch,
# not only PyTorch's.
import random

SEED = 2021

random.seed(SEED)        # Python's built-in RNG
np.random.seed(SEED)     # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch RNG

# cuDNN: use deterministic kernels and disable the auto-tuner, which may pick
# different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
import torch.nn as nn

In [None]:
BATCH_SIZE = 256

In [None]:
import pytorch_lightning as pl

In [None]:

# For each validation fold id, the list of the remaining fold ids used for training.
FOLD_MAPPPING = {
    held_out: [fold_id for fold_id in range(5) if fold_id != held_out]
    for held_out in range(5)
}

In [None]:
FOLD = 0

In [None]:
# Rows whose fold id is listed for FOLD form the training set;
# rows belonging to FOLD itself form the validation set.
train_df = df_fold.loc[df_fold["kfold"].isin(FOLD_MAPPPING.get(FOLD))].reset_index(drop=True)
valid_df = df_fold.loc[df_fold["kfold"] == FOLD].reset_index(drop=True)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Network Architecture

- Before integrating with `pytorchlightning`, let's design the network in vanilla PyTorch

- The original architecture idea came from [here](https://www.linkedin.com/pulse/duplicate-quora-question-abhishek-thakur/). But the original architecture is heavily simplified with the use of transfer learning using `Universal Sentence Encoder`

<center>
<img src='https://raw.githubusercontent.com/msank00/Kaggle_202101_Quora_Duplicate_Questions/main/images/NN_Architecture.jpg' width='400'>    
</center>

In [None]:
class IsDuplicateAdv(nn.Module):
    """Feed-forward classifier on top of sentence embeddings of a question pair.

    Both questions are embedded with the module-level ``embed`` callable, the two
    embeddings are concatenated, and the result goes through
    BatchNorm -> Linear -> PReLU -> Dropout -> BatchNorm -> Linear -> Sigmoid.
    """

    def __init__(self, output_dim: int, emb_dim: int, hid_dim=512):
        """Build the layers.

        output_dim: number of output units of the final linear layer.
        emb_dim: size of a single sentence embedding (the network input is
            ``2 * emb_dim`` wide because the two embeddings are concatenated).
        hid_dim: width of the hidden layer.
        """
        super().__init__()

        # Normalisation / regularisation / non-linearity
        self.batchnorm1 = nn.BatchNorm1d(emb_dim * 2)
        self.dropout = nn.Dropout(p=0.2)
        self.nonlinear = nn.PReLU()

        # Dense layers
        self.fc1 = nn.Linear(emb_dim * 2, hid_dim)
        self.batchnorm2 = nn.BatchNorm1d(hid_dim)
        self.fc2 = nn.Linear(hid_dim, output_dim)

        # Final activation: an independent sigmoid on every output unit
        self.act = nn.Sigmoid()

    def forward(self, text1: "list[str]", text2: "list[str]"):
        """Score a batch of question pairs.

        text1: list of strings from question1, len: batch_size
        text2: list of strings from question2, len: batch_size

        Returns a tensor with one row per pair holding the sigmoid outputs.
        """
        # The encoder output is converted through NumPy, so it enters the graph
        # as a plain CPU tensor (no gradient reaches the encoder). Move it to
        # wherever this module's weights live so the model also works on a GPU.
        device = self.fc1.weight.device

        e1 = torch.from_numpy(embed(text1).numpy()).to(device)
        e2 = torch.from_numpy(embed(text2).numpy()).to(device)

        # Concatenate the two sentence embeddings feature-wise
        x = torch.cat((e1, e2), dim=1)
        x = self.batchnorm1(x)

        x = self.fc1(x)
        x = self.nonlinear(x)
        x = self.dropout(x)
        x = self.batchnorm2(x)

        x = self.fc2(x)

        # Final activation function
        return self.act(x)

In [None]:
import torch.optim as optim
criterion = nn.BCELoss()

In [None]:
# Evaluation metric
def binary_accuracy(preds, y):
    """Return the fraction of rows whose highest-scoring column matches the label.

    preds: 2-D tensor of scores, one row per sample and one column per class.
    y: tensor of class indices, one per sample.
    """
    # The predicted class is the column index holding the largest score.
    predicted_class = preds.argmax(dim=1)

    hits = predicted_class.eq(y).float()
    return hits.sum() / len(hits)

## Wrap vanilla `pytorch` network with `pytorchlightning`

In [None]:
class QuoraQPair(pl.LightningModule):
    """Lightning wrapper that trains and validates the wrapped duplicate-question network."""

    def __init__(self, model: IsDuplicateAdv):
        super().__init__()
        self.model = model

    def forward(self, text1:[str], text2:[str]):
        """Delegate to the wrapped network."""
        return self.model(text1, text2)

    def configure_optimizers(self):
        """Adam over the wrapped network's parameters."""
        return optim.Adam(self.model.parameters(), lr=0.0001)

    def _score_batch(self, batch):
        """Run the network on one batch and return ``(loss, accuracy)``."""
        q1, q2 = batch['q1'], batch['q2']
        target = batch['label'].float()
        scores = self.model(q1, q2)
        # The loss is computed on column 1 of the network output only.
        batch_loss = criterion(scores[:, 1], target)
        batch_acc = binary_accuracy(scores, target)
        return batch_loss, batch_acc

    def training_step(self, batch, batch_idx: int):
        """Compute and log loss/accuracy for one training batch."""
        loss, acc = self._score_batch(batch)
        log_opts = dict(on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        self.log('train_loss', loss, **log_opts)
        self.log('train_acc', acc, **log_opts)
        return {'loss': loss, 'acc': acc}

    def training_epoch_end(self, outputs):
        """Log the mean of the per-step training loss and accuracy."""
        epoch_loss = torch.stack([step['loss'] for step in outputs]).mean()
        epoch_acc = torch.stack([step['acc'] for step in outputs]).mean()
        log_opts = dict(on_step=False, on_epoch=True, prog_bar=False, logger=True, sync_dist=True)
        self.log('avg_train_loss', epoch_loss, **log_opts)
        self.log('avg_train_acc', epoch_acc, **log_opts)

    def validation_step(self, batch, batch_idx: int):
        """Compute and log loss/accuracy for one validation batch."""
        loss, acc = self._score_batch(batch)
        log_opts = dict(on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('valid_loss', loss, **log_opts)
        self.log('valid_acc', acc, **log_opts)
        return {'loss': loss, 'acc': acc}

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-step validation loss and accuracy."""
        epoch_loss = torch.stack([step['loss'] for step in outputs]).mean()
        epoch_acc = torch.stack([step['acc'] for step in outputs]).mean()
        log_opts = dict(on_step=False, on_epoch=True, prog_bar=False, logger=True, sync_dist=True)
        self.log('avg_val_loss', epoch_loss, **log_opts)
        self.log('avg_val_acc', epoch_acc, **log_opts)
        
        

**NOTE**

- Set `on_step=False` for better logging 

# Data Module

- [pl.DataModule Official Document](https://pytorch-lightning.readthedocs.io/en/stable/datamodules.html)
- [How to use it in real case - Kaggle MoA Prediction by Andrew Lukyanenko](https://www.kaggle.com/artgor/lish-moa-baseline-approach)

In [None]:
from torch.utils.data import DataLoader, Dataset

In [None]:
class QuoraTrainData(Dataset):
    """Map-style dataset over a frame with `question1`, `question2`, `is_duplicate` columns."""

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the question pair and its label at position `idx` as a dict."""
        # Fetch the row once instead of re-materialising it for every field.
        row = self.df.iloc[idx]
        return {
            "q1": row["question1"],
            "q2": row["question2"],
            "label": row["is_duplicate"],
        }

In [None]:
class QuoraTestData(Dataset):
    """Map-style dataset over a frame with `question1` and `question2` columns (no label)."""

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the question pair at position `idx` as a dict."""
        # Fetch the row once instead of re-materialising it for every field.
        row = self.df.iloc[idx]
        return {"q1": row["question1"], "q2": row["question2"]}

In [None]:
class QuoraQPairDataModule(pl.LightningDataModule):
    """Data module serving the training and validation frames as DataLoaders."""

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, batch_size: int):
        super().__init__()
        self.batch_size = batch_size
        self.train_df, self.valid_df = train_df, valid_df

    def prepare_data(self):
        """Nothing to download or preprocess; the frames are passed in ready-made."""
        return None

    def setup(self, stage=None):
        """Wrap both frames in torch datasets."""
        self.train_dataset = QuoraTrainData(self.train_df)
        self.valid_dataset = QuoraTrainData(self.valid_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,
        )

    def val_dataloader(self):
        """Loader over the validation dataset, in dataset order."""
        return DataLoader(
            self.valid_dataset,
            batch_size=self.batch_size,
            num_workers=4,
        )

    def test_dataloader(self):
        """No test loader is provided by this module."""
        return None

In [None]:
from pytorch_lightning import Callback
class MetricsCallback(Callback):
    """Callback that collects the trainer's logged metrics under "train" and "val"."""

    def __init__(self):
        super().__init__()
        self.metrics = dict(train=[], val=[])

    def _record(self, phase, trainer):
        """Append the trainer's current `logged_metrics` to the list for `phase`."""
        self.metrics[phase].append(trainer.logged_metrics)

    def on_validation_end(self, trainer, pl_module):
        self._record("val", trainer)

    def on_train_end(self, trainer, pl_module):
        self._record("train", trainer)

In [None]:
net = IsDuplicateAdv(output_dim=2, emb_dim=512)
model = QuoraQPair(net)
dm = QuoraQPairDataModule(train_df, valid_df, BATCH_SIZE)

## Set logger for accessing training history

- [PyTorch Lightning CSVLogger](https://pytorch-lightning.readthedocs.io/en/latest/generated/pytorch_lightning.loggers.CSVLogger.html)

In [None]:
from pytorch_lightning.loggers import CSVLogger

In [None]:
# logger
import os
csvlogger = CSVLogger(
    save_dir=os.getcwd(),
    name="exp_logs"
)
os.getcwd()

In [None]:
metrics_callback = MetricsCallback()

In [None]:
trainer = pl.Trainer(max_epochs=5,
                     default_root_dir=os.getcwd(),
                     logger=csvlogger,
                     deterministic=True) # callbacks = [metrics_callback]

In [None]:
trainer.fit(model, dm)

# How to log metrics properly using PyTorchLightning

- [Why are losses different when logging from '_step' (with on_epoch=True) compared to logging from '_epoch_end'? #5539](https://github.com/PyTorchLightning/pytorch-lightning/issues/5539)
- [Understanding different values of training/validation loss in callback_metrics dictionary](https://forums.pytorchlightning.ai/t/understanding-different-values-of-training-validation-loss-in-callback-metrics-dictionary/568)

In [None]:
ls -lh exp_logs/

In [None]:
# Read the metrics file from the directory this run's CSVLogger actually wrote to.
# A hard-coded `version_0` would load stale results once the logger has moved on
# to a later `version_N` directory (e.g. after re-running the training cells).
df_metrics = pd.read_csv(os.path.join(csvlogger.log_dir, 'metrics.csv'))

In [None]:
df_metrics

In [None]:
df_metrics_val = df_metrics[["avg_val_loss", "avg_val_acc", "epoch"]].dropna()
df_metrics_train = df_metrics[["avg_train_loss", "avg_train_acc", "epoch"]].dropna()

In [None]:
df_metrics_val.avg_val_loss.values

# Training performance

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Per-epoch average loss: training curve first, then validation.
for curve, tag in (
    (df_metrics_train.avg_train_loss.values, "train"),
    (df_metrics_val.avg_val_loss.values, "val"),
):
    plt.plot(curve, label=tag)

plt.title("Loss vs Epoch")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.grid(alpha=0.3)
plt.legend()
plt.show()

In [None]:
# Per-epoch average accuracy: training curve first, then validation.
for curve, tag in (
    (df_metrics_train.avg_train_acc.values, "train"),
    (df_metrics_val.avg_val_acc.values, "val"),
):
    plt.plot(curve, label=tag)

plt.title("Accuracy vs Epochs")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.grid(alpha=0.3)
plt.legend()
plt.show()

# Prepare test data

In [None]:
df_test.head()

In [None]:
test_dataset = QuoraTestData(df_test)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=4)

# Inference

In [None]:
predictions = np.zeros(df_test.shape[0])
model_inference = model.model
model_inference.eval()

In [None]:
# Pull a single batch from the test loader for a quick sanity check.
# Use the built-in next(): iterators are not guaranteed to expose a `.next()` method.
test_iter = iter(test_loader)
tres = next(test_iter)

In [None]:
df_test.head()

In [None]:
temp = model_inference(tres['q1'], tres['q2'])[:,1].detach().cpu().numpy()
temp

In [None]:
predictions

In [None]:
from tqdm import tqdm

In [None]:
# Score the whole test set batch by batch, filling `predictions` in order.
# The write position is tracked from the actual batch lengths, so the loop does
# not depend on a batch size repeated here as a literal.
offset = 0
with torch.no_grad():  # inference only: no autograd bookkeeping needed
    for batch in tqdm(test_loader, total=len(test_loader)):
        # Column 1 of the network output is used as the duplicate score.
        p = model_inference(batch['q1'], batch['q2'])[:, 1].cpu().numpy()
        predictions[offset:offset + len(p)] = p
        offset += len(p)

In [None]:
predictions.shape

In [None]:
s = pd.DataFrame({'test_id': df_test['test_id'].values, 'is_duplicate': predictions})

In [None]:
s.head()

In [None]:
s.to_csv("submission.csv", index=False)

In [None]:
#s.shape
#s[s.test_id.isin([1128118])]
#df_sub.shape
#df_sub.head()
#df_sub[df_sub.test_id.isin([1128118,1128119 ])]