In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !unzip /kaggle/input/quora-question-pairs/train.csv.zip

In [None]:
import torch
from transformers import AutoTokenizer
from tqdm import tqdm
import time
import random
import datetime

from sklearn import model_selection
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import BertModel

In [None]:
import numpy as np
import torch


class AverageMeter:
    """
    Running statistics tracker: remembers the most recent value together with
    the weighted sum, the total weight and the resulting mean of all values seen.
    """

    def __init__(self):
        # A fresh meter starts from the same state that reset() produces.
        self.reset()

    def reset(self):
        """Clear every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighted by `n` observations."""
        weighted = val * n
        self.val = val
        self.count += n
        self.sum += weighted
        # The mean is recomputed from the running totals on every update.
        self.avg = self.sum / self.count


class EarlyStopping:
    """
    Tracks a validation score across epochs, checkpoints the model whenever the
    score improves and raises the `early_stop` flag after `patience`
    consecutive epochs without improvement.

    Args:
        patience: number of consecutive non-improving calls tolerated before
            `early_stop` is set to True.
        mode: "min" when lower scores are better; any other value treats
            higher scores as better.
        delta: minimum gain (after sign normalisation) over the best score that
            counts as an improvement.
    """

    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # `np.Inf` was removed in NumPy 2.0; the lowercase spelling works everywhere.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """
        Register the score of one epoch.

        Saves `model.state_dict()` to `model_path` on improvement; otherwise
        increments the patience counter and sets `early_stop` once the
        counter reaches `patience`.
        """
        # Normalise so that "larger is better" regardless of the mode.
        score = float(epoch_score)
        if self.mode == "min":
            score = -score

        # NaN fails every comparison, so without this guard it would land in
        # the "improved" branch and replace the best score. A non-finite score
        # is therefore always treated as "no improvement".
        improved = np.isfinite(score) and (
            self.best_score is None or score >= self.best_score + self.delta
        )

        if improved:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0
        else:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, epoch_score, model, model_path):
        """Write the model weights to `model_path` if `epoch_score` is finite."""
        # np.isfinite rejects NaN as well as +/-inf; a membership test against
        # a list cannot detect NaN because NaN never compares equal.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score


def jaccard(str1, str2):
    """
    Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the result is
    |intersection| / |union| of the two token sets, a float in [0, 1].

    Returns 0.0 when neither string contains any token (the union is empty),
    instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # No tokens on either side: the ratio is undefined, report no overlap.
        return 0.0
    return float(len(a & b)) / union_size


In [None]:
class Settings:
    """
    Central configuration for the BERT question-pair experiment.

    Every value is a class attribute, so the class itself is used as a
    namespace and never needs to be instantiated.
    """

    # Hugging Face checkpoint name shared by the tokenizer and the model.
    checkpoint = "bert-base-uncased"
    # NOTE: the tokenizer is loaded as soon as this class body is executed.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # training data file
    # NOTE(review): this path is in the working directory, so the CSV presumably
    # has to be extracted from the competition archive before a fresh run — confirm.
    TRAIN_DATA = "/kaggle/working/train.csv"

    # test data file (the extension was previously truncated to ".cs")
    TEST_DATA = "/kaggle/input/quora-question-pairs/test.csv"

    # Use the GPU when one is available, otherwise fall back to the CPU.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # labels
    possible_labels = {'not_duplicate': 0, 'duplicate': 1}
    # number of outputs of the classification head (a single logit)
    num_labels = 1
    # dropout probability applied before the classification head
    dropout = 0.3
    # input width of the classification head
    input_dim = 768

    # maximum sequence length passed to the tokenizer
    max_len = 256

    # parameter-name fragments that are excluded from weight decay
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    seed_value = 42
    # fraction of the data held out for validation
    test_size = 0.2

    # file the best model weights are written to
    WEIGHTS_PATH = "text_similarity_model.bin"
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16
    EPOCHS = 10
    RANDOM_STATE = 42
    TRAIN_NUM_WORKERS = 4
    VAL_NUM_WORKERS = 2
    # early-stopping configuration
    patience = 4
    mode = "max"


In [None]:
class BERTDataset:
    """
    Map-style dataset that tokenizes a pair of sentences on access.

    Args:
        sentence_1: indexable collection holding the first sentence of each pair.
        sentence_2: indexable collection holding the second sentence of each pair.
        targets: indexable collection holding one label per pair.

    The three collections must have the same length.
    """

    def __init__(self, sentence_1, sentence_2, targets):
        self.settings = Settings
        self.sentence_1 = sentence_1
        self.sentence_2 = sentence_2
        self.targets = targets
        assert len(self.sentence_1) == len(self.sentence_2) == len(self.targets)

    def __len__(self):
        return len(self.sentence_1)

    def __getitem__(self, item):
        """
        Tokenize the pair at index `item`.

        Returns a dict of tensors with the keys `input_ids`,
        `attention_mask`, `token_type_ids` and `targets`.
        """
        s1 = self.sentence_1[item]
        s2 = self.sentence_2[item]
        target = self.targets[item]

        # `padding="max_length"` is the supported replacement for the
        # deprecated `pad_to_max_length=True`: every example is padded (and,
        # via `truncation=True`, cut) to exactly `max_len` tokens so the
        # default collate function can stack them into a batch.
        inputs = self.settings.tokenizer.encode_plus(
            s1, s2,
            add_special_tokens=True,
            max_length=self.settings.max_len,
            padding="max_length",
            return_attention_mask=True,
            truncation=True
        )

        return {
            'input_ids': torch.tensor(inputs["input_ids"], dtype=torch.long),
            'attention_mask': torch.tensor(inputs["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(target)
        }



In [None]:
class BERTClassifier(nn.Module):
    """
    BERT encoder followed by dropout and a single linear classification head.

    Args:
        freeze_params: when True, every parameter of the BERT encoder is
            frozen so that only the linear head is trained. The default
            (False) leaves the encoder trainable.
    """

    def __init__(self, freeze_params=False):
        super().__init__()
        self.settings = Settings
        # return_dict=False makes the encoder return a plain tuple.
        self.bert = BertModel.from_pretrained(self.settings.checkpoint, return_dict=False)

        # The condition used to be inverted (`if not freeze_params`), which
        # froze the encoder by default and un-froze it when freezing was
        # requested.
        if freeze_params:
            # freeze all the encoder parameters
            for param in self.bert.parameters():
                param.requires_grad = False

        self.bert_drop = nn.Dropout(self.settings.dropout)
        self.out = nn.Linear(self.settings.input_dim, self.settings.num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw logits of the head for a batch of encoded pairs."""
        # Only the second element of the encoder output feeds the head; the
        # first one is not used.
        _, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        dropped = self.bert_drop(pooled)
        return self.out(dropped)


In [None]:
class Engine:
    """
    Training and evaluation loops, plus small helpers, for the binary classifier.

    Both loops expect every batch to be a dict with the keys `input_ids`,
    `attention_mask`, `token_type_ids` and `targets`.
    """

    def __init__(self):
        pass

    def loss_fn(self, outputs, targets):
        """Binary cross-entropy on raw logits; targets are reshaped to one column."""
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def set_seed(self, seed_value=42):
        """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices)."""
        random.seed(seed_value)
        np.random.seed(seed_value)
        torch.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    def accuracy_threshold(self, y_pred, y_true, thresh: float = 0.5, sigmoid: bool = True):
        """
        Fraction of predictions that match `y_true` after thresholding.

        Args:
            y_pred: model outputs; passed through a sigmoid first when `sigmoid` is True.
            y_true: ground-truth labels with the same shape as `y_pred`.
            thresh: values strictly above this threshold count as the positive class.
            sigmoid: whether `y_pred` holds raw logits.

        Returns:
            The accuracy as a Python float.
        """
        if sigmoid:
            y_pred = y_pred.sigmoid()
        return ((y_pred > thresh) == y_true.byte()).float().mean().item()

    def train_fn(self, data_loader, model, optimizer, device, scheduler):
        """
        Run one training epoch.

        For every batch: forward pass, loss, backward pass, gradient clipping
        to a norm of 1.0, optimizer step and scheduler step.

        Returns:
            The mean of the per-batch losses (NaN when the loader is empty).
        """
        print("Starting training...\n")
        # Sum of the per-batch losses for this epoch.
        total_loss = 0.0
        t0_epoch = time.time()
        num_batches = len(data_loader)
        model.train()
        for step, data in tqdm(enumerate(data_loader), total=num_batches):
            # moving tensors to device
            b_input_ids = data['input_ids'].to(device)
            b_attn_mask = data['attention_mask'].to(device)
            b_labels = data['targets'].to(device)
            b_token_type_ids = data['token_type_ids'].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass; PyTorch accumulates gradients otherwise.
            model.zero_grad()

            logits = model(
                ids=b_input_ids,
                mask=b_attn_mask,
                token_type_ids=b_token_type_ids
            )

            loss = self.loss_fn(logits, b_labels.float())
            # `loss` is a tensor holding a single value; `.item()` extracts it
            # as a Python number so no graph is kept alive.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()
            # Clip the norm of the gradients to 1.0 to help prevent exploding gradients.
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update the parameters, then the learning rate.
            optimizer.step()
            scheduler.step()

            if step % 2500 == 0 and not step == 0:
                # Report progress together with the elapsed time.
                elapsed = self.format_time(time.time() - t0_epoch)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, num_batches, elapsed))

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_loss / num_batches if num_batches else float("nan")
        # Measure how long this epoch took.
        training_time = self.format_time(time.time() - t0_epoch)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        return avg_train_loss

    def eval_fn(self, data_loader, model, device):
        """
        Evaluate the model without computing gradients.

        Loss and accuracy are weighted by the number of samples in each batch,
        so a smaller final batch does not skew the reported averages.

        Returns:
            `(val_loss, val_accuracy)` as Python floats (NaN when the loader
            yields no samples).
        """
        print("Starting evaluation...\n")
        t0 = time.time()
        model.eval()
        loss_sum = 0.0
        accuracy_sum = 0.0
        num_samples = 0
        with torch.no_grad():
            for data in tqdm(data_loader, total=len(data_loader)):
                # moving tensors to device
                b_input_ids = data['input_ids'].to(device)
                b_attn_mask = data['attention_mask'].to(device)
                b_labels = data['targets'].to(device)
                b_token_type_ids = data['token_type_ids'].to(device)

                logits = model(
                    ids=b_input_ids,
                    mask=b_attn_mask,
                    token_type_ids=b_token_type_ids
                )
                loss = self.loss_fn(logits, b_labels.float())
                accuracy = self.accuracy_threshold(logits.view(-1, 1), b_labels.view(-1, 1))

                # Both values are batch means; scale them back to batch totals
                # so the final division yields a per-sample average.
                batch_size = b_labels.view(-1).size(0)
                loss_sum += loss.item() * batch_size
                accuracy_sum += accuracy * batch_size
                num_samples += batch_size

        if num_samples:
            val_loss = loss_sum / num_samples
            val_accuracy = accuracy_sum / num_samples
        else:
            val_loss = val_accuracy = float("nan")
        validation_time = self.format_time(time.time() - t0)

        print("  Average Validation Loss: {0:.2f}".format(val_loss))
        print("  Average Validation Accuracy: {0:.2f}".format(val_accuracy))
        print("  Validation took: {:}".format(validation_time))

        return val_loss, val_accuracy

    def format_time(self, elapsed):
        """
        Takes a time in seconds and returns a string hh:mm:ss
        """
        # Round to the nearest second.
        elapsed_rounded = int(round(elapsed))

        # Format as hh:mm:ss
        return str(datetime.timedelta(seconds=elapsed_rounded))


In [None]:
class Train:
    """
    End-to-end training pipeline: loads and splits the data, builds the model,
    optimizer and scheduler, then trains with early stopping.

    Call `run()` to execute the whole pipeline.
    """

    def __init__(self):
        # initialize required class
        self.settings = Settings
        self.engine = Engine()
        self.early_stopping = EarlyStopping(patience=self.settings.patience,
                                            mode=self.settings.mode)

        # initialize required variables
        self.bert_text_model = None
        self.optimizer = None
        self.scheduler = None
        self.train_data_loader = None
        self.val_data_loader = None
        self.total_steps = None
        self.param_optimizer = None
        self.optimizer_parameters = None

    def __initialize(self):
        """Build the model, the optimizer and the learning-rate scheduler.

        Must run after `__load_data`, which sets `total_steps`.
        """
        # Instantiate Bert Classifier
        self.bert_text_model = BERTClassifier()
        self.bert_text_model.to(self.settings.DEVICE)
        self.__optimizer_params()

        # Create the optimizer
        self.optimizer = AdamW(self.optimizer_parameters,
                               lr=5e-5,  # Default learning rate
                               eps=1e-8  # Default epsilon value
                               )

        # Set up the learning rate scheduler
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=0,  # Default value
                                                         num_training_steps=self.total_steps)

    def __optimizer_params(self):
        """Split the model parameters into a weight-decay and a no-decay group."""
        self.param_optimizer = list(self.bert_text_model.named_parameters())
        self.optimizer_parameters = [
            {
                # parameters whose name matches none of the `no_decay` fragments
                "params": [
                    p for n, p in self.param_optimizer if not any(nd in n for nd in self.settings.no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                # parameters whose name matches a `no_decay` fragment
                "params": [
                    p for n, p in self.param_optimizer if any(nd in n for nd in self.settings.no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

    def __create_data_loaders(self, sentence1, sentence2, targets, batch_size, num_workers):
        """Wrap the given sentence pairs and labels in a `DataLoader`."""
        dataset = BERTDataset(sentence_1=sentence1,
                              sentence_2=sentence2,
                              targets=targets)
        data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

        return data_loader

    def __load_data(self, csv_data_path):
        """
        Read the CSV, drop rows with missing values, make a stratified
        train/validation split and build both data loaders.

        Also sets `total_steps`, the number of optimizer steps over all epochs.
        """
        df = pd.read_csv(csv_data_path).dropna().reset_index(drop=True)
        df_train, df_valid = model_selection.train_test_split(
            df,
            random_state=self.settings.seed_value,
            test_size=self.settings.test_size,
            stratify=df.is_duplicate.values

        )

        df_train = df_train.reset_index(drop=True)
        df_valid = df_valid.reset_index(drop=True)

        # creating Data Loaders
        # train data loader
        self.train_data_loader = self.__create_data_loaders(sentence1=df_train.question1.values,
                                                            sentence2=df_train.question2.values,
                                                            targets=df_train.is_duplicate.values,
                                                            num_workers=self.settings.TRAIN_NUM_WORKERS,
                                                            batch_size=self.settings.TRAIN_BATCH_SIZE)

        # validation data loader
        self.val_data_loader = self.__create_data_loaders(sentence1=df_valid.question1.values,
                                                          sentence2=df_valid.question2.values,
                                                          targets=df_valid.is_duplicate.values,
                                                          num_workers=self.settings.VAL_NUM_WORKERS,
                                                          batch_size=self.settings.VALID_BATCH_SIZE)

        # The scheduler is stepped once per batch, so count the real number of
        # batches (including a partial final batch) instead of dividing the
        # row count by the batch size.
        self.total_steps = len(self.train_data_loader) * self.settings.EPOCHS

    def __train(self):
        """Alternate training and validation epochs until early stopping or `EPOCHS`."""
        for epochs in range(self.settings.EPOCHS):
            self.engine.train_fn(data_loader=self.train_data_loader,
                                 model=self.bert_text_model,
                                 optimizer=self.optimizer,
                                 device=self.settings.DEVICE,
                                 scheduler=self.scheduler)

            val_loss, val_accuracy = self.engine.eval_fn(data_loader=self.val_data_loader,
                                                         model=self.bert_text_model,
                                                         device=self.settings.DEVICE)

            print(f"Validation accuracy = {val_accuracy}")

            # Validation accuracy is the score monitored for early stopping.
            self.early_stopping(epoch_score=val_accuracy,
                                model=self.bert_text_model,
                                model_path=self.settings.WEIGHTS_PATH)

            if self.early_stopping.early_stop:
                print("Early stopping")
                break

    def run(self):
        """
        Execute the full pipeline: seed, load data, build the model, train.

        Raises:
            Exception: any error raised by a pipeline stage is reported and
                then re-raised so that a failed run is never mistaken for a
                successful one. `KeyboardInterrupt` and `SystemExit`
                propagate untouched.
        """
        try:
            # Seed before the model is created so that the initialisation of
            # the classification head is reproducible as well.
            self.engine.set_seed(self.settings.seed_value)
            print("Loading and Preparing the Dataset-----!! ")
            self.__load_data(csv_data_path=self.settings.TRAIN_DATA)
            print("Dataset Successfully Loaded and Prepared-----!! ")
            print()
            print("Loading and Initializing the Bert Model -----!! ")
            self.__initialize()
            print("Model Successfully Loaded and Initialized-----!! ")
            print()
            print("------------------Starting Training-----------!!")
            self.__train()
            print("Training complete-----!!!")

        except Exception as ex:
            print("Following Exception Occurred---!! ", str(ex))
            raise


In [None]:
# Build the training pipeline and launch the full run
# (data loading, model initialisation and the training loop).
t1 = Train()
t1.run()

### inference