In [None]:
!pip install transformers
!pip install datasets
!pip install transformers[sentencepiece]

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition's input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input/contradictory-my-dear-watson/'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Imports and declarations
import torch
import gc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import GradScaler, autocast
from transformers import AutoTokenizer, AutoModel, logging, AdamW, get_scheduler
from datasets import load_dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
os.environ["TOKENIZERS_PARALLELISM"] = "true"
%matplotlib inline
logging.set_verbosity_warning()
logging.set_verbosity_error()

In [None]:
# preprocessing function
def preprocess(data, features, label):
    """Split a DataFrame into a reduced frame and one of its columns.

    Args:
        data: pandas DataFrame to split.
        features: column label(s) to remove from ``data``.
        label: name of the column to return on its own.

    Returns:
        A tuple ``(data without the ``features`` columns, data[label])``.
    """
    remaining = data.drop(columns=features)
    target = data[label]
    return remaining, target


# load the train data and have a look
# The frame is named train_df (not train) so that it does not share a name with the
# train() function defined later in this notebook; re-running this cell can therefore
# no longer replace that function with a DataFrame.
train_df = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
features, labels = preprocess(train_df, ['label', 'lang_abv', 'id', 'language'], 'label')

In [None]:
# take a look at the features and labels
features.head()

In [None]:
labels.head()

In [None]:
# dataset class in pytorch
class BertyDataset(Dataset):
    """Map-style dataset that tokenizes premise/hypothesis pairs on access.

    Args:
        features: DataFrame with 'premise' and 'hypothesis' columns. Rows are looked
            up with ``.loc[idx]``, so the index must contain the requested ``idx``.
        max_length: length every encoded pair is padded/truncated to.
        bert_model: stored as ``self.model``; the dataset itself never calls it.
        tokenizer: callable invoked as ``tokenizer(sent1, sent2, ...)`` that returns a
            mapping with 'input_ids', 'attention_mask' and 'token_type_ids' tensors.
        labels: ``None`` for unlabeled data. Otherwise a Series of labels when
            ``indexes`` is True, or a DataFrame with a 'label' column when it is False.
        indexes: selects how ``labels`` is read (see ``labels``).
    """

    def __init__(self, features, max_length, bert_model, tokenizer, labels=None, indexes=False):
        self.features = features
        self.labels = labels
        self.max_length = max_length
        self.model = bert_model
        self.tokenizer = tokenizer
        self.indexes = indexes

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.features)

    def __getitem__(self, idx):
        """Encode the pair at ``idx``.

        Returns:
            ``(input_ids, attention_mask, token_type_ids)`` when no labels were given,
            otherwise the same three tensors followed by the label tensor.
        """
        # select the sentences at the specified index (idx)
        sent1 = str(self.features.loc[idx, 'premise'])
        sent2 = str(self.features.loc[idx, 'hypothesis'])

        # tokenize the sentences
        batch = self.tokenizer(
            sent1, sent2,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        # the tokenizer returns tensors with a leading batch axis of size 1; drop it
        encoded = (
            batch['input_ids'].squeeze(0),
            batch['attention_mask'].squeeze(0),
            batch['token_type_ids'].squeeze(0),
        )

        # Unlabeled data: the previous check `type(self.labels) == None` could never be
        # true, so this branch was unreachable and unlabeled datasets crashed below.
        if self.labels is None:
            return encoded

        if self.indexes:
            label = self.labels.loc[idx]
        else:
            label = self.labels.loc[idx, 'label']

        return encoded + (torch.tensor(label).squeeze(0),)

In [None]:
# the encoder and its tokenizer are loaded from the same pretrained checkpoint
checkpoint = 'joeddav/xlm-roberta-large-xnli'

# initialize the pretrained model
roberta = AutoModel.from_pretrained(checkpoint)

# initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# load mnli dataset
def load_mnli():
    """Load the MNLI training split into a DataFrame.

    Records with an empty premise or hypothesis, or with a label outside {0, 1, 2},
    are skipped. Every kept row is tagged with the language code 'en'.

    Returns:
        DataFrame with columns 'premise', 'hypothesis', 'label', 'lang_abv'.
    """
    corpus = load_dataset(path="glue", name="mnli")
    valid_labels = {0, 1, 2}

    rows = [
        (example['premise'], example['hypothesis'], example['label'], 'en')
        for split in ('train',)
        for example in corpus[split]
        if example['premise'] and example['hypothesis'] and example['label'] in valid_labels
    ]

    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

# load xnli dataset
def load_xnli():
    """Load every split of XNLI ('all_languages') into one flat DataFrame.

    Each record carries a hypothesis mapping with parallel 'language' and
    'translation' lists, and a premise mapping keyed by language code. One row is
    produced per hypothesis language that also has a premise; records with an empty
    hypothesis/premise or a label outside {0, 1, 2} are skipped.

    Returns:
        DataFrame with columns 'premise', 'hypothesis', 'label', 'lang_abv'.
    """
    splits = load_dataset('xnli', 'all_languages')
    valid_labels = {0, 1, 2}
    rows = []

    for _, examples in splits.items():
        for example in examples:
            hypotheses = example['hypothesis']
            premises = example['premise']
            target = example['label']

            if not (hypotheses and premises and target in valid_labels):
                continue

            for code, text in zip(hypotheses['language'], hypotheses['translation']):
                premise_text = premises.get(code, None)
                # keep only languages for which a premise exists
                if premise_text is not None:
                    rows.append((premise_text, text, target, code))

    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])


# keep the rows with index labels 0..24999 (25 000 samples) of each corpus
# and drop the lang_abv column from both
mnli = load_mnli().loc[:24999].drop(columns=['lang_abv'])
xnli = load_xnli().loc[:24999].drop(columns=['lang_abv'])

In [None]:
# take a look at mnli and xnli datasets
print('MNLI dataset: ', mnli.shape)
mnli.head()

In [None]:
print('XNLI dataset: ', xnli.shape)
xnli.head()

In [None]:
# stack the MNLI and XNLI samples row-wise into a single "master" dataset
master_data = pd.concat((mnli, xnli))
# take a look at the dataset
print('THE MASTER DATA: ', master_data.shape)
master_data.head()

In [None]:
# hold out 20% of the merged data for validation; the fixed seed keeps the split reproducible
x_train, x_val, y_train, y_val = train_test_split(
    master_data.drop(columns=['label']),
    master_data['label'],
    test_size=0.2,
    random_state=2022,
)

In [None]:
# take a look at the training features and labels
print('X train: ', x_train.shape)
x_train.head()

In [None]:
print('Y train: ', y_train.shape)
y_train.head()

In [None]:
# create dataset and dataloader with the merged dataset for training and validation
batch_size = 16
max_length = 128

# BertyDataset looks rows up by index label, so both splits get a fresh 0..n-1 index.
# The labels stay a one-column frame because the dataset reads the 'label' column of it.
x_mnli_dataset_train = BertyDataset(x_train.reset_index(drop=True),
                                    labels=y_train.reset_index(drop=True).to_frame(),
                                    max_length=max_length,
                                    bert_model=roberta,
                                    tokenizer=tokenizer
                                   )
x_mnli_dataset_val = BertyDataset(x_val.reset_index(drop=True),
                                  labels=y_val.reset_index(drop=True).to_frame(),
                                  max_length=max_length,
                                  bert_model=roberta,
                                  tokenizer=tokenizer
                                 )

# reshuffle the training samples every epoch so the batches are not identical across
# epochs; the validation order does not matter and is left fixed
x_mnli_dataloader_train = DataLoader(x_mnli_dataset_train, batch_size=batch_size, shuffle=True)
x_mnli_dataloader_val = DataLoader(x_mnli_dataset_val, batch_size=batch_size)

In [None]:
# implement model class in pytorch
class Berty(torch.nn.Module):
    """Encoder followed by a dropout + linear head producing 3-way logits.

    Args:
        bert_model: encoder module. It is called with ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` keyword arguments and its
            first output must be the hidden states, indexed as
            ``[batch, position, hidden]``.
        freeze_bert: when True, gradients are disabled for all encoder parameters.
    """

    def __init__(self, bert_model, freeze_bert=False):
        super().__init__()

        # instance of BERT model
        self.bert = bert_model

        # freezing BERT's parameters
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Encoder hidden size: read it from the encoder's config when it exposes one,
        # and fall back to the previously hard-coded 1024 otherwise.
        encoder_config = getattr(bert_model, 'config', None)
        self.hid_size_bert = getattr(encoder_config, 'hidden_size', 1024)
        # number of output labels
        self.output = 3

        # our classification layer
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(p=0.2),
            torch.nn.Linear(self.hid_size_bert, self.output)
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classification logits for a batch of encoded pairs."""
        # feed inputs into BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # take the hidden state at position 0 of every sequence as the pair representation
        last_hidden_state_cls = outputs[0][:, 0, :]

        # feed input into the classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [None]:
# example data - fetch one validation item and show the tensors we are going to feed into BERT
input_ids, attn_mask, segment_ids, label = x_mnli_dataset_val[21]
for caption, tensor in (
    ('Input IDs: ', input_ids),
    ('\nAttention mask: ', attn_mask),
    ('\nSegment IDs: ', segment_ids),
    ('\nLabel: ', label),
):
    print(caption, tensor)

In [None]:
# training function
def train(model, train_dataloader, val_dataloader, epochs=4, gradient_accumulations=16, debug=False):
    """Train the BERT classifier model with mixed precision and gradient accumulation.

    Relies on the module-level names ``device``, ``optim``, ``lr_scheduler``,
    ``loss_fn`` and ``evaluate``.

    Args:
        model: classifier called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; returns logits.
        train_dataloader: iterable of ``(input_ids, attention_mask, token_type_ids,
            labels)`` tensor batches; must support ``len()``.
        val_dataloader: batches passed to ``evaluate`` after every epoch.
        epochs: number of passes over ``train_dataloader``.
        gradient_accumulations: number of batches whose gradients are summed before
            one optimizer update.
        debug: when True, print running train metrics at every monitoring point.

    Returns:
        The trained model (the same object that was passed in).

    Side effects:
        Writes the whole model to "model.pt" whenever the running train accuracy of a
        monitoring window beats the best one seen so far.
    """

    print('Start training...\n')

    model.to(device)

    # init gradient scaler
    scaler = GradScaler()
    max_train_acc = 0

    for epoch_i in range(epochs):
        # evaluate() switches the model to eval mode at the end of every epoch,
        # so training mode (dropout on) has to be re-enabled here each time
        model.train()

        # start every epoch from clean gradients; this also discards a partial
        # accumulation left over when len(train_dataloader) is not a multiple of
        # gradient_accumulations
        optim.zero_grad()

        # initialize monitoring variables
        # batch_loss and batch_count are added because
        # we need to normalize the loss with respect to
        # the number of batches we've accumulated
        epoch_loss = 0
        epoch_acc = []
        batch_loss = 0
        batch_count = 0
        batch_acc = []

        for step, batch in enumerate(train_dataloader):
            # load batch on GPU/CPU
            batch_input_ids, batch_attn_mask, batch_segment_ids, batch_labels = tuple(item.to(device) for item in batch)

            with autocast():
                logits = model(input_ids=batch_input_ids, attention_mask=batch_attn_mask, token_type_ids=batch_segment_ids)
                loss = loss_fn(logits, batch_labels)

            # Scale the loss and accumulate its gradients. The gradients are NOT zeroed
            # here: zeroing before every backward pass would throw away all but the
            # last batch of each accumulation window.
            scaler.scale(loss / gradient_accumulations).backward()
            preds = torch.argmax(logits, dim=1).flatten()
            accuracy = (preds == batch_labels).cpu().numpy().mean() * 100

            # update monitoring variables
            epoch_loss += loss.item()
            batch_loss += loss.item()
            batch_acc.append(accuracy)
            epoch_acc.append(accuracy)
            batch_count += 1

            # update with the accumulated gradients once the window is complete
            if (step + 1) % gradient_accumulations == 0:
                # bring the gradients back to their true scale before clipping,
                # otherwise the 1.0 threshold is applied to scaled values
                scaler.unscale_(optim)
                # clip the norm of the gradients to 1.0 to prevent "exploding gradients"
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                scaler.step(optim)
                scaler.update()
                lr_scheduler.step()
                optim.zero_grad()

            # monitor every 100 steps
            if (step % 100 == 0 and step != 0) or (step == len(train_dataloader)-1):

                if debug:
                    print("Evaluation for epoch: {}/{}, batch: {}/{}".format(epoch_i+1, epochs, step, len(train_dataloader)))
                    print("Train acc: {:.2f} \nTrain loss: {:.2f}\n".format(np.mean(batch_acc), batch_loss/batch_count))

                if max_train_acc < np.mean(batch_acc):
                    max_train_acc = np.mean(batch_acc)
                    torch.save(model, "model.pt")

                batch_count = 0
                batch_loss = 0
                batch_acc = []

        # evaluate for the current epoch
        avg_train_loss = epoch_loss / len(train_dataloader)
        avg_train_acc = np.mean(epoch_acc)
        val_acc, val_loss = evaluate(model, val_dataloader)

        print('****EVALUATION****')
        print("Epoch: {}/{} \nTrain loss: {:.4f} \nTrain accuracy: {:.2f} \nVal loss: {:.4f} \nVal accuracy: {:.2f}\n"\
              .format(epoch_i+1, epochs, avg_train_loss, avg_train_acc, val_loss, val_acc))

    return model

        
# evaluation function
def evaluate(model, val_dataloader):
    """Score a model on a validation loader.

    Moves the model to the module-level ``device``, switches it to eval mode and
    runs every batch without gradient tracking. Uses the module-level ``loss_fn``.

    Args:
        model: classifier called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; returns logits.
        val_dataloader: iterable of ``(input_ids, attention_mask, token_type_ids,
            labels)`` tensor batches.

    Returns:
        ``(accuracy, loss)``: the mean over batches of the per-batch accuracy
        (in percent) and of the per-batch loss.
    """
    model.to(device).eval()

    batch_accuracies = []
    batch_losses = []

    with torch.no_grad():
        for batch in val_dataloader:
            ids, mask, segments, targets = (tensor.to(device) for tensor in batch)
            logits = model(input_ids=ids, attention_mask=mask, token_type_ids=segments)

            batch_losses.append(loss_fn(logits, targets).item())

            hits = torch.argmax(logits, dim=1).flatten() == targets
            batch_accuracies.append(hits.cpu().numpy().mean() * 100)

    # average accuracy and loss over all validation batches
    return np.mean(batch_accuracies), np.mean(batch_losses)

In [None]:
# wrap the pretrained encoder with the classification head
bert_classifier = Berty(roberta)

# loss used by both train() and evaluate()
loss_fn = torch.nn.CrossEntropyLoss()

# optimizer over every parameter of the classifier
optim = AdamW(
    bert_classifier.parameters(),
    lr=2e-5, # recommended values are 5e-5, 3e-5 or 2e-5
    eps=1e-8, # default epsilon value is 1e-8
    #weight_decay=1e-2
)

# training schedule
epochs = 3 # authors' recommendation is 2, 3 or 4 for fine-tuning
gradient_accumulations = 16

# one optimizer update happens per `gradient_accumulations` batches, so the scheduler
# has to cover (batches per epoch // gradient_accumulations) updates for every epoch;
# without gradient accumulation this would be len(dataloader) * epochs
updates_per_epoch = len(x_mnli_dataloader_train) // gradient_accumulations
total_steps = updates_per_epoch * epochs

# linear learning-rate decay without warmup
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optim,
    num_warmup_steps=0, # default value
    num_training_steps=total_steps
)

# clear cuda cache
torch.cuda.empty_cache()

# training the model
model = train(
    bert_classifier,
    x_mnli_dataloader_train,
    x_mnli_dataloader_val,
    epochs=epochs,
    gradient_accumulations=gradient_accumulations,
)
#model = torch.load(os.path.join('/kaggle/input/contradictory-my-dear-bertson-model', 'model (1).pt'), map_location=torch.device('cpu'))

In [None]:
# load test data for submission
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')
print('Test set: ', test.shape)
test.head()

In [None]:
# preprocess test data
# preprocess() returns the frame without the 'lang_abv', 'language' and 'id' columns
# together with the original 'id' column. `test` is rebound to the reduced frame, so
# this cell is not idempotent: running it a second time fails because those columns
# no longer exist in `test`.
test, test_ids = preprocess(test, ['lang_abv', 'language', 'id'], 'id')
print('Test IDs: ', test_ids.shape)
test_ids.head()

In [None]:
print('Test set: ', test.shape)
test.head()

In [None]:
# make a submission
from tqdm import tqdm

# make a prediction function
def predict(model, data, tokenizer):
    """Predict a class for every premise/hypothesis pair in ``data``.

    Relies on the module-level names ``device``, ``max_length`` and ``tqdm``.

    Args:
        model: classifier called as ``model(input_ids, attention_mask,
            token_type_ids)``; returns logits.
        data: DataFrame with 'premise' and 'hypothesis' columns. Rows are read by
            position, so any index is accepted.
        tokenizer: callable invoked as ``tokenizer(premise, hypothesis, ...)`` that
            returns a mapping with 'input_ids', 'attention_mask' and
            'token_type_ids' tensors.

    Returns:
        List with one predicted class index per row, in row order.
    """
    predicted = []

    # make sure the model lives on the same device the inputs are sent to
    # (e.g. after loading a checkpoint onto the CPU)
    model.to(device)
    model.eval()

    premises = data['premise'].tolist()
    hypotheses = data['hypothesis'].tolist()

    for i in tqdm(range(data.shape[0])):
        # tokenize input; str() mirrors BertyDataset so non-string cells do not break the tokenizer
        batch = tokenizer(
                str(premises[i]), str(hypotheses[i]),
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_token_type_ids=True,
                return_tensors='pt'
            )

        # loading batches on the same device with the model
        input_ids, attn_mask, segment_ids = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['token_type_ids'].to(device)

        # predict results
        with torch.no_grad():
            logits = model(input_ids, attn_mask, segment_ids)

        preds = torch.argmax(logits, dim=1).flatten()
        predicted.append(preds.item())

    return predicted

preds = predict(model, test, tokenizer=tokenizer)

In [None]:
# put the predictions next to the test ids and write the submission csv
prediction_column = pd.DataFrame({'prediction': preds})
submission = pd.concat([test_ids, prediction_column], axis=1)
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# 0 == entailment, 1 == neutral, 2 == contradiction
submission

In [None]:
sample_submission = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/sample_submission.csv')
sample_submission.head()

In [None]:
# just to take a look at one test pair and its predicted class
classes = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
sample_row = 5191
# Columns are addressed by name: integer keys on a label-indexed row (row[0], row[1])
# rely on a positional fallback that pandas has deprecated.
prem = 'Premise: ' + test.loc[sample_row, 'premise']
hyp = 'Hypothesis: ' + test.loc[sample_row, 'hypothesis']
print(prem)
print(hyp)
print('Prediction: ', classes[submission.loc[sample_row, 'prediction']])