In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
import time
from babel.dates import format_date, format_datetime, format_time

import tensorflow as tf

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

from transformers import XLMRobertaTokenizer, XLMRobertaModel, AdamW, XLMRobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from tensorflow.keras.layers import Dropout

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [None]:
# Check device
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
# `device` is read by later cells when moving the model and batches.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    # Use one formatting style only: the earlier message mixed '%d' with
    # str.format(), which printed a literal "%d" followed by the count.
    print('There are {} GPU(s) available.'.format(torch.cuda.device_count()))
    print('We will use the GPU: {}'.format(torch.cuda.get_device_name(0)))

# If we dont have GPU but a CPU, training will take place on CPU instead
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Drop cached GPU memory that may be left over from an earlier run in this kernel.
torch.cuda.empty_cache()

# Set the seed value all over the place to make this reproducible.
seed_val = 42

# Seed every random number generator the notebook relies on:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
data_train_path = '../input/contradictory-my-dear-watson/train.csv'
data_test_path = '../input/contradictory-my-dear-watson/test.csv'

# The two text columns that are fed to the tokenizer as sentence pairs.
text_columns = ['premise', 'hypothesis']

# Load and shuffle train dataset
# `sample` and `astype` return new objects, so their results must be assigned;
# calling them without assignment (as before) left the frame untouched.
# The shuffle is seeded so the row order is the same on every run.
data_train = (
    pd.read_csv(data_train_path)
    .sample(frac=1, random_state=seed_val)
    .reset_index(drop=True)
)
data_train[text_columns] = data_train[text_columns].astype(str)

# Load test dataset (order is kept as-is so predictions line up with the ids)
data_test = pd.read_csv(data_test_path)
data_test[text_columns] = data_test[text_columns].astype(str)

In [None]:
# Preview the first rows of the training frame.
data_train.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
data_train.info()

In [None]:
# Number of training rows per language, most frequent language first
data_train.groupby('language')['id'].count().sort_values(ascending=False)

In [None]:
# Word count (whitespace-separated pieces) of every premise in the train set
seq_len_premise = list(map(lambda text: len(text.split()), data_train['premise']))

# Distribution of those word counts
pd.Series(seq_len_premise).hist(bins=25)

In [None]:
# Word count (whitespace-separated pieces) of every hypothesis in the train set
seq_len_hypothesis = list(map(lambda text: len(text.split()), data_train['hypothesis']))

# Distribution of those word counts
pd.Series(seq_len_hypothesis).hist(bins=25)

# Preprocess

In [None]:
# Hyper-parameters and run configuration read by the cells below.

# Seed for the train/validation split.
random_seed = 42

# Maximum length passed to the tokenizer as `max_length`.
SEQ_LEN = 90 # Lower than max_position_embeddings field in config file (GPU exhausted)
#model_name = 'roberta-base'
# Checkpoint name given to `from_pretrained` for both tokenizer and encoder.
model_name = 'xlm-roberta-base'
# Batch size of the train and validation data loaders.
batch_size = 16
epochs = 15 # number of training epochs
learning_rate = 1e-5 # Controls how large a step is taken when updating model weights during training.
# NOTE(review): nominal step count; check where it is consumed before relying on it.
steps_per_epoch = 50
# Worker processes used by the train/validation data loaders.
num_workers = 3

In [None]:
# Each sample is a [premise, hypothesis] pair; the target is the `label` column.
pair_texts = data_train[['premise', 'hypothesis']].values.tolist()
pair_labels = data_train['label']

# Hold out 20% of the pairs for validation, with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    pair_texts,
    pair_labels,
    test_size=0.20,
    random_state=random_seed,
)


In [None]:
# Load the RoBERTa tokenizer and tokenize the data
print('Loading BERT tokenizer...')
# NOTE(review): `do_lower_case` is forwarded unchanged; confirm that this
# tokenizer class actually honours it.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name, do_lower_case=True)

# Exploratory encoding of the training pairs (each item is [premise, hypothesis]).
tokens = tokenizer(x_train, truncation=True)

# `tokens` is already an encoding that holds the ids. Passing it to
# `convert_tokens_to_ids` iterated over its dictionary keys instead of over
# tokens, so the ids are read straight from the encoding.
token_ids = tokens['input_ids']

In [None]:
#print(f' Tokens: {tokens}')
#print(f' Tokens IDs: {token_ids}')

In [None]:
# Show which special tokens the loaded tokenizer uses.
tokenizer.special_tokens_map

In [None]:
def encode_pairs(pairs):
    """Encode a list of [premise, hypothesis] pairs with the shared tokenizer.

    All three data sets go through this one function so they are guaranteed to
    use identical settings (previously the same call was copy-pasted per set).

    Args:
        pairs: list of two-element [premise, hypothesis] lists of strings.

    Returns:
        The tokenizer's batch encoding, containing `input_ids` and
        `attention_mask`. Sequences are truncated to `SEQ_LEN` and padded to
        the longest sequence within `pairs`.
    """
    return tokenizer.batch_encode_plus(
        pairs,
        max_length=SEQ_LEN,
        add_special_tokens=True,  # add the model's own special tokens
        return_token_type_ids=False,
        truncation=True,
        padding='longest',
        return_attention_mask=True,
    )


trencoding = encode_pairs(x_train)

valencoding = encode_pairs(x_valid)

testencoding = encode_pairs(data_test[['premise', 'hypothesis']].values.tolist())

In [None]:
# List the fields present in the training encoding.
trencoding.keys()

# Find Class Weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
# Arguments are passed by keyword: recent scikit-learn releases reject the
# positional form, and the keyword form also works on older releases.
class_wts = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(data_train['label'].values.tolist()),
    y=data_train['label'],
)

#print(class_wts)

# convert class weights to tensor
weights= torch.tensor(class_wts,dtype=torch.float)
# The loss weights must live on the same device as the model outputs.
weights = weights.to(device)

# loss function
#cross_entropy  = nn.NLLLoss(weight=weights) 
# Weighted cross-entropy, applied to the raw model outputs in training and evaluation.
cross_entropy  = nn.CrossEntropyLoss(weight=weights) 

# Data loaders

In [None]:
def loadData(prep_df, batch_size, num_workers, sampler):
    """Build a `DataLoader` over `prep_df`.

    Args:
        prep_df: the dataset to iterate over (passed straight to `DataLoader`).
        batch_size: number of samples per batch.
        num_workers: number of loader worker processes.
        sampler: sampler that decides the order in which samples are drawn.

    Returns:
        A `DataLoader` configured with the arguments above and `pin_memory=True`.
    """
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        sampler=sampler,
        pin_memory=True,
    )
    return DataLoader(prep_df, **loader_options)

## convert lists to tensors
train_seq = torch.tensor(trencoding['input_ids'])
train_mask = torch.tensor(trencoding['attention_mask'])
train_y = torch.tensor(y_train.tolist())

val_seq = torch.tensor(valencoding['input_ids'])
val_mask = torch.tensor(valencoding['attention_mask'])
val_y = torch.tensor(y_valid.tolist())

# The test set has no labels, so only ids and masks are built.
test_seq = torch.tensor(testencoding['input_ids'])
test_mask = torch.tensor(testencoding['attention_mask'])

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)
# random order for training
train_sampler = RandomSampler(train_data)
# Train Data Loader
traindata = loadData(train_data, batch_size, num_workers, train_sampler)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)
# fixed order for validation
val_sampler = SequentialSampler(val_data)
# Val Data Loader
valdata = loadData(val_data, batch_size, num_workers, val_sampler)


# wrap tensors
test_data = TensorDataset(test_seq, test_mask)
# fixed order for inference, so predictions stay aligned with the test rows
test_sampler = SequentialSampler(test_data)
# Test Data Loader. The sampler is now actually passed (it used to be created
# and ignored). The batch size is left at the DataLoader default of 1.
# NOTE(review): confirm the inference loop handles multi-row batches before
# raising the test batch size.
testdata = DataLoader(test_data, sampler=test_sampler)


# Report sample counts: len() of a DataLoader is the number of batches, which
# is not what these messages say.
print('Number of data in the train set', len(train_data))
print('Number of data in the validation set', len(val_data))
print('Number of data in the test set', len(test_data))

In [None]:
#for ids, mask, l in traindata:
#    print(mask)

# Model definition

In [None]:
#class XML_RoBERTa_Arch(nn.Module):
#    
#    def __init__(self, n_classes):
#        
#        super(XML_RoBERTa_Arch, self).__init__()
#        
#        self.bert = RobertaModel.from_pretrained(model_name, return_dict=False)
#        self.d1 = nn.Dropout(0.2)
#        self.l1 = torch.nn.Linear(768, 64)
#        self.bn1 = torch.nn.LayerNorm(64)
#        self.d2 = torch.nn.Dropout(0.2)
#        self.l2 = torch.nn.Linear(64, n_classes)
#
#    def forward(self, input_ids, attention_mask):
#        _, x = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#        x = self.d1(x)
#        x = self.l1(x)
#        x = self.bn1(x)
#        x = torch.nn.Tanh()(x)
#        x = self.d2(x)
#        x = self.l2(x)
#        
#        return x

In [None]:
class XML_RoBERTa_Arch(nn.Module):
    """Pretrained XLM-RoBERTa encoder followed by a small classification head.

    Head: dropout -> linear (hidden -> hidden) -> batch norm -> dropout ->
    linear (hidden -> n_classes). The output is raw, un-normalised class scores.
    """

    def __init__(self, n_classes, freeze_bert=False):
        """Create the encoder and the classification head.

        Args:
            n_classes: number of output classes.
            freeze_bert: when True, the encoder weights are excluded from
                gradient updates and only the head is trained.
        """
        super(XML_RoBERTa_Arch, self).__init__()
        # Instantiating the encoder; return_dict=False makes it return a tuple.
        self.bert = XLMRobertaModel.from_pretrained(model_name, return_dict=False)

        # Freeze bert layers
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        # Every head layer is sized from the encoder config. The batch-norm
        # width used to be a hard-coded 768, which would break for a
        # checkpoint with a different hidden size.
        hidden_size = self.bert.config.hidden_size

        self.bert_drop_1 = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.bn = nn.BatchNorm1d(hidden_size)
        self.bert_drop_2 = nn.Dropout(0.25)
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Compute class scores for a batch.

        Args:
            input_ids: token ids produced by the tokenizer.
            attention_mask: mask produced by the tokenizer for the same batch.

        Returns:
            Output of the final linear layer, whose last dimension is
            `n_classes`.
        """
        # Only the second element of the encoder's output tuple is used.
        # presumably the pooled sentence representation, (batch, hidden_size) — TODO confirm
        _, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        output = self.bert_drop_1(output)
        output = self.fc(output)
        output = self.bn(output)
        output = self.bert_drop_2(output)
        output = self.out(output)
        return output


In [None]:
# One output unit per distinct label in the training data.
class_names = np.unique(data_train['label'])
print('Downloading the XML RoBERTa custom model...')
model = XML_RoBERTa_Arch(len(class_names))
model.to(device) # Model to GPU

#optimizer parameters
# Two parameter groups: parameters whose name contains one of the `no_decay`
# markers get no weight decay, everything else gets a small one.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [{'params': [p for n, p in param_optimizer 
                                    if not any(nd in n for nd in no_decay)],'weight_decay':0.001},
                        {'params': [p for n, p in param_optimizer 
                                    if any(nd in n for nd in no_decay)],'weight_decay':0.0}]

print('Preparing the optimizer...')
#optimizer 
optimizer = AdamW(optimizer_parameters, lr=learning_rate)

# The schedule must span the whole run: batches per epoch times epochs.
# It was previously set to a fixed 50 steps, which would bring the learning
# rate to zero after 50 optimizer updates once the scheduler is stepped.
# NOTE(review): the schedule only has an effect if scheduler.step() is called
# after each optimizer.step().
steps = len(traindata) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = steps
)

# Train the model

In [None]:
# function to train the bert model
def trainBERT():
    """Run one training epoch over `traindata`.

    Uses the notebook-level `model`, `cross_entropy` and `optimizer`.

    Returns:
        (avg_loss, total_preds): mean loss per batch, and the model outputs of
        every batch concatenated along the first axis as a NumPy array.
    """
    print('Training...')
    model.train()

    running_loss = 0
    batch_outputs = []

    for step, batch in enumerate(traindata):

        # progress message every 50 batches, skipping the very first one
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(traindata)))

        # move the batch to the GPU when one is in use
        if torch.cuda.is_available():
            batch = [r.to(device) for r in batch]

        sent_id, mask, labels = batch

        # reset gradients left over from the previous batch
        model.zero_grad()

        # forward pass and loss
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        running_loss += loss.item()

        # backward pass; gradient norm is clipped to 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # NOTE(review): `scheduler` is never stepped here. Before adding
        # scheduler.step(), confirm its num_training_steps covers the whole run.

        # keep this batch's outputs on the CPU as a NumPy array
        batch_outputs.append(preds.detach().cpu().numpy())

        torch.cuda.empty_cache()

    # mean loss over the batches of this epoch
    avg_loss = running_loss / len(traindata)

    # one array with a row per sample instead of a list of per-batch arrays
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Evaluate the model on `valdata` without updating any weights.

    Uses the notebook-level `model` and `cross_entropy`.

    Returns:
        (avg_loss, total_preds): mean loss per batch, and the model outputs of
        every batch concatenated along the first axis as a NumPy array.
    """
    print("\nEvaluating...")
    # evaluation mode (e.g. dropout is disabled)
    model.eval()
    total_loss = 0

    # empty list to save the model predictions
    total_preds = []

    # No gradients are needed for evaluation; disabling them saves memory.
    with torch.no_grad():
        for step, batch in enumerate(valdata):
            # Progress update every 50 batches.
            # The former elapsed-time computation was removed: it passed a
            # duration to babel's `format_time`, which formats clock times,
            # and the result was never used.
            if step % 50 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(valdata)))

            # Moving to `device` is a no-op when the tensors are already there.
            batch = [t.to(device) for t in batch]

            sent_id, mask, labels = batch

            # model predictions
            preds = model(sent_id, mask)
            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            total_preds.append(preds.detach().cpu().numpy())

            torch.cuda.empty_cache()

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(valdata)
    # one array with a row per sample instead of a list of per-batch arrays
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Best validation loss seen so far; starts at infinity so the first epoch always wins.
best_valid_loss = float('inf')

# Per-epoch loss history (one entry per epoch in each list).
train_losses, valid_losses = [], []

# One pass of training followed by one pass of validation per epoch.
for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # Only the losses are needed here; the returned predictions are ignored.
    train_loss, _ = trainBERT()
    valid_loss, _ = evaluate()

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'xmlrob_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [None]:
print('\nTest Set...')

# One array of model outputs per checkpoint; averaged in a later cell.
test_preds = []

print('Total batches:', len(testdata))

# Checkpoints to run. Only one weights file is produced by the training loop,
# so it is evaluated once; the former loop loaded this same file three times
# and averaged three identical outputs. Add more paths here to ensemble.
checkpoint_paths = ['xmlrob_weights.pt']

for fold_index, path_model in enumerate(checkpoint_paths):

    print('\nFold Model', fold_index)

    # Load the saved weights; map_location lets a checkpoint saved on one
    # device be loaded onto the device currently in use.
    model.load_state_dict(torch.load(path_model, map_location=device))

    # Send the model to the GPU
    model.to(device)

    # Put the model in evaluation mode.
    model.eval()

    fold_batches = []

    # Gradients are disabled only inside this block. The previous global
    # torch.set_grad_enabled(False) was never undone, so re-running a
    # training cell afterwards would have had no gradients to back-propagate.
    with torch.no_grad():
        for j, test_batch in enumerate(testdata):

            inference_status = 'Batch ' + str(j + 1)

            print(inference_status, end='\r')

            b_input_ids = test_batch[0].to(device)
            b_input_mask = test_batch[1].to(device)

            outputs = model(b_input_ids,
                            attention_mask=b_input_mask)

            # Keep every row of the batch. Indexing outputs[0] kept only the
            # first sample, which was correct only for a batch size of 1.
            fold_batches.append(outputs.detach().cpu().numpy())

    # Join the per-batch arrays once instead of re-stacking on every batch.
    stacked_val_preds = np.concatenate(fold_batches, axis=0)

    test_preds.append(stacked_val_preds)


print('\nPrediction complete.')

In [None]:
# Display the collected per-model test outputs.
test_preds

In [None]:
# Add up the output matrices of all models, starting from the first one.
preds = test_preds[0]
for item in test_preds[1:]:
    preds = item + preds

# Divide by the number of models to get the mean output per sample and class.
avg_preds = preds / len(test_preds)

# Predicted class = column index of the largest value in each row.
test_predictions = avg_preds.argmax(axis=1)

# Quick look at the result: its type, its length, and the values themselves.
print(type(test_predictions))
print(len(test_predictions))
print()
print(test_predictions)

In [None]:
# Number of test ids, for comparison with the number of predictions printed above.
len(data_test['id'])

# Submission

In [None]:
# Start from the sample submission file and fill its `prediction` column
# with the predicted class of each test row.
path_sample = '../input/contradictory-my-dear-watson/sample_submission.csv'
submission = pd.read_csv(path_sample).assign(prediction=test_predictions)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission to `submission.csv`, leaving out the DataFrame index.
submission.to_csv('submission.csv', index=False)