In [None]:
# # for TPU
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
'''
Importing necessary python libraries
'''
import random
import math
import csv
import array
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from transformers import BertTokenizer, XLMRobertaTokenizer
from transformers import XLMRobertaConfig
from transformers import BertForSequenceClassification, XLMRobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# # for TPU
# import torch_xla
# import torch_xla.core.xla_model as xm

In [None]:
# Fetching training and validation data and converting it to the dataframe
# df = pd.read_csv("/kaggle/input/indore-datathon-2021/train.tsv", names=['Relation','Sentence','NER1','NER2'], sep="\t")
# df.drop(index=df.index[0], axis=0, inplace=True)
# df = df.drop(columns=['NER1', 'NER2'])

# dfv = pd.read_csv("/kaggle/input/indore-datathon-2021/valid.tsv", names=['Id','Sentence', 'NER1', 'NER2'], sep="\t")
# dfv = dfv.drop(columns=['NER1', 'NER2'])


def _load_annotated_dump(path):
    """Load one per-language annotated dump and keep only the tagged sentences.

    The raw 'Sentence', 'e1', 'e2' and 'Unnamed: 0' columns are dropped and
    'Sentence_tagged' is renamed to 'Sentence'.

    Args:
        path: location of the comma-separated dump file.

    Returns:
        A new DataFrame without the first row of the file's data.
    """
    dump = pd.read_csv(path, sep=",")
    dump = dump.drop(columns=['Sentence', 'e1', 'e2', 'Unnamed: 0'])
    dump = dump.rename(columns={'Sentence_tagged': 'Sentence'})
    # NOTE(review): the header line is already consumed as column names here,
    # so this removes the first *data* row. Kept to preserve the original
    # behaviour -- confirm whether it is intended.
    return dump.drop(index=dump.index[0])


# The competition files are read with explicit `names=`, so their own header
# line arrives as the first data row and is removed right after loading.
dft = pd.read_csv("/kaggle/input/indore-datathon-2021/test.tsv", names=['Id','Sentence','NER1','NER2'], sep="\t")
dft = dft.drop(index=dft.index[0])
# dft = dft.drop(columns=['NER1', 'NER2'])

# Fetching extra labeled dump data for english (resource rich language)
df_english = pd.read_csv("/kaggle/input/indore-datathon-2021/en.tsv", names=['Relation','Sentence','NER1','NER2'], sep="\t")
# df_english = df_english.drop(columns=['NER1', 'NER2'])
df_english = df_english.drop(index=df_english.index[0])

# Fetching the annotated dump data for all the languages
df_un_english = _load_annotated_dump("/kaggle/input/entire-dataset/english_annotated_dump.csv")
df_un_hindi = _load_annotated_dump("/kaggle/input/entire-dataset/hindi_annotated_dump.csv")
df_un_bengali = _load_annotated_dump("/kaggle/input/entire-dataset/bengali_annotated_dump.csv")
df_un_telugu = _load_annotated_dump("/kaggle/input/entire-dataset/telugu_annotated_dump.csv")

# Fetching the previous validation data
dfv = pd.read_csv("/kaggle/input/entire-dataset/RoBERTa_Without_Token_Submission_Valid.csv", names=['Id', 'Relation','Sentence'], sep=",")
dfv = dfv.drop(columns=['Id'])
dfv = dfv.drop(index=dfv.index[0])

In [None]:
def tokenise_data(data):
    """Pre-processing step: wrap each sentence in XLM-RoBERTa's sentence markers.

    Args:
        data: iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, each surrounded by '<s>' and
        '</s>'. An entry equal to the literal 'Sentence' (a column header that
        slipped into the data) is passed through unchanged.
    """
    return [
        sentence if sentence == 'Sentence' else "".join(('<s>', sentence, '</s>'))
        for sentence in data
    ]

In [None]:
def indexing_data_frame(dataframe):
    """Pre-processing step: re-index a dataframe with sequential ids.

    A column 'id' holding 0..len(dataframe)-1 is written into the frame and
    then promoted to be its index. The frame is modified in place.

    Args:
        dataframe: the pandas DataFrame to re-index.

    Returns:
        The same DataFrame object, now indexed by 'id'.
    """
    n_rows = len(dataframe)
    dataframe['id'] = list(range(n_rows))
    dataframe.set_index('id', inplace=True)
    return dataframe

In [None]:
'''
Pre-processing step.
Every loaded dataframe is run through 'indexing_data_frame' so that it carries a fresh sequential 'id' index.
'''
# df = indexing_data_frame(df)
dfv, dft = (indexing_data_frame(frame) for frame in (dfv, dft))

df_english = indexing_data_frame(df_english)

(df_un_english,
 df_un_hindi,
 df_un_bengali,
 df_un_telugu) = map(
    indexing_data_frame,
    (df_un_english, df_un_hindi, df_un_bengali, df_un_telugu),
)
# df_prev = indexing_data_frame(df_prev)

In [None]:
'''
Build the training dataframe by stacking the annotated dumps of all four languages
(English, Hindi, Bengali, Telugu). The labeled 'en.tsv' data (df_english) is currently
left out; re-enable the commented entry below to include it.
NOTE: Training Approaches:
        1) Training the model with both 'train.tsv' and 'en.tsv'
        2) Training the model with 'train.tsv'. Loading this model to train again with 'en.tsv'
      Both of these approaches result in the same final model. Hence, combining them together was much more feasible.
'''
# DataFrame.append was removed in pandas 2.0, so the frames are stacked with pd.concat.
# ignore_index=True gives the combined frame unique row labels: every source frame is
# indexed 0..n-1, and keeping those labels would make the later label-based
# `df.loc[X_val, 'data_type'] = 'val'` tag every row sharing a sampled label.
df = pd.concat(
    [
        # df_english,
        df_un_english,
        df_un_hindi,
        df_un_bengali,
        df_un_telugu,
        # df_prev,
    ],
    ignore_index=True,
)

In [None]:
def create_class_weight(labels_dict,mu=0.1):
    """Compute an inverse-frequency weight for every class.

    Args:
        labels_dict: mapping of class label -> number of samples in that class.
        mu: scaling factor applied to the total sample count.

    Returns:
        dict mapping each label to mu * total / count, where total is the sum
        of all counts. (A log-damped variant, log(mu * total / count), was
        tried previously and is not used.)
    """
    total = np.sum(list(labels_dict.values()))
    return {
        label: (mu*total/float(count))
        for label, count in labels_dict.items()
    }

In [None]:
# relation_counts = relation_counts/max(relation_counts)
# relation_tensor = relation_counts.values.reshape(25,1)
# relation_counts = np.flip(relation_counts)
# relation_counts = np.copy(relation_counts)
# relation_tensor = torch.tensor(relation_counts)

In [None]:
'''
Pre-processing step.
The sentences of the training, validation and test frames are passed through
'tokenise_data', which adds the special tokens required for training.
'''
for frame in (df, dfv, dft):
    frame['Sentence'] = tokenise_data(frame['Sentence'])

In [None]:
# Inspect the previous-validation dataframe after the special tokens were added.
dfv

In [None]:
# Number of training samples per relation label (shows the class imbalance).
df.Relation.value_counts()

In [None]:
# Distinct relation labels present in the training data.
df.Relation.unique()

In [None]:
'''
Give every distinct relation label an integer id (its position in `labels`).
'''
labels = df.Relation.unique()
label_dict = dict(zip(labels, range(len(labels))))

In [None]:
# Sample count per relation, keyed in the same order as `labels` so that the
# weights derived from it line up with the integer ids in `label_dict`.
relation_counts = df['Relation'].value_counts()
label_weight_dict = {label: relation_counts[label] for label in labels}

In [None]:
# Inspect the per-label sample counts.
label_weight_dict

In [None]:
# Inverse-frequency weight per label (default mu=0.1); still a dict at this point.
weights = create_class_weight(label_weight_dict)

In [None]:
# Keep only the weight values, in the dict's insertion order.
# Note: `weights` changes type here, from dict to list.
weights = list(weights.values())

In [None]:
# Scale the weights so that the largest one becomes 1.0.
# The maximum is taken once, up front: recomputing max(weights) while the list
# is being overwritten divides later entries by a different value than earlier
# ones (e.g. [8, 4, 2] would collapse to [1, 1, 1]).
max_weight = max(weights)
weights = [weight / max_weight for weight in weights]

weights

In [None]:
# Per-class weights as a tensor; entry i belongs to the label with id i in `label_dict`.
relation_tensor = torch.tensor(weights)

In [None]:
# Inspect the class-weight tensor.
relation_tensor

In [None]:
'''
Replace the textual relation labels with their integer ids (from `label_dict`)
in both the training and the validation dataframe.
'''
for frame in (df, dfv):
    frame['Relation'] = frame['Relation'].map(label_dict)

In [None]:
'''
Randomly split the data into a training part and a validation part.
A gold-standard dataset is not available yet, so the training data supplies both
the training and the validation split, while the validation data acts as test data.
'''
row_ids = df.index.values
relation_ids = df.Relation.values
X_train, X_val, y_train, y_val = train_test_split(
    row_ids,
    relation_ids,
    test_size=0.01,
    random_state=42,
)

In [None]:
# Tag every row with the split it belongs to ('train' or 'val'); rows in
# neither split keep the placeholder 'not_set'.
# With cross-validation such tagging would not be needed.
df['data_type'] = 'not_set'

for split_name, split_ids in (('train', X_train), ('val', X_val)):
    df.loc[split_ids, 'data_type'] = split_name

df.groupby(['Relation', 'data_type']).count()

In [None]:
'''
Pre-trained model types depending on the vocabulary size. We do not use the entire list of pre-trained configuration. 
For more information, refer to the following link: 
https://huggingface.co/transformers/_modules/transformers/models/xlm_roberta/tokenization_xlm_roberta.html#XLMRobertaTokenizer

Choose from 2 different model-types: 'xlm-roberta-base' or 'xlm-roberta-large'
'''
# Name of the pre-trained checkpoint; the same name is used later to load the model.
model_type = 'xlm-roberta-base'
# Tokenizer matching the chosen checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_type)

In [None]:
'''
Refer https://github.com/huggingface/tokenizers/issues/247 for more information regarding what special tokens are meant to do and 
how fine-tuning the model can help create better embeddings with these newly added tokens.
'''
# special_tokens_dict = {'additional_special_tokens': ['<e1>', '</e1>', '<e2>', '</e2>']}
# tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# '''
# Another pre-processing step before we can train the model.
# Model training or fine-tuning requires the encoding of the data to the respective tensor format.
# This function takes in the data as an arguments, encodes the data and returns the data in the tensor format.
# '''
# def encode_data(data):
#     encoded_data = tokenizer.batch_encode_plus(
#                         data,
#                         add_special_tokens=False,
        
#                         max_length=512,
#                         padding='max_length',
#                         truncation=True,

#                         return_attention_mask=True,
#                         return_tensors='pt'
#                     )
#     return encoded_data

In [None]:
# '''
# Another pre-processing step before training the model.
# Model training or fine-tuning requires the encoding of the data to the respective tensor format.
# Hence, the data is passed on to 'encode_data' function, which returns the data in the respective tensor format.
# '''
# encoded_data_training = encode_data(df.Sentence.values)
# encoded_data_val = encode_data(dfv.Sentence.values)
# encoded_data_test = encode_data(dft.Sentence.values)

In [None]:
def _encode_sentences(sentences):
    """Encode sentences into fixed-length PyTorch tensors with `tokenizer`.

    Every sentence is padded or truncated to 512 tokens and an attention mask
    is returned. The tokenizer is told not to add special tokens.
    """
    return tokenizer.batch_encode_plus(
        sentences,
        add_special_tokens=False,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )


# Training and validation sentences both come from `df`, selected by 'data_type'.
encoded_data_training = _encode_sentences(df[df['data_type'] == 'train'].Sentence.values)

encoded_data_val = _encode_sentences(df[df['data_type'] == 'val'].Sentence.values)

encoded_data_test = _encode_sentences(dft.Sentence.values)

In [None]:
# Pull the token ids and attention masks out of each encoding; the labels of
# the train/val splits are the 'Relation' ids of the matching rows of `df`.
train_rows = df[df['data_type'] == 'train']
val_rows = df[df['data_type'] == 'val']

input_ids_train, attention_masks_train = (
    encoded_data_training['input_ids'],
    encoded_data_training['attention_mask'],
)
labels_train = torch.tensor(train_rows.Relation.values)

input_ids_val, attention_masks_val = (
    encoded_data_val['input_ids'],
    encoded_data_val['attention_mask'],
)
labels_val = torch.tensor(val_rows.Relation.values)

# The test data has no labels.
input_ids_test, attention_masks_test = (
    encoded_data_test['input_ids'],
    encoded_data_test['attention_mask'],
)

In [None]:
def fetch_data_layers(encoded_data, dataframe, type):
    """Extract the tensors needed for training or prediction.

    Args:
        encoded_data: mapping with 'input_ids' and 'attention_mask' entries.
        dataframe: frame whose 'Relation' column supplies the labels; only
            read when `type` is 'train' or 'val'.
        type: kind of data, e.g. 'train', 'val' or 'test'.

    Returns:
        (input_ids, attention_masks, labels) for 'train' / 'val',
        (input_ids, attention_masks) for anything else.
    """
    layers = (encoded_data['input_ids'], encoded_data['attention_mask'])
    if type in ('train', 'val'):
        return (*layers, torch.tensor(dataframe.Relation.values))
    return layers

In [None]:
# '''
# The 'encoded_data', 'dataframe' and 'type' of data are passed to function 'fetch_data_layers' which returns 
# 'input_ids', 'attention_mask' and 'labels' where applicable.
# '''
# input_ids_train, attention_masks_train, labels_train = fetch_data_layers(encoded_data_training, df,  'train')
# input_ids_val, attention_masks_val, labels_val = fetch_data_layers(encoded_data_val, dfv,  'val')
# input_ids_test, attention_masks_test = fetch_data_layers(encoded_data_test, dft,  'test')

In [None]:
'''
Basic configuration for training the model
'''
# Samples per batch for all three dataloaders.
batch_size = 8
# Number of passes over the training data; also sizes the LR schedule.
epochs = 5

In [None]:
# Number of samples in the validation split.
len(labels_val)

In [None]:
'''
Creating the dataset from the fetched layers
'''
# Train/val items are (input_ids, attention_mask, label) triples.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
# The test set has no labels, so its items are (input_ids, attention_mask) pairs.
dataset_test = TensorDataset(input_ids_test, attention_masks_test)

In [None]:
'''
Creating the dataloaders from the datasets built above.
Only the training loader samples randomly; validation and test data are read in order.
'''
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
# Shuffling brings nothing for evaluation and makes runs harder to compare,
# so validation is iterated sequentially.
dataloader_val = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)
# The test order must be preserved: predictions are later written back to `dft` by position.
dataloader_test = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=batch_size)

In [None]:
'''
Loading the pre-trained 'XLM-RoBERTa' model, configured with the number of relation labels,
and restoring the weights of a previously saved checkpoint.
'''
# config = XLMRobertaConfig.from_pretrained(model_type)
model = XLMRobertaForSequenceClassification.from_pretrained(model_type, num_labels=len(label_dict))
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# map_location='cpu' lets a checkpoint saved from a GPU run be restored on any host;
# the model is moved to the selected device in a later cell.
checkpoint_state = torch.load('/kaggle/input/model-checkpoint/Roberta_Model_Checkpoint.pth', map_location='cpu')
model.load_state_dict(checkpoint_state)

In [None]:
from typing import Optional, Sequence

import torch.nn.functional as F
from torch import Tensor

# NOTE(review): not referenced by any other code visible in this notebook -- possibly a leftover.
softmax_layer = nn.LogSoftmax(dim=1)

# class FocalLoss(nn.modules.loss._WeightedLoss):
#     def __init__(self, weight=None, gamma=2, device='cpu'):
#         super(FocalLoss, self).__init__(weight)
#         # focusing hyper-parameter gamma
#         self.gamma = gamma

#         # class weights will act as the alpha parameter
#         self.weight = weight
        
#         # using deivce (cpu or gpu)
#         self.device = device
        
#         self.ce_loss = nn.CrossEntropyLoss()

#     def forward(self, _input, _target):
#         focal_loss = 0

#         for i in range(len(_input)):
#             # -log(pt)
#             cur_ce_loss = self.ce_loss(_input[i].view(-1, _input[i].size()[-1]), _target[i].view(-1))
#             # pt
#             pt = torch.exp(-cur_ce_loss)

#             if self.weight is not None:
#                 # alpha * (1-pt)^gamma * -log(pt)
#                 cur_focal_loss = self.weight[_target[i]] * ((1 - pt) ** self.gamma) * cur_ce_loss
#             else:
#                 # (1-pt)^gamma * -log(pt)
#                 cur_focal_loss = ((1 - pt) ** self.gamma) * cur_ce_loss
                
#             focal_loss = focal_loss + cur_focal_loss

#         if self.weight is not None:
#             focal_loss = focal_loss / self.weight.sum()
#             return focal_loss.to(self.device)
        
#         focal_loss = focal_loss / torch.tensor(len(probs))    
#         return focal_loss.to(self.device)

class FocalLoss(nn.Module):
    """ Focal Loss, as described in https://arxiv.org/abs/1708.02002.
    It is essentially an enhancement to cross entropy loss and is
    useful for classification tasks when there is a large class imbalance.
    x is expected to contain raw, unnormalized scores for each class.
    y is expected to contain class labels.
    Shape:
        - x: (batch_size, C) or (batch_size, C, d1, d2, ..., dK), K > 0.
        - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0.
    """

    def __init__(self,
                 alpha: Optional[Tensor] = None,
                 gamma: float = 0.,
                 reduction: str = 'mean',
                 ignore_index: int = -100):
        """Constructor.
        Args:
            alpha (Tensor, optional): Weights for each class. Defaults to None.
            gamma (float, optional): A constant, as described in the paper.
                Defaults to 0.
            reduction (str, optional): 'mean', 'sum' or 'none'.
                Defaults to 'mean'.
            ignore_index (int, optional): class label to ignore.
                Defaults to -100.
        Raises:
            ValueError: if reduction is not one of 'mean', 'sum', 'none'.
        """
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(
                'Reduction must be one of: "mean", "sum", "none".')

        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
        self.reduction = reduction

        # reduction is applied manually in forward(), after the focal term
        self.nll_loss = nn.NLLLoss(
            weight=alpha, reduction='none', ignore_index=ignore_index)

    def __repr__(self):
        arg_keys = ['alpha', 'gamma', 'ignore_index', 'reduction']
        arg_vals = [self.__dict__[k] for k in arg_keys]
        arg_strs = [f'{k}={v}' for k, v in zip(arg_keys, arg_vals)]
        arg_str = ', '.join(arg_strs)
        return f'{type(self).__name__}({arg_str})'

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        """Compute the focal loss of scores x against labels y.

        Returns:
            A scalar tensor for 'mean' / 'sum' reduction, the per-element
            losses for 'none'. When every label equals ignore_index a zero
            scalar tensor is returned.
        """
        if x.ndim > 2:
            # (N, C, d1, d2, ..., dK) --> (N * d1 * ... * dK, C)
            c = x.shape[1]
            x = x.permute(0, *range(2, x.ndim), 1).reshape(-1, c)
            # (N, d1, d2, ..., dK) --> (N * d1 * ... * dK,)
            y = y.view(-1)

        unignored_mask = y != self.ignore_index
        y = y[unignored_mask]
        if len(y) == 0:
            # Return a zero *tensor* that stays attached to x's graph (not a
            # Python float), so callers can still use .item() and .backward().
            return x.sum() * 0.
        x = x[unignored_mask]

        # compute weighted cross entropy term: -alpha * log(pt)
        # (alpha is already part of self.nll_loss)
        log_p = F.log_softmax(x, dim=-1)
        ce = self.nll_loss(log_p, y)

        # get true class column from each row
        # (index tensor is created on x's device to avoid mixing devices)
        all_rows = torch.arange(len(x), device=x.device)
        log_pt = log_p[all_rows, y]

        # compute focal term: (1 - pt)^gamma
        pt = log_pt.exp()
        focal_term = (1 - pt)**self.gamma

        # the full loss: -alpha * ((1 - pt)^gamma) * log(pt)
        loss = focal_term * ce

        if self.reduction == 'mean':
            loss = loss.mean()
        elif self.reduction == 'sum':
            loss = loss.sum()

        return loss


def focal_loss(alpha: Optional[Sequence] = None,
               gamma: float = 0.,
               reduction: str = 'mean',
               ignore_index: int = -100,
               device='cpu',
               dtype=torch.float32) -> FocalLoss:
    """Build a FocalLoss, converting the class weights to a tensor first.

    Args:
        alpha (Sequence, optional): Per-class weights. Anything that is not
            already a Tensor is turned into one; the result is then moved to
            `device` and cast to `dtype`. Defaults to None (no weighting).
        gamma (float, optional): Focusing constant from the paper.
            Defaults to 0.
        reduction (str, optional): 'mean', 'sum' or 'none'.
            Defaults to 'mean'.
        ignore_index (int, optional): Class label to ignore.
            Defaults to -100.
        device (str, optional): Target device for alpha; has no effect when
            alpha is None. Defaults to 'cpu'.
        dtype (torch.dtype, optional): Target dtype for alpha.
            Defaults to torch.float32.
    Returns:
        A FocalLoss object
    """
    if alpha is not None:
        alpha_tensor = alpha if isinstance(alpha, Tensor) else torch.tensor(alpha)
        alpha = alpha_tensor.to(device=device, dtype=dtype)

    return FocalLoss(alpha=alpha,
                     gamma=gamma,
                     reduction=reduction,
                     ignore_index=ignore_index)
    
# Loss used for training and validation: focal loss with gamma=2 and no per-class alpha.
# Since alpha is None, the `device` argument has no effect here (it only moves alpha).
entropy_loss = focal_loss(alpha=None,gamma=2,device='cuda')

In [None]:
'''
Basic configuration for training the model
'''
# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# Linear schedule without warm-up, spanning every optimisation step (batches per epoch * epochs).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

In [None]:
def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max class predictions.

    Args:
        preds: per-class scores, one row per sample.
        labels: array of true class ids.

    Returns:
        The result of sklearn's f1_score with average='weighted'.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for every class present in `labels`, correct / total predictions.

    Args:
        preds: per-class scores, one row per sample.
        labels: array of true class ids (the values of `label_dict`).
    """
    id_to_label = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        n_total = int(np.count_nonzero(in_class))
        print(f'Class: {id_to_label[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) for the training run.
# Note: this runs after the train/val split (which has its own random_state)
# and after the model has been created.
seed_val = 123
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = xm.xla_device()
# Move the model's parameters to the chosen device.
model.to(device)

print(device)

In [None]:
def predict(dataloader_test, relation_tensor):
    """Run the model over unlabeled data and return class-weighted logits.

    Args:
        dataloader_test: loader yielding (input_ids, attention_mask) batches.
        relation_tensor: per-class weights that the logits are multiplied by.

    Returns:
        numpy array with one row of weighted logits (float64) per sample, in
        the order the loader produced them.
    """
    model.eval()

    # Loop-invariant: cast and move the class weights once. `.double()` keeps
    # the tensor on its current device instead of detouring through the CPU.
    class_weights = relation_tensor.double().to(device)

    predictions = []

    for batch in tqdm(dataloader_test):

        batch = tuple(b.to(device) for b in batch)

        with torch.no_grad():
            outputs = model(input_ids=batch[0], attention_mask=batch[1])

        weighted_logits = outputs['logits'].double() * class_weights
        predictions.append(weighted_logits.detach().cpu().numpy())

    return np.concatenate(predictions, axis=0)

In [None]:
def evaluate(dataloader_val, relation_tensor):
    """Evaluate the model on labeled data.

    Args:
        dataloader_val: loader yielding (input_ids, attention_mask, labels) batches.
        relation_tensor: per-class weights that the logits are multiplied by
            before the loss is computed.

    Returns:
        (loss_val_avg, predictions, true_vals): the `entropy_loss` averaged
        over the batches, the weighted logits of every sample as a numpy
        array, and the matching true labels as a numpy array.
    """
    model.eval()

    # Loop-invariant: cast and move the class weights once. `.double()` keeps
    # the tensor on its current device instead of detouring through the CPU.
    class_weights = relation_tensor.double().to(device)

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)
        batch_labels = batch[2]

        # The labels are not handed to the model: its built-in loss was never
        # used, the loss of interest is `entropy_loss` on the weighted logits.
        with torch.no_grad():
            outputs = model(input_ids=batch[0], attention_mask=batch[1])

        new_logits = outputs['logits'].double() * class_weights
        ll = entropy_loss(new_logits, batch_labels)

        loss_val_total += ll.item()

        predictions.append(new_logits.detach().cpu().numpy())
        true_vals.append(batch_labels.cpu().numpy())

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# Inspect the class-weight tensor once more before training starts.
relation_tensor

In [None]:
# Class weights are loop-invariant: cast and move them once. `.double()` keeps
# the tensor on its current device instead of detouring through the CPU.
relation_tensor = relation_tensor.double().to(device)

# Must live outside the epoch loop and be updated on improvement; otherwise
# every epoch compares against +inf and overwrites the checkpoint.
best_validation_loss = float('inf')

for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        batch_labels = batch[2]

        # The labels are not handed to the model: its built-in loss was never
        # back-propagated, training uses `entropy_loss` on the weighted logits.
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        new_logits = outputs['logits'].double() * relation_tensor

        ll = entropy_loss(new_logits, batch_labels)
        batch_loss = ll.item()

        loss_train_total += batch_loss
        ll.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # `entropy_loss` already averages over the batch, so the value is
        # shown as is (len(batch) is the number of tensors in the tuple, not
        # the batch size).
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val, relation_tensor)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

    # Keep only the best model seen so far.
    if val_loss <= best_validation_loss:
        best_validation_loss = val_loss
        torch.save(model.state_dict(), 'Roberta_Model_Checkpoint.pth')

In [None]:
# Per-class hit counts on the validation split, using the results of the last epoch.
accuracy_per_class(predictions, true_vals)

In [None]:
# Weighted logits for the test data. Note: this overwrites the validation `predictions`.
predictions = predict(dataloader_test, relation_tensor)

In [None]:
# Predicted class id per test sample: index of the largest weighted logit.
preds_flat = np.argmax(predictions, axis=1).flatten()
# len(preds_flat)

In [None]:
# Reverse lookup: integer class id -> relation label.
label_dict_rev = {class_id: label for label, class_id in label_dict.items()}

In [None]:
# Attach the predictions to the test dataframe (row order matches the test loader) ...
dft['predicted_relations_id'] = preds_flat
# ... translate the ids back to relation labels ...
dft['predicted_relations'] = dft['predicted_relations_id'].map(label_dict_rev)
# ... and write the result to disk.
dft.to_csv('Roberta_Without_Tokens_2.csv')

In [None]:
# Inspect the predicted relation labels.
dft['predicted_relations']