## Imports

In [None]:
# !pip install transformers[sentencepiece]

In [None]:
import torch
# Hardware check: first line says whether CUDA is usable, second line is the
# number of GPUs PyTorch can see.
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")
# print(torch.cuda.current_device())
# print(torch.cuda.get_device_name(torch.cuda.current_device()))

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition tables in one pass.
# NOTE: despite the *_FILE names, each variable holds the parsed DataFrame,
# not a file path.
TRAINING_FILE, TEST_FILE, SUBMISSION_FILE = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/chaii-hindi-and-tamil-question-answering/train.csv",
        "/kaggle/input/chaii-hindi-and-tamil-question-answering/test.csv",
        "/kaggle/input/chaii-hindi-and-tamil-question-answering/sample_submission.csv",
    )
)
# change it to True if you want to use GPU
use_gpu = False

In [None]:
import transformers
from pprint import pprint
from torch.utils.data import Dataset, DataLoader
# import torch
from transformers import AutoTokenizer, AutoModel
from torch.optim import Adam,AdamW

from transformers import logging

logging.set_verbosity_error()

This notebook fine-tunes an XLM-RoBERTa model that has already been fine-tuned on the SQuAD 2.0 dataset, adapting it to our use case (Hindi and Tamil question answering). 

The pretrained model is available at: https://huggingface.co/deepset/xlm-roberta-large-squad2

## Data Preparation

In [None]:
# Tokenizer for the same "deepset/xlm-roberta-large-squad2" checkpoint that
# ChaiiModel loads, so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("deepset/xlm-roberta-large-squad2")

In [None]:
""" Code adopted from reference link """
class ChaiiDataset(Dataset):
    """Torch dataset of fixed-length token windows built from question/context rows.

    Each row of ``df`` is tokenized as (context, question). Only the context is
    truncated (``truncation="only_first"``), and long contexts overflow into
    several windows that overlap by ``doc_stride`` tokens, so one row can yield
    multiple dataset items. ``overflow_to_sample_mapping`` links every window
    back to its source row.

    Args:
        df: DataFrame with ``context`` and ``question`` columns. When it also
            has ``answer_text`` (and ``answer_start``) the items carry labels.
        max_len: Length every window is truncated/padded to.
        doc_stride: Number of overlapping tokens between consecutive windows.
        tokenizer: Optional already-loaded tokenizer. When omitted, the
            "deepset/xlm-roberta-large-squad2" tokenizer is loaded, as before.
    """

    def __init__(self, df, max_len=356, doc_stride=128, tokenizer=None):
        self.df = df
        self.max_len = max_len
        self.doc_stride = doc_stride
        # Labels are only produced when the frame carries the answer column.
        self.labelled = 'answer_text' in df
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                "deepset/xlm-roberta-large-squad2", add_special_tokens=True
            )
        self.tokenizer = tokenizer
        self.tokenized_samples = self.tokenizer(
            self.df['context'].values.tolist(),
            self.df['question'].values.tolist(),
            truncation="only_first",
            max_length=self.max_len,
            stride=self.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

    def __getitem__(self, idx):
        """Return one window as a dict.

        Keys: ``index``, ``ids``, ``mask``, ``offset``; plus ``answer_text``,
        ``start`` and ``end`` when the dataset is labelled.
        """
        data = {}
        data['index'] = idx
        data['ids'] = torch.tensor(self.tokenized_samples['input_ids'][idx])
        data['mask'] = torch.tensor(self.tokenized_samples['attention_mask'][idx])
        # Character offsets are kept as plain Python data, not a tensor.
        data['offset'] = self.tokenized_samples['offset_mapping'][idx]
        if self.labelled:
            answer_text, start, end = self.get_targets(idx)
            data['answer_text'] = answer_text
            data['start'] = torch.tensor(start)
            data['end'] = torch.tensor(end)

        return data

    def get_targets(self, idx):
        """Locate the answer span inside window ``idx``.

        Returns:
            ``(answer_text, start_idx, end_idx)`` where the indices are
            inclusive token positions within the window. When the answer is
            not completely contained in the window's context tokens the result
            is ``("", 0, 0)`` (position 0 is presumably the leading special
            token, used as the "no answer" label).
        """
        df_index = self.tokenized_samples['overflow_to_sample_mapping'][idx]
        row = self.df.iloc[df_index]
        start_char = row['answer_start']
        end_char = start_char + len(row['answer_text'])
        offset = self.tokenized_samples['offset_mapping'][idx]
        sequence_ids = self.tokenized_samples.sequence_ids(idx)

        # Context tokens are the ones belonging to the first sequence (id 0);
        # special, question and padding tokens carry other ids.
        context_tokens = [pos for pos, seq_id in enumerate(sequence_ids) if seq_id == 0]
        if not context_tokens:
            return "", 0, 0
        first_ctx, last_ctx = context_tokens[0], context_tokens[-1]

        # The answer must lie entirely within the characters this window
        # covers; anything that starts before it or runs past it is unlabelled.
        if start_char < offset[first_ctx][0] or end_char > offset[last_ctx][1]:
            return "", 0, 0

        # First context token that extends past the answer's first character.
        # Using <= skips a token that ends exactly where the answer begins.
        start_idx = first_ctx
        while start_idx < last_ctx and offset[start_idx][1] <= start_char:
            start_idx += 1

        # First token, at or after the start, that reaches the answer's end.
        end_idx = start_idx
        while end_idx < last_ctx and offset[end_idx][1] < end_char:
            end_idx += 1

        return row['answer_text'].strip(), start_idx, end_idx

    def __len__(self):
        """Number of token windows (not the number of DataFrame rows)."""
        return len(self.tokenized_samples['overflow_to_sample_mapping'])


## Model

In [None]:
class ChaiiModel(torch.nn.Module):
    """Extractive QA model: a pretrained encoder plus a linear start/end head.

    Args:
        model_name: Hugging Face checkpoint to load for both the config and the
            encoder weights. Defaults to "deepset/xlm-roberta-large-squad2".
    """

    def __init__(self, model_name="deepset/xlm-roberta-large-squad2"):
        super().__init__()
        config = transformers.AutoConfig.from_pretrained(model_name)
        # NOTE(review): "add_pooling_layer" is usually a model constructor
        # argument rather than a config field - confirm it has the intended
        # effect when set through the config.
        config.update(
            {
                "output_hidden_states": True,
                "add_pooling_layer": False,
            }
        )
        self.xlmrob = AutoModel.from_pretrained(model_name, config=config)
        # Size the head from the loaded config instead of hard-coding 1024, so
        # a checkpoint with a different hidden width still works.
        self.l0 = torch.nn.Linear(config.hidden_size, 2)

    def forward(self, ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one score per input token.

        Both outputs have the shape of the encoder's first output with the
        trailing feature dimension removed.
        """
        xlmrob_out = self.xlmrob(ids, attention_mask)
        # The first element of the encoder output holds the per-token states.
        sequence_output = xlmrob_out[0]
        logits = self.l0(sequence_output)
        # The head emits two values per token: channel 0 = start, 1 = end.
        start_logits, end_logits = logits.unbind(dim=-1)
        return start_logits.contiguous(), end_logits.contiguous()

## Loss and Eval Function

In [None]:
""" Adopted some code from  reference link """
def loss_fn(o1, o2, t1, t2):
    """Combined start + end span loss.

    Args:
        o1: Start logits, one score per token, shape ``(batch, seq_len)``.
        o2: End logits, same shape as ``o1``.
        t1: Gold start positions. Normally integer token indices of shape
            ``(batch,)``, which is what the dataset in this notebook emits.
        t2: Gold end positions, same convention as ``t1``.

    Returns:
        Scalar tensor: start loss plus end loss.

    Index targets are scored with cross-entropy over the token axis. The
    previous binary cross-entropy call requires a float target with the same
    shape as the logits and therefore fails on index targets. That dense
    form is still honoured when such a target is actually supplied.
    """

    def _span_loss(logits, target):
        # Dense float target shaped like the logits: keep the original BCE.
        if target.is_floating_point() and target.shape == logits.shape:
            return torch.nn.BCEWithLogitsLoss()(logits, target)
        # Class-index target: pick the correct token among seq_len classes.
        return torch.nn.CrossEntropyLoss()(logits, target.long())

    return _span_loss(o1, t1) + _span_loss(o2, t2)

def safe_div(x,y):
    """Return ``x / y``, or 1 when the divisor equals zero."""
    return x / y if y != 0 else 1

def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared distinct words over the number of distinct words in
    either string. The zero-denominator case is delegated to ``safe_div``.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    shared = words_a & words_b
    union_size = len(words_a) + len(words_b) - len(shared)
    return safe_div(float(len(shared)), union_size)

def get_jaccard_score(y_true,y_pred):
    """Sum of the pairwise Jaccard scores of two equally long sequences.

    The result is the total, not the mean; divide by the number of pairs to
    obtain an average. Raises AssertionError when the lengths differ.
    """
    assert len(y_true)==len(y_pred)
    return sum(
        (jaccard(y_true[pos], y_pred[pos]) for pos in range(len(y_true))),
        0.0,
    )

## Training

In [None]:
class AverageMeter:
    """Keeps the most recent value and a running weighted mean of all values."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the total, the sample count and the mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
from tqdm import tqdm

def train_fn(data_loader, model, optimizer):
    """Run one training pass over ``data_loader``.

    Each batch is a dict providing "ids", "mask", "start" and "end". The
    model and batch tensors are moved to CUDA when the notebook-level
    ``use_gpu`` flag is True. The running mean loss is shown on the progress
    bar; nothing is returned.
    """
    on_gpu = use_gpu == True
    if on_gpu:
        model = model.to(device="cuda")
    model.train()
    losses = AverageMeter()
    progress = tqdm(data_loader, total=len(data_loader))
    for batch in progress:
        # Pull the four tensors out of the batch, transferring them if needed.
        ids, attention_mask, targets_start, targets_end = (
            batch[key].to(device="cuda") if on_gpu else batch[key]
            for key in ("ids", "mask", "start", "end")
        )

        optimizer.zero_grad()
        start_logits, end_logits = model(ids=ids, attention_mask=attention_mask)

        loss = loss_fn(start_logits, end_logits, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        # Weight the running mean by the number of examples in this batch.
        losses.update(loss.item(), ids.size(0))
        progress.set_postfix(loss=losses.avg)
        

In [None]:
from sklearn import model_selection
EPOCH = 1

def run():
    """Split the training data 90/10, build the model and train for EPOCH epochs.

    Reads the notebook-level ``TRAINING_FILE`` and ``EPOCH``. Returns nothing.
    """
    batch_size = 16

    # TRAINING_FILE is assigned from pd.read_csv(...) earlier in the notebook,
    # so it is already a DataFrame; handing a DataFrame to pd.read_csv raises.
    # A path is still accepted in case the variable is redefined as one.
    if isinstance(TRAINING_FILE, pd.DataFrame):
        dfx = TRAINING_FILE
    else:
        dfx = pd.read_csv(TRAINING_FILE)

    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size = 0.1,
        random_state = 42,
    )

    # The dataset looks rows up positionally, so give both splits a clean index.
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # pass it to dataset class and data loader - train df
    train_dataset = ChaiiDataset(df_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # pass it to dataset class and data loader - valid df
    # (built but not consumed yet; kept unshuffled so any later evaluation
    # sees a stable order)
    valid_dataset = ChaiiDataset(df_valid)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # initialize model
    model = ChaiiModel()
    # initialize optimizer: no weight decay for biases and LayerNorm weights
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=4e-5)
    # loop over epochs - train_fn
    for epoch in range(EPOCH):
        train_fn(train_dataloader, model, optimizer)
    

In [None]:
# Invoke this function to start training
# run()

**References:** https://www.kaggle.com/shahules/chaii-xlm-custom-qa-baseline-train-infer