In [None]:
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq
!pip install accelerate

In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from tqdm.auto import tqdm
from functools import partial
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import Dataset
from accelerate import Accelerator
from transformers import (AutoTokenizer,AutoModelForQuestionAnswering,AutoModel,
                          AutoConfig,AdamW,get_linear_schedule_with_warmup,
                          get_cosine_schedule_with_warmup)

from colorama import Fore, Back, Style
# Short aliases for colorama ANSI colour codes; the training loop interpolates
# some of them (g_, m_, sr_) into its log messages.
r_ = Fore.RED
b_ = Fore.BLUE
c_ = Fore.CYAN
g_ = Fore.GREEN
y_ = Fore.YELLOW
m_ = Fore.MAGENTA
sr_ = Style.RESET_ALL  # appended after a coloured segment to reset the style

In [None]:
# Central hyperparameter / path configuration read by the cells below.
config = {'model_path':'../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2',  # checkpoint dir given to from_pretrained
          
          # Tokenization windowing (passed to the tokenizer in prepare_train_features).
          'max_length':384,          # tokenizer max_length per feature
          'doc_stride':128,          # tokenizer stride between overflowing windows
          'max_answer_length':30,    # NOTE(review): not referenced by any cell visible here
          
          # Optimizer settings (AdamW).
          'lr':1e-5,                 # learning rate
          'wd':1e-2,                 # weight decay
    
          # Training schedule / data loading.
          'epochs':1,                # epochs per fold
          'nfolds':5,                # number of StratifiedKFold splits and of output dirs
          'batch_size':4,            # DataLoader batch size (train and valid)
          'num_workers':4,           # DataLoader worker processes
          'seed':1000}               # seed for seed_everything and StratifiedKFold

# One output directory per fold (model0 ... model{nfolds-1}); checkpoints and the
# tokenizer are written there by run().
for i in range(config['nfolds']):
    os.makedirs(f'model{i}',exist_ok=True)
    
def seed_everything(seed=42):
    """Seed every RNG used by the notebook and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy's global RNG and
            PyTorch (CPU and all CUDA devices). It is also exported as the
            ``PYTHONHASHSEED`` environment variable.

    Side effects:
        Mutates global RNG state, ``os.environ`` and the cuDNN backend flags.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the original spelling 'PYTHONASSEED' set an
    # unrelated name). Python reads it at interpreter start-up, so this only
    # influences processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels per run, which defeats
    # the determinism requested on the line above; it must be disabled.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any data sampling or model initialisation below.
seed_everything(seed=config['seed'])

In [None]:
def clean_data(df,train=True):
    """Clean the ``context`` and ``question`` text and rebuild answer offsets.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with string columns ``context`` and ``question``; when
            ``train`` is true it must also have ``answer_text``.
        train: If true, recompute ``answer_start`` against the cleaned context
            and add an ``answers`` column of
            ``{'answer_start': [int], 'text': [str]}`` dicts.

    Returns:
        The same DataFrame with added columns ``orignal_context_len`` and
        ``new_context_len`` (space-separated token counts before/after cleaning)
        and, for training data, ``answer_start`` and ``answers``.
        ``answer_start`` is -1 when the answer text does not occur in the
        cleaned context (``answer_text`` itself is not cleaned).
    """
    def remove_stuff(x):
        """Strip newlines, pad parentheses with spaces, drop URLs and ``[d]`` markers."""
        x = x.replace('\n','')
        x = x.replace("("," (").replace(")",") ")
        x = re.sub(r'https?\S+', '', x)
        # Only single-digit bracketed markers such as "[3]" are matched here.
        x = re.sub(r'\[[0-9]]','',x)
        return x

    def count_words(x):
        """Number of pieces when splitting on a single space."""
        return len(x.split(" "))

    df['orignal_context_len'] = df['context'].apply(count_words)

    df['context'] = df['context'].apply(remove_stuff)
    df['question'] = df['question'].apply(remove_stuff)

    df['new_context_len'] = df['context'].apply(count_words)

    print("Average original context len: ",df.orignal_context_len.mean())
    print("Average New context len: ",df.new_context_len.mean())

    if train:
        df['answer_text'] = df.answer_text.astype(str)
        # Cleaning shifts character positions, so the answer offset is searched
        # again in the cleaned context. Columns are iterated directly instead of
        # indexing row Series with integers (x[0], x[1]): positional integer
        # lookup on a string-labelled Series is deprecated in recent pandas.
        starts = [ctx.find(ans) for ctx, ans in zip(df['context'], df['answer_text'])]
        df['answer_start'] = pd.Series(starts, index=df.index, dtype='int64')
        answers = [{'answer_start': [start], 'text': [text]}
                   for start, text in zip(df['answer_start'], df['answer_text'])]
        df['answers'] = pd.Series(answers, index=df.index, dtype=object)
        print("Answers not found in new context: ",(df.answer_start == -1).sum())

    return df

In [None]:
# Load the competition CSVs (train, test and the sample submission).
train_data = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')
test_data = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
sample = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv')

# Assign each row a validation fold id, stratified by language. This is done on
# the full training frame, i.e. before the subsampling step below.
train_data['Fold'] = -1
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=train_data['language'])):
    train_data.loc[valid_idx,'Fold'] = k
    
# Keep every Tamil row plus a random draw of 368 Hindi rows.
# NOTE(review): sample() gets no random_state, so the draw depends on NumPy's
# global RNG state at this point (seeded by seed_everything above); re-running
# only this cell draws a different subset. 368 is presumably chosen to balance
# the two languages - confirm.
train_data = pd.concat([train_data.query("language == 'tamil'"),
                        train_data.query("language == 'hindi'").sample(n=368)])

# Clean text and rebuild answer offsets (adds the 'answers' column used for training).
train_data = clean_data(train_data)

In [None]:
# Sanity check after subsampling: overall shape and number of rows per language.
train_data.shape,train_data.language.value_counts()

In [None]:
def prepare_train_features(examples, tokenizer, pad_on_right, max_length, doc_stride):
    """Tokenize a batch of QA examples into windows labelled with answer token spans.

    Args:
        examples: Batch mapping with ``question``, ``context`` and ``answers``
            columns; each answer is a dict with ``answer_start`` and ``text``
            lists. The ``question`` column is overwritten with left-stripped text.
        tokenizer: Callable tokenizer exposing ``cls_token_id``; its result must
            support ``pop``, item access and ``sequence_ids(i)``.
        pad_on_right: True when the question is the first sequence and the
            context the second; False for the reverse order.
        max_length: Forwarded to the tokenizer as ``max_length``.
        doc_stride: Forwarded to the tokenizer as ``stride``.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, plus ``start_positions`` / ``end_positions`` lists
        (one entry per feature). Features without a usable answer point both
        positions at the CLS token.
    """
    examples['question'] = [text.lstrip() for text in examples['question']]

    # Sequence id that marks context tokens in this feature layout.
    context_seq_id = 1 if pad_on_right else 0

    encoded = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length")

    # One entry per produced feature: index of the example it was cut from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    all_offsets = encoded.pop("offset_mapping")

    span_starts = []
    span_ends = []

    for feat_idx, offsets in enumerate(all_offsets):
        ids = encoded["input_ids"][feat_idx]
        cls_pos = ids.index(tokenizer.cls_token_id)
        seq_ids = encoded.sequence_ids(feat_idx)
        answer = examples["answers"][feature_to_example[feat_idx]]

        # No answer recorded for this example: label the feature with CLS.
        if len(answer["answer_start"]) == 0:
            span_starts.append(cls_pos)
            span_ends.append(cls_pos)
            continue

        ans_begin = answer["answer_start"][0]
        ans_end = ans_begin + len(answer["text"][0])

        # First and last token of the context inside this feature.
        ctx_first = 0
        while seq_ids[ctx_first] != context_seq_id:
            ctx_first += 1

        ctx_last = len(ids) - 1
        while seq_ids[ctx_last] != context_seq_id:
            ctx_last -= 1

        answer_in_window = (offsets[ctx_first][0] <= ans_begin
                            and offsets[ctx_last][1] >= ans_end)
        if not answer_in_window:
            # Answer characters fall outside this window: label with CLS.
            span_starts.append(cls_pos)
            span_ends.append(cls_pos)
            continue

        # Walk inward from both context edges to the tokens covering the answer.
        while ctx_first < len(offsets) and offsets[ctx_first][0] <= ans_begin:
            ctx_first += 1
        span_starts.append(ctx_first - 1)

        while offsets[ctx_last][1] >= ans_end:
            ctx_last -= 1
        span_ends.append(ctx_last + 1)

    encoded["start_positions"] = span_starts
    encoded["end_positions"] = span_ends
    return encoded

In [None]:
class Model(nn.Module):
    """Extractive-QA model: pretrained encoder plus a linear start/end logit head.

    The head is applied to the average of the last two entries of the encoder's
    hidden-state tuple and yields one start logit and one end logit per token.
    """

    def __init__(self,model_name):
        """Load the encoder from ``model_name`` and attach the span head.

        Args:
            model_name: Path or hub id accepted by ``AutoConfig`` / ``AutoModel``.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update(
            {
                "output_hidden_states": True,   # forward() reads the hidden-state tuple
                "hidden_dropout_prob": 0.0,
                "layer_norm_eps": 1e-7,
                # NOTE(review): whether the backbone honours this key depends on
                # the transformers model class - confirm. The pooler is replaced
                # with Identity below regardless.
                "add_pooling_layer": False,
            }
        )
        self.roberta = AutoModel.from_pretrained(model_name,config=self.config)
        self.roberta.pooler = nn.Identity()
        self.linear = nn.Linear(self.config.hidden_size,2)

    def loss_fn(self,start_logits,end_logits,start_positions,end_positions):
        """Weighted cross-entropy over start and end positions.

        Positions are clamped to ``[0, seq_len]``; the value ``seq_len`` is used
        as the ignore index, so out-of-range targets do not contribute.

        Returns:
            ``0.75 * start_loss + 0.25 * end_loss`` as a scalar tensor.
        """
        # DataLoader batches may arrive as (batch, 1); CrossEntropyLoss wants (batch,).
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)

        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)
        loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = 0.75 * start_loss + 0.25 * end_loss
        return total_loss

    def forward(self,**xb):
        """Compute span logits and, when labels are supplied, the training loss.

        Args:
            **xb: Must contain ``input_ids`` and ``attention_mask``. May contain
                ``start_positions`` and ``end_positions``; both are required for
                a loss to be computed.

        Returns:
            ``((start_logits, end_logits), loss)`` where ``loss`` is ``None``
            unless both position tensors were provided.
        """
        # Element 2 of the encoder output is taken as the hidden-state tuple.
        # NOTE(review): this index relies on the output layout (pooler output
        # present at index 1) - confirm if the backbone or pooler changes.
        x = self.roberta(input_ids=xb['input_ids'],attention_mask=xb['attention_mask'])[2]
        x = (x[-1] + x[-2])/2
        x = self.linear(x)
        start_logits,end_logits = x.split(1,dim=-1)
        start_logits,end_logits = start_logits.squeeze(-1).contiguous(),end_logits.squeeze(-1).contiguous()
        # .get() so label-free (inference) calls work; direct indexing raised
        # KeyError and made the None check below unreachable.
        start_positions = xb.get('start_positions')
        end_positions = xb.get('end_positions')

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_fn(start_logits, end_logits, start_positions, end_positions)

        return (start_logits,end_logits),loss

In [None]:
class ChaiiDataset:
    """Map-style dataset that exposes tokenized QA features as ``torch.long`` tensors.

    ``data`` is any indexable collection whose items are mappings containing the
    keys listed in ``_FIELDS``; other keys are ignored.
    """

    # Keys copied from each feature row into the returned sample, in this order.
    _FIELDS = ("input_ids", "attention_mask", "start_positions", "end_positions")

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the feature at ``idx`` as a dict of ``torch.long`` tensors."""
        # Fetch the row once; the previous version indexed self.data four times
        # per sample, repeating the row lookup for every field.
        row = self.data[idx]
        return {name: torch.tensor(row[name], dtype=torch.long) for name in self._FIELDS}

In [None]:
def run(fold):
    """Train on all folds except ``fold``, validate on ``fold``, return the best loss.

    Reads the module-level ``train_data`` (must have a ``Fold`` column) and
    ``config``. Whenever the validation loss does not exceed the best value so
    far, the model weights are written to ``./model{fold}/model{fold}.bin`` and
    the tokenizer is saved to ``./model{fold}``.

    Args:
        fold: Fold id used as the validation split.

    Returns:
        The lowest mean validation loss observed over the configured epochs
        (the running best starts at 9999).
    """

    def evaluate(model,valid_loader):
        """Return the mean per-batch loss of ``model`` over ``valid_loader``."""
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for inputs in tqdm(valid_loader):
                # Flatten every tensor to (batch, -1); positions become (batch, 1).
                inputs = {key:val.reshape(val.shape[0],-1) for key,val in inputs.items()}
                outputs = model(**inputs)
                loss = outputs[1]
                valid_loss += loss.item()

        valid_loss /= len(valid_loader)
        return valid_loss

    def train_and_evaluate_loop(train_loader,valid_loader,model,optimizer,
                                epoch,fold,best_loss,lr_scheduler=None):
        """Run one training epoch, validate, checkpoint on improvement.

        Returns the updated best validation loss.
        """
        train_loss = 0
        # evaluate() leaves the model in eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()
        for inputs in tqdm(train_loader):
            optimizer.zero_grad()
            inputs = {key:val.reshape(val.shape[0],-1) for key,val in inputs.items()}
            outputs = model(**inputs)
            loss = outputs[1]
            # Objects prepared by Accelerate must back-propagate through the
            # accelerator so that any scaling / distributed handling it set up
            # is applied; a bare loss.backward() bypasses that.
            accelerator.backward(loss)
            optimizer.step()

            train_loss += loss.item()

            if lr_scheduler is not None:
                lr_scheduler.step()

        train_loss /= len(train_loader)
        valid_loss = evaluate(model,valid_loader)

        if valid_loss <= best_loss:
            print(f"Epoch:{epoch} |Train Loss:{train_loss}|Valid Loss:{valid_loss}")
            print(f"{g_}Loss Decreased from {best_loss} to {valid_loss}{sr_}")

            best_loss = valid_loss
            # Save the underlying module's weights so checkpoint keys do not
            # depend on any wrapper added by accelerator.prepare().
            torch.save(accelerator.unwrap_model(model).state_dict(),f'./model{fold}/model{fold}.bin')
            tokenizer.save_pretrained(f'./model{fold}')

        return best_loss

    def build_loader(frame, training):
        """Tokenize ``frame`` into features and wrap them in a DataLoader.

        ``training`` switches shuffling and ``drop_last`` on (train) or off (valid).
        """
        dataset = Dataset.from_pandas(frame)
        features = dataset.map(
                        partial(
                            prepare_train_features,
                            tokenizer=tokenizer,
                            pad_on_right=pad_on_right,
                            max_length=config['max_length'],
                            doc_stride=config['doc_stride']
                        ),
                        batched=True,
                        # Drop this dataset's own raw columns (the validation set
                        # previously borrowed the training set's column list).
                        remove_columns=dataset.column_names)
        return DataLoader(ChaiiDataset(features),
                          batch_size = config["batch_size"],
                          num_workers = config['num_workers'],
                          shuffle=training,
                          pin_memory=True,
                          drop_last=training)

    accelerator = Accelerator()
    print(f"{accelerator.device} is used")

    x_train,x_valid = train_data.query(f"Fold != {fold}"),train_data.query(f"Fold == {fold}")

    model = Model(config['model_path'])
    tokenizer = AutoTokenizer.from_pretrained(config['model_path'])
    pad_on_right = tokenizer.padding_side == 'right'

    train_dl = build_loader(x_train, training=True)
    valid_dl = build_loader(x_valid, training=False)

    optimizer = AdamW(model.parameters(),lr=config['lr'],weight_decay=config['wd'])
    # Cosine decay without warm-up, stepped once per training batch.
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                   num_warmup_steps=0,
                                                   num_training_steps= config['epochs'] * len(train_dl))

    model,train_dl,valid_dl,optimizer,lr_scheduler = accelerator.prepare(model,train_dl,valid_dl,optimizer,lr_scheduler)

    print(f"Fold: {fold}")
    best_loss = 9999
    start_time = time.time()
    for epoch in range(config["epochs"]):
        print(f"Epoch Started:{epoch}")
        best_loss = train_and_evaluate_loop(train_dl,valid_dl,model,optimizer,epoch,fold,best_loss,lr_scheduler)

        end_time = time.time()
        print(f"{m_}Time taken by epoch {epoch} is {end_time-start_time:.2f}s{sr_}")
        start_time = end_time

    return best_loss

In [None]:
# Train and validate every fold in turn; collect each fold's best validation loss.
best_loss_per_fold = [run(f) for f in range(config['nfolds'])]

In [None]:
# Per-fold best validation losses, followed by their mean across folds.
print(best_loss_per_fold)
print(np.mean(best_loss_per_fold))

In [None]:
# Recorded result from a previous run (presumably the mean best validation loss printed above): 0.252042833133833