In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Notebooks I used for reference

https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb

https://www.kaggle.com/thedrcat/chaii-eda-baseline/comments

https://www.kaggle.com/shahules/chaii-xlm-base-custom-qa-train-infer/data?scriptVersionId=72059052

https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-fit

# Reading data

In [None]:
import pandas as pd

# Load the competition training CSV and preview its first rows.
train = pd.read_csv(
    filepath_or_buffer='../input/chaii-hindi-and-tamil-question-answering/train.csv'
)
train.head()

In [None]:
# Number of training rows per value of the `language` column.
train['language'].value_counts()

In [None]:
def convert_answers(r):
    """Pack one row's answer into the nested ``answers`` dict.

    Args:
        r: A row-like value whose first two items are, in order, the answer's
            start offset and the answer text. A pandas Series, a tuple or a
            list all work.

    Returns:
        dict: ``{'answer_start': [start], 'text': [text]}`` - each value is
        wrapped in a one-element list.
    """
    # Unpack by iteration rather than ``r[0]`` / ``r[1]``: integer keys on a
    # string-labelled Series rely on a deprecated positional fallback in pandas.
    start, text = tuple(r)[:2]
    return {
        'answer_start': [start],
        'text': [text]
    }

In [None]:
# Build the nested `answers` column row by row from the start offset and the answer text.
train['answers'] = train[['answer_start', 'answer_text']].apply(
    convert_answers,
    axis='columns',
)
train.head()

In [None]:
# train.to_csv('train.csv',index=False)

# Loading custom data in hugging face datasets lib

In [None]:
# Online install, kept for reference only; the active commands below install offline.
# ! pip install datasets transformers
# NOTE(review): fsspec is removed before installing `datasets`, presumably to avoid a
# version conflict with the bundled wheels - confirm.
!pip uninstall fsspec -qq -y
# Install `datasets` from the local wheel directory only (--no-index skips the package index).
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

In [None]:
from datasets import load_dataset, Dataset
from pprint import pprint

# Preprocessing

In [None]:
# Checkpoint used for both the tokenizer and the model. The commented-out lines are
# alternative checkpoints; only one assignment should be active at a time.
# model_checkpoint = "xlm-roberta-base"
# model_checkpoint = "deepset/xlm-roberta-large-squad2"
model_checkpoint = "../input/xlm-roberta-squad2/deepset/xlm-roberta-base-squad2"
# model_checkpoint = "../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2"

from transformers import XLMTokenizer,AutoTokenizer
# Alternative tokenizer, currently unused:
# tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
# Load the tokenizer from the same path as the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# doc_stride handles long texts: contexts that exceed max_length tokens are split into overlapping chunks

In [None]:
max_length = 384 # Maximum number of tokens in one feature (question and context together).
doc_stride = 128 # Number of tokens shared by two consecutive chunks when a long context has to be split.

# Putting it all together

In [None]:
# Record which side the tokenizer pads on. `prepare_train_features` uses this flag to
# decide whether the question or the context is passed first and which one is truncated.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label each feature with answer token positions.

    Reads the module-level ``tokenizer``, ``pad_on_right``, ``max_length`` and
    ``doc_stride``.

    Args:
        examples: A batch mapping with the keys ``"question"``, ``"context"`` and
            ``"answers"``. Each entry of ``"answers"`` is indexed with
            ``"answer_start"`` and ``"text"``. Note that ``examples["question"]``
            is overwritten with the left-stripped questions.

    Returns:
        The tokenizer output (with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed) plus two new lists, ``start_positions`` and
        ``end_positions``, holding one token index per feature. Features that have
        no answer, or whose span does not fully contain the answer, are labeled
        with the index of the CLS token for both positions.
    """
    # Some of the questions have a lot of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
    # left whitespace.
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (only the first answer is used).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text: first token that belongs to the context.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text: last token that belongs to the context.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                # The loop overshoots by one token, hence the `- 1` when recording the start.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                # Same overshoot in the other direction, hence the `+ 1` when recording the end.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

# Creating folds

In [None]:
from sklearn.model_selection import StratifiedKFold
# Assign every training row to one of `folds` validation folds, stratified by language
# so each fold keeps the same language mix.
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
# df = train[:64]
df = train
# Create the column up front so it is integer-typed (partial assignment would yield floats)
# and any row left unassigned is easy to spot as -1.
df['kfold'] = -1
for f, (t_, v_) in enumerate(kf.split(X=df, y=df.language.values)):
    # `split` yields positional row indices, so write them back positionally with `.iloc`;
    # label-based `.loc` is only equivalent while the index is the default 0..n-1 range.
    df.iloc[v_, df.columns.get_loc('kfold')] = f

In [None]:
# dataset = Dataset.from_pandas(train[:-64])
# eval_dataset = Dataset.from_pandas(train[-64:])

#just to verify pipeline
# dataset = Dataset.from_pandas(train[:32])
# eval_dataset = Dataset.from_pandas(train[32:48])

In [None]:
# tokenized_train_datasets = dataset.map(prepare_train_features, batched=True, remove_columns=dataset.column_names)
# tokenized_eval_datasets = eval_dataset.map(prepare_train_features, batched=True, remove_columns=eval_dataset.column_names)

# Finetune start

In [None]:
import gc
import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoConfig, AutoModel
from transformers import default_data_collator
from torch.utils.data import DataLoader
from torch.optim import Adam,AdamW
from tqdm import tqdm

# Alias for the default Hugging Face collator; not referenced by the other visible cells.
data_collator = default_data_collator
gc.collect()
# IPython magic: sets the WANDB_DISABLED environment variable for this kernel.
%env WANDB_DISABLED=True
# Hyper-parameters shared by the model, the data loaders and the training loop.
config = {
    # Checkpoint path passed to AutoConfig / AutoModel in `chaiiModel`.
    'model': model_checkpoint,
    # Batch size of both DataLoaders built in `getData`.
    'batch_size': 4,
    # Number of passes over the data in `train_and_eval`.
    "epochs": 3,#1,
    # Learning rate given to AdamW.
    # NOTE(review): 1e-3 looks very high for fine-tuning a pretrained transformer
    # (values around 1e-5..5e-5 are typical) - confirm this is intentional.
    'lr':0.001,
    # Weight decay given to AdamW.
    'weight_decay':0,
    # Gradient-accumulation divisor / step interval used in `train_and_eval`.
    'grad_acc': 8
}

# Model

In [None]:
# def chaiiModel(model_checkpoint):
    
#      return AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
class chaiiModel(nn.Module):
    """Pretrained encoder followed by dropout and a two-output linear head.

    The encoder and its configuration are loaded from ``config['model']``. The
    head maps every token's hidden state to two scores, which are returned as
    separate start and end logits.
    """

    def __init__(self):
        super().__init__()

        self.model_config = AutoConfig.from_pretrained(config['model'])
        self.model_config.return_dict = True
        self.model = AutoModel.from_pretrained(config['model'], config=self.model_config)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(in_features=self.model_config.hidden_size, out_features=2)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)`` for the given token batch.

        Both outputs have the shape of the encoder's first output with its last
        (hidden) dimension removed.
        """
        encoder_output = self.model(input_ids, attention_mask)
        # Element 0 of the encoder output holds the per-token hidden states.
        token_states = encoder_output[0]
        span_logits = self.fc(self.dropout(token_states))
        # The head has exactly two outputs: index 0 scores starts, index 1 scores ends.
        start_logits, end_logits = span_logits.unbind(dim=-1)
        return start_logits, end_logits

# Loss and Jaccard score

In [None]:
def safe_div(x,y):
    """Return ``x / y``, or ``1`` when ``y`` equals zero."""
    return 1 if y == 0 else x / y

def jaccard(str1, str2):
    """Jaccard similarity between the word sets of two strings.

    Both strings are lower-cased and split on whitespace. When both word sets
    are empty the union is empty and the result is ``1``.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    shared = words_a & words_b
    union_size = len(words_a) + len(words_b) - len(shared)
    return safe_div(float(len(shared)), union_size)

def get_jaccard_score(y_true,y_pred):
    """Return the sum (not the mean) of pairwise Jaccard scores.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(y_true)==len(y_pred)
    # Start from 0.0 so an empty input still yields a float.
    return sum((jaccard(y_true[i], y_pred[i]) for i in range(len(y_true))), 0.0)

def chaii_loss(start_logits, end_logits, start_positions, end_positions):
    """Average of the start-position and end-position cross-entropy losses.

    Args:
        start_logits, end_logits: Raw (unnormalized) class scores, as expected
            by ``nn.CrossEntropyLoss``.
        start_positions, end_positions: Target class indices.

    Returns:
        A scalar tensor: ``(start_loss + end_loss) / 2``.
    """
    criterion = nn.CrossEntropyLoss()
    loss_pair = (
        criterion(start_logits, start_positions),
        criterion(end_logits, end_positions),
    )
    return (loss_pair[0] + loss_pair[1]) / 2

# Generate data

In [None]:
def getData(df, fold):
    """Build the train and validation DataLoaders for one cross-validation fold.

    Rows whose ``kfold`` value equals ``fold`` form the validation split; all
    other rows form the training split. Both splits are tokenized and labeled
    with ``prepare_train_features``, and every original column is dropped.

    Args:
        df: DataFrame holding the examples and a ``kfold`` column.
        fold: Fold id to hold out for validation.

    Returns:
        dict: ``{'train': DataLoader, 'valid': DataLoader}``, both using
        ``config['batch_size']``.
    """
    # Local names avoid shadowing the notebook-level `train` frame.
    train_df = df[df['kfold'] != fold]
    valid_df = df[df['kfold'] == fold]
    dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(valid_df)
    tokenized_train_datasets = dataset.map(prepare_train_features, batched=True, remove_columns=dataset.column_names)
    tokenized_eval_datasets = eval_dataset.map(prepare_train_features, batched=True, remove_columns=eval_dataset.column_names)

    # Shuffle the training features every epoch: without it, all chunks of the same
    # context are always seen back to back and in the same order.
    train_loader = DataLoader(tokenized_train_datasets, batch_size=config['batch_size'], shuffle=True)
    # Validation order does not affect the averaged loss, so it stays sequential.
    valid_loader = DataLoader(tokenized_eval_datasets, batch_size=config['batch_size'])
    dataloaders = {'train': train_loader, 'valid': valid_loader}

    return dataloaders

# Train and eval

In [None]:
def _to_batch_first(field, device):
    """Return a token-level batch field as a ``(batch, seq_len)`` tensor on ``device``."""
    if torch.is_tensor(field):
        # assumes an already-collated tensor is (batch, seq_len) - TODO confirm if the
        # dataset format is ever switched to tensors.
        return field.to(device)
    # Default collation of list-valued columns yields one tensor of shape (batch,) per
    # token position; stacking on dim=1 puts the batch dimension first. A plain
    # `torch.stack(field)` would give (seq_len, batch) and feed the model transposed input.
    return torch.stack(list(field), dim=1).to(device)


def train_and_eval(model, dataloaders, criterion, optimizer, filename):
    """Train ``model`` and validate it after every epoch, keeping the best weights.

    Runs ``config['epochs']`` epochs. Each epoch does one pass over
    ``dataloaders['train']`` with gradient accumulation over
    ``config['grad_acc']`` batches, followed by one gradient-free pass over
    ``dataloaders['valid']``. Whenever the mean validation loss improves, the
    model's ``state_dict`` is saved to ``filename``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning ``(start_logits, end_logits)``.
        dataloaders: Dict with ``'train'`` and ``'valid'`` loaders whose batches
            provide ``input_ids``, ``attention_mask``, ``start_positions`` and
            ``end_positions``.
        criterion: Callable ``(start_logits, end_logits, start, end) -> loss``.
        optimizer: Optimizer over the model's parameters.
        filename: Path the best ``state_dict`` is written to.

    Returns:
        str: ``'Train val complete'``.
    """
    # Use the device the model already lives on instead of hard-coding CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')
    grad_acc = config['grad_acc']

    val_loss = 10000
    for i in range(config['epochs']):
        for j in ['train','valid']:
            is_train = j == 'train'
            if is_train:
                print(f'training start for epoch {i}')
                model.train()
                optimizer.zero_grad()
            else:
                print(f'eval start for epoch {i}')
                model.eval()

            epoch_loss = 0.0
            pending = 0  # micro-batches accumulated since the last optimizer step
            # No autograd graph is needed during validation; this saves memory and time.
            with torch.set_grad_enabled(is_train):
                for idx, data in enumerate(tqdm(dataloaders[j])):
                    input_ids = _to_batch_first(data['input_ids'], device)
                    attention_mask = _to_batch_first(data['attention_mask'], device)
                    # `as_tensor` avoids re-wrapping positions that are already tensors.
                    start = torch.as_tensor(data['start_positions']).to(device)
                    end = torch.as_tensor(data['end_positions']).to(device)

                    # Logits come back as (batch, seq_len), which is exactly what the
                    # cross-entropy criterion expects - no permute required.
                    start_logits, end_logits = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask
                    )
                    loss = criterion(start_logits, end_logits, start, end)
                    epoch_loss += loss.item()

                    if is_train:
                        # Scale so the accumulated gradient is an average over the window.
                        (loss / grad_acc).backward()
                        pending += 1
                        # Step only after a full window, not after the very first batch.
                        if pending == grad_acc:
                            optimizer.step()
                            optimizer.zero_grad()
                            pending = 0

            # Flush the gradients of a trailing, incomplete accumulation window.
            if is_train and pending:
                optimizer.step()
                optimizer.zero_grad()

            avg_epoch_loss = epoch_loss/len(dataloaders[j])
            print(f'{j} loss:{avg_epoch_loss} for epoch {i}')
            if j == 'valid':
                if val_loss>avg_epoch_loss:
                    print(f'saving model initial val loss:{val_loss} which is now improve to {avg_epoch_loss}')
                    val_loss = avg_epoch_loss
                    torch.save(model.state_dict(), filename)
                else:
                    print('No improvement')
    return 'Train val complete'
            
#     pass

# Start

In [None]:
# args = getTrainArgs(model_checkpoint)
# Train one model per cross-validation fold; each fold saves its best weights to
# "chaii-trained-model-<fold>".
for fold in range(folds):
    print(f'Preparing data for fold number {fold}')
    dataloaders = getData(df, fold)
    print(f'model {fold} loading ...')
    model = chaiiModel()
    model.cuda()
    criterion = chaii_loss
    optimizer = AdamW(model.parameters(),lr=config['lr'], weight_decay=config['weight_decay'] )
    train_and_eval(model, dataloaders, criterion, optimizer, f"chaii-trained-model-{fold}")
    # The optimizer keeps references to the model's parameters (and its own state tensors),
    # so it must be dropped together with the model or `empty_cache()` cannot release
    # the GPU memory before the next fold starts.
    del model, optimizer, dataloaders
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Training fold {fold}")
    print("----------------------")