In [None]:
# Extend the import search path with two input-dataset directories.
# The later `from src.accelerate import Accelerator` import presumably resolves
# against the first directory and `transformers` against the second — verify
# against the dataset layout.
import sys
sys.path.append('../input/huggingface-accelerate')
sys.path.append('../input/transformershuggingface/transformer_repo')

In [None]:
# Install the package that provides `from madgrad import MADGRAD` in the import cell.
# NOTE(review): the version is unpinned and, unlike the wheel-based install of
# `datasets` below, this fetches from the package index. MADGRAD is imported
# but not used by the visible train() — confirm it is still needed.
!pip -q install madgrad

In [None]:
# Local directory of the pretrained checkpoint; passed to the
# AutoConfig / AutoTokenizer / AutoModelForQuestionAnswering `from_pretrained` calls.
model_checkpoint='../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'

In [None]:
# Remove any preinstalled fsspec, then install `datasets` strictly from the
# local wheel directory (`--no-index` disables the package index).
# NOTE(review): presumably fsspec is removed to avoid a version conflict with
# the wheel set — confirm.
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

In [None]:
import numpy as np
import pandas as pd
import gc
import os
import sys
from IPython.display import FileLink
%env WANDB_DISABLED=True
import torch
import datasets as d
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
from src.accelerate import Accelerator
from torch.optim.swa_utils import AveragedModel, SWALR
from torch.optim.lr_scheduler import CosineAnnealingLR
import math
import os
import random
from tqdm.auto import tqdm
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    default_data_collator,
    get_scheduler,
    set_seed)
from madgrad import MADGRAD

In [None]:
# Seed every random number generator used by the notebook with one shared value.
seed = 124
for seed_fn in (
    random.seed,                 # Python stdlib RNG
    torch.manual_seed,           # torch default generator
    np.random.seed,              # NumPy legacy global RNG
    torch.cuda.manual_seed,      # current CUDA device
    torch.cuda.manual_seed_all,  # all CUDA devices
    set_seed,                    # transformers helper
):
    seed_fn(seed)

In [None]:
# Load data: the competition CSVs plus two auxiliary Hindi QA datasets.
import pandas as pd
# Competition files. Only `train_df` is used by later visible cells;
# `test_df` and `submission_df` are loaded but not referenced again.
train_df=pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')
test_df=pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
submission_df=pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv')
# Auxiliary data, appended to `train_df` in the next cell.
squad=pd.read_csv('../input/squadhindi/xquad.csv')
mlqa=pd.read_csv('../input/mlqa-hindi-processed/mlqa_hindi.csv')

In [None]:
# Give the auxiliary rows synthetic 'sar<N>' ids: mlqa counts up from 0, xquad from 5427.
# NOTE(review): 5427 presumably keeps the xquad ids clear of the mlqa range; that
# only holds while mlqa has at most 5427 rows — confirm.
squad['id'] = [f'sar{number}' for number in range(5427, 5427 + len(squad))]
mlqa['id'] = [f'sar{number}' for number in range(len(mlqa))]
# Append the xquad and mlqa frames to the competition training frame.
train_df = pd.concat([train_df, squad, mlqa])

In [None]:
def get_answers_as_dict(r):
    """Convert one (answer_start, answer_text) row into an answers dict.

    Args:
        r: An ordered row whose first value is the answer's start offset and
            whose second value is the answer text. May be a pandas Series
            (as passed by ``DataFrame.apply(..., axis=1)``), a tuple or a list.

    Returns:
        dict: ``{'answer_start': [start], 'text': [text]}`` — each value is a
        one-element list, the shape read by ``prepare_train_features``.
    """
    # Unpack by iteration order instead of `r[0]` / `r[1]`: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback in pandas.
    start, text = tuple(r)[:2]
    return {
        'answer_start': [start],
        'text': [text]
    }

# Build the per-row `answers` dict column that prepare_train_features reads.
train_df['answers'] = train_df[['answer_start', 'answer_text']].apply(get_answers_as_dict, axis=1)

# The earlier pd.concat kept each source frame's own index, so labels repeat.
# Replace them with a fresh 0..n-1 index so the positional indices yielded by
# StratifiedKFold.split can be used with `.loc` in the fold-assignment cell.
train_df.reset_index(drop=True,inplace=True)

In [None]:
# Tag every row with a fold id in 0..5, stratified on the `language` column.
# -99 is a placeholder that each row's single held-out assignment overwrites.
train_df['fold'] = -99
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)
for fold_id, (_, holdout_rows) in enumerate(skf.split(train_df, train_df['language'])):
    train_df.loc[holdout_rows, 'fold'] = fold_id

#train_df.to_csv('train_df.csv',index=False)

In [None]:
# hyper parameters
# NOTE: the commented-out settings in this cell are now keyword arguments of
# train(), which declares the same default values.
#gradient_accumulation_steps = 2
max_length = 384  # tokenizer max_length in prepare_train_features
doc_stride = 128  # tokenizer stride (window overlap) in prepare_train_features
# train
num_train_epochs = 1  # read as a global by train()
train_batch_size = 4  # DataLoader batch size, read as a global by train()
eval_batch_size = 8  # not referenced by any visible cell
# optimizer
#learning_rate = 1.5e-5
# weight_decay = 1e-2
#epsilon=1e-8
# scheduler
#scheduler= 'linear'
warmup_ratio = 0.1  # not referenced by any visible cell; train() takes num_warmup_steps
#num_warmup_steps=0
# evaluate
output_dir = 'output'  # base directory; train() appends "checkpoint_fold-<fold>"

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label each feature with answer token positions.

    Reads the notebook-level `tokenizer`, `max_length` and `doc_stride`.
    A long context is split into several overlapping features; a feature
    that does not fully contain the answer span is labelled with the
    position of the CLS token for both start and end.

    Args:
        examples: Batch mapping with "question", "context" and "answers"
            entries. "question" is overwritten with left-stripped strings.

    Returns:
        The tokenizer output, minus "overflow_to_sample_mapping" and
        "offset_mapping", plus "start_positions" and "end_positions".
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature index -> index of the example the feature was cut from
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    # per feature: (char_start, char_end) for each token
    all_offsets = encoded.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(all_offsets):
        input_ids = encoded["input_ids"][feature_idx]
        # Features without a usable answer point at the CLS token.
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # Per-token sequence id: 1 marks context tokens (second sequence).
        sequence_ids = encoded.sequence_ids(feature_idx)

        answers = examples["answers"][feature_to_example[feature_idx]]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the answer inside the context.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First context token of this feature.
        left = 0
        while sequence_ids[left] != 1:
            left += 1

        # Last context token of this feature.
        right = len(input_ids) - 1
        while sequence_ids[right] != 1:
            right -= 1

        answer_in_window = offsets[left][0] <= start_char and offsets[right][1] >= end_char
        if not answer_in_window:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk both cursors inward until they sit just past the answer's ends,
        # then step back by one. The bound check on `left` covers the case
        # where the answer is the very last word.
        while left < len(offsets) and offsets[left][0] <= start_char:
            left += 1
        start_positions.append(left - 1)
        while offsets[right][1] >= end_char:
            right -= 1
        end_positions.append(right + 1)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

In [None]:
def _layerwise_param_groups(model, learning_rate, weight_decay, no_decay):
    """Build the eight optimizer parameter groups used by `train`.

    Parameters are split along two axes:
      * weight decay: names containing any `no_decay` marker get 0.0,
        all others get `weight_decay`;
      * layer tier: 'layer.0.'-'layer.3.' use learning_rate/2,
        'layer.4.'-'layer.7.' use learning_rate, 'layer.8.'-'layer.11.' use
        learning_rate*2; everything else carries no 'lr' key and so takes
        the optimizer's default learning rate.

    NOTE(review): only layer indices 0-11 are tiered. If the checkpoint has
    more encoder layers, those fall into the default-lr bucket — confirm
    that is intended.

    Returns:
        list[dict]: groups ordered as decayed (other, tier 1, tier 2, tier 3)
        followed by non-decayed (other, tier 1, tier 2, tier 3).
    """
    layer_tiers = (
        (('layer.0.', 'layer.1.', 'layer.2.', 'layer.3.'), learning_rate / 2),
        (('layer.4.', 'layer.5.', 'layer.6.', 'layer.7.'), learning_rate),
        (('layer.8.', 'layer.9.', 'layer.10.', 'layer.11.'), learning_rate * 2),
    )
    # Slot 0 collects parameters outside the listed layers; slots 1..3 follow layer_tiers.
    decayed = [[] for _ in range(len(layer_tiers) + 1)]
    undecayed = [[] for _ in range(len(layer_tiers) + 1)]

    # Single pass over the parameters instead of one scan per group.
    for name, param in model.named_parameters():
        slot = 0
        for tier_idx, (markers, _) in enumerate(layer_tiers, start=1):
            if any(marker in name for marker in markers):
                slot = tier_idx
                break
        buckets = undecayed if any(nd in name for nd in no_decay) else decayed
        buckets[slot].append(param)

    groups = []
    for buckets, decay in ((decayed, weight_decay), (undecayed, 0.0)):
        groups.append({'params': buckets[0], 'weight_decay': decay})
        for (_, tier_lr), params in zip(layer_tiers, buckets[1:]):
            groups.append({'params': params, 'weight_decay': decay, 'lr': tier_lr})
    return groups


def train(data,fold,model,config,tokenizer,output_dir,learning_rate=1.5e-5,
          gradient_accumulation_steps = 2,scheduler= 'linear',
         num_warmup_steps=0,
         epsilon=1e-8,correct_bias=True,
         no_decay = ["bias", "LayerNorm.weight"],
         weight_decay = 1e-2):
    """Fine-tune `model` on every fold except `fold` and save a checkpoint.

    Reads the notebook-level `train_batch_size`, `num_train_epochs` and
    `prepare_train_features`.

    Args:
        data: DataFrame with a 'fold' column and a 'language' column, plus the
            columns `prepare_train_features` reads. It is not modified.
        fold: Fold id to hold out; rows with this id are excluded from training.
        model: Question-answering model to fine-tune (updated in place).
        config: Model config; saved next to the weights.
        tokenizer: Tokenizer; saved next to the weights.
        output_dir: Base directory; files go to
            ``<output_dir>/checkpoint_fold-<fold>``.
        learning_rate: Base learning rate (see `_layerwise_param_groups`).
        gradient_accumulation_steps: Micro-batches accumulated per optimizer step.
        scheduler: Scheduler name passed to `get_scheduler`.
        num_warmup_steps: Warm-up steps passed to `get_scheduler`.
        epsilon: AdamW `eps`.
        correct_bias: Forwarded to AdamW.
        no_decay: Name fragments of parameters that get no weight decay.
            Read only, never mutated.
        weight_decay: Weight decay for all other parameters.

    Returns:
        None. The checkpoint directory receives ``pytorch_model.bin`` plus the
        saved config and tokenizer files.
    """
    accelerator = Accelerator()

    # Filter and drop in one non-mutating chain, so no in-place edit is made
    # on a slice of the caller's frame.
    train_frame = data[data['fold'] != fold].drop(columns='language')
    train_set = d.Dataset.from_pandas(train_frame)
    train_set = train_set.map(
        prepare_train_features, batched=True, remove_columns=train_set.column_names)
    train_dataloader = DataLoader(
        train_set, shuffle=True, collate_fn=default_data_collator, batch_size=train_batch_size)
    del train_frame, train_set

    optimizer = AdamW(
        _layerwise_param_groups(model, learning_rate, weight_decay, no_decay),
        lr=learning_rate,
        eps=epsilon,
        weight_decay=weight_decay,
        # Previously accepted by train() but never forwarded.
        correct_bias=correct_bias)

    # Prepare everything with the `accelerator`.
    model, optimizer, train_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader)

    num_batches = len(train_dataloader)
    num_update_steps_per_epoch = math.ceil(num_batches / gradient_accumulation_steps)
    max_train_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        name=scheduler,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=max_train_steps)

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for _ in range(num_train_epochs):
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Use the accelerator's device rather than a notebook-global
            # `device` that is only defined in a later cell.
            batch = {key: value.to(accelerator.device) for key, value in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss / gradient_accumulation_steps
            accelerator.backward(loss)

            # Step after every full accumulation window, or on the final
            # (possibly partial) window. `(step + 1)` is required: testing
            # `step % n == 0` would fire on step 0 after a single micro-batch
            # and could reach max_train_steps before the last batch is seen.
            window_full = (step + 1) % gradient_accumulation_steps == 0
            if window_full or step == num_batches - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= max_train_steps:
                break

    output_dir = os.path.join(output_dir, f"checkpoint_fold-{fold}")
    os.makedirs(output_dir, exist_ok=True)
    # Unwrap first so the saved keys match the bare model even when
    # `accelerator.prepare` wrapped it.
    torch.save(accelerator.unwrap_model(model).state_dict(), f"{output_dir}/pytorch_model.bin")
    config.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Drop local references to the large objects before collecting.
    del model, tokenizer, config, train_dataloader, optimizer, lr_scheduler, accelerator
    gc.collect()
    return



In [None]:
#def l2_regularizer(weight_decay):
 #   def regularizer(model):
  #      l2 = 0.0
  #      for p in model.parameters():
   #         l2 += torch.sqrt(torch.sum(p ** 2))
    #    return 0.5 * weight_decay * l2
    #return regularizer

#def cyclic_learning_rate(epoch, cycle, alpha_1, alpha_2):
 #   def schedule(iter):
  #      t = ((epoch % cycle) + iter) / cycle
   #     if t < 0.5:
    #        return alpha_1 * (1.0 - 2.0 * t) + alpha_2 * 2.0 * t
     #   else:
      #      return alpha_1 * (2.0 * t - 1.0) + alpha_2 * (2.0 - 2.0 * t)
    #return schedule


#def adjust_learning_rate(optimizer, lr):
 #   for param_group in optimizer.param_groups:
  #      param_group['lr'] = lr
   # return lr

In [None]:
#def train(data,fold,model,config,tokenizer,output_dir,learning_rate=1.5e-5,cycle=4, lr_1=1.5e-5, lr_2=1.5e-6,
 #         gradient_accumulation_steps = 2, regularizer=None,
  #       epsilon=1e-8,
   #      no_decay = ["bias", "LayerNorm.weight"],
    #     weight_decay = 1e-4):
    #accelerator = Accelerator()
    #data=data[data['fold']!=fold]
    #data.drop('language',axis=1,inplace=True)
    #data=d.Dataset.from_pandas(data)
    #data= data.map(prepare_train_features, batched=True,remove_columns=data.column_names)
    #train_dataloader = DataLoader(
     #   data, shuffle=True, collate_fn= default_data_collator, batch_size=train_batch_size)
    #del data
    #group1=['layer.0.','layer.1.','layer.2.','layer.3.']
    #group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
    #group3=['layer.8.','layer.9.','layer.10.','layer.11.']
    #group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.',
     #              'layer.5.','layer.6.','layer.7.','layer.8.','layer.9.',
      #             'layer.10.','layer.11.']
    #optimizer_grouped_parameters = [
        #{
           # "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
           # "weight_decay":weight_decay,
       # },
        #{
           # "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
           # "weight_decay": 0.0,
        #},
     #   {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
      #   'weight_decay': weight_decay},
       #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
        #     'weight_decay':weight_decay, 'lr': learning_rate/2},
         #   {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
          #   'weight_decay':weight_decay, 'lr': learning_rate},
           # {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
            # 'weight_decay':weight_decay, 'lr': learning_rate*2},
            #{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
            # 'weight_decay': 0.0},
            #{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
            # 'weight_decay': 0.0, 'lr': learning_rate/2},
            #{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
            # 'weight_decay': 0.0, 'lr': learning_rate},
            #{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
            # 'weight_decay': 0.0, 'lr': learning_rate*2}
         #]


    #optimizer =AdamW(optimizer_grouped_parameters, lr=learning_rate,
     #                eps=epsilon,weight_decay=weight_decay)
#
 #   # Prepare everything with the `accelerator`.
  #  model, optimizer, train_dataloader= accelerator.prepare(
   #     model, optimizer, train_dataloader)    
  #
   # num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    #max_train_steps = num_train_epochs * num_update_steps_per_epoch
    
    # Only show the progress bar once on each machine.
    #progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
    #completed_steps=0
    
    #for epoch in range(num_train_epochs):
     #   model.train()
      #  lr_schedule =cyclic_learning_rate(epoch, cycle, lr_1, lr_2)
        
       # for step, batch in enumerate(train_dataloader):
        #    for key,value in batch.items():
         #       batch[key]=value.to(device)
            
            
          #  outputs = model(**batch)
           # loss = outputs.loss
            #loss = loss/gradient_accumulation_steps
            #if regularizer is not None:
             #   loss += regularizer(model)
            
            #accelerator.backward(loss)
            #if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
             #   optimizer.step()
             #   lr = lr_schedule(step/max_train_steps)
              #  adjust_learning_rate(optimizer, lr)
                    
               # optimizer.zero_grad()
                #progress_bar.update(1)
                #completed_steps += 1

            #if completed_steps >= max_train_steps:
             #   break
                
    #output_dir=os.path.join(output_dir,f"checkpoint_fold-{fold}")
    #if not os.path.exists(output_dir):
     #   os.makedirs(output_dir)
    #torch.save(model.state_dict(), f"{output_dir}/pytorch_model.bin")
    #config.save_pretrained(output_dir)
    #tokenizer.save_pretrained(output_dir)
    #del model
    #del tokenizer
    #del config
    #del train_dataloader
    #del optimizer
    #del accelerator
    #gc.collect()
    #return 

In [None]:
# Load config, fast tokenizer and QA model from the local checkpoint directory.
config = AutoConfig.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint, config=config)
# Fall back to CPU instead of raising when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# NOTE(review): `d3` is not referenced by any visible cell — candidate for removal.
d3 = 's'

In [None]:
#for n,p in model.named_parameters():
    #print(n)
    #print("**********************")

In [None]:
#weight_decay=1e-3
#regularizer=l2_regularizer(weight_decay)

In [None]:
#train(train_df,0,model,config,tokenizer,output_dir,learning_rate=1.5e-5,
 #         gradient_accumulation_steps = 2,scheduler= 'linear',
  #      num_warmup_steps=0,epsilon=1e-8,correct_bias=True,no_decay = ["bias", "LayerNorm.weight"],
   #   weight_decay = 1e-2)
#FileLink(r"./output/checkpoint_fold-0/pytorch_model.bin")


In [None]:
#train(train_df,1,model,config,tokenizer,output_dir,learning_rate=1.4e-5,
 #        gradient_accumulation_steps = 2,scheduler= 'cosine',
  #      num_warmup_steps=0,epsilon=1e-8,correct_bias=True,no_decay = ["bias", "LayerNorm.weight"],
   #     weight_decay = 1e-2)
#FileLink(r"./output/checkpoint_fold-1/pytorch_model.bin")


In [None]:
#train(train_df,2,model,config,tokenizer,output_dir,learning_rate=1.5e-5,
  #      gradient_accumulation_steps = 2,scheduler= 'linear',
   #      num_warmup_steps=0,epsilon=1e-8,correct_bias=True,no_decay = ["bias", "LayerNorm.weight"],
    #    weight_decay = 1e-4)
#FileLink(r"./output/checkpoint_fold-2/pytorch_model.bin")


In [None]:
#train(train_df,3,model,config,tokenizer,output_dir,learning_rate=1.5e-5,
   # gradient_accumulation_steps = 2,scheduler= 'cosine',
   # num_warmup_steps=0,epsilon=1e-8,correct_bias=True,no_decay = ["bias", "LayerNorm.weight"],
  #  weight_decay = 1e-2)
#FileLink(r"./output/checkpoint_fold-3/pytorch_model.bin")


In [None]:
# train fold 4 (NOTE: the call below uses learning_rate=1.5e-5, the same as the other folds — it is not halved)
#train(train_df,4,model,config,tokenizer,output_dir,learning_rate=1.5e-5,
 #        gradient_accumulation_steps = 2,scheduler= 'linear',
  #       num_warmup_steps=0,epsilon=1e-8,correct_bias=True,no_decay = ["bias", "LayerNorm.weight"],
   #  weight_decay = 1e-2)
#FileLink(r"./output/checkpoint_fold-4/pytorch_model.bin")


In [None]:
#train(train_df,5,model,config,tokenizer,output_dir,learning_rate=1.5e-5,
 #         gradient_accumulation_steps = 2,scheduler= 'linear',
  #       num_warmup_steps=2,epsilon=1e-8,correct_bias=True,no_decay = ["bias", "LayerNorm.weight"],
   #  weight_decay = 1e-2)
#FileLink(r"./output/checkpoint_fold-5/pytorch_model.bin")

