In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim
from torch.utils.data import (
    Dataset, DataLoader,
    SequentialSampler, RandomSampler
)
from torch.utils.data.distributed import DistributedSampler

try:
    from apex import amp
    APEX_INSTALLED = True
except ImportError:
    APEX_INSTALLED = False

import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    MT5ForConditionalGeneration,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)
logging.set_verbosity_warning()
logging.set_verbosity_error()

# Helper functions

def fix_all_seeds(seed):
    """Seed every random number generator this notebook touches.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random``, NumPy, and
    PyTorch (the CPU generator plus the current and all CUDA devices).

    Args:
        seed: integer seed applied to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # The generators are independent, so the seeding order does not matter.
    for apply_seed in seeders:
        apply_seed(seed)

def optimal_num_of_loader_workers():
    """Pick a DataLoader worker count from the visible CPUs and GPUs.

    Returns:
        ``min(cpu_count, 4 * gpu_count)`` when at least one CUDA device is
        reported, otherwise ``cpu_count - 1``.
    """
    cpu_total = multiprocessing.cpu_count()
    gpu_total = torch.cuda.device_count()
    if gpu_total:
        # Cap at four workers per GPU, but never exceed the CPU count.
        return min(cpu_total, gpu_total * 4)
    # No GPU: leave one CPU free for the main process.
    return cpu_total - 1

# Report whether the optional Apex import at the top of the notebook succeeded.
print(f"Apex AMP Installed :: {APEX_INSTALLED}")

# Keys of the question-answering model mapping, and the model_type of each key.
MODEL_CONFIG_CLASSES = [*MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()]
MODEL_TYPES = tuple(
    config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES
)

In [None]:
class Configration:
    """Settings shared by the MT5 cells of this notebook.

    Every value is a plain class attribute. The notebook instantiates the
    class once as ``args`` and reads the fields it needs from that object.
    """

    # ---- model ----
    # NOTE(review): the value says 'xlm_roberta' although the checkpoints below
    # are MT5; none of the cells visible here read it -- confirm before use.
    model_type = 'xlm_roberta'
    MT5_SMALL_name_or_path = "../input/chaii-mt5/mT5-small-finetuned-tydiqa-for-xqa/mT5-small-finetuned-tydiqa-for-xqa"
    MT5_BASE_name_or_path = "../input/chaii-mt5/mT5-base-finetuned-tydiQA-xqa/mT5-base-finetuned-tydiQA-xqa"
    MT5_SMALL_config_name = "../input/chaii-mt5/mT5-small-finetuned-tydiqa-for-xqa/mT5-small-finetuned-tydiqa-for-xqa/config.json"
    MT5_BASE_config_name = "../input/chaii-mt5/mT5-base-finetuned-tydiQA-xqa/mT5-base-finetuned-tydiQA-xqa/config.json"
    # The fp16 flag mirrors whether the Apex import succeeded.
    fp16 = bool(APEX_INSTALLED)
    fp16_opt_level = "O1"
    gradient_accumulation_steps = 2

    # ---- tokenizer ----
    MT5_SMALL_tokenizer_name = "../input/chaii-mt5/mT5-small-finetuned-tydiqa-for-xqa/mT5-small-finetuned-tydiqa-for-xqa"
    MT5_BASE_tokenizer_name = "../input/chaii-mt5/mT5-base-finetuned-tydiQA-xqa/mT5-base-finetuned-tydiQA-xqa"
    # Length every encoded question/context pair is padded or truncated to.
    max_seq_length = 400
    doc_stride = 135

    # ---- train ----
    epochs = 1
    train_batch_size = 4
    # Batch size used by the test DataLoader.
    eval_batch_size = 128

    # ---- optimizer ----
    optimizer_type = 'AdamW'
    learning_rate = 1e-5
    weight_decay = 1e-2
    epsilon = 1e-8
    max_grad_norm = 1.0

    # ---- scheduler ----
    decay_name = 'linear-warmup'
    warmup_ratio = 0.1

    # ---- logging ----
    logging_steps = 10

    # ---- evaluate ----
    output_dir = 'output'
    seed = 2021


# Single shared settings object used by the rest of the notebook.
args = Configration()

In [None]:
# Read test.csv into a DataFrame and preview its first rows.
test=pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
test.head()

In [None]:
# Whitespace-split a sample question to look at its word-level pieces.
("ज्वाला गुट्टा की माँ का नाम क्या है").split()

In [None]:
class chaiiS2SDataset(Dataset):
    """Map-style dataset that tokenizes one question/context pair per item.

    Args:
        data: table indexed positionally through ``.iloc`` whose rows expose
            ``'question'`` and ``'context'`` (a pandas DataFrame in this
            notebook).
        tokenizer: callable invoked as ``tokenizer(question, context, ...)``
            that returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        args: settings object; only ``max_seq_length`` is read here.
    """

    def __init__(self, data, tokenizer, args):
        self.data = data
        self.tokenizer = tokenizer
        self.args = args

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode row ``idx`` and return its raw text next to the tensors."""
        row = self.data.iloc[idx]
        question, context = row['question'], row['context']

        # Pad to max_seq_length; when the pair is too long only the context
        # (the second sequence) is truncated, never the question.
        encoded = self.tokenizer(
            question,
            context,
            max_length=self.args.max_seq_length,
            padding='max_length',
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        # The tokenizer output is flattened to 1-D so the DataLoader can
        # stack the per-example tensors into a batch.
        return {
            "question": question,
            "context": context,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
        }

In [None]:
# Build the tokenizer and an MT5 model from the small checkpoint's config,
# then restore the fine-tuned fold-0 weights into it.
tokenizer = AutoTokenizer.from_pretrained(args.MT5_SMALL_tokenizer_name)
config = AutoConfig.from_pretrained(args.MT5_SMALL_config_name)
model = MT5ForConditionalGeneration(config=config)
model.cuda()
# A module constructed directly from a config starts in training mode, which
# keeps dropout active; switch to eval mode so generation is deterministic.
model.eval()
# Deserialize onto CPU so the checkpoint tensors do not occupy a second copy
# of GPU memory; load_state_dict copies them into the GPU parameters.
# Kept as the last expression so the cell still displays the key-matching result.
model.load_state_dict(
    torch.load(
        "../input/chaii-mt5/MT5-small-xqa/output/checkpoint-fold-0/pytorch_model.bin",
        map_location="cpu",
    )
)

In [None]:
# Wrap the test frame in the seq2seq dataset and batch it in its original
# order (no shuffling) with the evaluation batch size.
test_dataset = chaiiS2SDataset(data=test, tokenizer=tokenizer, args=args)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=args.eval_batch_size,
    shuffle=False,
    num_workers=optimal_num_of_loader_workers(),
)

In [None]:
# Pull the first batch from the test loader for inspection.
batch=next(iter(test_dataloader))

In [None]:
# Show the question strings carried along in the batch.
batch["question"]

In [None]:
# Inspect the token ids the tokenizer produces for a single space.
tokenizer.encode(" ")

In [None]:
# Decode the first example's input ids back to text to check the encoding.
tokenizer.decode(batch["input_ids"][0])

In [None]:
# Show the raw input ids of the first example in the batch.
batch["input_ids"][0]

In [None]:
# Move the encoder inputs of the inspected batch onto the GPU.
input_ids, attention_mask = (
    batch[key].cuda() for key in ("input_ids", "attention_mask")
)

# Generate with a single beam and explicit length / repetition settings.
# NOTE(review): early_stopping looks like a beam-search option and may have no
# effect with num_beams=1 -- confirm against the transformers documentation.
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    num_beams=1,  # greedy search
    max_length=80,
    repetition_penalty=2.5,
    early_stopping=True,
    use_cache=True,
)

In [None]:
# Show the generated token ids as nested Python lists.
generated_ids.detach().cpu().tolist()

In [None]:
# Check the shape of the batched input ids.
input_ids.shape

In [None]:
def generate_answer(batch, **generate_kwargs):
    """Generate answer token ids for one batch with the notebook-level ``model``.

    Args:
        batch: mapping holding ``"input_ids"`` and ``"attention_mask"``
            tensors, as produced by the test DataLoader.
        **generate_kwargs: optional extra keyword arguments forwarded to
            ``model.generate``, for example ``num_beams=1, max_length=80,
            repetition_penalty=2.5, early_stopping=True, use_cache=True``.
            When none are given the model's default generation settings are
            used, exactly as before.

    Returns:
        A list with one list of generated token ids per example in the batch.
    """
    # Follow the model's own device instead of hard-coding .cuda(), so the
    # helper also works when the model sits on CPU or a non-default GPU.
    device = next(model.parameters()).device
    generated_ids = model.generate(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        **generate_kwargs,
    )
    return generated_ids.detach().cpu().tolist()

In [None]:
# Run generation over the whole test loader, gathering one list of token ids
# per example in loader order.
predictions = []
for batch in test_dataloader:
    answer_tokens = generate_answer(batch)
    predictions += answer_tokens

In [None]:
# Decode every generated sequence as a whole. Decoding token by token and
# joining with spaces splits subword pieces apart and leaves special tokens
# such as padding and end-of-sequence markers inside the answer text.
decoded_preds = tokenizer.batch_decode(
    predictions,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

In [None]:
# Display the decoded prediction strings.
decoded_preds