In [None]:
# Inference path switch: True wraps the model in the transformers `pipeline` helper,
# False runs the manual batched loop.
USE_PIPELINE = False
# Directory of the base model; also used as the lookup key into MODEL_PATH_TO_OBJECT.
MODEL_PATH = "../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2" # alternative: "../input/localnb001-export-transformers"
# File stem of the fine-tuned checkpoint (".pth" is appended where it is loaded).
MODEL_NAME_FINETUNED = "model_deepset_xlm_roberta_large_squad2" # alternative: "model"
batch_size = 2 # DataLoader batch size; it's only used when USE_PIPELINE = False

from transformers import BertTokenizerFast, XLMRobertaTokenizerFast
from transformers import BertForQuestionAnswering, XLMRobertaForQuestionAnswering

# Registry of supported model directories. Each value is a two-element list:
# index 0 is the tokenizer class, index 1 is the question-answering model class.
MODEL_PATH_TO_OBJECT = {
    "../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2": [XLMRobertaTokenizerFast, XLMRobertaForQuestionAnswering],
    "../input/localnb001-export-transformers": [BertTokenizerFast, BertForQuestionAnswering]
}



import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#from statistics import mode
import collections
from tqdm import tqdm
import re
from transformers import pipeline
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# from transformers import pipeline
# if DISABLE_INTERNET:
#     model_path = "../input/localnb001-export-transformers"
#     model = pipeline('question-answering', model=model_path, tokenizer=model_path, device=0)
# else:
#     model = pipeline('question-answering', model='bert-base-multilingual-cased', device=0)


# Instantiate the tokenizer (index 0) and model (index 1) classes registered
# for MODEL_PATH and load the base weights from that directory.
tokenizer = MODEL_PATH_TO_OBJECT[MODEL_PATH][0].from_pretrained(MODEL_PATH)
model = MODEL_PATH_TO_OBJECT[MODEL_PATH][1].from_pretrained(MODEL_PATH)


# Overwrite the base weights with the fine-tuned ones. Only the
# 'model_state_dict' entry of the checkpoint is used here.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
output_model = f"../input/localnb002-fine-tune/{MODEL_NAME_FINETUNED}.pth"
checkpoint = torch.load(output_model, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

# Choose the device once so that both inference paths fall back to the CPU
# when no GPU is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

if USE_PIPELINE:
    # The pipeline takes a device ordinal: >= 0 selects that CUDA device, -1 the CPU.
    model = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        device=0 if device.type == "cuda" else -1,
    )
else:
    model.to(device)

# Inference

In [None]:
# Read the competition test set and preview its first rows.
test = pd.read_csv(
    filepath_or_buffer="../input/chaii-hindi-and-tamil-question-answering/test.csv"
)
test.head()

In [None]:
def custom_mode(x):
    """
    Return the most frequent non-empty string of `x`.

    Empty strings are ignored when counting. If nothing is left after
    ignoring them (or `x` is empty), the empty string is returned.
    Args:
        x: List[str]
    Returns:
        str
    """
    candidates = [item for item in x if item != ""] or [""]
    tally = collections.Counter(candidates)
    winner, _count = tally.most_common()[0]
    return winner

In [None]:
def test_fn_pipeline():
    """
    Answer every row of the global `test` frame with the global `model`,
    which is called as `model(question=..., context=...)` and must return a
    mapping with an "answer" entry.

    The context is read from tuple position 2 and the question from position 3
    of each `itertuples()` row (position 0 is the index label).
    Returns:
        test: pandas.DataFrame -- the same global frame with "PredictionString" filled in
    """
    test["PredictionString"] = ""
    for record in tqdm(test.itertuples(), total=len(test)):
        row_label, context, question = record[0], record[2], record[3]
        answer = model(question=question, context=context)["answer"]
        test.loc[row_label, "PredictionString"] = answer

    return test

In [None]:
def test_fn_naive(test, verbose=False):
    """
    Predict one answer span for every row of `test` with the manual
    (non-pipeline) model and store it in the "PredictionString" column.

    Uses the notebook globals `test_dataloader`, `model`, `tokenizer` and
    `device`. Row k of `test` must describe item k of
    `test_dataloader.dataset`; predictions are matched to rows by position.
    ------------------------------------------------------------------------
    Args:
        test: pandas.DataFrame -- one row per encoded (context, question) pair
        verbose: bool -- print tensor shapes and decoded answers for each batch
    Returns:
        test: pandas.DataFrame -- the same frame with "PredictionString" set
    Raises:
        ValueError: if the number of predictions differs from len(test)
    """
    # Rows are matched by position, so the examples must be visited in dataset
    # order. Rebuild the loader when the global one uses a non-sequential
    # (e.g. shuffling) sampler.
    loader = test_dataloader
    if not isinstance(loader.sampler, torch.utils.data.SequentialSampler):
        loader = DataLoader(
            loader.dataset,
            batch_size=loader.batch_size,
            shuffle=False,
            collate_fn=loader.collate_fn,
        )

    # Forward every tokenizer output the model understands. The attention mask
    # matters because the encodings are padded.
    model_input_keys = ("input_ids", "attention_mask", "token_type_ids")

    predictions = []
    for batch in tqdm(loader, total=len(loader)):
        inputs = {key: batch[key].to(device) for key in model_input_keys if key in batch}
        with torch.no_grad():
            outputs = model(**inputs)

        input_ids = inputs["input_ids"]
        # Most likely first / last token of the answer for each example.
        answer_start = torch.argmax(outputs.start_logits, dim=-1)
        answer_end = torch.argmax(outputs.end_logits, dim=-1)

        if verbose:
            print("input_ids.shape: ", input_ids.shape)
            print("answer_start.shape: ", answer_start.shape)
            print("answer_end.shape: ", answer_end.shape)
            print()

        # Iterate over the examples in the batch (axis 0), not over the keys
        # of the batch dict; this also covers a final, smaller batch.
        for j in range(input_ids.shape[0]):
            span_start = int(answer_start[j])
            # The predicted end index is inclusive, hence +1 for slicing.
            span_end = int(answer_end[j]) + 1
            span_ids = input_ids[j, span_start:span_end].tolist()
            # Skipping special tokens keeps a span that points at a special
            # token (e.g. the first position) decoding to "" instead of the
            # literal special-token text.
            span_tokens = tokenizer.convert_ids_to_tokens(span_ids, skip_special_tokens=True)
            pred_answer = tokenizer.convert_tokens_to_string(span_tokens)
            predictions.append(pred_answer)
            if verbose:
                print("pred_answer: ", pred_answer)

        if verbose:
            print()

    if len(predictions) != len(test):
        raise ValueError(
            f"got {len(predictions)} predictions for {len(test)} rows; "
            "`test` must have one row per item of the dataset"
        )

    test["PredictionString"] = predictions
    return test

In [None]:
# def test_fn(use_pipeline=False):
#     test["PredictionString"] = ""
#     tqdm_df_itertuples = tqdm(test.itertuples(), total=len(test))
#     for row in tqdm_df_itertuples:
#         i = row[0]
#         context = row[2]
#         question = row[3]
        
#         if use_pipeline:
#             output = model(question=question, context=context)
#             pred = output["answer"]
#         else:
#             inputs = tokenizer(question, 
#                                context, 
#                                add_special_tokens=True,
#                                max_length=512,
#                                padding=True, 
#                                truncation=True, 
#                                return_tensors="pt")
#             inputs.to(device)
#             input_ids = inputs["input_ids"].tolist()[0]
#             outputs = model(**inputs)
#             answer_start_scores = outputs.start_logits
#             answer_end_scores = outputs.end_logits

#             # Get the most likely beginning of answer with the argmax of the score
#             answer_start = torch.argmax(answer_start_scores)
#             # Get the most likely end of answer with the argmax of the score
#             answer_end = torch.argmax(answer_end_scores) + 1

#             pred = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

#         test.loc[i, "PredictionString"] = pred
        
#     return test

In [None]:
#### Functions for naive inference

# Text cleansing for context

# Building blocks for split_into_sentences. The plain strings are concatenated
# into larger patterns; the compiled ones are applied directly. Raw strings are
# used wherever a regex escape such as \s appears, so Python does not treat it
# as an (invalid) string escape.
#alphabets = "([A-Za-z])"
alphabets = "([\u0900-\u097F\u0B80-\u0BFF])" # Hindi & Tamil
prefixes = re.compile("(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]")
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = re.compile("[.](co|net|org|io|gov|edu|us)")
etal = re.compile(r"(\bet al)[.]")
urls = re.compile("(www)[.]")
digits =  re.compile("[.]([0-9])")

def split_into_sentences(text):
    """
    This function will be applied to context in df
    ----------------------------------------------
    Split `text` after every ".", "?" and "!" that ends a sentence. Periods
    that belong to titles, suffixes, initials, web addresses, "et al." or
    decimal numbers are protected with a temporary "<prd>" marker first.

    Whitespace-only pieces are dropped. If nothing is left (blank input), a
    single empty string is returned so that callers creating one row per
    sentence still get one row.

    NOTE(review): the Devanagari danda is not treated as a sentence
    terminator here -- confirm whether that is intended for Hindi text.

    Args:
        text: str
    Returns:
        sentences: List[str]
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that do not end a sentence.
    text = prefixes.sub(r"\1<prd>", text)
    text = websites.sub(r"<prd>\1", text)
    text = urls.sub(r"\1<prd>", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    text = etal.sub(r"\1<prd>", text)
    text = digits.sub(r"<prd>\1", text)

    # Move terminal punctuation outside a closing quote so the quote stays
    # attached to its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Strip before filtering: the padding added above leaves a whitespace-only
    # tail, which would otherwise survive as an empty sentence.
    sentences = [piece.strip() for piece in text.split("<stop>")]
    sentences = [piece for piece in sentences if piece]
    return sentences or [""]



def clean_text(txt, apply_space_norm=False, strip=False):
    """
    This function will be applied to question and answer in df
    ----------------------------------------------------------
    Remove bracketed numeric citation markers such as "[3]" or "[12]"
    (one to three digits) and optionally normalise the text.

    Args: 
        txt: str
        apply_space_norm: bool --default False
            lower-case the text and replace every run of characters other
            than ASCII letters, digits, Devanagari (Hindi) and Tamil
            characters with a single space
        strip: bool --default False
            remove leading and trailing whitespace
    Returns:
        text: str
    """
    # One to three digits covers multi-digit citation numbers while leaving
    # longer bracketed numbers (e.g. four-digit years) untouched.
    txt = re.sub(r'\[[0-9]{1,3}\]', '', txt)
    if apply_space_norm:
        # Hindi: \u0900-\u097F, Tamil: \u0B80-\u0BFF
        txt = re.sub('[^A-Za-z\u0900-\u097F\u0B80-\u0BFF0-9]+', ' ', txt.lower())
    if strip:
        txt = txt.strip()
    return txt



def shorten_sentences(sentences, max_length=512, overlap=60):
    """
    Clean every sentence and break the ones longer than `max_length` words
    into chunks of at most `max_length` words, where consecutive chunks share
    `overlap` words. Lengths are counted in whitespace-separated words.

    e.g. if the sentence has 50 words, max_length is 20, and overlap is 10,
    the chunks cover the words [0,20), [10,30), [20,40) and [30,50).
    Chunking stops with the first chunk that reaches the end of the sentence,
    so no chunk is entirely contained in the previous one.
    ------------------------------------------------------------------------
    Args:
        sentences: List[str]
        max_length: int --default 512
        overlap: int --default 60
    Returns:
        shortened_sentences: List[str]
    Raises:
        ValueError: if max_length is not positive or overlap is not in
            [0, max_length)
    """
    # A non-positive stride would either make range() fail or skip the
    # sentence entirely, so reject such settings up front.
    if max_length <= 0 or not 0 <= overlap < max_length:
        raise ValueError(
            f"need max_length > 0 and 0 <= overlap < max_length, "
            f"got max_length={max_length}, overlap={overlap}"
        )
    stride = max_length - overlap

    shortened_sentences = []
    for sentence in sentences:
        sentence = clean_text(sentence, apply_space_norm=True)
        words = sentence.split()
        num_words = len(words)
        if num_words <= max_length:
            shortened_sentences.append(sentence)
            continue
        for start_index in range(0, num_words, stride):
            shortened_sentences.append(' '.join(words[start_index:start_index + max_length]))
            # This chunk already contains the last word; any further chunk
            # would only repeat its tail.
            if start_index + max_length >= num_words:
                break
    return shortened_sentences

# references:
# [1] https://stackoverflow.com/questions/41356013/how-to-detect-if-a-string-contains-hindi-devnagri-in-it-with-character-and-wor
# [2] https://en.wikipedia.org/wiki/Unicode_block
# [3] https://www.kaggle.com/nbroad/no-training-question-answering-model?scriptVersionId=66240356




def read_chaii(df):
    """
    Flatten `df` into parallel lists with one entry per context sentence.

    Each row contributes one entry per element of its "context_sentences"
    value; the row's "question" and "id" are repeated for every such entry.
    Args:
        df: pd.DataFrame -- needs the columns "context_sentences", "question" and "id"
    Returns:
        contexts: List[str]
        questions: List[str]
        ids: List -- the "id" value of the row each context came from
    """
    contexts, questions, ids = [], [], []
    for _, record in df.iterrows():
        chunks = list(record["context_sentences"])
        question = record["question"]
        example_id = record["id"]

        contexts.extend(chunks)
        questions.extend([question] * len(chunks))
        ids.extend([example_id] * len(chunks))

    return contexts, questions, ids



class ChaiiDataset(Dataset):
    """Map-style dataset that serves tokenizer encodings one example at a time."""

    def __init__(self, encodings):
        # `encodings` must offer `.items()` (field name -> per-example values)
        # and an `.input_ids` attribute; both are used below.
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of `encodings.input_ids`."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field for example `idx`."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

In [None]:
if USE_PIPELINE:
    test = test_fn_pipeline()
else:
    # Turn every context into a list of cleaned sentence chunks and normalise the question.
    test["context_sentences"] = test["context"].apply(clean_text).apply(split_into_sentences).apply(shorten_sentences)
    test["question"] = test["question"].apply(clean_text, apply_space_norm=True, strip=True)

    # One (context chunk, question) pair per entry, with the id of the source row.
    test_contexts, test_questions, test_ids = read_chaii(test)
    test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)
    test_dataset = ChaiiDataset(test_encodings)
    # Keep the dataset order: predictions are written back to the rows of the
    # frame below by position, so shuffling would attach answers to the wrong rows.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    model.eval()
    # From here on `test` holds one row per chunk, in the same order as the dataset.
    test = pd.DataFrame({"id": test_ids, "context": test_contexts, "question": test_questions})
    test = test_fn_naive(test, verbose=False)

In [None]:
# Show the last 20 rows of the prediction frame.
display(test.tail(n=20))

In [None]:
# Drop rows with missing values, then keep one answer per id: the most
# frequent non-empty prediction among that id's rows (see custom_mode).
test = (
    test.dropna()
    .groupby("id")["PredictionString"]
    .agg(custom_mode)
    .to_frame()
    .reset_index()
)
display(test)

In [None]:
# Write the aggregated frame to the submission file, without the index column.
test.to_csv(path_or_buf="submission.csv", index=False)