# Question answering with a pretrained Hugging Face Transformers model

## Install Lib

In [None]:
# Install the Hugging Face transformers library (-q keeps pip's output quiet).
# NOTE(review): this needs Internet access, which the notes further down say
# is unavailable when submitting - confirm the package is already installed there.
!pip -q install transformers

In [None]:
import numpy as np
import pandas as pd
import os
import transformers
import torch
from tqdm.notebook import tqdm

## Read data csv

In [None]:
# Load the competition's training split and preview its first rows.
train_csv = '/kaggle/input/chaii-hindi-and-tamil-question-answering/train.csv'
df_train = pd.read_csv(train_csv)
df_train.head()

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,TrainingArguments, Trainer,default_data_collator

## Read Pretrained model
Normally the pretrained weights can be downloaded by model name (https://huggingface.co/models), e.g. **"deepset/xlm-roberta-large-squad2"**.  
However, Internet access is not available when submitting to this competition, so I load the model from a Kaggle dataset instead (https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering/discussion/266015) (https://www.kaggle.com/sauravmaheshkar/huggingface-question-answering-models).

In [None]:
# Local copy of the checkpoint (Kaggle dataset). With Internet access the hub
# id "deepset/xlm-roberta-large-squad2" could be used as pt_name instead.
pt_name = "../input/huggingface-question-answering-models/multilingual/xlm-roberta-large-squad2"

# Both objects are loaded from the same checkpoint directory.
model = AutoModelForQuestionAnswering.from_pretrained(pt_name)
tokenizer = AutoTokenizer.from_pretrained(pt_name)

In [None]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Move the model to the selected device; predict() sends its inputs there too.
model = model.to(device)

In [None]:
# Pull the question and context columns out as arrays for positional indexing.
question = df_train['question'].values
article = df_train['context'].values

## Tokenizer example

Example data:

In [None]:
# Show the first training row.
df_train.iloc[0]

Encode the data in the BERT QA input format:  
![](https://www.researchgate.net/profile/Hussein-Mozannar/publication/333773105/figure/fig2/AS:769535128903681@1560482880271/Architecture-of-our-open-domain-question-answering-system-SOQAL-BERT-illustration-is.ppm)

In [None]:
# Encode the first question together with its article as one model input.
first_question, first_article = question[0], article[0]
temp_inputs = tokenizer.encode_plus(first_question, first_article, add_special_tokens=True, return_tensors="pt")
temp_inputs

In [None]:
# Token ids of the encoded question/article pair.
temp_inputs['input_ids']

Decode the tokens back to text:

In [None]:
# Map the ids of the first (only) sequence back to tokens, then join them
# into a readable string.
first_ids = temp_inputs['input_ids'][0]
first_tokens = tokenizer.convert_ids_to_tokens(first_ids)
tokenizer.convert_tokens_to_string(first_tokens)

##  Prediction
I split each long article into lines ("\n") and use the answer from the line that returns the maximum score.  

TODO: rewrite this function; it is pretty slow because the batch size is 1.

In [None]:
def predict(question, context, max_chars=800):
    """Return the best answer span for ``question`` found in ``context``.

    The context is split on newline characters and every non-blank line is
    scored on its own by the module-level ``model``; inputs are built with
    the module-level ``tokenizer`` and moved to ``device``.  A line's score
    is the sum of its highest start logit and its highest end logit, and the
    answer decoded from the best-scoring line is returned.

    Args:
        question: Question text.
        context: Article text whose lines are separated by newlines.
        max_chars: Lines longer than this many characters are cut to this
            length before encoding.  ``None`` disables the cut.

    Returns:
        The decoded answer of the best-scoring line, or ``''`` when no line
        yields a usable span.
    """
    best_score = float('-inf')
    best_answer = ''
    for line in context.split('\n'):
        if not line.strip():
            # Blank line: there is no text an answer could come from.
            continue
        line = line[:max_chars]
        # Encode data
        inputs = tokenizer.encode_plus(question, line, add_special_tokens=True, return_tensors="pt").to(device)
        input_ids = inputs["input_ids"][0].tolist()
        # Prediction
        with torch.no_grad():
            pred = model(**inputs)
        start_logits = pred['start_logits'][0]
        end_logits = pred['end_logits'][0]
        start = int(torch.argmax(start_logits))
        end = int(torch.argmax(end_logits))
        if end < start:
            # Inverted span: it would decode to an empty string that could
            # still win on score, so the line is dropped.
            continue
        # Answer is start index to end index; special tokens (<s>, </s>,
        # <unk>, ...) are removed so they never appear in, or count as, an
        # answer.
        span_tokens = tokenizer.convert_ids_to_tokens(
            input_ids[start:end + 1], skip_special_tokens=True
        )
        answer = tokenizer.convert_tokens_to_string(span_tokens)
        if not answer.strip():
            # The span held only special tokens (e.g. the "no answer" <s>).
            continue
        # Plain float so the comparison does not involve tensors.
        score = float(start_logits.max() + end_logits.max())
        if score > best_score:
            best_score = score
            best_answer = answer
    return best_answer

In [None]:
# Answer every training question from its paired article.
model.eval()  # switch the model to evaluation mode before predicting
answers_list = [
    predict(q, ctx)
    for q, ctx in tqdm(zip(question, article), total=len(question))
]

### Exact Match score

In [None]:
# Fraction of predictions that equal the reference answer exactly.
exact_hits = df_train['answer_text'] == answers_list
sum(exact_hits)/len(answers_list)

### Jaccard score

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    in either string.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``.  When neither string contains a word the
        two are treated as identical and ``1.0`` is returned instead of
        dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union = a | b
    if not union:
        # Both inputs are empty or whitespace-only: avoid ZeroDivisionError.
        return 1.0
    return len(a & b) / len(union)

In [None]:
# Jaccard score of every (reference answer, prediction) pair.
jl = [
    jaccard(truth, guess)
    for truth, guess in zip(df_train['answer_text'], answers_list)
]

In [None]:
# Mean Jaccard score over all scored pairs.
sum(jl)/len(jl)

## Test data

In [None]:
# Load the competition's test split and preview its first rows.
test_csv = '/kaggle/input/chaii-hindi-and-tamil-question-answering/test.csv'
df_test = pd.read_csv(test_csv)
df_test.head()

In [None]:
# Number of rows in the test set.
len(df_test)

In [None]:
# Rebind the arrays to the test split's question and context columns.
question = df_test['question'].values
article = df_test['context'].values

In [None]:
# Answer every test question from its paired article.
model.eval()  # switch the model to evaluation mode before predicting
answers_list = [
    predict(q, ctx)
    for q, ctx in tqdm(zip(question, article), total=len(question))
]

In [None]:
# Store the predictions next to their rows in a new PredictionString column.
df_test["PredictionString"] = answers_list

In [None]:
# Display the test frame, including the PredictionString column.
df_test

In [None]:
# Keep only the id and prediction columns and write them without the index.
submission = df_test[["id","PredictionString"]]
submission.to_csv('submission.csv',index=False)