In [None]:
# When True, the question-answering model is loaded from a local dataset path
# instead of being referenced by model name.
DISABLE_INTERNET = True
# When True, the demo cells and the train-set evaluation cells are skipped and
# only the test-set inference cells do any work.
INFERENCE_ONLY = True

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
from transformers import pipeline
# Pick the checkpoint source first, then build a single question-answering
# pipeline from it.
if not DISABLE_INTERNET:
    # Checkpoint referenced by name.
    # Alternative tried: 'bert-base-cased'
    qa_source = {"model": 'bert-base-multilingual-cased'}
else:
    # Checkpoint read from a local dataset directory; the same path supplies
    # both the model weights and the tokenizer.
    # Alternative tried: "../input/localnb001-export-transformers"
    model_path = "../input/indicbert/indic-bert-v1"
    qa_source = {"model": model_path, "tokenizer": model_path}
model = pipeline('question-answering', device=0, **qa_source)
#!pip install fugashi
#!pip install unidic-lite
#model = pipeline('question-answering', model='cl-tohoku/bert-base-japanese-v2')

In [None]:
if not INFERENCE_ONLY:
    # Sanity check: run the pipeline on a Japanese question/passage pair
    # and print the raw pipeline output.
    question = "開戦時ハワイ諸島の支配国は?"
    context = "日本軍の英領マレー半島に対する上陸作戦（マレー作戦）及び米国領ハワイ諸島への真珠湾攻撃で開戦。その後、北アメリカ大陸西海岸（米領アラスカや米本土西海岸）、オセアニアを含む太平洋、東南アジア、アフリカ東岸を含むインド洋に戦場が拡大。蒋介石率いる中華民国政府も日本へ正式に宣戦布告したほか（日中戦争）、末期のソ連対日参戦によりアジア大陸東部でも激しい戦いが繰り広げられた。"
    output = model(question=question, context=context)
    print(output)

In [None]:
if not INFERENCE_ONLY:
    # Sanity check: run the pipeline on an English question/passage pair
    # and print the raw pipeline output.
    question = "By which country was Hawaii dominated at the begining of the Pacific War?"
    context = "The Second Sino-Japanese War between the Empire of Japan and the Republic of China had been in progress since 7 July 1937, with hostilities dating back as far as 19 September 1931 with the Japanese invasion of Manchuria. However, it is more widely accepted that the Pacific War itself began on 7 December (8 December Japanese time) 1941, when the Japanese invaded Thailand and attacked the British colonies of Malaya, Singapore, and Hong Kong as well as the United States military and naval bases in Hawaii, Wake Island, Guam, and the Philippines."
    output = model(question=question, context=context)
    print(output)

In [None]:
if not INFERENCE_ONLY:
    # Read the training CSV and preview its first rows.
    train = pd.read_csv(
        "../input/chaii-hindi-and-tamil-question-answering/train.csv"
    )
    train_preview = train.head()
    display(train_preview)

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace. The score is the
    number of unique tokens shared by both strings divided by the number of
    unique tokens appearing in either string.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: A value between 0.0 and 1.0. When neither string contains any
        token, the two empty token sets are identical, so 1.0 is returned
        instead of raising ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only; the plain formula would
        # divide by zero here.
        return 1.0
    return len(a & b) / union_size

In [None]:
def train_fn():
    """Predict an answer for every row of the global ``train`` frame and score it.

    Each row's question and context (read by position from the row tuple) are
    passed to the global ``model``. The returned answer is written into the
    ``PredictionString`` column of ``train`` in place and compared against
    the row's reference answer with ``jaccard``.

    Returns:
        tuple: ``(train, mean_score)`` - the mutated frame and the mean of the
        per-row Jaccard scores.
    """
    train["PredictionString"] = ""
    scores = []
    for record in tqdm(train.itertuples(), total=len(train)):
        # Position 0 of an itertuples() row is the index label; positions 2
        # and 3 are used as the context and the question.
        label, passage, query = record[0], record[2], record[3]
        answer = model(question=query, context=passage)["answer"]
        train.loc[label, "PredictionString"] = answer

        # Evaluation: position 4 is used as the reference answer.
        scores.append(jaccard(answer, record[4]))

    return train, np.mean(scores)

In [None]:
# Outside inference-only mode, predict on the training frame and keep both the
# updated frame and its mean Jaccard score.
if not INFERENCE_ONLY:
    train, score = train_fn()

In [None]:
# Outside inference-only mode, report the mean Jaccard score and show the last
# rows of the training frame with their predictions.
if not INFERENCE_ONLY:
    print(score)
    display(train.tail())

reference: https://www.kaggle.com/nbroad/no-training-question-answering-model/data?scriptVersionId=66240356

# Inference

In [None]:
# Read the test CSV and preview its first rows.
test = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
test.head()

In [None]:
def test_fn():
    """Predict an answer for every row of the global ``test`` frame.

    Each row's question and context (read by position from the row tuple) are
    passed to the global ``model``, and the returned answer is written into
    the ``PredictionString`` column of ``test`` in place.

    Returns:
        The same ``test`` frame, with ``PredictionString`` filled in.
    """
    test["PredictionString"] = ""
    for record in tqdm(test.itertuples(), total=len(test)):
        # Position 0 of an itertuples() row is the index label; positions 2
        # and 3 are used as the context and the question.
        label, passage, query = record[0], record[2], record[3]
        answer = model(question=query, context=passage)["answer"]
        test.loc[label, "PredictionString"] = answer

    return test

In [None]:
# Fill the PredictionString column of the test frame with the model's answers.
test = test_fn()

In [None]:
# Show the test frame, which now includes the PredictionString column.
display(test)

In [None]:
# Write only the id and PredictionString columns to submission.csv, omitting
# the DataFrame index.
test[["id", "PredictionString"]].to_csv("submission.csv", index=False)