In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tqdm import tqdm

In [None]:
# When True, the model cell loads the model, tokenizer and checkpoint from local
# "../input/..." datasets instead of referencing 'bert-base-multilingual-cased'
# by name (which presumably requires a download -- confirm).
DISABLE_INTERNET = True
# When True, the cells guarded by `if not INFERENCE:` (smoke test, training-set
# load, training-set scoring) are skipped; the test-set cells always run.
INFERENCE = True

In [None]:
# from transformers import pipeline
# if DISABLE_INTERNET:
#     model_path = "../input/localnb001-export-transformers"
#     model = pipeline('question-answering', model=model_path, tokenizer=model_path, device=0)
# else:
#     model = pipeline('question-answering', model='bert-base-multilingual-cased', device=0)

from transformers import pipeline, BertForQuestionAnswering, BertTokenizerFast
import torch
# Run the pipeline on the first GPU when CUDA is available; otherwise fall back
# to CPU (-1) so the notebook still works in a CPU-only session instead of
# failing on a hard-coded `device=0`.
pipeline_device = 0 if torch.cuda.is_available() else -1

if DISABLE_INTERNET:
    # Offline mode: build the model and tokenizer from a local export.
    model_path = "../input/localnb001-export-transformers"
    qa_model = BertForQuestionAnswering.from_pretrained(model_path)
    tokenizer = BertTokenizerFast.from_pretrained(model_path)

    # Replace the exported weights with the ones stored in the checkpoint.
    # Only the 'model_state_dict' entry is used; nothing else in the
    # checkpoint is read here.
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code -- only load checkpoints from a trusted source.
    output_model = "../input/localnb002-fine-tune/model.pth"
    checkpoint = torch.load(output_model, map_location='cpu')
    qa_model.load_state_dict(checkpoint['model_state_dict'])

    # `model` is the question-answering pipeline used by the rest of the notebook.
    model = pipeline("question-answering", model=qa_model, tokenizer=tokenizer, device=pipeline_device)
else:
    # Online mode: let transformers resolve the base multilingual BERT by name.
    model = pipeline('question-answering', model='bert-base-multilingual-cased', device=pipeline_device)

In [None]:
# Smoke test (skipped when INFERENCE is True): run the question-answering
# pipeline on a hand-written Japanese passage and question, then print the raw
# pipeline output.
if not INFERENCE:
    context = "日本軍の英領マレー半島に対する上陸作戦（マレー作戦）及び米国領ハワイ諸島への真珠湾攻撃で開戦。その後、北アメリカ大陸西海岸（米領アラスカや米本土西海岸）、オセアニアを含む太平洋、東南アジア、アフリカ東岸を含むインド洋に戦場が拡大。蒋介石率いる中華民国政府も日本へ正式に宣戦布告したほか（日中戦争）、末期のソ連対日参戦によりアジア大陸東部でも激しい戦いが繰り広げられた。"
    question = "ハワイ諸島の支配国は?"
    output = model(context=context, question=question)
    print(output)

In [None]:
# Load the labelled training set (skipped when INFERENCE is True).
if not INFERENCE:
    train = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/train.csv")
    # A bare `train.head()` inside an `if` block is not the cell's last
    # expression, so Jupyter renders nothing; display the preview explicitly.
    display(train.head())

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace. The score is the
    number of distinct tokens the two strings share divided by the number of
    distinct tokens appearing in either string.

    Args:
        str1: First string (e.g. the predicted answer).
        str2: Second string (e.g. the reference answer).

    Returns:
        A float in [0.0, 1.0]. When neither string contains any token the two
        token sets are identical, so 1.0 is returned instead of raising
        ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: avoid dividing by zero.
        return 1.0
    return float(len(a & b)) / union_size

In [None]:
# def train_fn():
#     train["PredictionString"] = ""
#     tqdm_df_iterrows = tqdm(train.iterrows(), total=len(train))
#     jcs = []
#     for i,row in tqdm_df_iterrows:
#         context = row["context"]
#         question = row["question"]
#         output = model(question=question, context=context)
#         pred = output["answer"]
#         train.loc[i, "PredictionString"] = pred

#         # Evaluation
#         actual = row["answer_text"] 
#         jcs.append(jaccard(pred, actual))
        
#     return train, np.mean(jcs)

In [None]:
def train_fn():
    """Predict an answer for every training row and score it against the label.

    Reads the module-level ``train`` DataFrame and the module-level ``model``
    question-answering pipeline. For each row the pipeline is called with the
    row's ``question`` and ``context``; the returned ``"answer"`` is compared
    with the row's ``answer_text`` using ``jaccard``.

    Columns are accessed by name rather than by tuple position, so the result
    does not depend on the column order of the CSV.

    Returns:
        A tuple ``(train, mean_score)`` where ``train`` is the same DataFrame
        object with a ``PredictionString`` column added (mutated in place) and
        ``mean_score`` is the mean Jaccard score over all rows (NaN when the
        frame has no rows).
    """
    predictions = []
    scores = []
    for row in tqdm(train.itertuples(), total=len(train)):
        output = model(question=row.question, context=row.context)
        pred = output["answer"]
        predictions.append(pred)

        # Evaluation against the labelled answer.
        scores.append(jaccard(pred, row.answer_text))

    # Assign the whole column once instead of one `.loc` write per row.
    train["PredictionString"] = predictions

    mean_score = np.mean(scores) if scores else np.nan
    return train, mean_score

In [None]:
# Evaluation run (skipped when INFERENCE is True): predict on the training set
# and keep the mean Jaccard score returned by train_fn().
if not INFERENCE:
    train, score = train_fn()

In [None]:
# Report the mean Jaccard score and show the last rows of the training frame
# (skipped when INFERENCE is True).
if not INFERENCE:
    print(score)
    display(train.tail())

In [None]:
# i = 2
# context = train.loc[i, "context"]
# question = train.loc[i, "question"]
# answer_text = train.loc[i, "answer_text"]
# answer_start = train.loc[i, "answer_start"]
# print(context)

In [None]:
# output = model(question=question, context=context)
# print(output)
# print(answer_text, answer_start)

reference: https://www.kaggle.com/nbroad/no-training-question-answering-model/data?scriptVersionId=66240356

# Inference

In [None]:
# Load the competition test set and preview its first rows.
test = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
test.head()

In [None]:
def test_fn():
    """Predict an answer for every row of the test set.

    Reads the module-level ``test`` DataFrame and the module-level ``model``
    question-answering pipeline. For each row the pipeline is called with the
    row's ``question`` and ``context`` and the returned ``"answer"`` is kept.

    Columns are accessed by name rather than by tuple position, so the result
    does not depend on the column order of the CSV.

    Returns:
        The same ``test`` DataFrame object with a ``PredictionString`` column
        added (mutated in place).
    """
    predictions = []
    for row in tqdm(test.itertuples(), total=len(test)):
        output = model(question=row.question, context=row.context)
        predictions.append(output["answer"])

    # Assign the whole column once instead of one `.loc` write per row.
    test["PredictionString"] = predictions
    return test

In [None]:
# Run inference on the test set; predictions land in the PredictionString column.
test = test_fn()

In [None]:
# Show the test frame together with its predictions.
display(test)

In [None]:
# Write the submission file with only the id and PredictionString columns
# (the DataFrame index is not written).
test[["id", "PredictionString"]].to_csv("submission.csv", index=False)