In [None]:
import glob
import os
import sys
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from transformers import AutoConfig, AutoTokenizer, AutoModelForQuestionAnswering
from typing import NamedTuple, Dict, List, Set, Callable, Optional
from tqdm import tqdm

In [None]:
# --- Run configuration -------------------------------------------------------
# Number of tokenized windows fed to the model per forward pass (DataLoader batch size).
batch_size = 128
# Passed to mylib.candidates as `answer_max_tokens`.
answer_max_tokens = 14
# Passed to mylib.candidates as `top`.
top = 20
# Passed to mylib.candidates as `threshold_factor`.
threshold_factor = 0.1
# Passed to the tokenizer as `stride` when long inputs overflow into extra windows.
stride = 64
# Passed to AutoTokenizer.from_pretrained as `model_max_length`.
model_max_length = 512
# Copied onto the model config before the model is instantiated.
gradient_checkpointing = False
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pandas options are addressed by their fully-qualified names: abbreviated names
# rely on pattern matching and break as soon as another option matches the pattern.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases — confirm
# it is still needed before upgrading pandas.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register `progress_apply` on pandas objects and fix the random seed.
tqdm.pandas()
pl.seed_everything(31)

In [None]:
# Choose the compute device: CUDA when at least one GPU is usable, CPU otherwise.
save_function = torch.save
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
if use_cuda:
    bytes_per_gb = 1024**3
    # Report every visible GPU together with its allocated / reserved memory.
    for gpu_id in range(torch.cuda.device_count()):
        print(f"{gpu_id}: {torch.cuda.get_device_name(gpu_id)}")
        allocated_gb = round(torch.cuda.memory_allocated(gpu_id)/bytes_per_gb, 1)
        print('Memory Allocated:\t', allocated_gb, 'GB')
        reserved_gb = round(torch.cuda.memory_reserved(gpu_id)/bytes_per_gb, 1)
        print('Memory Cached:\t\t', reserved_gb, 'GB')
print(f"device={device}")

In [None]:
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Root directories: attached inputs, competition data, scratch space and persisted output.
INPUT = '/kaggle/input/'
DATA = f"{INPUT}chaii-hindi-and-tamil-question-answering/"
TEMP = '/kaggle/temp/'
OUTPUT = '/kaggle/working/'
# Attached datasets holding the project library/checkpoints and the pretrained models.
RESOURCE_DIR = f'{INPUT}d/ruhong/chaii-lib/kaggle-chaii-hindi-and-tamil-qa-1.0/'
PRETRAINED_DIR = f"{INPUT}pretrained/pretrained/"
# Key into `tokenizer_dirs` / `model_dirs` selecting the model used by this run.
model_name = "xlm_roberta_large_10"
#model_name = "deepset_xlmr_squad2"
# Directory the tokenizer is loaded from, per supported model name.
tokenizer_dirs = {
    "xlm_roberta": f"{PRETRAINED_DIR}xlm-roberta-base",
    "xlm_roberta_large_05": f"{PRETRAINED_DIR}xlm-roberta-large",
    "xlm_roberta_large_08": f"{PRETRAINED_DIR}xlm-roberta-large",
    "xlm_roberta_large_10": f"{PRETRAINED_DIR}xlm-roberta-large",
    "deepset_xlmr_squad2": f"{PRETRAINED_DIR}deepset/xlm-roberta-base-squad2",
}
# Directory the model config and weights are loaded from, per supported model name.
model_dirs = {
    "xlm_roberta": f"{RESOURCE_DIR}models/xlm_roberta/20211029_082207/lightning_logs/version_0/checkpoints",
    "xlm_roberta_large_05": f"{RESOURCE_DIR}models/xlm_roberta_large/20211105_085009/trial_0/fold_0/lightning_logs/version_0/checkpoints",
    "xlm_roberta_large_08": f"{RESOURCE_DIR}models/xlm_roberta_large/20211108_180312/lightning_logs/version_0/checkpoints",
    "xlm_roberta_large_10": f"{RESOURCE_DIR}models/xlm_roberta_large/20211110_082218/lightning_logs/version_0/checkpoints",
    "deepset_xlmr_squad2": f"{PRETRAINED_DIR}deepset/xlm-roberta-base-squad2",
}
# Extend the import path before importing the project libraries below.
# NOTE(review): presumably `mylib` and `scml` live under these two directories, which is
# why the imports sit here rather than in the top import cell — confirm.
sys.path.append(f'{INPUT}sgcharts-ml/src')
sys.path.append(f'{RESOURCE_DIR}src')
import mylib
import scml
from scml import nlp as snlp

In [None]:
# Load the tokenizer registered for the selected model, capped at model_max_length tokens.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_dirs[model_name],
    model_max_length=model_max_length,
)
input_keys = ["labels", *tokenizer.model_input_names]
# The padding side decides later which sequence comes first and which one is truncated.
is_right_padding = "right" == tokenizer.padding_side
print(f"{repr(tokenizer)}\ninput_keys={input_keys}")

In [None]:
# Instantiate the question-answering model from the directory selected by model_name.
checkpoint_dir = model_dirs[model_name]
config = AutoConfig.from_pretrained(checkpoint_dir)
config.gradient_checkpointing = gradient_checkpointing
model = AutoModelForQuestionAnswering.from_pretrained(
    checkpoint_dir,
    config=config,
)
print(repr(model.config))

In [None]:
def preprocess(col) -> Callable:
    """Build a row-level cleaner bound to the column ``col``.

    Args:
        col: key used to look up the text inside each row.

    Returns:
        A callable that takes a row (anything indexable by ``col``) and returns
        ``mylib.preprocess`` applied to that row's ``col`` value.
    """
    return lambda row: mylib.preprocess(row[col])

In [None]:
# Read the competition test set and print a summary of its columns.
test_csv = f"{DATA}test.csv"
test = pd.read_csv(test_csv)
test.info()

In [None]:
# Clean the text columns in place with mylib.preprocess.
# The function is applied column-wise (Series.progress_apply) instead of row-wise
# (DataFrame.progress_apply with axis=1): each value gets the same treatment, but
# pandas no longer has to build a Series for every row just to read one cell.
cols = ["question", "context"]
for col in cols:
    print(f"Preprocess {col}...")
    test[col] = test[col].progress_apply(mylib.preprocess)
test.head()

In [None]:
# Decide the (first, second) sequence order and which of the two may be truncated.
# In both layouts it is the context, never the question, that gets truncated.
questions = test["question"].tolist()
contexts = test["context"].tolist()
if is_right_padding:
    s1, s2, truncation = questions, contexts, "only_second"
else:
    s1, s2, truncation = contexts, questions, "only_first"

In [None]:
%%time
# Encode every (s1, s2) pair. Overflowing tokens are returned as additional
# entries (using `stride`), and every entry is padded with padding="max_length".
encode_options = dict(
    truncation=truncation,
    padding="max_length",
    stride=stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=False,
    return_special_tokens_mask=True,
)
x = tokenizer(s1, s2, **encode_options)
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
# Both entries below are popped out of `x` so that only the remaining keys
# are passed on to the dataset built from `x`.
# all only supports torch.uint8 and torch.bool dtypes
special_tokens_mask = torch.tensor(x.pop("special_tokens_mask"), dtype=torch.uint8)
# For each encoded entry, the index of the input pair it was produced from.
overflow_to_sample_mapping = x.pop("overflow_to_sample_mapping")
print(f"len={len(overflow_to_sample_mapping)}, overflow_to_sample_mapping={repr(overflow_to_sample_mapping)}")

# Inference

In [None]:
# Run the model over every tokenized window and collect the start/end logits.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=batch_size, shuffle=False)
model.eval()
model.to(device)
# Per-batch logits are gathered in lists and concatenated once at the end;
# concatenating onto a growing tensor inside the loop re-copies all previous
# results on every batch.
start_chunks = []
end_chunks = []
with torch.no_grad():
    for batch in tqdm(batches):
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        start_chunks.append(outputs.start_logits)
        end_chunks.append(outputs.end_logits)
    # Row i of each tensor belongs to the i-th tokenized window (shuffle=False).
    start_logits = torch.cat(start_chunks, 0)
    end_logits = torch.cat(end_chunks, 0)
print(f"start_logits={start_logits.size()}, end_logits={end_logits.size()}")

In [None]:
# For every question keep the best-scoring candidate span over all of its windows.
input_ids = x["input_ids"]
# -inf (rather than an arbitrary low number) guarantees that the first candidate
# found for a question is always accepted, whatever the scale of the scores.
scores = [float("-inf")] * len(questions)
# Questions for which no window yields a candidate keep an empty answer.
answers = [""] * len(questions)
for i in tqdm(range(len(start_logits))):
    # Index of the question that window i was produced from.
    q = overflow_to_sample_mapping[i]
    cs = mylib.candidates(
        start_logits=start_logits[i],
        end_logits=end_logits[i],
        region=mylib.context_region(
            special_tokens_mask[i].tolist(),
            is_right_padding=is_right_padding,
        ),
        answer_max_tokens=answer_max_tokens,
        threshold_factor=threshold_factor,
        top=top,
    )
    if len(cs) == 0:
        continue
    # NOTE(review): only the first candidate is considered, which assumes
    # mylib.candidates returns them best-first — confirm.
    c = cs[0]
    if c.score > scores[q]:
        scores[q] = c.score
        # NOTE(review): c.end is used as an exclusive slice bound — confirm
        # against the convention of mylib.candidates.
        _ids = input_ids[i][c.start:c.end]
        tokens = tokenizer.convert_ids_to_tokens(_ids)
        text = tokenizer.convert_tokens_to_string(tokens)
        #text = mylib.postprocess(text)
        answers[q] = text

In [None]:
#print(scores)
#print(end_logits[0])

# Submission

In [None]:
# Start from the sample submission layout and overwrite both of its columns
# with the test ids and the predicted answer strings.
sub = pd.read_csv(
    f"{DATA}sample_submission.csv",
    engine="c",
    low_memory=False,
).assign(
    id=test["id"],
    PredictionString=answers,
)

In [None]:
# Sanity check: print a summary of the submission frame before writing it.
sub.info()

In [None]:
# Preview the first rows of the submission.
sub.head()

In [None]:
# Write the submission file into the current working directory, without the index column.
sub.to_csv(
    "submission.csv",
    index=False,
)

# Debug

In [None]:
#!pip list