Approach summaries. The nitty-gritty details are in the corresponding versions.

- V1: Blind XLMR, followed by Rembert, followed by XLMR (squad1)
- V2. Quantile scoring approach
- V6: 3 Remberts and 2 XLMRs predicted; their top answers' cleaned text and question embedding difference are fed to an LGBM Ranker. If all models are useless according to LGBM (how to guess this? +/- values?), fall back to rembert sq2ep3 ft.
- V10: Top 3 answers from 3 models(5 fold Muril, Single rembert, 5 fold XLMR) are fed to an LGBM booster optimised for ndcg.
- V16: Top 3 answers from 3 models (5 fold Muril, Single rembert, 5 fold XLMR) are used to retrieve candidate texts using Stanza. These new question-goldensent pairs are fed to a BERT fine-tuned on tydi, then fine-tuned on chaii. (V17, V18 = individual language stats)
- V19-?: All three models ensemble by mapping offsets to characters and mean of softmax scores. Then retokenize using XLMR and assign mean of character scores to a given token and post process as usual. Few versions are variants of same approach.
- (This notebook): Rudimentary, my dear Watson!

Just get intersection of cleaned answers from all three models and use scaled scores to rank them and generate outputs.

In [None]:
# Replace `datasets` and `transformers` with the copies stored under ../input.
# --no-index together with --find-links makes pip resolve packages only from
# the given local directory instead of a package index.
# NOTE(review): fsspec is removed before `datasets` is installed, presumably to
# avoid a version conflict — confirm.
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links=../input/hf-datasets/wheels datasets -qq
!pip uninstall transformers -qq -y
!pip install --no-index --find-links=../input/transformers-latest-model transformers -qq

In [None]:
from datasets import Dataset
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import numpy as np, json
from hf_qa_utils import *
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator
# Batches are assembled with the stock Hugging Face collator.
data_collator = default_data_collator

# This notebook only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Use the GPU when one is visible; device_id is 0 for GPU and -1 for CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
device_id = 0 if has_cuda else -1

# Arguments shared by every Trainer built for prediction: "dummy" output
# directory, tensorboard reporting, and 256 examples per device per batch.
args = TrainingArguments(
    "dummy",
    report_to=["tensorboard"],
    per_device_eval_batch_size=256,
)

# Short keys naming the three model families used in the ensemble.
mkeys = ["muril", "rembert", "xlmr"]

In [None]:
# Read the competition test set once; keep it both as a pandas frame (used for
# the final submission) and as a `datasets.Dataset` (used for featurization).
test_csv_path = "../input/chaii-hindi-and-tamil-question-answering/test.csv"
test_df = pd.read_csv(test_csv_path)
test_ds = Dataset.from_pandas(test_df)

# Get top 5 candidates per model type

In [None]:
# Number of highest-scoring candidates kept per question.
topk = 5
# When True, candidate scores are min-max scaled before being returned.
SCALE_SCORES = True


def get_top_candidates(candidates):
    """Collapse per-question candidate answers into one scored table.

    Args:
        candidates: Mapping of question id to a list of candidate records.
            Each record is a dict that provides at least a "text" and a
            "score" entry. The records are not modified.

    Returns:
        A DataFrame with columns "id", "aclean" and "score": one row per
        distinct cleaned answer per question, carrying the best score among
        the top ``topk`` candidates that clean to that answer. When
        ``SCALE_SCORES`` is set, scores are min-max scaled over the whole
        table (all questions together). An empty input yields an empty frame
        with the same three columns.
    """
    # Copy each record while tagging it with its question id, so the caller's
    # dicts are left untouched.
    records = [
        {**rec, "id": qid}
        for qid, recs in candidates.items()
        for rec in sorted(recs, key=lambda r: r["score"], reverse=True)[:topk]
    ]
    if not records:
        # Nothing to rank: return a correctly shaped empty frame instead of
        # failing on the missing "score" column.
        return pd.DataFrame({
            "id": pd.Series(dtype="object"),
            "aclean": pd.Series(dtype="object"),
            "score": pd.Series(dtype="float64"),
        })

    preddf = pd.DataFrame(records)
    preddf["aclean"] = preddf["text"].apply(clean_answer)
    # Different raw spans can clean to the same answer; keep the best score.
    preddf = preddf.groupby(["id", "aclean"])["score"].max().reset_index()

    if SCALE_SCORES:
        mn, mx = preddf["score"].min(), preddf["score"].max()
        if mx > mn:
            preddf["score"] = (preddf["score"] - mn) / (mx - mn)
        else:
            # All scores identical: 0/0 would produce NaN, so use 0.0.
            preddf["score"] = 0.0

    return preddf


In [None]:
import gc

# Checkpoint directories, grouped by model family. Every checkpoint listed for
# a family contributes equally to that family's averaged prediction.
muril_ckpts = [
    "../input/muril-finetuning-indicx-on-squad2-epoch-2/muril_lg_indix_sq2ep1/checkpoint-3480",
    "../input/folds-1-and-2-muril-indicx-sq2ep2-finetuning/muril_lg_indix_sq2ep1_fold1/checkpoint-3463",
    "../input/folds-1-and-2-muril-indicx-sq2ep2-finetuning/muril_lg_indix_sq2ep1_fold2/checkpoint-3452",
    "../input/folds-3-and-4-muril-indicx-sq2ep2-finetuning/muril_lg_indix_sq2ep1_fold3/checkpoint-3485",
    "../input/folds-3-and-4-muril-indicx-sq2ep2-finetuning/muril_lg_indix_sq2ep1_fold4/checkpoint-3472",
]
rembert_ckpts = ["../input/rembert-finetuning-indicx-on-sq2-epoch3/rembert_indicx_over_squad2/checkpoint-1021"]
xlmr_ckpts = [f"../input/folds-consolidated-xlmr-qa-finetune-on-indix/fold{i}" for i in range(5)]

ckpt_meta = [("muril", muril_ckpts), ("rembert", rembert_ckpts), ("xlmr", xlmr_ckpts)]

# One (model_type, candidate DataFrame) pair is collected per model family.
pred_dfs = []
for model_type, ckpts in ckpt_meta:
    # The tokenizer of the first checkpoint featurizes the test set once for
    # the whole family.
    tokenizer = AutoTokenizer.from_pretrained(ckpts[0])
    tkwargs = {"tokenizer": tokenizer}
    ds_feats = test_ds.map(
        prepare_validation_features,
        batched=True,
        remove_columns=test_ds.column_names,
        fn_kwargs=tkwargs,
    )

    # Sum the start/end predictions over the checkpoints, then average.
    starts, ends = None, None
    for mname in ckpts:
        model = AutoModelForQuestionAnswering.from_pretrained(mname)
        trainer = Trainer(model, args, data_collator=data_collator, tokenizer=tokenizer)
        raw_vals = trainer.predict(ds_feats)
        if starts is None:
            starts, ends = raw_vals.predictions
        else:
            starts += raw_vals.predictions[0]
            ends += raw_vals.predictions[1]

        # Drop this checkpoint before the next one is loaded so that peak
        # memory stays at roughly one model instead of growing per checkpoint.
        del model, trainer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    starts /= len(ckpts)
    ends /= len(ckpts)

    # Make every feature column visible again before post-processing.
    # NOTE(review): presumably needed because predict() narrows the dataset
    # format to the model inputs — confirm.
    ds_feats.set_format(type=ds_feats.format["type"], columns=list(ds_feats.features.keys()))
    _, candidates = postprocess_qa_predictions(
        test_ds,
        ds_feats,
        (starts, ends),
        cls_token_id=tokenizer.cls_token_id,
        n_best_size=5,
        pp_cleanup=False,
        return_candidates=True,
    )
    cdf = get_top_candidates(candidates)
    pred_dfs.append((model_type, cdf))

# Generate best answer from all model types

In [None]:
# Line up every model family's candidates on (question id, cleaned answer).
# Each family's score lands in its own "score_<model_type>" column; the outer
# join keeps answers that only some families proposed. Works for any number
# of entries in pred_dfs.
score_frames = [
    cdf.rename(columns={"score": f"score_{model_type}"})
    for model_type, cdf in pred_dfs
]
highpred = score_frames[0]
for frame in score_frames[1:]:
    highpred = pd.merge(highpred, frame, on=["id", "aclean"], how="outer")

mkeys = [model_type for model_type, _ in pred_dfs]
for mkey in mkeys:
    col = f"score_{mkey}"
    # An answer a family did not propose gets that family's floor score:
    # 0 for scaled scores, otherwise one below the lowest observed score.
    minval = 0 if SCALE_SCORES else highpred[col].min() - 1
    # Assign the result back instead of a chained in-place fillna, which
    # pandas deprecates and which has no effect under Copy-on-Write.
    highpred[col] = highpred[col].fillna(minval)

# Total score of an answer = sum of its per-family scores.
tot_score = highpred[f"score_{mkeys[0]}"].copy()
for mkey in mkeys[1:]:
    tot_score = tot_score + highpred[f"score_{mkey}"]
highpred["tot_score"] = tot_score

# Keep only the highest-scoring answer for each question.
highpred = highpred.sort_values("tot_score", ascending=False)
highpred = highpred.groupby(["id"]).head(1).reset_index(drop=True)
highpred.rename(columns={"aclean": "PredictionString"}, inplace=True)

In [None]:
# Attach the chosen answer (and its scores) to each test row; questions with
# no candidate get an empty prediction.
test_df = pd.merge(test_df, highpred, on="id", how="left")
# Assign back rather than calling fillna(inplace=True) on a column selection,
# which pandas deprecates and which has no effect under Copy-on-Write.
test_df['PredictionString'] = test_df['PredictionString'].fillna('')

# Postprocessing

In [None]:
import re

# A run of exactly four digits. The lookarounds stop the pattern from matching
# inside a longer number (e.g. "12345" must not yield "1234").
year_ptrn = re.compile(r"(?<!\d)\d{4}(?!\d)")

# Era / year markers in Tamil and Hindi; an answer containing one of these is
# left as predicted.
time_prefixes = ["கி.மு", "கி.பி", " ई", "ई.पू", "वर्ष", "सन"]


def update_year_answer(pred_ans):
    """Reduce a predicted answer to its year when that is unambiguous.

    Args:
        pred_ans: Predicted answer text.

    Returns:
        The single four-digit number found in ``pred_ans`` when there is
        exactly one and no entry of ``time_prefixes`` occurs in the text;
        otherwise ``pred_ans`` unchanged. Non-string values are returned
        as they are.
    """
    if not isinstance(pred_ans, str):
        # Missing predictions (e.g. NaN) cannot be searched; pass them through.
        return pred_ans
    if any(tp in pred_ans for tp in time_prefixes):
        return pred_ans
    ypreds = year_ptrn.findall(pred_ans)
    if len(ypreds) != 1:
        return pred_ans
    return ypreds[0]

# Phrases (Tamil and Hindi) that mark a question as asking for a year.
years = ["எந்த ஆண்டு", "किस वर्ष", "किस साल"]
# na=False keeps the mask strictly boolean even if a question is missing.
is_ans_year = test_df["question"].str.contains("|".join(years), regex=True, na=False)
if is_ans_year.any():
    # Only answers to year questions are trimmed down to the year itself.
    test_df.loc[is_ans_year, "PredictionString"] = (
        test_df.loc[is_ans_year, "PredictionString"].apply(update_year_answer)
    )
# Assign back rather than calling fillna(inplace=True) on a column selection,
# which pandas deprecates and which has no effect under Copy-on-Write.
test_df['PredictionString'] = test_df['PredictionString'].fillna('')

In [None]:
# Disabled post-processing step, kept as a string so that it does not run.
# The code inside would replace a prediction made up only of the characters
# chr(2406)..chr(2415) with the matching ASCII digits when the converted text
# occurs in the row's context.
_ = """import unicodedata
hin = [chr(i) for i in range(2406, 2416)]
enn = [f"{i}" for i in range(10)]

is_pred_hin = test_df["PredictionString"].apply(lambda x: set(x)<=set(hin))
if is_pred_hin.any():
    test_df["trans"] = test_df["PredictionString"].copy()
    test_df.loc[is_pred_hin, "trans"] = test_df.loc[is_pred_hin, "trans"].apply(lambda txt: "".join([enn[hin.index(c)] for c in txt]))

    is_trans_in_context = test_df.apply(lambda row: row["trans"] in row["context"], axis=1)
    if (is_pred_hin&is_trans_in_context).any():
        test_df.loc[is_pred_hin&is_trans_in_context, "PredictionString"] = test_df.loc[is_pred_hin&is_trans_in_context, "trans"]
test_df['PredictionString'].fillna('', inplace=True)
"""

In [None]:
# Write the two columns the competition expects, without the index.
# Author's note: with muril for hin and xlmr for tam and without excessive pp,
# public lb=0.82
submission = test_df[['id', 'PredictionString']]
submission.to_csv('submission.csv', index=False)