In [None]:
!pip install -U --no-build-isolation --no-deps ../input/transformers-master/ -qq

In [None]:
import sys
sys.path.append("../input/tez-lib/")
import collections
import numpy as np
import transformers
import pandas as pd
from datasets import Dataset
from functools import partial
from tqdm import tqdm
import json
import torch

from sklearn import metrics
import transformers
import torch
import torch.nn as nn
import numpy as np
import tez
from string import punctuation

In [None]:
class ChaiiModel(tez.Model):
    """`tez.Model` wrapper around a Hugging Face question-answering model.

    The wrapped network is stored on `self.transformer`; the remaining
    constructor arguments are only recorded as attributes on the instance.
    """

    def __init__(self, model_name, num_train_steps, steps_per_epoch, learning_rate):
        """Load the config and QA weights found at `model_name`.

        Args:
            model_name: Identifier or path handed to `AutoConfig.from_pretrained`
                and `AutoModelForQuestionAnswering.from_pretrained`.
            num_train_steps: Stored on the instance; not used in this class.
            steps_per_epoch: Stored on the instance; not used in this class.
            learning_rate: Stored on the instance; not used in this class.
        """
        super().__init__()
        self.model_name = model_name
        self.num_train_steps = num_train_steps
        self.steps_per_epoch = steps_per_epoch
        self.learning_rate = learning_rate
        self.step_scheduler_after = "batch"

        # Settings layered on top of the stored config before the model is built.
        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "add_pooling_layer": False,
        }
        qa_config = transformers.AutoConfig.from_pretrained(model_name)
        qa_config.update(config_overrides)

        self.transformer = transformers.AutoModelForQuestionAnswering.from_pretrained(
            model_name, config=qa_config
        )

    def forward(self, ids, mask, token_type_ids=None, start_positions=None, end_positions=None):
        """Run the QA model and return its start/end logits.

        Args:
            ids: Passed to the wrapped model as its first positional argument.
            mask: Passed to the wrapped model as its second positional argument.
            token_type_ids: Accepted but ignored.
            start_positions: Accepted but ignored.
            end_positions: Accepted but ignored.

        Returns:
            `((start_logits, end_logits), 0, {})`. The constant `0` and empty dict
            are presumably the loss and metrics slots tez expects -- confirm
            against the tez version in use.
        """
        qa_output = self.transformer(ids, mask)
        start_logits, end_logits = (
            logits.squeeze(-1).contiguous()
            for logits in (qa_output.start_logits, qa_output.end_logits)
        )
        return (start_logits, end_logits), 0, {}

In [None]:
class ChaiiDataset:
    """Index-addressable view that turns tokenized rows into model inputs.

    `data` must support `len()` and integer indexing, and each row must expose
    the keys "input_ids" and "attention_mask".
    """

    # Output key -> key read from the underlying row.
    _FIELDS = (("ids", "input_ids"), ("mask", "attention_mask"))

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self, item):
        """Return `{"ids": ..., "mask": ...}` as `torch.long` tensors for row `item`."""
        row = self.data[item]
        return {
            out_key: torch.tensor(row[src_key], dtype=torch.long)
            for out_key, src_key in self._FIELDS
        }

In [None]:
def prepare_validation_features(examples, tokenizer, pad_on_right, max_length, doc_stride):
    """Tokenize a batch of question/context pairs into fixed-length features.

    The tokenizer is called with `return_overflowing_tokens=True`, so a single
    example can yield several features; each feature is tagged with the id of
    the example it came from.

    Args:
        examples: Batch mapping with "id", "question" and "context" columns
            (lists of equal length). It is not modified.
        tokenizer: Callable tokenizer whose result supports `.pop`, item
            access/assignment and `.sequence_ids(i)`.
        pad_on_right: If true the question is the first sequence and the
            context the second; otherwise the order is swapped.
        max_length: Forwarded to the tokenizer as `max_length`.
        doc_stride: Forwarded to the tokenizer as `stride`.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed, an added
        "example_id" list (one id per feature), and "offset_mapping" entries
        replaced by `None` for every token that is not part of the context.
    """
    # Strip leading whitespace on a local copy rather than overwriting
    # examples["question"], so the caller's batch is left untouched.
    questions = [q.lstrip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_examples = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example (within this batch) it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Sequence id that marks context tokens; it is the same for every feature.
    context_index = 1 if pad_on_right else 0

    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        tokenized_examples["example_id"].append(examples["id"][sample_mapping[i]])

        # Keep offsets only for context tokens so post-processing can tell
        # whether a token position is a legal answer boundary.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [None]:
def postprocess_qa_predictions(
    examples, tokenizer, features, raw_predictions, n_best_size=20, max_answer_length=30, squad_v2=False
):
    """Turn per-feature start/end logits into one answer string per example.

    For every feature of an example, the `n_best_size` highest start and end
    positions are paired; a pair is kept when both positions carry an offset
    (i.e. are not `None` in "offset_mapping"), the end is not before the start
    and the span covers at most `max_answer_length` tokens. Each kept span is
    scored as `start_logit + end_logit`.

    Args:
        examples: Collection supporting `examples["id"]`, `len()` and iteration
            over rows that have "id" and "context".
        tokenizer: Only `tokenizer.cls_token_id` is read, and only when
            `squad_v2` is true.
        features: Indexable/iterable collection of rows with "example_id",
            "offset_mapping" and "input_ids".
        raw_predictions: Pair `(all_start_logits, all_end_logits)`, each
            indexable by feature position.
        n_best_size: Number of top start/end positions paired per feature, and
            the maximum number of candidates stored per example.
        max_answer_length: Maximum span length in tokens.
        squad_v2: If true, the empty string is predicted unless the best span
            scores higher than the lowest CLS ("no answer") score among the
            example's features.

    Returns:
        `(all_answers, predictions)`: `all_answers` is a list of
        `{"id": ..., "predictions": [...]}` holding up to `n_best_size`
        candidates sorted by descending score; `predictions` is an
        `OrderedDict` mapping example id to the chosen answer text.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature positions by the position of the example they belong to.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()
    all_answers = []

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        min_null_score = None  # Only tracked when squad_v2 is True.
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Fetch the row once; indexing a dataset-backed collection can be costly.
            feature = features[feature_index]
            offset_mapping = feature["offset_mapping"]

            if squad_v2:
                cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                # Keep the *smallest* null score seen across the example's features.
                if min_null_score is None or feature_null_score < min_null_score:
                    min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the feature or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip reversed spans and spans longer than allowed.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": float(start_logits[start_index] + end_logits[end_index]),
                            "text": context[start_char:end_char],
                        }
                    )

        # One stable sort serves both the best answer and the stored n-best list.
        valid_answers.sort(key=lambda x: x["score"], reverse=True)
        best_answer = valid_answers[0] if valid_answers else {"text": "", "score": 0.0}

        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # min_null_score stays None when the example has no features; the
            # fallback best_answer text is already "" in that case.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            predictions[example["id"]] = best_answer["text"] if beats_null else ""

        all_answers.append({"id": example["id"], "predictions": valid_answers[:n_best_size]})
    return all_answers, predictions

In [None]:
# Switch read by the later cells: when False, inference is skipped and a
# placeholder submission is written instead.
do_inference = True #len(test_data) != 5

# Context length (in characters) used by the disabled short/long split below.
char_threshold = 25_000

test_data = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
# Character count of every context.
test_data["len"] = list(map(len, test_data["context"]))

# All rows currently go through the "short" pipeline; the length-based split is disabled.
short_data = test_data.copy()
# short_data = test_data[test_data["len"]<char_threshold].reset_index(drop=True)
# long_data = test_data[test_data["len"]>=char_threshold].reset_index(drop=True)

In [None]:
if do_inference:
    tokenizer = transformers.AutoTokenizer.from_pretrained("../input/murilbasecased")

    pad_on_right = tokenizer.padding_side == "right"
    max_length = 1024
    doc_stride = 512

    # Use the GPU when one is present, otherwise fall back to CPU instead of crashing.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    test_dataset = Dataset.from_pandas(short_data)
    test_features = test_dataset.map(
        partial(
            prepare_validation_features,
            tokenizer=tokenizer,
            pad_on_right=pad_on_right,
            max_length=max_length,
            doc_stride=doc_stride
        ),
        batched=True,
        remove_columns=test_dataset.column_names
    )
    # Copy of the features without the columns ChaiiDataset does not read;
    # test_features itself keeps them for post-processing.
    test_feats_small = test_features.map(
        lambda example: example, remove_columns=['example_id', 'offset_mapping']
    )

    # Running sums of the per-fold logits (None until the first fold finishes).
    fin_start_logits = None
    fin_end_logits = None

    data_loader = torch.utils.data.DataLoader(
        ChaiiDataset(test_feats_small),
        batch_size=32,
        num_workers=4,
        pin_memory=(device == "cuda"),  # pinned host memory only helps CUDA transfers
        shuffle=False  # order must stay aligned with test_features
    )

    # The fold-0 directory supplies the config for every fold; only the weights differ.
    model_name = "../input/muril-large-bigbird-1k-6f/nbroad/1k-shuf-squad-chaii-6f0"
    # Six folds: the later averaging step divides by the same count.
    for fold in tqdm(range(6)):
        model = ChaiiModel(model_name=model_name, num_train_steps=0, steps_per_epoch=0, learning_rate=0)
        # map_location="cpu" lets checkpoints saved from a GPU load on any machine;
        # the model is moved to the target device right after.
        model.transformer.load_state_dict(
            torch.load(
                f"../input/muril-large-bigbird-1k-6f/nbroad/1k-shuf-squad-chaii-6f{fold}/pytorch_model.bin",
                map_location="cpu",
            )
        )
        model.to(device)
        model.eval()

        start_logits = []
        end_logits = []

        with torch.no_grad():
            for data in data_loader:
                data = {key: value.to(device) for key, value in data.items()}
                output, _, _ = model(**data)
                start_logits.append(output[0].detach().cpu().numpy())
                end_logits.append(output[1].detach().cpu().numpy())

        # Stack the per-batch arrays into one (num_features, ...) array per fold.
        start_logits = np.vstack(start_logits)
        end_logits = np.vstack(end_logits)

        if fin_start_logits is None:
            fin_start_logits = start_logits
            fin_end_logits = end_logits
        else:
            fin_start_logits += start_logits
            fin_end_logits += end_logits

#         to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))      
#         with open(f"top-preds-muril-large-f{fold}.json", "w") as fp:
#             json.dump(to_save, fp)

        # Release this fold's weights before the next fold is loaded.
        del model
        if device == "cuda":
            torch.cuda.empty_cache()

In [None]:
if do_inference:
    # Average into new arrays rather than dividing fin_*_logits in place:
    # an in-place `/=` would shrink the summed logits again on every re-run
    # of this cell.
    n_folds = 6  # must match the number of folds summed in the inference loop
    avg_start_logits = fin_start_logits / n_folds
    avg_end_logits = fin_end_logits / n_folds

    to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (avg_start_logits, avg_end_logits))
#     with open('muril-large-preds.json', "w") as fp:
#         json.dump(to_save, fp)

    # Attach each example's predicted answer by id.
    short_data["PredictionString"] = short_data["id"].map(fin_preds)

In [None]:
# if do_inference:
#     tokenizer = transformers.AutoTokenizer.from_pretrained("../input/bb-base-chaii")

In [None]:
# if do_inference:
#     pad_on_right = tokenizer.padding_side == "right"
#     max_length = 4096
#     doc_stride = 2048

#     test_dataset = Dataset.from_pandas(long_data)
#     test_features = test_dataset.map(
#         partial(
#             prepare_validation_features, 
#             tokenizer=tokenizer,
#             pad_on_right=pad_on_right, 
#             max_length=max_length,
#             doc_stride=doc_stride
#         ),
#         batched=True,
#         remove_columns=test_dataset.column_names
#     )
#     test_feats_small = test_features.map(
#         lambda example: example, remove_columns=['example_id', 'offset_mapping']
#     )

#     fin_start_logits = None
#     fin_end_logits = None

#     models = [
#         "../input/bb-base-chaii",
#         "../input/nbroad-flax-muril-bb-base-chaii-f2",
#         "../input/nbroad-flax-muril-bb-base-chaii-f3",
#         "../input/nbroad-flax-muril-bb-base-chaii-f4",
#         "../input/nbroad-flax-bb-base-chaii-f5",
#         "../input/nbroad-flax-muril-bb-base-chaii-f6",
#         "../input/nbroad-flax-muril-bb-base-chaii-f7", 
#     ]

#     data_loader = torch.utils.data.DataLoader(
#         ChaiiDataset(test_feats_small), 
#         batch_size=16,
#         num_workers=4,
#         pin_memory=True,
#         shuffle=False
#     )


#     for fold, model_name in tqdm(enumerate(models)):
#         model = ChaiiModel(model_name=model_name, num_train_steps=0, steps_per_epoch=0, learning_rate=0)
#         model.transformer.load_state_dict(torch.load(f"{model_name}/pytorch_model.bin"))
#         model.to("cuda")
#         model.eval()

#         start_logits = []
#         end_logits = []

#         for b_idx, data in enumerate(data_loader):
#             with torch.no_grad():
#                 for key, value in data.items():
#                     data[key] = value.to("cuda")
#                 output, _, _ = model(**data)
#                 start = output[0].detach().cpu().numpy()
#                 end = output[1].detach().cpu().numpy()
#                 start_logits.append(start)
#                 end_logits.append(end)

#         start_logits = np.vstack(start_logits)
#         end_logits = np.vstack(end_logits)

#         if fin_start_logits is None:
#             fin_start_logits = start_logits
#             fin_end_logits = end_logits
#         else:
#             fin_start_logits += start_logits
#             fin_end_logits += end_logits
            
#         to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))      
#         with open(f"top-preds-bb-f{fold}.json", "w") as fp:
#             json.dump(to_save, fp)

#         del model
#         torch.cuda.empty_cache()

In [None]:
# if do_inference:
#     fin_start_logits /= len(models)
#     fin_end_logits /= len(models)

In [None]:
# if do_inference:
#     to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits))

In [None]:
# if do_inference:
#     long_data["PredictionString"] = long_data["id"].map(fin_preds)

In [None]:
# class ChaiiModel(tez.Model):
#     def __init__(self, model_name, num_train_steps, steps_per_epoch, learning_rate):
#         super().__init__()
#         self.learning_rate = learning_rate
#         self.steps_per_epoch = steps_per_epoch
#         self.model_name = model_name
#         self.num_train_steps = num_train_steps
#         self.step_scheduler_after = "batch"

#         hidden_dropout_prob: float = 0.0
#         layer_norm_eps: float = 1e-7

#         config = transformers.AutoConfig.from_pretrained(model_name)
#         config.update(
#             {
#                 "output_hidden_states": True,
#                 "hidden_dropout_prob": hidden_dropout_prob,
#                 "layer_norm_eps": layer_norm_eps,
#                 "add_pooling_layer": False,
#             }
#         )
#         self.transformer = transformers.AutoModel.from_pretrained(model_name, config=config)
#         self.output = nn.Linear(config.hidden_size, config.num_labels)

#     def forward(self, ids, mask, token_type_ids=None, start_positions=None, end_positions=None):
#         transformer_out = self.transformer(ids, mask)
#         sequence_output = transformer_out[0]
#         logits = self.output(sequence_output)
#         start_logits, end_logits = logits.split(1, dim=-1)
#         start_logits = start_logits.squeeze(-1).contiguous()
#         end_logits = end_logits.squeeze(-1).contiguous()

#         return (start_logits, end_logits), 0, {}

In [None]:
# if do_inference:
#     tokenizer = transformers.AutoTokenizer.from_pretrained("../input/xlmrob")

In [None]:
# if do_inference:
#     pad_on_right = tokenizer.padding_side == "right"
#     max_length = 384
#     doc_stride = 128


#     test_dataset = Dataset.from_pandas(short_data)
#     test_features = test_dataset.map(
#         partial(
#             prepare_validation_features, 
#             tokenizer=tokenizer,
#             pad_on_right=pad_on_right, 
#             max_length=max_length,
#             doc_stride=doc_stride
#         ),
#         batched=True,
#         remove_columns=test_dataset.column_names
#     )
#     test_feats_small = test_features.map(
#         lambda example: example, remove_columns=['example_id', 'offset_mapping']
#     )

#     fin_start_logits = None
#     fin_end_logits = None

#     for fold_ in tqdm(range(10)):
#         model = ChaiiModel(model_name="../input/xlmrob", num_train_steps=0, steps_per_epoch=0, learning_rate=0)
#         model.load(f"../input/deepsetsquad2-v2/pytorch_model_f{fold_}.bin", weights_only=True)
#         model.to("cuda")
#         model.eval()
#         data_loader = torch.utils.data.DataLoader(
#             ChaiiDataset(test_feats_small), 
#             batch_size=64,
#             num_workers=4,
#             pin_memory=True,
#             shuffle=False
#         )
#         start_logits = []
#         end_logits = []

#         for b_idx, data in enumerate(data_loader):
#             with torch.no_grad():
#                 for key, value in data.items():
#                     data[key] = value.to("cuda")
#                 output, _, _ = model(**data)
#                 start = output[0].detach().cpu().numpy()
#                 end = output[1].detach().cpu().numpy()
#                 start_logits.append(start)
#                 end_logits.append(end)

#         start_logits = np.vstack(start_logits)
#         end_logits = np.vstack(end_logits)

#         if fin_start_logits is None:
#             fin_start_logits = start_logits
#             fin_end_logits = end_logits
#         else:
#             fin_start_logits += start_logits
#             fin_end_logits += end_logits
        
#         to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))      
#         with open(f"top-preds-xlmr-f{fold_}.json", "w") as fp:
#             json.dump(to_save, fp)

#         del model
#         torch.cuda.empty_cache()

In [None]:
# if do_inference:
#     fin_start_logits /= 10
#     fin_end_logits /= 10

In [None]:
# if do_inference:
#     to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits))
#     with open('xlmr-large-preds.json', "w") as fp:
#         json.dump(to_save, fp)

In [None]:
# if do_inference:
#     short_data["PredictionString"] = short_data["id"].map(fin_preds)

In [None]:
# sub2 = pd.DataFrame(submission, columns=["id", "PredictionString"])

# final = pd.concat([sub1, sub2], axis=0, ignore_index=True)
# final = final.merge(test_data[["context", "question", "id"]], on="id")

In [None]:
# short_data

In [None]:
# # folds have been averaged and have only 1 prediction file 
# if do_inference:
#     model_fold_preds = {}
#     with open('xlmr-large-preds.json') as fp:
#         model_fold_preds["xlmr"] = json.load(fp)
#     with open('muril-large-preds.json') as fp:
#         model_fold_preds["muril"] = json.load(fp)

#     from collections import Counter

#     voted_preds = {}
#     top_k = 4
#     for i in range(len(short_data)):
#         cnt = Counter()
#         for fold_, preds in enumerate(model_fold_preds.values()):
#             cnt.update([text["text"] for text in preds[i]["predictions"]][:top_k])
            
#         most_common = cnt.most_common(top_k)
#         voted_preds[preds[i]["id"]] = most_common[0][0]
        
#     short_data["PredictionString"] = short_data["id"].map(voted_preds)

In [None]:
# if do_inference:
#     model_fold_preds = {}
#     model = "xlmr"
#     for fold_ in range(10):
#         with open(f"top-preds-{model}-f{fold_}.json") as fp:
#             model_fold_preds[f"{model}-{fold_}"] = json.load(fp)

#     from collections import Counter

#     voted_preds = {}
#     top_k = 100
#     for i in range(len(short_data)):
#         cnt = Counter()
#         for fold_, preds in enumerate(model_fold_preds.values()):
#             cnt.update([text["text"] for text in preds[i]["predictions"]][:top_k])
            
#         most_common = cnt.most_common(10)
#         voted_preds[preds[i]["id"]] = most_common[0][0]
        
#     short_data["PredictionString"] = short_data["id"].map(voted_preds)

In [None]:
# if do_inference:
#     model_fold_preds = {}
    
#     model = "bb"
#     for fold_ in range(7):
#         with open(f"top-preds-{model}-f{fold_}.json") as fp:
#             model_fold_preds[f"{model}-{fold_}"] = json.load(fp)

#     from collections import Counter

#     voted_preds = {}
#     for i in range(len(long_data)):
#         cnt = Counter()
#         for fold_, preds in enumerate(model_fold_preds.values()):
#             cnt.update([text["text"] for text in preds[i]["predictions"]][:top_k])
            
#         most_common = cnt.most_common(10)
#         voted_preds[preds[i]["id"]] = most_common[0][0]
        
#     long_data["PredictionString"] = long_data["id"].map(voted_preds)

In [None]:
# if do_inference:
#     test_data = pd.concat([short_data, long_data], axis=0, ignore_index=True)

In [None]:
if do_inference:
    # Only the short_data pipeline is active, so its frame (including the
    # PredictionString column) becomes the full set of results.
    test_data = short_data.copy()

In [None]:
# Display the frame as the cell output for a visual check before clean-up.
test_data

In [None]:
# Characters removed from the start / end of a predicted span.
bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
bad_endings = ["-", "(", ")", "–", ",", ";"]

# Abbreviation strings (going by their names: Tamil AD / BC / km and Hindi AD / BC).
# A prediction ending in one of them gets a trailing "." restored when the
# context contains that dotted form.
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"

cleaned_preds = []
if do_inference:
    leading_chars = "".join(bad_starts)
    trailing_chars = "".join(bad_endings)
    dotted_suffixes = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)

    for pred, context in test_data[["PredictionString", "context"]].values:
        # Empty predictions are passed through unchanged.
        if pred != "":
            # I haven't checked whether this makes a difference, but one answer in the
            # training set ends like this and I think it is an annotator mistake.
            # See my notebook for details: https://www.kaggle.com/nbroad/chaii-qa-character-token-languages-eda
            if pred.endswith("..."):
                pred = pred[:-3]

            pred = pred.lstrip(leading_chars).rstrip(trailing_chars)

            if pred.endswith(dotted_suffixes) and pred + "." in context:
                pred = pred + "."

        cleaned_preds.append(pred)

    test_data["PredictionString"] = cleaned_preds

In [None]:
if do_inference:
    # Character length of each cleaned prediction.
    test_data["pred_len"] = list(map(len, test_data["PredictionString"]))
# TODO: do something if the prediction is too short

In [None]:
# Without inference there are no predictions, so fill in a placeholder answer.
if not do_inference:
    test_data["PredictionString"] = "lol"

# The submission file is written in both modes.
test_data[["id", "PredictionString"]].to_csv("submission.csv", index=False)

In [None]:
if do_inference:
    # Echo the two submitted columns to the cell output for a final check.
    print(test_data[["id", "PredictionString"]])