In [None]:
# Install `datasets` from the local wheel directory only (`--no-index` disables the package index); `-qq` silences pip output.
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

In [None]:
import sys
sys.path.append("../input/tez-lib/")
import collections
import numpy as np
import transformers
import pandas as pd
from datasets import Dataset
from functools import partial
from tqdm import tqdm
import torch
from sklearn import metrics
import transformers
import torch
import torch.nn as nn
import numpy as np
import tez
from string import punctuation

In [None]:
class ChaiiModel(tez.Model):
    """Span-extraction network: a pretrained transformer encoder plus a linear head.

    The head maps every token's hidden state to ``config.num_labels`` values;
    ``forward`` unpacks them into exactly two tensors (start and end logits),
    so the loaded configuration must describe two labels.
    """

    def __init__(self, model_name, num_train_steps, steps_per_epoch, learning_rate):
        """Store the training settings and build the encoder and the span head.

        Args:
            model_name: Name or path passed to the ``transformers``
                ``from_pretrained`` loaders for both config and weights.
            num_train_steps: Stored on the instance; not read inside this class.
            steps_per_epoch: Stored on the instance; not read inside this class.
            learning_rate: Stored on the instance; not read inside this class.
        """
        super().__init__()
        self.learning_rate = learning_rate
        self.steps_per_epoch = steps_per_epoch
        self.model_name = model_name
        self.num_train_steps = num_train_steps
        # Presumably read by the tez training loop to step the scheduler per batch -- confirm against tez.
        self.step_scheduler_after = "batch"

        # Settings layered on top of whatever the checkpoint's config stores.
        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
            "add_pooling_layer": False,
        }
        config = transformers.AutoConfig.from_pretrained(model_name)
        config.update(config_overrides)

        self.transformer = transformers.AutoModel.from_pretrained(model_name, config=config)
        self.output = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, ids, mask, token_type_ids=None, start_positions=None, end_positions=None):
        """Compute start and end logits for every token position.

        Args:
            ids: Passed to the encoder as its first positional argument.
            mask: Passed to the encoder as its second positional argument.
            token_type_ids: Accepted but not used.
            start_positions: Accepted but not used.
            end_positions: Accepted but not used.

        Returns:
            ``((start_logits, end_logits), 0, {})``. The trailing ``0`` and
            ``{}`` are constant placeholders (presumably the loss and metrics
            slots tez expects -- confirm against tez).
        """
        encoder_outputs = self.transformer(ids, mask)
        token_states = encoder_outputs[0]
        span_logits = self.output(token_states)
        # Split the last dimension into single-width slices, one per label, and drop that axis.
        start_logits, end_logits = (
            piece.squeeze(-1).contiguous() for piece in span_logits.split(1, dim=-1)
        )
        return (start_logits, end_logits), 0, {}

In [None]:
class ChaiiDataset:
    """Map-style dataset that serves tokenized features as ``torch.long`` tensors.

    ``data`` is any indexable, sized collection whose items are mappings with
    ``"input_ids"`` and ``"attention_mask"`` entries.
    """

    def __init__(self, data):
        """Keep a reference to the feature collection (it is not copied)."""
        self.data = data

    def __len__(self):
        """Return the number of features in the underlying collection."""
        return len(self.data)

    def __getitem__(self, item):
        """Return ``{"ids": ..., "mask": ...}`` tensors for the feature at ``item``."""
        # Look the row up once and reuse it: indexing the backing collection can
        # be costly (a lazily decoded table may rebuild the whole row on every
        # access), and the previous code paid that cost twice per sample.
        row = self.data[item]
        return {
            "ids": torch.tensor(row["input_ids"], dtype=torch.long),
            "mask": torch.tensor(row["attention_mask"], dtype=torch.long),
        }

In [None]:
def prepare_validation_features(examples, tokenizer, pad_on_right, max_length, doc_stride):
    """Tokenize a batch of question/context pairs into inference features.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and ``"id"``
            columns. The ``"question"`` column is overwritten with its
            left-stripped version.
        tokenizer: Callable tokenizer. Its result must support ``pop``, item
            access/assignment and ``sequence_ids(i)``.
        pad_on_right: If true the question is passed first and the context
            second; otherwise the order is swapped.
        max_length: Forwarded to the tokenizer as ``max_length``.
        doc_stride: Forwarded to the tokenizer as ``stride``.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` removed, an
        ``"example_id"`` list added (one id per feature) and every
        ``"offset_mapping"`` entry that does not belong to the context
        replaced by ``None``.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # Resolve everything that depends on the padding side in one place.
    if pad_on_right:
        first_column, second_column = "question", "context"
        truncation_mode, context_index = "only_second", 1
    else:
        first_column, second_column = "context", "question"
        truncation_mode, context_index = "only_first", 0

    tokenized_examples = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One input row can yield several features; this maps each feature to its row.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = []

    for feature_idx in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)
        source_row = sample_mapping[feature_idx]
        tokenized_examples["example_id"].append(examples["id"][source_row])

        # Keep offsets only for context tokens so post-processing can tell
        # context positions apart from everything else.
        masked_offsets = []
        for token_idx, span in enumerate(tokenized_examples["offset_mapping"][feature_idx]):
            masked_offsets.append(span if sequence_ids[token_idx] == context_index else None)
        tokenized_examples["offset_mapping"][feature_idx] = masked_offsets

    return tokenized_examples

In [None]:
def postprocess_qa_predictions(
    examples, tokenizer, features, raw_predictions, n_best_size=20, max_answer_length=30, squad_v2=False
):
    """Convert per-feature start/end logits into one answer string per example.

    Args:
        examples: Collection supporting ``len()``, column access via
            ``examples["id"]`` and iteration over rows that expose ``"id"``
            and ``"context"``.
        tokenizer: Only ``cls_token_id`` is read, and only when ``squad_v2``
            is true.
        features: Indexable, iterable collection of rows with ``"example_id"``
            and ``"offset_mapping"`` (``None`` for positions that cannot be
            part of an answer). Rows also need ``"input_ids"`` when
            ``squad_v2`` is true.
        raw_predictions: Pair ``(all_start_logits, all_end_logits)``, each
            indexable by feature position and yielding a 1-D score vector.
        n_best_size: Number of top start and top end positions combined per
            feature.
        max_answer_length: Maximum answer span length, counted in tokens.
        squad_v2: If true, return ``""`` for an example unless its best span
            scores strictly higher than the example's null (CLS) score.

    Returns:
        ``collections.OrderedDict`` mapping each example id to its predicted
        answer text (``""`` when no valid span exists).
    """
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
    predictions = collections.OrderedDict()
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]
        min_null_score = None
        valid_answers = []
        context = example["context"]
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Fetch the feature row once; indexing the collection can be expensive.
            feature = features[feature_index]
            offset_mapping = feature["offset_mapping"]
            if squad_v2:
                # The null score is only consulted in SQuAD v2 mode, so the CLS
                # lookup is skipped otherwise.
                cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                # Keep the *minimum* null score across the example's features.
                # The comparison used to be inverted and tracked the maximum,
                # contradicting the variable name and the reference
                # implementation in transformers' ``utils_qa.py``.
                if min_null_score is None or feature_null_score < min_null_score:
                    min_null_score = feature_null_score
            # Highest-scoring positions first, at most n_best_size of each.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions that fall outside the offsets or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip reversed spans and spans longer than allowed.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char:end_char],
                        }
                    )
        if valid_answers:
            # max() returns the first of equally scored candidates, the same pick
            # as a stable descending sort, without sorting the whole list.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            best_answer = {"text": "", "score": 0.0}
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # An example without features has no null score (and no candidates,
            # so its best text is already ""); comparing against None would raise.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            predictions[example["id"]] = best_answer["text"] if beats_null else ""
    return predictions

In [None]:
# Load the tokenizer from the local checkpoint directory.
TOKENIZER_DIR = "../input/xlmrob"
tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_DIR)

In [None]:
# Tokenization settings; `max_length` and `doc_stride` are forwarded to the
# tokenizer (as `max_length` and `stride`) by prepare_validation_features.
pad_on_right = tokenizer.padding_side == "right"
max_length = 384
doc_stride = 128

# Read the test questions and turn them into tokenized features.
test_data = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
test_dataset = Dataset.from_pandas(test_data)
featurize = partial(
    prepare_validation_features,
    tokenizer=tokenizer,
    pad_on_right=pad_on_right,
    max_length=max_length,
    doc_stride=doc_stride,
)
test_features = test_dataset.map(
    featurize,
    batched=True,
    remove_columns=test_dataset.column_names,
)

# Identity map whose only effect is to drop the two columns that are needed
# for post-processing but not as model inputs.
test_feats_small = test_features.map(
    lambda example: example, remove_columns=["example_id", "offset_mapping"]
)

# Running sums of the logits over all fold checkpoints.
fin_start_logits, fin_end_logits = None, None
for fold_ in tqdm(range(10)):
    model = ChaiiModel(model_name="../input/xlmrob", num_train_steps=0, steps_per_epoch=0, learning_rate=0)
    model.load(f"../input/deepsetsquad2-v2/pytorch_model_f{fold_}.bin", weights_only=True)
    model.to("cuda")
    model.eval()

    data_loader = torch.utils.data.DataLoader(
        ChaiiDataset(test_feats_small),
        batch_size=32,
        num_workers=4,
        pin_memory=True,
        shuffle=False,  # keep feature order so logits line up with test_features
    )

    fold_start_chunks, fold_end_chunks = [], []
    for batch in data_loader:
        with torch.no_grad():
            batch = {name: tensor.to("cuda") for name, tensor in batch.items()}
            (batch_start, batch_end), _, _ = model(**batch)
            fold_start_chunks.append(batch_start.detach().cpu().numpy())
            fold_end_chunks.append(batch_end.detach().cpu().numpy())

    # Stack the per-batch arrays into one (features x positions) array per fold.
    fold_start = np.vstack(fold_start_chunks)
    fold_end = np.vstack(fold_end_chunks)
    if fin_start_logits is None:
        fin_start_logits, fin_end_logits = fold_start, fold_end
    else:
        fin_start_logits += fold_start
        fin_end_logits += fold_end

    # Drop this fold's model and release cached GPU memory before the next one.
    del model
    torch.cuda.empty_cache()

In [None]:
# Convert the summed logits into the mean over the 10 fold checkpoints.
# The division is in place, so re-running this cell divides again.
for summed_logits in (fin_start_logits, fin_end_logits):
    summed_logits /= 10

In [None]:
# Decode the averaged logits into one answer string per test example.
fin_preds = postprocess_qa_predictions(
    examples=test_dataset,
    tokenizer=tokenizer,
    features=test_features,
    raw_predictions=(fin_start_logits, fin_end_logits),
)

In [None]:
# Clean every predicted answer for the submission file: collapse whitespace
# runs to single spaces, then trim ASCII punctuation *and* spaces from both
# ends. Trimming them together matters: stripping punctuation alone could leave
# a dangling space behind (e.g. "1947 ." became "1947 ").
trim_chars = punctuation + " "
submission = [
    (example_id, " ".join(answer.split()).strip(trim_chars))
    for example_id, answer in fin_preds.items()
]

In [None]:
# Assemble the (id, answer) pairs into a table and write it without the index column.
SUBMISSION_COLUMNS = ["id", "PredictionString"]
sample = pd.DataFrame(data=submission, columns=SUBMISSION_COLUMNS)
sample.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Display the submission table as the cell output for a final visual check.
sample