In [None]:
import pandas as pd
import numpy as np
import os
import gc 
import torch

# Device used for the explicit `model.to(device)` calls in the inference cells.
# Falls back to the CPU when no CUDA device is visible so the notebook still
# runs end to end (slowly) on a CPU-only kernel instead of failing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Remove the preinstalled fsspec and transformers packages, then install
# `datasets` offline (--no-index) from the attached wheels directory.
!pip uninstall fsspec -qq -y
!pip uninstall transformers -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

import sys
# Put the attached transformers source tree on the import path; presumably this is
# what later `import transformers` statements resolve to after the uninstall above.
sys.path.append("../input/transformers-master/src/")

In [None]:
# Load the sample submission and show its first row.
# `sub` is rebound to the real submission frame in the last cell.
sub = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv')
sub.head(1)

In [None]:
# Load the training set and preview the first rows.
train = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')
train.head()

In [None]:
# Number of training rows per value of the `language` column.
train.language.value_counts()

In [None]:
# Load the test set; later cells add `start`, `end`, `text` and
# `PredictionString` columns to this frame.
test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
test.head()

In [None]:
# Number of rows in the test set.
len(test)

# Baseline

Based on: https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb

In [None]:
import transformers

In [None]:
# Per-device batch size used for prediction (passed to TrainingArguments below).
batch_size = 200
# Checkpoint directory.
# NOTE(review): no cell visible in this notebook reads `model_checkpoint`; the
# inference cells each define their own `paths` list - confirm it is still needed.
model_checkpoint = "../input/chaii-model-0816-2/chaii-bert-trained_knr_alex"

In [None]:
from transformers import AutoTokenizer


In [None]:
# Tokenization window settings read by Prep.prepare_validation_features.
# max_length: maximum length of a feature (question and context together).
# doc_stride: authorized overlap between two parts of the context when it has to be split.
max_length, doc_stride = 384, 128

In [None]:
from datasets import Dataset

In [None]:
def convert_answers(r):
    """Wrap an indexable (answer_start, answer_text) pair into a dict of one-element lists.

    Args:
        r: Any object supporting ``r[0]`` (answer start) and ``r[1]`` (answer text).

    Returns:
        ``{'answer_start': [r[0]], 'text': [r[1]]}``.
    """
    answer_start, answer_text = r[0], r[1]
    return dict(answer_start=[answer_start], text=[answer_text])

In [None]:
# Set the WANDB_DISABLED environment variable for this kernel before any Trainer is created.
%env WANDB_DISABLED=True

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoTokenizer, default_data_collator
# Trainer configuration shared by every inference cell: prediction only, no evaluation.
args = TrainingArguments(
    output_dir="chaii-qa",
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=2,
    do_predict=True,
    do_eval=False,
)

In [None]:
from transformers import default_data_collator
from scipy.special import softmax
# Collator handed to every Trainer in the inference cells.
data_collator = default_data_collator
# NOTE(review): Prep.postprocess_qa_predictions takes its own `max_answer_length`
# parameter; no visible cell reads this module-level value - confirm it is needed.
max_answer_length = 30
from tqdm.auto import tqdm

class Prep:
    """Tokenizer-specific feature preparation and logit post-processing for QA inference.

    One instance is bound to one tokenizer. ``prepare_validation_features`` turns
    examples into overlapping fixed-length features; ``postprocess_qa_predictions``
    projects token-level start/end logits back onto the characters of each context.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer used by both processing steps."""
        self.tokenizer = tokenizer

    def prepare_validation_features(self, examples):
        """Tokenize a batch of examples into overlapping features.

        Reads the module-level ``max_length`` and ``doc_stride`` settings.

        Args:
            examples: Batch mapping with "question", "context" and "id" columns.
                The "question" column is replaced in place by its left-stripped version.

        Returns:
            The tokenizer output with two changes: an "example_id" entry naming the
            source example of every feature, and "offset_mapping" entries set to
            ``None`` for tokens that are not part of the context.
        """
        # Some of the questions have a lot of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace.
        examples["question"] = [q.lstrip() for q in examples["question"]]
        pad_on_right = self.tokenizer.padding_side == "right"

        # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_length,
            stride=doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # We keep the example_id that gave us this feature and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping entries that are not part of the context so it's easy to determine
            # if a token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    def postprocess_qa_predictions(self, examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
        """Convert token-level logits into per-character start/end distributions.

        For every example, each character of the context receives the maximum start
        (resp. end) logit over all context tokens, from all features of that example,
        whose offsets cover the character. Characters covered by no token keep the
        floor value -100. Both character-level vectors are then passed through softmax.

        Args:
            examples: Dataset of original examples; must expose an "id" column and
                yield mappings with "id" and "context" keys when iterated.
            features: Features from ``prepare_validation_features``, with the
                "example_id" and "offset_mapping" columns available.
            raw_predictions: Pair ``(all_start_logits, all_end_logits)`` indexed by feature.
            n_best_size: Accepted for signature compatibility; not used here.
            max_answer_length: Accepted for signature compatibility; not used here.

        Returns:
            ``collections.OrderedDict`` mapping example id to a dict with
            "start_logits" and "end_logits" (softmax-normalised arrays of length
            ``len(context)``) plus every original field of the example.
        """
        # Local import: the notebook only imports `collections` in a later cell,
        # so this keeps the class usable regardless of cell execution order.
        import collections

        all_start_logits, all_end_logits = raw_predictions

        # Build a map from each example to the indices of its features.
        example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
        features_per_example = collections.defaultdict(list)
        for i, feature in enumerate(features):
            features_per_example[example_id_to_index[feature["example_id"]]].append(i)

        # The dictionary we have to fill.
        predictions = collections.OrderedDict()

        # Logging.
        print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

        # Let's loop over all the examples!
        for example_index, example in enumerate(tqdm(examples)):
            # Those are the indices of the features associated to the current example.
            feature_indices = features_per_example[example_index]

            context = example["context"]
            # Character-level scores, initialised to a floor value of -100.
            start_flgs = np.full(len(context), -100.0)
            end_flgs = np.full(len(context), -100.0)

            # Looping through all the features associated to the current example.
            for feature_index in feature_indices:
                # We grab the predictions of the model for this feature.
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]
                # This is what allows us to map the positions in our logits to spans of text in the
                # original context. (The former null-score bookkeeping was removed: its result was
                # never used, and it required a second row lookup per feature.)
                offset_mapping = features[feature_index]["offset_mapping"]

                for i, offset in enumerate(offset_mapping):
                    if offset is not None:
                        # clip(lower, None) == element-wise max with the token's logit.
                        start_flgs[offset[0]:offset[1]] = start_flgs[offset[0]:offset[1]].clip(start_logits[i], None)
                        end_flgs[offset[0]:offset[1]] = end_flgs[offset[0]:offset[1]].clip(end_logits[i], None)

            # Despite the key names, these are softmax-normalised character-level scores.
            best_answer = {"start_logits": softmax(start_flgs), "end_logits": softmax(end_flgs)}

            # Carry over the original example fields without overwriting the scores.
            for k, v in example.items():
                if k not in best_answer:
                    best_answer[k] = v

            predictions[example["id"]] = best_answer

        return predictions



In [None]:
import torch
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
import collections

# Wrap the test DataFrame in a `datasets.Dataset`; every inference cell maps
# Prep.prepare_validation_features over it.
test_dataset = Dataset.from_pandas(test)

In [None]:
#model_checkpoint = '../input/exp25/chaii/mrm8488_bert-multi-cased-finedtuned-xquad-tydiqa-goldp_exp25/'

# First ensemble group. The tokenizer and the model object are built once from
# paths[0]; every listed directory then only contributes its `pytorch_model.bin`
# weights, so each checkpoint must match the architecture of paths[0]
# (load_state_dict is strict by default).
paths = [
    '../input/chaii-model-0816/chaii-bert-trained_knr_gtrain/chaii-bert-trained_knr_gtrain',
    
    '../input/exp78-exp79/chaii/deepset_xlm-roberta-large-squad2_exp78',
    '../input/exp78-exp79/chaii/AlexKay_xlm-roberta-large-qa-multilingual-finedtuned-ru_exp78',
    '../input/exp78-exp79/chaii/deepset_xlm-roberta-large-squad2_exp79',
    '../input/exp78-exp79/chaii/AlexKay_xlm-roberta-large-qa-multilingual-finedtuned-ru_exp79',
    
    '../input/exp32-exp33-seed1/chaii/deepset_xlm-roberta-large-squad2_exp32',
    '../input/exp32-exp33-seed1/chaii/AlexKay_xlm-roberta-large-qa-multilingual-finedtuned-ru_exp32',
        ]
tokenizer = AutoTokenizer.from_pretrained(paths[0])
model = AutoModelForQuestionAnswering.from_pretrained(paths[0])


prep = Prep(tokenizer)
test_features = test_dataset.map(
    prep.prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)
# Copy of the features without the columns the model cannot consume.
test_feats_small = test_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

# Running averages of the start logits (pred[0]) and end logits (pred[1]).
pred = [0, 0]

for path in paths:
    print(path)
    # map_location='cpu' makes loading independent of the device the checkpoint
    # was saved from; load_state_dict copies the values into the model's own
    # parameters, wherever they currently live.
    checkpoint = torch.load(os.path.join(path, 'pytorch_model.bin'), map_location='cpu')
    model.load_state_dict(checkpoint)
    # The weights now live in the model; drop the extra copy before predicting.
    del checkpoint
    model.eval()
    model = model.to(device)
    trainer = Trainer(
        model,
        args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    test_predictions = trainer.predict(test_feats_small)
    
    # Equal-weight average over the checkpoints of this group.
    pred[0] += test_predictions.predictions[0] / len(paths)
    pred[1] += test_predictions.predictions[1] / len(paths)
    
# Expose every column of test_features (keeping its current format type) for post-processing.
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))
final_test_predictions_tkm_xlm = prep.postprocess_qa_predictions(test_dataset, test_features, pred)

# Release the large objects before the next group is loaded.
del test_features, model, trainer
gc.collect()

In [None]:
#model_checkpoint = '../input/exp25/chaii/mrm8488_bert-multi-cased-finedtuned-xquad-tydiqa-goldp_exp25/'

# Second ensemble group: each directory is loaded as a complete model with
# from_pretrained; the tokenizer is taken from the first directory only.
paths = [
    '../input/exp32-exp33-rem-info/chaii/google_rembert_exp32',
    '../input/exp32-exp33-rem-info/chaii/google_rembert_exp33',
    '../input/exp32-exp33-seed1/chaii/google_rembert_exp32',
        ]
tokenizer = AutoTokenizer.from_pretrained(paths[0])
prep = Prep(tokenizer)
# Features are re-tokenized for this group because they depend on the tokenizer.
test_features = test_dataset.map(
    prep.prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)
# Copy of the features without the columns the model cannot consume.
test_feats_small = test_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

# Running averages of the start logits (pred[0]) and end logits (pred[1]).
pred = [0, 0]

for path in paths:
    print(path)
    
    model = AutoModelForQuestionAnswering.from_pretrained(path)

    model.to(device)
    model.eval()
    trainer = Trainer(
        model,
        args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    test_predictions = trainer.predict(test_feats_small)
    
    # Equal-weight average over the checkpoints of this group.
    pred[0] += test_predictions.predictions[0] / len(paths)
    pred[1] += test_predictions.predictions[1] / len(paths)
    
# Expose every column of test_features (keeping its current format type) for post-processing.
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))
final_test_predictions_tkm_rem = prep.postprocess_qa_predictions(test_dataset, test_features, pred)

# Release the large objects before the next group is loaded.
del test_features, model, trainer
gc.collect()

In [None]:
#model_checkpoint = '../input/exp25/chaii/mrm8488_bert-multi-cased-finedtuned-xquad-tydiqa-goldp_exp25/'

# Third ensemble group: each directory is loaded as a complete model with
# from_pretrained; the tokenizer is taken from the first directory only.
paths = [
    '../input/exp32-exp33-muril/chaii/google_muril-large-cased_exp32',
    '../input/exp32-exp33-muril/chaii/google_muril-large-cased_exp33',
        ]
tokenizer = AutoTokenizer.from_pretrained(paths[0])
prep = Prep(tokenizer)
# Features are re-tokenized for this group because they depend on the tokenizer.
test_features = test_dataset.map(
    prep.prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)
# Copy of the features without the columns the model cannot consume.
test_feats_small = test_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

# Running weighted averages of the start logits (pred[0]) and end logits (pred[1]).
pred = [0, 0]

# `_exp32` checkpoints count double. The normaliser is derived from the weights
# (3 for the current list) so the average stays correct if `paths` is edited.
weights = [2 if '_exp32' in path else 1 for path in paths]
total_weight = sum(weights)

for path, w in zip(paths, weights):
    print(path)
    
    model = AutoModelForQuestionAnswering.from_pretrained(path)

    model.to(device)
    model.eval()
    trainer = Trainer(
        model,
        args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    test_predictions = trainer.predict(test_feats_small)
    pred[0] += test_predictions.predictions[0] * w / total_weight
    pred[1] += test_predictions.predictions[1] * w / total_weight
    print('w =', w)
    
# Expose every column of test_features (keeping its current format type) for post-processing.
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))
final_test_predictions_tkm_mul = prep.postprocess_qa_predictions(test_dataset, test_features, pred)

# Release the large objects before the next group is loaded.
del test_features, model, trainer
gc.collect()

In [None]:
#model_checkpoint = '../input/exp25/chaii/mrm8488_bert-multi-cased-finedtuned-xquad-tydiqa-goldp_exp25/'

# Fourth ensemble group: each directory is loaded as a complete model with
# from_pretrained; the tokenizer is taken from the first directory only.
paths = [
    '../input/exp32-exp33-rem-info/chaii/microsoft_infoxlm-large_exp32',
    '../input/exp32-exp33-rem-info/chaii/microsoft_infoxlm-large_exp33',
    '../input/exp32-exp33-seed1/chaii/microsoft_infoxlm-large_exp32',
        ]
tokenizer = AutoTokenizer.from_pretrained(paths[0])
prep = Prep(tokenizer)
# Features are re-tokenized for this group because they depend on the tokenizer.
test_features = test_dataset.map(
    prep.prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)
# Copy of the features without the columns the model cannot consume.
test_feats_small = test_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

# Running averages of the start logits (pred[0]) and end logits (pred[1]).
pred = [0, 0]

for path in paths:
    print(path)
    
    model = AutoModelForQuestionAnswering.from_pretrained(path)

    model.to(device)
    model.eval()
    trainer = Trainer(
        model,
        args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    test_predictions = trainer.predict(test_feats_small)
    
    # Equal-weight average over the checkpoints of this group.
    pred[0] += test_predictions.predictions[0] / len(paths)
    pred[1] += test_predictions.predictions[1] / len(paths)
    
# Expose every column of test_features (keeping its current format type) for post-processing.
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))
final_test_predictions_tkm_info = prep.postprocess_qa_predictions(test_dataset, test_features, pred)

# Release the large objects now that the last group has been processed.
del test_features, model, trainer
gc.collect()

In [None]:
def _best_span(start_logit, end_logit, top_k=20, max_span=100):
    """Pick the (start, end) character pair with the highest combined score.

    Candidates are the `top_k` best start positions crossed with the `top_k`
    best end positions, restricted to ``start <= end`` and
    ``end - start <= max_span``. When no candidate pair satisfies both
    constraints (which used to raise ValueError from max() on an empty
    sequence), fall back to the best start position paired with the best end
    position at most `max_span` characters after it.

    Returns:
        ``((start, end), score)`` with `end` inclusive.
    """
    # Sort once; the end scores used to be re-sorted for every start candidate.
    top_starts = np.argsort(start_logit)[-top_k:]
    top_ends = np.argsort(end_logit)[-top_k:]
    candidates = [
        ((i, j), start_logit[i] + end_logit[j])
        for i in top_starts
        for j in top_ends
        if i <= j and j - i <= max_span
    ]
    if candidates:
        # Same iteration order as before, so ties resolve identically.
        return max(candidates, key=lambda x: x[1])

    i = int(np.argmax(start_logit))
    j = i + int(np.argmax(end_logit[i:i + max_span + 1]))
    return (i, j), start_logit[i] + end_logit[j]


starts = []
ends = []
for id_ in tqdm(test['id'].values):
    # Weighted blend of the four groups' character-level start scores.
    # NOTE(review): the weights sum to 5.4 while the divisor is 4.2; kept as-is,
    # since a constant positive scale only matters relative to the tie-break term.
    start_logit = (final_test_predictions_tkm_xlm[id_]['start_logits'] * 2.2
                  + final_test_predictions_tkm_rem[id_]['start_logits'] * 1.7
                  + final_test_predictions_tkm_info[id_]['start_logits'] * 0.3
                  + final_test_predictions_tkm_mul[id_]['start_logits'] * 1.2
                  ) / 4.2
    # Tie-break: among equal start scores, prefer the earliest character.
    start_logit += np.arange(start_logit.shape[0])[::-1] * 1.e-10
    end_logit = (final_test_predictions_tkm_xlm[id_]['end_logits'] * 2.2
                  + final_test_predictions_tkm_rem[id_]['end_logits'] * 1.7
                  + final_test_predictions_tkm_info[id_]['end_logits'] * 0.3
                 + final_test_predictions_tkm_mul[id_]['end_logits'] * 1.2
                  ) / 4.2
    # Tie-break: among equal end scores, prefer the latest character.
    end_logit += np.arange(end_logit.shape[0]) * 1.e-10
    
    idx, score = _best_span(start_logit, end_logit)
    starts.append(idx[0])
    # Convert the inclusive end index into an exclusive slice bound.
    ends.append(idx[1] + 1)
    
test['start'] = starts
test['end'] = ends
test['text'] = test.apply(lambda x: x.context[x.start:x.end], axis=1)

In [None]:
import re
# Single characters trimmed from the start of a prediction by postproc2.
bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
# Suffixes trimmed from the end of a prediction by postproc2.
bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

# Abbreviations after which postproc2 restores a trailing "." when the context has one.
aaa = ["கி.பி",  
        "கி.மு",
        "கி.மீ",
         "ई",
        "ई.पू",
"वी.एन",
"कि.मी",
      ]
# Regex alternation of the abbreviations. Each entry is escaped so that the "."
# inside an abbreviation matches a literal dot rather than any character.
deg = '|'.join(re.escape(a) for a in aaa)

def postproc(data):
    """Clean the predicted answer stored in ``data['text']``.

    Steps: drop newlines and surrounding whitespace; strip leading/trailing
    "(", ")" and "-" characters; replace remaining parentheses with spaces;
    and, if the result consists only of Devanagari digits, switch to the ASCII
    digit form when that form occurs in ``data['context']``.

    Args:
        data: Mapping-like row with 'text' and 'context' entries. Its 'text'
            entry is overwritten.

    Returns:
        The same `data` object with the cleaned 'text'.
    """
    text = data['text']
    text = text.replace('\n', '').strip()
    for s in '()-':
        text = text.strip(s).strip()
    # Raw string: the previous '[\(\)]' literal relied on invalid escape
    # sequences, which newer Python versions warn about.
    text = re.sub(r'[()]', ' ', text)
    if re.match('^[०१२३४५६७८९]+$', text) is not None:
        # Map each Devanagari digit to its ASCII counterpart.
        text_ = text.translate(str.maketrans('०१२३४५६७८९', '0123456789'))
        if text_ in data['context']:
            text = text_

    data['text'] = text
    return data

def postproc2(data):
    """Trim unwanted leading/trailing punctuation from ``data['text']``.

    Uses the module-level `bad_starts`, `bad_endings` and `deg` definitions.
    An empty prediction is returned untouched. Otherwise: leading characters
    listed in `bad_starts` and trailing suffixes listed in `bad_endings` are
    removed repeatedly; a final "." is restored when the prediction ends with
    one of the `deg` abbreviations and the dotted form occurs in the context;
    a trailing "2" after 'கி.மீ.' is dropped; and a leading "0-" is removed.

    Args:
        data: Mapping-like row with 'text' and 'context' entries. Its 'text'
            entry is overwritten.

    Returns:
        The same `data` object.
    """
    answer = data['text']
    context = data['context']
    if answer == "":
        return data

    # Every entry of bad_starts is one character long, hence the one-char cut.
    while answer.startswith(tuple(bad_starts)):
        answer = answer[1:]

    # "..." is the only multi-character ending; everything else is one character.
    while answer.endswith(tuple(bad_endings)):
        answer = answer[:-3] if answer.endswith("...") else answer[:-1]

    ends_with_abbreviation = re.search(f'({deg})$', answer) is not None
    if ends_with_abbreviation and answer + "." in context:
        answer = answer + "."

    if answer[-7:] == 'கி.மீ.2':
        answer = answer[:-1]
    #if pred[-5:] == 'கிமீ²':
    #    pred = pred[:-1]

    answer = re.sub('^0-', '', answer)

    data['text'] = answer
    return data

# Run both clean-up passes on every row and keep the resulting answer text.
test['PredictionString'] = test.apply(
    lambda row: postproc2(postproc(row))['text'],
    axis=1,
)

In [None]:
# Keep only the two submission columns, write them to submission.csv
# (without the index) and display the result.
sub = test[['id', 'PredictionString']]
sub.to_csv('submission.csv', index=False)
sub