In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the `datasets` package from the wheel files under
# ../input/hf-datasets/wheels: --no-index stops pip from consulting a package
# index and --find-links points it at that local directory instead.
# NOTE(review): fsspec is uninstalled first, presumably so the version shipped
# with the wheels can be installed without a conflict -- confirm.
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

In [None]:
import os
# Set before the training libraries below are imported.
# NOTE(review): presumably this keeps the transformers Trainer from logging to
# Weights & Biases -- confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

import ast
import torch
import collections
import numpy as np
import pandas as pd
import transformers
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, default_data_collator, TrainingArguments, Trainer

In [None]:
def convert_answers(r):
    """Wrap an ``(answer_start, answer_text)`` pair in a SQuAD-style answers dict.

    Parameters
    ----------
    r : sequence or pandas.Series
        At least two values, in order: the answer's start offset and the answer
        text. In this notebook it is a row produced by
        ``df[['answer_start', 'answer_text']].apply(convert_answers, axis=1)``.

    Returns
    -------
    dict
        ``{'answer_start': [start], 'text': [text]}`` -- each value is wrapped
        in a single-element list.
    """
    # Read the values by position through iteration rather than ``r[0]`` /
    # ``r[1]``: integer keys on a label-indexed Series rely on a positional
    # fallback that recent pandas versions deprecate.
    start, text = list(r)[:2]
    return {
        'answer_start': [start],
        'text': [text]
    }

def convert_answers_squad(r):
    """Parse a serialized answer record into a SQuAD-style answers dict.

    The first and last characters of ``r`` are discarded and the remainder is
    evaluated as a Python literal that must provide the keys ``'answer_start'``
    and ``'text'``.
    (Presumably ``r`` is the string form of a one-element list holding a dict
    -- verify against the CSV.)

    Returns ``{'answer_start': [start], 'text': [text]}``.
    """
    parsed = ast.literal_eval(r[1:-1])
    return {key: [parsed[key]] for key in ('answer_start', 'text')}

def read_chaii(path, random_state=None):
    """Load a chaii CSV, shuffle its rows and add a SQuAD-style ``answers`` column.

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts. The file must contain the columns
        ``answer_start`` and ``answer_text``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        The default ``None`` keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        The shuffled rows (original index labels retained) with an extra
        ``answers`` column built by ``convert_answers``.
    """
    chaii = pd.read_csv(path)
    # frac=1 draws every row once, i.e. a full shuffle.
    chaii = chaii.sample(frac=1, random_state=random_state)
    chaii['answers'] = chaii[['answer_start', 'answer_text']].apply(convert_answers, axis=1)
    return chaii

def read_squad(path, language, size=500):
    """Load a translated-SQuAD CSV and return its first ``size`` usable rows.

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts. The file must contain the columns
        ``answers``, ``is_in`` and ``c_id``.
    language : str
        Value written to the ``language`` column of every returned row.
    size : int or None, default 500
        Maximum number of rows to return; ``None`` returns every row that
        passes the ``is_in`` filter.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``is_in`` flag is true, with ``answers`` converted by
        ``convert_answers_squad``, a ``language`` column added, and the helper
        columns ``is_in`` and ``c_id`` removed.
    """
    squad = pd.read_csv(path)
    # Filter and truncate first so only the returned rows are parsed; the copy
    # makes the column assignments below act on an independent frame rather
    # than on a slice of the CSV frame.
    squad = squad[squad['is_in'] == True][:size].copy()
    squad['answers'] = squad['answers'].apply(convert_answers_squad)
    squad['language'] = language
    # DataFrame.drop returns a new frame, so the result has to be kept.
    squad = squad.drop(columns=['is_in', 'c_id'])
    return squad

def read_mlqa(path):
    """Load an MLQA CSV and add ``answers`` and ``id`` columns.

    ``answers`` is built from the ``answer_start`` / ``answer_text`` columns via
    ``convert_answers``; ``id`` is the row's position rendered as a string
    ("0", "1", ...).
    """
    frame = pd.read_csv(path)
    answer_columns = frame[['answer_start', 'answer_text']]
    frame['answers'] = answer_columns.apply(convert_answers, axis=1)
    frame['id'] = list(map(str, range(len(frame['question']))))
    return frame

def get_predictions(data, features, raw_predictions, n_best_size=20, max_answer_length=30):
    """Turn start/end logits into one answer string per example.

    Parameters
    ----------
    data
        The examples. ``data["id"]`` must give every example id in order, and
        iterating ``data`` must yield mappings with ``"id"`` and ``"context"``.
    features
        The tokenized windows. Iterating and integer-indexing must yield
        mappings with ``"example_id"`` and ``"offset_mapping"``; an
        ``offset_mapping`` entry of ``None`` marks a token that may not be part
        of an answer.
    raw_predictions
        Pair ``(all_start_logits, all_end_logits)``, each indexable by feature
        index to get that feature's per-token logits.
    n_best_size : int, default 20
        Number of top-scoring start and end tokens considered per feature.
    max_answer_length : int, default 30
        Longest allowed answer, counted in tokens.

    Returns
    -------
    collections.OrderedDict
        Maps each example id to the context substring of the highest-scoring
        valid span over all of its features, or ``""`` when no span is valid.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature indices by the position of the example they were cut from.
    example_id_to_index = {k: i for i, k in enumerate(data["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()
    for example_index, example in enumerate(tqdm(data)):
        context = example["context"]

        # Keep only the running best span instead of collecting and sorting
        # every candidate; the strict ">" keeps the first span on ties.
        best_text = ""
        best_score = None

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]
            n_tokens = len(offset_mapping)

            # Indices of the n_best_size largest logits, highest first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                if start_index >= n_tokens or offset_mapping[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if (
                        end_index >= n_tokens
                        or offset_mapping[end_index] is None
                        or end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    score = start_logits[start_index] + end_logits[end_index]
                    if best_score is None or score > best_score:
                        best_score = score
                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        best_text = context[start_char:end_char]

        predictions[example["id"]] = best_text

    return predictions

def jaccard(row):
    """Word-level Jaccard similarity between two strings.

    Parameters
    ----------
    row : sequence or pandas.Series
        At least two strings, compared in order. In this notebook it is a row
        of ``res[['answer', 'prediction']]``.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` where A and B are the sets of lower-cased,
        whitespace-separated words of the two strings. Returns ``1.0`` when
        neither string contains a word, since both word sets are then equal.
    """
    # Read the values by position through iteration rather than ``row[0]`` /
    # ``row[1]``: integer keys on a label-indexed Series rely on a positional
    # fallback that recent pandas versions deprecate.
    str1, str2 = list(row)[:2]
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both sides are empty; the plain formula would divide by zero here.
        return 1.0
    return float(len(a & b)) / union_size

class SquadDataset(torch.utils.data.Dataset):
    """Tokenize question-answering data frames into feature sets.

    The constructor works in one of two modes:

    * ``validation=False`` -- ``train`` and ``valid`` are both tokenized with
      answer start/end token labels (see ``custom_tokenize_train``); the
      results are returned by ``get_ds``.
    * ``validation=True`` -- only ``valid`` is tokenized, keeping the example
      id and offset mapping of every window (see ``custom_tokenize_valid``);
      the result is returned by ``get_valid_ds``.

    ``self.encodings`` always refers to the primary tokenized set of the active
    mode (the training features, or the validation features) and backs
    ``__len__``, ``__getitem__`` and ``get_features``.

    Tokenization uses the module-level ``tokenizer``, which must be defined
    before an instance is created.
    """

    # Window length and overlap passed to every tokenizer call.
    max_length = 384
    doc_stride = 128
    # Feature columns that are bookkeeping rather than model inputs.
    metadata_columns = ("example_id", "offset_mapping")

    def __init__(self, train, valid, validation=False):
        """Tokenize the given frames.

        Parameters
        ----------
        train, valid : pandas.DataFrame
            Frames with ``question`` and ``context`` columns, plus ``answers``
            when ``validation`` is false or ``id`` when it is true.
        validation : bool, default False
            Selects the mode described in the class docstring. ``train`` is
            stored but not tokenized when this is true.
        """
        self.train = train
        self.valid = valid
        self.validation = validation

        valid_dataset = Dataset.from_pandas(valid)

        if validation:
            self.validation_features = valid_dataset.map(
                self.custom_tokenize_valid,
                batched=True,
                remove_columns=valid_dataset.column_names,
            )
            self.encodings = self.validation_features
        else:
            train_dataset = Dataset.from_pandas(train)
            self.tok_train_ds = train_dataset.map(
                self.custom_tokenize_train,
                batched=True,
                remove_columns=train_dataset.column_names,
            )
            # Remove the validation frame's own columns, not the training
            # frame's, so the two frames are not required to share a schema.
            self.tok_valid_ds = valid_dataset.map(
                self.custom_tokenize_train,
                batched=True,
                remove_columns=valid_dataset.column_names,
            )
            self.encodings = self.tok_train_ds

    def get_ds(self):
        """Return ``(tokenized_train, tokenized_valid)``.

        Only available when the instance was built with ``validation=False``;
        otherwise the attributes do not exist and ``AttributeError`` is raised.
        """
        return self.tok_train_ds, self.tok_valid_ds

    def get_valid_ds(self):
        """Return the validation features.

        Only available when the instance was built with ``validation=True``;
        otherwise the attribute does not exist and ``AttributeError`` is raised.
        """
        return self.validation_features

    def __getitem__(self, idx):
        """Return feature ``idx`` as a dict of tensors, skipping metadata columns."""
        row = self.encodings[idx]
        return {
            key: torch.tensor(val)
            for key, val in row.items()
            if key not in self.metadata_columns
        }

    def __len__(self):
        """Number of tokenized features in ``self.encodings``."""
        return len(self.encodings)

    def _tokenize(self, data):
        """Strip the questions in ``data`` and run the shared tokenizer call.

        The context (second sequence) is the only part that is truncated;
        overflowing tokens are returned as additional windows, and every window
        is padded to ``max_length``.
        """
        data["question"] = [q.strip() for q in data["question"]]
        return tokenizer(
            data["question"],
            data["context"],
            truncation="only_second",
            max_length=self.max_length,
            stride=self.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

    @staticmethod
    def _locate_answer(offsets, sequence_ids, start_char, end_char, cls_index):
        """Return ``(start_token, end_token)`` for an answer inside one window.

        ``(cls_index, cls_index)`` is returned when the character span
        ``[start_char, end_char)`` is not fully covered by the window's
        context tokens (those whose sequence id is 1).
        """
        # First and last token that belong to the context.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(offsets) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            return cls_index, cls_index

        # Walk inwards from both ends until the tokens just enclose the answer.
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        return token_start_index - 1, token_end_index + 1

    def custom_tokenize_train(self, data):
        """Tokenize a batch and label each window with answer token positions.

        ``data`` is a batch mapping with ``question``, ``context`` and
        ``answers`` lists; each answers entry provides ``answer_start`` and
        ``text`` lists, of which the first element is used. The returned
        encoding has ``overflow_to_sample_mapping`` and ``offset_mapping``
        removed and ``start_positions`` / ``end_positions`` added. Windows that
        do not contain the whole answer are labelled with the CLS token index.
        """
        tok_data = self._tokenize(data)

        overflow_to_sample_mapping = tok_data.pop("overflow_to_sample_mapping")
        offset_mapping = tok_data.pop("offset_mapping")

        tok_data["start_positions"] = []
        tok_data["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            cls_index = tok_data["input_ids"][i].index(tokenizer.cls_token_id)

            # One example can produce several windows; look up its answer.
            answers = data["answers"][overflow_to_sample_mapping[i]]
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            start_token, end_token = self._locate_answer(
                offsets, tok_data.sequence_ids(i), start_char, end_char, cls_index
            )
            tok_data["start_positions"].append(start_token)
            tok_data["end_positions"].append(end_token)

        return tok_data

    def custom_tokenize_valid(self, data):
        """Tokenize a batch for inference, keeping what post-processing needs.

        ``data`` is a batch mapping with ``id``, ``question`` and ``context``
        lists. The returned encoding has ``overflow_to_sample_mapping`` removed
        and an ``example_id`` list added; in ``offset_mapping`` every entry
        that does not belong to the context (sequence id 1) is replaced by
        ``None``.
        """
        tok_data = self._tokenize(data)

        overflow_to_sample_mapping = tok_data.pop("overflow_to_sample_mapping")

        tok_data["example_id"] = []

        for i in range(len(tok_data["input_ids"])):
            sequence_ids = tok_data.sequence_ids(i)

            sample_index = overflow_to_sample_mapping[i]
            tok_data["example_id"].append(data["id"][sample_index])

            tok_data["offset_mapping"][i] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tok_data["offset_mapping"][i])
            ]

        return tok_data

    def get_features(self):
        """Return ``(model_inputs, full_features)`` for ``self.encodings``.

        ``model_inputs`` is ``self.encodings`` without whichever of the
        ``example_id`` / ``offset_mapping`` columns it contains;
        ``full_features`` is ``self.encodings`` itself.
        """
        present = [c for c in self.metadata_columns if c in self.encodings.column_names]
        return self.encodings.map(lambda example: example, remove_columns=present), self.encodings


In [None]:
# Load the competition data and the pretrained checkpoint.
train = read_chaii('../input/chaii-hindi-and-tamil-question-answering/train.csv')
test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')

model_checkpoint = '../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Stack the chaii rows with the external MLQA and translated-SQuAD rows.
# pd.concat is used because DataFrame.append is deprecated and no longer
# exists in pandas 2.x; like append, it keeps each frame's own index labels.
df_train_chaii = train
print('train_chaii', len(df_train_chaii))
df_train_mlqa = read_mlqa('../input/mlqa-hindi-processed/mlqa_hindi.csv')
print('train_mlqa', len(df_train_mlqa))
df_tmp1 = pd.concat([df_train_chaii, df_train_mlqa])
df_train_squad_hi = read_squad('../input/google-translated-squad20-to-hindi-and-tamil/squad_hi.csv', 'hindi')
df_train_squad_ta = read_squad('../input/google-translated-squad20-to-hindi-and-tamil/squad_ta.csv', 'tamil')
df_tmp2 = pd.concat([df_train_squad_hi, df_train_squad_ta])

train_combined = pd.concat([df_tmp1, df_tmp2])
print('train_combined', len(train_combined))
# Hold out the last 64 rows of the stacked frame for evaluation.
# NOTE(review): given the concatenation order these rows come from the tail of
# the Tamil SQuAD frame, not from chaii -- confirm this split is intended.
df_train = train_combined[:-64].reset_index(drop=True)
df_valid = train_combined[-64:].reset_index(drop=True)

my_dataset = SquadDataset(df_train, df_valid, validation=False)
tok_train_ds, tok_valid_ds = my_dataset.get_ds()

args = TrainingArguments(
    "best-model-so-far",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
)

data_collator = default_data_collator

trainer = Trainer(
    model,
    args,
    train_dataset=tok_train_ds,
    eval_dataset=tok_valid_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Score the fine-tuned model on the held-out rows with word-level Jaccard.
valid_dataset = Dataset.from_pandas(df_valid)
val_ds = SquadDataset(df_valid, df_valid, validation=True)
validation_features = val_ds.get_valid_ds()

# Drop the bookkeeping columns before handing the features to trainer.predict;
# the full feature set is still needed afterwards to map tokens back to text.
valid_feats_small = validation_features.map(
    lambda feature: feature,
    remove_columns=['example_id', 'offset_mapping'],
)
raw_predictions = trainer.predict(valid_feats_small)
final_predictions = get_predictions(
    valid_dataset,
    validation_features,
    raw_predictions.predictions,
)

# One row per example: reference answer, predicted answer and their similarity.
references = [{"id": ex["id"], "answer": ex["answers"]['text'][0]} for ex in valid_dataset]
res = pd.DataFrame(references)
res['prediction'] = res['id'].map(lambda example_id: final_predictions[example_id])
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
print(res['jaccard'].mean())
res

In [None]:
# Predict answers for the test set and write the submission file.
test_dataset = Dataset.from_pandas(test)

test_ds = SquadDataset(test, test, validation=True)
test_features = test_ds.get_valid_ds()
print(test_features)

# Drop the bookkeeping columns before handing the features to trainer.predict.
test_feats_small = test_features.map(
    lambda feature: feature,
    remove_columns=['example_id', 'offset_mapping'],
)
test_predictions = trainer.predict(test_feats_small)
# Re-apply the current format type with every feature column selected.
test_features.set_format(
    type=test_features.format["type"],
    columns=list(test_features.features.keys()),
)
final_test_predictions = get_predictions(
    test_dataset,
    test_features,
    test_predictions.predictions,
)

id_list = [{"id": idx} for idx in test_dataset['id']]
res = pd.DataFrame(id_list)
res['PredictionString'] = res['id'].map(lambda example_id: final_test_predictions[example_id])
res.to_csv('submission.csv', index=False)
res
