In [None]:
import json
import math
import numpy as np
import pandas as pd 
import random
import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Sampler, Dataset, DataLoader
import torch.nn.init as init
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction

from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModel
import shutil
import collections

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class config:
    """Static settings for this inference notebook: input paths, model options and decoding limits."""
    # PATH
    # Directory from which `main` reads test.csv and sample_submission.csv.
    INPUT_DIR = "/kaggle/input/chaii-hindi-and-tamil-question-answering"
    # Local directory used for the tokenizer, the transformer config and the base weights.
    MODEL_NAME = "/kaggle/input/xlm-roberta-squad2/deepset/xlm-roberta-base-squad2"
    # Per-language checkpoints; `main` loads each into its own ChaiiModel via load_state_dict.
    HINDI_MODEL_PATH = '/kaggle/input/chaii-train-with-divided-hindi-and-tamil/exp0016_hindi/chaii_f0_best_jaccard_model.bin'
    TAMIL_MODEL_PATH = '/kaggle/input/chaii-train-with-divided-hindi-and-tamil/exp0016_tamil/chaii_f0_best_jaccard_model.bin'
    # parameter
    # Default seed used by `set_seed`.
    SEED = 0
    # Values written into the transformer config by ChaiiModel.__init__.
    HIDDEN_DROPOUT_PROB = 0.1
    LAYER_NORM_EPS = 1e-5
    # tokenizer
    # Loaded once, when this class body executes.
    TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Not referenced anywhere in the visible notebook; presumably the attribute names
    # to skip when logging the config — TODO confirm. Note "OUTPUT_DIR" is listed
    # although this class defines no such attribute.
    NOT_WATCH_PARAM = ["NOT_WATCH_PARAM", "TOKENIZER", "INPUT_DIR", "OUTPUT_DIR"]
    # Passed to the tokenizer as max_length (features are padded to this length).
    MAX_LEN = 384
    # Passed to the tokenizer as stride for the overflowing context windows.
    STRIDE = 128
    # Largest allowed (end_index - start_index) token distance for a candidate answer.
    MAX_ANSWER_LEN = 30
    # sequence_ids value that post_processing treats as "context token".
    CONTENT_ID= 1
    # Number of highest-scoring start and end positions combined per feature.
    N_BEST = 20

In [None]:
def set_seed(seed=config.SEED):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Integer seed; defaults to ``config.SEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python, NumPy and PyTorch (CPU, current GPU, all GPUs) all get the same seed.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # cuDNN flags: request deterministic algorithms and turn the benchmark mode off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# cf: https://www.kaggle.com/thedrcat/chaii-eda-baseline
def encode_train_example(example):
    """Tokenize one question/context pair into fixed-length, overlapping features.

    The context is split into windows of ``config.MAX_LEN`` tokens with
    ``config.STRIDE`` tokens of overlap; only the context (second sequence)
    is ever truncated.

    Args:
        example: Mapping (for instance a DataFrame row) providing the
            ``'question'`` and ``'context'`` strings.

    Returns:
        The tokenizer output with ``'overflow_to_sample_mapping'`` removed and
        an added ``'sequence_ids'`` entry: a list holding, for each feature,
        the per-token sequence ids reported by the tokenizer.
    """
    question = example['question'].lstrip()
    context = example['context']

    tokenized_examples = config.TOKENIZER(
        question,
        context,
        truncation="only_second",
        max_length=config.MAX_LEN,
        stride=config.STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=True,
        padding="max_length",
    )

    # A single example is encoded per call, so the feature -> sample mapping
    # is not needed; it is still removed so the returned keys stay the same.
    tokenized_examples.pop("overflow_to_sample_mapping")

    n_features = len(tokenized_examples["input_ids"])
    tokenized_examples['sequence_ids'] = [
        tokenized_examples.sequence_ids(feature_index)
        for feature_index in range(n_features)
    ]

    return tokenized_examples


def extract_feature_example(df):
    """Encode every row of ``df`` and concatenate the resulting features.

    Args:
        df: DataFrame whose rows are passed to ``encode_train_example`` and
            which has an ``'id'`` column.

    Returns:
        Dict of parallel lists (one entry per feature) with the keys
        ``input_ids``, ``attention_mask``, ``token_type_ids``,
        ``offset_mapping``, ``sequence_ids`` and ``example_id``; the last one
        repeats the row id for each feature produced from that row.
    """
    copied_keys = (
        'input_ids',
        'attention_mask',
        'token_type_ids',
        'offset_mapping',
        'sequence_ids',
    )
    tokenized_examples = {key: [] for key in copied_keys}
    tokenized_examples['example_id'] = []

    for _, row in df.iterrows():
        encoded = encode_train_example(row)
        for key in copied_keys:
            tokenized_examples[key].extend(encoded[key])
        # One id per feature so features can be grouped back by example later.
        n_features = len(encoded['input_ids'])
        tokenized_examples['example_id'].extend([row['id']] * n_features)

    return tokenized_examples

In [None]:
class ChAIIDataset(Dataset):
    """Dataset exposing the tokenized features of a question/context DataFrame.

    Args:
        df: DataFrame handed to ``extract_feature_example``.
        train: When falsy, each item additionally carries ``'sequence_ids'``.
    """

    def __init__(self, df, train):
        self.feature_examples = extract_feature_example(df)
        self.train = train

    def __len__(self):
        """Return the number of features (not the number of DataFrame rows)."""
        return len(self.feature_examples['input_ids'])

    def __getitem__(self, item):
        """Return feature ``item`` as a dict of tensors plus its example id."""
        features = self.feature_examples
        tensor_keys = ('input_ids', 'attention_mask', 'token_type_ids', 'offset_mapping')

        sample = {key: torch.tensor(features[key][item]) for key in tensor_keys}
        sample['example_id'] = features['example_id'][item]
        if self.train:
            return sample

        # Kept as the raw per-token list (not converted to a tensor).
        sample['sequence_ids'] = features['sequence_ids'][item]
        return sample


In [None]:
def predict_test(models, dset, test_df):
    """Run the language-specific model on every feature of ``dset``.

    Args:
        models: Dict mapping a value of ``test_df['language']`` to the model
            used for examples of that language.
        dset: Iterable of feature dicts providing ``'example_id'``,
            ``'input_ids'`` and ``'attention_mask'`` (the latter two tensors).
        test_df: DataFrame with ``'id'`` and ``'language'`` columns.

    Returns:
        List with one ``(start_logits, end_logits)`` pair of CPU tensors per
        feature, in iteration order. Each feature is run as a batch of one.

    Raises:
        KeyError: If a feature's example id is missing from ``test_df`` or its
            language has no entry in ``models``.
    """
    # Build the id -> language lookup once instead of scanning the frame for
    # every feature. The first row wins for duplicated ids, like `.iloc[0]` did.
    language_by_id = (
        test_df.drop_duplicates(subset='id')
        .set_index('id')['language']
        .to_dict()
    )

    # Inference mode only needs to be switched on once per model.
    for model in models.values():
        model.eval()

    all_logits = []
    with torch.no_grad():
        for d in dset:
            model = models[language_by_id[d['example_id']]]
            # Relies on the module-level `device`; token_type_ids are not fed to the model.
            outputs = model(
                d['input_ids'].unsqueeze(0).to(device),
                d['attention_mask'].unsqueeze(0).to(device),
            )
            all_logits.append((outputs['start_logits'].cpu(), outputs['end_logits'].cpu()))
    return all_logits

In [None]:
def post_processing(all_logits, train_df, dset):
    """Convert per-feature start/end logits into one answer string per example.

    For each feature the ``config.N_BEST`` highest start and end positions are
    combined; spans that touch non-context tokens, run backwards or exceed
    ``config.MAX_ANSWER_LEN`` are discarded. The span with the highest
    ``start + end`` logit sum over all features of an example wins.

    Args:
        all_logits: Sequence aligned with the features of ``dset``; each item
            is a ``(start_logits, end_logits)`` pair of tensors that are
            indexed as ``[0, token_index]``.
        train_df: DataFrame with ``'id'`` and ``'context'`` columns.
        dset: Object whose ``feature_examples`` dict provides the parallel
            lists ``'example_id'``, ``'offset_mapping'`` and ``'sequence_ids'``.

    Returns:
        DataFrame with the columns ``predict_text``, ``score`` and ``id``, one
        row per example id in first-seen order. Examples without any valid
        span get an empty text and a score of 0.0.

    Raises:
        KeyError: If a feature's example id is missing from ``train_df``.
    """
    feature_store = dset.feature_examples

    # Group feature indices by example straight from the stored ids; iterating
    # the dataset itself would build tensors for every feature for no reason.
    features_per_example = collections.defaultdict(list)
    for feature_index, example_id in enumerate(feature_store['example_id']):
        features_per_example[example_id].append(feature_index)

    # One lookup table instead of one DataFrame scan per example. The first
    # row wins for duplicated ids, like `.iloc[0]` did.
    context_by_id = (
        train_df.drop_duplicates(subset='id')
        .set_index('id')['context']
        .to_dict()
    )

    predicts = []
    for example_id, feature_indices in features_per_example.items():
        context = context_by_id[example_id]
        predict_answers = []
        for feature_index in feature_indices:
            start_logits, end_logits = all_logits[feature_index]
            sequence_ids = feature_store["sequence_ids"][feature_index]
            # Blank out offsets of every token that is not part of the context.
            offset_mapping = [
                offsets if sequence_id == config.CONTENT_ID else None
                for sequence_id, offsets in zip(sequence_ids, feature_store["offset_mapping"][feature_index])
            ]
            n_tokens = len(offset_mapping)

            start_indexes = np.argsort(start_logits[0].numpy())[-config.N_BEST:]
            end_indexes = np.argsort(end_logits[0].numpy())[-config.N_BEST:]
            for start_index in start_indexes:
                # Range check first, so the lookup can never index out of bounds.
                if start_index >= n_tokens or offset_mapping[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if end_index >= n_tokens or offset_mapping[end_index] is None:
                        continue
                    if start_index > end_index or (end_index - start_index) > config.MAX_ANSWER_LEN:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    score = start_logits[0, start_index] + end_logits[0, end_index]

                    predict_answers.append({
                        'predict_text': context[start_char:end_char],
                        'score': score.item()
                    })

        if predict_answers:
            # max() returns the first of equally scored candidates, the same
            # one a stable descending sort would put first.
            best_answer = max(predict_answers, key=lambda answer: answer["score"])
        else:
            best_answer = {"predict_text": "", "score": 0.0}

        best_answer['id'] = example_id
        predicts.append(best_answer)

    # Explicit columns keep the frame usable even when there are no examples.
    predict_df = pd.DataFrame(predicts, columns=['predict_text', 'score', 'id'])
    return predict_df

In [None]:
class ChaiiModel(nn.Module):
    """Transformer encoder followed by a linear layer that scores answer spans.

    The encoder and its configuration are loaded from ``config.MODEL_NAME``;
    the linear head maps every token's hidden state to a start and an end logit.
    """

    def __init__(self):
        super().__init__()
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": config.HIDDEN_DROPOUT_PROB,
            "layer_norm_eps": config.LAYER_NORM_EPS,
            "add_pooling_layer": False,
        }
        self.transformer_config = AutoConfig.from_pretrained(config.MODEL_NAME)
        self.transformer_config.update(overrides)
        self.transformer = AutoModel.from_pretrained(
            config.MODEL_NAME,
            config=self.transformer_config,
        )
        # Two outputs per token: index 0 is the start logit, index 1 the end logit.
        self.qa_outputs = nn.Linear(self.transformer_config.hidden_size, 2)
        self.__init_weights(self.qa_outputs)

    def __init_weights(self, module):
        """Initialise a linear layer: normal weights (std from the config), zero bias."""
        if not isinstance(module, nn.Linear):
            return
        module.weight.data.normal_(mean=0.0, std=self.transformer_config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids, attention_mask, token_type_ids=None, start_positions=None, end_positions=None):
        """Compute start/end logits and, when both label tensors are given, the loss.

        ``token_type_ids`` is accepted but not passed on to the encoder.

        Returns:
            Dict with ``'start_logits'``, ``'end_logits'`` and ``'loss'``
            (``None`` unless both ``start_positions`` and ``end_positions``
            are provided).
        """
        encoder_out = self.transformer(input_ids, attention_mask)
        token_logits = self.qa_outputs(encoder_out['last_hidden_state'])
        # Split the trailing axis of size two into the two per-token score tensors.
        start_logits, end_logits = token_logits.unbind(dim=-1)

        has_labels = start_positions is not None and end_positions is not None
        loss = (
            self.loss_fn(start_logits, end_logits, start_positions, end_positions)
            if has_labels
            else None
        )

        return {
            'start_logits': start_logits,
            'end_logits': end_logits,
            'loss': loss,
        }

    def loss_fn(self, start_logits, end_logits, start_positions, end_positions):
        """Return the mean of the start and end cross-entropy losses.

        Positions are clamped to ``[0, seq_len]``; a position equal to
        ``seq_len`` is the ignore index and does not contribute to the loss.
        """
        seq_len = start_logits.size(1)
        criterion = nn.CrossEntropyLoss(ignore_index=seq_len)

        start_loss = criterion(start_logits, start_positions.clamp(0, seq_len))
        end_loss = criterion(end_logits, end_positions.clamp(0, seq_len))
        return (start_loss + end_loss) / 2

In [None]:
def main():
    """Predict answers for the test set and write them to ``submission.csv``.

    Steps: read the test and sample-submission CSVs from ``config.INPUT_DIR``,
    tokenize the test set, load one fine-tuned model per language, run
    inference, pick the best span per example and save the result using the
    column names of the sample submission.
    """
    test_df = pd.read_csv(f'{config.INPUT_DIR}/test.csv')
    sub_df = pd.read_csv(f'{config.INPUT_DIR}/sample_submission.csv')

    test_dataset = ChAIIDataset(test_df, train=False)

    # Language -> checkpoint table; keys must match the values that
    # predict_test looks up from test_df['language'].
    checkpoint_paths = {
        "hindi": config.HINDI_MODEL_PATH,
        "tamil": config.TAMIL_MODEL_PATH,
    }
    models = {}
    for lang, checkpoint_path in checkpoint_paths.items():
        model = ChaiiModel()
        # NOTE(review): torch.load unpickles the file, so only point it at
        # trusted checkpoints. Weights are loaded on the CPU first and moved afterwards.
        state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        model.load_state_dict(state_dict)
        model.to(device)
        models[lang] = model

    set_seed()
    all_logits = predict_test(models, test_dataset, test_df)
    predict_df = post_processing(all_logits, test_df, test_dataset)

    # The merge attaches the test_df columns; only id and prediction are kept below.
    result_df = predict_df.merge(test_df, how='left', on='id')

    result_df = result_df[['id', 'predict_text']]
    result_df.columns = sub_df.columns

    result_df.to_csv('submission.csv', index=False)

In [None]:
# Run the whole inference pipeline when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

In [None]:
# Sanity check: print the first lines of the submission file written by main().
!head submission.csv