### This notebook is basically the same as @rhtsingh's [notebook](https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-infer) but it uses RemBERT instead of XLM-R

In the future, I'll probably add more folds to see how good the ensemble will be.

If you don't know what RemBERT is, see my discussion [here](https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering/discussion/267827#1489286)

fold | data | reinit | cv | public lb 
-----|-------|-------|--- | ---
0 | chaii              | 0 | |0.756  
0 | e2 chaii              | 0 | 0.674 | 0.761  
1 | e2 chaii | 0 | 0.679 | 0.752
2 | e2 chaii | 0 | 0.647 | 0.751
3 | e2 chaii | 0 | 0.714 | 0.747
1 | chaii              | 1 | |0.726  
2 | chaii              | 1 | |0.723
3 | chaii              | 1 | |0.731
0 | chaii, mlqa, xquad | 0 | |0.739   
1 | chaii, mlqa, xquad | 0 | |0.735
2 | chaii, mlqa, xquad | 0 | |0.736
3 | chaii, mlqa, xquad | 0 | |0.743

In [None]:
# RemBERT is a recent addition to transformers, so the library's master branch is installed
# from the local ../input/transformers-master/ directory. `--no-deps` keeps pip from trying
# to resolve or fetch dependencies, and `-qq` silences pip's output.
!pip install -U --no-build-isolation --no-deps ../input/transformers-master/ -qq

In [None]:
import os
import gc
gc.enable()
import random
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import (
    Dataset, DataLoader,
    SequentialSampler
)
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    logging,
)
# Only report errors from transformers; a single call is enough because each
# set_verbosity_* call replaces the previously configured level.
logging.set_verbosity_error()

def fix_all_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator, the current CUDA device and all CUDA devices), and
    stores the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch: default generator first, then the CUDA generators.
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)

### Configuration

In [None]:
class CFG:
    """Static inference settings, read as class attributes (never instantiated here)."""

    # input data
    test_file = "../input/chaii-hindi-and-tamil-question-answering/test.csv"

    # model weights, model config and tokenizer all point at the same checkpoint directory
    model_name_or_path = config_name = tokenizer_name = (
        "../input/rembert-e2-f3/output/checkpoint-fold-3"
    )

    # tokenization window: maximum tokens per feature and the stride passed to the tokenizer
    max_seq_length, doc_stride = 384, 128

    # inference batch size and random seed
    eval_batch_size, seed = 32, 2021

### Dataset Retriever

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset over a list of pre-tokenized feature dicts.

    Each feature dict must provide the keys ``input_ids``, ``attention_mask``,
    ``offset_mapping``, ``sequence_ids``, ``example_id``, ``context`` and
    ``question``.
    """

    def __init__(self, features):
        super().__init__()
        self.features = features

    def __len__(self):
        """Return the number of features (not the number of examples)."""
        return len(self.features)

    def __getitem__(self, item):
        """Return one feature; the two model inputs are converted to ``torch.long`` tensors."""
        record = self.features[item]
        sample = {
            name: torch.tensor(record[name], dtype=torch.long)
            for name in ('input_ids', 'attention_mask')
        }
        # Everything else is passed through untouched.
        for name in ('offset_mapping', 'sequence_ids'):
            sample[name] = record[name]
        sample['id'] = record['example_id']  # exposed under the shorter key 'id'
        sample['context'] = record['context']
        sample['question'] = record['question']
        return sample

### Model

In [None]:
class Model(nn.Module):
    """Extractive-QA model: a pretrained transformer backbone plus a linear span head.

    The head maps every token's hidden state to two logits (answer start and
    answer end).

    Args:
        modelname_or_path: Name or directory passed to ``AutoModel.from_pretrained``.
        config: Model config; must expose ``hidden_size``, ``hidden_dropout_prob``
            and ``initializer_range``.
    """

    def __init__(self, modelname_or_path, config):
        super().__init__()
        self.config = config
        self.model = AutoModel.from_pretrained(modelname_or_path, config=config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        # Not applied in forward(); the attribute is kept so the module's public
        # attributes stay the same (nn.Dropout holds no parameters).
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)

    def _init_weights(self, module):
        """Initialise a linear layer: N(0, initializer_range) weights and zero bias."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        attention_mask=None,
    ):
        """Compute per-token start and end logits.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Optional attention mask forwarded to the backbone.

        Returns:
            Tuple ``(start_logits, end_logits)``; each is the head output with
            its trailing size-1 dimension squeezed away.
        """
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
        )

        # Only the per-token hidden states (first output) are needed. The pooled
        # output is deliberately not read, so backbones that do not return one
        # work as well.
        sequence_output = outputs[0]
        qa_logits = self.qa_outputs(sequence_output)

        start_logits, end_logits = qa_logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)

### Utilities

In [None]:
def make_model(path):
    """Build the config, tokenizer and QA model for one checkpoint location.

    Args:
        path: Name or directory handed to ``AutoConfig``, ``AutoTokenizer`` and ``Model``.

    Returns:
        Tuple ``(config, tokenizer, model)``.
    """
    model_config = AutoConfig.from_pretrained(path)
    return (
        model_config,
        AutoTokenizer.from_pretrained(path, config=model_config),
        Model(path, config=model_config),
    )

### Convert Examples to Features (Preprocess)

In [None]:
def prepare_test_features(args, example, tokenizer):
    """Tokenize one question/context pair into one or more fixed-length features.

    The tokenizer is called with ``truncation="only_second"``, ``stride`` and
    ``return_overflowing_tokens=True``, so it may return several windows for a
    single example; one feature dict is produced per returned window.

    Args:
        args: Object exposing ``max_seq_length`` and ``doc_stride``.
        example: Mapping with the keys ``id``, ``question`` and ``context``.
            It is not modified.
        tokenizer: Callable tokenizer whose result supports ``["input_ids"]``,
            ``["attention_mask"]``, ``["offset_mapping"]`` and ``.sequence_ids(i)``.

    Returns:
        List of feature dicts with the keys ``example_id``, ``context``,
        ``question`` (leading whitespace stripped), ``input_ids``,
        ``attention_mask``, ``offset_mapping`` and ``sequence_ids``.
    """
    # Work on local values instead of writing the stripped question back into
    # the caller's example.
    question = example["question"].lstrip()
    context = example["context"]

    encoding = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=args.max_seq_length,
        stride=args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for window in range(len(encoding["input_ids"])):
        features.append({
            "example_id": example["id"],
            "context": context,
            "question": question,
            "input_ids": encoding["input_ids"][window],
            "attention_mask": encoding["attention_mask"][window],
            "offset_mapping": encoding["offset_mapping"][window],
            # Positions with no sequence id (None) are stored as 0, so only
            # positions marked 1 are distinguishable afterwards.
            "sequence_ids": [
                0 if seq_id is None else seq_id
                for seq_id in encoding.sequence_ids(window)
            ],
        })
    return features

### Postprocess QA Predictions

In [None]:
import collections

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn per-feature start/end logits into one answer string per example.

    For every feature of an example, the ``n_best_size`` highest start and end
    positions are combined; a pair is kept when both positions lie in the
    context (``sequence_ids == 1``), the end is not before the start and the
    span covers at most ``max_answer_length`` tokens. The kept span with the
    highest ``start_logit + end_logit`` across all of the example's features
    is cut out of the context. No null-answer score is computed.

    Args:
        examples: DataFrame with the columns ``id`` and ``context``.
        features: Sequence of feature dicts with the keys ``example_id``,
            ``offset_mapping`` and ``sequence_ids``. The dicts are not modified.
        raw_predictions: Tuple ``(all_start_logits, all_end_logits)``, each
            indexable by feature position and yielding that feature's logits.
        n_best_size: Number of top start and top end positions to combine.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        ``collections.OrderedDict`` mapping example id to the predicted answer
        text (``""`` when no valid span exists).
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature positions by the positional index of their example.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    context_index = 1  # sequence id that marks context tokens

    # enumerate() gives the positional index used by example_id_to_index, so the
    # lookup does not depend on the DataFrame having a default RangeIndex.
    for example_index, (_, example) in enumerate(examples.iterrows()):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            feature = features[feature_index]
            sequence_ids = feature["sequence_ids"]

            # Local masked copy: offsets of non-context positions become None.
            offset_mapping = [
                offsets if sequence_ids[k] == context_index else None
                for k, offsets in enumerate(feature["offset_mapping"])
            ]

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the feature or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first of equally scored candidates, the same
            # choice a stable descending sort would put first.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions

### Data Factory

In [None]:
# Module-level tokenizer, loaded from the location named in CFG.
tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name)

def get_test_data():
    """Load the test CSV and turn it into features and a sequential DataLoader.

    Reads ``CFG.test_file``, converts every row into features with
    ``prepare_test_features`` (using the module-level ``tokenizer``) and wraps
    the features in a ``DatasetRetriever``.

    Returns:
        Tuple ``(test_df, test_features, dataloader)``.
    """
    test_df = pd.read_csv(CFG.test_file)

    # One row can expand into several features, hence extend rather than append.
    test_features = []
    for _, example in test_df.iterrows():
        test_features.extend(prepare_test_features(CFG, example, tokenizer))

    test_dataset = DatasetRetriever(test_features)
    loader_options = dict(
        batch_size=CFG.eval_batch_size,
        sampler=SequentialSampler(test_dataset),  # keep feature order
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )
    return test_df, test_features, DataLoader(test_dataset, **loader_options)

test_df, test_features, test_dataloader = get_test_data()

### Initialize Inference

In [None]:
def get_predictions(checkpoint_path):
    """Run one checkpoint over the module-level ``test_dataloader``.

    Builds the model with ``make_model``, loads the weights stored in
    ``<checkpoint_path>/pytorch_model.bin`` and collects start/end logits for
    every batch. Requires a CUDA device.

    Args:
        checkpoint_path: Directory containing the model files and ``pytorch_model.bin``.

    Returns:
        Tuple ``(start_logits, end_logits)`` of NumPy arrays, one row per feature
        in dataloader order.
    """
    print("Getting predictions for", checkpoint_path)
    _, _, model = make_model(checkpoint_path)
    model.cuda()
    # Deserialize on the CPU; load_state_dict copies the values into the
    # parameters that already live on the GPU, so the checkpoint is not held in
    # GPU memory a second time.
    model.load_state_dict(
        torch.load(checkpoint_path + '/pytorch_model.bin', map_location='cpu')
    )
    # Make inference mode explicit (disables dropout in every submodule)
    # instead of relying on how the backbone was constructed.
    model.eval()

    start_logits = []
    end_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            o1, o2 = model(batch['input_ids'].cuda(), batch['attention_mask'].cuda())
            start_logits.append(o1.cpu().numpy().tolist())
            end_logits.append(o2.cpu().numpy().tolist())

    # Release the model before the next fold is loaded.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return np.vstack(start_logits), np.vstack(end_logits)

### Ensemble Folds

In [None]:
# Ensemble of four fold checkpoints: each one is run over the test set and the
# logits are averaged. Fold 0 is stored under a differently named input
# directory than folds 1-3.
fold_checkpoints = ["../input/chaii-qa-rembert-e2-f0-chaii/output/checkpoint-fold-0"]
fold_checkpoints += [
    f"../input/rembert-e2-f{k}/output/checkpoint-fold-{k}" for k in range(1, 4)
]

all_start_logits, all_end_logits = [], []
for path in fold_checkpoints:
    fold_start, fold_end = get_predictions(path)
    all_start_logits.append(fold_start)
    all_end_logits.append(fold_end)

# Average over the fold axis.
start_logits = np.mean(np.array(all_start_logits), axis=0)
end_logits = np.mean(np.array(all_end_logits), axis=0)

In [None]:
# Decode the averaged logits into one answer string per example id and attach
# the answers to the test frame.
ensemble_logits = (start_logits, end_logits)
predictions = postprocess_qa_predictions(test_df, test_features, ensemble_logits)
test_df["PredictionString"] = test_df["id"].map(predictions)

## Post-process

Using some rules I made after exploring the training set answers here: https://www.kaggle.com/nbroad/chaii-qa-character-token-languages-eda

In [None]:
# Earlier, broader candidate lists (kept for reference, not used):
# bad_starts = [" ", "\n", "\t", ".",  ")",  "]", "-", "–",  ",", ";", "@", "#", "?", "!", "^", "&", "*"]
# bad_endings = [" ", "\n", "\t", "-", "(", "[", "–", ",", ";", "@", "#", "?", "!", "$", "%", "^", "&", "*"]

# Characters stripped from the beginning / end of every prediction.
bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
bad_endings = ["-", "(", ")", "–", ",", ";"]

# Tamil and Hindi abbreviations after which a trailing "." is restored (see below).
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"

leading_junk = "".join(bad_starts)
trailing_junk = "".join(bad_endings)
abbreviation_suffixes = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)


def _clean_prediction(pred, context):
    """Apply the rule-based clean-up to one predicted answer."""
    if pred == "":
        return pred

    # One training answer ends with "..."; that looks like an annotator mistake, so the
    # ellipsis is dropped (whether this changes the score has not been checked).
    # Details: https://www.kaggle.com/nbroad/chaii-qa-character-token-languages-eda
    if pred.endswith("..."):
        pred = pred[:-3]

    pred = pred.lstrip(leading_junk).rstrip(trailing_junk)

    # If the answer ends in one of the abbreviations and the context contains it with a
    # following ".", put the "." back.
    if pred.endswith(abbreviation_suffixes) and pred + "." in context:
        pred = pred + "."

    return pred


cleaned_preds = [
    _clean_prediction(pred, context)
    for pred, context in test_df[["PredictionString", "context"]].to_numpy()
]

test_df["PredictionString"] = cleaned_preds

In [None]:
# Write the two submission columns to disk and echo them.
submission = test_df[['id', 'PredictionString']]
submission.to_csv('submission.csv', index=False)

print(submission)