In [None]:
import os, gc, math, json, time, random, collections, sys
import multiprocessing as mp
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
gc.enable()

from string import punctuation
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SequentialSampler

from transformers import AdamW,AutoConfig, AutoTokenizer,AutoModelForQuestionAnswering, TrainingArguments, Trainer

In [None]:
# Passed to the tokenizer as `max_length`: size of every (question + context) window.
MAX_SEQ_LENGTH = 400
# Passed to the tokenizer as `stride` when a context overflows into several windows.
DOC_STRIDE = 135
# Batch size used by the inference DataLoader.
EVAL_BATCH_SIZE = 128

In [None]:
def create_features(example, tokenizer, max_length=None, stride=None):
    """Tokenize one question/context pair into one or more model-ready windows.

    The context is truncated (``truncation="only_second"``) and overflowing
    tokens are returned as additional windows, so a single example can yield
    several features.

    Args:
        example: Mapping (dict or pandas row) with ``id``, ``question`` and
            ``context`` entries. It is NOT modified.
        tokenizer: Callable tokenizer whose result supports
            ``result["input_ids"]``, ``result["attention_mask"]``,
            ``result["offset_mapping"]`` and ``result.sequence_ids(window)``.
        max_length: Window length; defaults to the module constant
            ``MAX_SEQ_LENGTH`` when omitted.
        stride: Value forwarded as the tokenizer's ``stride``; defaults to the
            module constant ``DOC_STRIDE`` when omitted.

    Returns:
        A list with one dict per window holding ``example_id``, ``context``,
        ``question`` (left-stripped), ``input_ids``, ``attention_mask``,
        ``offset_mapping`` and ``sequence_ids``.
    """
    if max_length is None:
        max_length = MAX_SEQ_LENGTH
    if stride is None:
        stride = DOC_STRIDE

    # Work on a local copy of the question so the caller's object is untouched.
    question = example["question"].lstrip()
    context = example["context"]

    tokenized = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for window_idx in range(len(tokenized["input_ids"])):
        # `sequence_ids` yields None for some positions; those are stored as 0
        # so the list contains only integers.
        sequence_ids = [
            0 if seq_id is None else seq_id
            for seq_id in tokenized.sequence_ids(window_idx)
        ]
        features.append(
            {
                "example_id": example['id'],
                'context': context,
                'question': question,
                'input_ids': tokenized['input_ids'][window_idx],
                'attention_mask': tokenized['attention_mask'][window_idx],
                'offset_mapping': tokenized['offset_mapping'][window_idx],
                'sequence_ids': sequence_ids,
            }
        )
    return features


In [None]:
class DataTransformer(Dataset):
    """Dataset view over a list of feature dicts.

    Each item exposes ``input_ids`` and ``attention_mask`` as ``torch.long``
    tensors and passes the remaining fields through unchanged.
    """

    # Fields converted to long tensors when an item is fetched.
    _TENSOR_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        record = self.data[item]
        sample = {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in self._TENSOR_FIELDS
        }
        # Non-tensor fields are forwarded as stored on the feature.
        sample['offset_mapping'] = record['offset_mapping']
        sample['sequence_ids'] = record['sequence_ids']
        sample['id'] = record['example_id']
        sample['context'] = record['context']
        sample['question'] = record['question']
        return sample

In [None]:
def tokenize_after_split(s):
    """Collapse every whitespace run in ``s`` to one space and trim both ends."""
    pieces = s.split()
    return ' '.join(pieces)

# Load the test split and normalise whitespace in both text columns.
test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
for text_column in ('context', 'question'):
    test[text_column] = test[text_column].apply(tokenize_after_split)

tokenizer = AutoTokenizer.from_pretrained('../input/renew-2')

# Flatten the per-row feature lists into a single list, preserving row order.
test_features = [
    feature
    for _, row in test.iterrows()
    for feature in create_features(row, tokenizer)
]

# Features are served in their stored order via a sequential sampler.
test_dataset = DataTransformer(test_features)
test_dataloader = DataLoader(
    dataset=test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=EVAL_BATCH_SIZE,
    pin_memory=True,
    num_workers=2,
)

In [None]:
def get_pred(checkpoint_path, model_dir='../input/renew-2', device=None):
    """Run one checkpoint over ``test_dataloader`` and return its raw logits.

    Args:
        checkpoint_path: Path to a state dict loadable with ``torch.load``.
        model_dir: Directory handed to
            ``AutoModelForQuestionAnswering.from_pretrained`` to build the
            model that receives the checkpoint weights.
        device: Device to run on. When omitted, CUDA is used if available,
            otherwise the CPU.

    Returns:
        ``(start_logits, end_logits)``: two float64 arrays with one row per
        feature, stacked in dataloader order.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = AutoModelForQuestionAnswering.from_pretrained(model_dir)
    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    # Loading onto the CPU first keeps this working on machines without the
    # device the checkpoint was saved from.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict)
    del state_dict
    model.to(device)
    # Make inference mode explicit so dropout is disabled.
    model.eval()

    start_logits, end_logits = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            start_logits.append(outputs.start_logits.cpu().numpy())
            end_logits.append(outputs.end_logits.cpu().numpy())

    # Free the model before the next checkpoint is loaded.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Cast to float64 to keep the dtype callers previously received.
    return (
        np.vstack(start_logits).astype(np.float64),
        np.vstack(end_logits).astype(np.float64),
    )

In [None]:
# Collect (start, end) logits from every checkpoint of the ensemble.
num_models = 3
per_model_logits = [
    get_pred(f'../input/renew-{2*(i+1)}/pytorch_model.bin')
    for i in range(num_models)
]
start_logits_list = [pair[0] for pair in per_model_logits]
end_logits_list = [pair[1] for pair in per_model_logits]

# Ensemble by taking the plain arithmetic mean of the logits.
start_logits = sum(start_logits_list) / num_models
end_logits = sum(end_logits_list) / num_models

In [None]:
def post_process(egs, features, raw_pred, n_best=20, max_answer_len=30):
    """Select the best answer span for every example from start/end logits.

    For each feature of an example, the ``n_best`` highest start and end
    positions are combined; a pair is kept only when both tokens belong to
    the context (``sequence_ids == 1``), the end does not precede the start
    and the span covers at most ``max_answer_len`` tokens. The highest scoring
    span (start logit + end logit) across all features of the example wins.

    Args:
        egs: DataFrame with ``id`` and ``context`` columns. Its index may be
            arbitrary; features are matched through the ``id`` value.
        features: Sequence of feature dicts providing ``example_id``,
            ``offset_mapping`` and ``sequence_ids``. It is NOT modified.
        raw_pred: ``(start_logits, end_logits)``, each indexable by feature
            position and yielding one score per token.
        n_best: How many top start and top end positions to combine.
        max_answer_len: Maximum number of tokens in an answer span.

    Returns:
        ``collections.OrderedDict`` mapping each example id to its predicted
        answer text, or ``""`` when no valid span exists.
    """
    s_logits, e_logits = raw_pred

    # Group feature positions by the id of the example they were cut from.
    features_by_example = collections.defaultdict(list)
    for feature_idx, feature in enumerate(features):
        features_by_example[feature["example_id"]].append(feature_idx)

    preds = collections.OrderedDict()

    for _, example in egs.iterrows():
        context = example["context"]
        best_text, best_score = "", None

        for feature_idx in features_by_example.get(example["id"], ()):
            feature = features[feature_idx]
            start_logits = s_logits[feature_idx]
            end_logits = e_logits[feature_idx]
            offsets = feature["offset_mapping"]
            sequence_ids = feature["sequence_ids"]

            def is_context_token(idx):
                # Only in-range tokens of the context (sequence id 1) that
                # still carry an offset can delimit an answer.
                return (
                    idx < len(offsets)
                    and idx < len(sequence_ids)
                    and sequence_ids[idx] == 1
                    and offsets[idx] is not None
                )

            # Highest-scoring positions first.
            start_indices = np.argsort(start_logits)[-1:-n_best - 1:-1].tolist()
            end_indices = np.argsort(end_logits)[-1:-n_best - 1:-1].tolist()

            for start_idx in start_indices:
                if not is_context_token(start_idx):
                    continue
                for end_idx in end_indices:
                    if not is_context_token(end_idx):
                        continue
                    if end_idx < start_idx:
                        continue
                    if end_idx - start_idx + 1 > max_answer_len:
                        continue

                    score = start_logits[start_idx] + end_logits[end_idx]
                    # `>=` lets the most recently seen candidate win a tie.
                    if best_score is None or score >= best_score:
                        best_score = score
                        best_text = context[offsets[start_idx][0]:offsets[end_idx][1]]

        preds[example["id"]] = best_text

    return preds

In [None]:
fin_preds = post_process(test, test_features, (start_logits, end_logits))

# Normalise whitespace, then trim ASCII punctuation from both ends of each answer.
submission = [
    (ID, " ".join(ans.split()).strip(punctuation))
    for ID, ans in fin_preds.items()
]

# Join the predictions onto the test rows by id and write the two submission columns.
submission_frame = pd.DataFrame(submission, columns=["id", "PredictionString"])
test_data = test.merge(submission_frame, on='id')
test_data[['id', 'PredictionString']].to_csv('submission.csv', index=False)

In [None]:
# Print the full contents of the submission file written by the previous cell.
!cat submission.csv