In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import transformers
import torch
import random
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import time

In [None]:
# Choose the compute device once; later cells read the module-level `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
def fix_random_seed(seed):
    """Seed every random number generator the notebook relies on.

    Seeds, in order: Python's `random`, NumPy's global RNG, PyTorch's CPU RNG
    and PyTorch's RNGs on all CUDA devices.

    Args:
        seed: integer seed handed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)


fix_random_seed(10)

In [None]:
class config:
    """Hyper-parameters and paths shared by the cells of this notebook."""

    # --- model -----------------------------------------------------------
    # Checkpoint directory handed to `from_pretrained` for model and tokenizer.
    plm = '../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'

    # --- tokenization ----------------------------------------------------
    # `max_length` and `stride` are passed to the tokenizer call as-is.
    max_length = 256
    stride = 128

    # --- data ------------------------------------------------------------
    # Fraction of the shuffled rows used for training; the rest is validation.
    split_ratio = 0.9

    # --- optimisation ----------------------------------------------------
    batch_size = 8
    lr = 1e-5
    epochs = 2


CONFIG = config()

In [None]:
# The following two files were pre-processed by @Rohit Singh
# (https://www.kaggle.com/rhtsingh/external-data-mlqa-xquad-preprocessing)
xquad_df = pd.read_csv('../input/mlqa-hindi-processed/xquad.csv')
mlqa_df = pd.read_csv('../input/mlqa-hindi-processed/mlqa_hindi.csv')

chaii_df = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` stacks the same three frames in the same order.
# `sample(frac=1)` shuffles every row. With no `random_state` it draws from
# NumPy's global RNG, which `fix_random_seed` seeded in an earlier cell.
initial_data_df = pd.concat([chaii_df, mlqa_df, xquad_df]).sample(frac=1)

split_idx = int(len(initial_data_df.index) * CONFIG.split_ratio)

# Copy the slices: a later cell adds a column to each split, and that should
# not write into (or warn about) a slice of `initial_data_df`.
initial_train_df = initial_data_df.iloc[:split_idx].copy()
initial_valid_df = initial_data_df.iloc[split_idx:].copy()

In [None]:
def add_answer_end_col(df):
    """Add an `answer_end` column (`answer_start + len(answer_text)`) to `df`.

    The frame is modified in place and also returned, so both
    `df = add_answer_end_col(df)` and a bare call work.

    The column is inserted at position 5 when the frame has at least five
    columns, otherwise it is appended at the end. If `answer_end` already
    exists (e.g. the cell is run a second time) its values are recomputed in
    place instead of raising "cannot insert answer_end, already exists".

    Args:
        df: DataFrame with `answer_start` (number) and `answer_text` (str)
            columns.

    Returns:
        The same DataFrame object, now carrying `answer_end`.
    """
    # A plain list is assigned positionally, so duplicate index labels (the
    # frame is built by stacking several CSVs) cannot cause misalignment.
    answer_ends = [
        start + len(text)
        for start, text in zip(df['answer_start'], df['answer_text'])
    ]

    if 'answer_end' in df.columns:
        df['answer_end'] = answer_ends
    else:
        position = min(5, len(df.columns))
        df.insert(loc=position, column="answer_end", value=answer_ends)

    return df

# Attach the `answer_end` column to both splits (train first, then validation).
initial_train_df, initial_valid_df = (
    add_answer_end_col(split_df)
    for split_df in (initial_train_df, initial_valid_df)
)

In [None]:
# Preview the first rows instead of rendering the whole training frame.
initial_train_df.head()

In [None]:
def get_model(plm):
    """Load a question-answering model from `plm` and place it on `device`.

    Args:
        plm: checkpoint name or path given to `from_pretrained`.

    Returns:
        The loaded model, already moved to the module-level `device`.
    """
    qa_model = transformers.AutoModelForQuestionAnswering.from_pretrained(plm)
    qa_model.to(device)
    return qa_model

def get_tokenizer(plm):
    """Return the tokenizer loaded from the checkpoint name or path `plm`."""
    return transformers.AutoTokenizer.from_pretrained(plm)

In [None]:
def get_features(df_row, tokenizer, max_length=None, stride=None):
    """Turn one question/context row into one training feature per token window.

    The context is tokenized together with the question. Contexts that do not
    fit in `max_length` tokens are split into overlapping windows
    (`return_overflowing_tokens=True`), and each window becomes one feature.

    For every window the character span `[answer_start, answer_end)` is mapped
    to token positions using the tokenizer's offsets. If the window does not
    contain both the first and the last character of the answer, both
    positions point at the CLS token, marking the window as "no answer here".

    Args:
        df_row: mapping with `question`, `context`, `answer_start` and
            `answer_end` entries (a DataFrame row or a plain dict).
        tokenizer: fast tokenizer; its result must expose `.encodings` and
            the tokenizer must expose `cls_token_id`.
        max_length: window size in tokens; defaults to `CONFIG.max_length`.
        stride: overlap between windows; defaults to `CONFIG.stride`.

    Returns:
        List of dicts with keys `ids`, `attention_mask`, `offset`, `token`,
        `start_position` and `end_position`.
    """
    if max_length is None:
        max_length = CONFIG.max_length
    if stride is None:
        stride = CONFIG.stride

    batch_encoding = tokenizer(
        df_row['question'],
        df_row['context'],
        truncation='only_second', padding="max_length", max_length=max_length,
        stride=stride, return_overflowing_tokens=True, return_offsets_mapping=True,
        return_token_type_ids=True,
    )

    answer_start = df_row['answer_start']
    answer_end = df_row['answer_end']

    features = []
    for encoding in batch_encoding.encodings:
        # Read these once per window instead of once per loop iteration.
        offsets = encoding.offsets
        sequence_ids = encoding.sequence_ids
        class_index = encoding.ids.index(tokenizer.cls_token_id)

        # First token that belongs to the context (sequence id 1). This is
        # None when the window has no context token; the original code then
        # hit an unbound/stale variable instead.
        context_start_idx = next(
            (j for j, seq_id in enumerate(sequence_ids) if seq_id == 1),
            None,
        )

        start_position = end_position = None
        if context_start_idx is not None:
            for j in range(context_start_idx, len(offsets)):
                token_start, token_end = offsets[j]
                if token_start <= answer_start < token_end:
                    start_position = j
                if token_start < answer_end <= token_end:
                    end_position = j
                    break

        # A window is only a positive example if it holds the whole answer.
        if start_position is None or end_position is None:
            start_position = end_position = class_index

        features.append({
            'ids': encoding.ids,
            'attention_mask': encoding.attention_mask,
            'offset': offsets,
            'token': encoding.tokens,
            'start_position': start_position,
            'end_position': end_position,
        })
    return features

tokenizer = get_tokenizer(CONFIG.plm)

# One row can produce several windows, so flatten the per-row feature lists.
train_features = [
    feature
    for _, train_row in initial_train_df.iterrows()
    for feature in get_features(train_row, tokenizer)
]

valid_features = [
    feature
    for _, valid_row in initial_valid_df.iterrows()
    for feature in get_features(valid_row, tokenizer)
]

In [None]:
#print (train_features[0])

In [None]:
# Batches are drawn through a RandomSampler, so the feature order is
# reshuffled on every pass over the loader.
def get_dataloader(features, batch_size):
    """Pack feature dicts into a shuffling `DataLoader`.

    Args:
        features: list of dicts with `ids`, `attention_mask`,
            `start_position` and `end_position` entries; `ids` and
            `attention_mask` must have the same length in every feature.
        batch_size: number of features per batch.

    Returns:
        DataLoader yielding `(ids, attention_mask, start_position,
        end_position)` tensor tuples in random order.
    """
    ids = torch.tensor([feature['ids'] for feature in features])
    attention_masks = torch.tensor([feature['attention_mask'] for feature in features])
    start_positions = torch.tensor([feature['start_position'] for feature in features])
    end_positions = torch.tensor([feature['end_position'] for feature in features])

    data = TensorDataset(ids, attention_masks, start_positions, end_positions)
    return DataLoader(data, sampler=RandomSampler(data), batch_size=batch_size)

In [None]:
def get_optimizer_scheduler(model, dataloader, epochs, lr, warmup_ratio=0.2):
    """Create the AdamW optimizer and a linear warmup/decay LR schedule.

    Args:
        model: model whose `parameters()` are optimized.
        dataloader: training loader; `len(dataloader)` is the number of
            optimizer steps per epoch.
        epochs: number of passes over `dataloader`.
        lr: peak learning rate.
        warmup_ratio: fraction of all steps spent warming up (default 0.2,
            the value that was previously hard-coded).

    Returns:
        `(optimizer, scheduler)`; the scheduler is meant to be stepped once
        per batch.
    """
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    # Training steps is no_of_batches * no_of_epochs
    total_steps = len(dataloader) * epochs
    # The scheduler documents `num_warmup_steps` as an int, so round the
    # fractional product down instead of handing over a float.
    warmup_steps = int(total_steps * warmup_ratio)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    return optimizer, scheduler

In [None]:
from torch.utils.data import SequentialSampler
import operator

def _best_positive_index(scores, positions):
    """Return the first position with the highest score above 0.0, or None."""
    best_idx = None
    best_score = 0.0
    for j in positions:
        if scores[j] > best_score:
            best_idx = j
            best_score = scores[j]
    return best_idx


def predict(device, model, tokenizer, question, context, max_length=None, stride=None, batch_size=4):
    """Extract the best answer span for `question` from `context`.

    The context is tokenized into overlapping windows. For each window the
    start token with the highest start logit is chosen, then the end token
    (at or after the start) with the highest end logit. As in the original
    implementation, a logit must be strictly greater than 0.0 to be
    selected; windows without such a start or end yield no candidate.

    Only context tokens (sequence id 1) are eligible. Special and padding
    tokens are excluded, so they can no longer be picked as span boundaries.

    Args:
        device: torch device the input tensors are moved to.
        model: question-answering model returning `start_logits` and
            `end_logits`.
        tokenizer: fast tokenizer supporting offsets and overflowing tokens.
        question: question text.
        context: passage the answer is extracted from.
        max_length: window size in tokens; defaults to `CONFIG.max_length`.
        stride: overlap between windows; defaults to `CONFIG.stride`.
        batch_size: number of windows per forward pass (default 4).

    Returns:
        `(answer, candidates, [])`. `answer` is the stripped text of the best
        candidate, or `""` if there is none. `candidates` is a list of
        `(start_char, end_char, start_score, end_score)` tuples sorted by
        `start_score + end_score`, best first. The third element is always an
        empty list, kept for backward compatibility.
    """
    if max_length is None:
        max_length = CONFIG.max_length
    if stride is None:
        stride = CONFIG.stride

    inputs = tokenizer(
        question,
        context,
        truncation='only_second', padding="max_length", max_length=max_length,
        stride=stride, return_overflowing_tokens=True, return_offsets_mapping=True,
        return_token_type_ids=True,
    )

    all_input_ids = inputs.input_ids
    all_attention_masks = inputs.attention_mask
    all_offsets = inputs.offset_mapping

    answer_list = []
    # Inference only: do not build autograd graphs for the forward passes.
    with torch.no_grad():
        for first in range(0, len(all_input_ids), batch_size):
            last = first + batch_size
            outputs = model(
                torch.tensor(all_input_ids[first:last]).to(device),
                attention_mask=torch.tensor(all_attention_masks[first:last]).to(device),
            )
            # One device-to-host copy per batch instead of one per comparison.
            start_logits = outputs.start_logits.detach().cpu().tolist()
            end_logits = outputs.end_logits.detach().cpu().tolist()

            for row, (start_scores, end_scores) in enumerate(zip(start_logits, end_logits)):
                window = first + row
                offsets = all_offsets[window]
                context_positions = [
                    j for j, seq_id in enumerate(inputs.sequence_ids(window)) if seq_id == 1
                ]

                start_idx = _best_positive_index(start_scores, context_positions)
                if start_idx is None:
                    continue
                # The end token may not precede the start token.
                end_idx = _best_positive_index(
                    end_scores, [j for j in context_positions if j >= start_idx]
                )
                if end_idx is None:
                    continue

                answer_list.append((
                    offsets[start_idx][0], offsets[end_idx][1],
                    start_scores[start_idx], end_scores[end_idx],
                ))

    sorted_answer_list = sorted(answer_list, key=lambda cand: cand[2] + cand[3], reverse=True)

    if len(sorted_answer_list) != 0:
        best = sorted_answer_list[0]
        return context[best[0]:best[1]].strip(), sorted_answer_list, []
    else:
        return "", [], []

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    `|A ∩ B| / |A ∪ B|` over the resulting word sets.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        Float in `[0.0, 1.0]`. When neither string contains a word the two
        (empty) sets are identical, so 1.0 is returned instead of raising
        ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size

def score(model, valid_df):
    """Mean Jaccard score of the model's predictions over `valid_df`.

    Each row's `question` and `context` are fed to `predict` (using the
    module-level `device` and `tokenizer`), and the predicted answer is
    compared with the row's `answer_text` via `jaccard`.

    Args:
        model: question-answering model; the caller is expected to have put
            it in eval mode.
        valid_df: DataFrame with `context`, `question` and `answer_text`
            columns.

    Returns:
        Average Jaccard score as a float; 0.0 for an empty frame (instead of
        raising ZeroDivisionError).
    """
    row_count = len(valid_df)
    if row_count == 0:
        return 0.0

    jaccard_sum = 0.0
    # Scoring never needs gradients.
    with torch.no_grad():
        for _, row in valid_df.iterrows():
            answer_result, _, _ = predict(device, model, tokenizer, row['question'], row['context'])
            jaccard_sum += jaccard(answer_result, row['answer_text'])
    return jaccard_sum / row_count

In [None]:
def train(parameters):
    """Fine-tune a fresh model on `train_features` and validate after each epoch.

    After every epoch the model is scored on `initial_valid_df` and saved to
    `./model/<model_id>/<epoch>`, where `model_id` is the Unix time at which
    training started.

    Args:
        parameters: mapping with a required `lr` entry and optional
            `batch_size` and `epochs` entries. Missing optional entries fall
            back to `CONFIG.batch_size` / `CONFIG.epochs`. Previously the
            optional entries were ignored and CONFIG was always used.

    Returns:
        The trained model, left in eval mode.
    """
    print (parameters)
    lr = parameters['lr']
    batch_size = parameters.get('batch_size', CONFIG.batch_size)
    epochs = parameters.get('epochs', CONFIG.epochs)

    train_dataloader = get_dataloader(train_features, batch_size)
    print ('train_loader size: {}'.format(len(train_dataloader)))
    model = get_model(CONFIG.plm)
    optimizer, scheduler = get_optimizer_scheduler(model, train_dataloader, epochs, lr)

    train_check_every = 200  # print the running loss every this many batches
    model_id = int(time.time())
    print ('model_id: {}'.format(model_id))
    for epoch in range(epochs):
        batch_count = 0
        epoch_loss = 0.0
        model.train()
        for i, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            start_positions = batch[2].to(device)
            end_positions = batch[3].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs[0]

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Plain float: cheap to accumulate and prints as a number, not a tensor repr.
            batch_loss = loss.item()
            batch_count += 1
            epoch_loss += batch_loss
            if i % train_check_every == 0 and i != 0:
                print ('epoch: {}, batch: {}, loss: {}'.format(epoch, i, batch_loss))
        print ('-----------------------------------------------------')
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print ('epoch: {}, epoch avrage loss: {}'.format(epoch, epoch_loss / max(batch_count, 1)))

        model.eval()
        # Validation needs no gradients.
        with torch.no_grad():
            valid_score = score(model, initial_valid_df)
        print ("ecpoch: {}, Jaccard Score: {}".format(epoch, valid_score))
        model_path = './model/{}/{}'.format(model_id, epoch)
        model.save_pretrained(model_path)
        print ('Save epoch model: {}'.format(model_path))

        print ('-----------------------------------------------------')

    return model

In [None]:
# Hyper-parameters for this run, taken from the shared CONFIG.
params = dict(
    batch_size=CONFIG.batch_size,
    lr=CONFIG.lr,
    epochs=CONFIG.epochs,
)
model = train(params)

In [None]:
test_df = pd.read_csv('/kaggle/input/chaii-hindi-and-tamil-question-answering/test.csv')
model.eval()
# The answers are Hindi/Tamil text, so write UTF-8 explicitly rather than
# relying on the platform's default encoding. Inference needs no gradients.
with open('submission.csv', 'w', encoding='utf-8') as fwp, torch.no_grad():
    fwp.write('id,PredictionString\n')
    for index, row in test_df.iterrows():
        try:
            answer, _, _ = predict(device, model, tokenizer, row['question'], row['context'])
            # Double quotes would break the hand-written quoted CSV field.
            answer = answer.replace('"', '')
        except Exception as e:
            # Best effort: still emit a row for this id, but report the failure.
            print('Prediction failed for id {}: {!r}'.format(row['id'], e))
            answer = ''
        fwp.write('{},"{}"\n'.format(row['id'], answer))