In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree lazily and print the full path of every file in it.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train = pd.read_csv(
    '/kaggle/input/chaii-hindi-and-tamil-question-answering/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/chaii-hindi-and-tamil-question-answering/test.csv'
)

In [None]:
# Preview the first rows of the training DataFrame.
train.head()

In [None]:
# Preview the first rows of the test DataFrame.
test.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation. The fixed random_state makes the
# shuffle -- and therefore every downstream metric -- reproducible across runs.
# NOTE: this rebinds `train`, so re-running only this cell splits the already
# reduced frame again; re-run the CSV-loading cell first.
train, val = train_test_split(train, test_size=0.25, random_state=42)

In [None]:
# Renumber both splits from 0, discarding the row labels inherited from the
# original frame.
train, val = (split.reset_index(drop=True) for split in (train, val))



In [None]:
def read_dataset(df):
    """Extract the context passages and the questions from a QA table.

    Parameters
    ----------
    df : pandas.DataFrame or mapping of column name -> iterable
        Must provide a 'context' and a 'question' column.

    Returns
    -------
    tuple[list, list]
        ``(contexts, questions)`` as two plain Python lists, in row order.

    Raises
    ------
    KeyError
        If either column is missing.
    """
    # list() materialises each column in one step; the unused `answers`
    # accumulator of the earlier version is gone.
    contexts = list(df['context'])
    questions = list(df['question'])
    return contexts, questions

In [None]:
# Pull the context and question columns of each split out as parallel lists.
train_contexts, train_questions = read_dataset(train)
val_contexts, val_questions  = read_dataset(val)

In [None]:
# One dict per row, pairing the answer's start offset with its text.
train_answers = [
    {'answer_start': start, 'answer_text': text}
    for start, text in zip(train['answer_start'], train['answer_text'])
]
val_answers = [
    {'answer_start': start, 'answer_text': text}
    for start, text in zip(val['answer_start'], val['answer_text'])
]
    
    

In [None]:
def add_end_idx(answers, contexts):
    """Add a character-level ``answer_end`` to every answer dict, in place.

    For each ``(answer, context)`` pair the annotated ``answer_start`` is
    checked against the context. If the answer text is not found exactly
    there, starts shifted back by one and by two characters are tried. If that
    fails too, the first occurrence of the answer text anywhere in the context
    is used. When the text cannot be located at all, the annotated start is
    kept.

    ``answer_end`` (exclusive) is always written, so later steps that read
    ``answer['answer_end']`` never hit a ``KeyError``.

    Parameters
    ----------
    answers : list[dict]
        Dicts with 'answer_start' (int) and 'answer_text' (str). Each dict is
        mutated: 'answer_start' may be corrected and 'answer_end' is added.
    contexts : list[str]
        Context strings, parallel to ``answers``.

    Returns
    -------
    None
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['answer_text']
        start_idx = answer['answer_start']
        span_len = len(gold_text)

        for shift in (0, 1, 2):
            candidate = start_idx - shift
            # A negative start would wrap around to the end of the string.
            if candidate >= 0 and context[candidate:candidate + span_len] == gold_text:
                start_idx = candidate
                break
        else:
            # Annotation is off by more than two characters: fall back to a
            # plain search, keeping the annotated start if nothing is found.
            located = context.find(gold_text)
            if located != -1:
                start_idx = located

        answer['answer_start'] = start_idx
        answer['answer_end'] = start_idx + span_len

In [None]:
# Mutates the answer dicts in place; add_end_idx has no return value.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
from transformers import DistilBertTokenizerFast
# NOTE(review): 'distilbert-base-uncased' is, as far as I know, an English-only,
# lower-casing checkpoint, while the data loaded above is the chaii Hindi/Tamil
# set. Consider 'distilbert-base-multilingual-cased'. The model-loading cell
# must be switched to the same checkpoint at the same time, because the
# vocabularies differ.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Encode every (context, question) pair with truncation and padding enabled.
# The context is passed first. add_token_positions calls char_to_token without
# a sequence index -- presumably that resolves character offsets against this
# first sequence (TODO confirm against the tokenizers documentation).
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers, fallback_position=None):
    """Convert character-level answer spans to token positions, in place.

    For every answer, its start character and its last character
    (``answer_end - 1``) are mapped to token indices with
    ``encodings.char_to_token``. The resulting lists are stored on
    ``encodings`` under 'start_positions' and 'end_positions'.

    Parameters
    ----------
    encodings : tokenizer output
        Must offer ``char_to_token(batch_index, char_index)`` and ``update``.
    answers : list[dict]
        Dicts with 'answer_start' and 'answer_text'. 'answer_end' is used when
        present; otherwise it is derived as start + len(text), so a missing key
        no longer raises ``KeyError``.
    fallback_position : int, optional
        Position recorded when ``char_to_token`` returns ``None``. Defaults to
        the notebook-level ``tokenizer.model_max_length``, as before.

    Returns
    -------
    None
    """
    if fallback_position is None:
        fallback_position = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer.get('answer_end', start_char + len(answer['answer_text']))

        start_token = encodings.char_to_token(i, start_char)
        # answer_end is exclusive, so the last answer character sits at end - 1.
        end_token = encodings.char_to_token(i, end_char - 1)

        start_positions.append(fallback_position if start_token is None else start_token)
        end_positions.append(fallback_position if end_token is None else end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [None]:
# Adds 'start_positions' / 'end_positions' to both encodings in place.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)


In [None]:
import torch

class chaiDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenizer encoding.

    ``encodings`` must expose ``items()`` yielding ``(field, rows)`` pairs and
    an ``input_ids`` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per row of input ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build a tensor for row ``idx`` of every field in the encoding.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

In [None]:
# Wrap both encodings so they can be fed to a DataLoader.
train_dataset = chaiDataset(train_encodings)
val_dataset = chaiDataset(val_encodings)

In [None]:
from transformers import DistilBertForQuestionAnswering

# Load pretrained DistilBERT with a question-answering head.
# output_attentions=True was removed: nothing in this notebook reads the
# attention maps, and the flag is written into the saved config, so it would
# also follow the model through save_pretrained / from_pretrained.
# NOTE(review): the checkpoint must stay in sync with the tokenizer cell. Both
# currently use the English 'distilbert-base-uncased' on Hindi/Tamil data;
# consider switching both to 'distilbert-base-multilingual-cased'.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased", return_dict=True)

In [None]:
from torch.utils.data import DataLoader
# transformers.AdamW is deprecated in favour of the PyTorch optimizer (and, as
# far as I know, no longer shipped by recent transformers releases).
# torch.optim.AdamW accepts the same AdamW(params, lr=...) call used below.
# Its default weight_decay is 0.01, not 0.0; pass weight_decay=0.0 at the call
# site if the old default is wanted.
from torch.optim import AdamW
from tqdm import tqdm

In [None]:
# Train on the GPU when torch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):
    # Enter training mode at the start of every epoch.
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Move the four tensors of the batch onto the training device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optim.zero_grad()
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        # The first element of the output is used as the loss to minimise.
        loss = outputs[0]
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory so that
# both can be reloaded from model_path.
model_path='./model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Reload model and tokenizer from the directory written above.
# NOTE: `model` is rebound to the freshly loaded instance; it is moved to
# `device` by the evaluation cell that follows.
model = DistilBertForQuestionAnswering.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

In [None]:
model = model.to(device)
model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)

# Count exact matches of the predicted start and end token positions over the
# whole validation set. Summing hits and dividing once weights every example
# equally; averaging per-batch accuracies would over-weight a smaller last
# batch.
n_correct = 0
n_compared = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Most likely start / end token per example.
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        n_correct += (start_pred == start_true).sum().item()
        n_correct += (end_pred == end_true).sum().item()
        n_compared += start_pred.numel() + end_pred.numel()

# An empty validation set yields NaN instead of a ZeroDivisionError.
acc = n_correct / n_compared if n_compared else float('nan')
print(acc)

In [None]:
# Predict and print an answer for every validation question, one pair at a time.
model.eval()
for contxt, quas in zip(val_contexts, val_questions):
    # Encode THIS pair as tensors on the model's device. A separate name is
    # used so the batched `val_encodings` behind val_dataset is not overwritten.
    pair_encoding = tokenizer(contxt, quas, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        # Feed the freshly encoded pair, not tensors left over from the
        # validation loop.
        outputs = model(
            pair_encoding['input_ids'],
            attention_mask=pair_encoding['attention_mask'],
        )

    start_pred = int(torch.argmax(outputs['start_logits'], dim=1)[0])
    end_pred = int(torch.argmax(outputs['end_logits'], dim=1)[0])

    # The end position labels the last answer token, so the slice includes it.
    # decode() merges word pieces back into text; an end before the start
    # yields an empty answer.
    answer_ids = pair_encoding['input_ids'][0, start_pred:end_pred + 1]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)

    print("The question is :", quas)
    print("The answer is :", answer)
    print('*' * 50)