### one time installations

In [1]:
# !gdown --id 1pb7gEkctrVrJA79EAIo7H7nuzD6uV1fW
# !gdown --id 1oIeAE9HXXKWPcYa-AZ0ht5ef6sKe_Vh_
# !gdown --id 10rAuIDvsYR2yDiCqP7GmYGPc-UmtLbJb

In [2]:
# !pip install --quiet transformers
# !pip install --quiet datasets 
# !pip install --quiet SentencePiece
# !pip install --quiet pytorch-lightning

### libraries

In [3]:
import numpy as np
import pandas as pd
import sklearn
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import Trainer, TrainingArguments

### hyper parameters

In [4]:
class hyperparameters:
    """Settings shared by the tokenizer cells and the model cell of this notebook."""

    # ---- tokenizer ----
    # checkpoint name handed to AutoTokenizer.from_pretrained
    tokenizer_name = "deepset/xlm-roberta-large-squad2"
    # maximum length of context and question in a datapoint (used as the tokenizer's max_length)
    max_len = 384
    # overlap between two parts of the context when it is split (used as the tokenizer's stride)
    overlap_len = 128

    # ---- model ----
    # checkpoint name handed to AutoModelForQuestionAnswering.from_pretrained
    model_name = "deepset/xlm-roberta-large-squad2"
    

In [5]:
hyperparams = hyperparameters()

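#### Reference code: SQuAD v2 preprocessing (TODO: remove this section)

The cells below appear to follow the Hugging Face question-answering example (reference 1 at the end) and are kept only for comparison. Note that `prepare_train_features` uses `tokenizer`, which is only created later, in the "data - understanding the pipeline" section.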

In [6]:
from datasets import load_dataset

In [7]:
datasets = load_dataset("squad_v2")

Reusing dataset squad_v2 (/home/shubham/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
def prepare_train_features(examples):
    """Tokenize a batch of question/context examples and label each feature with its answer span.

    Relies on a notebook-level `tokenizer` (it is not passed in as an argument).

    Args:
        examples: batch with "question", "context" and "answers" columns; each entry of
            "answers" holds the lists "answer_start" and "text". The "question" column is
            overwritten in place with left-stripped questions.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and "offset_mapping" removed
        and two lists added, "start_positions" and "end_positions": the token indices of the
        answer inside each feature, or the CLS token index (for both) when the example has
        no answer or the answer is not fully contained in that feature.
    """
    # Tokenization settings are hard-coded here rather than read from `hyperparams`.
    max_length = 384
    doc_stride = 128
    pad_on_right = True
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
    # left whitespace.
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (only the first listed answer is used;
            # end_char is one past the last answer character).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text: first token whose sequence id marks the context.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text: last token whose sequence id marks the context.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                # Each loop overshoots by one token, hence the -1 / +1 corrections when appending.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [11]:
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)

Loading cached processed dataset at /home/shubham/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-1f8f445facbaec6d.arrow


  0%|          | 0/12 [00:00<?, ?ba/s]

In [12]:
print(datasets)
print(datasets["train"].column_names)
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})
['id', 'title', 'context', 'question', 'answers']
DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
        num_rows: 133320
    })
    validation: Dataset({
        features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
        num_rows: 12360
    })
})


### data - understanding the pipeline

In [10]:
tokenizer = AutoTokenizer.from_pretrained(hyperparams.tokenizer_name)

In [13]:
train_df = pd.read_csv('train.csv', encoding='utf-8')
# test_df = pd.read_csv('test.csv')
# sample_df = pd.read_csv('sample_submission.csv')

In [14]:
train_df = sklearn.utils.shuffle(train_df, random_state=4).reset_index(drop=True)

In [15]:
# # converting into Squad format
# def convert_answers(row):
#     return {'answer_start': [row[0]], 'text': [row[1]]}

# train_df['answers'] = train_df[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

In [16]:
train_df = train_df.loc[:2]
# train_df

In [17]:
train_df

Unnamed: 0,id,context,question,answer_text,answer_start,language
0,5179ed725,ரேடியம் (Radium) என்பது Ra என்ற மூலக்கூற்று வா...,ரேடியம் எப்போது கண்டுபிடிக்கப்பட்டது?,1898,975,tamil
1,b028b54cf,हेलेन एडम्स केलर (27 जून 1880 - 1 जून 1968) एक...,हेलेन केलर की मृत्यु किस वर्ष में हुई थी?,1968,38,hindi
2,86eff66f2,फ़ूड एण्ड ड्रग एडमिनिस्ट्रेशन (FDA या USFDA) स...,खाद्य एवं औषधि प्रशासन का मुख्यालय कहाँ पर है?,"सिल्वर स्प्रिंग, मैरीलैंड",1092,hindi


In [18]:
# for idx in range(len(train_df)):
#     if train_df.loc[idx,'language'] == "hindi" and len(tokenizer(train_df.loc[idx, 'context'], train_df.loc[idx, 'question'])['input_ids']) > hyperparams.max_len:
#         break

In [19]:
# Tokenize every (question, context) pair. A pair that does not fit in max_len is split into
# several overlapping chunks, so `out` can hold more entries than `train_df` has rows.
out = tokenizer(
    list(train_df['question']), list(train_df['context']),
    max_length=hyperparams.max_len, 
    truncation='only_second',  # the context is passed second, so only it is truncated
    stride=hyperparams.overlap_len,  # overlap between consecutive chunks of one context
    return_overflowing_tokens=True,  # keep the extra chunks; adds 'overflow_to_sample_mapping'
    return_offsets_mapping=True,  # adds 'offset_mapping' (token -> character positions)
    padding=True,
)

In [20]:
print(out.keys())
print(len(out['input_ids']), len(out['offset_mapping']))

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])
62 62


In [21]:
print(len(out['offset_mapping']))
# for i in range(len(out['offset_mapping'])):
#     print(len(out.sequence_ids(i)))

62


In [22]:
print(tokenizer.cls_token, tokenizer.bos_token, tokenizer.padding_side)

<s> <s> right


In [23]:
print(len(out['input_ids']))

62


In [24]:
# print(tokenizer.decode(out['input_ids'][0]))
# print(tokenizer.decode(out['input_ids'][1]))
# print(tokenizer.decode(out['input_ids'][2]))

In [25]:
map_x2context_idx = out['overflow_to_sample_mapping']
print(map_x2context_idx, type(map_x2context_idx))

[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] <class 'list'>


In [26]:
map_offset = out['offset_mapping']

In [27]:
# Pick a random chunk to inspect. randrange excludes the upper bound, so idx is always a
# valid index into out['input_ids']; random.randint includes both endpoints and could
# return len(out['input_ids']), which would raise IndexError in the next cell.
idx = random.randrange(len(out['input_ids']))
idx

0

In [28]:
x_idx = out['input_ids'][idx]
x_context_idx = map_x2context_idx[idx]
print(idx, x_context_idx, len(x_idx))

0 0 384


In [29]:
x_x1_x2 = out.sequence_ids(idx) # sequence ids

In [30]:
# tokenizer.decode(x_idx)

In [31]:
answer_start = train_df.loc[x_context_idx, 'answer_start']
answer = train_df.loc[x_context_idx, 'answer_text']
print(answer)
print(answer_start)

1898
975


In [32]:
cls_idx = x_idx.index(tokenizer.cls_token_id)
cls_idx

0

In [33]:
# Character span of the answer inside the original context.
# The end index is exclusive: it points one character past the end of the answer.
y_start_char = train_df.loc[x_context_idx, 'answer_start']
y_end_char = y_start_char + len(train_df.loc[x_context_idx, 'answer_text'])

### Data pipeline

In [46]:
def prepare_chaii(data_df, tokenizer, max_len=None, overlap_len=None):
    """Tokenize question/context pairs and label every chunk with its answer token span.

    A context that does not fit in `max_len` is split by the tokenizer into several
    overlapping chunks, so the output usually has more entries than `data_df` has rows.

    Args:
        data_df: dataframe with the columns 'context', 'question', 'answer_text' and
            'answer_start' (character offset of the answer in 'context'). It is not
            modified, and rows are addressed by position, so any index is accepted.
        tokenizer: callable tokenizer whose output provides 'input_ids',
            'offset_mapping', 'overflow_to_sample_mapping' and `sequence_ids(i)`, and
            which exposes `cls_token_id`.
        max_len: maximum length of one tokenized input; defaults to `hyperparams.max_len`.
        overlap_len: overlap between consecutive chunks of a context; defaults to
            `hyperparams.overlap_len`.

    Returns:
        The tokenizer output (every value is a list, no tensors) with two extra keys,
        'start_positions' and 'end_positions': the indices of the first and last answer
        token in each chunk, or the CLS token index for both when the answer is not
        fully contained in that chunk.
    """
    if max_len is None:
        max_len = hyperparams.max_len
    if overlap_len is None:
        overlap_len = hyperparams.overlap_len

    # Work on plain lists: this leaves the caller's dataframe untouched and makes row
    # access positional, which is what 'overflow_to_sample_mapping' refers to.
    raw_contexts = [str(text) for text in data_df['context']]
    raw_answers = [str(text) for text in data_df['answer_text']]
    questions = [str(text).strip() for text in data_df['question']]
    contexts = [text.strip() for text in raw_contexts]
    answers = [text.strip() for text in raw_answers]

    # 'answer_start' is a character offset into the *unstripped* context. Stripping moves
    # every character left by the amount of leading whitespace removed from the context,
    # and dropping leading whitespace from the answer moves its start right.
    answer_starts = [
        start - (len(ctx) - len(ctx.lstrip())) + (len(ans) - len(ans.lstrip()))
        for start, ctx, ans in zip(data_df['answer_start'], raw_contexts, raw_answers)
    ]

    data_tok = tokenizer(
        questions, contexts,
        max_length=max_len,
        truncation='only_second',
        stride=overlap_len,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=True,
    )

    # data_tok is dict-like with the keys 'input_ids', 'attention_mask', 'offset_mapping'
    # and 'overflow_to_sample_mapping'; two more keys are added for the labels.
    data_tok['start_positions'], data_tok['end_positions'] = [], []

    n_sents = len(data_tok['input_ids'])
    map_id_sent2context = data_tok['overflow_to_sample_mapping']  # chunk index -> row position
    map_offsets = data_tok['offset_mapping']
    assert len(map_offsets) == len(map_id_sent2context) == n_sents

    for input_id in range(n_sents):
        sent = data_tok['input_ids'][input_id]

        # answer span (in characters, end exclusive) of the row this chunk came from
        context_id = map_id_sent2context[input_id]
        answer_start = answer_starts[context_id]
        answer_end = answer_start + len(answers[context_id])

        # sequence id 1 marks context tokens; find the first and the last of them
        qn_context_id = data_tok.sequence_ids(input_id)
        first_ctx_token = qn_context_id.index(1)
        last_ctx_token = len(qn_context_id) - qn_context_id[::-1].index(1) - 1

        offset_map = map_offsets[input_id]
        answer_in_chunk = (
            answer_start >= offset_map[first_ctx_token][0]
            and answer_end <= offset_map[last_ctx_token][1]
        )

        if answer_in_chunk:
            # Walk inwards from both ends of the context. Both walks stay inside the
            # context tokens: the tokens after the context carry offset (0, 0) in this
            # notebook's tokenizer output, so an unbounded start walk would run through
            # them and off the end of the chunk when the answer is the last context token.
            start_idx_token = first_ctx_token
            while start_idx_token <= last_ctx_token and offset_map[start_idx_token][0] <= answer_start:
                start_idx_token += 1
            end_idx_token = last_ctx_token
            while end_idx_token >= first_ctx_token and offset_map[end_idx_token][1] >= answer_end:
                end_idx_token -= 1

            # each walk overshoots by exactly one token
            data_tok['start_positions'].append(start_idx_token - 1)
            data_tok['end_positions'].append(end_idx_token + 1)
        else:
            # answer not (fully) inside this chunk: label it with the CLS token index
            cls_token_idx = sent.index(tokenizer.cls_token_id)
            data_tok['start_positions'].append(cls_token_idx)
            data_tok['end_positions'].append(cls_token_idx)

    return data_tok

In [47]:
class chaii_ka_data(Dataset):
    """Torch dataset over the tokenized chunks returned by `prepare_chaii`.

    One item is one tokenized chunk, not one row of the dataframe: a long context is
    split into several chunks, each of which becomes its own sample.
    """

    def __init__(self, data_df, tokenizer, train=True):
        """Tokenize `data_df` once and keep the result.

        Args:
            data_df: pandas dataframe containing context, question, ...
            tokenizer: tokenizer forwarded to `prepare_chaii`.
            train: True means train set, False means val set. Currently not used.
        """
        super(chaii_ka_data, self).__init__()

        # tokenize data samples context;question, and create new samples if overflow
        self.data_tok = prepare_chaii(data_df, tokenizer)

    def __getitem__(self, input_id):  # index is input_id as used in prepare_chaii()
        """Return every field of chunk `input_id` as a `torch.long` tensor, keyed by field name."""
        return {
            key: torch.tensor(values[input_id], dtype=torch.long)
            for key, values in self.data_tok.items()
        }

    def __len__(self):
        """Number of tokenized chunks."""
        # was `self.data`, an attribute that is never set (AttributeError on len())
        return len(self.data_tok['input_ids'])

In [48]:
trainset = chaii_ka_data(train_df, tokenizer)

In [50]:
train_df

Unnamed: 0,id,context,question,answer_text,answer_start,language
0,5179ed725,ரேடியம் (Radium) என்பது Ra என்ற மூலக்கூற்று வா...,ரேடியம் எப்போது கண்டுபிடிக்கப்பட்டது?,1898,975,tamil
1,b028b54cf,हेलेन एडम्स केलर (27 जून 1880 - 1 जून 1968) एक...,हेलेन केलर की मृत्यु किस वर्ष में हुई थी?,1968,38,hindi
2,86eff66f2,फ़ूड एण्ड ड्रग एडमिनिस्ट्रेशन (FDA या USFDA) स...,खाद्य एवं औषधि प्रशासन का मुख्यालय कहाँ पर है?,"सिल्वर स्प्रिंग, मैरीलैंड",1092,hindi


In [49]:
next(iter(trainset))

{'input_ids': tensor([     0,      6,  31250, 106273,    938, 237805, 142272,  91622,   2798,
          59920,     32,      2,      2,      6,  31250, 106273,    938,     15,
          12248,  28483,     16,  28191,   2552,  12407,  34148,   4864,   2798,
          39639,  35867,      6,  58789, 189641,  23618,   6390, 151421,  55039,
          34044, 100128,   3769,   6509,  35097,   3219,  59386,   4548, 199777,
              5, 116683, 165380, 136259,  14073, 169604,      5, 116683, 165380,
          79471,  35186, 130724, 169604,      5,  59386,   4548,  72307,  31756,
           2690,   7827,   3937,  46800,  10540,  60070,   9000,  36242,  24183,
            116, 130130, 173021,  35097,   6001,   9823, 127183,  59386, 127076,
              6,  31250, 106273,    938, 169604,      5,  12751, 114319,  25650,
          12359,  66705,  22727, 105971,  43716,  78884,  68960,   3686,   8182,
              5,  46018,  12784, 155012,      6,  31250, 106273,    938, 197508,
         127316

In [42]:
# a = [(1,2.1)]
# torch.tensor(a, dtype=torch.long)

tensor([[1, 2]])

### model

### training

In [27]:
model = AutoModelForQuestionAnswering.from_pretrained(hyperparams.model_name)

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

### evaluation

### references
1. https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb
2. https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase.__call__