In [2]:
%pip install transformers datasets evaluate

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [1]:
from datasets import load_dataset

squad = load_dataset("squad")

In [2]:
train = squad['train']
val = squad['validation']

print('Features:', train.column_names)
print('No of train rows:', train.num_rows)
print('No of validation rows:', val.num_rows)

Features: ['id', 'title', 'context', 'question', 'answers']
No of train rows: 87599
No of validation rows: 10570


We dont have the test set for Squad-1.0, as it is unreleased. So we are going to split the train set into train and test sets:

In [3]:
train = train.train_test_split(test_size=0.2)
test = train['test']
train = train['train']
print('No of train rows:', train.num_rows)
print('No of test rows:', test.num_rows)

No of train rows: 70079
No of test rows: 17520


Example row:

In [4]:
train[0]

{'id': '5725e5ce38643c19005ace55',
 'title': 'Buckingham_Palace',
 'context': "Directly underneath the State Apartments is a suite of slightly less grand rooms known as the semi-state apartments. Opening from the Marble Hall, these rooms are used for less formal entertaining, such as luncheon parties and private audiences. Some of the rooms are named and decorated for particular visitors, such as the 1844 Room, decorated in that year for the State visit of Tsar Nicholas I of Russia, and, on the other side of the Bow Room, the 1855 Room, in honour of the visit of Emperor Napoleon III of France. At the centre of this suite is the Bow Room, through which thousands of guests pass annually to the Queen's Garden Parties in the Gardens. The Queen and Prince Philip use a smaller suite of rooms in the north wing.",
 'question': 'Where are the suites located that the Queen and Prince Phillip use?',
 'answers': {'text': ['the north wing'], 'answer_start': [721]}}

In [5]:
import numpy as np
context_lengths = np.array(list(map(len, train['context'])) + list(map(len, test['context'])) + list(map(len, val['context'])))
question_lengths = np.array(list(map(len, train['question'])) + list(map(len, test['question'])) + list(map(len, val['question'])))
answer_lengths = np.array(list(map(lambda x: len(x['text']), train['answers'])) + list(map(lambda x: len(x['text']), test['answers'])) + list(map(lambda x: len(x['text']), val['answers'])))

In [6]:
print('Context length:')
print(f'Average: {context_lengths.mean()}, Median: {np.median(context_lengths)}, Min: {context_lengths.min()}, Max: {context_lengths.max()}')
print('Question length:')
print(f'Average: {question_lengths.mean()}, Median: {np.median(question_lengths)}, Min: {question_lengths.min()}, Max: {question_lengths.max()}')
print('Answer length:')
print(f'Average: {answer_lengths.mean()}, Median: {np.median(answer_lengths)}, Min: {answer_lengths.min()}, Max: {answer_lengths.max()}')

Context length:
Average: 757.0149945502144, Median: 694.0, Min: 151, Max: 4063
Question length:
Average: 59.618790045737455, Median: 56.0, Min: 1, Max: 25651
Answer length:
Average: 1.246065458545977, Median: 1.0, Min: 1, Max: 6


Preprocessing: 
1. Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`.
2. Next, map the start and end positions of the answer to the original `context` by setting
   `return_offset_mapping=True`.
3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the [sequence_ids](https://huggingface.co/docs/tokenizers/main/en/api/encoding#tokenizers.Encoding.sequence_ids) method to
   find which part of the offset corresponds to the `question` and which corresponds to the `context`.

One example of the preprocessing involved:

In [7]:
example = train[0:2]
example

{'id': ['5725e5ce38643c19005ace55', '5733b38fd058e614000b60ac'],
 'title': ['Buckingham_Palace', 'Tajikistan'],
 'context': ["Directly underneath the State Apartments is a suite of slightly less grand rooms known as the semi-state apartments. Opening from the Marble Hall, these rooms are used for less formal entertaining, such as luncheon parties and private audiences. Some of the rooms are named and decorated for particular visitors, such as the 1844 Room, decorated in that year for the State visit of Tsar Nicholas I of Russia, and, on the other side of the Bow Room, the 1855 Room, in honour of the visit of Emperor Napoleon III of France. At the centre of this suite is the Bow Room, through which thousands of guests pass annually to the Queen's Garden Parties in the Gardens. The Queen and Prince Philip use a smaller suite of rooms in the north wing.",
  "Tajikistan is officially a republic, and holds elections for the presidency and parliament, operating under a presidential system. I

In [8]:
print(example["context"][0])
print(example["context"][1])

Directly underneath the State Apartments is a suite of slightly less grand rooms known as the semi-state apartments. Opening from the Marble Hall, these rooms are used for less formal entertaining, such as luncheon parties and private audiences. Some of the rooms are named and decorated for particular visitors, such as the 1844 Room, decorated in that year for the State visit of Tsar Nicholas I of Russia, and, on the other side of the Bow Room, the 1855 Room, in honour of the visit of Emperor Napoleon III of France. At the centre of this suite is the Bow Room, through which thousands of guests pass annually to the Queen's Garden Parties in the Gardens. The Queen and Prince Philip use a smaller suite of rooms in the north wing.
Tajikistan is officially a republic, and holds elections for the presidency and parliament, operating under a presidential system. It is, however, a dominant-party system, where the People's Democratic Party of Tajikistan routinely has a vast majority in Parliame

In [9]:
questions = [q.strip() for q in example["question"]]
questions

['Where are the suites located that the Queen and Prince Phillip use?',
 'What type of government does Tajikistan have?']

Using distillBERT tokenizer:

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [11]:
print('Max no of input tokens:', tokenizer.model_max_length)

Max no of input tokens: 512


Why truncation=only_second?:                  
Truncate to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.       
So only context will be truncated.

In [12]:
inputs = tokenizer(
    questions,
    example["context"],
    truncation="only_second",
    return_offsets_mapping=True,
    padding="max_length"
)

In [13]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping'])

What is an attention mask?:       
The attention mask is a binary tensor indicating the position of the padded indices so that the model does not attend to them. For the BertTokenizer , 1 indicates a value that should be attended to, while 0 indicates a padded value.

In [14]:
inputs['attention_mask'][0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


What is offset mapping?                  
Offset mapping is a mapping between the tokens generated by the tokenizer and their corresponding character positions in the original sentence. It essentially creates a link between the tokenized output and the original text.         
Offset mapping is typically returned as a list of tuples. Each tuple represents a single token and contains two elements:

- Start Offset: This is the character position where the token begins in the original sentence.
- End Offset: This is the character position where the token ends in the original sentence (excluding the character at the end position itself). 


Offset mapping of (0,0) indicates that token doesn't exist in original sentence.

In [15]:
inputs['offset_mapping'][0]

[(0, 0),
 (0, 5),
 (6, 9),
 (10, 13),
 (14, 20),
 (21, 28),
 (29, 33),
 (34, 37),
 (38, 43),
 (44, 47),
 (48, 54),
 (55, 62),
 (63, 66),
 (66, 67),
 (0, 0),
 (0, 8),
 (9, 19),
 (20, 23),
 (24, 29),
 (30, 40),
 (41, 43),
 (44, 45),
 (46, 51),
 (52, 54),
 (55, 63),
 (64, 68),
 (69, 74),
 (75, 80),
 (81, 86),
 (87, 89),
 (90, 93),
 (94, 98),
 (98, 99),
 (99, 104),
 (105, 115),
 (115, 116),
 (117, 124),
 (125, 129),
 (130, 133),
 (134, 140),
 (141, 145),
 (145, 146),
 (147, 152),
 (153, 158),
 (159, 162),
 (163, 167),
 (168, 171),
 (172, 176),
 (177, 183),
 (184, 196),
 (196, 197),
 (198, 202),
 (203, 205),
 (206, 211),
 (211, 214),
 (215, 222),
 (223, 226),
 (227, 234),
 (235, 244),
 (244, 245),
 (246, 250),
 (251, 253),
 (254, 257),
 (258, 263),
 (264, 267),
 (268, 273),
 (274, 277),
 (278, 287),
 (288, 291),
 (292, 302),
 (303, 311),
 (311, 312),
 (313, 317),
 (318, 320),
 (321, 324),
 (325, 329),
 (330, 334),
 (334, 335),
 (336, 345),
 (346, 348),
 (349, 353),
 (354, 358),
 (359, 362),

In [16]:
print(len(inputs['offset_mapping'][0]), len(inputs['attention_mask'][0]))

512 512


Now using offset mapping to convert start and end position of answers in original context to start and end tokens: 

In [17]:
answers = example["answers"]
answers

[{'text': ['the north wing'], 'answer_start': [721]},
 {'text': ['a republic'], 'answer_start': [25]}]

What is inputs.sequence_ids:                     
Return a list mapping the tokens to the id of their original sentences:

None for special tokens added around or between sequences,
0 for tokens corresponding to words in the first sequence,
1 for tokens corresponding to words in the second sequence when a pair of sequences was jointly encoded.


In [18]:
offset_mapping = inputs.pop("offset_mapping")
start_positions = []
end_positions = []

for i, offset in enumerate(offset_mapping):
    answer = answers[i]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

inputs["start_positions"] = start_positions
inputs["end_positions"] = end_positions

This is what a single tokenized encoded input to the model looks like:

In [19]:
inputs[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [20]:
# tokens
inputs[0].tokens

['[CLS]',
 'where',
 'are',
 'the',
 'suites',
 'located',
 'that',
 'the',
 'queen',
 'and',
 'prince',
 'phillip',
 'use',
 '?',
 '[SEP]',
 'directly',
 'underneath',
 'the',
 'state',
 'apartments',
 'is',
 'a',
 'suite',
 'of',
 'slightly',
 'less',
 'grand',
 'rooms',
 'known',
 'as',
 'the',
 'semi',
 '-',
 'state',
 'apartments',
 '.',
 'opening',
 'from',
 'the',
 'marble',
 'hall',
 ',',
 'these',
 'rooms',
 'are',
 'used',
 'for',
 'less',
 'formal',
 'entertaining',
 ',',
 'such',
 'as',
 'lunch',
 '##eon',
 'parties',
 'and',
 'private',
 'audiences',
 '.',
 'some',
 'of',
 'the',
 'rooms',
 'are',
 'named',
 'and',
 'decorated',
 'for',
 'particular',
 'visitors',
 ',',
 'such',
 'as',
 'the',
 '1844',
 'room',
 ',',
 'decorated',
 'in',
 'that',
 'year',
 'for',
 'the',
 'state',
 'visit',
 'of',
 'tsar',
 'nicholas',
 'i',
 'of',
 'russia',
 ',',
 'and',
 ',',
 'on',
 'the',
 'other',
 'side',
 'of',
 'the',
 'bow',
 'room',
 ',',
 'the',
 '1855',
 'room',
 ',',
 'in',
 

In [21]:
print('Index of start token of answer:', inputs['start_positions'][0])
print('Index of end token of answer:', inputs['end_positions'][0])

Index of start token of answer: 161
Index of end token of answer: 163


In [22]:
print('Context:', example['context'][0])
print('Question:', example['question'][0])
print('Answer tokens:', inputs[0].tokens[inputs['start_positions'][0]: inputs['end_positions'][0]])

Context: Directly underneath the State Apartments is a suite of slightly less grand rooms known as the semi-state apartments. Opening from the Marble Hall, these rooms are used for less formal entertaining, such as luncheon parties and private audiences. Some of the rooms are named and decorated for particular visitors, such as the 1844 Room, decorated in that year for the State visit of Tsar Nicholas I of Russia, and, on the other side of the Bow Room, the 1855 Room, in honour of the visit of Emperor Napoleon III of France. At the centre of this suite is the Bow Room, through which thousands of guests pass annually to the Queen's Garden Parties in the Gardens. The Queen and Prince Philip use a smaller suite of rooms in the north wing.
Question: Where are the suites located that the Queen and Prince Phillip use?
Answer tokens: ['the', 'north']


Creating a function for this preprocessing task:

In [23]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
preprocess_function(example).keys()

NameError: name 'preprocess_function' is not defined

Applying the preprocessing function to the dataset:

In [28]:
tokenized_train = preprocess_function(train)
tokenized_test = preprocess_function(test)
tokenized_val = preprocess_function(val)

In [5]:
tokenized_train['start_positions'][0]

NameError: name 'tokenized_train' is not defined

Creating a batch of examples using [DefaultDataCollator](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DefaultDataCollator).

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

Training distilBERT model on the dataset:

In [1]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
tokenized_train[0]

NameError: name 'tokenized_train' is not defined

In [3]:
training_args = TrainingArguments(
    output_dir="model1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

NameError: name 'tokenized_train' is not defined

In [104]:
%pip install accelerate -U

Note: you may need to restart the kernel to use updated packages.


In [83]:
tokenized_train[0].keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [85]:
tokenized_train[0]

{'input_ids': [101,
  2040,
  2001,
  2081,
  4001,
  3484,
  3003,
  2044,
  1996,
  4051,
  2602,
  1029,
  102,
  1996,
  8037,
  4227,
  1037,
  3484,
  1999,
  2119,
  3506,
  1999,
  1996,
  4051,
  2602,
  1012,
  16551,
  2018,
  2000,
  2147,
  2007,
  1996,
  3537,
  3484,
  3003,
  23037,
  1038,
  1012,
  3779,
  1006,
  2101,
  1057,
  1012,
  1055,
  1012,
  2343,
  1007,
  1999,
  1996,
  4001,
  1998,
  5882,
  3520,
  4097,
  8022,
  1999,
  1996,
  2160,
  1010,
  2119,
  2013,
  3146,
  1012,
  3533,
  3235,
  1010,
  1996,
  3951,
  5882,
  2013,
  4006,
  2000,
  4085,
  1998,
  2153,
  2013,
  4052,
  2000,
  3982,
  1010,
  2626,
  2008,
  16551,
  1000,
  2196,
  5129,
  2370,
  2007,
  16838,
  2040,
  2071,
  9611,
  2576,
  3471,
  2007,
  2658,
  8066,
  1012,
  2045,
  2020,
  11790,
  1010,
  7723,
  1059,
  1012,
  2534,
  1010,
  2005,
  2742,
  1010,
  2040,
  2004,
  3472,
  1997,
  1996,
  3951,
  2120,
  2837,
  2699,
  2000,
  2330,
  1996,
  3447,


In [None]:
def preprocess_function(example):
    question = example['question']
    inputs = tokenizer(
        question,
        example["context"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = example["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs