In [1]:
# install datasets
!pip install datasets

Collecting datasets
  Using cached datasets-1.1.3-py3-none-any.whl (153 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.11.1-py36-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 7.9 MB/s eta 0:00:01
Collecting dill
  Using cached dill-0.3.3-py2.py3-none-any.whl (81 kB)
Collecting xxhash
  Downloading xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242 kB)
[K     |████████████████████████████████| 242 kB 25.5 MB/s eta 0:00:01
[?25hCollecting pyarrow>=0.17.1
  Downloading pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7 MB)
[K     |████████████████████████████████| 17.7 MB 53.6 MB/s eta 0:00:01
Installing collected packages: dill, multiprocess, xxhash, pyarrow, datasets
Successfully installed datasets-1.1.3 dill-0.3.3 multiprocess-0.70.11.1 pyarrow-2.0.0 xxhash-2.0.0


In [2]:
# Make sure that we have a recent version of pyarrow in the session before we continue - otherwise reboot Colab to activate it
import pyarrow
if int(pyarrow.__version__.split('.')[1]) < 16 and int(pyarrow.__version__.split('.')[0]) == 0:
    import os
    os.kill(os.getpid(), 9)

In [3]:
# Let's import the library. We typically only need at most four methods:
from datasets import list_datasets, list_metrics, load_dataset, load_metric

from pprint import pprint

In [4]:
# Currently available datasets and metrics
datasets = list_datasets()
metrics = list_metrics()

print(f"🤩 Currently {len(datasets)} datasets are available on the hub:")
pprint(datasets, compact=True)
print("##################################################")
print("##################################################")
print(f"🤩 Currently {len(metrics)} metrics are available on the hub:")
pprint(metrics, compact=True)

🤩 Currently 316 datasets are available on the hub:
['aeslc', 'afrikaans_ner_corpus', 'ag_news', 'ai2_arc', 'ajgt_twitter_ar',
 'allegro_reviews', 'allocine', 'amazon_reviews_multi', 'amazon_us_reviews',
 'amttl', 'anli', 'arcd', 'arsentd_lev', 'art', 'aslg_pc12', 'asnq', 'asset',
 'autshumato', 'big_patent', 'billsum', 'biomrc', 'blended_skill_talk', 'blimp',
 'blog_authorship_corpus', 'bookcorpus', 'bookcorpusopen', 'boolq',
 'break_data', 'c3', 'c4', 'cail2018', 'cawac', 'cdsc', 'cdt', 'cfq', 'chr_en',
 'circa', 'civil_comments', 'clinc_oos', 'clue', 'cmrc2018', 'cnn_dailymail',
 'coached_conv_pref', 'coarse_discourse', 'codah', 'com_qa', 'common_gen',
 'commonsense_qa', 'compguesswhat', 'conll2000', 'conll2002', 'conll2003',
 'conv_ai', 'coqa', 'cornell_movie_dialog', 'cos_e', 'cosmos_qa', 'crd3',
 'crime_and_punish', 'crows_pairs', 'cs_restaurants', 'csv', 'daily_dialog',
 'danish_political_comments', 'dart', 'dbpedia_14', 'deal_or_no_dialog',
 'definite_pronoun_resolution', 'dialo

In [5]:
# You can access various attributes of the datasets before downloading them
squad_dataset = list_datasets(with_details=True)[datasets.index('squad')]

pprint(squad_dataset.__dict__)  # It's a simple python dataclass

{'author': None,
 'citation': '@article{2016arXiv160605250R,\n'
             '       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and '
             '{Lopyrev},\n'
             '                 Konstantin and {Liang}, Percy},\n'
             '        title = "{SQuAD: 100,000+ Questions for Machine '
             'Comprehension of Text}",\n'
             '      journal = {arXiv e-prints},\n'
             '         year = 2016,\n'
             '          eid = {arXiv:1606.05250},\n'
             '        pages = {arXiv:1606.05250},\n'
             'archivePrefix = {arXiv},\n'
             '       eprint = {1606.05250},\n'
             '}',
 'description': 'Stanford Question Answering Dataset (SQuAD) is a reading '
                'comprehension dataset, consisting of questions posed by '
                'crowdworkers on a set of Wikipedia articles, where the answer '
                'to every question is a segment of text, or span, from the '
                'corresponding reading pa

In [6]:
# Downloading and loading a dataset
dataset = load_dataset('squad', split='validation[:10%]')

Reusing dataset squad (/home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


In [7]:
# Informations on the dataset (description, citation, size, splits, format...)
# are provided in `dataset.info` (a simple python dataclass) and also as direct attributes in the dataset object
pprint(dataset.info.__dict__)

{'builder_name': 'squad',
 'citation': '@article{2016arXiv160605250R,\n'
             '       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and '
             '{Lopyrev},\n'
             '                 Konstantin and {Liang}, Percy},\n'
             '        title = "{SQuAD: 100,000+ Questions for Machine '
             'Comprehension of Text}",\n'
             '      journal = {arXiv e-prints},\n'
             '         year = 2016,\n'
             '          eid = {arXiv:1606.05250},\n'
             '        pages = {arXiv:1606.05250},\n'
             'archivePrefix = {arXiv},\n'
             '       eprint = {1606.05250},\n'
             '}\n',
 'config_name': 'plain_text',
 'dataset_size': 89789763,
 'description': 'Stanford Question Answering Dataset (SQuAD) is a reading '
                'comprehension dataset, consisting of questions posed by '
                'crowdworkers on a set of Wikipedia articles, where the answer '
                'to every question is a segment of

In [8]:
print(dataset)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1057
})


In [9]:
print(f"👉Dataset len(dataset): {len(dataset)}")
print("\n👉First item 'dataset[0]':")
pprint(dataset[0])

👉Dataset len(dataset): 1057

👉First item 'dataset[0]':
{'answers': {'answer_start': [177, 177, 177],
             'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football Conference (AFC) champion Denver '
            'Broncos defeated the National Football Conference (NFC) champion '
            'Carolina Panthers 24–10 to earn their third Super Bowl title. The '
            "game was played on February 7, 2016, at Levi's Stadium in the San "
            'Francisco Bay Area at Santa Clara, California. As this was the '
            '50th Super Bowl, the league emphasized the "golden anniversary" '
            'with various gold-themed initiatives, as well as temporarily '
            'suspending the tradition of naming each Super Bowl game with '
            'Roman numerals (under which 

In [10]:
# Or get slices with several examples:
print("\n👉Slice of the two items 'dataset[10:12]':")
pprint(dataset[10:12])


👉Slice of the two items 'dataset[10:12]':
OrderedDict([('answers',
              [{'answer_start': [334, 334, 334],
                'text': ['February 7, 2016', 'February 7', 'February 7, 2016']},
               {'answer_start': [177, 177, 177],
                'text': ['Denver Broncos',
                         'Denver Broncos',
                         'Denver Broncos']}]),
             ('context',
              ['Super Bowl 50 was an American football game to determine the '
               'champion of the National Football League (NFL) for the 2015 '
               'season. The American Football Conference (AFC) champion Denver '
               'Broncos defeated the National Football Conference (NFC) '
               'champion Carolina Panthers 24–10 to earn their third Super '
               "Bowl title. The game was played on February 7, 2016, at Levi's "
               'Stadium in the San Francisco Bay Area at Santa Clara, '
               'California. As this was the 50th Supe

In [11]:
# You can get a full column of the dataset by indexing with its name as a string:
print(dataset['question'][:10])

['Which NFL team represented the AFC at Super Bowl 50?', 'Which NFL team represented the NFC at Super Bowl 50?', 'Where did Super Bowl 50 take place?', 'Which NFL team won Super Bowl 50?', 'What color was used to emphasize the 50th anniversary of the Super Bowl?', 'What was the theme of Super Bowl 50?', 'What day was the game played on?', 'What is the AFC short for?', 'What was the theme of Super Bowl 50?', 'What does AFC stand for?']


In [12]:
print(dataset[0]['question'] == dataset['question'][0])
print(dataset[10:20]['context'] == dataset['context'][10:20])

True
True


In [13]:
# You can inspect the dataset column names and types 
print("Column names:")
pprint(dataset.column_names)
print("Features:")
pprint(dataset.features)

Column names:
['answers', 'context', 'id', 'question', 'title']
Features:
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
 'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None)}


In [14]:
# Datasets also have shapes informations
print("The number of rows", dataset.num_rows, "also available as len(dataset)", len(dataset))
print("The number of columns", dataset.num_columns)
print("The shape (rows, columns)", dataset.shape)

The number of rows 1057 also available as len(dataset) 1057
The number of columns 5
The shape (rows, columns) (1057, 5)


In [15]:
# Let's print the length of each `context` string in our subset of the dataset
# (10% of the validation i.e. 1057 examples)

dataset.map(lambda example: print(len(example['context']), end=','))

775,

HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))

775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,179,179,179,179,179,179,179,179,179,179,179,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1166,1166,1166,1166,1166,1166,1166,1

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1057
})

In [16]:
# for example in dataset:
#     function(example)

In [17]:
from datasets import logging
logging.set_verbosity_warning()

dataset.map(lambda example: print(len(example['context']), end=','))

775,

HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))

775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,179,179,179,179,179,179,179,179,179,179,179,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1166,1166,1166,1166,1166,1166,1166,1

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1057
})

In [18]:
# Let's keep it verbose for our tutorial though
from datasets import logging
logging.set_verbosity_info()

In [19]:
# Let's add a prefix 'My cute title: ' to each of our titles

def add_prefix_to_title(example):
    example['title'] = 'My cute title: ' + example['title']
    return example

prefixed_dataset = dataset.map(add_prefix_to_title)

print(prefixed_dataset.unique('title'))  # `.unique()` is a super fast way to print the unique elemnts in a column (see the doc for all the methods)

Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/cache-f5b2fe3c179dab04.arrow


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))

Done writing 1057 examples in 921948 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmphwnokt9r.



['My cute title: Super_Bowl_50', 'My cute title: Warsaw']


In [20]:
# Since the input example dict is updated with our function output dict,
# we can actually just return the updated 'title' field
titled_dataset = dataset.map(lambda example: {'title': 'My cutest title: ' + example['title']})

print(titled_dataset.unique('title'))

Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/cache-4e69a5d3bdc51b20.arrow


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))

Done writing 1057 examples in 924062 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmplxeiofv2.



['My cutest title: Super_Bowl_50', 'My cutest title: Warsaw']


In [21]:
# This will remove the 'title' column while doing the update (after having send it the the mapped function so you can use it in your function!)
less_columns_dataset = dataset.map(lambda example: {'new_title': 'Wouhahh: ' + example['title']}, remove_columns=['title'])

print(less_columns_dataset.column_names)
print(less_columns_dataset.unique('new_title'))

Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/cache-80781248ba7ff67f.arrow


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))

Done writing 1057 examples in 915606 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmpv3el4_vz.



['answers', 'context', 'id', 'new_title', 'question']
['Wouhahh: Super_Bowl_50', 'Wouhahh: Warsaw']


In [22]:
# This will add the index in the dataset to the 'question' field
with_indices_dataset = dataset.map(lambda example, idx: {'question': f'{idx}: ' + example['question']},
                                   with_indices=True)

pprint(with_indices_dataset['question'][:5])

Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/cache-b21142204e793319.arrow


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))

Done writing 1057 examples in 911325 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmpbvp764lo.



['0: Which NFL team represented the AFC at Super Bowl 50?',
 '1: Which NFL team represented the NFC at Super Bowl 50?',
 '2: Where did Super Bowl 50 take place?',
 '3: Which NFL team won Super Bowl 50?',
 '4: What color was used to emphasize the 50th anniversary of the Super Bowl?']


In [24]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.0.0-py3-none-any.whl (1.4 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.43.tar.gz (883 kB)
Collecting regex!=2019.12.17
  Downloading regex-2020.11.13-cp36-cp36m-manylinux2014_x86_64.whl (723 kB)
[K     |████████████████████████████████| 723 kB 15.1 MB/s eta 0:00:01
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 50.8 MB/s eta 0:00:01
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.43-py3-none-any.whl size=893259 sha256=482540025452429d47a122f70e51ffafb84e59d1aac4867a76664e5c54734c59
  Stored in directory: /home/ubuntu/.cache/pip/wheels/49/25/98/cdea9c79b2d9a22ccc59540b1784b67f06b633378e97f58da2
Successfully built sacremoses
Installing collected packages: regex, sacremoses, tokenizers, transforme

In [25]:
# Let's import a fast tokenizer that can work on batched inputs
# (the 'Fast' tokenizers in HuggingFace)
from transformers import BertTokenizerFast, logging as transformers_logging

transformers_logging.set_verbosity_warning()

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [26]:
# Now let's batch tokenize our dataset 'context'
encoded_dataset = dataset.map(lambda example: tokenizer(example['context']), batched=True)

print("encoded_dataset[0]")
pprint(encoded_dataset[0], compact=True)

Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/cache-345aae1e66add54d.arrow


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Done writing 1057 examples in 4714929 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmp4g1izzjt.



encoded_dataset[0]
{'answers': {'answer_start': [177, 177, 177],
             'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1],
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football C

In [27]:
# we have added additional columns
pprint(dataset.column_names)

['answers', 'context', 'id', 'question', 'title']


In [28]:
# Let show a more complex processing with the full preparation of the SQuAD dataset
# for training a model from Transformers
def convert_to_features(batch):
    # Tokenize contexts and questions (as pairs of inputs)
    input_pairs = list(zip())
    encodings = tokenizer(batch['context'], batch['question'], truncation=True)

    # Compute start and end tokens for labels
    start_positions, end_positions = [], []
    for i, answer in enumerate(batch['answers']):
        first_char = answer['answer_start'][0]
        last_char = first_char + len(answer['text'][0]) - 1
        start_positions.append(encodings.char_to_token(i, first_char))
        end_positions.append(encodings.char_to_token(i, last_char))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

encoded_dataset = dataset.map(convert_to_features, batched=True)

Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/cache-391a337dc71b242a.arrow


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Done writing 1057 examples in 5081257 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmpcvg3ygx5.





In [29]:
# Now our dataset comprise the labels for the start and end position
# as well as the offsets for converting back tokens
# in span of the original string for evaluation
print("column_names", encoded_dataset.column_names)
print("start_positions", encoded_dataset[:5]['start_positions'])

column_names ['answers', 'attention_mask', 'context', 'end_positions', 'id', 'input_ids', 'question', 'start_positions', 'title', 'token_type_ids']
start_positions [34, 45, 80, 34, 98]


In [30]:
columns_to_return = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']

encoded_dataset.set_format(type='torch', columns=columns_to_return)

# Our dataset indexing output is now ready for being used in a pytorch dataloader
pprint(encoded_dataset[1], compact=True)

Set __getitem__(key) output type to torch for ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'] columns  (when key is int or slice) and don't output other (un-formatted) columns.


{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]),
 'end_positions': tensor(46),
 'input_ids': tensor([  101,  3198,  5308,  1851,  1108,  1126,  1237,  1709,  1342,  1106,
         4959,  1103,  3628,  1104,  1103,  1305,  2289,  1453,   113,  4279,
          114,  1111,  1103,  1410,  1265,   119,  1109,  1237,  2289,  3047,
          113, 10402,   114,  3628,  7068, 14722,  2378,  1103,  1305,  2289,
         3047,   113, 24743,   114,  3628, 

In [31]:
# Note that the columns are not removed from the dataset, just not returned when calling __getitem__
# Similarly the inner type of the dataset is not changed to torch.Tensor, the conversion and filtering is done on-the-fly when querying the dataset
print(encoded_dataset.column_names)

['answers', 'attention_mask', 'context', 'end_positions', 'id', 'input_ids', 'question', 'start_positions', 'title', 'token_type_ids']


In [32]:
# We can remove the formatting with `.reset_format()`
# or, identically, a call to `.set_format()` with no arguments
encoded_dataset.reset_format()

pprint(encoded_dataset[1], compact=True)

Set __getitem__(key) output type to python objects for no columns  (when key is int or slice) and don't output other (un-formatted) columns.


{'answers': {'answer_start': [249, 249, 249],
             'text': ['Carolina Panthers', 'Carolina Panthers',
                      'Carolina Panthers']},
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015

In [33]:
# The current format can be checked with `.format`,
# which is a dict of the type and formatting
pprint(encoded_dataset.format)

{'columns': ['answers',
             'attention_mask',
             'context',
             'end_positions',
             'id',
             'input_ids',
             'question',
             'start_positions',
             'title',
             'token_type_ids'],
 'format_kwargs': {},
 'output_all_columns': False,
 'type': None}


In [34]:
!pip install transformers



In [35]:
import torch 
from datasets import load_dataset
from transformers import BertTokenizerFast

# Load our training dataset and tokenizer
dataset = load_dataset('squad')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def get_correct_alignement(context, answer):
    """ Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. """
    gold_text = answer['text'][0]
    start_idx = answer['answer_start'][0]
    end_idx = start_idx + len(gold_text)
    if context[start_idx:end_idx] == gold_text:
        return start_idx, end_idx       # When the gold label position is good
    elif context[start_idx-1:end_idx-1] == gold_text:
        return start_idx-1, end_idx-1   # When the gold label is off by one character
    elif context[start_idx-2:end_idx-2] == gold_text:
        return start_idx-2, end_idx-2   # When the gold label is off by two character
    else:
        raise ValueError()

# Tokenize our training dataset
def convert_to_features(example_batch):
    # Tokenize contexts and questions (as pairs of inputs)
    encodings = tokenizer(example_batch['context'], example_batch['question'], truncation=True)

    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methods.
    start_positions, end_positions = [], []
    for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):
        start_idx, end_idx = get_correct_alignement(context, answer)
        start_positions.append(encodings.char_to_token(i, start_idx))
        end_positions.append(encodings.char_to_token(i, end_idx-1))
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

encoded_dataset = dataset.map(convert_to_features, batched=True)

# Format our dataset to outputs torch.Tensor to train a pytorch model
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
encoded_dataset.set_format(type='torch', columns=columns)

# Instantiate a PyTorch Dataloader around our dataset
# Let's do dynamic batching (pad on the fly with our own collate_fn)
def collate_fn(examples):
    return tokenizer.pad(examples, return_tensors='pt')
dataloader = torch.utils.data.DataLoader(encoded_dataset['train'], collate_fn=collate_fn, batch_size=8)

Checking /home/ubuntu/.cache/huggingface/datasets/e0bdcacbb45db988fbbf4f0e0974c8cb2bf0281198e77c267a9ae22d1214616a.85f43de978b9b25921cb78d7a2f2b350c04acdbaedb9ecb5f7101cd7c0950e68.py for additional imports.
Lock 139703862266232 acquired on /home/ubuntu/.cache/huggingface/datasets/e0bdcacbb45db988fbbf4f0e0974c8cb2bf0281198e77c267a9ae22d1214616a.85f43de978b9b25921cb78d7a2f2b350c04acdbaedb9ecb5f7101cd7c0950e68.py.lock
Found main folder for dataset https://raw.githubusercontent.com/huggingface/datasets/1.1.3/datasets/squad/squad.py at /home/ubuntu/.cache/huggingface/modules/datasets_modules/datasets/squad
Found specific version folder for dataset https://raw.githubusercontent.com/huggingface/datasets/1.1.3/datasets/squad/squad.py at /home/ubuntu/.cache/huggingface/modules/datasets_modules/datasets/squad/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41
Found script file from https://raw.githubusercontent.com/huggingface/datasets/1.1.3/datasets/squad/squad.py to /home/ubuntu/

HBox(children=(FloatProgress(value=0.0, max=88.0), HTML(value='')))




Done writing 87599 examples in 452536620 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmpkqecat4h.
Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/cache-035645402298610f.arrow


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




Done writing 10570 examples in 56664663 bytes /home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41/tmpkopprowq.
Set __getitem__(key) output type to torch for ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'] columns  (when key is int or slice) and don't output other (un-formatted) columns.


In [36]:
# Let's load a pretrained Bert model and a simple optimizer
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained('distilbert-base-cased', return_dict=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-cased were not used when initializing BertForQuestionAnswering: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin

In [None]:
# # Now let's train our model
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# model.train().to(device)
# for i, batch in enumerate(dataloader):
#     batch.to(device)
#     outputs = model(**batch)
#     loss = outputs.loss
#     loss.backward()
#     optimizer.step()
#     model.zero_grad()
#     print(f'Step {i} - loss: {loss:.3}')
#     if i > 5:
#         break

In [37]:
from datasets import load_metric
sacrebleu_metric = load_metric('sacrebleu')
 
# If you only have a single iteration, you can easily compute the score like this
predictions = model(inputs)
score = sacrebleu_metric.compute(predictions, references)
 
# If you have a loop, you can "add" your predictions and references at each iteration instead of having to save them yourself (the metric object store them efficiently for you)
for batch in dataloader:
    model_input, targets = batch
    predictions = model(model_inputs)
    sacrebleu_metric.add_batch(predictions, targets)
score = sacrebleu_metric.compute()  # Compute the score from all the stored predictions/references

Lock 139703876655032 acquired on /home/ubuntu/.cache/huggingface/datasets/f6b8871d16bf6d7f7f9f0d1e942f224dd6b8e3806e66caca3d8292a34a3f3b7e.90615c613092df9760941da0e77d044445c7ed8d9bf8d99e9319993d91df4055.py.lock
https://raw.githubusercontent.com/huggingface/datasets/1.1.3/metrics/sacrebleu/sacrebleu.py not found in cache or force_download set to True, downloading to /home/ubuntu/.cache/huggingface/datasets/tmpkrnr_6se


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1880.0, style=ProgressStyle(description…

storing https://raw.githubusercontent.com/huggingface/datasets/1.1.3/metrics/sacrebleu/sacrebleu.py in cache at /home/ubuntu/.cache/huggingface/datasets/f6b8871d16bf6d7f7f9f0d1e942f224dd6b8e3806e66caca3d8292a34a3f3b7e.90615c613092df9760941da0e77d044445c7ed8d9bf8d99e9319993d91df4055.py
creating metadata file for /home/ubuntu/.cache/huggingface/datasets/f6b8871d16bf6d7f7f9f0d1e942f224dd6b8e3806e66caca3d8292a34a3f3b7e.90615c613092df9760941da0e77d044445c7ed8d9bf8d99e9319993d91df4055.py
Lock 139703876655032 released on /home/ubuntu/.cache/huggingface/datasets/f6b8871d16bf6d7f7f9f0d1e942f224dd6b8e3806e66caca3d8292a34a3f3b7e.90615c613092df9760941da0e77d044445c7ed8d9bf8d99e9319993d91df4055.py.lock
Checking /home/ubuntu/.cache/huggingface/datasets/f6b8871d16bf6d7f7f9f0d1e942f224dd6b8e3806e66caca3d8292a34a3f3b7e.90615c613092df9760941da0e77d044445c7ed8d9bf8d99e9319993d91df4055.py for additional imports.





ImportError: To be able to use this metric, you need to install the following dependencies['sacrebleu'] using 'pip install sacrebleu' for instance'