In [8]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Two example sentences that will form one small training batch
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

# The tokenizer and the classification model are loaded from the same checkpoint
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Encode both sentences together as padded/truncated PyTorch tensors
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Add one label per sentence to the batch
batch["labels"] = torch.tensor([1, 1])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Import the optimizer explicitly: a wildcard import would silently shadow
# other names already in the notebook namespace (including an earlier AdamW).
from torch.optim import AdamW

# One manual optimization step on the batch built above.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss  # ** unpacks the batch dict into keyword arguments
loss.backward()
optimizer.step()
# Clear the gradients so re-running this cell does not accumulate them
# on top of the ones left over from the previous run.
optimizer.zero_grad()

In [11]:
from datasets import load_dataset
# Load the "mrpc" configuration of the "glue" dataset from the Hub
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

* `raw_datasets` is a DatasetDict object holding the train, validation, and test sets
* Each split contains several columns (sentence1, sentence2, label, idx)

In [13]:
# Keep a handle on the training split, then look at a single row by position
# (the row's own 'idx' field is a separate column, not the position)
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[404]

{'sentence1': 'Both Blair and Bush have faced accusations that they manipulated intelligence about weapons of mass destruction to make the case for military action .',
 'sentence2': 'At home , the premier has faced accusations that he overplayed intelligence about weapons of mass destruction to make the case for war .',
 'label': 1,
 'idx': 450}

In [14]:
# `features` describes each column of the dataset (its type and, for labels, the class names)
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

## Process data

In [15]:
from transformers import AutoTokenizer

# Same checkpoint name as used earlier in the notebook; the tokenizer is rebuilt for this section
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenizing each sentence column on its own would look like this (kept disabled):
# tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
# tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [24]:
# Keep the raw strings and their encoding under separate names, so `inputs`
# always refers to the tokenizer output and never to the list of strings.
sentences = ["This is the first sentence.", "second sentence bruh."]
inputs = tokenizer(sentences)  # a list of strings is encoded as independent sequences
print(inputs)
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))  # decode the integers back to text

{'input_ids': [[101, 2023, 2003, 1996, 2034, 6251, 1012, 102], [101, 2117, 6251, 7987, 27225, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
[CLS] this is the first sentence. [SEP]
[CLS] second sentence bruh. [SEP]


In [25]:
# Passing a second string as `text_pair` encodes both strings as ONE sentence pair
inputs = tokenizer("This is the first sentence.", text_pair="second sentence bruh.")
print(inputs)

# Decode the integers back to text
print(tokenizer.decode(inputs["input_ids"]))

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2117, 6251, 7987, 27225, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] this is the first sentence. [SEP] second sentence bruh. [SEP]


In [26]:
# Map each id in `inputs` back to its token string (word pieces such as '##uh' stay separate)
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'second',
 'sentence',
 'br',
 '##uh',
 '.',
 '[SEP]']

In [33]:
# Show the DatasetDict again (splits, columns, row counts)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [29]:
# Tokenize every training sentence pair in a single call.
# The result is an in-memory dictionary covering the whole split.
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

* Disadvantage: the tokenizer returns a dictionary (with the keys input_ids, attention_mask, and token_type_ids, and values that are lists of lists)
* It returns everything at once, so this only works if you have enough RAM to hold the whole dataset during tokenization
* Datasets from the Datasets library are Apache Arrow files stored on disk; only the samples you ask for are kept loaded in memory


* Dataset.map() applies a function on each element of the dataset

In [35]:
# Define the processing function; Dataset.map() will apply it to each element of the dataset

def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` entries of ``example`` as sentence pairs.

    Args:
        example: Mapping with ``'sentence1'`` and ``'sentence2'`` keys; both
            values are handed to the module-level ``tokenizer`` unchanged.

    Returns:
        Whatever ``tokenizer`` returns for the pair, called with
        ``truncation=True``. No padding is requested here.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

What does it do?
* Takes a dictionary and returns a new dictionary with the keys input_ids, attention_mask, and token_type_ids
* Also works if the example dictionary contains several samples (each key is then a list of sentences)
    * This lets us use batched=True
* Padding is left out here because padding the whole dataset to one length is not efficient
  * Better to pad when building a batch (pad to that batch's maximum length)
  

In [36]:
# Apply tokenize_function to all splits; batched=True hands it several examples per call
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map: 100%|████████████████████████| 3668/3668 [00:00<00:00, 22286.37 examples/s]
Map: 100%|██████████████████████████| 408/408 [00:00<00:00, 17303.62 examples/s]
Map: 100%|████████████████████████| 1725/1725 [00:00<00:00, 22482.47 examples/s]


In [37]:
# Inspect the mapped DatasetDict to see which columns each split now has
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

* Observe that the new fields (input_ids, token_type_ids, attention_mask) were added as new keys
* Can use multiprocessing by setting **num_proc**

* Collate function - puts the samples together inside a batch
  * The default is a function that converts the samples to tensors and concatenates them
* It is passed as an argument to **DataLoader**
* Do the padding here, per batch, to avoid over-long inputs; this leads to a speedup
  * Except on TPUs, which prefer fixed shapes

In [38]:
from transformers import DataCollatorWithPadding

# The collator takes the tokenizer as an argument so it can find out the pad token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [42]:
# Take the first 8 training rows and drop the columns we don't need
# (idx, sentence1 and sentence2)
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}
# Length of each sample in this batch
[len(ids) for ids in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [43]:
# Collate the samples into one batch, then show the shape of each resulting tensor
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}