## 1. Load and save a tokenizer

In [2]:
# Load the pretrained tokenizer published under the "bert-base-cased" checkpoint name.
# NOTE: `tokenizer` is rebound to a different checkpoint's tokenizer later in this notebook.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
# Save the tokenizer's files into a local directory.
# The call returns a tuple of the written file paths, which is displayed as the cell output.
tokenizer.save_pretrained("directory_on_my_computer")

('directory_on_my_computer/tokenizer_config.json',
 'directory_on_my_computer/special_tokens_map.json',
 'directory_on_my_computer/vocab.txt',
 'directory_on_my_computer/added_tokens.json',
 'directory_on_my_computer/tokenizer.json')

## 2. Encoding and Decoding

In [11]:
# Break the text into tokens (strings, not ids yet).
# In the output below, "Transformer" is split into two pieces and the second one
# carries a "##" prefix.
tokens = tokenizer.tokenize("Using a Transformer network is simple")
print(tokens)  

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [12]:
# Convert each token string into its integer id (one id per token, same order)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [15]:
# Convert the ids back into a single string.
# The "Trans" / "##former" pieces come back joined as "Transformer" in the output below.
decoded_string = tokenizer.decode(ids)
print(decoded_string)

Using a Transformer network is simple


## 3. Handling multiple Sequences

#### 3.1. Converting one sequence to one batch

In [3]:
import torch
from transformers import AutoModelForSequenceClassification

In [50]:
# Load the tokenizer and the sequence-classification model from the same checkpoint
# so that the ids produced by the tokenizer match what the model was trained on.
# NOTE: this rebinds `tokenizer`, replacing the "bert-base-cased" tokenizer loaded in section 1.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [5]:
# Single example sentence reused by the cells in section 3.1
sequence = "I've been waiting for a HuggingFace course my whole life."

In [11]:
# Step 1: split the sentence into tokens
tokens = tokenizer.tokenize(sequence)
print(tokens)

# Step 2: map every token to its integer id
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# Step 3: turn the flat list of ids into a PyTorch tensor.
# A flat list gives a 1-D tensor: there is no batch dimension here.
input_ids = torch.tensor(ids)
print(input_ids)

['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
         2026,  2878,  2166,  1012])


In [13]:
# Demonstration of a failure: `input_ids` is a 1-D tensor (a single sequence with no
# batch dimension), but the model expects a 2-D input of shape (batch_size, sequence_length).
# The resulting IndexError is caught and printed so that the error stays visible
# while "Restart Kernel -> Run All" can still continue past this cell.
try:
    model(input_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: too many indices for tensor of dimension 1

In [17]:
# Calling the tokenizer directly with return_tensors="pt" returns PyTorch tensors
# instead of plain Python lists.
# Compared with the manual tokenize/convert path above, the output below has two extra
# ids (101 at the start, 102 at the end) — presumably the checkpoint's special
# start/end tokens; confirm against the tokenizer's special tokens map.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs["input_ids"])  # 2-D tensor: one row, i.e. a batch of size 1

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [19]:
# Add a batch dimension to the ids so the model accepts them.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# unsqueeze(0) inserts a leading axis of size 1, turning the 1-D tensor of ids
# into a 2-D tensor holding a single row (a batch with one sequence).
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

# Forward pass on the one-sequence batch; the raw class scores are in `.logits`
output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


#### 3.2. Testing with two sequences

In [22]:
# Build a batch of two rows by repeating the same sequence of ids.
# Both rows refer to the same `ids` list object.
batched_ids = [ids] * 2
print(batched_ids)

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]


In [26]:
# Both rows have the same length, so the nested list converts cleanly to a 2-D tensor
input_ids = torch.tensor(batched_ids) 
print("Input IDs:", input_ids) # the shape of the tensor is (batch_size, sequence_length)

# One row of logits is returned per sequence in the batch; since both rows are the
# same sentence, both rows of logits are equal in the output below.
output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


#### 3.3. Padding the input

In [28]:
# These rows have different lengths (3 ids and 2 ids), so the nested list is not
# rectangular and cannot be converted to a tensor as-is. Padding is needed to
# bring the shorter row up to the length of the longer one.
batched_ids = [[200] * 3, [200] * 2]

In [33]:
# The id to use for padding is exposed by the tokenizer as `tokenizer.pad_token_id`.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Each sequence on its own, already wrapped as a batch of size 1
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# The same two sequences in one rectangular batch: the shorter row is padded on the right
batched_ids = [
    [*sequence1_ids[0]],
    [*sequence2_ids[0], tokenizer.pad_token_id],
]

# Print the logits for: sequence 1 alone, sequence 2 alone, then the padded batch
for candidate_ids in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(candidate_ids)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


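This is wrong. The logits in the second row of the batched output (`[1.3374, -1.2163]`) should be identical to the logits obtained when the second sequence is passed through the model on its own (`[0.5803, -0.4125]`), but they are not. This is because the key feature of Transformer models is that attention layers contextualize each token, and by default this includes the padding tokens, so the padding changes the result. To avoid this, we need to pass an attention mask that tells the model to ignore the padding tokens.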

#### 3.4. Adding attention masks

Exercise: Apply the tokenization manually on the two sentences used in section 2 (“I’ve been waiting for a HuggingFace course my whole life.” and “I hate this so much!”). Pass them through the model and check that you get the same logits as in section 2. Now batch them together using the padding token, then create the proper attention mask. Check that you obtain the same results when going through the model!

In [77]:
# The two sentences from the exercise; they tokenize to different lengths,
# so batching them requires padding.
sequence1 = "I've been waiting for a HuggingFace course my whole life."
sequence2 = "I hate this so much!"

In [78]:
# Manually tokenize both sentences, then map their tokens to ids.
# No padding or batching happens in this cell.
tokens_sequence1, tokens_sequence2 = (
    tokenizer.tokenize(sentence) for sentence in (sequence1, sequence2)
)
ids_sequence1, ids_sequence2 = (
    tokenizer.convert_tokens_to_ids(sentence_tokens)
    for sentence_tokens in (tokens_sequence1, tokens_sequence2)
)

print("Token IDs Sentence 1:", ids_sequence1)
print("Token IDs Sentence 2:", ids_sequence2)

Token IDs Sentence 1: [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
Token IDs Sentence 2: [1045, 5223, 2023, 2061, 2172, 999]


In [79]:
# Length of the longest sequence: every row of the batch is padded up to this length
sequence_length = max(map(len, (ids_sequence1, ids_sequence2)))
print(sequence_length)

# Number of padding positions each sequence needs (0 for the longest one)
pad_count_sequence1 = sequence_length - len(ids_sequence1)
pad_count_sequence2 = sequence_length - len(ids_sequence2)

# Right-pad BOTH sequences with the tokenizer's padding id
# (the longest sequence gets zero padding ids appended, so it is unchanged)
padded_ids_sequence1 = ids_sequence1 + [tokenizer.pad_token_id] * pad_count_sequence1
padded_ids_sequence2 = ids_sequence2 + [tokenizer.pad_token_id] * pad_count_sequence2

# Attention masks: 1 marks a real token, 0 marks a padding position
attention_mask_sequence1 = [1] * len(ids_sequence1) + [0] * pad_count_sequence1
attention_mask_sequence2 = [1] * len(ids_sequence2) + [0] * pad_count_sequence2

for row in (
    padded_ids_sequence1,
    padded_ids_sequence2,
    attention_mask_sequence1,
    attention_mask_sequence2,
):
    print(row)

14
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 2061, 2172, 999, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [80]:
# Combine the padded sequences into a batch.
# Both rows were padded to the same length, so the nested lists form a rectangular
# 2-D tensor; the attention mask has exactly the same shape as the ids.
batched_ids = torch.tensor([padded_ids_sequence1, padded_ids_sequence2])
attention_mask = torch.tensor([attention_mask_sequence1, attention_mask_sequence2])

# Print the tensor defined in this cell (`batched_ids`); the previously printed name
# `batch_ids` is never defined in this notebook and fails on a fresh kernel.
print(batched_ids)
print(attention_mask)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  5223,  2023,  2061,  2172,   999,     0,     0,     0,     0,
             0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])


In [82]:
# Run the padded batch through the model together with its attention mask, so that
# the padding positions (mask value 0) are ignored.
# Uses `batched_ids`, the tensor built in the previous cell; the name `batch_ids`
# is never defined in this notebook and fails on a fresh kernel.
# In the output below, the first row matches the logits obtained earlier for
# sentence 1 on its own.
outputs = model(batched_ids, attention_mask=attention_mask)
print(outputs.logits)

tensor([[-2.7276,  2.8789],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)
