# Multiple Sequences

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Checkpoint name passed to `from_pretrained` for both the tokenizer and the model
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [3]:
# Initialize the tokenizer and the sequence-classification model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [4]:
# Example sentence that is tokenized and fed to the model in the cells below
sequence = "I've been waiting for a HuggingFace course my whole life."

In [5]:
# Tokenize the raw string; the resulting tokens are mapped to ids in the next cell
tokens = tokenizer.tokenize(sequence)

In [6]:
# Convert tokens to ids
ids = tokenizer.convert_tokens_to_ids(tokens)

# Convert the ids to a tensor. A flat list of ids gives a 1-D tensor,
# i.e. a single sequence with no batch dimension.
input_ids = torch.tensor(ids)

In [7]:
# This fails because the model expects a batch of sequences (a 2-D tensor),
# while `input_ids` is a single 1-D sequence. The error itself is what this
# cell demonstrates, so it is caught and displayed rather than left to abort
# a top-to-bottom run of the notebook.
try:
    model(input_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [12]:
# Inspect the tensor: it is 1-D (one flat sequence of ids, no batch dimension)
print(input_ids)

tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
         2026,  2878,  2166,  1012])


In [9]:
# Calling the tokenizer directly returns a dict of plain lists with 'input_ids'
# and 'attention_mask'. Its input_ids are wrapped in two extra ids (101 first,
# 102 last) that the manual tokenize / convert_tokens_to_ids path does not add.
# NOTE(review): no batch dimension is added by this call; presumably the intent
# was tokenizer(sequence, return_tensors="pt"), which returns 2-D tensors — confirm.
tokenizer(sequence)

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Try again, this time wrapping the ids in an outer list so that the tensor
# gets a leading batch dimension. `torch`, `model` and `ids` from the cells
# above are reused instead of being imported, loaded and recomputed again.
input_ids = torch.tensor([ids])  # 2-D: a batch holding the single sequence
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


## Batching

In [16]:
# Batching is the process of sending several sequences through the model at once.
# Here the same list of ids is used twice to form a batch of two; at this point
# it is still a plain Python list, not a tensor.
batched_ids = [ids,ids]
type(batched_ids)

list

In [15]:
# Turn the nested list into a 2-D tensor and run both (identical) sequences
# through the model in a single call.
input_batched_ids = torch.tensor(batched_ids)
output = model(input_batched_ids)
# Printing the whole output object shows the logits together with the
# loss / hidden_states / attentions fields (all None here).
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## Padding

In [17]:
# A ragged nested list (one row of length 3, one of length 2) cannot be
# converted to a tensor.
batched_ids = [[200] * 3, [200] * 2]

In [18]:
# Rows of different lengths cannot form a rectangular tensor, so this raises.
# The error is the point of the cell: it is caught and displayed so that the
# notebook still runs from top to bottom.
try:
    torch.tensor(batched_ids)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: expected sequence of length 3 at dim 1 (got 2)

In [19]:
# Pad the shorter row with a placeholder id so that both rows have the same
# length and the nested list becomes rectangular.
padding_id = 100

batched_ids = [[200] * 3, [200] * 2 + [padding_id]]
torch.tensor(batched_ids)

tensor([[200, 200, 200],
        [200, 200, 100]])

The padding token ID can be found in `tokenizer.pad_token_id`.

In [20]:
# Compare the logits of each sequence run on its own with the logits of the
# same two sequences batched together, where the shorter one is padded with
# the tokenizer's pad token id. The `model` loaded earlier in the notebook is
# reused; loading the same checkpoint again is unnecessary.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
# No attention mask is passed here, so the second row's logits differ from
# those of sequence2_ids evaluated on its own.
print(model(torch.tensor(batched_ids)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


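In the output above, the logits for the second row of the batched IDs (`[1.3374, -1.2163]`) differ from the logits of the same sequence run on its own (`[0.5803, -0.4125]`). This is because the transformer contextualizes the padding token as well. We can use an attention mask to tell the model to ignore the padding.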

## Attention Mask

Attention masks have the same shape as the input IDs but contain only 1s and 0s: 1 marks a token to attend to and 0 marks a token to ignore.

In [21]:
# Same padded batch as in the previous section, plus a mask of the same shape:
# 1 marks a token to attend to, 0 marks the padding position to ignore.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

In [22]:
# Pass the attention mask alongside the padded batch. With the padding
# position masked out, the second row's logits match those obtained for the
# unpadded two-token sequence on its own.
output = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(output.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


## Longer Sequences
Transformer models will crash if the sequence length of the inputs exceeds the maximum sequence length supported by the model. In these cases, either use a different model that supports longer sequences or truncate the sequence.

In [24]:
# Truncate the token ids (not the characters of the raw string) to the
# maximum input length, which is read from the tokenizer instead of being
# left as an undefined placeholder name.
# Alternative: tokenizer(sequence, truncation=True, max_length=max_sequence_length)
# lets the tokenizer perform the truncation itself.
max_sequence_length = tokenizer.model_max_length
truncated_ids = ids[:max_sequence_length]