In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 

# Load the tokenizer and the model from the SAME checkpoint so the ids produced
# by the tokenizer are guaranteed to index the vocabulary the model was trained on.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Manual pipeline: text -> tokens -> vocabulary ids -> tensor.
tokens = tokenizer.tokenize("Hello, my dog is cute")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(input_ids)  # 1-D tensor: a single sequence with no batch dimension

# This call is expected to fail. `input_ids` IS a tensor, but it is 1-D, whereas
# the model expects a batch of sequences (a 2-D tensor). The exception is the
# demonstration, so it is caught and printed instead of aborting "Run All".
# NOTE(review): the exact exception type may vary across transformers versions.
try:
    model(input_ids)
except (IndexError, ValueError, RuntimeError) as err:
    print(f"{type(err).__name__}: {err}")

In [5]:
# Calling the tokenizer object directly (instead of tokenize + convert_tokens_to_ids)
# and asking for PyTorch tensors yields ids the model accepts as-is.
encoding = tokenizer("Hello, my dog is cute", return_tensors="pt")
input_ids = encoding["input_ids"]
model(input_ids)  # no error this time

SequenceClassifierOutput(loss=None, logits=tensor([[-4.0687,  4.3669]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [18]:
# Same manual pipeline as before, but with the batch axis added explicitly:
# the 1-D id tensor becomes a batch containing exactly one sequence.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(input_ids).unsqueeze(0)  # shape: (1, sequence_length)
output = model(input_ids)  # accepted now that a batch dimension exists

print(output.logits)

tensor([[-1.7674,  1.9746]], grad_fn=<AddmmBackward0>)


_Batching is the act of sending multiple sentences through the model, all at once. If you only have one sentence, you can just build a batch with a single sequence._

In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch 

# Use one checkpoint for both tokenizer and model so token ids and the model's
# embedding table are guaranteed to refer to the same vocabulary.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "Hello, my dog is cute"

# Manual tokenization: text -> tokens -> vocabulary ids (a plain Python list).
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

In [20]:
# A batch made of the same sequence twice: both rows have equal length,
# so the nested list converts to a rectangular tensor without padding.
batched_ids = [ids] * 2

batched_input_ids = torch.tensor(batched_ids)
output = model(batched_input_ids)

# Each row should equal the logits obtained for the single sequence earlier.
print(output.logits)

tensor([[-1.7674,  1.9746],
        [-1.7674,  1.9746]], grad_fn=<AddmmBackward0>)


In [26]:
# Padding a batch.
#
# A tensor must be rectangular, so every sequence in a batch needs the same
# length. Shorter sequences are extended with a padding token id until they
# match the longest one.

# Illustrative placeholder value; a real pipeline takes the id from the tokenizer.
PADDING_TOKEN_ID = 100

batched_ids = [
    [10017, 10017, 10017],
    [10017, 10017],
]  # ragged: rows have different lengths

# Right-pad every row up to the length of the longest row.
longest = max(len(row) for row in batched_ids)
batched_ids = [
    row + [PADDING_TOKEN_ID] * (longest - len(row))
    for row in batched_ids
]  # rectangular: all rows now have the same length

['casual', 'casual', 'casual']

In [27]:
# The classification head of this checkpoint is newly (randomly) initialized,
# so seed torch before loading; this should make the printed logits
# reproducible across kernel restarts.
torch.manual_seed(0)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Two sequences of different lengths, each wrapped as its own single-item batch.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# The same two sequences in one batch; the shorter one is padded with the
# tokenizer's own padding id so the batch is rectangular.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# No attention mask is passed on purpose: the point of this cell is to show that
# the padded row of the batch does NOT reproduce the logits of sequence 2 alone.
print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 0.3784, -0.3921]], grad_fn=<AddmmBackward0>)
tensor([[ 0.2837, -0.1771]], grad_fn=<AddmmBackward0>)
tensor([[ 0.3784, -0.3921],
        [ 0.3961, -0.4254]], grad_fn=<AddmmBackward0>)


There’s something wrong with the logits in our batched predictions: the second row should be the same as the logits 
for the second sentence, but we’ve got completely different values! This is because the key feature of Transformer 
models is attention layers that contextualize each token. These will take into account the padding tokens since 
they attend to all of the tokens of a sequence. To get the same result when passing individual sentences of 
different lengths through the model or when passing a batch with the same sentences and padding applied, 

__we need to tell those attention layers to ignore the padding tokens. This is done by using an attention mask.__

In [29]:
# Same padded batch as before, now accompanied by an attention mask.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# One entry per token: 1 = attend to this token, 0 = ignore it (padding).
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

# Reference logits for each sequence run on its own (no padding involved).
sequence1_logits = model(torch.tensor(sequence1_ids)).logits
sequence2_logits = model(torch.tensor(sequence2_ids)).logits

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))

print(sequence1_logits)
print(sequence2_logits)
print(outputs.logits)

# With the mask applied, each batched row must match the logits of the
# corresponding sequence run individually. Checked rather than just claimed;
# a small tolerance absorbs floating-point differences between the two paths.
assert torch.allclose(outputs.logits[0], sequence1_logits[0], atol=1e-4)
assert torch.allclose(outputs.logits[1], sequence2_logits[0], atol=1e-4)

tensor([[ 0.3784, -0.3921]], grad_fn=<AddmmBackward0>)
tensor([[ 0.2837, -0.1771]], grad_fn=<AddmmBackward0>)
tensor([[ 0.3784, -0.3921],
        [ 0.2837, -0.1771]], grad_fn=<AddmmBackward0>)
