In [1]:
!pip install transformers



## Converting from:
$$\text{Sentence} \rightarrow \text{Tokens} \rightarrow \text{Input IDs} \rightarrow \text{Model} \rightarrow \text{Output IDs} \rightarrow \text{Sentence}$$

In [11]:
# Two example sentences used as raw input text.
raw1, raw2 = (
    "I’ve been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
)

In [13]:
from transformers import AutoTokenizer

# Load the "bert-base-cased" tokenizer and split each raw sentence into tokens.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokens1, tokens2 = (tokenizer.tokenize(text) for text in (raw1, raw2))

print(tokens1)
print(tokens2)

['I', '’', 've', 'been', 'waiting', 'for', 'a', 'Hu', '##gging', '##F', '##ace', 'course', 'my', 'whole', 'life', '.']
['I', 'hate', 'this', 'so', 'much', '!']


In [17]:
# Map every token to its integer id in the tokenizer's vocabulary.
ids1, ids2 = map(tokenizer.convert_tokens_to_ids, (tokens1, tokens2))

print(ids1, ids2, sep="\n")

[146, 787, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119]
[146, 4819, 1142, 1177, 1277, 106]


In [19]:
# Turn the id sequences back into readable text.
decode1, decode2 = (tokenizer.decode(seq) for seq in (ids1, ids2))

print(decode1, decode2, sep="\n")

I ’ ve been waiting for a HuggingFace course my whole life.
I hate this so much!


## Handling multiple sequences (*as expected by the models*):

> Models expect their inputs as a `torch.Tensor` holding a *batch* of sequences in a rectangular shape `(m, n)`, so shorter sequences must be `padded` to the length of the longest one.

In [48]:
import torch
from transformers import AutoModelForSequenceClassification

# Checkpoint name shared by the tokenizer and the sequence-classification model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Both objects are loaded from the same checkpoint. Note that `tokenizer`
# replaces the tokenizer created earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [49]:
# Same two example sentences, re-encoded with the tokenizer of the new checkpoint.
raw1, raw2 = (
    "I’ve been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
)

# Tokenize each sentence, then convert its tokens to vocabulary ids.
ids1, ids2 = (
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
    for sentence in (raw1, raw2)
)

In [50]:
# Demonstration of a failure: the model is called directly on ids1 as a 1-D
# tensor (no batch dimension). The resulting IndexError is the point of this
# cell, so it is caught and printed instead of aborting the notebook; this
# keeps "Restart Kernel -> Run All" working.
try:
    output1 = model(torch.tensor(ids1))
    print(output1.logits)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: too many indices for tensor of dimension 1

In [51]:
# The model expects a 2-D, batched input of shape (m, n). Here the same
# sequence is repeated twice to form a batch of two identical rows.
output1 = model(torch.tensor([ids1] * 2))
print(output1.logits)

tensor([[-2.5720,  2.6852],
        [-2.5720,  2.6852]], grad_fn=<AddmmBackward0>)


In [52]:
# Naive attempt to batch the two sentences together. ids1 and ids2 have
# different lengths, so torch.tensor cannot build a rectangular tensor and
# raises ValueError. The error is the point of this cell, so it is caught and
# printed instead of aborting the notebook; this keeps
# "Restart Kernel -> Run All" working.
try:
    outputs = model(torch.tensor([ids1, ids2]))
    print(outputs.logits)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: expected sequence of length 14 at dim 1 (got 6)

In [53]:
# The batching attempt above fails because the two id lists have different
# lengths, while the model needs a rectangular input of shape (m, n).
print(len(ids1), len(ids2), sep="\n")

14
6


In [54]:
# One workaround: run each sentence separately as a batch of size one.
# unsqueeze(0) adds the leading batch dimension.
out1 = model(torch.tensor(ids1).unsqueeze(0))
print(out1.logits)

tensor([[-2.5720,  2.6852]], grad_fn=<AddmmBackward0>)


In [55]:
# Second sentence on its own, again as a batch of size one.
out2 = model(torch.tensor(ids2).unsqueeze(0))
print(out2.logits)

tensor([[ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)


In [56]:
# Show both id sequences, one per line.
print(ids1, ids2, sep="\n")

[1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 2061, 2172, 999]


In [57]:
# Collect both id sequences in a single list. The rows still have different
# lengths (14 and 6), so this is not yet rectangular.
batched_ids = []
batched_ids.append([1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012])
batched_ids.append([1045, 5223, 2023, 2061, 2172, 999])

In [60]:
# To pad the 2nd (shorter) sentence we need the id of the tokenizer's padding
# token; look it up and display it.
pad_id = tokenizer.pad_token_id
pad_id

0

In [62]:
# Preview of a 14-element row of 0/1 flags: six 1s followed by eight 0s
# (6 + 8 = 14, the length of the longer sentence).
[1] * 6 + [0] * 8

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [63]:
# Rectangular batch: the 6-token sentence is extended with eight padding ids
# (0) so that both rows have length 14.
batched_ids = [
    [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
    [1045, 5223, 2023, 2061, 2172, 999] + [0] * 8,
]

# Attention mask: 1 for real tokens, 0 for padding positions, since padding
# tokens carry no meaning and should receive no attention.
attention_mask = [[1] * 14, [1] * 6 + [0] * 8]

In [64]:
# Run the padded batch through the model, passing the attention mask
# alongside the ids, then display the logits.
out = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
out.logits

tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)

In [65]:
# Show the two original sentences again for comparison with the logits above.
print(raw1, raw2, sep="\n")

I’ve been waiting for a HuggingFace course my whole life.
I hate this so much!


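- The first sentence is classified as `positive` (its second logit is the larger one), as expected.
- The second sentence is classified as `negative` (its first logit is the larger one), as expected.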