In [34]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; passed to AutoTokenizer.from_pretrained below.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [35]:
# Instantiate the tokenizer that belongs to `checkpoint`; every later cell reuses this object.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [36]:
# Two sentences of different lengths, tokenized together as one batch.
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# padding=True pads the shorter sentence so both rows have the same length;
# return_tensors="pt" returns PyTorch tensors instead of plain Python lists.
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

In [37]:
# Inspect the batch: the shorter sentence is padded with id 0, and its
# attention_mask is 0 at exactly those padded positions.
model_inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [38]:
# Split the text into token strings only (no ids yet).
# NOTE(review): `sequences` is a list of two sentences, and the ids displayed a
# few cells below run straight from the first sentence's tokens into the
# second's with no boundary between them. Confirm this flattening is intended;
# tokenizing each sentence separately would keep them apart.
tokens = tokenizer.tokenize(sequences)

In [39]:
# Map each token string to its integer vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens)

In [40]:
# Display the converted ids. Unlike `model_inputs` above, nothing is padded here
# and the special-token ids 101 / 102 are absent.
ids

[1045,
 1005,
 2310,
 2042,
 3403,
 2005,
 1037,
 17662,
 12172,
 2607,
 2026,
 2878,
 2166,
 1012,
 2061,
 2031,
 1045,
 999]

In [41]:
# Special tokens: compare calling the tokenizer directly with the manual route.

sequence = "I've been waiting for huggingface course all my life."

# Calling the tokenizer directly adds special tokens: the printed ids start with
# 101 and end with 102. This also rebinds `model_inputs` from the earlier batch.
model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

[101, 1045, 1005, 2310, 2042, 3403, 2005, 17662, 12172, 2607, 2035, 2026, 2166, 1012, 102]


In [42]:
# Manual route: tokenize, then convert tokens to ids. The result matches the
# ids printed by the direct tokenizer call, minus the leading 101 and trailing 102.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[1045,
 1005,
 2310,
 2042,
 3403,
 2005,
 17662,
 12172,
 2607,
 2035,
 2026,
 2166,
 1012]

In [43]:
# Decode the ids created directly by the tokenizer call.
# Note: model_inputs["input_ids"] includes [CLS] and [SEP], whereas the plain
# `ids` from convert_tokens_to_ids do not.
decoded_str_tokenizer = tokenizer.decode(model_inputs["input_ids"])
decoded_str_tokenizer

"[CLS] i've been waiting for huggingface course all my life. [SEP]"

In [44]:
# Decode the ids built manually with tokenize() + convert_tokens_to_ids();
# the decoded string contains no [CLS] / [SEP] markers.
decoded_str_convert = tokenizer.decode(ids)
decoded_str_convert

"i've been waiting for huggingface course all my life."

    The tokenizer added the special word [CLS] at the beginning and the special word [SEP] at the end. This is because the model was pretrained with those, so to get the same results for inference we need to add them as well. Note that some models don’t add special words, or add different ones; models may also add these special words only at the beginning, or only at the end. In any case, the tokenizer knows which ones are expected and will deal with this for you.

In [None]:
### Wrapping up: from tokenizer to model

In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [46]:
# Same checkpoint string as the one the tokenizer was loaded from, so the model
# and the tokenizer match.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [48]:
# Load the sequence-classification model for `checkpoint`.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)



In [49]:
# Two example sentences of different lengths to classify (rebinds `sequences`).
sequences = ["Reading books have been one of my favorite hobbies.", "I enjoy reading"]

In [54]:
# Tokenize the batch into PyTorch tensors, with padding and truncation enabled.
# NOTE: `tokens` now holds the tokenizer's full output (unpacked as keyword
# arguments for the model), not a list of token strings as in earlier cells.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

In [55]:
# Forward pass: unpack the tokenizer output as keyword arguments to the model.
# NOTE(review): this runs with autograd enabled, which is why the recorded logits
# carry a grad_fn. For inference only, consider wrapping the call in torch.no_grad().
output = model(**tokens)

In [56]:
# Show the raw model output; `logits` holds two scores for each of the two input sentences.
output

SequenceClassifierOutput(loss=None, logits=tensor([[-2.4806,  2.5807],
        [-3.9313,  4.1768]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)