In [4]:
from transformers import AutoTokenizer

# The tokenizer splits raw text into tokens (words, subwords, punctuation, ...)
# and maps each token to an integer ID. It is loaded from the same checkpoint
# name that the model cells use, so the IDs it produces match that model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# Two example sentences; they are tokenized together as a single batch.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Transformer models accept tensors, not plain Python lists of IDs. A tensor is
# an n-dimensional array, much like a NumPy array: a scalar (0-D), a vector
# (1-D), a matrix (2-D), or something with more dimensions.
# `return_tensors` selects the tensor type: PyTorch ("pt"), TensorFlow or NumPy.
inputs = tokenizer(
    raw_inputs,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# `input_ids` holds the integer ID of every token in each sentence;
# `attention_mask` is 1 for real tokens and 0 for the padding that was added to
# the shorter sentence.
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [6]:
from transformers import AutoModel

# Same checkpoint name as the tokenizer cell, so the token IDs and the model agree.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the pretrained weights through AutoModel (downloaded on first use).
model = AutoModel.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [7]:
import torch

# Inference only: run the forward pass under no_grad so PyTorch does not record
# an autograd graph (less memory; the output tensors carry no grad_fn).
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
# The output generally has three dimensions:
# batch size: the number of sequences processed at a time (2 here)
# sequence length: the number of tokens in each (padded) sequence (16 here)
# hidden size: the length of the vector the model produces for each token (768 here)

torch.Size([2, 16, 768])


In [8]:
from transformers import AutoModelForSequenceClassification

import torch

# Same checkpoint as before, loaded through the sequence-classification class
# this time.
# NOTE: this rebinds `model` and `outputs`, replacing the AutoModel objects
# created by the earlier cells; re-run those cells if you need them again.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [9]:
# Shape of the classification logits: one row per input sentence (2 here);
# the second dimension is presumably one score per class label (2 here).
print(outputs.logits.size())

torch.Size([2, 2])
