In [2]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

### Tokenizer

In [3]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs['input_ids'])
print(inputs['attention_mask'])

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])


In [11]:
tokenizer.decode([  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102])

"[CLS] i've been waiting for a huggingface course my whole life. [SEP]"

In [13]:
tokenizer.encode("i've been waiting for a HuggingFace course my whole life.")

[101,
 1045,
 1005,
 2310,
 2042,
 3403,
 2005,
 1037,
 17662,
 12172,
 2607,
 2026,
 2878,
 2166,
 1012,
 102]

### Model

In [18]:
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

In [26]:
output = model(**inputs)

output.last_hidden_state.shape

torch.Size([2, 16, 768])

In [28]:
inputs['input_ids'].shape

torch.Size([2, 16])

In [None]:
# Here is a non-exhaustive list:

# *Model (retrieve the hidden states)
# *ForCausalLM
# *ForMaskedLM
# *ForMultipleChoice
# *ForQuestionAnswering
# *ForSequenceClassification
# *ForTokenClassification
# and others 🤗

In [32]:
from transformers import AutoModelForSequenceClassification # ForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print(model.config.id2label)
outputs = model(**inputs)
# raw_inputs = [
#     "I've been waiting for a HuggingFace course my whole life.",
#     "I hate this so much!",
# ]
outputs.logits

{0: 'NEGATIVE', 1: 'POSITIVE'}


tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)

In [35]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

for i, logits in enumerate(outputs.logits):
    probs = torch.nn.functional.softmax(logits, dim=-1)
    print(raw_inputs[i])
    print(f"Negative: {probs[0]:.4f}, Positive: {probs[1]:.4f}")
    print()

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)
I've been waiting for a HuggingFace course my whole life.
Negative: 0.0402, Positive: 0.9598

I hate this so much!
Negative: 0.9995, Positive: 0.0005

