In [1]:
!pip install transformers[sentencepiece] --q

[K     |████████████████████████████████| 2.8 MB 4.3 MB/s 
[K     |████████████████████████████████| 895 kB 40.5 MB/s 
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 38.9 MB/s 
[K     |████████████████████████████████| 636 kB 46.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 40.5 MB/s 
[?25h

# Getting Started

## Tokenizer

In [2]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint handed to `from_pretrained` when loading the tokenizer.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [3]:
# Fetch the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [4]:
# Example sentences to tokenize as one batch.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I am very excited about training the model !!",
    "I hate this weather which makes me feel irritated  !",
]

# Pad every sentence to the longest one in the batch, truncate over-long
# inputs, and return PyTorch tensors.
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  2572,  2200,  7568,  2055,  2731,  1996,  2944,   999,
           999,   102,     0,     0,     0,     0],
        [  101,  1045,  5223,  2023,  4633,  2029,  3084,  2033,  2514, 15560,
           999,   102,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


## Model

In [5]:
from transformers import AutoModel

# Same checkpoint name as the one used to load the tokenizer.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [6]:
# Load the checkpoint as a bare model: the checkpoint's classifier weights are
# not used here, which is what the warning printed by this cell reports.
model = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Feed the tokenized batch to the bare model and report the dimensions of the
# last hidden state.
outputs = model(**inputs)
print(outputs.last_hidden_state.size())

torch.Size([3, 16, 768])


## Sequence Classification

In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Same checkpoint name as in the Tokenizer and Model sections.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [9]:
# Load the checkpoint through the sequence-classification class; this rebinds
# `model`, replacing the bare model loaded earlier.
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [10]:
# Example sentences to classify (same batch as in the Tokenizer section).
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I am very excited about training the model !!",
    "I hate this weather which makes me feel irritated  !",
]

# `tokenizer` is the one created in the Tokenizer section. Pad to the longest
# sentence in the batch, truncate over-long inputs, and return PyTorch tensors.
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  2572,  2200,  7568,  2055,  2731,  1996,  2944,   999,
           999,   102,     0,     0,     0,     0],
        [  101,  1045,  5223,  2023,  4633,  2029,  3084,  2033,  2514, 15560,
           999,   102,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


In [11]:
# Forward pass through the classification model; the printed result carries
# the raw scores in its `logits` field.
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.9628,  4.2892],
        [ 4.0951, -3.3116]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


In [12]:
# Dimensions of the logits tensor produced by the classification model.
print(outputs.logits.size())

torch.Size([3, 2])


In [13]:
import torch

# Turn the logits into probabilities with a softmax over the last dimension.
# The result gets its own name so that `outputs` stays untouched and this cell
# can be re-run: overwriting `outputs` with the softmax tensor would drop the
# `.logits` attribute that this very line reads.
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(probabilities)

tensor([[4.0195e-02, 9.5980e-01],
        [2.6066e-04, 9.9974e-01],
        [9.9939e-01, 6.0683e-04]], grad_fn=<SoftmaxBackward>)


In [14]:
# Display the model's mapping from class index to label name.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

## Replicate Pipeline API

In [15]:
# Imports
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Define the model checkpoint
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Download and cache the tokenizer and classification model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Define the inputs and tokenize them
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I am very excited about training the model !!",
    "I hate this weather which makes me feel irritated  !"
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

# Get the outputs from the model. This is inference only, so the forward pass
# runs under no_grad to avoid recording an autograd graph.
with torch.no_grad():
    model_outputs = model(**inputs)
print(model_outputs)

# Find the class/label probabilities (softmax over the last dimension).
# `outputs` is bound exactly once, to the probability tensor; the raw model
# output stays available as `model_outputs`.
outputs = torch.nn.functional.softmax(model_outputs.logits, dim=-1)
print(outputs)

# Find the label to class mapping for verification
print(model.config.id2label)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  2572,  2200,  7568,  2055,  2731,  1996,  2944,   999,
           999,   102,     0,     0,     0,     0],
        [  101,  1045,  5223,  2023,  4633,  2029,  3084,  2033,  2514, 15560,
           999,   102,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.9628,  4.2892],
        [ 4.0951, -3.3116]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
tensor([[4.0195e-02, 9.5980e-01],
        [2.6066e-04, 9.9974e-01],
        [9.9939e-01, 6.0683e-04]], grad_fn=<SoftmaxBackward>)
{0: 'NEGATIVE', 1: 'POSITIVE'}


In [16]:
# Print the loaded model's mapping from class index to label name.
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}
