# Behind the pipeline

In [13]:
from transformers import pipeline
raw_inputs= ["I've finally made it!", "Why is this not working!"]
classifier = pipeline("sentiment-analysis")
classifier(raw_inputs)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998001456260681},
 {'label': 'NEGATIVE', 'score': 0.9997509121894836}]

## Preprocessing with a tokenizer

In [14]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [15]:
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[ 101, 1045, 1005, 2310, 2633, 2081, 2009,  999,  102],
        [ 101, 2339, 2003, 2023, 2025, 2551,  999,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])}


## Going through the model

In [16]:
from transformers import AutoModel
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### A high-dimensional vector?

In [17]:
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 9, 768])


### Model heads: Making sense out of numbers

In [18]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

In [19]:
print(outputs.logits.shape)

torch.Size([2, 2])


## Postprocessing the output

In [20]:
print(outputs.logits)

tensor([[-4.1149,  4.4030],
        [ 4.6786, -3.6188]], grad_fn=<AddmmBackward0>)


In [21]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
print(predictions)

tensor([[1.9983e-04, 9.9980e-01],
        [9.9975e-01, 2.4909e-04]], grad_fn=<SoftmaxBackward0>)


In [22]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

# Models

## Creating a transformer

In [23]:
from transformers import BertConfig, BertModel
config = BertConfig()
model = BertModel(config)

In [24]:
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

### Different loading methods

In [25]:
# random initialization
from transformers import BertConfig, BertModel
config = BertConfig()
model = BertModel(config)

In [26]:
# initialize from pretrained
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-cased")

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Saving methods

In [27]:
model.save_pretrained("pretrained")

In [28]:
!ls pretrained/

config.json  pytorch_model.bin


## Using a Transformer model for inference

In [29]:
sequences = ["Hello!", "Cool!", "Nice!"]

In [32]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [34]:
tokenizer(sequences , padding=True, truncation=True)

{'input_ids': [[101, 8667, 106, 102], [101, 13297, 106, 102], [101, 8835, 106, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]}

In [35]:
encoded_sequences = [
    [101, 7592, 999, 102],
    [101, 4658, 1012, 102],
    [101, 3835, 999, 102],
]

In [36]:
import torch
model_inputs = torch.tensor(encoded_sequences)

# Tokenizers

## Loading and saving

In [2]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer

Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [11]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
tokenizer.save_pretrained("tokenizer_pretrained")

('tokenizer_pretrained/tokenizer_config.json',
 'tokenizer_pretrained/special_tokens_map.json',
 'tokenizer_pretrained/vocab.txt',
 'tokenizer_pretrained/added_tokens.json',
 'tokenizer_pretrained/tokenizer.json')

## Encoding

### Tokenization

In [13]:
tokens = tokenizer.tokenize("Let's try to tokenize!")
tokens

['Let', "'", 's', 'try', 'to', 'token', '##ize', '!']

### From tokens to input IDs

In [10]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

[2421, 112, 188, 2222, 1106, 22559, 3708, 106]

In [14]:
tokens = tokenizer.tokenize("I've been waiting for a huggingface course my whole life.")
tokens

['I',
 "'",
 've',
 'been',
 'waiting',
 'for',
 'a',
 'hugging',
 '##face',
 'course',
 'my',
 'whole',
 'life',
 '.']

In [15]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

[146,
 112,
 1396,
 1151,
 2613,
 1111,
 170,
 19558,
 10931,
 1736,
 1139,
 2006,
 1297,
 119]

In [9]:
tokenizer.decode(input_ids)

"Let's try to tokenize!"

## Decoding

In [17]:
tokenizer.decode([7993, 170, 13809, 23763, 2443, 1110, 3014])

'Using a Transformer network is simple'

# Handling multiple sequences

## Models expect a batch of inputs

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/255M [00:00<?, ?B/s]

In [4]:

sequence = "I've been waiting for a Huggingface course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([ids]) # needed to add batching dimension

output = model(input_ids)
output.logits

tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)

In [7]:
batched_ids = torch.tensor([ids, ids])
output = model(batched_ids)
output.logits

tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)

## Padding the inputs

In [8]:
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids =[
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]


In [12]:
model(torch.tensor(sequence1_ids)).logits

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)

In [13]:
model(torch.tensor(sequence2_ids)).logits

tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

In [15]:
model(torch.tensor(batched_ids)).logits # wrong result due to lack of attention mask

tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)

## Attention Masks

In [16]:
attention_mask = [
    [1,1,1],
    [1,1,0]
]
model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask)).logits

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

In [19]:
sentence1 = tokenizer.tokenize("I've been waiting for a HuggingFace course my whole life.")
sentence1_ids = tokenizer.convert_tokens_to_ids(sentence1)
print(sentence1_ids)

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [20]:
sentence2 = tokenizer.tokenize("I hate this so much")
sentence2_ids = tokenizer.convert_tokens_to_ids(sentence2)
print(sentence2_ids)

[1045, 5223, 2023, 2061, 2172]


In [22]:
model(torch.tensor([sentence1_ids])).logits

tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)

In [23]:
model(torch.tensor([sentence2_ids])).logits

tensor([[ 3.1744, -2.6848]], grad_fn=<AddmmBackward0>)

In [24]:
batched_ids = [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
    [1045, 5223, 2023, 2061, 2172, tokenizer.pad_token_id,tokenizer.pad_token_id,tokenizer.pad_token_id, tokenizer.pad_token_id,tokenizer.pad_token_id,tokenizer.pad_token_id, tokenizer.pad_token_id,tokenizer.pad_token_id,tokenizer.pad_token_id]
]

In [31]:
attention_mask = [
    [1,1,1,1,1,1,1,1,1,1,1,1,1,1],
    [1,1,1,1,1,0,0,0,0,0,0,0,0,0]
]

In [32]:
model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask)).logits

tensor([[-2.7276,  2.8789],
        [ 3.1744, -2.6848]], grad_fn=<AddmmBackward0>)

## Longer sequences

In [33]:
tokenizer.max_len_single_sentence # maximum number of tokens a single sentence can have (i.e. without special tokens)

510

In [34]:
tokenizer.model_max_length # maximum number of tokens a model can handle (i.e. including special tokens)

512

# Putting it all together

In [23]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sequence = "I've been waiting for a HuggingFace course my whole life."
model_inputs = tokenizer(sequence)
model_inputs

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
model_inputs = tokenizer(sequences)
model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [25]:
# maximum sequence length in the list
model_inputs = tokenizer(sequences, padding="longest")
model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [26]:
# model max length
model_inputs = tokenizer(sequences, padding="max_length")
model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [27]:
# specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0]]}

In [28]:
# truncate to model max length
model_inputs = tokenizer(sequences, truncation=True)
model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [29]:
# truncate to specified max length
model_inputs = tokenizer(sequences, truncation=True, max_length=8)
model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [30]:
# pytorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
model_inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [38]:
# tensorflow tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")
model_inputs

{'input_ids': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=
array([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
        12172,  2607,  2026,  2878,  2166,  1012,   102],
       [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}

In [32]:
# numpy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
model_inputs

{'input_ids': array([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
        12172,  2607,  2026,  2878,  2166,  1012,   102],
       [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

### Special Tokens

In [33]:
model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]


In [34]:
print(tokenizer.decode(model_inputs["input_ids"]))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]


In [35]:
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [36]:
print(tokenizer.decode(ids))

i've been waiting for a huggingface course my whole life.


### Wrapping up: From tokenizer to model

In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint="distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

Downloading pytorch_model.bin:   0%|          | 0.00/255M [00:00<?, ?B/s]

In [41]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)