In [None]:
!pip --q install transformers[sentencepiece]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@ IMPORTING THE REQUIRED LIBRARIES
from transformers import pipeline

# Name the checkpoint and revision explicitly instead of relying on the task
# default: the default can change between transformers releases, and the
# library itself warns that unpinned pipelines are not recommended. These are
# the exact values the default resolved to when this notebook was recorded.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Pass both example sentences in a single call; the cell's value is a list
# with one {'label': ..., 'score': ...} dict per sentence, in input order.
classifier([
    "I am loving this Hugging Face course till now",
    "I hate this soo much",
])

[{'label': 'POSITIVE', 'score': 0.9998708963394165},
 {'label': 'NEGATIVE', 'score': 0.9992349147796631}]

![Images](https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/full_nlp_pipeline-dark.svg)

**Preprocessing with a tokenizer**

Our first step is to convert the text inputs into numbers that the model can make sense of. For this purpose we use a `tokenizer`, which is responsible for:

- Splitting the input into words, subwords, or symbols (like punctuation), which are called tokens
- Mapping each token to an integer
- Adding additional inputs that may be useful to the model

In [None]:
#@ CREATING A CHECKPOINT
from transformers import AutoTokenizer

# Same checkpoint the default sentiment-analysis pipeline resolved to, so the
# manual steps that follow use the same model as the pipeline above.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Now, our next step is to convert the list of input IDs to tensors.

In [None]:
# Two sentences of different lengths, so the effect of padding is visible.
raw_inputs = [
    "I am loving this Hugging Face course till now",
    "I hate this soo much",
]

# padding=True pads the shorter sequence up to the longest one in the batch
# (padded positions get attention_mask 0); return_tensors="pt" returns
# PyTorch tensors. truncation=True presumably only matters for inputs longer
# than the model's limit - it has no visible effect on these short sentences.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  2572,  8295,  2023, 17662,  2227,  2607,  6229,  2085,
           102],
        [  101,  1045,  5223,  2023, 17111,  2172,   102,     0,     0,     0,
             0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


In [None]:
#@ DOWNLOADING THE PRETRAINED MODEL
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# AutoModel loads the model without a task-specific head: its output exposes
# `last_hidden_state` rather than class scores.
model = AutoModel.from_pretrained(checkpoint)

**A high-dimensional vector?**

The vector output by the Transformer module is usually large. It generally has three dimensions:
- *Batch size:* The number of sequences processed at a time
- *Sequence length:* The length of the numerical representation of the sequence
- *Hidden size:* The vector dimension of each model input

In [None]:
# `inputs` holds `input_ids` and `attention_mask`; ** passes them to the model
# as keyword arguments.
outputs = model(**inputs)
# Shape is (batch size, sequence length, hidden size).
print(outputs.last_hidden_state.shape)

torch.Size([2, 11, 768])


![Hugging Face](https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/transformer_and_head-dark.svg)

- The model is represented by its embedding layer and the subsequent layers.

- The embedding layer converts each input ID in the tokenized input into a vector that represents the associated token.

- The subsequent layers manipulate those vectors using the attention mechanism to produce the final representation of the sentences.

In [None]:
#@ MODEL WITH SEQUENCE CLASSIFICATION HEAD
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# NOTE: this rebinds `model` (and `outputs` below), replacing the headless
# model loaded earlier in the notebook. From here on `outputs` carries
# `logits` instead of `last_hidden_state`.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

In [None]:
# Display the whole output object; only `logits` is populated here
# (loss, hidden_states and attentions are None).
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-4.2930,  4.6621],
        [ 3.9356, -3.2392]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# One row per input sentence, one column per label.
print(outputs.logits.shape)

torch.Size([2, 2])


In [None]:
#@ POSTPROCESSING THE OUTPUT
# Logits are raw, unnormalized scores, not probabilities; they still need a
# softmax before they can be read as confidences.
print(outputs.logits)

tensor([[-4.2930,  4.6621],
        [ 3.9356, -3.2392]], grad_fn=<AddmmBackward0>)


In [None]:
import torch

In [None]:
# Turn each row of logits into probabilities that sum to 1; dim=-1 normalizes
# across the last (label) axis, independently for each sentence.
predictions = torch.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[1.2906e-04, 9.9987e-01],
        [9.9923e-01, 7.6507e-04]], grad_fn=<SoftmaxBackward0>)


In [None]:
#@ GETTING THE LABELS CORRESPONDING TO EACH POSITION
# Maps each logit/probability column index to its human-readable label name.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}