In [1]:
!pip install transformers datasets torch -q

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
import torch

In [2]:
# Sentiment Analysis
sentiment = pipeline("sentiment-analysis")
print(sentiment("I love HuggingFace! NLP is amazing."))

# Named Entity Recognition (NER)
ner = pipeline("ner", grouped_entities=True)
print(ner("My name is Alice and I work at OpenAI."))

# Zero-Shot Classification
zero_shot = pipeline("zero-shot-classification")
text = "I recently watched a great sci-fi movie."
candidate_labels = ["entertainment", "politics", "technology"]
result = zero_shot(text, candidate_labels)
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998831748962402}]


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu
No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'entity_group': 'PER', 'score': np.float32(0.99777395), 'word': 'Alice', 'start': 11, 'end': 16}, {'entity_group': 'ORG', 'score': np.float32(0.99774265), 'word': 'OpenAI', 'start': 31, 'end': 37}]


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


{'sequence': 'I recently watched a great sci-fi movie.', 'labels': ['entertainment', 'technology', 'politics'], 'scores': [0.6876976490020752, 0.30934247374534607, 0.002959835110232234]}


In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

text = "HuggingFace is awesome!"

# Text to IDs
input_ids = tokenizer.encode(text)
print("Input IDs:", input_ids)

# IDs back to text
decoded_text = tokenizer.decode(input_ids)
print("Decoded Text:", decoded_text)

# Tokens
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Token IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Input IDs: [101, 17662, 12172, 2003, 12476, 999, 102]
Decoded Text: [CLS] huggingface is awesome! [SEP]
Tokens: ['hugging', '##face', 'is', 'awesome', '!']
Token IDs: [17662, 12172, 2003, 12476, 999]


In [4]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# Encode text
inputs = tokenizer("I love PyTorch and HuggingFace!", return_tensors="pt")

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Logits
logits = outputs.logits
print("Logits:", logits)

# Predicted class
predicted_class = torch.argmax(logits, dim=1)
print("Predicted Class ID:", predicted_class.item())

# Map ID to label
label_map = model.config.id2label
print("Predicted Label:", label_map[predicted_class.item()])

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Logits: tensor([[-4.1038,  4.4065]])
Predicted Class ID: 1
Predicted Label: POSITIVE


In [5]:
# Save locally
model.save_pretrained("my_hf_model")
tokenizer.save_pretrained("my_hf_tokenizer")

# Load again
loaded_model = AutoModelForSequenceClassification.from_pretrained("my_hf_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("my_hf_tokenizer")

# Test loaded model
inputs = loaded_tokenizer("HuggingFace makes NLP easy!", return_tensors="pt")
with torch.no_grad():
    logits = loaded_model(**inputs).logits
predicted_class = torch.argmax(logits, dim=1).item()
print("Predicted Label with loaded model:", label_map[predicted_class])

Predicted Label with loaded model: POSITIVE
