<a href="https://colab.research.google.com/github/skandanyal/Udacity_Foundations_of_Generative_AI/blob/main/Udacity_Hugging_Face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load a Tokenizer

In [None]:
from transformers import BertTokenizer

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. Calling login() with no argument
# prompts for the access token at run time, so the secret is never stored in
# the notebook source or its version history.
login()

In [None]:
# Initialize the tokenizer.
# No token argument is passed here: `user_auth_token` is not a recognized
# from_pretrained() keyword, and credentials should not be embedded in the
# notebook. Authentication, if required, comes from the earlier login() call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Check the number of tokens in the vocabulary (bare expression so the cell displays it)
tokenizer.vocab_size

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

30522

Convert a Sentence to a List of Tokens

In [None]:
# Sample sentence reused by later cells; split across lines for readability only
# (adjacent string literals are concatenated into the same single string).
my_sentence = (
    "This seems like an interesting thing to do, "
    "but I'm still not convinced that this is the thing "
    "that I'd want to do for the time to come."
)

# Break the sentence into tokens, then look up each token's vocabulary id
tokens = tokenizer.tokenize(my_sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the tokens on one line and their ids on the next
print(tokens, token_ids, sep="\n")

['this', 'seems', 'like', 'an', 'interesting', 'thing', 'to', 'do', ',', 'but', 'i', "'", 'm', 'still', 'not', 'convinced', 'that', 'this', 'is', 'the', 'thing', 'that', 'i', "'", 'd', 'want', 'to', 'do', 'for', 'the', 'time', 'to', 'come', '.']
[2023, 3849, 2066, 2019, 5875, 2518, 2000, 2079, 1010, 2021, 1045, 1005, 1049, 2145, 2025, 6427, 2008, 2023, 2003, 1996, 2518, 2008, 1045, 1005, 1040, 2215, 2000, 2079, 2005, 1996, 2051, 2000, 2272, 1012]


# Using Hugging Face models

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

In [None]:
# Load a pre-trained sentiment analysis model and its matching tokenizer.
# Both are fetched from the same checkpoint so the vocabulary agrees with the weights.
model_name = 'textattack/bert-base-uncased-imdb'
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # two output classes; the cells below treat 1 as positive, 0 as negative
)
tokenizer = BertTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Performing Sentiment Analysis on my_sentence

In [None]:
import torch

In [None]:
# Tokenize the input sentence into PyTorch tensors
inputs = tokenizer(my_sentence, return_tensors='pt')

# Make prediction; no_grad() skips gradient tracking because this is inference only
with torch.no_grad():
    outputs = model(**inputs).logits
    # Softmax over the class dimension turns the logits into probabilities
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predicted_class = torch.argmax(probabilities)

# Display sentiment results (class 1 is reported as positive, anything else as negative)
if predicted_class == 1:
    print(f"Sentiment: Positive ({probabilities[0][1] * 100:.2f}%)")
else:
    print(f"Sentiment: Negative ({probabilities[0][0] * 100:.2f}%)")

Sentiment: Nagative (64.09%)


Trying it out with some gibberish

In [None]:
# Nonsense input used to see how the classifier behaves on text with no meaning
gibberish_sentence = 'asdjklfhqweoiru zxmncvpoiuqwe lkjhasd'

In [None]:
# Tokenize the gibberish sentence into PyTorch tensors
inputs = tokenizer(gibberish_sentence, return_tensors='pt')

# Make prediction; no_grad() skips gradient tracking because this is inference only
with torch.no_grad():
    outputs = model(**inputs).logits
    # Softmax over the class dimension turns the logits into probabilities
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predicted_class = torch.argmax(probabilities)

# Display sentiment results (class 1 is reported as positive, anything else as negative)
if predicted_class == 1:
    print(f"Sentiment: Positive ({probabilities[0][1] * 100:.2f}%)")
else:
    print(f"Sentiment: Negative ({probabilities[0][0] * 100:.2f}%)")

Sentiment: Nagative (81.20%)


# Using a pre-existing dataset - IMDb dataset

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from datasets import load_dataset
from IPython.display import HTML, display

# Load the IMDb dataset, which contains movie reviews and sentiment labels (positive or negative)
dataset = load_dataset('imdb')

# Pick one review from the training split by index
review_number = 100
sample_review = dataset['train'][review_number]

# Render only the first 450 characters of the review text, followed by an ellipsis
preview = sample_review['text'][:450] + '...'
display(HTML(preview))

# Report the stored label: 1 is printed as Positive, any other value as Negative
sentiment = "Positive" if sample_review['label'] == 1 else "Negative"
print(f"Sentiment: {sentiment}")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Sentiment: Negative


Another review number

In [None]:
import numpy as np

In [None]:
# Draw the review index from a seeded generator so Restart & Run All selects the
# same review every time. The range is unchanged: 1 (inclusive) to 10000 (exclusive).
REVIEW_SEED = 42
review_rng = np.random.default_rng(REVIEW_SEED)
random_number = int(review_rng.integers(1, 10000))  # plain int for dataset indexing

In [None]:
# Look up the training-set review at the randomly chosen index
review_number = random_number
sample_review = dataset['train'][review_number]

# Render only the first 450 characters of the review text, followed by an ellipsis
preview = sample_review['text'][:450] + '...'
display(HTML(preview))

# Report the stored label: 1 is printed as Positive, any other value as Negative
sentiment = "Positive" if sample_review['label'] == 1 else "Negative"
print(f"Sentiment: {sentiment}")

Sentiment: Negative


# Hugging Face Trainers: Models and Dataset

In [None]:
from transformers import (DistilBertForSequenceClassification,
                          DistilBertTokenizer,
                          TrainingArguments,
                          Trainer)

from datasets import load_dataset

In [None]:
# Initialize the model and the tokenizer from the same base checkpoint
checkpoint = 'distilbert-base-uncased'

# Sequence classifier configured with two output labels
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Tokenizer matching the checkpoint's vocabulary
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Args:
        examples: A mapping with a 'text' entry holding the raw review text(s).

    Returns:
        The tokenizer's output, padded to the maximum length and truncated
        where the text is too long.
    """
    texts = examples['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Load the IMDb dataset and tokenize every split; batched=True hands
# tokenize_function a batch of examples per call instead of one at a time.
dataset = load_dataset('imdb')
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Training hyperparameters. report_to="none" turns off external experiment-tracking
# integrations, so building or running the Trainer does not stop at an interactive
# Weights & Biases API-key prompt.
training_args = TrainingArguments(
    per_device_train_batch_size=64,
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=3,
    report_to="none",
)

# Fine-tune on the tokenized IMDb train split and evaluate on the test split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
# trainer.train()

# Training is left disabled: there was not enough RAM available to run it.

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mskandanyalagach[0m ([33mskandanyalagach-the-national-institute-of-engineering[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
