<a href="https://colab.research.google.com/github/raduButucelea23/HF_Transformers/blob/main/course/en/chapter3/section2_pt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

# Preparing the environment

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

# Loading the dataset

In [82]:
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "sst2")
# Echo the DatasetDict to show its splits, columns, and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
# Keep a handle on the training split so it can be inspected on its own.
raw_train_dataset = raw_datasets["train"]

In [67]:
# Show the name and type of every column in the training split.
raw_train_dataset.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

# Sampling 2 models

Choosing the models and their tokenizers

In [70]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Seed used to make the randomly initialized classification head reproducible.
SEED = 42

# "untrn" = base checkpoint without a fine-tuned classification head;
# "trn"   = checkpoint already fine-tuned for SST-2 sentiment classification.
untrn_checkpoint = "bert-base-uncased"
trn_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Each model gets the tokenizer that belongs to its own checkpoint.
untrn_tokenizer = AutoTokenizer.from_pretrained(untrn_checkpoint)
trn_tokenizer = AutoTokenizer.from_pretrained(trn_checkpoint)

# The bert-base-uncased checkpoint ships without classifier weights, so they are
# newly (randomly) initialized on load. Seed torch first so that a fresh
# "Restart & Run All" reproduces the same untrained predictions.
torch.manual_seed(SEED)
untrn_model = AutoModelForSequenceClassification.from_pretrained(untrn_checkpoint)
trn_model = AutoModelForSequenceClassification.from_pretrained(trn_checkpoint)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Comparing the outputs on 3 samples

In [71]:
def predict_sentiment(text, model, tokenizer, labels=None):
    """
    Predict sentiment for a given text using the specified model and tokenizer.

    Args:
        text (str): Input text to analyze
        model: Hugging Face sequence-classification model. It is called with the
            tokenizer output as keyword arguments and must return an object
            exposing a ``logits`` tensor.
        tokenizer: Corresponding tokenizer
        labels (list, optional): Labels for prediction outputs, listed in the
            same order as the model's logits. Defaults to ['negative', 'positive'].

    Returns:
        dict: Dictionary mapping each label to its probability as a percentage
    """
    if labels is None:
        labels = ['negative', 'positive']

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Put the input tensors on the same device as the model; otherwise the
    # forward pass fails with a device mismatch when the model lives on a GPU.
    # Models that do not expose a `device` attribute are called as-is.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Get model prediction (inference only, so no gradients are tracked)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Apply softmax over the class dimension to get probabilities
    probs = F.softmax(logits, dim=-1)

    # Convert the first (only) row to plain Python floats expressed as percentages
    percentages = (probs[0] * 100).tolist()

    # Return dictionary with labels and percentages
    return {label: percentages[i] for i, label in enumerate(labels)}

def compare_predictions(text):
    """
    Run the same text through the untrained and the fine-tuned model.

    Args:
        text (str): Text to analyze

    Returns:
        dict: Maps "untrained_bert" and "finetuned_distilbert" to the
        label -> percentage dictionary produced by predict_sentiment
    """
    # (result key, model, matching tokenizer), evaluated in this order
    candidates = (
        ("untrained_bert", untrn_model, untrn_tokenizer),
        ("finetuned_distilbert", trn_model, trn_tokenizer),
    )
    return {
        key: predict_sentiment(text, candidate_model, candidate_tokenizer)
        for key, candidate_model, candidate_tokenizer in candidates
    }

# Test examples
positive_example = "I love this movie! It's amazing."
negative_example = "This was a terrible waste of time."
ambiguous_example = "It was different than I expected."

# Explicit mapping from example kind to its text, so the report below does not
# have to look variables up by name through locals().
example_texts = {
    "positive": positive_example,
    "negative": negative_example,
    "ambiguous": ambiguous_example,
}

# Predictions of both models for every example, keyed by example kind
results = {kind: compare_predictions(text) for kind, text in example_texts.items()}

# Print results in a readable format
for sentiment_type, prediction in results.items():
    print(f"\n--- {sentiment_type.upper()} EXAMPLE ---")
    print(f"Text: {example_texts[sentiment_type]}")
    for model_name, scores in prediction.items():
        print(f"\n  {model_name}:")
        for label, score in scores.items():
            print(f"    {label}: {score:.2f}%")


--- POSITIVE EXAMPLE ---
Text: I love this movie! It's amazing.

  untrained_bert:
    negative: 42.47%
    positive: 57.53%

  finetuned_distilbert:
    negative: 0.01%
    positive: 99.99%

--- NEGATIVE EXAMPLE ---
Text: This was a terrible waste of time.

  untrained_bert:
    negative: 39.76%
    positive: 60.24%

  finetuned_distilbert:
    negative: 99.98%
    positive: 0.02%

--- AMBIGUOUS EXAMPLE ---
Text: It was different than I expected.

  untrained_bert:
    negative: 41.02%
    positive: 58.98%

  finetuned_distilbert:
    negative: 0.14%
    positive: 99.86%


# Pre-processing the data

Instantiating the tokenizer

In [73]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to pre-process the dataset below.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Checking that the tokenizer works

In [None]:
# Named `encoded_input` rather than `input` so the built-in input() is not shadowed.
encoded_input = tokenizer("This is a sample sentence.")
# Only the last expression of a cell is echoed, so show the encoding explicitly.
display(encoded_input)

# Map the ids back to tokens to verify the round trip.
tokenizer.convert_ids_to_tokens(encoded_input["input_ids"])

Tokenizing the dataset

In [81]:
def tokenize_function(example):
    """
    Tokenize the 'sentence' field of `example` with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.

    Args:
        example: Mapping that provides a 'sentence' entry

    Returns:
        The tokenizer's output for that entry
    """
    sentences = example['sentence']
    return tokenizer(sentences, truncation=True)

# Apply tokenize_function to every split; batched=True hands the function
# batches of rows rather than one row at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Echo the result to confirm the tokenizer columns were added to each split.
tokenized_datasets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [83]:
tokenized_datasets["train"][15312]

{'sentence': 'but they fascinate in their recklessness . ',
 'label': 1,
 'idx': 15312,
 'input_ids': [101,
  2021,
  2027,
  6904,
  11020,
  14776,
  1999,
  2037,
  18555,
  2791,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Choosing 8 samples to check their token lengths, and dropping the columns (`idx`, `sentence`) that are not needed for batching

In [90]:
# @title
# Slice out the first eight training rows.
samples = tokenized_datasets["train"][:8]
# Drop the index and raw-text columns; everything else is kept for collation.
samples = {
    column: values
    for column, values in samples.items()
    if column != "idx" and column != "sentence"
}
# Token count of each of the eight sequences before any padding.
list(map(len, samples["input_ids"]))

[10, 11, 15, 10, 22, 13, 29, 6]

Instantiating the collator

In [89]:
from transformers import DataCollatorWithPadding

# Collator that pads the samples of a batch, built around the same tokenizer
# that produced the encodings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Creating a pre-processed batch with automatically added padding

In [91]:
# @title
# Collate the eight samples into a single padded batch.
batch = data_collator(samples)
# Map every field of the batch to the shape of its tensor.
{field: tensor.shape for field, tensor in batch.items()}

{'input_ids': torch.Size([8, 29]),
 'token_type_ids': torch.Size([8, 29]),
 'attention_mask': torch.Size([8, 29]),
 'labels': torch.Size([8])}