<a href="https://colab.research.google.com/github/rsanchezgarc/AI-ML-analytics-IE/blob/main/notebooks/sequence_models/manyToOneLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [3]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [4]:
# --- Checkpoint -------------------------------------------------------------
# Name of the pre-trained checkpoint; reused below when the model is loaded.
model_checkpoint = "bert-base-uncased"

# --- Data -------------------------------------------------------------------
# IMDB movie reviews: a binary sentiment classification corpus.
dataset = load_dataset("imdb")
print(f"Dataset loaded with {len(dataset['train'])} training examples and {len(dataset['test'])} test examples")

# --- Tokenizer --------------------------------------------------------------
# The tokenizer must come from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 3. Define preprocessing function
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Uses the notebook-level ``tokenizer``. Inputs longer than 512 tokens are
    truncated; no padding is applied here.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

# 4. Tokenize the dataset.
# batched=True hands preprocess_function many examples per call instead of one.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset loaded with 25000 training examples and 25000 test examples


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:

# 5. Define evaluation metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 use sklearn's ``average='binary'`` setting.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The 4th element returned by sklearn (support) is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf_scores[:3]

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# 6. Load pre-trained model for sequence classification
# The classification head is not part of the checkpoint and is randomly
# initialised, so seed torch *before* building the model to make runs repeatable.
seed = 42
torch.manual_seed(seed)

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

# 7. Define training arguments
batch_size = 8
output_dir = "./bert-finetuned-imdb"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    # save and eval strategies must match for load_best_model_at_end to work
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    seed=seed,
    push_to_hub=False,  # Set to True if you want to upload to HF Hub
    report_to="tensorboard",
)

# 8. Create data collator for dynamic padding
# (examples were tokenized without padding, so each batch is padded on the fly)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 9. Initialize Trainer
# NOTE(review): the test split doubles as the validation set, and
# load_best_model_at_end selects the checkpoint using metrics computed on it,
# so the final "test" numbers are optimistic. Consider carving a validation
# split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` replaces it
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 10. Train the model
print("Starting training...")
trainer.train()

# 11. Evaluate the model
print("Evaluating the model...")
evaluation_results = trainer.evaluate()
print(f"Evaluation results: {evaluation_results}")

# 12. Save the model locally
trainer.save_model(output_dir)
print(f"Model saved to {output_dir}")



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2633,0.276565,0.92,0.916227,0.961579,0.87496
2,0.1356,0.262449,0.94232,0.942486,0.939787,0.9452


Evaluating the model...


Evaluation results: {'eval_loss': 0.26244911551475525, 'eval_accuracy': 0.94232, 'eval_f1': 0.9424856413529037, 'eval_precision': 0.9397868278714604, 'eval_recall': 0.9452, 'eval_runtime': 637.2266, 'eval_samples_per_second': 39.233, 'eval_steps_per_second': 4.904, 'epoch': 2.0}
Model saved to ./bert-finetuned-imdb

Testing the fine-tuned model:


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [8]:
# 13. Example of how to use the fine-tuned model for inference
def predict_sentiment(text):
    """Classify the sentiment of ``text`` with the fine-tuned model.

    Relies on the notebook-level ``tokenizer`` and ``model``. The model is put
    in eval mode for the forward pass and its previous train/eval mode is
    restored afterwards, so calling this never changes the model's state.

    Args:
        text: The review text to classify.

    Returns:
        dict with:
          * ``"sentiment"``: ``"Positive"`` when the predicted class index is 1,
            otherwise ``"Negative"``.
          * ``"confidence"``: softmax probability of the predicted class.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inputs must live on the same device as the model weights.
    inputs = inputs.to(model.device)

    # torch.no_grad() only disables gradient tracking; train-time layers such as
    # dropout stay active unless the model is explicitly in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_class].item()

    sentiment = "Positive" if predicted_class == 1 else "Negative"
    return {"sentiment": sentiment, "confidence": confidence}

# Smoke test: one positive, one negative and one lukewarm review.
test_texts = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "What a waste of time. The plot made no sense and the acting was terrible.",
    "It was okay, not great but not terrible either.",
]

print("\nTesting the fine-tuned model:")
# map() is lazy, so each review is still classified right before it is printed.
for text, result in zip(test_texts, map(predict_sentiment, test_texts)):
    print(f"Text: {text}\nSentiment: {result['sentiment']} (Confidence: {result['confidence']:.4f})\n")




Testing the fine-tuned model:
Text: This movie was absolutely fantastic! I loved every minute of it.
Sentiment: Positive (Confidence: 0.9994)

Text: What a waste of time. The plot made no sense and the acting was terrible.
Sentiment: Negative (Confidence: 0.9996)

Text: It was okay, not great but not terrible either.
Sentiment: Negative (Confidence: 0.9688)



In [None]:
# 14. Reloading the fine-tuned model from disk
# Both the tokenizer and the model weights are read from the same directory.
checkpoint_dir = "./bert-finetuned-imdb"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
