<a href="https://colab.research.google.com/github/rubuntu/Taller_Introduccion_a_Ciencia_de_Datos_IA_e_Ingenieria_de_Datos/blob/main/sesion_15_fine_tuning_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧑‍💻 Sesión 15 – Fine-tuning de BERT para Análisis de Sentimiento

In [None]:

# ==========================================
# SESIÓN 15: Fine-tuning de BERT en IMDB
# ==========================================

#!pip install transformers datasets torch scikit-learn -q

import inspect

import torch
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)


In [None]:

# Load the IMDB dataset (the original note describes it as 25k train / 25k test).
dataset = load_dataset("imdb")

# Shuffle each split with a fixed seed, then keep only the leading rows so the
# notebook trains quickly during a class session.
train_shuffled = dataset["train"].shuffle(seed=42)
test_shuffled = dataset["test"].shuffle(seed=42)
small_train = train_shuffled.select(range(2000))
small_test = test_shuffled.select(range(1000))


In [None]:

# Classical baseline: TF-IDF features fed into a logistic regression.
X_train, y_train = small_train["text"], small_train["label"]
X_test, y_test = small_test["text"], small_test["label"]

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

# `acc` and `f1` are kept as globals: the final comparison cell prints them again.
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
print(f"Baseline TF-IDF + LogReg -> Accuracy: {acc:.4f}, F1: {f1:.4f}")


In [None]:

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "text" field of a batch, padding/truncating to 128 tokens.

    Args:
        batch: mapping with a "text" entry, as supplied by ``Dataset.map``
            when ``batched=True``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Encode both subsets with the same settings.
train_enc, test_enc = (
    split.map(tokenize, batched=True, batch_size=32)
    for split in (small_train, small_test)
)

# Restrict both encoded subsets to the listed columns, in "torch" format.
for encoded in (train_enc, test_enc):
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:

# Load the pretrained checkpoint named by `model_name`, configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [None]:

def compute_metrics(pred):
    """Compute accuracy and F1 from an evaluation prediction object.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over axis 1 is
            taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat),
    }

# transformers renamed two keyword arguments across releases:
#   TrainingArguments(evaluation_strategy=...) -> eval_strategy
#   Trainer(tokenizer=...)                     -> processing_class
# The notebook does not pin a transformers version, so pick whichever spelling
# the installed release accepts instead of hard-coding one of them.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="no",
    num_train_epochs=2,  # quick demo
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_enc,
    eval_dataset=test_enc,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)


In [None]:

# Run the training loop configured on `trainer`.
trainer.train()
# Evaluate and keep the returned metrics; the comparison cell reads
# metrics['eval_accuracy'] and metrics['eval_f1'] from this dict.
metrics = trainer.evaluate()
print(metrics)


In [None]:

# Side-by-side summary of the baseline and the fine-tuned model, one line each.
print(
    "=== Comparación final ===",
    f"Baseline TF-IDF + LogReg -> Accuracy: {acc:.4f}, F1: {f1:.4f}",
    f"BERT Fine-tuned         -> Accuracy: {metrics['eval_accuracy']:.4f}, F1: {metrics['eval_f1']:.4f}",
    sep="\n",
)


In [None]:

examples = [
    "I really loved this movie, the story was amazing!",
    "This was the worst customer experience I ever had."
]

# Put the model in inference mode explicitly, so this cell does not depend on
# the train/eval state left behind by an earlier cell.
model.eval()

# After training, the model may live on an accelerator while freshly tokenized
# inputs are CPU tensors; move the inputs to the model's device to avoid a
# device-mismatch error in the forward pass.
inputs = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1)

for txt, pred in zip(examples, preds):
    # Assumes class index 1 is the positive label in the training data -- TODO confirm.
    label = "positive" if pred.item() == 1 else "negative"
    print(f"{txt} -> {label}")



## Conclusión y discusión

- **TF-IDF + LogReg**: rápido, interpretable, rendimiento aceptable (~80–85%).  
- **BERT fine-tuned**: mayor performance (~90%+), pero con más costo y menor interpretabilidad.  
- En una empresa: trade-off entre **performance vs costo vs explicabilidad**.  
