In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from tqdm.auto import tqdm

# 1. Prepare dataset
# The CSV is presumably the saved result of
#   df.dropna(subset=["title", "engagement_ratio_z"])[["title", "engagement_ratio_z"]]
# (verify against the notebook/script that writes it).
# NOTE: `index=False` is a DataFrame.to_csv() argument; read_csv() does not
# accept it and raises TypeError, so it must not be passed here.
df_sub = pd.read_csv("data/processed/distilbert_regression_data.csv")
# A CSV round trip can reintroduce missing values (empty titles, or titles
# such as "NA"/"null" that read_csv parses as NaN by default), so drop them
# again before they reach the tokenizer or the regression target.
df_sub = df_sub.dropna(subset=["title", "engagement_ratio_z"])
# Fixed seed keeps the 80/20 train/test split reproducible between runs.
train_df, test_df = train_test_split(df_sub, test_size=0.2, random_state=42)

# 2. Create a dataset wrapper
class TitleDataset(Dataset):
    """Map-style dataset pairing tokenized titles with regression targets.

    Args:
        df: Frame providing a ``title`` column (text to tokenize) and an
            ``engagement_ratio_z`` column (numeric target).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_len: Length each title is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=64):
        self.titles = df["title"].tolist()
        self.targets = df["engagement_ratio_z"].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of titles in the dataset."""
        return len(self.titles)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch axis removed) and ``labels`` (the
            target as a float32 scalar tensor).
        """
        encoding = self.tokenizer(
            self.titles[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # The tokenizer output for a single title is assumed to carry a
        # leading batch axis of size 1. Drop only that axis: an axis-less
        # squeeze() would also remove the sequence axis when max_len == 1 and
        # hand the DataLoader 0-d tensors.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.targets[idx], dtype=torch.float)
        }

# 3. Initialize model and tokenizer
# Both objects are loaded from the same pretrained checkpoint name.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
# A single output label with problem_type="regression" configures the
# sequence-classification head for a one-value regression target.
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=1,
    problem_type="regression",
)

# 4. Instantiate dataloaders
# Wrap each split in a TitleDataset that shares the one tokenizer.
train_ds, test_ds = (
    TitleDataset(split, tokenizer) for split in (train_df, test_df)
)
# Training batches are reshuffled every epoch; evaluation keeps dataset order.
train_loader = DataLoader(dataset=train_ds, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=32, shuffle=False)

# 5. Setup training
# Run on CUDA when PyTorch reports an available device, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
# NOTE(review): loss_fn is defined here but the visible training loop reads
# the loss from the model output instead - confirm whether it is still needed.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

# 6. Train for a few epochs
# Three passes over train_loader; the mean batch loss is printed per epoch.
model.train()
for epoch in range(3):
    running_loss = 0
    for batch in tqdm(train_loader, desc=f"Training epoch {epoch+1}"):
        # Move every tensor of the batch onto the training device at once.
        on_device = {key: tensor.to(device) for key, tensor in batch.items()}

        optimizer.zero_grad()
        # Labels go from (batch,) to (batch, 1); the model returns the loss
        # itself because labels are passed in.
        outputs = model(
            on_device["input_ids"],
            attention_mask=on_device["attention_mask"],
            labels=on_device["labels"].unsqueeze(1),
        )
        batch_loss = outputs.loss
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    print(f"Epoch {epoch+1} Loss: {running_loss / len(train_loader):.4f}")

# 7. Evaluate on test set
# Collect per-sample predictions and targets over test_loader, then report
# R² and RMSE. Gradients are disabled because nothing is being trained.
model.eval()
preds, true = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        labels = batch["labels"].reshape(-1).cpu().numpy()
        out = model(input_ids, attention_mask=attn)
        # Flatten to 1-D explicitly. An axis-less squeeze() collapses a final
        # batch holding a single sample to a 0-d array, and list.extend()
        # then fails with "iteration over a 0-d array".
        pred = out.logits.reshape(-1).cpu().numpy()
        preds.extend(pred)
        true.extend(labels)

r2 = r2_score(true, preds)
rmse = np.sqrt(mean_squared_error(true, preds))
print(f"DistilBERT R²: {r2:.4f}, RMSE: {rmse:.4f}")
