In [34]:
from datasets import load_dataset
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import numpy as np
from torch.optim import AdamW
from torch.nn import MSELoss

# Fetch the HelpSteer corpus and expose both of its splits as DataFrames
dataset = load_dataset("nvidia/HelpSteer")
train_df, validation_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ('train', 'validation')
)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# The tokenizer and the model are both loaded from the same checkpoint name
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# num_labels=1 requests a single output value, used here as a regression score
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=1,
)
model = model.to(device)

# Prepare the dataset
class HelpSteerDataset(Dataset):
    """Dataset that tokenizes one prompt/response pair per item, on access.

    Each item is built by joining the prompt and the response with a literal
    ``" [SEP] "`` and passing the resulting string to ``tokenizer`` with
    truncation and padding to ``max_length``.

    Args:
        prompts: Indexable collection of prompt strings.
        responses: Indexable collection of response strings, aligned
            index-by-index with ``prompts``.
        labels: Indexable collection of numeric targets; its length is
            reported as the dataset length.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")``
            that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_length: Length handed to the tokenizer for truncation/padding.
    """

    def __init__(self, prompts, responses, labels, tokenizer, max_length=128):
        self.prompts = prompts
        self.responses = responses
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx) -> dict:
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs with their leading axis removed) and
            ``'label'`` (a float tensor built from ``labels[idx]``).
        """
        # Concatenate prompt and response with a separator token.
        # NOTE(review): because the pair is one string, truncation can only
        # trim from the end of the combined text - confirm this is the
        # intended policy for long prompts.
        text = self.prompts[idx] + " [SEP] " + self.responses[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # Remove only the leading axis (assumed to be the size-1 batch axis
        # added by return_tensors="pt"). A bare squeeze() would also drop the
        # sequence axis whenever max_length == 1 and yield 0-d tensors.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.float),
        }

# Target (y) is the 'complexity' column; features (X) are the two text columns
y = train_df['complexity'].values
X = train_df[['prompt', 'response']].values

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# In each feature array, column 0 holds the prompts and column 1 the responses
train_dataset = HelpSteerDataset(
    prompts=X_train[:, 0].tolist(),
    responses=X_train[:, 1].tolist(),
    labels=y_train,
    tokenizer=tokenizer,
)
test_dataset = HelpSteerDataset(
    prompts=X_test[:, 0].tolist(),
    responses=X_test[:, 1].tolist(),
    labels=y_test,
    tokenizer=tokenizer,
)

# Batch both datasets; only the training batches are reshuffled
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Loss object (Mean Squared Error for regression) and the optimizer over all
# model parameters
criterion = MSELoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# Fine-tuning: make `epochs` full passes over the training DataLoader
epochs = 3
model.train()  # switch to training mode once, before the first pass

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        # Start every step with cleared gradients
        optimizer.zero_grad()

        # Transfer the three batch tensors to the selected device
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        # Forward pass; labels are passed in so the model output carries the
        # loss, and unsqueeze(1) inserts an axis at position 1 of the labels
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels.unsqueeze(1),
        )
        loss = outputs.loss

        # Backpropagate and apply the parameter update
        loss.backward()
        optimizer.step()

        # Accumulate the scalar loss for the per-epoch average
        total_loss += loss.item()

    # Mean of the per-batch losses over this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss:.4f}")

# Evaluate the model on the held-out test batches
model.eval()
all_predictions = []
all_targets = []

# No gradients are needed for evaluation
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        # Move data to the selected device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass (no labels passed: only the predictions are used here)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Drop only the trailing logit axis (assumes logits are (batch, 1),
        # as requested with num_labels=1). A bare squeeze() would also remove
        # the batch axis when the final batch holds a single example, leaving
        # a 0-d array that list.extend() cannot iterate.
        predictions = outputs.logits.squeeze(-1)

        # Store predictions and targets
        all_predictions.extend(predictions.cpu().numpy())
        all_targets.extend(labels.cpu().numpy())

# Compute evaluation metrics.
# RMSE is taken as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(all_targets, all_predictions))
mae = mean_absolute_error(all_targets, all_predictions)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training: 100%|██████████| 884/884 [07:19<00:00,  2.01it/s]


Epoch 1 Average Loss: 0.6383
Epoch 2/3


Training: 100%|██████████| 884/884 [07:13<00:00,  2.04it/s]


Epoch 2 Average Loss: 0.4363
Epoch 3/3


Training: 100%|██████████| 884/884 [07:13<00:00,  2.04it/s]


Epoch 3 Average Loss: 0.2931


Evaluating: 100%|██████████| 221/221 [01:22<00:00,  2.68it/s]

RMSE: 0.5992
MAE: 0.4519



