# 2.1 Objective

To perform a comprehensive exploratory data analysis (EDA) on the HelpSteer Dataset to
understand the data's characteristics and attribute correlations.

In [14]:
!pip install datasets




In [15]:
from datasets import load_dataset
import pandas as pd

# Load the dataset splits from the Hugging Face Hub
dataset = load_dataset("nvidia/HelpSteer")

# Convert each split with Dataset.to_pandas(), the library's own conversion
# method, instead of handing the Dataset object to the DataFrame constructor.
train_df = dataset['train'].to_pandas()
validation_df = dataset['validation'].to_pandas()


In [16]:
# Preview the first rows of the training split
train_df.head()

Unnamed: 0,prompt,response,helpfulness,correctness,coherence,complexity,verbosity
0,What are the three most important things to co...,To build an assistive device to help an elderl...,3,4,4,2,2
1,What are the three most important things to co...,There are many different types of assistive de...,4,3,3,2,3
2,What are the three most important things to co...,When deciding what technology to use to build ...,4,4,4,2,2
3,What are the three most important things to co...,You can create an assistant device to help an ...,3,3,3,2,3
4,Background:\n<start of reference>\nFamily doct...,"Hi there! I'm Dr. Family, and I'm here to tell...",3,3,3,2,1


In [17]:
# Preview the first rows of the validation split
validation_df.head()

Unnamed: 0,prompt,response,helpfulness,correctness,coherence,complexity,verbosity
0,The reference text below provides context for ...,A woman who helped her cousin retrieve her bel...,3,2,3,2,2
1,The reference text below provides context for ...,A woman who tried to help her cousin retrieve ...,2,2,3,1,2
2,The following information may be useful:\n<sta...,The protagonist has a very casual attitude tow...,3,2,3,1,1
3,The following information may be useful:\n<sta...,The protagonist has a positive attitude toward...,3,3,3,2,1
4,The following information may be useful:\n<sta...,The protagonist's attitude toward swear words ...,3,3,3,2,2


In [18]:
# Summarize column dtypes and non-null counts for the training split
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35331 entries, 0 to 35330
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   prompt       35331 non-null  object
 1   response     35331 non-null  object
 2   helpfulness  35331 non-null  int64 
 3   correctness  35331 non-null  int64 
 4   coherence    35331 non-null  int64 
 5   complexity   35331 non-null  int64 
 6   verbosity    35331 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 1.9+ MB


In [19]:
# Summary statistics of the `complexity` column (used later as the regression target)
train_df["complexity"].describe()

count    35331.000000
mean         1.443888
std          0.822268
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          4.000000
Name: complexity, dtype: float64

# 3.1 Objective

Develop a regression model that predicts the complexity attribute of a response
with reasonable accuracy, using the HelpSteer Dataset.

Use a small model like DistilBERT to extract embeddings from the prompt and response columns

In [22]:
import torch

# Pick CUDA when it is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)


Using device: cuda


In [23]:
from transformers import DistilBertTokenizer, DistilBertModel

# Fetch the pretrained tokenizer and encoder from the same checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)
model = model.to(device)  # run the encoder on the selected device


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Tokenize the input text, pass it through the model, and return the mean-pooled embeddings.

In [24]:
def get_embeddings(text, max_length=128):
    """Embed text with the module-level DistilBERT `model` using mean pooling.

    The text is tokenized with the module-level `tokenizer`, run through
    `model` on `device` without gradient tracking, and the token-level hidden
    states are averaged over the real (non-padding) tokens only.

    Args:
        text: A string, or a list of strings to embed as one batch.
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated. Defaults to 128.

    Returns:
        A NumPy array on the CPU. For a single string this is a 1-D vector of
        the model's hidden size. Singleton dimensions are squeezed away, so a
        batch keeps its leading batch axis only when it holds more than one text.
    """
    # Tokenize the input text and send it to the same device as the model
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length).to(device)

    # Generate embeddings with no gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state

    # Mask-aware mean: padding positions (attention_mask == 0) must not dilute
    # the average when several texts of different lengths are embedded together.
    # For a single string every position is a real token, so this equals a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden_states * mask).sum(dim=1) / token_counts

    # Move embeddings back to CPU for further processing
    embeddings = pooled.squeeze().cpu().numpy()
    return embeddings


Apply this function to both the prompt and response columns

In [25]:
# Embed each text column row by row, then attach the results to the frame
prompt_vectors = train_df['prompt'].apply(get_embeddings)
train_df['prompt_embedding'] = prompt_vectors

response_vectors = train_df['response'].apply(get_embeddings)
train_df['response_embedding'] = response_vectors


In [26]:
import numpy as np

# Combine the prompt and response embeddings by element-wise SUM.
# (The result is not divided by two, so this is a sum rather than an average.)
# Stacking each column once and adding the matrices replaces the row-by-row
# DataFrame.apply and gives the same values.
prompt_matrix = np.vstack(train_df['prompt_embedding'].values)
response_matrix = np.vstack(train_df['response_embedding'].values)

# Prepare the feature matrix (X) and target variable (y)
X = prompt_matrix + response_matrix
y = train_df['complexity'].values

# Keep one vector per row on the frame; later cells read this column.
train_df['combined_embedding'] = pd.Series(list(X), index=train_df.index)


In [27]:
# Inspect the combined embedding column (one array per row)
train_df['combined_embedding']

0        [-0.4473592, 0.56709456, 0.3154971, -0.0720754...
1        [-0.5759791, 0.607692, 0.2765418, -0.09543538,...
2        [-0.67710173, 0.4798023, 0.25830466, -0.029011...
3        [-0.53883135, 0.4674706, 0.37658167, 0.0036770...
4        [0.0087714195, 0.64115447, 0.36642605, -0.2480...
                               ...                        
35326    [-1.5298746, -0.370573, 0.47147185, -0.2574832...
35327    [-0.47161123, 0.105901666, 0.08467649, -0.0057...
35328    [-0.34656668, -0.09481117, 0.19014813, 0.02840...
35329    [0.27400887, 0.114452496, 0.00477916, 0.052146...
35330    [-0.39414436, -0.123563945, 0.34819585, 0.0215...
Name: combined_embedding, Length: 35331, dtype: object

In [28]:
# Check the dtype and non-null count of the combined embedding column
train_df['combined_embedding'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 35331 entries, 0 to 35330
Series name: combined_embedding
Non-Null Count  Dtype 
--------------  ----- 
35331 non-null  object
dtypes: object(1)
memory usage: 276.1+ KB


In [29]:
# Look at the first row's vector to confirm the per-example embedding shape
first_vector = train_df['combined_embedding'].iloc[0]
sample_embedding_shape = first_vector.shape
print("Sample embedding shape:", sample_embedding_shape)


Sample embedding shape: (768,)


In [30]:
from sklearn.model_selection import train_test_split

# Feature matrix: one row per example, stacked from the per-row combined embeddings
X = np.vstack(train_df['combined_embedding'].to_numpy())
# Regression target
y = train_df['complexity'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [31]:
# Report the feature and target shapes of each split
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print("Training set:", *train_shapes)
print("Testing set:", *test_shapes)


Training set: (28264, 768) (28264,)
Testing set: (7067, 768) (7067,)


In [32]:
# Shape of the training feature matrix: (rows, embedding dimensions)
X_train.shape

(28264, 768)

# Bypass the Embedding Layer:
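The custom model accepts `precomputed_embeddings` in its `forward` method and passes them to DistilBERT through `inputs_embeds` instead of token IDs, so no tokenization or token-ID lookup happens inside the model.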
# Regression Head: 
A single linear layer (`self.regression_head`) is applied to the base model's hidden state at the first sequence position to produce one continuous output for regression.
# Forward Pass: 
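The model expands `precomputed_embeddings` to shape `[batch_size, 512, 768]` (the same vector repeated at every position), passes the result to the base model as `inputs_embeds`, and feeds the hidden state at position 0 to the regression head.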

In [33]:
import torch
from torch import nn
from transformers import DistilBertModel, DistilBertConfig

# Load the base DistilBERT encoder via DistilBertModel (no classification head).
# This creates a separate instance from the `model` loaded earlier for embedding extraction.
base_model = DistilBertModel.from_pretrained("distilbert-base-uncased")


class DistilBERTForRegression(nn.Module):
    """Regression wrapper that feeds precomputed vectors to a base encoder.

    Each input vector is repeated across `seq_len` sequence positions and
    passed to the base model as `inputs_embeds`. The hidden state at position 0
    is then mapped to a single continuous value by a linear layer.

    Args:
        base_model: Encoder exposing `config.hidden_size`, accepting an
            `inputs_embeds` keyword and returning an object with
            `last_hidden_state`.
        seq_len: Number of sequence positions the input vector is repeated
            over. Defaults to 512.
    """

    def __init__(self, base_model, seq_len=512):
        super().__init__()
        self.base_model = base_model  # DistilBERT base model
        self.seq_len = seq_len

        # Regression head for single output
        self.regression_head = nn.Linear(self.base_model.config.hidden_size, 1)

    def forward(self, precomputed_embeddings):
        """Return predictions of shape [batch_size, 1].

        Args:
            precomputed_embeddings: Tensor of shape [batch_size, hidden_size].
        """
        batch_size = precomputed_embeddings.size(0)
        # Read the width from the model config rather than hard-coding 768
        hidden_size = self.base_model.config.hidden_size

        # Expand embeddings to [batch_size, seq_len, hidden_size]; expand() creates
        # a broadcast view, so every position carries the same vector.
        expanded_embeddings = precomputed_embeddings.unsqueeze(1).expand(batch_size, self.seq_len, hidden_size)

        # Pass expanded embeddings to the transformer layers
        transformer_output = self.base_model(inputs_embeds=expanded_embeddings)
        pooled_output = transformer_output.last_hidden_state[:, 0]  # hidden state at position 0
        return self.regression_head(pooled_output)  # Output continuous value for regression



# Build the regression wrapper around the base encoder, then place it on the selected device
regression_model = DistilBERTForRegression(base_model)
regression_model = regression_model.to(device)

In [34]:
from tqdm import tqdm
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert data to tensors and create a DataLoader (reshuffled every epoch)
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Define optimizer and loss function
optimizer = optim.Adam(regression_model.parameters(), lr=2e-5)
criterion = nn.MSELoss()  # MSE for regression

# Training loop
epochs = 3
regression_model.train()  # Set model to training mode

for epoch in range(epochs):
    total_loss = 0.0
    print(f"Epoch {epoch + 1}/{epochs}")

    # Wrap the training loop with tqdm for a progress bar
    for batch in tqdm(train_loader, desc="Training", leave=False):
        optimizer.zero_grad()

        # Move inputs and targets to device
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Forward pass
        outputs = regression_model(inputs)
        # squeeze(-1) removes only the output dimension ([batch, 1] -> [batch]).
        # A bare squeeze() would also drop the batch dimension for a batch of one,
        # leaving a 0-d prediction that no longer matches the 1-D targets.
        loss = criterion(outputs.squeeze(-1), targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print the mean of the per-batch losses for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Average Loss: {avg_loss:.4f}")


Epoch 1/3


                                                             

Average Loss: 0.5300
Epoch 2/3


                                                             

Average Loss: 0.4939
Epoch 3/3


                                                             

Average Loss: 0.4587




In [35]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import numpy as np

# Convert test data to tensors and create a DataLoader (fixed order, no shuffling)
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set model to evaluation mode
regression_model.eval()

# Initialize lists to store predictions and actual values
all_predictions = []
all_targets = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", leave=False):
        # Move inputs and targets to device
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Forward pass to get predictions
        outputs = regression_model(inputs)

        # squeeze(-1) keeps a 1-D array even when the final batch holds a single
        # example; a bare squeeze() would yield a 0-d array that extend() cannot iterate.
        all_predictions.extend(outputs.squeeze(-1).cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Calculate RMSE and MAE.
# RMSE is computed as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in later releases.
rmse = np.sqrt(mean_squared_error(all_targets, all_predictions))
mae = mean_absolute_error(all_targets, all_predictions)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


                                                             

RMSE: 0.6840
MAE: 0.5442




In [36]:
from tqdm import tqdm
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert data to tensors and create a DataLoader (reshuffled every epoch)
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# NOTE(review): regression_model is not re-created in this cell, so this run
# continues from whatever weights earlier training cells left behind rather
# than starting from the pretrained checkpoint. Confirm this is intended.

# Define optimizer and loss function
optimizer = optim.Adam(regression_model.parameters(), lr=1e-5)
criterion = nn.MSELoss()  # MSE for regression

# Training loop
epochs = 3
regression_model.train()  # Set model to training mode

for epoch in range(epochs):
    total_loss = 0.0
    print(f"Epoch {epoch + 1}/{epochs}")

    # Wrap the training loop with tqdm for a progress bar
    for batch in tqdm(train_loader, desc="Training", leave=False):
        optimizer.zero_grad()

        # Move inputs and targets to device
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Forward pass
        outputs = regression_model(inputs)
        # squeeze(-1) removes only the output dimension ([batch, 1] -> [batch]).
        # A bare squeeze() would also drop the batch dimension for a batch of one,
        # leaving a 0-d prediction that no longer matches the 1-D targets.
        loss = criterion(outputs.squeeze(-1), targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print the mean of the per-batch losses for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Average Loss: {avg_loss:.4f}")


Epoch 1/3


                                                           

Average Loss: 0.3642
Epoch 2/3


                                                           

Average Loss: 0.3041
Epoch 3/3


                                                           

Average Loss: 0.2488




In [37]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import numpy as np

# Convert test data to tensors and create a DataLoader (fixed order, no shuffling)
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set model to evaluation mode
regression_model.eval()

# Initialize lists to store predictions and actual values
all_predictions = []
all_targets = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", leave=False):
        # Move inputs and targets to device
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Forward pass to get predictions
        outputs = regression_model(inputs)

        # squeeze(-1) keeps a 1-D array even when the final batch holds a single
        # example; a bare squeeze() would yield a 0-d array that extend() cannot iterate.
        all_predictions.extend(outputs.squeeze(-1).cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Calculate RMSE and MAE.
# RMSE is computed as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in later releases.
rmse = np.sqrt(mean_squared_error(all_targets, all_predictions))
mae = mean_absolute_error(all_targets, all_predictions)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


                                                             

RMSE: 0.6308
MAE: 0.4916




In [38]:
from tqdm import tqdm
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert data to tensors and create a DataLoader (reshuffled every epoch)
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# NOTE(review): regression_model is not re-created in this cell, so this run
# continues from whatever weights earlier training cells left behind.
# NOTE(review): lr=2e-2 is three orders of magnitude larger than the 2e-5 / 1e-5
# used in the other training cells. Confirm this value is intended.

# Define optimizer and loss function
optimizer = optim.Adam(regression_model.parameters(), lr=2e-2)
criterion = nn.MSELoss()  # MSE for regression

# Training loop
epochs = 3
regression_model.train()  # Set model to training mode

for epoch in range(epochs):
    total_loss = 0.0
    print(f"Epoch {epoch + 1}/{epochs}")

    # Wrap the training loop with tqdm for a progress bar
    for batch in tqdm(train_loader, desc="Training", leave=False):
        optimizer.zero_grad()

        # Move inputs and targets to device
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Forward pass
        outputs = regression_model(inputs)
        # squeeze(-1) removes only the output dimension ([batch, 1] -> [batch]).
        # A bare squeeze() would also drop the batch dimension for a batch of one,
        # leaving a 0-d prediction that no longer matches the 1-D targets.
        loss = criterion(outputs.squeeze(-1), targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print the mean of the per-batch losses for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Average Loss: {avg_loss:.4f}")


Epoch 1/3


                                                           

Average Loss: 1.1728
Epoch 2/3


                                                           

Average Loss: 0.6980
Epoch 3/3


                                                           

Average Loss: 0.6935




In [39]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import numpy as np

# Convert test data to tensors and create a DataLoader (fixed order, no shuffling)
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set model to evaluation mode
regression_model.eval()

# Initialize lists to store predictions and actual values
all_predictions = []
all_targets = []

# Disable gradient computation for evaluation
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", leave=False):
        # Move inputs and targets to device
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Forward pass to get predictions
        outputs = regression_model(inputs)

        # squeeze(-1) keeps a 1-D array even when the final batch holds a single
        # example; a bare squeeze() would yield a 0-d array that extend() cannot iterate.
        all_predictions.extend(outputs.squeeze(-1).cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Calculate RMSE and MAE.
# RMSE is computed as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in later releases.
rmse = np.sqrt(mean_squared_error(all_targets, all_predictions))
mae = mean_absolute_error(all_targets, all_predictions)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


                                                             

RMSE: 0.8245
MAE: 0.7105


