# 2.1 Objective

To perform a comprehensive exploratory data analysis (EDA) on the HelpSteer Dataset to
understand the data's characteristics and attribute correlations.

In [1]:
!pip install datasets




In [2]:
from datasets import load_dataset
import pandas as pd

# Load the dataset splits
dataset = load_dataset("nvidia/HelpSteer")
train_df = pd.DataFrame(dataset['train'])
validation_df = pd.DataFrame(dataset['validation'])


README.md:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

validation.jsonl.gz:   0%|          | 0.00/813k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/35331 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1789 [00:00<?, ? examples/s]

In [3]:
train_df.head()

Unnamed: 0,prompt,response,helpfulness,correctness,coherence,complexity,verbosity
0,What are the three most important things to co...,To build an assistive device to help an elderl...,3,4,4,2,2
1,What are the three most important things to co...,There are many different types of assistive de...,4,3,3,2,3
2,What are the three most important things to co...,When deciding what technology to use to build ...,4,4,4,2,2
3,What are the three most important things to co...,You can create an assistant device to help an ...,3,3,3,2,3
4,Background:\n<start of reference>\nFamily doct...,"Hi there! I'm Dr. Family, and I'm here to tell...",3,3,3,2,1


In [4]:
validation_df.head()

Unnamed: 0,prompt,response,helpfulness,correctness,coherence,complexity,verbosity
0,The reference text below provides context for ...,A woman who helped her cousin retrieve her bel...,3,2,3,2,2
1,The reference text below provides context for ...,A woman who tried to help her cousin retrieve ...,2,2,3,1,2
2,The following information may be useful:\n<sta...,The protagonist has a very casual attitude tow...,3,2,3,1,1
3,The following information may be useful:\n<sta...,The protagonist has a positive attitude toward...,3,3,3,2,1
4,The following information may be useful:\n<sta...,The protagonist's attitude toward swear words ...,3,3,3,2,2


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35331 entries, 0 to 35330
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   prompt       35331 non-null  object
 1   response     35331 non-null  object
 2   helpfulness  35331 non-null  int64 
 3   correctness  35331 non-null  int64 
 4   coherence    35331 non-null  int64 
 5   complexity   35331 non-null  int64 
 6   verbosity    35331 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 1.9+ MB


In [6]:
train_df["complexity"].describe()

count    35331.000000
mean         1.443888
std          0.822268
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          4.000000
Name: complexity, dtype: float64

# 3.1 Objective

Develop a regression model capable of reasonably accurately predicting the complexity
attribute of a response using the HelpSteer Dataset.

Use a small model like DistilBERT to extract embeddings from the prompt and response columns

In [7]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [8]:
from transformers import DistilBertTokenizer, DistilBertModel

# Load the model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)  # Move model to GPU


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

 Tokenize the input text, passes it through the model, and returns the embeddings

In [9]:
def get_embeddings(text):
    # Tokenize the input text and send it to the GPU
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
    
    # Generate embeddings with no gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Move embeddings back to CPU for further processing if necessary
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embeddings


Apply this function to both the prompt and response columns

In [10]:
train_df['prompt_embedding'] = train_df['prompt'].apply(get_embeddings)
train_df['response_embedding'] = train_df['response'].apply(get_embeddings)


In [11]:
import numpy as np

# Average the prompt and response embeddings
train_df['combined_embedding'] = train_df.apply(lambda row: (row['prompt_embedding'] + row['response_embedding']) , axis=1)


# Prepare the feature matrix (X) and target variable (y)
X = np.vstack(train_df['combined_embedding'].values)
y = train_df['complexity'].values


In [12]:
train_df['combined_embedding']

0        [-0.4473592, 0.56709456, 0.3154971, -0.0720754...
1        [-0.5759791, 0.607692, 0.2765418, -0.09543538,...
2        [-0.67710173, 0.4798023, 0.25830466, -0.029011...
3        [-0.53883135, 0.4674706, 0.37658167, 0.0036770...
4        [0.0087714195, 0.64115447, 0.36642605, -0.2480...
                               ...                        
35326    [-1.5298746, -0.370573, 0.47147185, -0.2574832...
35327    [-0.47161123, 0.105901666, 0.08467649, -0.0057...
35328    [-0.34656668, -0.09481117, 0.19014813, 0.02840...
35329    [0.27400887, 0.114452496, 0.00477916, 0.052146...
35330    [-0.39414436, -0.123563945, 0.34819585, 0.0215...
Name: combined_embedding, Length: 35331, dtype: object

In [13]:
train_df['combined_embedding'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 35331 entries, 0 to 35330
Series name: combined_embedding
Non-Null Count  Dtype 
--------------  ----- 
35331 non-null  object
dtypes: object(1)
memory usage: 276.1+ KB


In [14]:
sample_embedding_shape = train_df['combined_embedding'].iloc[0].shape
print("Sample embedding shape:", sample_embedding_shape)


Sample embedding shape: (768,)


In [15]:
from sklearn.model_selection import train_test_split

# X is the feature matrix (e.g., combined embeddings), y is the target (complexity)
X = np.vstack(train_df['combined_embedding'].values)  
y = train_df['complexity'].values

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
print("Training set:", X_train.shape, y_train.shape)
print("Testing set:", X_test.shape, y_test.shape)


Training set: (28264, 768) (28264,)
Testing set: (7067, 768) (7067,)


In [17]:
X_train.shape

(28264, 768)

In [21]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tqdm import tqdm
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Base models with GPU support
base_models = {
    'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, tree_method='gpu_hist'),
    'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, device='gpu'),
}

# Train each base model with tqdm and GPU
base_predictions = {}
for name, model in tqdm(base_models.items(), desc="Training Base Models on GPU"):
    model.fit(X_train, y_train)
    base_predictions[name] = model.predict(X_test)

# Combine predictions from base models as features for the meta-model
meta_features = np.column_stack(list(base_predictions.values()))

# Meta-model (still on CPU for simplicity)
meta_model = Ridge(alpha=1.0)

# Train the meta-model with tqdm
with tqdm(total=1, desc="Training Meta-Model") as pbar:
    meta_model.fit(meta_features, y_test)
    pbar.update(1)

# Final predictions
y_pred = meta_model.predict(meta_features)

# Evaluate the ensemble model
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)

print(f"\nMean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Training Base Models on GPU:  50%|█████     | 1/2 [00:03<00:03,  3.66s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 28264, number of used features: 768
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 768 dense feature groups (20.70 MB) transferred to GPU in 0.018351 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 1.439251


Training Base Models on GPU: 100%|██████████| 2/2 [00:19<00:00,  9.79s/it]
Training Meta-Model: 100%|██████████| 1/1 [00:00<00:00, 46.03it/s]


Mean Absolute Error (MAE): 0.5180972612314204
Root Mean Squared Error (RMSE): 0.6425749929200665



