In [1]:
#|hide
## Standard libraries
import os
import math
import numpy as np
import time
from fastcore.all import *
from nbdev.showdoc import *

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgba
import seaborn as sns
sns.set()

## Progress bar
from tqdm.auto import tqdm, trange

## project specifics
import murdo
import transformers

%load_ext autoreload
%autoreload 2

# Integrating Epinet Into Armo Regression
> C'mon, c'mon and meet the epinets

ArmoRM's regression layer is pretty simple: it's just a linear layer, with a bias term, optimized through Ridge regression. In other words, it's just performing a change of basis on the underlying last-layer embeddings of Llama3 8b. And apparently, dimensions already exist in this marvelous latent space that correspond to many of the dimensions we care about.

What can the epinet add to this? First, the structure of the epinet should be pretty simple: we'll replicate the linear layer being used with Ridge Regression with an MLP (hoping gradient descent is sufficient to extract the same dimensional information, and assuming we need extra compute power both to get the dimensions and to reason about their epistemic status), but will multiply the output by the random epistemic index. In theory, this should allow the epinet to reduce the MSE regression loss by adding randomness to dimensions with questionable epistemic status.

In this notebook, we integrate an mlp epinet with the regression layer. We'll try the simplest possible integration first, then perform training (which, given the small size, should be pretty quick), and iterate.

How will we measure whether the integration works? First, we can sanity check by seeing how uncertainty compares across dimensions of rewards, comparing with our prior data on which dimensions have the most activation and which appear to be duplicates of each other. Ultimately, we can measure the performance of the reward model by doing some uncertainty-weighted best of N search.

A note on form: prior versions of zetteldev had an emphasis on atomic notebooks for experimentation. We break from that. This document is more 'computational essay/lab report' than slip. It will contain many ideas, and confront much computational and ideological reducibility. The metaphors worthy of further abstraction will be highlighted in a separate report, so see the 'Reports' folder for the high level summary. What follows is a 'lab report' in chronological order.

**Hypothesis**:
1. Integrating the MLP reward model with an MLP epinet will enable the prediction of uncertainty per reward dimension per prompt.
2. The uncertainty estimate should change with prompt response pairs.
3. When the gating layer denotes a reward dimension as irrelevant, it should have a higher uncertainty.
4. Integrating uncertainty into a Best of N sampler from a base llama model should have superior performance to the reward model without uncertainty.

# Machinery

First, we'll set up the pretrained reward model, then extract dimensions and such from it.

In [2]:
import torch, numpy

In [3]:
# Identifiers of the base reward model and the multi-objective dataset; together
# they name the saved regression-weight file.
model_name, dataset_name = ("FsfairX-LLaMA3-RM-v0.1", "ArmoRM-Multi-Objective-Data-v0.1")
# Resolve against the current user's home directory (the same convention
# load_embeddings_and_preferences uses) instead of a hard-coded /home/<user> path.
save_dir = os.path.join(os.path.expanduser("~"), "data", "ArmoRM", "regression_weights")
save_path = os.path.join(save_dir, f"{model_name}_{dataset_name}.pt")
# map_location="cpu": the weights are multiplied with CPU embeddings and the result
# is converted with .numpy() further down, so they must live on the CPU regardless
# of the device they were saved from.
regression_layer = torch.load(save_path, map_location="cpu")["weight"]

In [4]:
# regression_layer holds one row of weights per reward attribute: (n_attributes, hidden_size)
n_attributes, hidden_size = regression_layer.shape

Example usage:

In [5]:
# Random inputs stand in for real embeddings; the product has shape (800, n_attributes)
pairwise_rewards = torch.rand(800,hidden_size) @ regression_layer.T

Load the dataset prompt-response embeddings from the base LLM, as inputs to the regression layer.

In [6]:
import os
import torch
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from safetensors.torch import load_file
from argparse import ArgumentParser
def load_embeddings_and_preferences(embeddings_dir=None, model_name=None, dataset_name=None):
    """
    Load embeddings and preference labels from sharded safetensors files.

    Shards are discovered with the pattern ``f"{embeddings_dir}-*.safetensors"``,
    so ``embeddings_dir`` acts as a path *prefix* shared by every shard file
    rather than as a directory that contains them.

    Args:
        embeddings_dir (str, optional): Path prefix of the shard files. When
            omitted it defaults to
            ``~/data/ArmoRM/embeddings/<model_name>/<dataset_name>``.
        model_name (str, optional): Name of the model. Required when
            ``embeddings_dir`` is not given.
        dataset_name (str, optional): Name of the dataset. Required when
            ``embeddings_dir`` is not given.

    Returns:
        tuple: (embeddings tensor, labels tensor), each the concatenation along
        dim 0 of the ``"embeddings"`` / ``"labels"`` entries of all shards,
        cast to float32.

    Raises:
        ValueError: If neither ``embeddings_dir`` nor both of ``model_name`` and
            ``dataset_name`` are provided.
        FileNotFoundError: If no shard file matches the pattern.
        AssertionError: If embeddings and labels have different row counts.
    """
    # Imported locally under a private alias: this notebook binds the name `glob`
    # both to the function (`from glob import glob`) and, in a later cell, to the
    # module (`import glob`), so the global name cannot be relied upon.
    from glob import glob as _glob_files

    # Set default path if not provided
    if embeddings_dir is None:
        if model_name is None or dataset_name is None:
            raise ValueError(
                "Provide either embeddings_dir, or both model_name and dataset_name"
            )
        embeddings_dir = os.path.join(
            os.path.expanduser("~"), "data", "ArmoRM", "embeddings", model_name, dataset_name
        )

    # Collect all shard files; sorted so the concatenation order is deterministic
    embedding_files = sorted(_glob_files(f"{embeddings_dir}-*.safetensors"))

    if not embedding_files:
        raise FileNotFoundError(f"No embedding files found in {embeddings_dir}")

    # Accumulate per-shard tensors, then concatenate once at the end
    embeddings = []
    labels = []

    print("Loading embeddings and labels from Safetensors files...")
    for file in tqdm(embedding_files, desc="Loading embeddings"):
        data = load_file(file)
        embeddings.append(data["embeddings"])
        labels.append(data["labels"])

    # Concatenate all embeddings and labels into single float32 tensors
    embeddings = torch.cat(embeddings, dim=0).float()
    labels = torch.cat(labels, dim=0).float()

    print(f"Total embeddings loaded: {embeddings.shape[0]}")
    print(f"Total labels loaded: {labels.shape[0]}")

    # Verify shapes match
    assert embeddings.shape[0] == labels.shape[0], "Number of embeddings and labels must match"

    return embeddings, labels

In [7]:
# Reuse the identifiers bound next to the regression weights so the embeddings and
# the regression layer are guaranteed to refer to the same model/dataset pair.
embeddings, sparse_rewards = load_embeddings_and_preferences(
    model_name=model_name,
    dataset_name=dataset_name,
)

Loading embeddings and labels from Safetensors files...


Loading embeddings: 100%|██████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1269.08it/s]


Total embeddings loaded: 569185
Total labels loaded: 569185


Only 15% of the reward labels are present.

In [8]:
sparse_rewards.shape

torch.Size([569185, 19])

In [9]:
# Keep only the labelled (non-NaN) entries and count them
sparse_rewards[~torch.isnan(sparse_rewards)].shape

torch.Size([1647670])

In [10]:
# Fraction of reward entries that are labelled (non-NaN), computed from the tensor
# itself rather than from hand-copied counts
(~torch.isnan(sparse_rewards)).sum().item() / sparse_rewards.numel()

0.15235727168532293

In [11]:
embeddings

tensor([[-1.4844,  1.1484,  1.0781,  ..., -0.5156, -0.5703,  1.9453],
        [-1.4453,  1.3984,  0.5352,  ..., -0.4980, -0.8047,  1.6328],
        [-1.5156,  1.7812,  1.5781,  ...,  0.1514, -0.6133,  2.3438],
        ...,
        [-1.4141,  1.5781,  1.1484,  ..., -0.9023, -0.9961,  1.7500],
        [-1.3906,  1.7422,  1.2969,  ..., -0.6797, -0.4570,  2.0000],
        [-1.4531,  1.8125,  1.3047,  ..., -0.9688, -0.3887,  2.2344]])

Sanity check the regression weights by seeing how well it matches the preferences.

In [12]:
# Only the "weight" entry of the saved checkpoint is applied here; no bias term is added
predicted_rewards = embeddings @ regression_layer.T

In [13]:
~torch.isnan(sparse_rewards)

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [14]:
# Residuals on the labelled entries only (missing labels are stored as NaN)
labelled = ~torch.isnan(sparse_rewards)
diff = (sparse_rewards[labelled] - predicted_rewards[labelled]).numpy()
mse_diff = np.square(diff).mean()

In [15]:
mse_diff

0.022128979

So the reward prediction is extremely successful.

## Setting up the epinet

For our first epinet, we'll use a two-layer MLP for the randomized component, and a pure linear layer for the deterministic component. Thus, the non-randomized network recreates the Ridge regression setting. The epinet is given slightly more structure under the intuition that it needs not only to reproduce the computations of the deterministic component, but also to reason about when those calculations need added randomness.

In [16]:
from murdo.epinet_mlp import make_mlp_epinet
# Per the printed model repr, these settings produce:
#   base_mlp: a single Linear(hidden_size -> n_attributes)
#   train/prior epinets: Linear(hidden_size + n_attributes -> 512), ReLU,
#                        Linear(512 -> 152), where 152 = n_attributes * index_dim
# NOTE(review): how make_mlp_epinet maps each argument onto those layers is
# inferred from that repr — confirm against murdo.epinet_mlp.
epinet, indexer = make_mlp_epinet(
    output_sizes = [hidden_size,n_attributes],
    epinet_hiddens = [hidden_size + n_attributes, 512],
    index_dim = 8,
    prior_scale = 1,
    name = "my first epinet",
)

In [17]:
epinet

MLPEpinet(
  (base_mlp): ExposedMLP(
    (layers): ModuleList(
      (0): Linear(in_features=4096, out_features=19, bias=True)
    )
  )
  (train_epinet): ProjectedMLP(
    (mlp): Sequential(
      (0): Linear(in_features=4115, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=152, bias=True)
    )
  )
  (prior_epinet): ProjectedMLP(
    (mlp): Sequential(
      (0): Linear(in_features=4115, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=152, bias=True)
    )
  )
)

In [18]:
# example usage
output = epinet(torch.randn(64, hidden_size), indexer(64))
train_predictions = output.train
prior_predictions = output.prior

In [19]:
train_predictions.shape

torch.Size([64, 19])

In [20]:
indexer(0)

tensor([ 0.4685,  0.1797, -0.7740,  0.2451,  1.7227, -0.9314, -0.2282, -0.9275])

For training, we'll follow the same principle as in the paper: simply masking the unknown dimensions when calculating losses. This is hopefully sufficiently in keeping with the nature of SGD. Future work might explore using uncertainty to more cleverly compensate for missing values.

In [21]:
#|export
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm, trange
import os
import glob
from datetime import datetime

def train_epinet(epinet, indexer, embeddings, sparse_rewards, hidden_size,
                 batch_size=64, num_epochs=100, lr=1e-3, load_latest=False):
    """
    Train the epinet using masked MSE loss.

    Missing labels are stored as NaN in ``sparse_rewards``; they are excluded
    from the loss with a boolean mask. Every sample is visited once per epoch,
    including a final batch smaller than ``batch_size``. Checkpoints are
    written to ``~/data/ArmoRM/weights/epinet_<timestamp>.pt`` every 25 epochs
    and after the last epoch.

    Args:
        epinet: The epinet MLP model. Its forward pass must return an object
            with ``train`` and ``prior`` attributes, which are summed to form
            the prediction.
        indexer: The indexer function for epinet; called as
            ``indexer(batch_size)`` once per training step.
        embeddings: Input embeddings tensor
        sparse_rewards: Sparse reward labels tensor (NaN where unlabelled)
        hidden_size: Size of the hidden dimension (currently unused; kept so
            existing callers keep working)
        batch_size: Batch size for training
        num_epochs: Number of training epochs
        lr: Learning rate
        load_latest: If True, load most recent saved model if it exists and
            return it immediately without training

    Returns:
        The epinet. After training it has been moved to CUDA when available;
        on the ``load_latest`` path it is returned on its current device.

    Raises:
        ValueError: If training is requested with no samples.
    """
    # Imported locally under a private alias: this notebook binds the name `glob`
    # to both the module and the function in different cells.
    from glob import glob as _glob_files

    # Setup save directory
    save_dir = os.path.join(os.path.expanduser("~"), "data", "ArmoRM", "weights")
    os.makedirs(save_dir, exist_ok=True)

    # Check for latest saved model if requested
    if load_latest:
        model_files = _glob_files(os.path.join(save_dir, "epinet_*.pt"))
        if model_files:
            latest_model = max(model_files, key=os.path.getctime)
            print(f"Loading latest model from {latest_model}")
            # map_location="cpu" lets a checkpoint saved from a GPU run load on a
            # CPU-only machine; load_state_dict copies into the model's own device.
            epinet.load_state_dict(torch.load(latest_model, map_location="cpu"))
            return epinet

    n_samples = embeddings.shape[0]
    if n_samples == 0:
        raise ValueError("Cannot train the epinet on an empty set of embeddings")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    epinet = epinet.to(device)

    # Setup optimizer
    optimizer = optim.Adam(epinet.parameters(), lr=lr)

    # Ceil division: the trailing partial batch is trained on as well, and a
    # dataset smaller than batch_size still yields one batch.
    n_batches = (n_samples + batch_size - 1) // batch_size

    def _save_checkpoint():
        # Timestamped file name so load_latest can pick the newest checkpoint
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_path = os.path.join(save_dir, f"epinet_{timestamp}.pt")
        torch.save(epinet.state_dict(), save_path)
        print(f"Model saved to {save_path}")

    # Training loop
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        n_steps = 0  # optimizer steps actually taken this epoch

        # Shuffle by permuting indices only; indexing per batch avoids copying
        # the whole dataset every epoch.
        perm = torch.randperm(n_samples)

        # Batch training
        pbar = tqdm(range(n_batches), desc=f'Epoch {epoch+1}/{num_epochs}')
        for b in pbar:
            # Get batch and move to device
            batch_idx = perm[b * batch_size:(b + 1) * batch_size]
            batch_embeddings = embeddings[batch_idx].to(device)
            batch_rewards = sparse_rewards[batch_idx].to(device)

            # Create mask for non-nan values
            mask = ~torch.isnan(batch_rewards)
            if not mask.any():
                # No labelled entry: the mean over an empty selection would be NaN
                # and would turn the running average loss into NaN.
                continue

            # Draw the epistemic index for this step
            indices = indexer(batch_size).to(device)

            # Forward pass
            epiout = epinet(batch_embeddings, indices)
            predicted_rewards = epiout.train + epiout.prior # preweighted sum of the learnable and fixed components

            # Calculate masked MSE loss
            loss = torch.mean((predicted_rewards[mask] - batch_rewards[mask])**2)

            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update progress bar with the running mean loss
            epoch_loss += loss.item()
            n_steps += 1
            pbar.set_postfix({'loss': epoch_loss / n_steps})

        # Print epoch statistics (max(...) guards an epoch in which every batch was skipped)
        print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {epoch_loss / max(n_steps, 1):.6f}')

        # Save periodically, and always after the final epoch so that the fully
        # trained weights are what load_latest picks up later.
        if epoch % 25 == 0 or epoch == num_epochs - 1:
            _save_checkpoint()

    return epinet

Now we train! From above, Ridge regression without the epistemic nn achieved a loss of 0.022. Let's see if we can match that in the same order of magnitude, and perhaps even get below it.
Although as we've also changed the methodology (to gradient descent), and are adding randomness to the outputs, the raw numbers aren't directly comparable.

As seen below, we can quickly get in the same order of magnitude; the remainder of training is how to exploit randomness efficiently to further minimize the loss. It will be a good sanity check to see how much randomness is added: is the plain MLP doing most of the work, or are outputs of a substantial magnitude coming from the randomized portion of the network?

In [27]:
# With load_latest=True, train_epinet returns right after loading the newest
# checkpoint when one exists, so no training happens in this call and lr is unused.
trained_epinet = train_epinet(
    epinet=epinet,
    indexer=indexer,
    embeddings=embeddings,
    sparse_rewards=sparse_rewards,
    hidden_size=hidden_size,
    lr = 1e-5,
    load_latest = True,
)

Loading latest model from /home/piriac/data/ArmoRM/weights/epinet_20241111_163611.pt


In [28]:
trained_epinet

MLPEpinet(
  (base_mlp): ExposedMLP(
    (layers): ModuleList(
      (0): Linear(in_features=4096, out_features=19, bias=True)
    )
  )
  (train_epinet): ProjectedMLP(
    (mlp): Sequential(
      (0): Linear(in_features=4115, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=152, bias=True)
    )
  )
  (prior_epinet): ProjectedMLP(
    (mlp): Sequential(
      (0): Linear(in_features=4115, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=152, bias=True)
    )
  )
)

# Analysis

The first thing we need in evaluating our epinet is a quantification of uncertainty per dimension. We can then perform this measurement across the dataset, and report:
1. Average uncertainty per dimension across all samples
2. Variance of uncertainty per dimension across all samples

In [70]:
def sample_epinet_outputs(trained_epinet, indexer, embeddings, n_samples=10, batch_size=128,
                          n_dimensions=None):
    """
    Sample multiple outputs from a trained epinet for each input embedding.

    The ``n_samples`` epistemic indices are drawn once, up front, and reused for
    every batch. Sample slot ``i`` therefore corresponds to the same index for
    all embeddings, which keeps per-embedding statistics comparable across the
    dataset.

    Args:
        trained_epinet: The trained epinet model. Its forward pass must return
            an object with ``train`` and ``prior`` attributes.
        indexer: The indexer function for generating random indices; called as
            ``indexer(1)`` once per sample.
        embeddings: Input embeddings tensor of shape (n_embeddings, hidden_size)
        n_samples: Number of samples to generate per input
        batch_size: Batch size for processing
        n_dimensions: Number of output (reward) dimensions. When None it is
            inferred from the model's first prediction; if nothing is predicted
            (no embeddings or ``n_samples == 0``) it falls back to 19, the
            value this function previously hard-coded.

    Returns:
        torch.Tensor: Array of shape (n_embeddings, n_dimensions, n_samples),
        located on CUDA when available, otherwise on the CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trained_epinet = trained_epinet.to(device)

    n_embeddings = embeddings.shape[0]

    # One epistemic index per sample slot, shared by all batches
    sample_indices = [indexer(1).to(device) for _ in range(n_samples)]

    # Allocated lazily, once the output width is known
    all_outputs = None

    with torch.no_grad():  # disable gradient computation for inference
        for batch_start in tqdm(range(0, n_embeddings, batch_size), desc="Processing batches"):
            # Slicing past the end is safe, so the last batch may simply be shorter
            batch_embeddings = embeddings[batch_start:batch_start + batch_size].to(device)

            for i, indices in enumerate(sample_indices):
                outputs = trained_epinet(batch_embeddings, indices)
                predictions = outputs.train + outputs.prior  # combine train and prior predictions

                if all_outputs is None:
                    if n_dimensions is None:
                        n_dimensions = predictions.shape[-1]
                    all_outputs = torch.zeros((n_embeddings, n_dimensions, n_samples), device=device)

                # Write straight into the result; no per-batch staging buffer needed
                all_outputs[batch_start:batch_start + predictions.shape[0], :, i] = predictions

    if all_outputs is None:
        # Nothing was predicted: return an empty result of the documented rank
        fallback_dims = 19 if n_dimensions is None else n_dimensions
        all_outputs = torch.zeros((n_embeddings, fallback_dims, n_samples), device=device)

    return all_outputs



In [71]:
# Repeated calls with the same argument return different vectors (see the output),
# so the argument is evidently not acting as a deterministic seed
torch.vstack([indexer(42) for i in range(10)])

tensor([[ 1.0472, -0.2657, -0.1110,  0.4044, -0.4292,  0.1513,  1.2102,  0.9952],
        [ 1.7883,  0.4186,  0.1676, -0.7698,  0.7433, -0.2450, -0.6685,  1.3462],
        [ 1.4534, -0.7636,  1.0673,  1.5691,  0.1276,  0.7721, -0.9738,  0.2645],
        [-1.1231, -0.3579, -0.8793, -0.5154, -1.2087,  0.3077,  0.3038,  0.5613],
        [-1.2444,  2.1400, -0.8152,  0.1227,  0.9151, -1.0056, -0.0136,  1.8474],
        [-0.2797,  0.8993, -0.6702,  0.1811,  1.3301,  2.0206,  0.1199,  0.3323],
        [ 0.1806, -0.6630, -0.4456, -2.9613, -0.4385, -2.2631, -1.5685,  0.9717],
        [ 0.1786, -0.2793,  0.4931,  0.2095,  0.7488,  0.1491,  0.5362,  0.8238],
        [-2.1192, -0.1512, -0.5590, -0.7132,  0.2609, -1.0812, -0.1860, -1.0017],
        [ 0.6723,  0.4385, -1.6923, -1.0989, -2.0868,  1.5404, -0.2603, -0.5551]])

In [72]:
indexer(42)

tensor([-0.3999, -1.1252,  1.6562, -0.0490, -1.0774,  0.9167,  0.5866, -1.7774])

In [73]:
# Usage example:
# Usage example: draw 100 epinet samples for every embedding in the dataset
samples = sample_epinet_outputs(
    trained_epinet=trained_epinet,
    indexer=indexer,
    embeddings=embeddings,
    n_samples=100,
    batch_size=128
)

# Calculate statistics over the sample axis (dim 2); each result is (n_embeddings, n_dimensions)
mean_predictions = samples.mean(dim=2)  # Mean over the epinet samples
std_predictions = samples.std(dim=2)    # Standard deviation over the epinet samples

Processing batches: 100%|██████████████████████████████████████████████████████████████████| 4447/4447 [02:02<00:00, 36.26it/s]


In [74]:
samples.shape

torch.Size([569185, 19, 100])

In [75]:
samples[0,0,:]

tensor([0.8302, 0.8226, 0.8270, 0.8269, 0.8257, 0.8240, 0.8211, 0.8343, 0.8253,
        0.8223, 0.8288, 0.8289, 0.8213, 0.8235, 0.8292, 0.8318, 0.8292, 0.8311,
        0.8258, 0.8282, 0.8293, 0.8322, 0.8263, 0.8222, 0.8156, 0.8253, 0.8249,
        0.8273, 0.8247, 0.8229, 0.8323, 0.8321, 0.8253, 0.8250, 0.8377, 0.8261,
        0.8342, 0.8233, 0.8187, 0.8223, 0.8308, 0.8230, 0.8308, 0.8287, 0.8291,
        0.8339, 0.8250, 0.8210, 0.8233, 0.8233, 0.8323, 0.8236, 0.8270, 0.8285,
        0.8322, 0.8258, 0.8286, 0.8227, 0.8285, 0.8298, 0.8220, 0.8281, 0.8248,
        0.8198, 0.8251, 0.8285, 0.8313, 0.8244, 0.8372, 0.8305, 0.8231, 0.8253,
        0.8213, 0.8281, 0.8307, 0.8338, 0.8303, 0.8219, 0.8284, 0.8319, 0.8324,
        0.8192, 0.8254, 0.8315, 0.8333, 0.8329, 0.8281, 0.8293, 0.8272, 0.8337,
        0.8228, 0.8244, 0.8228, 0.8132, 0.8244, 0.8264, 0.8263, 0.8205, 0.8251,
        0.8238], device='cuda:0')

In [76]:
samples.std(dim=2)

tensor([[0.0045, 0.0060, 0.0076,  ..., 0.0146, 0.0137, 0.0161],
        [0.0029, 0.0037, 0.0054,  ..., 0.0105, 0.0087, 0.0120],
        [0.0044, 0.0059, 0.0075,  ..., 0.0144, 0.0136, 0.0160],
        ...,
        [0.0037, 0.0046, 0.0054,  ..., 0.0126, 0.0094, 0.0140],
        [0.0048, 0.0066, 0.0071,  ..., 0.0162, 0.0132, 0.0175],
        [0.0036, 0.0044, 0.0052,  ..., 0.0122, 0.0090, 0.0136]],
       device='cuda:0')

In [77]:
mean_std_per_dimension = (samples.std(dim=2)).mean(dim=0)  # std over epinet samples, then averaged over all embeddings
mean_std_per_dimension

tensor([0.0045, 0.0051, 0.0063, 0.0040, 0.0060, 0.0023, 0.0023, 0.0032, 0.0026,
        0.0018, 0.0058, 0.0012, 0.0061, 0.0119, 0.0164, 0.0163, 0.0142, 0.0129,
        0.0151], device='cuda:0')

In [79]:
std_of_std_per_dimension = (samples.std(dim=2)).std(dim=0)  # std over epinet samples, then its spread (std) over all embeddings
std_of_std_per_dimension

tensor([0.0033, 0.0038, 0.0065, 0.0050, 0.0035, 0.0045, 0.0037, 0.0034, 0.0032,
        0.0029, 0.0039, 0.0026, 0.0015, 0.0042, 0.0052, 0.0043, 0.0039, 0.0045,
        0.0049], device='cuda:0')

These early results show that:
1. Different reward dimensions have different 'uncertainties'.
2. The uncertainty per reward dimension varies across prompt-response pairs, with a spread (the std of the std) comparable in magnitude to the mean uncertainty itself.
3. The last six dimensions are, weirdly, far more uncertain than the first 13.

# Conclusion