In [1]:
#|hide
## Standard libraries
import os
import math
import numpy as np
import time
from fastcore.all import *
from nbdev.showdoc import *

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgba
import seaborn as sns
sns.set()

## Progress bar
from tqdm.auto import tqdm, trange

## project specifics
import murdo
import transformers

%load_ext autoreload
%autoreload 2

# Integrating Epinet Into Armo Regression
> C'mon, c'mon and meet the epinets

ArmoRM's regression layer is pretty simple: it's just a linear layer, with a bias term, optimized through Ridge regression. In other words, it's just performing a change of basis on the underlying last-layer embeddings of Llama3 8b. And apparently, dimensions already exist in this marvelous latent space that correspond to many of the dimensions we care about.

What can the epinet add to this? First, the structure of the epinet should be pretty simple: we'll replicate the linear layer being used with Ridge Regression with an MLP (hoping gradient descent is sufficient to extract the same dimensional information, and assuming we need extra compute power to both get the dimensions and reason about their epistemic status), but will multiply the output by the random epistemic index. In theory, this should allow the epinet to reduce the MSE regression loss by adding randomness to dimensions with questionable epistemic status.

In this notebook, we integrate an mlp epinet with the regression layer. We'll try the simplest possible integration first, then perform training (which, given the small size, should be pretty quick), and iterate.

How will we measure whether the integration works? First, we can sanity check by seeing how uncertainty compares across dimensions of rewards, comparing with our prior data on which dimensions have the most activation and which appear to be duplicates of each other. Ultimately, we can measure the performance of the reward model by doing some uncertainty-weighted best of N search.

A note on form: prior versions of zetteldev had an emphasis on atomic notebooks for experimentation. We break from that. This document is more 'computational essay/lab report' than slip. It will contain many ideas, and confront much computational and ideological reducibility. The metaphors worthy of further abstraction will be highlighted in a separate report, so see the 'Reports' folder for the high level summary. What follows is a 'lab report' in chronological order.

**Hypothesis**: 
1. Integrating the MLP reward model with an MLP epinet will enable the prediction of uncertainty per reward dimension per prompt.
2. The uncertainty estimate should change with prompt response pairs.
3. When the gating layer denotes a reward dimension as irrelevant, it should have a higher uncertainty.
4. Integrating uncertainty into a Best of N sampler from a base llama model should have superior performance to the reward model without uncertainty.

# Machinery

First, we'll set up the pretrained reward model, then extract dimensions and such from it.

In [2]:
import torch, numpy

In [9]:
model_name, dataset_name = ("FsfairX-LLaMA3-RM-v0.1", "ArmoRM-Multi-Objective-Data-v0.1")
# Resolve the data root from the current user's home directory (the same convention
# load_embeddings_and_preferences uses) instead of hard-coding one user's absolute path.
save_dir = os.path.join(os.path.expanduser("~"), "data", "ArmoRM", "regression_weights")
save_path = os.path.join(save_dir, f"{model_name}_{dataset_name}.pt")
# NOTE: torch.load unpickles the file, so only point this at checkpoints you trust.
# Only the "weight" entry of the checkpoint is used here.
regression_layer = torch.load(save_path)["weight"]

In [11]:
# regression_layer is laid out as (n_attributes, hidden_size); it is applied to
# embeddings as `x @ regression_layer.T`.
n_attributes, hidden_size = regression_layer.shape

Example usage:

In [12]:
# Shape demonstration only: random (800, hidden_size) inputs are mapped to
# (800, n_attributes) outputs; the values themselves are meaningless.
pairwise_rewards = torch.rand(800,hidden_size) @ regression_layer.T

Load the dataset prompt-response embeddings from the base LLM, as inputs to the regression layer.

In [20]:
import os
import torch
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from safetensors.torch import load_file
from argparse import ArgumentParser
def load_embeddings_and_preferences(embeddings_dir=None, model_name=None, dataset_name=None):
    """
    Load embeddings and labels from safetensors shard files.

    Shards are discovered with the glob pattern
    ``f"{embeddings_dir}-*.safetensors"``, so despite its name ``embeddings_dir``
    is used as a path *prefix* of the shard files, not as a directory that
    contains them.

    Args:
        embeddings_dir (str, optional): Path prefix of the shard files. When
            omitted it defaults to
            ``~/data/ArmoRM/embeddings/<model_name>/<dataset_name>``.
        model_name (str, optional): Name of the model. Only used to build the
            default prefix; required when ``embeddings_dir`` is not given.
        dataset_name (str, optional): Name of the dataset. Only used to build
            the default prefix; required when ``embeddings_dir`` is not given.

    Returns:
        tuple: (embeddings tensor, labels tensor), each concatenated over all
        shards along dim 0 and converted with ``.float()``.

    Raises:
        ValueError: If ``embeddings_dir`` is omitted and ``model_name`` or
            ``dataset_name`` is missing.
        FileNotFoundError: If no shard file matches the glob pattern.
        AssertionError: If the number of embeddings and labels differ.
    """
    # Build the default prefix only when the caller did not supply one.
    if embeddings_dir is None:
        if model_name is None or dataset_name is None:
            # Fail with a clear message instead of the opaque TypeError that
            # os.path.join raises when handed None.
            raise ValueError(
                "Provide either embeddings_dir, or both model_name and dataset_name"
            )
        embeddings_dir = os.path.join(
            os.path.expanduser("~"), "data", "ArmoRM", "embeddings", model_name, dataset_name
        )

    # Collect all shard files; sorting keeps the concatenation order deterministic.
    embedding_files = sorted(glob(f"{embeddings_dir}-*.safetensors"))

    if not embedding_files:
        raise FileNotFoundError(f"No embedding files found in {embeddings_dir}")

    # Per-shard tensors, concatenated once after the loop.
    embeddings = []
    labels = []

    print("Loading embeddings and labels from Safetensors files...")
    for file in tqdm(embedding_files, desc="Loading embeddings"):
        # Each shard is read once and contributes both tensors.
        data = load_file(file)
        embeddings.append(data["embeddings"])
        labels.append(data["labels"])

    # Concatenate all shards into single tensors.
    embeddings = torch.cat(embeddings, dim=0).float()
    labels = torch.cat(labels, dim=0).float()

    print(f"Total embeddings loaded: {embeddings.shape[0]}")
    print(f"Total labels loaded: {labels.shape[0]}")

    # Every embedding row must have a matching label row.
    assert embeddings.shape[0] == labels.shape[0], "Number of embeddings and labels must match"

    return embeddings, labels

In [35]:
# sparse_rewards holds the labels tensor returned by the loader; later cells mask it
# with torch.isnan, i.e. entries without a label are treated as NaN.
embeddings, sparse_rewards = load_embeddings_and_preferences(
    model_name="FsfairX-LLaMA3-RM-v0.1",
    dataset_name="ArmoRM-Multi-Objective-Data-v0.1"
)

Loading embeddings and labels from Safetensors files...


Loading embeddings: 100%|██████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1529.65it/s]


Total embeddings loaded: 569185
Total labels loaded: 569185


Only 15% of the reward labels are present.

In [36]:
# The label tensor is bound to `sparse_rewards`; `preferences` is never defined in
# this notebook and would raise NameError on a fresh kernel.
sparse_rewards.shape

torch.Size([569185, 19])

In [37]:
# Keep only the labelled (non-NaN) entries; the resulting 1-D size is the number of
# reward labels that are actually present.
sparse_rewards[~torch.isnan(sparse_rewards)].shape

torch.Size([1647670])

In [30]:
# Fraction of reward entries that carry a label, computed from the tensor itself
# rather than from numbers copied out of earlier outputs.
(~torch.isnan(sparse_rewards)).sum().item() / sparse_rewards.numel()

0.15235727168532293

In [25]:
embeddings

tensor([[-1.4844,  1.1484,  1.0781,  ..., -0.5156, -0.5703,  1.9453],
        [-1.4453,  1.3984,  0.5352,  ..., -0.4980, -0.8047,  1.6328],
        [-1.5156,  1.7812,  1.5781,  ...,  0.1514, -0.6133,  2.3438],
        ...,
        [-1.4141,  1.5781,  1.1484,  ..., -0.9023, -0.9961,  1.7500],
        [-1.3906,  1.7422,  1.2969,  ..., -0.6797, -0.4570,  2.0000],
        [-1.4531,  1.8125,  1.3047,  ..., -0.9688, -0.3887,  2.2344]])

Sanity check the regression weights by seeing how well they match the labelled rewards.

In [33]:
# Apply the loaded regression weights to every embedding.
# NOTE(review): only the "weight" entry of the checkpoint was loaded, so no bias is
# added here — confirm whether the checkpoint also stores a bias term.
predicted_rewards = embeddings @ regression_layer.T

In [42]:
~torch.isnan(sparse_rewards)

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [43]:
# Score only the entries that actually have a label (NaN marks a missing reward).
labelled = ~torch.isnan(sparse_rewards)
diff = (sparse_rewards[labelled] - predicted_rewards[labelled]).numpy()
# Mean squared error over the labelled entries.
mse_diff = np.square(diff).mean()

In [44]:
mse_diff

0.022128979

So the reward prediction is extremely successful.

## Setting up the epinet

For our first epinet, we'll use a two-layer MLP for the randomized component, and a pure linear layer for the deterministic component. Thus, the non-randomized network recreates the Ridge regression setting. The epinet is given slightly more structure under the intuition that it needs not only to reproduce the computations of the deterministic component, but also reason about when those calculations need added randomness.

In [59]:
from murdo.epinet_mlp import make_mlp_epinet
# Build the epinet and its index sampler. In the recorded repr of `epinet`, base_mlp
# is a single Linear(4096 -> 19) and both epinet MLPs end in 152 outputs
# (19 attributes * index_dim 8).
# NOTE(review): no random seed is set anywhere in the visible notebook, so the
# initialisation is presumably not reproducible across runs — confirm.
epinet, indexer = make_mlp_epinet(
    output_sizes = [hidden_size,n_attributes],
    epinet_hiddens = [hidden_size, 512],
    index_dim = 8,
    prior_scale = 1, 
    name = "my first epinet",
)

INFO:murdo.epinet_mlp:Creating MLPEpinet with output sizes: [4096, 19], epinet hiddens: [4096, 512], index dim: 8


In [60]:
epinet

MLPEpinet(
  (base_mlp): ExposedMLP(
    (layers): ModuleList(
      (0): Linear(in_features=4096, out_features=19, bias=True)
    )
  )
  (train_epinet): ProjectedMLP(
    (mlp): Sequential(
      (0): Linear(in_features=4096, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=152, bias=True)
    )
  )
  (prior_epinet): ProjectedMLP(
    (mlp): Sequential(
      (0): Linear(in_features=4096, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=152, bias=True)
    )
  )
)

In [58]:
# Draw one epistemic index from the indexer (the argument is presumably a seed/key —
# confirm against murdo.epinet_mlp).
# NOTE(review): the recorded output reports an index of dimension 19, while the
# epinet cell builds the indexer with index_dim = 8. This cell's execution count (58)
# precedes the epinet cell's (59), so the output looks stale — re-run to refresh it.
indexer(0)

INFO:murdo.epinet_mlp:GaussianIndexer generating index with dimension 19


tensor([ 3.0516, -1.0326,  0.9805,  1.9015, -1.8265, -1.1090,  0.3039, -0.2242,
         0.2080, -0.2025, -0.3878, -0.6146,  0.8108,  0.5130,  0.3731, -0.5977,
        -0.1985, -0.6946, -2.5827])

For training, we'll follow the same principle as in the paper: simply masking the unknown dimensions when calculating losses. This is hopefully sufficiently in keeping with the nature of SGD. Future work might explore using uncertainty to more cleverly compensate for missing values.

# Results

# Conclusion