In [1]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import plotly.subplots as sp
import plotly.graph_objects as go
import plotly.express as px
import torch
from univ_utils import load_model_and_sae, get_running_activation_stats, load_data, create_umap_visualization
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


#### Universality across models for final checkpoints

In [2]:
n_batches = 1
batch_size = 1
train_data, val_data = load_data(dataset="openwebtext", device=device)

In [3]:
# model_sae_pairs = [
#     ("8-768", "443ngubm"),
#     ("8-512", "fyqbawtf"),
#     ("8-256", "7g6hq05j"),
#     ("8-128", "ngd29532"),
# ]

model_sae_pairs = [
    ("8-128", "ngd29532"),
    ("8-256", "7g6hq05j"),
    ("8-512", "fyqbawtf"),
    ("8-768", "443ngubm"),
]

In [4]:
from itertools import product
all_stats = {}
for (model1_name, sae1_name), (model2_name, sae2_name) in product(model_sae_pairs, repeat=2):
    if model1_name == model2_name:
        continue
    model1, sae1 = load_model_and_sae(model1_name, sae1_name, None, device)
    model2, sae2 = load_model_and_sae(model2_name, sae2_name, None, device)
    print(model1_name, sae1_name, model2_name, sae2_name)
    
    stats = get_running_activation_stats(model1, model2, train_data, batch_size=batch_size, n_batches=n_batches, seed=34)
    
    all_stats[(model1_name, sae1_name), (model2_name, sae2_name)] = stats.to_cpu()
    del model1, model2, sae1, sae2, stats; torch.cuda.empty_cache()

  checkpoint = torch.load(ckpt_file, map_location=device)
This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)


8-128 ngd29532 8-256 7g6hq05j


  0%|          | 0/1 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.50 GiB. GPU 0 has a total capacity of 23.64 GiB of which 281.00 MiB is free. Including non-PyTorch memory, this process has 23.35 GiB memory in use. Of the allocated memory 22.87 GiB is allocated by PyTorch, and 39.76 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
num_layers = lambda model_name: int((model_name.split("-"))[1])

In [16]:
scatter_data = []
corr_coefs = []
subplot_titles = []
masked_scatter_data = []
masked_corr_coefs = []
masked_subplot_titles = []
masks = []

for model1_name, sae1_name in model_sae_pairs:
    for model2_name, sae2_name in model_sae_pairs:
        if model1_name == model2_name:
            subplot_titles.append("____")
            masked_subplot_titles.append("____")
            continue

        stats = all_stats[(model1_name, sae1_name), (model2_name, sae2_name)]

        x = stats.max_x
        y = stats.corr_matrix.amax(dim=-1)
        scatter_data.append((x, y))

        # corr coef
        corr_coef = torch.corrcoef(torch.stack((x, y), dim=0))[0, 1].item()
        corr_coefs.append(corr_coef)

        # masked scatter plot data
        mask = (y > 0.9) & (x < 1) # TODO: get this mask based on UMAP
        masks.append(mask.clone())
        x = x[~mask]
        y = y[~mask]
        masked_scatter_data.append((x, y))
        masked_corr_coef = torch.corrcoef(torch.stack((x, y), dim=0))[0, 1].item()
        masked_corr_coefs.append(masked_corr_coef)


        # subplot title
        nl1, nl2 = num_layers(model1_name), num_layers(model2_name)
        subplot_titles.append(f"Corr={corr_coef:.4f}")
        masked_subplot_titles.append(f"Corr={masked_corr_coef:.4f}")


In [17]:
# TODO: remove uninterpretable features from each SAE using UMAP

In [18]:
grid_length = len(model_sae_pairs)  # Ensure model_sae_pairs is defined and matches your data
grid_titles = [f"n_layers={model_name.split('-')[1]}" for model_name, _ in model_sae_pairs]
fig = sp.make_subplots(rows=grid_length, cols=grid_length, subplot_titles=masked_subplot_titles, row_titles=grid_titles, column_titles=grid_titles,
                       x_title="Maximum Feature Activation", y_title="Maximum Activation Similarity",
                       horizontal_spacing=0.05, vertical_spacing=0.05)

i = 0
for row in range(1, grid_length + 1):
    for col in range(1, grid_length + 1):
        if row == col:
            continue
        
        x, y = masked_scatter_data[i]
        corr_coef = masked_corr_coefs[i]
        
        fig.add_trace(
                go.Scatter(x=x.numpy(), y=y.numpy(), mode='markers', name=""),
                row=row, col=col
            )
        i += 1

fig.layout.annotations[9].update(y=1.025)
fig.layout.annotations[10].update(y=1.025)
fig.layout.annotations[11].update(y=1.025)

# Update layout for better display
fig.update_layout(
    title_text="Feature Importance (x-axis) vs Universality (y-axis)",
    showlegend=False,
    height=1200,
    width=1200,
)

fig.show()

### Removing unitnerpretable features from each SAE

In [None]:
# TODO: you know it's possible that this cluster of features is actually interpretable, 
# but it does something different from the rest of them.

In [37]:
model1_name, sae1_name = model_sae_pairs[1]
model1, sae1 = load_model_and_sae(model1_name, sae1_name, None, device)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pre

In [38]:
w_dec = sae1.W_dec.clone().detach().cpu().numpy()
mask = torch.zeros(w_dec.shape[0])
embedding, fig = create_umap_visualization(w_dec, mask)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [39]:
fig.show()

In [40]:
import numpy as np
from sklearn.cluster import DBSCAN

# Assume `umap_embeddings` is your UMAP output (n_points, 2) array
# Replace `umap_embeddings` with the actual array of your UMAP points

# Cluster the points
dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust `eps` and `min_samples` as needed
labels = dbscan.fit_predict(embedding)

# Convert cluster labels to boolean tensor
boolean_tensor = (labels == 1)  # Example: Cluster labeled as '1'

print("Cluster Labels:", labels)
print("Boolean Tensor:", boolean_tensor)

Cluster Labels: [0 0 0 ... 0 0 0]
Boolean Tensor: [False False False ... False False False]


In [None]:
# Interestingly some times there are 3 clusters!

In [41]:
embedding, fig = create_umap_visualization(w_dec, boolean_tensor)
fig.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [42]:
from sklearn.cluster import KMeans

# Assume `umap_embeddings` is your UMAP output (n_points, 2) array

# Cluster the points into 2 clusters
kmeans = KMeans(n_clusters=2, random_state=42)
labels = kmeans.fit_predict(embedding)

# Convert cluster labels to a boolean tensor
# You can select either label as 'True' depending on your use case
boolean_tensor = (labels == 1)  # Replace '1' with '0' for the other cluster

print("Cluster Labels:", labels)
print("Boolean Tensor:", boolean_tensor)

Cluster Labels: [0 0 0 ... 0 0 0]
Boolean Tensor: [False False False ... False False False]


In [43]:
embedding, fig = create_umap_visualization(w_dec, boolean_tensor)
fig.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [44]:
# Can I plot mutual cosine similarity between the features in each cluster?

cos_sims_matrix = lambda tensor: np.triu(tensor @ tensor.transpose())
cos_sims = lambda tensor: cos_sims_matrix(tensor)[cos_sims_matrix(tensor) != 0].flatten()

flatten = cos_sims(w_dec[boolean_tensor])
print(np.median(flatten))
fig1 = px.histogram(flatten)
fig1.show()

In [35]:
# Can I plot mutual cosine similarity between the features in each cluster?

cos_sims_matrix = lambda tensor: np.triu(tensor @ tensor.transpose())
cos_sims = lambda tensor: cos_sims_matrix(tensor)[cos_sims_matrix(tensor) != 0].flatten()

flatten = cos_sims(w_dec[~boolean_tensor])
print(np.median(flatten))
fig1 = px.histogram(flatten)
fig1.show()

In [36]:
np.median(flatten)

0.0025844239