In [2]:
import os
import sys
sys.path.append(os.path.abspath('..'))

import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
from tqdm import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes
from sae_lens import SparseAutoencoder, ActivationsStore

from steering.eval_utils import evaluate_completions
from steering.utils import text_to_sae_feats, top_activations, normalise_decoder, get_activation_steering
from steering.patch import generate, get_scores_and_losses, patch_resid, get_loss, scores_2d

from sae_vis.data_config_classes import SaeVisConfig
from sae_vis.data_storing_fns import SaeVisData

import plotly.express as px
import plotly.graph_objects as go

# Turn autograd off for the whole session; nothing below calls backward().
torch.set_grad_enabled(mode=False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fa7241f41c0>

In [11]:
# Run configuration: the directory this run's artefacts are written to, and
# the display names of the two features in the order used for plot titles
# and axis labels (index 0 -> y axis, index 1 -> x axis).
save_dir = "runs/fixed_anger_london"
feature_descriptions = ["Anger", "London"]

In [5]:
import json

# Read the generation log from `save_dir` rather than a second hard-coded
# copy of the path, so the log that is loaded and the figures/tensors that
# are written always belong to the same run.
# JSON is UTF-8 by specification; pin the encoding instead of relying on the
# platform's locale default.
with open(f"{save_dir}/gen_log.json", "r", encoding="utf-8") as f:
    gens = json.load(f)


In [6]:
# Show which fields the first generation record carries.
print(gens[0].keys())
# Scale values 0, 10, ..., 140. The same list is used for both axes, and a
# value's position in it is the row/column index into the score grids.
scales = list(range(0, 150, 10))

dict_keys(['texts', 'scales', 'scores_1', 'scores_2', 'coherence_scores'])


In [7]:
# Allocate four independent (len(scales) x len(scales)) zero grids, one per
# metric. Each generator step builds a fresh tensor, so none share storage.
scores_1, scores_2, losses, coherence = (
    torch.zeros((len(scales), len(scales))) for _ in range(4)
)

In [8]:
# For every generation record, locate its cell from the record's pair of
# scales and store the arithmetic mean of each per-sample score list there.
for gen_dict in tqdm(gens):
    s1, s2 = gen_dict['scales']
    # Row comes from the first scale, column from the second.
    i, j = scales.index(s1), scales.index(s2)
    for target_grid, field_name in (
        (scores_1, 'scores_1'),
        (scores_2, 'scores_2'),
        (coherence, 'coherence_scores'),
    ):
        samples = gen_dict[field_name]
        target_grid[i, j] = sum(samples)/len(samples)

100%|██████████| 225/225 [00:00<00:00, 7919.63it/s]


In [9]:
# Sanity check: the grid should be len(scales) x len(scales).
scores_1.size()

torch.Size([15, 15])

In [29]:
# Render one heatmap per metric over the 2-D grid of scales and save each as
# an interactive HTML file and a static PNG inside `save_dir`.
# The y axis is labelled with the first feature and the x axis with the
# second; both axes use the same `scales` tick values.
axis_labels = {'x': feature_descriptions[1], 'y': feature_descriptions[0]}

# (grid, figure title, output file stem) for every figure to produce.
heatmap_specs = [
    (scores_1, f"{feature_descriptions[0]} scores", "scores_1"),
    (scores_2, f"{feature_descriptions[1]} scores", "scores_2"),
    (coherence, "Coherence scores", "coherence_scores"),
]

for grid, title, stem in heatmap_specs:
    fig = px.imshow(grid, x=scales, y=scales,
                    title=title, labels=axis_labels,
                    color_continuous_scale="RdBu", color_continuous_midpoint=0)
    fig.write_html(f"{save_dir}/{stem}.html")
    fig.write_image(f"{save_dir}/{stem}.png")

# NOTE: `losses` is not plotted. In the cells shown it is allocated as zeros
# and never written to; once it is populated, add
# (losses, "Losses", "losses") to `heatmap_specs`.

In [12]:
# Persist each grid as a .pt file in `save_dir`, in the same order as before.
# NOTE(review): `losses` is saved too, although nothing in the cells shown
# ever fills it, so losses.pt presumably holds all zeros. Confirm intended.
for file_stem, tensor in (
    ("scores_1", scores_1),
    ("scores_2", scores_2),
    ("coherence_scores", coherence),
    ("losses", losses),
):
    torch.save(tensor, f"{save_dir}/{file_stem}.pt")