In [1]:
import sys

sys.path.insert(0, '../')

import torch as t
from nnsight import LanguageModel
from dictionary_learning import ActivationBuffer
from dictionary_learning.interp import examine_dimension
from dictionary_learning.utils import zst_to_generator
from loading_utils import load_submodules_and_dictionaries
from circuitsvis.activations import text_neuron_activations
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 42
device = 'cuda:0'

model = LanguageModel('EleutherAI/pythia-70m-deduped', device_map=device)
submodules, submodule_names, dictionaries = load_submodules_and_dictionaries(
        model,
        use_attn=True,
        use_mlp=True,
        use_resid=True,
        dict_path="/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/",
        dict_size=512*64,
        dict_run_name="5_32768",
        device=device,
)
submodule_names = {v: k for k, v in submodule_names.items()}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Setup Buffer
n_ctxs = 256
ctx_len = 128
data = zst_to_generator('/share/data/datasets/pile/the-eye.eu/public/AI/pile/train/00.jsonl.zst')
buffer = ActivationBuffer(
    data,
    model,
    [submodules[0]], # doesn't matter which submodule, we only use the buffer for contexts
    out_feats=512,
    in_batch_size=128,
    n_ctxs=n_ctxs,
    ctx_len=ctx_len,
    device=device,
)

inputs = buffer.text_batch(batch_size=n_ctxs)
print(f'Inputs shape: {len(inputs)}, {len(inputs[0])}')

Inputs shape: 256, 13274


In [9]:
# Choose a component and feature to examine

submodule_name = "mlp0"
feat_idx = 22005

submodule = submodule_names[submodule_name]
dictionary = dictionaries[submodule]
out = examine_dimension(
    model,
    submodule,
    inputs,
    dictionary,
    dim_idx=feat_idx,
    max_length=ctx_len,
    n_inputs=n_ctxs,
)

# Cleanup memory
t.cuda.empty_cache()
gc.collect()

# Visualize the results
text_neuron_activations(*out.top_contexts)

## Concat context files

In [11]:
import json
import numpy as np

file_names = [
    "dense_RC_10_32768_contexts",
    "dense_random_10_32768_contexts",
    "sparse_RC_10_32768_contexts",
    "sparse_random_10_32768_contexts",
    "dense_BiB_10_32768_contexts",
    "sparse_BiB_10_32768_contexts",
]
total_contexts = dict()
for file_name in file_names:
    next_idx = len(total_contexts)
    with open(f'/home/can/dictionary-circuits/feature_annotation/contexts/{file_name}.json', 'r') as f:
        context = json.load(f)
        print(f'Loaded {len(context)} contexts from {file_name}')
    for idx, ctx in context.items():
        idx = int(idx)
        total_contexts[next_idx + idx] = ctx

Loaded 20 contexts from dense_RC_10_32768_contexts
Loaded 25 contexts from dense_random_10_32768_contexts
Loaded 23 contexts from sparse_RC_10_32768_contexts
Loaded 25 contexts from sparse_random_10_32768_contexts
Loaded 25 contexts from dense_BiB_10_32768_contexts
Loaded 25 contexts from sparse_BiB_10_32768_contexts


In [12]:
# Randomly shuffle keys of total_contexts
keys = list(total_contexts.keys())
np.random.shuffle(keys)
total_contexts = {i: total_contexts[k] for i, k in enumerate(keys)}

# Save total_contexts
with open(f'/home/can/dictionary-circuits/feature_annotation/contexts/sparse-dense_random-RC_contexts.json', 'w') as f:
    json.dump(total_contexts, f)

In [5]:
# add feature set name to all random conexts
set_name = "dense_RC"
file_name = "dense_RC_contexts_run10_32768"
with open(f'/home/can/dictionary-circuits/feature_annotation/contexts/{file_name}.json', 'r') as f:
    context = json.load(f)
    for k in context:
        context[k]['feature']['set_name'] = set_name

# save the updated context
with open(f'/home/can/dictionary-circuits/feature_annotation/contexts/{file_name}.json', 'w') as f:
    json.dump(context, f)

FileNotFoundError: [Errno 2] No such file or directory: '/home/can/dictionary-circuits/feature_annotation/contexts/dense_RC_contexts_run10_32768.json'