In [3]:
import os
import torch
import numpy as np
from nnsight import LanguageModel

import sys
sys.path.append('../')
from loading_utils import load_submodule_and_dictionary, DictionaryCfg

  from .autonotebook import tqdm as notebook_tqdm


## Tokenize 20 datasets of the Pile
`quanta-discovery/misc/create_pile_canonical.py`

## Evaluate model loss
`quanta-discovery/scripts/evaluate_pile_losses.py`

## Synthesize a dataset of low-loss tokens

In [4]:
# --- Model checkpoint -------------------------------------------------------
model_name = "pythia-70m-deduped"
step = 143_000
device = "cuda:0"

# --- Input / output locations -----------------------------------------------
cache_dir = "/home/can/feature_clustering/cache/"
pile_canonical = "/home/can/data/pile_test_tokenized_200k/"
output_dir = "/home/can/feature_clustering/results/"

# --- Token selection --------------------------------------------------------
loss_threshold = 1e-4  # tokens are kept when their loss is below this value
skip = 1               # stride applied to the selected token indices
num_tokens = 10_000    # number of token indices to keep
block_len = 250        # not used in the visible cells — TODO confirm meaning

# Optional path to a tensor loaded with torch.load; when set, tokens flagged
# by it are excluded from the selection.
# NOTE(review): this name shadows the builtin `filter`.
filter = None

verbose = True

In [5]:
# Locate the cached per-token losses for this model checkpoint.
particular_model_cache_dir = os.path.join(cache_dir, model_name, f"step{step}")
losses_cached = [f for f in os.listdir(particular_model_cache_dir) if f.endswith("losses.pt")]
if not losses_cached:
    # Without this guard, max() below fails with an opaque
    # "max() arg is an empty sequence" error.
    raise FileNotFoundError(
        f"no cached '*losses.pt' files found in {particular_model_cache_dir}"
    )

# Cache file names follow "<docs>_docs_<tokens>_tokens_losses.pt".
# Pick the file whose leading document count is largest.
max_i = max(range(len(losses_cached)), key=lambda i: int(losses_cached[i].split("_")[0]))
name_parts = losses_cached[max_i].split("_")
docs, tokens = int(name_parts[0]), int(name_parts[2])
losses = torch.load(os.path.join(particular_model_cache_dir, f"{docs}_docs_{tokens}_tokens_losses.pt"))

c = 1 / np.log(2)  # for nats to bits conversion

# Dividing the threshold by c moves it onto the scale of `losses`
# (presumably threshold in bits, losses in nats — TODO confirm).
low_loss_mask = losses < (loss_threshold / c)
if filter:
    # `filter` is a path to a saved boolean tensor; tokens it flags are dropped.
    criterias = torch.load(filter)
    low_loss_mask = low_loss_mask & (~criterias)

# Flat indices of qualifying tokens, subsampled by `skip` and capped at `num_tokens`.
token_idxs = low_loss_mask.nonzero().flatten()
token_idxs = token_idxs[::skip]
token_idxs = token_idxs[:num_tokens].tolist()
assert len(token_idxs) == num_tokens, "not enough tokens meeting loss threshold (and filter) to sample from"

In [6]:
# Sanity check: show the first ten selected token indices.
token_idxs[:10]

[511, 794, 2268, 2961, 3308, 4089, 4174, 4189, 4191, 4194]

## Save results for ~10K tokens
- feature activation
- gradient of the correct-token logit w.r.t. each feature activation

In [2]:
# Load the language model and the dictionary attached to one MLP submodule.
# The model id, device and dictionary path are derived from `model_name` and
# `device` in the configuration cell, so they cannot drift apart from it.
model = LanguageModel(f"EleutherAI/{model_name}", device_map=device)

dictionary_dir = f"/share/projects/dictionary_circuits/autoencoders/{model_name}"
dictionary_size = 32768
# Target submodule: the `dense_4h_to_h` projection of the MLP in layer 5.
submodule, dictionary = load_submodule_and_dictionary(
    model,
    submod_name='model.gpt_neox.layers.5.mlp.dense_4h_to_h',
    dict_cfg=DictionaryCfg(dictionary_dir, dictionary_size)
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Define metric as final token logit
def metric_fn(model, final_tok=None):
    """Return, per batch element, the last-position logit of a chosen token.

    Args:
        model: object exposing the output logits at ``model.embed_out.output``.
            The indexing below requires three axes, with the batch on axis 0
            (presumably (batch, seq, vocab) — TODO confirm).
        final_tok: per-example token indices to read out. When omitted, the
            notebook-level ``prompt_batch_final_tok`` is used, which matches
            the original one-argument behavior.

    Returns:
        One logit per batch element, taken at the last sequence position.
    """
    if final_tok is None:
        # Fall back to the notebook global so existing metric_fn(model) calls work.
        final_tok = prompt_batch_final_tok
    logits = model.embed_out.output
    batch_size = logits.shape[0]
    # The file imports `torch`; the previous `t.arange` referenced an undefined name.
    return logits[torch.arange(batch_size), -1, final_tok]