In [1]:
import sys

sys.path.insert(0, '../')
from nnsight import LanguageModel
import pandas as pd
import torch as t
import torch.nn as nn
from attribution import patching_effect
from dictionary_learning import AutoEncoder, ActivationBuffer
from dictionary_learning.interp import examine_dimension
from dictionary_learning.utils import zst_to_generator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda:0'

model = LanguageModel('EleutherAI/pythia-70m-deduped', device_map=device)

SEED = 42

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def data_ambiguous(split='train', seed=SEED):
    """Build the ambiguous dataset, where number and casing always agree.

    Reads ``data/{split}_data.csv`` (columns ``singular`` and ``plural``) and
    draws one random binary label per row after seeding torch with ``seed``.

    Returns:
        inputs: list of strings; a row labelled 0 yields its lower-cased
            singular form, any other label yields its upper-cased plural form.
        labels: integer tensor of 0/1 labels, moved to ``device``.
    """
    t.manual_seed(seed)
    frame = pd.read_csv(f'data/{split}_data.csv')
    labels = t.randint(0, 2, (len(frame),)).to(device)

    inputs = []
    for (_, record), label in zip(frame.iterrows(), labels):
        if label == 0:
            inputs.append(record['singular'].lower())
        else:
            inputs.append(record['plural'].upper())
    return inputs, labels

def data_unambiguous(split='train', seed=SEED):
    """Build the unambiguous dataset, where number and casing vary independently.

    Reads ``data/{split}_data.csv`` (columns ``singular`` and ``plural``) and,
    after seeding torch with ``seed``, draws two independent binary labels per
    row: feature 1 picks the column (0 = singular, 1 = plural) and feature 2
    picks the casing (0 = as stored in the CSV, 1 = upper-cased).

    Returns:
        inputs: list of strings, one per row.
        feat1_labels: integer tensor of 0/1 number labels on ``device``.
        feat2_labels: integer tensor of 0/1 casing labels on ``device``.
    """
    t.manual_seed(seed)
    frame = pd.read_csv(f'data/{split}_data.csv')
    n_rows = len(frame)
    # Draw order matters for reproducibility: feature 1 first, then feature 2.
    feat1_labels = t.randint(0, 2, (n_rows,)).to(device)
    feat2_labels = t.randint(0, 2, (n_rows,)).to(device)

    inputs = []
    for (_, record), number, casing in zip(frame.iterrows(), feat1_labels, feat2_labels):
        # Feature 1 selects which column the word comes from.
        if number == 0:
            column = 'singular'
        elif number == 1:
            column = 'plural'
        else:
            continue
        # Feature 2 selects whether the word is upper-cased.
        if casing == 0:
            inputs.append(record[column])
        elif casing == 1:
            inputs.append(record[column].upper())
    return inputs, feat1_labels, feat2_labels

def data_gen(ambiguous=True, split='train', seed=SEED):
    """Dispatch to ``data_ambiguous`` or ``data_unambiguous``.

    Returns ``(inputs, labels)`` when ``ambiguous`` is truthy, otherwise
    ``(inputs, feat1_labels, feat2_labels)``.
    """
    builder = data_ambiguous if ambiguous else data_unambiguous
    return builder(split=split, seed=seed)

In [4]:
def run_with_ablations(
        model,
        inputs,
        submodules,
        dictionaries,
        to_ablate,
        out_fn,
        inference=True,
        add_residual=False,
):
    """Run ``model`` on ``inputs`` with selected dictionary features zeroed out.

    For every ``(submodule, dictionary)`` pair the submodule output is encoded
    with the dictionary, the features listed in ``to_ablate[submodule]`` are
    set to zero, and the decoded result replaces the submodule output.

    Args:
        model: object exposing ``invoke(inputs, fwd_args=...)`` as a context
            manager (an nnsight ``LanguageModel`` in this notebook).
        inputs: whatever ``model.invoke`` accepts.
        submodules: modules whose ``.output`` is intercepted.
        dictionaries: one dictionary per submodule, providing ``encode``,
            ``decode`` and ``__call__``.
        to_ablate: mapping from submodule to the feature indices to zero.
        out_fn: called with ``model`` inside the invoke context; its result
            must support ``.save()``.
        inference: forwarded to the model as ``fwd_args={'inference': ...}``.
        add_residual: when True, the dictionary's reconstruction error
            ``x - dictionary(x)`` is added back to the ablated reconstruction,
            so that only the ablated features change. The default (False)
            keeps the previous behaviour of writing the bare reconstruction.

    Returns:
        The ``.value`` of the saved ``out_fn(model)`` result.
    """
    with model.invoke(inputs, fwd_args={'inference': inference}):
        for submodule, dictionary in zip(submodules, dictionaries):
            x = submodule.output
            # A tuple-valued shape marks a tuple output; element 0 is taken as
            # the activation tensor.
            is_resid = (type(x.shape) == tuple)
            if is_resid:
                x = x[0]

            # Only pay for the extra dictionary forward pass when it is used.
            residual = x - dictionary(x) if add_residual else None

            f = dictionary.encode(x)
            # Build the index tensor as int64 directly rather than round-tripping
            # through float32, which cannot represent every large integer.
            ablation_idxs = t.tensor(to_ablate[submodule], dtype=t.long)
            f[:, :, ablation_idxs] = 0.
            x_hat = dictionary.decode(f)
            if add_residual:
                x_hat = x_hat + residual

            if is_resid:
                submodule.output[0][:] = x_hat
            else:
                submodule.output = x_hat

        out = out_fn(model).save()
    return out.value

In [5]:
class Probe(nn.Module):
    """Logistic-regression probe: one linear unit followed by a sigmoid."""

    def __init__(self, activation_dim):
        """Create a probe that reads vectors of size ``activation_dim``."""
        super().__init__()
        self.net = nn.Linear(in_features=activation_dim, out_features=1)

    def forward(self, x):
        """Return probabilities in [0, 1], with the trailing unit axis removed."""
        scores = self.net(x)
        return t.sigmoid(scores.squeeze(-1))

In [6]:
# Hyperparameters shared by the probe-training cells below.
lr = 1e-2  # learning rate passed to every AdamW optimizer
epochs = 20  # number of full-batch gradient steps taken per probe
layer = 2  # index into model.gpt_neox.layers whose output the probes read

In [7]:
# Control experiment: fit one probe per feature on the unambiguous training
# split, then measure each probe's accuracy on the unambiguous test split.
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='train')

# Probe inputs are the last-position activations at the output of block `layer`.
with model.invoke(inputs):
    acts = model.gpt_neox.layers[layer].output[0][:, -1, :].save()
acts = acts.value.clone()

activation_dim = acts.shape[-1]
probe1 = Probe(activation_dim).to(device)
probe2 = Probe(activation_dim).to(device)
opt1 = t.optim.AdamW(probe1.parameters(), lr=lr)
opt2 = t.optim.AdamW(probe2.parameters(), lr=lr)

# Probe.forward already applies a sigmoid, so plain BCELoss is the matching loss.
bce = nn.BCELoss()
targets1 = feat1_labels.float()
targets2 = feat2_labels.float()
for _ in range(epochs):
    opt1.zero_grad()
    opt2.zero_grad()
    logits1 = probe1(acts)
    logits2 = probe2(acts)
    loss1 = bce(logits1, targets1)
    loss2 = bce(logits2, targets2)
    loss1.backward()
    loss2.backward()
    opt1.step()
    opt2.step()

# Held-out evaluation on the unambiguous test split.
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='test')

with model.invoke(inputs):
    acts = model.gpt_neox.layers[layer].output[0][:, -1, :].save()
acts = acts.value.clone()

probs1 = probe1(acts)
probs2 = probe2(acts)
preds1 = probs1.round()
preds2 = probs2.round()
acc1 = (preds1 == feat1_labels).float().mean().item()
acc2 = (preds2 == feat2_labels).float().mean().item()

print(f'Control probe 1 accuracy: {acc1}')
print(f'Control probe 2 accuracy: {acc2}')


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Control probe 1 accuracy: 0.9635036587715149
Control probe 2 accuracy: 0.9781022071838379


In [8]:
# Seed before constructing the probe so its initial weights are reproducible.
t.manual_seed(SEED)
probe = Probe(512).to(device)
optimizer = t.optim.AdamW(probe.parameters(), lr=lr)

# train probe on ambiguous data
inputs, labels = data_gen(ambiguous=True, split='train')
with model.invoke(inputs):
    acts = model.gpt_neox.layers[layer].output[0][:, -1, :].save()
acts = acts.value.clone()

# Probe.forward already applies a sigmoid, so plain BCELoss is the matching loss.
bce = nn.BCELoss()
targets = labels.float()
for _ in range(epochs):
    optimizer.zero_grad()
    probs = probe(acts)
    loss = bce(probs, targets)
    loss.backward()
    optimizer.step()

# get accuracy on ambiguous test set
inputs, labels = data_gen(ambiguous=True, split='test')
with model.invoke(inputs):
    acts = model.gpt_neox.layers[layer].output[0][:, -1, :].save()
acts = acts.value.clone()
preds = probe(acts).round()
acc = (preds == labels).float().mean().item()
print(f'Accuracy on ambiguous data: {acc}')

# get accuracy on unambiguous test set: the same predictions are scored
# against each feature's labels to see which feature the probe picked up.
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='test')
with model.invoke(inputs):
    acts = model.gpt_neox.layers[layer].output[0][:, -1, :].save()
acts = acts.value.clone()
preds = probe(acts).round()
acc = (preds == feat1_labels).float().mean().item()
print(f'feat1 accuracy: {acc}')
acc = (preds == feat2_labels).float().mean().item()
print(f'feat2 accuracy: {acc}')

Accuracy on ambiguous data: 1.0
feat1 accuracy: 0.5839415788650513
feat2 accuracy: 0.9635036587715149


In [9]:
# Residual-stream submodules (blocks 0..layer) and one dictionary per block.
submodules = [
    model.gpt_neox.layers[i] for i in range(layer + 1)
]
dictionaries = []
for i in range(layer + 1):
    ae = AutoEncoder(512, 64 * 512).to(device)
    # map_location keeps loading independent of the device the checkpoint was saved from.
    ae.load_state_dict(t.load(
        f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/resid_out_layer{i}/5_32768/ae.pt',
        map_location=device,
    ))
    dictionaries.append(ae)

def metric_fn(model):
    """Probe probability computed from the last-position output of block `layer`."""
    return probe(model.gpt_neox.layers[layer].output[0][:,-1,:])

def _mean_patching_effects(examples):
    """Run integrated-gradients attribution on `examples` (no patch input) and
    average each submodule's effect tensor over its first two axes."""
    effects, _ = patching_effect(
        examples,
        None,
        model,
        submodules,
        dictionaries,
        metric_fn,
        method='ig'
    )
    return {k : v.mean(dim=0).mean(dim=0) for k, v in effects.items()}

def _print_effects(effects, sign, threshold=0.001):
    """Print, per submodule, every feature whose `sign * effect` exceeds `threshold`."""
    for i, submodule in enumerate(submodules):
        print(f"Layer {i}:")
        effect = effects[submodule]
        # Select qualifying features with a mask instead of walking every
        # nonzero entry in Python; the index order is unchanged.
        for feature_idx in t.nonzero(sign * effect > threshold):
            value = effect[tuple(feature_idx)]
            print(f"    Multindex: {tuple(feature_idx.tolist())}, Value: {value}")

inputs, labels = data_gen(ambiguous=True, split='train')

# Split the ambiguous training inputs by label.
neg_inputs, pos_inputs = [], []
for x, label in zip(inputs, labels):
    if label == 0:
        neg_inputs.append(x)
    else:
        pos_inputs.append(x)

# Label-0 inputs: report features with a positive effect on the probe output.
neg_effects = _mean_patching_effects(neg_inputs)
_print_effects(neg_effects, sign=1)

# Other inputs: report features with a negative effect on the probe output.
pos_effects = _mean_patching_effects(pos_inputs)
_print_effects(pos_effects, sign=-1)

# total_effects = {
#     k : -pos_effects[k] + neg_effects[k] for k in pos_effects.keys()
# }
# for i, submodule in enumerate(submodules):
#     print(f"Layer {i}:")
#     effect = total_effects[submodule]
#     for feature_idx in t.nonzero(effect):
#         value = effect[tuple(feature_idx)]
#         if value > 0.001:
#             print(f"    Multindex: {tuple(feature_idx.tolist())}, Value: {value}")

Layer 0:
    Multindex: (450,), Value: 0.001137584331445396
    Multindex: (1403,), Value: 0.008632320910692215
    Multindex: (4726,), Value: 0.002920112805441022
    Multindex: (8844,), Value: 0.0012917125131934881
    Multindex: (9301,), Value: 0.0012395131634548306
    Multindex: (10019,), Value: 0.001957895699888468
    Multindex: (12580,), Value: 0.0022025764919817448
    Multindex: (18519,), Value: 0.003057642839848995
    Multindex: (20405,), Value: 0.0010994495823979378
    Multindex: (23723,), Value: 0.002285792026668787
    Multindex: (28583,), Value: 0.0012381786946207285
    Multindex: (28762,), Value: 0.0012637594481930137
    Multindex: (31378,), Value: 0.008632320910692215
Layer 1:
    Multindex: (12753,), Value: 0.006197110749781132
    Multindex: (13537,), Value: 0.008146233856678009
Layer 2:
    Multindex: (2543,), Value: 0.0011143218725919724
    Multindex: (12633,), Value: 0.003300841199234128
    Multindex: (29926,), Value: 0.001003497396595776
Layer 0:
    Multin

In [10]:
# Choose which dictionary feature to inspect: the position in `submodules` /
# `dictionaries`, and the feature index within that dictionary.
component_idx = 2
feat_idx = 12633

submodule = submodules[component_idx]
dictionary = dictionaries[component_idx]

# interpret some features
# Text source for the activation buffer (a local shard of the Pile).
data = zst_to_generator('/share/data/datasets/pile/the-eye.eu/public/AI/pile/train/00.jsonl.zst')
buffer = ActivationBuffer(
    data,
    model,
    [submodule],
    out_feats=512,
    in_batch_size=128,
    n_ctxs=512,
)

out = examine_dimension(
    model,
    submodule,
    buffer,
    dictionary,
    dim_idx=feat_idx,
)
# `out` is indexed by 'top_tokens' (printed) and 'top_contexts' (shown as the cell's value).
print(out['top_tokens'])
out['top_contexts']

  next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)


[('MAN', 107.1137466430664), ('For', 97.23233032226562), ('From', 94.35887908935547), ('Sl', 91.1875228881836), ('Ret', 90.5478744506836), ('H', 86.20515441894531), ('ive', 84.82954406738281), ('New', 81.80145263671875), ('CI', 81.71781921386719), ('When', 80.9211654663086)]


In [14]:
# Named `prompt` rather than `input` so the Python builtin `input` is not
# shadowed for the rest of the kernel session.
prompt = "YARNS"

# Dictionary features to zero out, keyed by submodule. Entries that are
# commented out are left active; the trailing notes are the author's
# interpretation of each feature.
to_ablate = {
    submodules[0] : [
        # 1512, # all caps tokens ending in S
        # 5125, # all caps text + other random stuff
        # 7655, # all caps tokens, with a preference towards ending in S
        14663, # certain all caps tokens
        # 18956, # certain tokens ending with s
        20156, # single all caps letters
        # 21034, # the token S
        22920, # the token B
        27094, # certain all caps tokens
        # 27183, # certain all caps tokens + other random stuff (including ')
        28489, # all caps text
        # 29378, # plural words
        30694, # certain all caps text
        31206, # certain all caps text
    ],
    submodules[1] : [
        # 5048, # the token S
        11997, # certain all caps text
        14172, # all caps text
        # 15924, # tokens ending with es
        26929, # certain all caps text
        # 27128, # all caps tokens ending with S
        # 29410, # all caps text + other random stuff
        31665, # certain all caps text
    ],
    submodules[2] : [
        1352, # certain all caps text
        # 2183, # plural words starting with a capital letter
        # 9222, # tokens ending with es
        11090, # certain all caps text
        # 27831, # the token S
    ]
}


# Probe prediction on the unmodified model.
with model.invoke(prompt):
    acts = model.gpt_neox.layers[layer].output[0][:,-1,:].save()
acts = acts.value.clone()
pred_before = probe(acts).item()

# Probe prediction with the selected features zeroed in every submodule.
with model.invoke(prompt):
    for submodule, dictionary in zip(submodules, dictionaries):
        x = submodule.output
        # A tuple-valued shape marks a tuple output; element 0 is taken as
        # the activation tensor.
        is_resid = type(x.shape) == tuple
        if is_resid:
            x = x[0]
        # Reconstruction error of the dictionary, added back below so that
        # only the ablated features change the activation.
        x_hat = dictionary(x)
        residual = x - x_hat

        f = dictionary.encode(x)
        ablation_idxs = to_ablate[submodule]
        for idx in ablation_idxs:
            f[..., idx] = 0
        x_hat = dictionary.decode(f)
        if is_resid:
            submodule.output[0][:] = x_hat + residual
        else:
            submodule.output = x_hat + residual
    acts = model.gpt_neox.layers[layer].output[0][:,-1,:].save()
acts = acts.value.clone()
pred_after = probe(acts).item()

print(f'before: {pred_before}, after: {pred_after}')

before: 0.7661039233207703, after: 0.6993085741996765


In [106]:
def get_acts(model):
    """Return the last-position slice of element 0 of block `layer`'s output.

    `layer` is read from the notebook namespace at call time.
    """
    block_output = model.gpt_neox.layers[layer].output
    return block_output[0][:, -1, :]

# Evaluate the already-trained `probe` on activations produced with the
# features in `to_ablate` zeroed out.

# get accuracy on ambiguous test set
inputs, labels = data_gen(ambiguous=True, split='test')
acts = run_with_ablations(
    model,
    inputs,
    submodules,
    dictionaries,
    to_ablate,
    get_acts,
)
preds = probe(acts.clone()).round()
acc = (preds == labels).float().mean().item()
print(f'Accuracy on ambiguous data: {acc}')

# get accuracy on unambiguous test set
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='test')
acts = run_with_ablations(
    model,
    inputs,
    submodules,
    dictionaries,
    to_ablate,
    get_acts,
)
# The same predictions are scored against each feature's labels in turn.
preds = probe(acts.clone()).round()
acc = (preds == feat1_labels).float().mean().item()
print(f'feat1 accuracy: {acc}')
# Recomputed from the same activations, so identical to the predictions above.
preds = probe(acts.clone()).round()
acc = (preds == feat2_labels).float().mean().item()
print(f'feat2 accuracy: {acc}')


Accuracy on ambiguous data: 0.7591241002082825
feat1 accuracy: 0.5620437860488892
feat2 accuracy: 0.7226277589797974


In [107]:
# train probes on unambiguous data
# Same control experiment as for the unmodified model, but every activation
# comes from a forward pass with the `to_ablate` features zeroed.
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='train')

acts = run_with_ablations(model, inputs, submodules, dictionaries, to_ablate, get_acts)
acts = acts.clone()

# Seed right before construction so the probe initialisation is reproducible.
t.manual_seed(SEED)
activation_dim = acts.shape[-1]
probe1 = Probe(activation_dim).to(device)
probe2 = Probe(activation_dim).to(device)
opt1 = t.optim.AdamW(probe1.parameters(), lr=lr)
opt2 = t.optim.AdamW(probe2.parameters(), lr=lr)

# Probe.forward already applies a sigmoid, so plain BCELoss is the matching loss.
bce = nn.BCELoss()
targets1 = feat1_labels.float()
targets2 = feat2_labels.float()
for _ in range(epochs):
    opt1.zero_grad()
    opt2.zero_grad()
    logits1 = probe1(acts)
    logits2 = probe2(acts)
    loss1 = bce(logits1, targets1)
    loss2 = bce(logits2, targets2)
    loss1.backward()
    loss2.backward()
    opt1.step()
    opt2.step()

# get accuracy on unambiguous test set
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='test')
acts = run_with_ablations(model, inputs, submodules, dictionaries, to_ablate, get_acts)
acts = acts.clone()

probs1 = probe1(acts)
probs2 = probe2(acts)
preds1 = probs1.round()
preds2 = probs2.round()
acc1 = (preds1 == feat1_labels).float().mean().item()
acc2 = (preds2 == feat2_labels).float().mean().item()

print(f'feat1 accuracy: {acc1}')
print(f'feat2 accuracy: {acc2}')


feat1 accuracy: 0.8029196858406067
feat2 accuracy: 0.8832116723060608


In [108]:
# retrain probe on ablated model
# Seed before constructing the probe so its initial weights are reproducible.
t.manual_seed(SEED)
probe_new = Probe(512).to(device)
optimizer = t.optim.AdamW(probe_new.parameters(), lr=lr)

# train probe on ambiguous data
inputs, labels = data_gen(ambiguous=True, split='train')
acts = run_with_ablations(model, inputs, submodules, dictionaries, to_ablate, get_acts)
acts = acts.clone()

# Probe.forward already applies a sigmoid, so plain BCELoss is the matching loss.
bce = nn.BCELoss()
targets = labels.float()
for _ in range(epochs):
    optimizer.zero_grad()
    probs = probe_new(acts)
    loss = bce(probs, targets)
    loss.backward()
    optimizer.step()

# get accuracy on ambiguous test set
inputs, labels = data_gen(ambiguous=True, split='test')
acts = run_with_ablations(model, inputs, submodules, dictionaries, to_ablate, get_acts)
acts = acts.clone()
preds = probe_new(acts).round()
acc = (preds == labels).float().mean().item()
print(f'Accuracy on ambiguous data: {acc}')

# get accuracy on unambiguous test set: the same predictions are scored
# against each feature's labels in turn.
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='test')
acts = run_with_ablations(model, inputs, submodules, dictionaries, to_ablate, get_acts)
acts = acts.clone()
preds = probe_new(acts).round()
acc = (preds == feat1_labels).float().mean().item()
print(f'feat1 accuracy: {acc}')
acc = (preds == feat2_labels).float().mean().item()
print(f'feat2 accuracy: {acc}')

Accuracy on ambiguous data: 0.941605806350708
feat1 accuracy: 0.6204379796981812
feat2 accuracy: 0.8394160270690918


In [123]:
# classify things based on the difference between the probe and probe_new predictions
inputs, feat1_labels, feat2_labels = data_gen(ambiguous=False, split='test')

# Activations of the unmodified model, fed to the original `probe`.
with model.invoke(inputs):
    acts = model.gpt_neox.layers[layer].output[0][:,-1,:].save()
acts = acts.value.clone()

# Activations with the `to_ablate` features zeroed, fed to `probe_new`.
acts_new = run_with_ablations(
    model,
    inputs,
    submodules,
    dictionaries,
    to_ablate,
    get_acts,
)
acts_new = acts_new.clone()

probs1 = probe(acts)
probs2 = probe_new(acts_new)
diffs = probs2 - probs1
# Algebraically 0.2 * probs1 + 0.8 * probs2, rounded to a hard 0/1 prediction.
# NOTE(review): the 0.8 weight is unexplained here - presumably hand-tuned; confirm.
preds = (probs1 + diffs * 0.8).round()
# preds = t.where(
#     diffs.abs() > .1,
#     (diffs > 0).float(),
#     probs2.round()
# )

# Score the combined predictions against each feature's labels in turn.
acc = (preds == feat1_labels).float().mean().item()
print(f'feat1 accuracy: {acc}')
acc = (preds == feat2_labels).float().mean().item()
print(f'feat2 accuracy: {acc}')

feat1 accuracy: 0.6204379796981812
feat2 accuracy: 0.8686131238937378


In [64]:
# One line per example: input text, rounded prediction, then the two probe
# probabilities (probs1 first, probs2 second).
for x, prob1, prob2, pred in zip(inputs, probs1, probs2, preds):
    print(x, pred.item(), prob1.item(), prob2.item())

beef 0.0 0.04246163368225098 0.04246163368225098
tomatoes 0.0 0.01407528854906559 0.014232482761144638
headphone 0.0 0.0004125593404751271 0.0004125595442019403
ELECTRON 0.0 0.9458839893341064 0.662473201751709
seatbelt 0.0 0.002296003745868802 0.002296003745868802
GOBLINS 1.0 0.9927135109901428 0.9583725333213806
TRAVELER 0.0 0.9688680768013 0.8200774192810059
DANCE 0.0 0.35750260949134827 0.13039857149124146
parrot 0.0 0.0020670697558671236 0.002067072782665491
minerals 0.0 0.06108663231134415 0.06108663231134415
refugee 0.0 0.09666279703378677 0.09418341517448425
cabinet 0.0 0.015743708238005638 0.015743713825941086
RUG 0.0 0.6748154759407043 0.4474864602088928
doctor 0.0 0.00024147509247995913 0.00024147488875314593
cookies 0.0 0.037880077958106995 0.034380584955215454
bicycle 0.0 0.019201841205358505 0.0192018523812294
toothbrushes 0.0 0.21269121766090393 0.211523175239563
zoos 0.0 0.05740008503198624 0.05740008503198624
ROADS 1.0 0.539910614490509 0.5258908867835999
butter 0.0 0.

In [340]:
# Element-wise difference between the two probability tensors currently in the
# kernel (presumably those from the probe-comparison cell; the execution count
# is out of order, so confirm on a fresh run).
probs2 - probs1

tensor([ 3.6879e-02,  2.0250e-02, -1.3686e-04,  2.3178e-01,  1.6529e-03,
         6.6891e-02,  2.7669e-01,  1.0507e-01, -2.0387e-04,  9.0288e-02,
         1.3871e-01,  2.7971e-02,  2.8520e-01, -1.7768e-04,  2.1047e-01,
         5.7020e-02,  4.4652e-01, -2.1454e-02,  5.7701e-01,  2.8447e-02,
         8.5847e-02,  2.5172e-05,  1.0240e-02,  3.1668e-01,  1.2039e-01,
         3.1745e-01,  1.5877e-01,  2.4371e-03, -5.2675e-04,  1.5538e-01,
         1.2155e-01, -9.7485e-05,  2.5782e-02,  5.9511e-02,  7.9205e-02,
         2.8743e-02,  9.1319e-02,  5.9899e-02,  4.1422e-01,  2.7963e-01,
         3.3364e-02,  7.9152e-02,  7.3441e-03,  4.3555e-01, -3.0199e-04,
         6.6104e-03, -3.7750e-04, -4.0154e-04,  3.5560e-01,  5.5813e-02,
         2.9123e-04,  4.7253e-01,  2.7764e-01,  3.6436e-01, -3.0451e-02,
         7.5291e-02,  2.3076e-01,  1.6617e-01,  1.3763e-01,  3.0491e-01,
         3.5337e-01,  1.4857e-01, -3.3222e-04,  3.4590e-01,  2.2563e-02,
         4.7432e-01,  4.9966e-01,  3.7437e-01,  1.4

In [7]:
test_data = pd.read_json('/share/data/datasets/msgs/syntactic_category_lexical_content_the/test.jsonl', lines=True)

# NOTE(review): `batch_size` and `probe` are not defined in this cell and must
# already exist in the kernel - confirm this survives Restart & Run All.
ling_accs, surface_accs, batch_sizes = [], [], []
# get accuracy on test data
# Step through the data by start offset so the final, possibly shorter batch
# is evaluated as well instead of being dropped.
for batch_start in range(0, len(test_data), batch_size):
    batch_stop = batch_start + batch_size
    inputs = test_data['sentence'][batch_start:batch_stop].tolist()
    ling_labels = test_data['linguistic_feature_label'][batch_start:batch_stop].tolist()
    surface_labels = test_data['surface_feature_label'][batch_start:batch_stop].tolist()

    with model.invoke(inputs) as invoker:
        hidden_states = model.gpt_neox.layers[-3].output[0].save()

    with t.no_grad():
        preds = probe(hidden_states.value)
        # Put the labels on whatever device the predictions live on rather
        # than assuming a fixed GPU.
        ling_acc = (preds.round() == t.Tensor(ling_labels).to(preds.device)).float().mean()
        surface_acc = (preds.round() == t.Tensor(surface_labels).to(preds.device)).float().mean()
        ling_accs.append(ling_acc.item())
        surface_accs.append(surface_acc.item())
        batch_sizes.append(len(inputs))

if not batch_sizes:
    raise ValueError('test set is empty; no accuracy to report')

# Weight each batch's accuracy by its size so a short final batch does not
# skew the overall average.
n_examples = sum(batch_sizes)
print('ling acc:', sum(acc * n for acc, n in zip(ling_accs, batch_sizes)) / n_examples)
print('surface acc:', sum(acc * n for acc, n in zip(surface_accs, batch_sizes)) / n_examples)
    

ling acc: 0.9539930555555556
surface acc: 0.6663995726495726


In [8]:
from attribution import patching_effect
from dictionary_learning.dictionary import AutoEncoder

In [16]:
# A clean sentence and a patched variant that differs in two words
# ("the" -> "a" and "organized" -> "banana").
clean = "All grandsons do resemble the print and Debra is an organized child."
patch = "All grandsons do resemble a print and Debra is an banana child."

# Run both sentences in one batch and keep the output of the third-from-last block.
with model.invoke([clean, patch]) as invoker:
    hidden_states = model.gpt_neox.layers[-3].output[0].save()

# Probe outputs for [clean, patch]; no gradients are needed for inspection.
with t.no_grad():
    preds = probe(hidden_states.value)
preds

tensor([0.4021, 0.0954], device='cuda:0')

In [17]:
def metric_fn(model):
    """Apply `probe` to element 0 of the third-from-last block's output."""
    hidden = model.gpt_neox.layers[-3].output[0]
    return probe(hidden)

# Submodules to attribute over: blocks 0-3 first, then the MLP of each of
# those blocks, in that order.
resid_blocks = [model.gpt_neox.layers[i] for i in range(4)]
mlp_blocks = [model.gpt_neox.layers[i].mlp for i in range(4)]
submodules = resid_blocks + mlp_blocks

# One dictionary per submodule, in the same order as `submodules`.
dictionaries = []
for i in range(4):
    dictionary = AutoEncoder(512, 64 * 512).to(device)
    resid_state = t.load(f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/resid_out_layer{i}/0_32768/ae.pt')
    dictionary.load_state_dict(resid_state)
    dictionaries.append(dictionary)
for i in range(4):
    dictionary = AutoEncoder(512, 64 * 512).to(device)
    mlp_state = t.load(f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/mlp_out_layer{i}/1_32768/ae.pt')
    dictionary.load_state_dict(mlp_state)
    dictionaries.append(dictionary)

# Attribute the probe metric between the clean and patched sentences.
out = patching_effect(clean, patch, model, submodules, dictionaries, metric_fn)

In [19]:
effects, total_effect = out
print(f"Total effect: {total_effect}")
# The loop index gets its own name: reusing `layer` here would overwrite the
# notebook-level `layer` hyperparameter that other cells read.
# The printed number is a position in `submodules`, which may not be a
# transformer layer index if `submodules` mixes module types.
for submodule_idx, submodule in enumerate(submodules):
    print(f"Layer {submodule_idx}:")
    effect = effects[submodule]
    for feature_idx in t.nonzero(effect):
        value = effect[tuple(feature_idx)]
        # Only report features with a large effect in either direction.
        if value.abs() > 0.1:
            print(f"    Multindex: {tuple(feature_idx.tolist())}, Value: {value}")

Total effect: tensor([-0.7627], device='cuda:0', grad_fn=<DivBackward0>)
Layer 0:
    Multindex: (0, 6, 23084), Value: -0.31019383668899536
    Multindex: (0, 6, 29115), Value: -0.2026602178812027
    Multindex: (0, 12, 9247), Value: 0.23665377497673035
    Multindex: (0, 12, 19133), Value: -0.13597454130649567
    Multindex: (0, 13, 1147), Value: 0.10391081869602203
    Multindex: (0, 13, 1256), Value: -0.12887312471866608
    Multindex: (0, 13, 3385), Value: -0.34761813282966614
    Multindex: (0, 13, 3613), Value: -0.11544839292764664
    Multindex: (0, 13, 5702), Value: -11.003226280212402
    Multindex: (0, 13, 5962), Value: -0.18838448822498322
    Multindex: (0, 13, 6959), Value: -0.9820340871810913
    Multindex: (0, 13, 15146), Value: -0.12887312471866608
    Multindex: (0, 13, 25951), Value: -0.24457168579101562
    Multindex: (0, 13, 26640), Value: -0.25634047389030457
    Multindex: (0, 13, 27692), Value: -0.12028089165687561
    Multindex: (0, 13, 31525), Value: -0.1246131

In [15]:
# Print position and decoded text for each token of the first input held by
# `invoker` (whichever `model.invoke(...) as invoker` cell ran most recently).
for i, tok in enumerate(invoker.input.input_ids[0]):
    print(f"{i}: {model.tokenizer.decode([tok])}")

0: All
1:  gr
2: ands
3: ons
4:  do
5:  resemble
6:  the
7:  print
8:  and
9:  De
10: bra
11:  is
12:  an
13:  organized
14:  child
15: .


In [None]:
# to_ablate = {
#     submodules[0] : [
#         5650,
#         17126,
#         22182,
#         25864,
#         2655,
#         12249,
#         12267,
#         21248,
#         21329,
#     ],
#     submodules[1] : [
#         14465,
#         18471,
#         22990,
#         16629,
#         26134,
#         32267,
#     ],
#     submodules[2] : [
#         16421,
#         22968,
#         27888,
#         15899,
#         28262,
#         28306,
#         32164,
#     ]
# }