### Custom ACDC Setup

This is a simple notebook for Automated Circuit DisCovery (ACDC) (https://arxiv.org/pdf/2304.14997.pdf) in custom settings. It follows the basic steps proposed in the accompanied paper. The current idea is to copy this notebook for each behavior under investigation. In the future, it might make sense to build a more modular solution to this, to make existing observations easier to reproduce by anyone.

To serve as an example, this notebook implements ACDC for 'greater_than' in a GPT2-small.

In [1]:
try:
    import google.colab

    IN_COLAB = True
    print("Running as a Colab notebook")

    import subprocess  # to install graphviz dependencies

    command = ["apt-get", "install", "graphviz-dev"]
    subprocess.run(command, check=True)

    import os  # make images folder

    os.mkdir("ims/")

    from IPython import get_ipython

    ipython = get_ipython()

    ipython.run_line_magic(  # install ACDC
        "pip",
        "install git+https://github.com/ArthurConmy/Automatic-Circuit-Discovery.git@d89f7fa9cbd095202f3940c889cb7c6bf5a9b516",
    )

except Exception as e:
    IN_COLAB = False
    print("Running outside of colab")

    import numpy  # crucial to not get cursed error
    import plotly

    plotly.io.renderers.default = "colab"  # added by Arthur so running as a .py notebook with #%% generates .ipynb notebooks that display in colab
    # disable this option when developing rather than generating notebook outputs

    import os  # make images folder

    if not os.path.exists("ims/"):
        os.mkdir("ims/")

    from IPython import get_ipython

    ipython = get_ipython()
    if ipython is not None:
        print("Running as a notebook")
        ipython.run_line_magic("load_ext", "autoreload")  # type: ignore
        ipython.run_line_magic("autoreload", "2")  # type: ignore
    else:
        print("Running as a script")

Running outside of colab
Running as a notebook


In [2]:
import os
import torch

# some GPU setup
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"

#### Step 1: Select a Behavior, Dataset, and Metric

##### Model

In [3]:
from transformer_lens import HookedTransformer

reference_gpt2 = HookedTransformer.from_pretrained(
    "gpt2-small", fold_ln=False, center_unembed=False, center_writing_weights=False
)

Loaded pretrained model gpt2-small into HookedTransformer


##### Dataset

In [11]:
from Automatic_Circuit_Discovery.acdc.greaterthan.utils import get_year_data

# generate textual input examples and tokenize them
tokenized_inputs, inputs = get_year_data(n_examples, model)
print(f"Input Example: '{inputs[0]} \nTokenized as: {tokenized_inputs[0]}'")

# generate corrupted inputs
corrupted_tokenized_inputs = tokenized_inputs.clone()
corrupted_tokenized_inputs[:, 7] = 486  # replace with 01

# select N examples
tokenized_inputs = tokenized_inputs[:n_examples]
corrupted_tokenized_inputs = corrupted_tokenized_inputs[:n_examples]

ImportError: attempted relative import with no known parent package

##### Metric

In [12]:
METRIC = "greater_than"

In [7]:
import torch.nn.functional as F
from functools import partial

from acdc.acdc_utils import kl_divergence, negative_log_probs
from acdc.greaterthan.utils import greaterthan_metric


with torch.no_grad():
    logits = model(tokenized_inputs)[:, -1, :]
    logprobs = F.log_softmax(logits, dim=-1)

if METRIC == "kl_divergence":
    validation_metric = partial(
        kl_divergence,
        base_model_logprobs=logprobs,
        mask_repeat_candidates=None,
        last_seq_element_only=False,
    )

elif METRIC == "greater_than":
    validation_metric = partial(greaterthan_metric, tokens=tokenized_inputs.cpu())

else:
    raise ValueError(f"Unknown Metric {METRIC}")

ModuleNotFoundError: No module named 'matplotlib'

#### Step 2: Divide the Neural Network into a Graph of Smaller Units

In [None]:
import time
from dataclasses import dataclass, asdict

from acdc.TLACDCExperiment import TLACDCExperiment


@dataclass
class ACDCExperimentArguments:
    threshold: float = 0.71
    zero_ablation: bool = True
    verbose: bool = True
    indices_mode: str = "reverse"
    names_mode: str = "normal"
    corrupted_cache_cpu = False
    hook_verbose: bool = False
    online_cache_cpu: bool = False
    add_sender_hooks: bool = True
    add_receiver_hooks: bool = False
    remove_redundant: bool = False
    show_full_index: bool = str(TASK).startswith("tracr")
    use_pos_embed: bool = str(TASK).startswith("tracr")

    # weights and biases
    using_wandb: bool = False
    wandb_project_name: str = "hackathon"
    wandb_run_name: str = f"{int(time.time())}"
    wandb_entity_name: str = "entity"
    wandb_group_name: str = "group"
    wandb_notes: str = ""
    wandb_dir: str = "wandb"
    wandb_mode: str = "online"


args = asdict(ACDCExperimentArguments())

model.reset_hooks()
exp = TLACDCExperiment(
    model=model,
    ds=tokenized_inputs,
    ref_ds=corrupted_tokenized_inputs,
    metric=validation_metric,
    **args,
)



dict_keys(['blocks.11.hook_resid_post', 'blocks.11.hook_mlp_out', 'blocks.11.hook_mlp_in', 'blocks.11.attn.hook_result', 'blocks.11.attn.hook_q', 'blocks.11.hook_q_input', 'blocks.11.attn.hook_k', 'blocks.11.hook_k_input', 'blocks.11.attn.hook_v', 'blocks.11.hook_v_input', 'blocks.10.hook_mlp_out', 'blocks.10.hook_mlp_in', 'blocks.10.attn.hook_result', 'blocks.10.attn.hook_q', 'blocks.10.hook_q_input', 'blocks.10.attn.hook_k', 'blocks.10.hook_k_input', 'blocks.10.attn.hook_v', 'blocks.10.hook_v_input', 'blocks.9.hook_mlp_out', 'blocks.9.hook_mlp_in', 'blocks.9.attn.hook_result', 'blocks.9.attn.hook_q', 'blocks.9.hook_q_input', 'blocks.9.attn.hook_k', 'blocks.9.hook_k_input', 'blocks.9.attn.hook_v', 'blocks.9.hook_v_input', 'blocks.8.hook_mlp_out', 'blocks.8.hook_mlp_in', 'blocks.8.attn.hook_result', 'blocks.8.attn.hook_q', 'blocks.8.hook_q_input', 'blocks.8.attn.hook_k', 'blocks.8.hook_k_input', 'blocks.8.attn.hook_v', 'blocks.8.hook_v_input', 'blocks.7.hook_mlp_out', 'blocks.7.hook_ml

#### Step 3: Patch Model Activations to Isolate the relevant Subgraph

In [None]:
import gc
import time
from dataclasses import dataclass, asdict
from IPython import get_ipython
from IPython.display import Image, display

ipython = get_ipython()

from acdc.acdc_graphics import show

max_num_epochs = 100000
single_step = False
for i in range(max_num_epochs):
    exp.step(testing=False)

    show(
        exp.corr,
        f"ims/img_new_{i+1}.png",
        show_full_index=args.use_pos_embed,
    )

    if ipython is not None:
        # so long as we're not running this as a script, show the image!
        display(Image(f"ims/img_new_{i+1}.png"))

    print(i, "-" * 50)
    print(exp.count_no_edges())

    if i == 0:
        exp.save_edges("edges.pkl")

    if exp.current_node is None or single_step:
        break

exp.save_edges("another_final_edges.pkl")

No edge 32923
New metric: -0.8664211630821228

Node: cur_parent=TLACDCInterpNode(blocks.11.hook_mlp_out, [:]) (self.current_node=TLACDCInterpNode(blocks.11.hook_resid_post, [:]))

Metric after removing connection to blocks.11.hook_mlp_out [:] is -0.9255590438842773 (and current metric -0.8664211630821228)
Result is -0.05913788080215454...so removing connection
No edge 32922

Node: cur_parent=TLACDCInterpNode(blocks.11.attn.hook_result, [:, :, 0]) (self.current_node=TLACDCInterpNode(blocks.11.hook_resid_post, [:]))

Metric after removing connection to blocks.11.attn.hook_result [:, :, 0] is -0.9160527586936951 (and current metric -0.9255590438842773)
Result is 0.009506285190582275...so removing connection

Node: cur_parent=TLACDCInterpNode(blocks.11.attn.hook_result, [:, :, 1]) (self.current_node=TLACDCInterpNode(blocks.11.hook_resid_post, [:]))

Metric after removing connection to blocks.11.attn.hook_result [:, :, 1] is -0.9182395935058594 (and current metric -0.9160527586936951)
Resul