In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from utils.eap_utils import run_metrics, load_adapter_into_hooked_transformer
from eap.graph import Graph
import json
import gc
import torch

In [2]:
from pathlib import Path
# Root folder holding one results sub-folder per experiment; each sub-folder name
# is split on '->' below to obtain the two task names.
sq_main_path = '/mnt/faster0/rje41/checkpoints/results/experiment_2/forgetting_graph_set_0_lr_false'
# Root folder of the checkpoints; the per-experiment path built from it is later
# used as the adapter path when the model is loaded.
checkpoints_path = Path('/mnt/faster0/rje41/checkpoints/experiment_2/forgetting_graph_set_0_lr_false')


def _first_match(parent, needle, dirs_only=False):
    """Return the name of the first entry of `parent` whose name contains `needle`.

    Entries are sorted before matching because os.listdir() returns them in
    arbitrary order; without sorting, a substring such as 'checkpoint-270' could
    resolve to a different entry (e.g. 'checkpoint-2700') from one run to the next.

    Args:
        parent: Directory to search.
        needle: Substring that the entry name must contain.
        dirs_only: When True, only sub-directories are considered.

    Raises:
        FileNotFoundError: If no entry of `parent` matches, naming the directory
            and substring instead of failing with a bare IndexError.
    """
    for entry in sorted(os.listdir(parent)):
        if dirs_only and not os.path.isdir(os.path.join(parent, entry)):
            continue
        if needle in entry:
            return entry
    raise FileNotFoundError(f"No entry containing {needle!r} found in {parent}")


# Sorted so the experiments (and therefore the results) are visited in a stable order.
dirs = sorted(d for d in os.listdir(sq_main_path) if os.path.isdir(os.path.join(sq_main_path, d)))

# Maps experiment folder name -> graph paths, checkpoint path and the two task names.
experiments = {}
for exp_folder in dirs:
    exp_path = os.path.join(sq_main_path, exp_folder)

    # The folder name encodes the task pair as '<task_a>-><task_b>'.
    parts = exp_folder.split('->')
    if len(parts) < 2:
        raise ValueError(
            f"Experiment folder {exp_folder!r} is not of the form '<task_a>-><task_b>'"
        )

    # Only directories qualify as task folders; the checkpoint lookup accepts any entry.
    task_a_folder_path = os.path.join(exp_path, _first_match(exp_path, 'task_a', dirs_only=True))
    task_b_folder_path = os.path.join(exp_path, _first_match(exp_path, 'task_b', dirs_only=True))

    checkpoint_270_a_path = os.path.join(
        task_a_folder_path, _first_match(task_a_folder_path, 'checkpoint-270')
    )
    checkpoint_270_b_path = os.path.join(
        task_b_folder_path, _first_match(task_b_folder_path, 'checkpoint-270')
    )

    experiments[exp_folder] = {
        'task_a_graph': Path(checkpoint_270_a_path) / 'graph.json',
        'task_b_graph': Path(checkpoint_270_b_path) / 'graph.json',
        'checkpoint_path': Path(checkpoints_path) / exp_folder / 'checkpoints' / 'checkpoint-270',
        'task_a': parts[0],
        'task_b': parts[1]
    }

In [None]:
# Model identifiers and locations passed to load_adapter_into_hooked_transformer.
model_name = "EleutherAI/pythia-1.4b-deduped"
transformer_lens_name = "pythia-1.4B-deduped"
model_cache_dir = "/mnt/faster0/rje41/.cache/huggingface"
# Root of the per-task datasets; '<task>/datasets_csv/validation.csv' is read below.
validation_datasets = '/homes/rje41/mech-interp-ft/experiment2/task_set_0'

# Maps experiment name -> metrics of both graphs on task B plus the two ratios.
all_results = {}
# Only these (task_a, task_b) pairs are evaluated; every other experiment is skipped.
task_combos_set = [
    ('Sub','AddSub'),
    ('Add','Sub'),
    ('Abs','Sub'),
    ('AddSub','AddSubAlias'),
    ('FloorDiv','AddSubAlias'),
    ('AddSub','CondAddSub'),
    ('Sub','Modulo'),
    ('Abs','FloorDiv')
]

for exp_name, exp_info in experiments.items():
    task_a_graph = exp_info['task_a_graph']
    task_b_graph = exp_info['task_b_graph']
    checkpoint_path = exp_info['checkpoint_path']
    task_a = exp_info['task_a']
    task_b = exp_info['task_b']
    if (task_a, task_b) not in task_combos_set:
        continue
    print(task_a, task_b)

    # Both graphs are scored on the same task-B validation file, so build the path once.
    valid_file_csv = Path(validation_datasets) / task_b / 'datasets_csv' / 'validation.csv'

    model = load_adapter_into_hooked_transformer(
        adapter_path=checkpoint_path,
        hf_model_name=model_name,
        translens_model_name=transformer_lens_name,
        adapter=True,
        scratch_cache_dir=model_cache_dir,
    )

    try:
        # Task-A graph evaluated on task-B data.
        # NOTE(review): the meaning of loader_n and percentage_prune is defined in
        # utils.eap_utils.run_metrics, which is not visible here.
        result_a_on_b = run_metrics(
            g=Graph.from_json(task_a_graph),
            model=model,
            valid_file_csv=valid_file_csv,
            loader_n=6,
            percentage_prune=0.05
        )

        # Task-B graph evaluated on task-B data.
        result_b_on_b = run_metrics(
            g=Graph.from_json(task_b_graph),
            model=model,
            valid_file_csv=valid_file_csv,
            loader_n=6,
            percentage_prune=0.05
        )
    finally:
        # Release the model even when an evaluation fails. Collect garbage first so
        # that tensors kept alive only by reference cycles are freed before the CUDA
        # caching allocator is asked to give its unused blocks back.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    # The first element of each run_metrics result is the dict holding the metrics.
    # NOTE(review): the KL ratio is b/a while the accuracy ratio is a/b - presumably
    # because a lower KL is better; confirm this inversion is intended.
    # NOTE(review): a zero denominator raises ZeroDivisionError here.
    kl_faithfulness = result_b_on_b[0]['circuit_kl'] / result_a_on_b[0]['circuit_kl']

    accuracy_faithfulness = result_a_on_b[0]['graph_accuracy'] / result_b_on_b[0]['graph_accuracy']

    print('KL_Faithfulness', kl_faithfulness)
    print('Accuracy Faithfulness', accuracy_faithfulness)

    # The complete run_metrics return values are stored (not just element 0). The
    # original cell assigned this key twice and the second assignment always won;
    # only that effective assignment is kept.
    all_results[exp_name] = {
        'task_a_on_task_b': result_a_on_b,
        'task_b_on_task_b': result_b_on_b,
        'kl_faithfulness': kl_faithfulness,
        'accuracy_faithfulness': accuracy_faithfulness
    }


Sub AddSub
Loaded pretrained model pythia-1.4B-deduped into HookedTransformer
Pruned the graph!


100%|██████████| 67/67 [00:54<00:00,  1.23it/s]
100%|██████████| 67/67 [01:24<00:00,  1.26s/it]


Baseline performance: 0.00041834349394775927
Circuit performance: 0.0003865249454975128
Faithfulness: 3.181854845024645e-05
Percentage of model performance achieved by the circuit: 92.39%


100%|██████████| 67/67 [00:49<00:00,  1.35it/s]
100%|██████████| 67/67 [01:01<00:00,  1.09it/s]


Baseline accuracy: 0.9449999928474426
Circuit accuracy: 0.8299999833106995
Pruned the graph!


100%|██████████| 67/67 [00:49<00:00,  1.36it/s]
100%|██████████| 67/67 [01:23<00:00,  1.25s/it]


Baseline performance: 0.00041834349394775927
Circuit performance: 0.0003856968251056969
Faithfulness: 3.2646668842062354e-05
Percentage of model performance achieved by the circuit: 92.20%


100%|██████████| 67/67 [00:48<00:00,  1.37it/s]
100%|██████████| 67/67 [01:01<00:00,  1.08it/s]


Baseline accuracy: 0.9449999928474426
Circuit accuracy: 0.8374999761581421
KL_Faithfulness 0.9978575240706651
Accuracy Faithfulness 0.9910447844048339
FloorDiv AddSubAlias
Loaded pretrained model pythia-1.4B-deduped into HookedTransformer
Pruned the graph!


100%|██████████| 67/67 [00:50<00:00,  1.32it/s]
100%|██████████| 67/67 [01:26<00:00,  1.30s/it]


Baseline performance: 0.0004158198134973645
Circuit performance: 0.0003798893594648689
Faithfulness: 3.593045403249562e-05
Percentage of model performance achieved by the circuit: 91.36%


100%|██████████| 67/67 [00:50<00:00,  1.33it/s]
100%|██████████| 67/67 [01:01<00:00,  1.09it/s]


Baseline accuracy: 0.9449999928474426
Circuit accuracy: 0.7724999785423279
Pruned the graph!


100%|██████████| 67/67 [00:49<00:00,  1.34it/s]
100%|██████████| 67/67 [01:25<00:00,  1.27s/it]


Baseline performance: 0.0004158198134973645
Circuit performance: 0.0003936171415261924
Faithfulness: 2.2202671971172094e-05
Percentage of model performance achieved by the circuit: 94.66%


100%|██████████| 67/67 [00:50<00:00,  1.34it/s]
100%|██████████| 67/67 [01:01<00:00,  1.08it/s]


Baseline accuracy: 0.9449999928474426
Circuit accuracy: 0.8575000166893005
KL_Faithfulness 1.036136263675985
Accuracy Faithfulness 0.9008745930114997
Add Sub
Loaded pretrained model pythia-1.4B-deduped into HookedTransformer
Pruned the graph!


100%|██████████| 67/67 [00:50<00:00,  1.33it/s]
100%|██████████| 67/67 [01:26<00:00,  1.29s/it]


Baseline performance: 0.00042760049109347165
Circuit performance: 0.00038747527287341654
Faithfulness: 4.01252182200551e-05
Percentage of model performance achieved by the circuit: 90.62%


100%|██████████| 67/67 [00:50<00:00,  1.34it/s]
100%|██████████| 67/67 [01:02<00:00,  1.08it/s]


Baseline accuracy: 1.0
Circuit accuracy: 0.9449999928474426
Pruned the graph!


100%|██████████| 67/67 [00:49<00:00,  1.35it/s]
100%|██████████| 67/67 [01:24<00:00,  1.26s/it]


Baseline performance: 0.00042760049109347165
Circuit performance: 0.0003886614285875112
Faithfulness: 3.8939062505960464e-05
Percentage of model performance achieved by the circuit: 90.89%


100%|██████████| 67/67 [00:49<00:00,  1.35it/s]
100%|██████████| 67/67 [01:01<00:00,  1.08it/s]


Baseline accuracy: 1.0
Circuit accuracy: 0.949999988079071
KL_Faithfulness 1.0030612423479268
Accuracy Faithfulness 0.9947368470585578
Abs Sub
Loaded pretrained model pythia-1.4B-deduped into HookedTransformer
Pruned the graph!


100%|██████████| 67/67 [00:50<00:00,  1.32it/s]
100%|██████████| 67/67 [01:25<00:00,  1.28s/it]


Baseline performance: 0.0004521612427197397
Circuit performance: 0.0004230471677146852
Faithfulness: 2.9114075005054474e-05
Percentage of model performance achieved by the circuit: 93.56%


100%|██████████| 67/67 [00:50<00:00,  1.34it/s]
100%|██████████| 67/67 [01:02<00:00,  1.08it/s]


Baseline accuracy: 1.0
Circuit accuracy: 0.9825000166893005
Pruned the graph!


100%|██████████| 67/67 [00:50<00:00,  1.32it/s]
100%|██████████| 67/67 [01:25<00:00,  1.28s/it]


Baseline performance: 0.0004521612427197397
Circuit performance: 0.0004246200551278889
Faithfulness: 2.7541187591850758e-05
Percentage of model performance achieved by the circuit: 93.91%


100%|██████████| 67/67 [00:49<00:00,  1.35it/s]
100%|██████████| 67/67 [01:02<00:00,  1.08it/s]


Baseline accuracy: 1.0
Circuit accuracy: 0.9750000238418579
KL_Faithfulness 1.0037179953755524
Accuracy Faithfulness 1.00769230016825
AddSub AddSubAlias
Loaded pretrained model pythia-1.4B-deduped into HookedTransformer
Pruned the graph!


100%|██████████| 67/67 [00:50<00:00,  1.32it/s]
100%|██████████| 67/67 [01:24<00:00,  1.26s/it]


Baseline performance: 0.0004369760281406343
Circuit performance: 0.00041186832822859287
Faithfulness: 2.5107699912041426e-05
Percentage of model performance achieved by the circuit: 94.25%


100%|██████████| 67/67 [00:49<00:00,  1.34it/s]
100%|██████████| 67/67 [01:02<00:00,  1.07it/s]


Baseline accuracy: 0.9649999737739563
Circuit accuracy: 0.8600000143051147


In [None]:
# Serialize before opening the file: open(..., "w") truncates it immediately, so
# encoding first means a value json cannot encode raises while any previously saved
# all_results.json is still intact, instead of leaving a half-written file behind.
serialized_results = json.dumps(all_results, indent=4)
with open("all_results.json", "w") as f:
    f.write(serialized_results)