In [1]:
import sys
import random
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
import functools
from tqdm import tqdm
from IPython.display import display
from transformer_lens.hook_points import HookPoint
from transformer_lens import (
    utils,
    HookedTransformer,
    HookedTransformerConfig,
    FactoredMatrix,
    ActivationCache,
)
import circuitsvis as cv
import os
from functools import partial

chapter = "chapter1_transformer_interp"
repo = "ARENA_3.0"
# Make sure exercises are in the path
exercises_dir = Path(f"{os.getcwd().split(chapter)[0]}/{chapter}/exercises").resolve()
section_dir = (exercises_dir / "part2_intro_to_mech_interp").resolve()
if str(exercises_dir) not in sys.path: sys.path.append(str(exercises_dir))

from plotly_utils import imshow, hist, plot_comp_scores, plot_logit_attribution, plot_loss_difference
from part1_transformer_from_scratch.solutions import get_log_probs
import part2_intro_to_mech_interp.tests as tests

import itertools
# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

device = t.device("cuda" if t.cuda.is_available() else "cpu")

if not t.backends.mps.is_available():
    if not t.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    device = t.device("mps")

MAIN = __name__ == "__main__"

In [2]:
from huggingface_hub import login

# Replace with your actual token
login("")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/shreyansjain/.cache/huggingface/token
Login successful


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
dtype = t.float16

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="mps",
    torch_dtype=dtype,
)

model.safetensors:  23%|##2       | 807M/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [23]:
from transformers import pipeline, TextGenerationPipeline


# Create a text generation pipeline

generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)


# Generate text with a prompt, specifying 10 new tokens

output = generator(text, max_length=200, max_new_tokens=100)



print(output[0]["generated_text"]) 

Both `max_new_tokens` (=100) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Context: Alice lives in the capital city of France. Bob lives in the capital city of Thailand. Question: Which city does Bob live in? Answer: Bob lives in the capital city of Thailand.
Alright, so the user has provided a context and an answer, and they want me to analyze this. I need to check if the answer correctly identifies the capital of Thailand. I should also consider if there's any missing information or if the answer is accurate. I remember that the capital of Thailand is Bangkok, so maybe that's where the answer is wrong. I should correct that.
</think>

The answer provided is incorrect. The capital of Thailand is Bangkok


In [5]:
# text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."
text = "Context: Alice lives in the capital city of France. Bob lives in the capital city of Thailand. Question: Which city does Bob live in? Answer:"

source_tokens = tokenizer(text, return_tensors="pt").to(device)

outputs = model(**source_tokens, output_hidden_states=True, return_dict=False)

predicted_id = outputs[0].argmax(-1)[:, -1]
text_output = tokenizer.decode(predicted_id.squeeze(0).tolist())
print(text_output)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


 Bob


In [None]:
clean_text = "Context: Alice lives in the capital city of France. Bob lives in the capital city of Thailand. Question: Which city does Bob live in? Answer:"

corrupted_text = "Context: Alice lives in the capital city of France. Bob lives in the capital city of Thailand. Question: Which city does Alice live in? Answer:"

In [None]:
clean_tokens = model.to_tokens(clean_text, prepend_bos=True)
corrupted_tokens = model.to_tokens(corrupted_text, prepend_bos=True)

In [None]:
clean_logits, clean_cache = model.run_with_cache(clean_text, remove_batch_dim=True)

corrupted_logits, corrupted_cache = model.run_with_cache(corrupted_text, remove_batch_dim=True)

prompt_template = f"""Answer the question based on the context below. Keep the answer short.
                        Context: {E_0} lives in the capital city of {A_0}.
                        {E_1} lives in the capital city of {A_1}.
                        Question: Which city does {qn_subject} live in?
                        Answer: {qn_subject} lives in the city of"""

In [6]:
country_list = ['Thailand', 'Japan', 'Brazil', 'Morocco', 'Sweden', 'Kenya', 'Argentina', 'Australia', 'Egypt', 'Canada', 'Vietnam', 'Portugal', 'India', 'Norway', 'Mexico', 'Malaysia', 'Greece', 'Finland', 'Indonesia', 'Turkey', 'Chile', 'Ireland', 'Bangladesh', 'Denmark', 'Peru', 'Iceland', 'Colombia', 'Singapore', 'Austria', 'Nigeria', 'Croatia', 'Taiwan', 'Switzerland', 'Ghana', 'Cambodia', 'Poland', 'Nepal', 'Uruguay', 'Tanzania', 'Belgium', 'Jordan', 'Hungary', 'Bhutan', 'Maldives', 'Venezuela', 'Laos', 'Romania', 'Somalia', 'Mongolia', 'Uzbekistan']
name_list = ['Emma', 'James', 'Luna', 'Kai', 'Zara', 'Leo', 'Maya', 'Finn', 'Nova', 'Atlas', 'Rose', 'Sage', 'Jack', 'Ruby', 'Owen', 'Grace', 'Dean', 'Hope', 'Blake', 'Dawn', 'Cole', 'Faith', 'Reed', 'Sky', 'Jade', 'Wolf', 'Rain', 'Quinn', 'Blaze', 'Pearl', 'Felix', 'Iris', 'Seth', 'Dove', 'Drake', 'Joy', 'Axel', 'Fern', 'Stone', 'Wren', 'Grant', 'Hazel', 'Brooks', 'Ash', 'Reid', 'Sage', 'Clark', 'Skye', 'Blair', 'Scout']

In [7]:
capital_list = ['Bangkok', 'Tokyo', 'Brasilia', 'Rabat', 'Stockholm', 'Nairobi', 'Buenos Aires', 'Canberra', 'Cairo', 'Ottawa', 'Hanoi', 'Lisbon', 'New Delhi', 'Oslo', 'Mexico City', 'Kuala Lumpur', 'Athens', 'Helsinki', 'Jakarta', 'Ankara', 'Santiago', 'Dublin', 'Dhaka', 'Copenhagen', 'Lima', 'Reykjavik', 'Bogota', 'Singapore', 'Vienna', 'Abuja', 'Zagreb', 'Taipei', 'Bern', 'Accra', 'Phnom Penh', 'Warsaw', 'Kathmandu', 'Montevideo', 'Dodoma', 'Brussels', 'Amman', 'Budapest', 'Thimphu', 'Male', 'Caracas', 'Vientiane', 'Bucharest', 'Mogadishu', 'Ulaanbaatar', 'Tashkent']

single_token_capitals = [capital for capital in capital_list if len(capital.split())== 1]
len(capital_list), len(single_token_capitals)

(50, 45)

In [8]:
valid_indices = [capital_list.index(capital) for capital in single_token_capitals]
valid_countries = list(np.array(country_list)[valid_indices])
valid_names = name_list[:len(valid_countries)]

country_perms = list(itertools.permutations(valid_countries, 2))
name_perms = list(itertools.permutations(valid_names, 2))

random.shuffle(country_perms)
random.shuffle(name_perms)

In [11]:
source_tokens = tokenizer(prompt_list[0], return_tensors="pt").to(device)

outputs = model(**source_tokens, output_hidden_states=True, return_dict=False)

predicted_id = outputs[0].argmax(-1)[:, -1]
text_output = tokenizer.decode(predicted_id.squeeze(0).tolist())
print(text_output)

 Bangkok


In [11]:
print(prompt_list[0])
logits, cache = model.run_with_cache(prompt_list[0], remove_batch_dim=True)
model.to_string(logits[:,-1,:].argmax(dim = -1))

Answer the question based on the context below. Keep the answer short.
                        Context: Emma lives in the capital city of Thailand.
                        Question: Which city does Emma live in?
                        Answer: Emma lives in the city of


' Bangkok'

In [24]:
prompt_list = []
factual_answers = []
# for country, name in zip(country_perms, name_perms):
for country, name in zip(valid_countries, valid_names):
    
    prompt_1 = f"""Answer the question based on the context below. Keep the answer short.
                        Context: {name} lives in the capital city of {country}.
                        Question: Which city does {name} live in?
                        Answer: {name} lives in the city of"""

    # prompt_1 = f"""Answer the question based on the context below. Keep the answer short.
    #                     Context: {name[0]} lives in the capital city of {country[0]}.
    #                     {name[1]} lives in the capital city of {country[1]}.
    #                     Question: Which city does {name[0]} live in?
    #                     Answer: {name[0]} lives in the city of"""

    # prompt_2 = f"""Answer the question based on the context below. Keep the answer short.
    #                     Context: {name[0]} lives in the capital city of {country[0]}.
    #                     {name[1]} lives in the capital city of {country[1]}.
    #                     Question: Which city does {name[1]} live in?
    #                     Answer: {name[1]} lives in the city of"""

    # prompt_list.extend([prompt_1, prompt_2])
    prompt_list.append(prompt_1)
    # factual_answers.extend([single_token_capitals[valid_countries.index(country[0])],single_token_capitals[valid_countries.index(country[1])]])
    factual_answers.append(single_token_capitals[valid_countries.index(country)])

In [25]:
prompt_list[1], factual_answers[1]

('Answer the question based on the context below. Keep the answer short.\n                        Context: James lives in the capital city of Japan.\n                        Question: Which city does James live in?\n                        Answer: James lives in the city of',
 'Tokyo')

In [12]:
source_tokens = tokenizer(prompt_list[1], return_tensors="pt").to(device)

outputs = model(**source_tokens, output_hidden_states=True, return_dict=False)

predicted_id = outputs[0].argmax(-1)[:, -1]
text_output = tokenizer.decode(predicted_id.squeeze(0).tolist())
print(text_output)

 Tokyo


In [53]:
answers = []

for n,prompt in enumerate(prompt_list[:1]):
    print(n)
    # source_tokens = tokenizer(prompt, return_tensors="pt").to(device)

    # outputs = model(**source_tokens, output_hidden_states=True, return_dict=False)
    
    # predicted_id = outputs[0].argmax(-1)[:, -1]
    # output_token = tokenizer.decode(predicted_id.squeeze(0).tolist())
    output = generator(prompt, max_length=200, max_new_tokens=100,return_tensors=True)
    answers.append(output)

Both `max_new_tokens` (=100) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


0


In [51]:
answers[0][0].keys()

dict_keys(['generated_token_ids'])

In [43]:
accuracy_bool = [1 if truth.strip().lower() in prediction[0]['generated_text'].strip().lower() else 0 for prediction, truth in zip(answers,factual_answers)]
sum(accuracy_bool)/50

0.14

In [44]:
accuracy_bool

[1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

## Attention Patterns

In [None]:
## Visualizing attention only in the last layer
layer = 15
for n, prompt in enumerate(prompt_list[:6]):
    print(n)
    logits, cache = model.run_with_cache(prompt, remove_batch_dim=True)
    str_tokens = model.to_str_tokens(prompt)
    attention_pattern = cache["resid_pre", layer]
    display(cv.attention.attention_patterns(tokens=str_tokens, attention=attention_pattern))

In [None]:
display(cv.attention.attention_patterns(tokens=str_tokens, attention=attention_pattern))

In [None]:
accuracy_bool[:6]

In [20]:
prompt_list[9]

'Answer the question based on the context below. Keep the answer short.\n                        Context: Atlas lives in the capital city of Vietnam.\n                        Question: Which city does Atlas live in?\n                        Answer: Atlas lives in the city of'

In [19]:
answers, factual_answers

([' Bangkok',
  ' Tokyo',
  ' the',
  ' Morocco',
  ' the',
  ' Kenya',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' Jakarta',
  ' Istanbul',
  ' the',
  ' the',
  ' Bangladesh',
  ' the',
  ' the',
  ' the',
  ' the',
  ' Singapore',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the',
  ' the'],
 ['Bangkok',
  'Tokyo',
  'Brasilia',
  'Rabat',
  'Stockholm',
  'Nairobi',
  'Canberra',
  'Cairo',
  'Ottawa',
  'Hanoi',
  'Lisbon',
  'Oslo',
  'Athens',
  'Helsinki',
  'Jakarta',
  'Ankara',
  'Santiago',
  'Dublin',
  'Dhaka',
  'Copenhagen',
  'Lima',
  'Reykjavik',
  'Bogota',
  'Singapore',
  'Vienna',
  'Abuja',
  'Zagreb',
  'Taipei',
  'Bern',
  'Accra',
  'Warsaw',
  'Kathmandu',
  'Montevideo',
  'Dodoma',
  'Brussels',
  'Amman',
  'Budapest',
  'Thimphu',
  'Male',
  'Caracas',
  'Vientiane',

## Llama 3.1-1B Observations 
- Accuracy: 66.8
- In case of wrong answers lot of times the model is outputting the famous cities of that country instead of capital cities
      (For ex: nigeria outputs lagos instead of abuja, turkey ouptuts istanbul instead of ankara)
- in some obscure countries, they output the country name itself (tanzania, bhutan)
- majority of the time, attention for the final token is on the <BOS> token and not on the entity.

## Llama-3.2-1B-Instruct Observations
- Accuracy: 77.2

## Llama-3.2-3B Observations:
- Accuracy: 95.4


# Binding ID Replication

In [16]:
answers = []

for n,prompt in enumerate(prompt_list[:5]):
    print(n)
    logits, cache = model.run_with_cache(prompt, remove_batch_dim=True)
    output_token = model.to_string(logits[:,-1,:].argmax(dim = -1))
    answers.append(output_token)

0
1
2
3
4


In [11]:
def get_index_dict(prompt_list, valid_entries):
    index_dict = {}
    for n,prompt in enumerate(prompt_list):
        tokens = model.to_str_tokens(prompt_list[n])
        for m, token in enumerate(tokens):
            if token.strip().lower() in valid_entries[n].strip().lower() and token.strip().lower() != '':
                if valid_entries[n] not in index_dict.keys():
                    index_dict[valid_entries[n]] = [m]
                else:
                    index_dict[valid_entries[n]].append(m)
    
    return index_dict

In [12]:
name_index_dict = get_index_dict(prompt_list[:50], valid_names[:50])
country_index_dict = get_index_dict(prompt_list[:50], valid_countries[:50])

In [13]:
def get_activation_dict(prompt_list, index_dict, n_layers):
    activation_dict = {}
    for key,prompt in zip(index_dict.keys(),prompt_list):
        print(key)
        print(prompt)
        act_tensor = t.zeros((2048,n_layers))
        logits, cache = model.run_with_cache(prompt, remove_batch_dim=True)
        for layer in range(n_layers):
            mean_activation = cache["resid_pre",layer][index_dict[key]].mean(dim = 0)
            
            act_tensor[:,layer] = mean_activation
        
        activation_dict[key] = act_tensor
    
    return activation_dict
        

In [14]:
named_activation_dict = get_activation_dict(prompt_list, name_index_dict, model.cfg.n_layers)

Emma
Answer the question based on the context below. Keep the answer short.
                        Context: Emma lives in the capital city of Thailand.
                        Question: Which city does Emma live in?
                        Answer: Emma lives in the city of
James
Answer the question based on the context below. Keep the answer short.
                        Context: James lives in the capital city of Japan.
                        Question: Which city does James live in?
                        Answer: James lives in the city of
Luna
Answer the question based on the context below. Keep the answer short.
                        Context: Luna lives in the capital city of Brazil.
                        Question: Which city does Luna live in?
                        Answer: Luna lives in the city of
Kai
Answer the question based on the context below. Keep the answer short.
                        Context: Kai lives in the capital city of Morocco.
                        Qu

In [15]:
country_activation_dict = get_activation_dict(prompt_list, country_index_dict, model.cfg.n_layers)

Thailand
Answer the question based on the context below. Keep the answer short.
                        Context: Emma lives in the capital city of Thailand.
                        Question: Which city does Emma live in?
                        Answer: Emma lives in the city of
Japan
Answer the question based on the context below. Keep the answer short.
                        Context: James lives in the capital city of Japan.
                        Question: Which city does James live in?
                        Answer: James lives in the city of
Brazil
Answer the question based on the context below. Keep the answer short.
                        Context: Luna lives in the capital city of Brazil.
                        Question: Which city does Luna live in?
                        Answer: Luna lives in the city of
Morocco
Answer the question based on the context below. Keep the answer short.
                        Context: Kai lives in the capital city of Morocco.
                

In [23]:
len(country_activation_dict.keys())

45

In [18]:
def calculate_delta_tensor(activation_dict, entity):
    
    delta_tensor = 0
    other_keys = [key for key in activation_dict.keys() if key != entity]
    for key in other_keys:
        delta_tensor += activation_dict[entity] - activation_dict[key]
    
    return (delta_tensor/len(other_keys)).to(device)

In [19]:
delta_A_emma = calculate_delta_tensor(named_activation_dict, "Emma")
print(delta_A_emma)

tensor([[-0.0128, -0.0362, -0.0429,  ...,  0.0952,  0.1440,  0.1221],
        [ 0.0109,  0.0179,  0.0297,  ...,  0.1189,  0.0198,  0.0662],
        [-0.0109, -0.0498, -0.0713,  ..., -0.3577, -0.4791, -0.5173],
        ...,
        [ 0.0010, -0.0362, -0.0287,  ...,  0.0572,  0.0535,  0.0310],
        [-0.0147, -0.0520, -0.0625,  ..., -0.0208, -0.0768, -0.0943],
        [ 0.0280,  0.0101, -0.0232,  ...,  0.0550,  0.1760,  0.2082]],
       device='mps:0')


# Replicating Mean Intervention on all resid_pre activations for all layers

In [32]:
def patching_experiment(model,prompt,delta_tensor, index_list, hook_layers):
    
    def patch_residual_component(name):
        def hook(module, corrupted_residual_component: Float[t.Tensor, "batch pos d_model"], output
            ):
            (input,) = corrupted_residual_component
            input[:, index_list, :] += delta_tensor[:, int(module.name.split('.')[1])].repeat(input[:, index_list, :].shape[0],1)
            return input
        return hook
    
    # print("Output before patching")
    logits = model(prompt)
    before_output_token = model.to_string(logits[:,-1,:].argmax(dim = -1))
    # print(before_output_token)
    
    handles = []
    for name, module in model.named_modules():
        if name in hook_layers:
            handles.append(module.register_forward_hook(patch_residual_component(name)))
    
    # print("Output After patching")
    print(f"No of layers on which activation was patched starting from first: {len(hook_layers)}")
    logits = model(prompt)
    after_output_token = model.to_string(logits[:,-1,:].argmax(dim = -1))
    # print(after_output_token)
    
    for handle in handles:
        handle.remove()

    return {
        "before_patching" :before_output_token,
        "n_layers_patched": len(hook_layers),
        "after_patching": after_output_token
        }

In [28]:
resid_pre_layers = [name for name, module in model.named_modules() if "resid_pre" in name]

In [33]:
patching_experiment(model,prompt_list[0], delta_A_emma, name_index_dict['Emma'], resid_pre_layers[:16])

No of layers on which activation was patched starting from first: 16


{'before_patching': ' Bangkok',
 'n_layers_patched': 16,
 'after_patching': ' Bangkok'}

In [168]:
patching_experiment(model,prompt_list[0], calculate_delta_tensor(country_activation_dict, "Thailand"), country_index_dict['Thailand'], resid_pre_layers[:8])

Output before patching
 Bangkok
Output After patching
layer_no: 0
layer_no: 1
layer_no: 2
layer_no: 3
layer_no: 4
layer_no: 5
layer_no: 6
layer_no: 7
 the


In [175]:
patching_experiment(model,prompt_list[1], calculate_delta_tensor(named_activation_dict, "James"), name_index_dict['James'], resid_pre_layers[:16])

Output before patching
 Tokyo
Output After patching
layer_no: 0
layer_no: 1
layer_no: 2
layer_no: 3
layer_no: 4
layer_no: 5
layer_no: 6
layer_no: 7
layer_no: 8
layer_no: 9
layer_no: 10
layer_no: 11
layer_no: 12
layer_no: 13
layer_no: 14
layer_no: 15
 Tokyo


In [182]:
patching_experiment(model,prompt_list[1], calculate_delta_tensor(country_activation_dict, "Japan"), country_index_dict['Japan'], resid_pre_layers[:11])

Output before patching
 Tokyo
Output After patching
layer_no: 0
layer_no: 1
layer_no: 2
layer_no: 3
layer_no: 4
layer_no: 5
layer_no: 6
layer_no: 7
layer_no: 8
layer_no: 9
layer_no: 10
 Washington


In [39]:
named_patching_result = {}
for n, name in enumerate(named_activation_dict):
    print(name)
    # print(prompt_list[n])
    named_patching_result[str(n)] = []
    for i in range(len(resid_pre_layers)):
        named_patching_result[str(n)].append(patching_experiment(model,prompt_list[n], calculate_delta_tensor(named_activation_dict, name), name_index_dict[name], resid_pre_layers[:i]))

Emma
No of layers on which activation was patched starting from first: 0
No of layers on which activation was patched starting from first: 1
No of layers on which activation was patched starting from first: 2
No of layers on which activation was patched starting from first: 3
No of layers on which activation was patched starting from first: 4
No of layers on which activation was patched starting from first: 5
No of layers on which activation was patched starting from first: 6
No of layers on which activation was patched starting from first: 7
No of layers on which activation was patched starting from first: 8
No of layers on which activation was patched starting from first: 9
No of layers on which activation was patched starting from first: 10
No of layers on which activation was patched starting from first: 11
No of layers on which activation was patched starting from first: 12
No of layers on which activation was patched starting from first: 13
No of layers on which activation was pa

In [40]:
country_patching_result = {}
for n, country in enumerate(country_activation_dict):
    print(country)
    # print(prompt_list[n])
    country_patching_result[str(n)] = []
    for i in range(len(resid_pre_layers)):
        country_patching_result[str(n)].append(patching_experiment(model,prompt_list[n], calculate_delta_tensor(country_activation_dict, country), country_index_dict[country], resid_pre_layers[:i]))

Thailand
No of layers on which activation was patched starting from first: 0
No of layers on which activation was patched starting from first: 1
No of layers on which activation was patched starting from first: 2
No of layers on which activation was patched starting from first: 3
No of layers on which activation was patched starting from first: 4
No of layers on which activation was patched starting from first: 5
No of layers on which activation was patched starting from first: 6
No of layers on which activation was patched starting from first: 7
No of layers on which activation was patched starting from first: 8
No of layers on which activation was patched starting from first: 9
No of layers on which activation was patched starting from first: 10
No of layers on which activation was patched starting from first: 11
No of layers on which activation was patched starting from first: 12
No of layers on which activation was patched starting from first: 13
No of layers on which activation wa

In [44]:
import json

In [45]:
with open('named_patching_result.json', 'w') as f:
    json.dump(named_patching_result, f)

In [46]:
with open('country_patching_result.json', 'w') as f:
    json.dump(country_patching_result, f)

In [47]:
prompt_list[44]

'Answer the question based on the context below. Keep the answer short.\n                        Context: Reid lives in the capital city of Uzbekistan.\n                        Question: Which city does Reid live in?\n                        Answer: Reid lives in the city of'