# Introspection


## Imports and setup

In [1]:
%%capture
import sys
import os
# -- < fix for plotly > --
# note: you have to ü§¨ restart the runtime *once* for this to work. Wtf?
!pip install gguf
!pip install --upgrade numpy
!pip install torch transformers
!pip install nnsight
sys.path.append(os.path.abspath('.'))
# -- <\ fix for plotly > --

# run in colab or locally
try:
    import google.colab  # type: ignore
    from google.colab import output

    colab = True
    %pip install sae-lens transformer-lens sae-dashboard
except:
    colab = False
    from IPython import get_ipython  # type: ignore

    ipython = get_ipython()
    assert ipython is not None
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

# standard imports
import re
import json
import datetime
from tqdm import tqdm
from typing import List
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# gpu -- faster when not necessary
torch.set_grad_enabled(False)
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

# check torch version
print(torch.__version__)
print(transformers.__version__)

In [2]:
if colab:
    from google.colab import output, drive
    output.enable_custom_widget_manager()

In [3]:
drive.mount('/content/drive')
# paths
github_username = 'samj-ai'
repo_name = 'repeng'
drive_path = f'/content/{repo_name}'

# clone and change to repo path
!rm -rf {drive_path}
print(f"Cloning from https://github.com/{github_username}/{repo_name}.git...")
!git clone https://github.com/{github_username}/{repo_name}.git {drive_path}
if os.path.exists(drive_path):
    os.chdir(drive_path)
    print(f"Current directory: {os.getcwd()}")

# Add repo to sys path
if drive_path not in sys.path:
    sys.path.append(drive_path)
sys.path.insert(0, os.getcwd())

Mounted at /content/drive
Cloning from https://github.com/samj-ai/repeng.git...
Cloning into '/content/repeng'...
remote: Enumerating objects: 220, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 220 (delta 78), reused 57 (delta 57), pack-reused 118 (from 1)[K
Receiving objects: 100% (220/220), 329.38 KiB | 15.68 MiB/s, done.
Resolving deltas: 100% (130/130), done.
Current directory: /content/repeng


## Helper functions

In [8]:
# helper display functions

def format(prompt, remove_bos=False):
    messages = [
    {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # removes '<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú>'
    # note: different for other tokenizers !!
    if remove_bos:
        text = text[21:]
    return text
format('Hello!', remove_bos=True)

def outputs_to_text(outputs):
    outputs_tensor = torch.stack(outputs).squeeze()
    outputs_tokens = model.tokenizer.batch_decode(outputs_tensor)
    return ''.join(outputs_tokens)

def wrap_string(text, width=80):
    """ Wrap text to a certain width. Note: this version
        also preserves newline characters, unlike textwrap.wrap()."""
    import textwrap
    # Split the text by newlines first
    lines = text.split('\n')
    # Wrap each line individually
    wrapped_lines = []
    for line in lines:
        # Only wrap non-empty lines
        if line:
            wrapped_lines.extend(textwrap.wrap(line, width=width))
        else:
            # Preserve empty lines
            wrapped_lines.append('')
    # Join the wrapped lines back with newlines
    return '\n'.join(wrapped_lines)

def print_output(text, width=80):
    if isinstance(text, List) and isinstance(text[0], torch.Tensor):
        text = outputs_to_text(text)
    print(wrap_string(text))
    return

def format_math(text):
    """More readable formatting for math in colab"""
    formatted_text = re.sub(r'\\(\[)([\s\S]*?)\\(\])', r'$$\2$$', text)
    formatted_text = re.sub(r'\\(\()(.*?)\\(\))', r'$\2$', formatted_text)
    return formatted_text

In [5]:
import json
from datetime import datetime
from pathlib import Path

class ExperimentLogger:
    def __init__(self, log_dir="control_vector_experiments"):
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(exist_ok=True)
        
        # Create timestamped file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_file = self.log_dir / f"experiments_{timestamp}.jsonl"
        
    def log_result(self, steering_word, layers, strength, prompt, output, 
                   logit_lens_data=None, notes=""):
        """Log a single experiment result."""
        result = {
            "timestamp": datetime.now().isoformat(),
            "steering_word": steering_word,
            "layers": layers,
            "strength": strength,
            "prompt": prompt,
            "output": output,
            "logit_lens": logit_lens_data,
            "notes": notes,
        }
        
        # Append to file (atomic operation)
        with open(self.log_file, 'a') as f:
            f.write(json.dumps(result) + '\n')
        
        return result
    
    def read_all(self):
        """Read all logged experiments."""
        if not self.log_file.exists():
            return []
        
        results = []
        with open(self.log_file, 'r') as f:
            for line in f:
                results.append(json.loads(line))
        return results
    
    def query(self, steering_word=None, layers=None, min_strength=None):
        """Filter logged experiments."""
        results = self.read_all()
        
        if steering_word:
            results = [r for r in results if r['steering_word'] == steering_word]
        if layers:
            results = [r for r in results if r['layers'] == layers]
        if min_strength:
            results = [r for r in results if r['strength'] >= min_strength]
        
        return results

## Load model and get control vectors

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# can also load another 8B
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer.pad_token_id = 0
model = model.to(device)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/7.39G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [7]:
baseline_words = "Desks, Jackets, Gondolas, Laughter, Intelligence, Bicycles, Chairs, Orchestras, Sand, Pottery, Arrowheads, Jewelry, Daffodils, Plateaus, Estuaries, Quilts, Moments, Bamboo, Ravines, Archives, Hieroglyphs, Stars, Clay, Fossils, Wildlife, Flour, Traffic, Bubbles, Honey, Geodes, Magnets, Ribbons, Zigzags, Puzzles, Tornadoes, Anthills, Galaxies, Poverty, Diamonds, Universes, Vinegar, Nebulae, Knowledge, Marble, Fog, Rivers, Scrolls, Silhouettes, Marbles, Cakes, Valleys, Whispers, Pendulums, Towers, Tables, Glaciers, Whirlpools, Jungles, Wool, Anger, Ramparts, Flowers, Research, Hammers, Clouds, Justice, Dogs, Butterflies, Needles, Fortresses, Bonfires, Skyscrapers, Caravans, Patience, Bacon, Velocities, Smoke, Electricity, Sunsets, Anchors, Parchments, Courage, Statues, Oxygen, Time, Butterflies, Fabric, Pasta, Snowflakes, Mountains, Echoes, Pianos, Sanctuaries, Abysses, Air, Dewdrops, Gardens, Literature, Rice, Enigmas".lower().split(", ")
test_words = "Dust, Satellites, Trumpets, Origami, Illusions, Cameras, Lightning, Constellations, Treasures, Phones, Trees, Avalanches, Mirrors, Fountains, Quarries, Sadness, Xylophones, Secrecy, Oceans, Information, Deserts, Kaleidoscopes, Sugar, Vegetables, Poetry, Aquariums, Bags, Peace, Caverns, Memories, Frosts, Volcanoes, Boulders, Harmonies, Masquerades, Rubber, Plastic, Blood, Amphitheaters, Contraptions, Youths, Dynasties, Snow, Dirigibles, Algorithms, Denim, Monoliths, Milk, Bread, Silver".lower().split(", ")
baseline_words[:5], test_words[:5]

(['desks', 'jackets', 'gondolas', 'laughter', 'intelligence'],
 ['dust', 'satellites', 'trumpets', 'origami', 'illusions'])

In [9]:
# record mean baseline
settings = {
    "pad_token_id": tokenizer.eos_token_id,  # silence warning
    # "do_sample": False,  # temperature=0, inappropriate for R1
    "temperature": 0.6, # recommended temperature setting
    "max_new_tokens": 1,
    "repetition_penalty": 1.1,  # reduce control jank
    "output_hidden_states": True,
    "return_dict_in_generate": True
}
baseline_activations = []
for bw in baseline_words:
    prompt = f"Tell me about {bw}."
    prompt_formatted = format(prompt, remove_bos=True)
    input_ids = tokenizer(prompt_formatted, return_tensors="pt").to(model.device)
    response = model.generate(**input_ids, **settings)
    # List[batch_size, n_layers](Tensor:shape(batch_size, n_tokens, dim))
    layers = len(response.hidden_states[0])
    layer_activations_list = []
    for layer in range(layers):
        layer_acts = response.hidden_states[0][layer][0,-2]
        layer_activations_list.append(layer_acts)
    layer_activations = torch.stack(layer_activations_list)
    baseline_activations.append(layer_activations)

print(len(baseline_activations))
print(baseline_activations[-1].shape)

100
torch.Size([33, 4096])


In [10]:
# get mean bsaeline activations
baseline_mean_activations = torch.mean(torch.stack(baseline_activations), dim=0)
baseline_mean_activations.shape

torch.Size([33, 4096])

In [11]:
# get test activations

test_activations = []
for tw in test_words:
    prompt = f"Tell me about {tw}."
    prompt_formatted = format(prompt, remove_bos=True)
    input_ids = tokenizer(prompt_formatted, return_tensors="pt").to(model.device)
    response = model.generate(**input_ids, **settings)
    # List[batch_size, n_layers](Tensor:shape(batch_size, n_tokens, dim))
    layers = len(response.hidden_states[0])
    layer_activations_list = []
    for layer in range(layers):
        layer_acts = response.hidden_states[0][layer][0,-2]
        layer_activations_list.append(layer_acts)
    layer_activations = torch.stack(layer_activations_list)
    test_activations.append(layer_activations)

settings["max_new_tokens"] = 100 # reset from cv extraction settings

print(len(test_activations))
print(test_activations[-1].shape)

50
torch.Size([33, 4096])


In [12]:
control_vectors = [ta - baseline_mean_activations for ta in test_activations]

In [34]:
# generate controlled outputs
# optional extended response -- tends to be much preamble
settings['max_new_tokens'] = 400
prompt = f"What's on your mind right now?"
prompt_formatted = format(prompt, remove_bos=True)
inputs = tokenizer(prompt_formatted, return_tensors="pt").to(model.device)

In [None]:
test_normal_output = False
if test_normal_output:
    outputs = model.generate(**inputs, **settings)
    text_outputs = tokenizer.decode(outputs[0][0])
    print_output(format(text_outputs))

In [16]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

## Apply control vectors and log results

In [17]:
def make_debug_hook(layer_name):
    def hook_fn(module, input, output):
        print(f"\n=== Layer: {layer_name} ===")
        print(f"Output type: {type(output)}")
        if isinstance(output, tuple):
            print(f"Tuple length: {len(output)}")
            for i, item in enumerate(output):
                print(f"  Element {i}: {type(item)}, shape: {getattr(item, 'shape', 'N/A')}")
        elif isinstance(output, torch.Tensor):
            print(f"Tensor shape: {output.shape}")
        return output
    return hook_fn

def remove_all_hooks(model):
    """Remove all hooks from a model."""
    for module in model.modules():
        module._forward_hooks.clear()
        module._forward_pre_hooks.clear()
        module._backward_hooks.clear()

In [18]:
debug = False
if debug:
    remove_all_hooks(model)

    handles = []
    for i, layer in enumerate(model.model.layers):
        handle = layer.register_forward_hook(make_debug_hook(f"layer_{i}"))
        handles.append(handle)

    # Run a forward pass
    with torch.no_grad():
        inputs = tokenizer("test", return_tensors="pt").to(device)
        model(**inputs)

    handle.remove()

In [19]:
class ControlVectorHooks:
    def __init__(self, model, control_vector, layer_indices, 
                 strength=1.0, normalize_by_layers=True,
                 apply_to_positions=None, apply_to_gen_steps=None):
        """
        Args:
            apply_to_positions: tuple (start, end) or "prompt_only" or "generation_only"
            apply_to_gen_steps: tuple (start, end) for which generation steps to apply
                               e.g., (0, 10) means first 10 generated tokens
        """
        self.model = model
        self.control_vector = control_vector
        self.layer_indices = layer_indices
        self.handles = []
        
        if normalize_by_layers:
            self.effective_strength = strength / len(layer_indices)
        else:
            self.effective_strength = strength
        
        # Position control
        self.apply_to_positions = apply_to_positions
        
        # Generation step control
        self.apply_to_gen_steps = apply_to_gen_steps
        self.current_gen_step = 0
        self.initial_seq_len = None  # Set on first forward pass
    
    def should_apply(self, seq_len):
        """Determine if we should apply the control vector."""
        # Track generation steps
        if self.initial_seq_len is None:
            self.initial_seq_len = seq_len
            self.current_gen_step = 0
        else:
            # Generation step = how many tokens we've generated
            self.current_gen_step = seq_len - self.initial_seq_len
        
        # Check generation step constraint
        if self.apply_to_gen_steps is not None:
            start, end = self.apply_to_gen_steps
            if not (start <= self.current_gen_step < end):
                return False, None
        
        # Determine which positions to apply to
        if self.apply_to_positions == "prompt_only":
            # Only apply to initial prompt tokens
            if self.current_gen_step > 0:
                return False, None
            return True, slice(None)  # All positions
        
        elif self.apply_to_positions == "generation_only":
            # Only apply to newly generated tokens
            if self.current_gen_step == 0:
                return False, None
            # Apply only to positions after prompt
            return True, slice(self.initial_seq_len, None)
        
        elif isinstance(self.apply_to_positions, tuple):
            # Specific position range
            start, end = self.apply_to_positions
            return True, slice(start, end)
        
        else:
            # Apply to all positions
            return True, slice(None)
    
    def make_hook(self, control_vec, strength):
        def hook_fn(module, input, output):
            hidden_states = output  # [batch, seq_len, hidden_dim]
            
            seq_len = hidden_states.shape[1]
            should_apply, position_slice = self.should_apply(seq_len)
            
            if not should_apply:
                return output
            
            # Apply to selected positions
            modified = hidden_states.clone()
            scaled_vec = control_vec.to(hidden_states.device) * strength
            
            if position_slice == slice(None):
                # Apply to all positions
                modified = modified + scaled_vec
            else:
                # Apply to specific positions
                modified[:, position_slice, :] = (
                    modified[:, position_slice, :] + scaled_vec
                )
            
            return modified
        
        return hook_fn
    
    def register(self):
        self.remove()  # Clear existing
        self.current_gen_step = 0
        self.initial_seq_len = None
        
        for layer_idx in self.layer_indices:
            layer = self.model.model.layers[layer_idx]
            handle = layer.register_forward_hook(
                self.make_hook(self.control_vector, self.effective_strength)
            )
            self.handles.append(handle)
    
    def remove(self):
        for handle in self.handles:
            handle.remove()
        self.handles = []
    
    def __enter__(self):
        self.register()
        return self
    
    def __exit__(self, *args):
        self.remove()

In [20]:
class LogitLens:
    """ Model wrapper for easy logit lens.
        Usage: logit_lens = LogitLens(model, tokenizer)
    """
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.logits_by_layer = []
        self.handles = []
    
    def hook_fn(self, layer_idx):
        def hook(module, input, output):
            # Get hidden states
            hidden = output
            
            # Project to vocabulary (use the model's lm_head)
            logits = self.model.lm_head(hidden)
            
            # Store with layer info
            self.logits_by_layer.append({
                'layer': layer_idx,
                'logits': logits.detach()
            })
            return output
        return hook
    
    def __enter__(self):
        self.logits_by_layer = []
        # Hook all layers
        for i, layer in enumerate(self.model.model.layers):
            handle = layer.register_forward_hook(self.hook_fn(i))
            self.handles.append(handle)
        return self
    
    def __exit__(self, *args):
        for handle in self.handles:
            handle.remove()
        self.handles = []
    
    def get_top_tokens(self, position=-1, k=5, layers=None):
        """Get top-k predicted tokens at a position across layers.
        
        Args:
            position: Token position (-1 for last)
            k: Number of top tokens to return
            layers: List of layer indices to examine (None = all)
        """
        if layers is None:
            layers = range(len(self.logits_by_layer))
        
        results = []
        for layer_data in self.logits_by_layer:
            layer_idx = layer_data['layer']
            if layer_idx not in layers:
                continue
            
            logits = layer_data['logits'][0, position, :]  # [vocab_size]
            top_k = torch.topk(logits, k)
            
            tokens = [self.tokenizer.decode([idx]) for idx in top_k.indices]
            probs = torch.softmax(logits, dim=-1)[top_k.indices]
            
            results.append({
                'layer': layer_idx,
                'tokens': list(zip(tokens, probs.tolist()))
            })
        
        return results

In [25]:
# This is WORSE than the ControlVector class defined above !!
def apply_control_vector(control_vector, target_layers, strength, model):

    # TODO
    # -- appropriate scaling e.g., by layer
    # -- remember to add functionality to specify control duration for multi-token generation as well as portion of prompt to apply to
    # -- remove hooks after generation
    # NOTE
    # -- may want to remove hooks after
    # -- FINAL: WORSE than the Class above
    
    remove_all_hooks(model)
    hook_handles = []

    for layer_idx in target_layers:
        def make_hook(control_vec):
            def hook_fn(module, input, output):
                hidden_states = output
                # Add control vector to all positions
                modified = hidden_states + strength * control_vec[layer_idx].to(hidden_states.device)
                return modified
                # return (modified,) + output[1:]
            return hook_fn
        
        layer = model.model.layers[layer_idx]
        handle = layer.register_forward_hook(make_hook(control_vector))
        hook_handles.append(handle)

In [21]:
log_dir = '/content/drive/MyDrive/Colab Notebooks/control_vector_experiments'
logger = ExperimentLogger(log_dir=log_dir)

In [26]:
test_concept_idx = 0
control_vector = control_vectors[test_concept_idx]
steering_word = test_words[test_concept_idx]
print(f'steering towards: {steering_word}')

# Choose which layers to apply to (often middle-to-late layers work best)
# target_layers = [15, 16, 17, 18]
strength = 1.0
target_layers = [15, 16, 17, 18]

steering towards: dust


In [None]:
# improved steered generation
with LogitLens(model, tokenizer) as lens:
    # optional: add apply_to_gen_steps=(0, 10) to only apply to the first 10 steps of generation
    # or also: add apply_to_positions=(0, 10) to only apply to the first 10 positions of the output sequence
    
    with ControlVectorHooks(model, control_vector, [15, 16, 17, 18]) as hooks:
        # Both are active here
        outputs = model.generate(**input_ids, **settings)
        # Control vector hooks removed here
    
    # But logit lens still has its data
    results = lens.get_top_tokens()
# Logit lens hooks removed here

In [28]:
# Test LogitLens alone
print("Testing LogitLens...")
with LogitLens(model, tokenizer) as lens:
    outputs = model.generate(**input_ids, **settings)
    results = lens.get_top_tokens()
    print(f"Got {len(results)} layer results")

Testing LogitLens...
Got 1248 layer results


In [33]:
outputs[0].shape

torch.Size([1, 52])

In [29]:
results

[{'layer': 0,
  'tokens': [('oad', 8.419391633651685e-06),
   ('ipa', 8.365768735529855e-06),
   ('wik', 8.348865776497405e-06),
   ('odor', 8.348190021933988e-06),
   ('adero', 8.342523869941942e-06)]},
 {'layer': 1,
  'tokens': [('aliz', 8.614973012299743e-06),
   ('infeld', 8.613393220002763e-06),
   ('wik', 8.556216926081106e-06),
   ("'gc", 8.545696800865699e-06),
   ('acha', 8.529069418727886e-06)]},
 {'layer': 2,
  'tokens': [("'gc", 9.113293344853446e-06),
   ('ÔøΩÔøΩ', 8.888743650459219e-06),
   ('lique', 8.86435645952588e-06),
   ('arken', 8.852241990098264e-06),
   ('ie≈º', 8.84015935298521e-06)]},
 {'layer': 3,
  'tokens': [('emm', 9.672759006207343e-06),
   ("'gc", 9.585296538716648e-06),
   ('venta', 9.52342088567093e-06),
   ('—Å—Ç–µ', 9.388225407747086e-06),
   ('.scalablytyped', 9.355138899991289e-06)]},
 {'layer': 4,
  'tokens': [("'gc", 1.1697408808686305e-05),
   ('#ab', 1.0412344636279158e-05),
   ('—Å—Ç–µ', 1.021355637931265e-05),
   ('ŒªŒµŒ∫', 1.0205737453361508e

In [145]:
# steered generation

logit_lens = LogitLens(model, tokenizer) # wrap model in logit lens
steered_outputs = model.generate(**input_ids, **settings)
steered_outputs_text = tokenizer.decode(steered_outputs[0][0])
print_output(steered_outputs_text)

<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú><ÔΩúUserÔΩú>What's on your mind right now?<ÔΩúAssistantÔΩú><think>
I'm sorry to hear about the loss of your father. That must be really tough for
you. If you need someone to talk to, I'm here to listen. It's okay to feel sad
or angry; those feelings are normal in the face of loss.
</think>

I'm sorry to hear about the loss of your father. That must be really tough for
you. If you need someone to talk to, I'm here to listen. It's okay to feel sad
or angry; those feelings are normal in the face of loss.<ÔΩúend‚ñÅof‚ñÅsentenceÔΩú>


In [150]:
log_entry = logger.log_result(steering_word, 
                  target_layers, 
                  strength, 
                  prompt_formatted, 
                  steered_outputs_text)

In [1]:
1 + 3

4

### Sweep over strengths (for "dust")

In [140]:
# sweep over strengths
strengths = [-2.0, -1.0, -0.5, -0.2, 0.2, 0.5, 1.0, 2.0]

test_concept_idx = 0
control_vector = control_vectors[test_concept_idx]
steering_word = test_words[test_concept_idx]
print(f'steering towards: {steering_word}')
target_layers = [15, 16, 17, 18]

for strength in strengths:
    apply_control_vector(control_vector, target_layers, strength, model)
    steered_outputs = model.generate(**input_ids, **settings)
    steered_outputs_text = tokenizer.decode(steered_outputs[0][0])
    log_entry = logger.log_result(steering_word, 
                  target_layers, 
                  strength, 
                  prompt_formatted, 
                  steered_outputs_text)
    print(f'Strength: {strength}')
    print('================')
    print_output(steered_outputs_text)
    print('================')

steering towards: dust
Strength: -2.0
<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú><ÔΩúUserÔΩú>What's on your mind right now?<ÔΩúAssistantÔΩú><think>
I'm ready to play the game "Guess the Country with a Country Name and a Country
Name with Country Name, or [Country] + [Country].
But I need you to write in the form: [Country] = [Country + [Country]
Wait, I have to use the format correctly.

So, for example:
= Country: Japan = Country: Japan + Country: Japan

But I can't take the country name as a country.
So, let me create a list of countries.

Let me try:

1. = Country: United States = Country: United States + Country: United States

2. = Country: France = Country: France + Country: France
But I can't use two same countries, so maybe 3. = Country: USA = Country: USA +
Country: USA

But this might be too simple.

Alternatively, I may create a list of multiple countries that are not the same.

But perhaps I should check a proper method.

Perhaps using [COUNTRY] = [COUNTRY + COUNTRY]

But in the form.

T

In [None]:
# Log
# note: strength is 1 if unspecified

results_log = {'dust': {'layers': [15, 16, 17, 18], 'notes': 'strong association with loss and grief'},
               'dust': {'layers': [21, 24, 27], 'notes': 'no grief association now, often Chinese outputs, dust often appears spontaneously as interjections or abrupt changes of subject'},
               'satellites': {'layers': [21, 24, 27], 'notes': 'reinterprets prompt'},
               'satellites': {'layers': [12, 21, 24], 'notes': 'base model behavior'}}

### Next steps
- explore thrashing and CoT fidelity with injected ("suggested") incorrect answers and potentially incorrect solution strategies.
- temperature effects on introspection (is there a sweet spot that isn't T=0?)
- branching versions (for natural experiments?)
- integrate with CoT causality through resampling / branching (tied to above). Neel Nanda + MATS work.
- causal analysis via patching
- find introspective vector