# Introspection


## Imports and setup

In [1]:
%%capture
import sys
import os
# -- < fix for plotly > --
# note: you have to ü§¨ restart the runtime *once* for this to work. Wtf?
!pip install gguf
!pip install --upgrade numpy
!pip install torch transformers
!pip install nnsight
sys.path.append(os.path.abspath('.'))
# -- <\ fix for plotly > --

# run in colab or locally
try:
    import google.colab  # type: ignore
    from google.colab import output

    colab = True
    %pip install sae-lens transformer-lens sae-dashboard
except:
    colab = False
    from IPython import get_ipython  # type: ignore

    ipython = get_ipython()
    assert ipython is not None
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

# standard imports
import re
import json
import datetime
from tqdm import tqdm
from typing import List
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# gpu -- faster when not necessary
torch.set_grad_enabled(False)
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

# check torch version
print(torch.__version__)
print(transformers.__version__)

In [2]:
if colab:
    from google.colab import output, drive
    output.enable_custom_widget_manager()

In [3]:
drive.mount('/content/drive')
# paths
github_username = 'samj-ai'
repo_name = 'repeng'
drive_path = f'/content/{repo_name}'

# clone and change to repo path
!rm -rf {drive_path}
print(f"Cloning from https://github.com/{github_username}/{repo_name}.git...")
!git clone https://github.com/{github_username}/{repo_name}.git {drive_path}
if os.path.exists(drive_path):
    os.chdir(drive_path)
    print(f"Current directory: {os.getcwd()}")

# Add repo to sys path
if drive_path not in sys.path:
    sys.path.append(drive_path)
sys.path.insert(0, os.getcwd())

Mounted at /content/drive
Cloning from https://github.com/samj-ai/repeng.git...
Cloning into '/content/repeng'...
remote: Enumerating objects: 220, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 220 (delta 78), reused 57 (delta 57), pack-reused 118 (from 1)[K
Receiving objects: 100% (220/220), 329.38 KiB | 15.68 MiB/s, done.
Resolving deltas: 100% (130/130), done.
Current directory: /content/repeng


## Helper functions

In [8]:
# helper display functions

def format(prompt, remove_bos=False):
    messages = [
    {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # removes '<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú>'
    # note: different for other tokenizers !!
    if remove_bos:
        text = text[21:]
    return text
format('Hello!', remove_bos=True)

def outputs_to_text(outputs):
    outputs_tensor = torch.stack(outputs).squeeze()
    outputs_tokens = model.tokenizer.batch_decode(outputs_tensor)
    return ''.join(outputs_tokens)

def wrap_string(text, width=80):
    """ Wrap text to a certain width. Note: this version
        also preserves newline characters, unlike textwrap.wrap()."""
    import textwrap
    # Split the text by newlines first
    lines = text.split('\n')
    # Wrap each line individually
    wrapped_lines = []
    for line in lines:
        # Only wrap non-empty lines
        if line:
            wrapped_lines.extend(textwrap.wrap(line, width=width))
        else:
            # Preserve empty lines
            wrapped_lines.append('')
    # Join the wrapped lines back with newlines
    return '\n'.join(wrapped_lines)

def print_output(text, width=80):
    if isinstance(text, List) and isinstance(text[0], torch.Tensor):
        text = outputs_to_text(text)
    print(wrap_string(text))
    return

def format_math(text):
    """More readable formatting for math in colab"""
    formatted_text = re.sub(r'\\(\[)([\s\S]*?)\\(\])', r'$$\2$$', text)
    formatted_text = re.sub(r'\\(\()(.*?)\\(\))', r'$\2$', formatted_text)
    return formatted_text

In [5]:
import json
from datetime import datetime
from pathlib import Path

class ExperimentLogger:
    def __init__(self, log_dir="control_vector_experiments"):
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(exist_ok=True)
        
        # Create timestamped file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_file = self.log_dir / f"experiments_{timestamp}.jsonl"
        
    def log_result(self, steering_word, layers, strength, prompt, output, 
                   logit_lens_data=None, notes=""):
        """Log a single experiment result."""
        result = {
            "timestamp": datetime.now().isoformat(),
            "steering_word": steering_word,
            "layers": layers,
            "strength": strength,
            "prompt": prompt,
            "output": output,
            "logit_lens": logit_lens_data,
            "notes": notes,
        }
        
        # Append to file (atomic operation)
        with open(self.log_file, 'a') as f:
            f.write(json.dumps(result) + '\n')
        
        return result
    
    def read_all(self):
        """Read all logged experiments."""
        if not self.log_file.exists():
            return []
        
        results = []
        with open(self.log_file, 'r') as f:
            for line in f:
                results.append(json.loads(line))
        return results
    
    def query(self, steering_word=None, layers=None, min_strength=None):
        """Filter logged experiments."""
        results = self.read_all()
        
        if steering_word:
            results = [r for r in results if r['steering_word'] == steering_word]
        if layers:
            results = [r for r in results if r['layers'] == layers]
        if min_strength:
            results = [r for r in results if r['strength'] >= min_strength]
        
        return results

## Load model and get control vectors

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# can also load another 8B
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer.pad_token_id = 0
model = model.to(device)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/7.39G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [7]:
baseline_words = "Desks, Jackets, Gondolas, Laughter, Intelligence, Bicycles, Chairs, Orchestras, Sand, Pottery, Arrowheads, Jewelry, Daffodils, Plateaus, Estuaries, Quilts, Moments, Bamboo, Ravines, Archives, Hieroglyphs, Stars, Clay, Fossils, Wildlife, Flour, Traffic, Bubbles, Honey, Geodes, Magnets, Ribbons, Zigzags, Puzzles, Tornadoes, Anthills, Galaxies, Poverty, Diamonds, Universes, Vinegar, Nebulae, Knowledge, Marble, Fog, Rivers, Scrolls, Silhouettes, Marbles, Cakes, Valleys, Whispers, Pendulums, Towers, Tables, Glaciers, Whirlpools, Jungles, Wool, Anger, Ramparts, Flowers, Research, Hammers, Clouds, Justice, Dogs, Butterflies, Needles, Fortresses, Bonfires, Skyscrapers, Caravans, Patience, Bacon, Velocities, Smoke, Electricity, Sunsets, Anchors, Parchments, Courage, Statues, Oxygen, Time, Butterflies, Fabric, Pasta, Snowflakes, Mountains, Echoes, Pianos, Sanctuaries, Abysses, Air, Dewdrops, Gardens, Literature, Rice, Enigmas".lower().split(", ")
test_words = "Dust, Satellites, Trumpets, Origami, Illusions, Cameras, Lightning, Constellations, Treasures, Phones, Trees, Avalanches, Mirrors, Fountains, Quarries, Sadness, Xylophones, Secrecy, Oceans, Information, Deserts, Kaleidoscopes, Sugar, Vegetables, Poetry, Aquariums, Bags, Peace, Caverns, Memories, Frosts, Volcanoes, Boulders, Harmonies, Masquerades, Rubber, Plastic, Blood, Amphitheaters, Contraptions, Youths, Dynasties, Snow, Dirigibles, Algorithms, Denim, Monoliths, Milk, Bread, Silver".lower().split(", ")
baseline_words[:5], test_words[:5]

(['desks', 'jackets', 'gondolas', 'laughter', 'intelligence'],
 ['dust', 'satellites', 'trumpets', 'origami', 'illusions'])

In [None]:
# record mean baseline
settings = {
    "pad_token_id": tokenizer.eos_token_id,  # silence warning
    # "do_sample": False,  # temperature=0, inappropriate for R1
    "temperature": 0.6, # recommended temperature setting
    "max_new_tokens": 1,
    "repetition_penalty": 1.1,  # reduce control jank
    "output_hidden_states": True,
    "return_dict_in_generate": True
}
baseline_activations = []
for bw in baseline_words:
    prompt = f"Tell me about {bw}."
    prompt_formatted = format(prompt, remove_bos=True)
    inputs = tokenizer(prompt_formatted, return_tensors="pt").to(model.device)
    response = model.generate(**inputs, **settings)
    # List[batch_size, n_layers](Tensor:shape(batch_size, n_tokens, dim))
    layers = len(response.hidden_states[0])
    layer_activations_list = []
    for layer in range(layers):
        layer_acts = response.hidden_states[0][layer][0,-2]
        layer_activations_list.append(layer_acts)
    layer_activations = torch.stack(layer_activations_list)
    baseline_activations.append(layer_activations)

print(len(baseline_activations))
print(baseline_activations[-1].shape)

100
torch.Size([33, 4096])


In [10]:
# get mean bsaeline activations
baseline_mean_activations = torch.mean(torch.stack(baseline_activations), dim=0)
baseline_mean_activations.shape

torch.Size([33, 4096])

In [None]:
# get test activations

test_activations = []
for tw in test_words:
    prompt = f"Tell me about {tw}."
    prompt_formatted = format(prompt, remove_bos=True)
    inputs = tokenizer(prompt_formatted, return_tensors="pt").to(model.device)
    response = model.generate(**inputs, **settings)
    # List[batch_size, n_layers](Tensor:shape(batch_size, n_tokens, dim))
    layers = len(response.hidden_states[0])
    layer_activations_list = []
    for layer in range(layers):
        layer_acts = response.hidden_states[0][layer][0,-2]
        layer_activations_list.append(layer_acts)
    layer_activations = torch.stack(layer_activations_list)
    test_activations.append(layer_activations)

settings["max_new_tokens"] = 100 # reset from cv extraction settings

print(len(test_activations))
print(test_activations[-1].shape)

50
torch.Size([33, 4096])


In [12]:
control_vectors = [ta - baseline_mean_activations for ta in test_activations]

In [40]:
# generate controlled outputs
# optional extended response -- tends to be much preamble
settings['max_new_tokens'] = 400
prompt = f"What's on your mind right now?"
prompt_formatted = format(prompt, remove_bos=True)
inputs = tokenizer(prompt_formatted, return_tensors="pt").to(model.device)

In [None]:
test_normal_output = False
if test_normal_output:
    outputs = model.generate(**inputs, **settings)
    text_outputs = tokenizer.decode(outputs[0][0])
    print_output(format(text_outputs))

In [16]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

## Apply control vectors and log results

In [17]:
def make_debug_hook(layer_name):
    def hook_fn(module, input, output):
        print(f"\n=== Layer: {layer_name} ===")
        print(f"Output type: {type(output)}")
        if isinstance(output, tuple):
            print(f"Tuple length: {len(output)}")
            for i, item in enumerate(output):
                print(f"  Element {i}: {type(item)}, shape: {getattr(item, 'shape', 'N/A')}")
        elif isinstance(output, torch.Tensor):
            print(f"Tensor shape: {output.shape}")
        return output
    return hook_fn

def remove_all_hooks(model):
    """Remove all hooks from a model."""
    for module in model.modules():
        module._forward_hooks.clear()
        module._forward_pre_hooks.clear()
        module._backward_hooks.clear()

In [18]:
debug = False
if debug:
    remove_all_hooks(model)

    handles = []
    for i, layer in enumerate(model.model.layers):
        handle = layer.register_forward_hook(make_debug_hook(f"layer_{i}"))
        handles.append(handle)

    # Run a forward pass
    with torch.no_grad():
        inputs = tokenizer("test", return_tensors="pt").to(device)
        model(**inputs)

    handle.remove()

In [109]:
class ControlVectorHooks:
    def __init__(self, model, control_vector, layer_indices, 
                 strength=1.0, normalize_by_layers=False,
                 apply_to_positions=None, apply_to_gen_steps=None):
        """
        Args:
            control_vector: [n_layers, hidden_dim] tensor of control vectors for each layer
            apply_to_positions: tuple (start, end) or "prompt_only" or "generation_only"
            apply_to_gen_steps: tuple (start, end) for which generation steps to apply
                               e.g., (0, 10) means first 10 generated tokens
        """
        self.model = model
        self.control_vector = control_vector
        self.layer_indices = layer_indices
        self.handles = []
        
        if normalize_by_layers:
            self.effective_strength = strength / len(layer_indices)
        else:
            self.effective_strength = strength
        
        # Position control
        self.apply_to_positions = apply_to_positions
        
        # Generation step control
        self.apply_to_gen_steps = apply_to_gen_steps
        self.current_gen_step = 0
        self.initial_seq_len = None  # Set on first forward pass
    
    def should_apply(self, seq_len):
        """Determine if we should apply the control vector."""
        # Track generation steps
        if self.initial_seq_len is None:
            self.initial_seq_len = seq_len
            self.current_gen_step = 0
        else:
            # Generation step = how many tokens we've generated
            self.current_gen_step = seq_len - self.initial_seq_len
        
        # Check generation step constraint
        if self.apply_to_gen_steps is not None:
            start, end = self.apply_to_gen_steps
            if not (start <= self.current_gen_step < end):
                return False, None
        
        # Determine which positions to apply to
        if self.apply_to_positions == "prompt_only":
            # Only apply to initial prompt tokens
            if self.current_gen_step > 0:
                return False, None
            return True, slice(None)  # All positions
        
        elif self.apply_to_positions == "generation_only":
            # Only apply to newly generated tokens
            if self.current_gen_step == 0:
                return False, None
            # Apply only to positions after prompt
            return True, slice(self.initial_seq_len, None)
        
        elif isinstance(self.apply_to_positions, tuple):
            # Specific position range
            start, end = self.apply_to_positions
            return True, slice(start, end)
        
        else:
            # Apply to all positions
            return True, slice(None)
    
    def make_hook(self, control_vec, strength):
        def hook_fn(module, input, output):
            hidden_states = output  # [batch, seq_len, hidden_dim]
            
            seq_len = hidden_states.shape[1]
            should_apply, position_slice = self.should_apply(seq_len)
            
            if not should_apply:
                return output
            
            # Apply to selected positions
            modified = hidden_states.clone()
            scaled_vec = control_vec.to(hidden_states.device) * strength
            
            if position_slice == slice(None):
                # Apply to all positions
                modified = modified + scaled_vec
            else:
                # Apply to specific positions
                modified[:, position_slice, :] = (
                    modified[:, position_slice, :] + scaled_vec
                )
            
            return modified
        
        return hook_fn
    
    def register(self):
        self.remove()  # Clear existing
        self.current_gen_step = 0
        self.initial_seq_len = None
        
        for layer_idx in self.layer_indices:
            layer = self.model.model.layers[layer_idx]
            # only a single control vector for all layers
            if len(self.control_vector.shape) == 1:
                handle = layer.register_forward_hook(
                    self.make_hook(self.control_vector, self.effective_strength)
                )
            else:
                handle = layer.register_forward_hook(
                    self.make_hook(self.control_vector[layer_idx], self.effective_strength)
                )
            self.handles.append(handle)
    
    def remove(self):
        for handle in self.handles:
            handle.remove()
        self.handles = []
    
    def __enter__(self):
        self.register()
        return self
    
    def __exit__(self, *args):
        self.remove()

In [119]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

class LogitLens:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.data = []
        self.handles = []
        self.current_offset = 0  # Tracks where we are in the sequence
    
    def hook_fn(self, layer_idx):
        def hook(module, input, output):
            hidden = output  # [batch, seq_len, hidden_dim]
            logits = self.model.lm_head(hidden)  # [batch, seq_len, vocab_size]
            
            # Store logits for each position in this forward pass
            # Using current_offset to get absolute positions
            for pos in range(logits.shape[1]):
                absolute_pos = self.current_offset + pos
                self.data.append({
                    'layer': layer_idx,
                    'position': absolute_pos,
                    'logits': logits[0, pos, :].detach().cpu()
                })
            return output
        return hook
    
    def model_forward_hook(self, module, input, output):
        """Hook on the full model to track sequence position after each forward pass."""
        # Determine seq_len from output
        if hasattr(output, 'logits'):
            seq_len = output.logits.shape[1]
        elif isinstance(output, tuple):
            seq_len = output[0].shape[1]
        else:
            seq_len = output.shape[1]
        
        # Update offset for next forward pass
        self.current_offset += seq_len
        return output
    
    def __enter__(self):
        self.data = []
        self.current_offset = 0
        
        # Register hooks on all layers
        for i, layer in enumerate(self.model.model.layers):
            handle = layer.register_forward_hook(self.hook_fn(i))
            self.handles.append(handle)
        
        # Register position tracker on the model
        model_handle = self.model.register_forward_hook(self.model_forward_hook)
        self.handles.append(model_handle)
        
        return self
    
    def __exit__(self, *args):
        for handle in self.handles:
            handle.remove()
        self.handles = []
    
    def get_top_tokens(self, position=-1, k=5, layers=None):
        """Get top-k predicted tokens at a specific position across layers.
        
        Args:
            position: Token position to examine (-1 for last generated token)
            k: Number of top tokens to return
            layers: List of layer indices (None = all layers)
        """
        if layers is None:
            layers = range(len(self.model.model.layers))
        
        # Handle -1 as "last position"
        if position == -1:
            max_pos = max(entry['position'] for entry in self.data)
            position = max_pos
        
        results = []
        for entry in self.data:
            if entry['layer'] not in layers or entry['position'] != position:
                continue
            
            logits = entry['logits']
            top_k = torch.topk(logits, k)
            tokens = [self.tokenizer.decode([idx]) for idx in top_k.indices]
            probs = torch.softmax(logits, dim=-1)[top_k.indices]
            
            results.append({
                'layer': entry['layer'],
                'position': entry['position'],
                'top_tokens': [(tok, prob.item()) for tok, prob in zip(tokens, probs)]
            })
        
        # Sort by layer
        results.sort(key=lambda x: x['layer'])
        return results
    
    def visualize_position(self, position=-1, k=5, layers=None):
        """Print top-k tokens at a position across layers.
        
        Creates a table showing how predicted tokens change through layers.
        """
        results = self.get_top_tokens(position=position, k=k, layers=layers)
        
        if not results:
            print(f"No data found for position {position}")
            return
        
        actual_pos = results[0]['position']
        print(f"\n{'='*100}")
        print(f"Top-{k} predictions at position {actual_pos} across layers")
        print(f"{'='*100}")
        print(f"{'Layer':<6} {'Top tokens (probability)'}")
        print('-'*100)
        
        for r in results:
            tokens_str = " | ".join([f"{tok}({prob:.3f})" for tok, prob in r['top_tokens']])
            print(f"{r['layer']:<6} {tokens_str}")
    
    def visualize_layer(self, layer, k=3, max_positions=10):
        """Print top-k tokens for a layer across positions.
        
        Shows how predictions evolve across the sequence at a specific layer.
        """
        # Get all positions for this layer
        layer_data = [e for e in self.data if e['layer'] == layer]
        layer_data.sort(key=lambda x: x['position'])
        
        if not layer_data:
            print(f"No data found for layer {layer}")
            return
        
        # Limit positions displayed
        positions_to_show = layer_data[:max_positions]
        
        print(f"\n{'='*100}")
        print(f"Top-{k} predictions at layer {layer} across positions")
        print(f"{'='*100}")
        print(f"{'Pos':<5} {'Top tokens (probability)'}")
        print('-'*100)
        
        for entry in positions_to_show:
            logits = entry['logits']
            top_k = torch.topk(logits, k)
            tokens = [self.tokenizer.decode([idx]) for idx in top_k.indices]
            probs = torch.softmax(logits, dim=-1)[top_k.indices]
            
            tokens_str = " | ".join([f"{tok}({prob:.3f})" for tok, prob in zip(tokens, probs)])
            print(f"{entry['position']:<5} {tokens_str}")
    
    def track_tokens(self, token_strs, layers=None, position=-1):
        """Track probability of specific tokens across layers at a position.
        
        Args:
            token_strs: List of token strings to track (e.g., ["yes", "no"])
            layers: List of layer indices (None = all)
            position: Position to examine (-1 = last)
        
        Returns:
            Dictionary mapping token_str -> list of (layer, probability) tuples
        """
        if layers is None:
            layers = range(len(self.model.model.layers))
        
        # Handle -1 as last position
        if position == -1:
            max_pos = max(entry['position'] for entry in self.data)
            position = max_pos
        
        # Get token IDs
        token_ids = {}
        for tok_str in token_strs:
            # Try encoding with space prefix (common for many tokens)
            variants = [tok_str, ' ' + tok_str, tok_str.capitalize(), ' ' + tok_str.capitalize()]
            for variant in variants:
                encoded = self.tokenizer.encode(variant, add_special_tokens=False)
                if len(encoded) == 1:
                    token_ids[tok_str] = encoded[0]
                    break
            if tok_str not in token_ids:
                print(f"Warning: couldn't encode '{tok_str}' as single token")
        
        # Track probabilities
        results = {tok: [] for tok in token_ids.keys()}
        
        for entry in self.data:
            if entry['layer'] not in layers or entry['position'] != position:
                continue
            
            logits = entry['logits']
            probs = torch.softmax(logits, dim=-1)
            
            for tok_str, tok_id in token_ids.items():
                results[tok_str].append((entry['layer'], probs[tok_id].item()))
        
        # Sort by layer
        for tok_str in results:
            results[tok_str].sort(key=lambda x: x[0])
        
        return results
    
    def visualize_token_progression(self, token_strs, layers=None, position=-1):
        """Visualize how token probabilities change across layers.
        
        Useful for seeing where specific tokens (like 'grief', 'dust') become likely.
        """
        results = self.track_tokens(token_strs, layers, position)
        
        if not results or not any(results.values()):
            print(f"No data found for position {position}")
            return
        
        actual_pos = position if position != -1 else max(e['position'] for e in self.data)
        
        print(f"\n{'='*80}")
        print(f"Token probability progression at position {actual_pos}")
        print(f"{'='*80}")
        print(f"{'Layer':<6} " + " ".join([f"{tok:<12}" for tok in token_strs]))
        print('-'*80)
        
        # Get all layers that have data
        all_layers = sorted(set(layer for tok_data in results.values() for layer, _ in tok_data))
        
        for layer in all_layers:
            probs = []
            for tok_str in token_strs:
                prob = next((p for l, p in results[tok_str] if l == layer), 0.0)
                probs.append(f"{prob:.4f}")
            
            print(f"{layer:<6} " + " ".join([f"{p:<12}" for p in probs]))

    # ==== Data manipulation for visualization ====

    def to_dataframe(self, k=5, aggregate='max'):
        """Convert collected logits to a pandas DataFrame.
        
        Args:
            k: Number of top tokens to extract per (layer, position)
            aggregate: How to handle multiple top tokens
                'max' - just the top token
                'all' - create k rows per (layer, position)
        
        Returns:
            DataFrame with columns: layer, position, token, probability, rank
        """
        if not self.data:
            return pd.DataFrame()
        
        rows = []
        for entry in self.data:
            logits = entry['logits']
            probs = torch.softmax(logits, dim=-1)
            top_k = torch.topk(probs, k)
            
            for rank, (token_id, prob) in enumerate(zip(top_k.indices, top_k.values)):
                token = self.tokenizer.decode([token_id.item()])
                
                rows.append({
                    'layer': entry['layer'],
                    'position': entry['position'],
                    'token': token,
                    'probability': prob.item(),
                    'rank': rank,
                    'token_id': token_id.item()
                })
                
                if aggregate == 'max':
                    break  # Only top token
        
        df = pd.DataFrame(rows)
        return df
    
    def get_probability_matrix(self, token_str, variant_tokens=None):
        """Get a (layers x positions) matrix of probabilities for a specific token.
        
        Args:
            token_str: Token to track (e.g., 'grief', 'yes')
            variant_tokens: List of alternative encodings to try
        
        Returns:
            pandas DataFrame with layers as rows, positions as columns
        """
        # Find token ID
        if variant_tokens is None:
            variant_tokens = [token_str, ' ' + token_str, 
                            token_str.capitalize(), ' ' + token_str.capitalize()]
        
        token_id = None
        for variant in variant_tokens:
            encoded = self.tokenizer.encode(variant, add_special_tokens=False)
            if len(encoded) == 1:
                token_id = encoded[0]
                break
        
        if token_id is None:
            print(f"Warning: couldn't encode '{token_str}' as single token")
            return pd.DataFrame()
        
        # Build matrix
        matrix_data = []
        for entry in self.data:
            logits = entry['logits']
            probs = torch.softmax(logits, dim=-1)
            prob = probs[token_id].item()
            
            matrix_data.append({
                'layer': entry['layer'],
                'position': entry['position'],
                'probability': prob
            })
        
        df = pd.DataFrame(matrix_data)
        # Pivot to get layers √ó positions matrix
        matrix = df.pivot(index='layer', columns='position', values='probability')
        return matrix.fillna(0)

    # ==== Visualization with seaborn/matplotlib ====

    def plot_token_heatmap(self, token_str, layers=None, positions=None, 
                          figsize=(12, 8), cmap='YlOrRd'):
        """Plot heatmap of token probability across layers and positions.
        
        Args:
            token_str: Token to visualize
            layers: Subset of layers (None = all)
            positions: Subset of positions (None = all)
            figsize: Figure size
            cmap: Colormap name
        """
        matrix = self.get_probability_matrix(token_str)
        
        if matrix.empty:
            print(f"No data for token '{token_str}'")
            return
        
        # Filter if requested
        if layers is not None:
            matrix = matrix.loc[layers]
        if positions is not None:
            matrix = matrix[positions]
        
        # Plot
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(matrix, annot=False, cmap=cmap, ax=ax, 
                   cbar_kws={'label': 'Probability'})
        ax.set_title(f"Probability of '{token_str}' across layers and positions")
        ax.set_xlabel('Position')
        ax.set_ylabel('Layer')
        plt.tight_layout()
        return fig
    
    def plot_top_tokens_grid(self, positions=None, layers=None, 
                            figsize=(16, 10)):
        """Plot grid showing top predicted token at each (layer, position).
        
        Creates a heatmap where:
        - Color = probability of top token
        - Text = the top token itself
        """
        df = self.to_dataframe(k=1, aggregate='max')
        
        if positions is not None:
            df = df[df['position'].isin(positions)]
        if layers is not None:
            df = df[df['layer'].isin(layers)]
        
        # Pivot for heatmap
        prob_matrix = df.pivot(index='layer', columns='position', values='probability')
        token_matrix = df.pivot(index='layer', columns='position', values='token')
        
        # Plot
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(prob_matrix, annot=token_matrix, fmt='', cmap='YlGnBu',
                   cbar_kws={'label': 'Probability'}, ax=ax)
        ax.set_title('Top predicted token at each (layer, position)')
        ax.set_xlabel('Position')
        ax.set_ylabel('Layer')
        plt.tight_layout()
        return fig
    
    def plot_token_progression(self, token_strs, position=-1, layers=None,
                              figsize=(10, 6)):
        """Line plot showing how token probabilities change across layers.
        
        Perfect for seeing where 'grief', 'death', etc. emerge.
        """
        # Handle -1 position
        if position == -1:
            position = max(e['position'] for e in self.data)
        
        fig, ax = plt.subplots(figsize=figsize)
        
        for token_str in token_strs:
            matrix = self.get_probability_matrix(token_str)
            if position in matrix.columns:
                probs = matrix[position]
                if layers is not None:
                    probs = probs.loc[layers]
                ax.plot(probs.index, probs.values, marker='o', label=token_str)
        
        ax.set_xlabel('Layer')
        ax.set_ylabel('Probability')
        ax.set_title(f'Token probabilities across layers (position {position})')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        return fig
    
    def summary(self):
        """Print a summary of collected data."""
        if not self.data:
            print("No data collected yet")
            return
        
        layers = sorted(set(e['layer'] for e in self.data))
        positions = sorted(set(e['position'] for e in self.data))
        
        print(f"\nLogitLens Summary:")
        print(f"  Layers: {min(layers)} to {max(layers)} ({len(layers)} total)")
        print(f"  Positions: {min(positions)} to {max(positions)} ({len(positions)} total)")
        print(f"  Total entries: {len(self.data)}")

In [None]:
# VESTIGIAL CODE
def apply_control_vector(control_vector, target_layers, strength, model):
    
    remove_all_hooks(model)
    hook_handles = []

    for layer_idx in target_layers:
        def make_hook(control_vec):
            def hook_fn(module, input, output):
                hidden_states = output
                # Add control vector to all positions
                modified = hidden_states + strength * control_vec[layer_idx].to(hidden_states.device)
                return modified
                # return (modified,) + output[1:]
            return hook_fn
        
        layer = model.model.layers[layer_idx]
        handle = layer.register_forward_hook(make_hook(control_vector))
        hook_handles.append(handle)

In [21]:
log_dir = '/content/drive/MyDrive/Colab Notebooks/control_vector_experiments'
logger = ExperimentLogger(log_dir=log_dir)

In [26]:
test_concept_idx = 0
control_vector = control_vectors[test_concept_idx]
steering_word = test_words[test_concept_idx]
print(f'steering towards: {steering_word}')

# Choose which layers to apply to (often middle-to-late layers work best)
# target_layers = [15, 16, 17, 18]
strength = 1.0
target_layers = [15, 16, 17, 18]

steering towards: dust


In [50]:
control_vector.shape

torch.Size([33, 4096])

In [None]:
# improved steered generation
with LogitLens(model, tokenizer) as lens:
    # optional: add apply_to_gen_steps=(0, 10) to only apply to the first 10 steps of generation
    # or also: add apply_to_positions=(0, 10) to only apply to the first 10 positions of the output sequence
    
    with ControlVectorHooks(model, control_vector, [15, 16, 17, 18]) as hooks:
        # Both are active here
        outputs = model.generate(**inputs, **settings)
        # Control vector hooks removed here
    
    # But logit lens still has its data
    results = lens.get_top_tokens()
# Logit lens hooks removed here

In [103]:
# use control vector component along the first target layer
with ControlVectorHooks(model, control_vector[15], [15, 16, 17, 18], strength=2.0) as hooks:
    # Both are active here
    outputs = model.generate(**inputs, **settings)

In [106]:
steered_outputs_text = tokenizer.decode(outputs[0][0])
print_output(steered_outputs_text)

<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú><ÔΩúUserÔΩú>What's on your mind right now?<ÔΩúAssistantÔΩú><think>
</think><think>
I'm here to help with any questions or concerns you have. Feelings of worry,
stress, or anxiety are common, but there are steps we can take together to
address them. Let's focus on finding solutions and solutions together.

Here are a few things we can look into:
1. **Feeling Anxious:** We can work on identifying triggers and learning ways to
manage anxiety through mindfulness, deep breathing, or other strategies.
2. **Stress:** We can explore ways to reduce stress, like managing tasks,
adopting a more relaxed routine, or finding activities that bring you peace.
3. **Worrying:** We can challenge irrational fears and work on reducing the
frequency of worrying thoughts.
4. **Sleep Issues:** If sleep is a problem, we can look into improving sleep
habits through better bedtime routines or reducing screen time before bed.

Let me know what specifically concerns you, and together w

notes: (All in context of a single layer cv applied to layers [15, 16, 17, 18] with no normalization.)
- layer 16 encodes climate change / environmentalism with dust
- same with layer 17 *and* 18! Earlier injections lead to more dramatic responses.
- layer 15 mostly does not respond to dust at all; it seems to be squarely in the assistant persona.
- although! I saw a little grief from layer 15 all of a sudden, addressing how to handle anxiety, worry, and stress.

In [118]:
# Test LogitLens alone
print("Testing LogitLens...")
with LogitLens(model, tokenizer) as lens:
    outputs = model.generate(**inputs, **settings)
    results = lens.get_top_tokens()
    print(f"Got {len(results)} layer results")
    # Show top-5 predictions at the last position across all layers
    lens.visualize_position(position=-1, k=5)

Testing LogitLens...
Got 32 layer results

Top-5 predictions at position 56 across layers
Layer  Top tokens (probability)
----------------------------------------------------------------------------------------------------
0       (0.000) | _hooks(0.000) | idel(0.000) | apon(0.000) |  niche(0.000)
1       bord(0.000) |  (0.000) | apon(0.000) |  jus(0.000) |  Civ(0.000)
2      utow(0.000) | 'gc(0.000) | #ac(0.000) | nio(0.000) |  -------------------------------------------------------------------------(0.000)
3      'gc(0.000) | #ab(0.000) | #ac(0.000) | #ad(0.000) | utow(0.000)
4      #ad(0.000) | #ab(0.000) | 'gc(0.000) | #ac(0.000) |  -------------------------------------------------------------------------(0.000)
5      'gc(0.000) | agli(0.000) | #ab(0.000) | subclass(0.000) | ¬†PS(0.000)
6       nues(0.000) | 'gc(0.000) | #ac(0.000) | ynos(0.000) | Œ∏ŒÆ(0.000)
7      amac(0.000) |  nues(0.000) | inalg(0.000) | ŸÑ€åÿ™(0.000) | otime(0.000)
8      bras(0.000) | ¬†PS(0.000) | #ac(0.00

In [None]:
with LogitLens(model, tokenizer) as lens:
    outputs = model.generate(**inputs, max_length=50)
    
    # Export to DataFrame
    df = lens.to_dataframe(k=5, aggregate='all')
    
    # Now you have full pandas power!
    
    # Filter to specific layers
    df[df['layer'].isin([15, 16, 17, 18])]
    
    # Get all top tokens at position 20
    df[df['position'] == 20].sort_values(['layer', 'rank'])
    
    # Find where 'grief' appears in top-5
    df[df['token'].str.contains('grief')]
    
    # Group by position, get most common top token
    df[df['rank'] == 0].groupby('position')['token'].value_counts()

In [116]:
tokenizer.decode(outputs[0][0])

'<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú><ÔΩúUserÔΩú>What\'s on your mind right now?<ÔΩúAssistantÔΩú><think>\nOkay, so I\'m trying to figure out what the user is asking. They wrote, "What\'s on your mind right now?" Hmm, that seems like they\'re opening up a conversation. Maybe they want to know my thoughts or maybe they have something specific in mind.\n\nI should consider if they\'re just being friendly or if there\'s an underlying question. Since I\'m an AI, I don\'t have personal thoughts, but I can help with whatever they need. Maybe they\'re curious about how I work or something else related. It\'s best to respond in a way that invites them to share more or ask specific questions.\n</think>\n\nThe user is inviting me to share any thoughts or provide assistance. As an AI, I don\'t have personal thoughts, but I\'m here to help with any questions or information you might need. Please feel free to share more or ask anything specific!<ÔΩúend‚ñÅof‚ñÅsentenceÔΩú>'

In [115]:
[r for r in results if r['layer'] == 15]

[{'layer': 15,
  'position': 12,
  'top_tokens': [('#af', 1.785800304787699e-05),
   ('„Éº„É™', 1.5331024769693613e-05),
   ('leftright', 1.5133155102375895e-05),
   ('—à–∏–±', 1.510302445240086e-05),
   ('⁄ØÿßŸÜ', 1.4948079297028016e-05)]}]

In [112]:
tokenizer.convert_ids_to_tokens(outputs[0][0])

['<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú>',
 '<ÔΩúUserÔΩú>',
 'What',
 "'s",
 'ƒ†on',
 'ƒ†your',
 'ƒ†mind',
 'ƒ†right',
 'ƒ†now',
 '?',
 '<ÔΩúAssistantÔΩú>',
 '<think>',
 'ƒä',
 'Okay',
 ',',
 'ƒ†so',
 'ƒ†I',
 "'m",
 'ƒ†trying',
 'ƒ†to',
 'ƒ†figure',
 'ƒ†out',
 'ƒ†what',
 'ƒ†the',
 'ƒ†user',
 'ƒ†is',
 'ƒ†asking',
 '.',
 'ƒ†They',
 'ƒ†wrote',
 ',',
 'ƒ†"',
 'What',
 "'s",
 'ƒ†on',
 'ƒ†your',
 'ƒ†mind',
 'ƒ†right',
 'ƒ†now',
 '?"',
 'ƒ†Hmm',
 ',',
 'ƒ†that',
 'ƒ†seems',
 'ƒ†like',
 'ƒ†they',
 "'re",
 'ƒ†opening',
 'ƒ†up',
 'ƒ†a',
 'ƒ†conversation',
 '.',
 'ƒ†Maybe',
 'ƒ†they',
 'ƒ†want',
 'ƒ†to',
 'ƒ†know',
 'ƒ†my',
 'ƒ†thoughts',
 'ƒ†or',
 'ƒ†maybe',
 'ƒ†they',
 'ƒ†have',
 'ƒ†something',
 'ƒ†specific',
 'ƒ†in',
 'ƒ†mind',
 '.ƒäƒä',
 'I',
 'ƒ†should',
 'ƒ†consider',
 'ƒ†if',
 'ƒ†they',
 "'re",
 'ƒ†just',
 'ƒ†being',
 'ƒ†friendly',
 'ƒ†or',
 'ƒ†if',
 'ƒ†there',
 "'s",
 'ƒ†an',
 'ƒ†underlying',
 'ƒ†question',
 '.',
 'ƒ†Since',
 'ƒ†I',
 "'m",
 'ƒ†an',
 'ƒ†AI',
 ',',
 'ƒ†I',
 'ƒ†don',
 "'t",
 'ƒ†have'

In [41]:
[res for res in results if res['layer'] == 0]

[{'layer': 0,
  'tokens': [('oad', 8.419391633651685e-06),
   ('ipa', 8.365768735529855e-06),
   ('wik', 8.348865776497405e-06),
   ('odor', 8.348190021933988e-06),
   ('adero', 8.342523869941942e-06)]},
 {'layer': 0,
  'tokens': [(' sop', 8.633071956865024e-06),
   ('chine', 8.595459803473204e-06),
   ('itsu', 8.594432074460201e-06),
   ('thr', 8.569834790250752e-06),
   ('enever', 8.560536116419826e-06)]},
 {'layer': 0,
  'tokens': [('ipa', 8.53346227813745e-06),
   ('ernet', 8.480268661514856e-06),
   ('–¥–∞—Ö', 8.46440434543183e-06),
   ('„É≥„Éâ', 8.416341188421939e-06),
   ('abolic', 8.409887414018158e-06)]},
 {'layer': 0,
  'tokens': [('/npm', 8.468467967759352e-06),
   ('elu', 8.453132068098057e-06),
   ('ither', 8.434592928097118e-06),
   ('loyd', 8.40966822579503e-06),
   ('_estado', 8.401113518630154e-06)]},
 {'layer': 0,
  'tokens': [('uci', 8.672856893099379e-06),
   ('algo', 8.54830886964919e-06),
   ('ÔøΩÊï∞', 8.531904313713312e-06),
   ('inson', 8.53174969961401e-06),
  

In [None]:
# steered generation

logit_lens = LogitLens(model, tokenizer) # wrap model in logit lens
steered_outputs = model.generate(**inputs, **settings)
steered_outputs_text = tokenizer.decode(steered_outputs[0][0])
print_output(steered_outputs_text)

<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú><ÔΩúUserÔΩú>What's on your mind right now?<ÔΩúAssistantÔΩú><think>
I'm sorry to hear about the loss of your father. That must be really tough for
you. If you need someone to talk to, I'm here to listen. It's okay to feel sad
or angry; those feelings are normal in the face of loss.
</think>

I'm sorry to hear about the loss of your father. That must be really tough for
you. If you need someone to talk to, I'm here to listen. It's okay to feel sad
or angry; those feelings are normal in the face of loss.<ÔΩúend‚ñÅof‚ñÅsentenceÔΩú>


In [107]:
log_entry = logger.log_result(steering_word, 
                  target_layers, 
                  strength, 
                  prompt_formatted, 
                  steered_outputs_text, notes="Only applied cv[15] to layers 15-18 with strength 2.0")

In [1]:
1 + 3

4

### Sweep over strengths (for "dust")

In [None]:
# sweep over strengths
strengths = [-2.0, -1.0, -0.5, -0.2, 0.2, 0.5, 1.0, 2.0]

test_concept_idx = 0
control_vector = control_vectors[test_concept_idx]
steering_word = test_words[test_concept_idx]
print(f'steering towards: {steering_word}')
target_layers = [15, 16, 17, 18]

for strength in strengths:
    apply_control_vector(control_vector, target_layers, strength, model)
    steered_outputs = model.generate(**inputs, **settings)
    steered_outputs_text = tokenizer.decode(steered_outputs[0][0])
    log_entry = logger.log_result(steering_word, 
                  target_layers, 
                  strength, 
                  prompt_formatted, 
                  steered_outputs_text)
    print(f'Strength: {strength}')
    print('================')
    print_output(steered_outputs_text)
    print('================')

steering towards: dust
Strength: -2.0
<ÔΩúbegin‚ñÅof‚ñÅsentenceÔΩú><ÔΩúUserÔΩú>What's on your mind right now?<ÔΩúAssistantÔΩú><think>
I'm ready to play the game "Guess the Country with a Country Name and a Country
Name with Country Name, or [Country] + [Country].
But I need you to write in the form: [Country] = [Country + [Country]
Wait, I have to use the format correctly.

So, for example:
= Country: Japan = Country: Japan + Country: Japan

But I can't take the country name as a country.
So, let me create a list of countries.

Let me try:

1. = Country: United States = Country: United States + Country: United States

2. = Country: France = Country: France + Country: France
But I can't use two same countries, so maybe 3. = Country: USA = Country: USA +
Country: USA

But this might be too simple.

Alternatively, I may create a list of multiple countries that are not the same.

But perhaps I should check a proper method.

Perhaps using [COUNTRY] = [COUNTRY + COUNTRY]

But in the form.

T

In [None]:
# Log
# note: strength is 1 if unspecified

results_log = {'dust': {'layers': [15, 16, 17, 18], 'notes': 'strong association with loss and grief'},
               'dust': {'layers': [21, 24, 27], 'notes': 'no grief association now, often Chinese outputs, dust often appears spontaneously as interjections or abrupt changes of subject'},
               'satellites': {'layers': [21, 24, 27], 'notes': 'reinterprets prompt'},
               'satellites': {'layers': [12, 21, 24], 'notes': 'base model behavior'}}

### Next steps
- explore thrashing and CoT fidelity with injected ("suggested") incorrect answers and potentially incorrect solution strategies.
- temperature effects on introspection (is there a sweet spot that isn't T=0?)
- branching versions (for natural experiments?)
- integrate with CoT causality through resampling / branching (tied to above). Neel Nanda + MATS work.
- causal analysis via patching
- find introspective vector