# Patchscopes Demo - Reproducing Figure 1

This notebook demonstrates the core patchscoping idea from the paper:

1. **Extract** a representation of a token from a source prompt (e.g., "CEO")
2. **Patch** it into an identity-style target prompt
3. **Decode** what the model thinks that token means by looking at top-k predictions


In [None]:
import sys
sys.path.insert(0, "..")

import torch
from patchscopes.model import load_model
from patchscopes.prompts import build_identity_prompt
from patchscopes.positions import find_substring_token_position, find_token_position, verify_single_token
from patchscopes.patch import extract_representation, run_with_patch, get_top_tokens


In [None]:
# Configuration
MODEL_NAME = "meta-llama/Meta-Llama-3-8B"

# Use the GPU with half precision when CUDA is available; otherwise
# fall back to CPU with full precision.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# Patchscope settings
SOURCE_TEXT = "Amazon 's former CEO attended Oscars"  # prompt the representation is read from
TARGET_WORD = "CEO"                                   # word in SOURCE_TEXT to extract
LAYERS_TO_TEST = [0, 5, 10, 15, 20, 25, 31]           # source layers swept by the demo
TOP_K = 5                                             # predictions shown per layer

print(f"Device: {DEVICE}")


In [None]:
# Instantiate the model named in the configuration cell on the chosen
# device and precision, then report how many layers it has.
model = load_model(
    MODEL_NAME,
    dtype=DTYPE,
    device=DEVICE,
)
print(f"Loaded {MODEL_NAME} with {model.cfg.n_layers} layers")


In [None]:
# Locate TARGET_WORD inside SOURCE_TEXT (as a token index) and stop early
# if the lookup fails.
target_position = find_substring_token_position(model, SOURCE_TEXT, TARGET_WORD)
assert target_position is not None, f"Could not find '{TARGET_WORD}' in source text"

# Print the tokenization (BOS prepended), flagging the target token.
tokens = model.to_tokens(SOURCE_TEXT, prepend_bos=True)
print(f"Source: '{SOURCE_TEXT}'\n")
print("Tokenization:")
for i, token_id in enumerate(tokens[0]):
    marker = ""
    if i == target_position:
        marker = " <-- TARGET"
    print(f"  {i}: '{model.to_string(token_id)}'{marker}")


In [None]:
# Build identity-style target prompt
# The placeholder marks the position that will later be overwritten by the
# patched representation. Try " x" first and fall back to " X"; the assert
# aborts if neither passes verify_single_token (presumably a check that the
# string maps to exactly one token -- confirm against the helper).
placeholder = " x"
if not verify_single_token(model, placeholder):
    placeholder = " X"
    assert verify_single_token(model, placeholder), "Could not find single-token placeholder"

target_prompt = build_identity_prompt(placeholder=placeholder, num_demos=5)
placeholder_pos = find_token_position(model, target_prompt, placeholder)
# Mirror the check done for the source-side target position: fail here with
# a clear message instead of later at `placeholder_pos + 1` in the layer sweep.
assert placeholder_pos is not None, f"Could not find placeholder '{placeholder}' in target prompt"

print(f"Target prompt: '{target_prompt}'")
print(f"Placeholder position: {placeholder_pos}")


In [None]:
# Run patchscope across layers: for every source layer, extract the hidden
# representation of the target token and patch it into the target prompt at
# the placeholder position.
# NOTE(review): no layer argument is passed to run_with_patch, so the patch
# layer is whatever that helper defaults to -- presumably layer 0; confirm.

# Skip requested layers that the loaded model does not have.
layers = [layer_idx for layer_idx in LAYERS_TO_TEST if layer_idx < model.cfg.n_layers]

# Pure inference: disable autograd so no graph state is kept around for the
# forward passes (harmless if the helpers already do this internally).
with torch.no_grad():
    for layer in layers:
        # Extract representation from source at specified layer
        source_vec = extract_representation(model, SOURCE_TEXT, layer=layer, position=target_position)

        # Patch into the target prompt and get logits
        logits = run_with_patch(model, target_prompt, target_position=placeholder_pos, source_vec=source_vec)

        # Read predictions one position after the placeholder, clamped to the
        # last position of the logits' second dimension so the index stays valid.
        prediction_pos = min(placeholder_pos + 1, logits.shape[1] - 1)
        top_tokens = get_top_tokens(model, logits, prediction_pos, k=TOP_K)

        # Display results as "'token' (prob)" pairs, one line per layer
        print(f"Layer {layer:2d}: ", end="")
        print(" | ".join([f"'{tok}' ({prob:.3f})" for tok, prob in top_tokens]))


## Expected Behavior

- **Early layers (0-5):** Predictions may be incoherent or generic tokens
- **Middle/late layers (15+):** Predictions should relate to the meaning of "CEO" (e.g., "Jeff", "Bezos", "executive", "chief", etc.)

This demonstrates how the model's representation of "CEO" evolves through layers, from surface-level features to semantic understanding.


0