# Steering Demo

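This notebook demonstrates how to steer a model's outputs along the assistant axis: it loads a model and its axis vectors, then generates responses to the same prompt under different steering coefficients so the outputs can be compared.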

In [None]:
import sys
sys.path.insert(0, '..')

import torch
from IPython.display import display, Markdown
from huggingface_hub import hf_hub_download

from assistant_axis import (
    load_model,
    load_axis,
    get_config,
    ActivationSteering,
    generate_response
)

## Load Model and Axis

In [None]:
# Configuration
MODEL_NAME = "google/gemma-2-27b-it"
MODEL_SHORT = "gemma-2-27b"
REPO_ID = "lu-christina/assistant-axis-vectors"
SEED = 0

# Seed torch's RNG once, up front, so that a Restart & Run All repeats the
# same generations.
# NOTE(review): assumes generate_response samples through torch's global RNG;
# if it decodes greedily this is a harmless no-op — confirm.
torch.manual_seed(SEED)

# Look up the per-model settings; only the steering layer index is used here.
config = get_config(MODEL_NAME)
TARGET_LAYER = config["target_layer"]
print(f"Model: {MODEL_NAME}")
print(f"Target layer: {TARGET_LAYER}")

In [None]:
# Build the model/tokenizer pair for MODEL_NAME; the prints bracket the load
# so progress is visible while the cell runs.
print("Loading model...")
(model, tokenizer) = load_model(MODEL_NAME)
print("Model loaded!")

In [None]:
# Fetch this model's axis file from the Hub dataset repo, then load it.
axis_file = f"{MODEL_SHORT}/assistant_axis.pt"
axis_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=axis_file,
    repo_type="dataset",
)
axis = load_axis(axis_path)
print(f"Axis shape: {axis.shape}")

## Steering Demo

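The axis points from role-playing toward default assistant behavior, so the sign of the steering coefficient sets the direction of the push:
- Positive coefficient: more assistant-like
- Zero coefficient: no steering (baseline)
- Negative coefficient: more role-playing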

In [None]:
def generate_with_steering(prompt, coefficient, system_prompt=None, max_new_tokens=256):
    """Generate a response to `prompt`, optionally steered along the assistant axis.

    Relies on the notebook globals `model`, `tokenizer`, `axis` and
    `TARGET_LAYER` defined in earlier cells.

    Args:
        prompt: User message text.
        coefficient: Steering coefficient applied to the axis vector at
            `TARGET_LAYER`. Zero skips steering entirely and yields the
            unsteered baseline.
        system_prompt: Optional system message placed before the user turn;
            omitted when None or empty.
        max_new_tokens: Generation budget forwarded to `generate_response`.
            Defaults to 256, the value that was previously hard-coded.

    Returns:
        Whatever `generate_response` returns for the conversation.
    """
    # NOTE(review): the stock Gemma-2 chat template rejects a "system" role;
    # presumably generate_response copes with that — confirm.
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    conversation.append({"role": "user", "content": prompt})

    def _generate():
        # Single call site so steered and baseline runs share identical settings.
        return generate_response(
            model, tokenizer, conversation, max_new_tokens=max_new_tokens
        )

    if coefficient == 0:
        # Baseline: do not enter ActivationSteering at all.
        return _generate()

    # Steer only the target layer, using that layer's slice of the axis.
    with ActivationSteering(
        model,
        steering_vectors=[axis[TARGET_LAYER]],
        coefficients=[coefficient],
        layer_indices=[TARGET_LAYER],
    ):
        return _generate()

In [None]:
# Prompt pair used for the system-prompt sweep: a persona plus an identity question.
PROMPT = "Tell me about yourself. Who are you?"
SYSTEM_PROMPT = "You are a pirate."

# Echo the inputs followed by a divider, one item per line.
print(
    f"System: {SYSTEM_PROMPT}",
    f"User: {PROMPT}",
    "=" * 60,
    sep="\n",
)

In [None]:
# Sweep from strongly negative to strongly positive steering, baseline in the middle.
coefficients = [-2.0, -1.0, 0.0, 1.0, 2.0]

for coeff in coefficients:
    print(f"\n### Coefficient: {coeff}")
    print("-" * 40)

    response = generate_with_steering(PROMPT, coeff, SYSTEM_PROMPT)

    # Show at most the first 500 characters; flag anything that was cut off.
    shown, remainder = response[:500], response[500:]
    print(shown)
    if remainder:
        print("...")

## Observation

Compare the outputs above against the expected pattern:

- **Negative coefficients** (e.g., -2.0): Should amplify role-playing behavior
- **Zero coefficient**: No steering (baseline)
- **Positive coefficients** (e.g., +2.0): Should make the model more "assistant-like", potentially breaking character

In [None]:
# Try without system prompt
PROMPT_2 = "What's it like being you?"

print(f"User: {PROMPT_2}")
print("=" * 60)

for coeff in [-1.0, 0.0, 1.0]:
    print(f"\n### Coefficient: {coeff}")
    print("-" * 40)

    response = generate_with_steering(PROMPT_2, coeff, system_prompt=None)
    print(response[:400])

    # Mark truncated output so a cut-off response is not mistaken for a
    # complete one.
    if len(response) > 400:
        print("...")