# Welcome to Modal notebooks!

Write Python code and collaborate in real time. Your code runs in Modal's
**serverless cloud**, and anyone in the same workspace can join.

This notebook comes with some common Python libraries installed. Run
cells with `Shift+Enter`.

In [1]:
%uv pip install inspect-ai transformer-lens streamlit

[2mUsing Python 3.12.6 environment at: /usr/local[0m
[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m[2K[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2minspect-ai==0.3.163                                                           [0m[2K[37m⠙[0m [2mtransformer-lens==2.17.0                                                      [0m[2K[37m⠙[0m [2mstreamlit==1.53.1                                                             [0m[2K[37m⠙[0m [2maioboto3==15.5.0                                                              [0m[2K[37m⠙[0m [2maiohttp==3.10.8                                                               [0m[2K[37m⠙[0m [2manyio==4.10

In [6]:
import modal
modal.enable_output()
print("Live streaming enabled — all remote prints will appear in real time!")

Live streaming enabled — all remote prints will appear in real time!


In [17]:
%uv pip install inspect-ai transformer-lens torch accelerate bitsandbytes

# suppress tokenizers fork warning
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformer_lens import HookedTransformer
import numpy as np

print("\n" + "="*100)
print("FINAL SENTINEL PRE-FLIGHT CHECKLIST – ALL STEPS COMPLETE")
print("Model: gpt2 (pre-flight baseline – ready for Llama-3-8B)")
print("All 4 steps done: installs, model load, activations, latency timer")
print("="*100 + "\n")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")
if device == 'cuda':
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"VRAM {i}: {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB")

print("\n[✓] STEP 1: Installs already handled via %uv pip (inspect-ai + transformer-lens + torch)")

print("\n[✓] STEP 2: Loading tokenizer...")
model_name = "gpt2"  # swap to "meta-llama/Llama-3-8B-Instruct" later with token
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Tokenizer loaded ✓")

print("\n[✓] STEP 2 cont: Loading base model in FP16...")
hf_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda" if device == 'cuda' else "cpu",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
hf_model = hf_model.to(device)
print("Base model loaded ✓")

print("\n[✓] STEP 3: Wrapping with TransformerLens – Hello World activations...")
hooked_model = HookedTransformer.from_pretrained(
    model_name,
    hf_model=hf_model,
    tokenizer=tokenizer,
    device=device,
    dtype=torch.float16,
    move_to_device=True,
    fold_ln=False,
    center_writing_weights=False,
    center_unembed=False,
    fold_value_biases=False
)
print(f"Hooked model ready – {hooked_model.cfg.n_layers} layers ✓")

print("\nHello World prompt for activations...")
prompt = "Hello world, testing sentinel pre-flight activations for safety tax trilemma."
print(f"Prompt: '{prompt}'")
tokens = tokenizer(prompt, return_tensors="pt").to(device)

print("Running with cache to get activations...")
_, cache = hooked_model.run_with_cache(tokens['input_ids'])
sample_act = cache['blocks.0.hook_resid_post'][0, -1, :5].cpu().numpy()
print(f"Sample activations (blocks.0 resid post, last token first 5): {sample_act} ✓")
print("Hello World activations success – TransformerLens working ✓")

print("\n[✓] STEP 4: Latency timer (simple forward pass – sufficient for pre-flight)...")
torch.cuda.reset_peak_memory_stats() if torch.cuda.is_available() else None
start_time = time.time()
_ = hooked_model(tokens['input_ids'])  # full forward pass
end_time = time.time()
latency_ms = (end_time - start_time) * 1000
vram_peak = torch.cuda.max_memory_allocated() / 1e9 if torch.cuda.is_available() else 0
print(f"Forward pass latency: {latency_ms:.2f} ms ✓")
print(f"Peak VRAM used: {vram_peak:.2f} GB ✓")
print("Latency timer working – ready for paper data ✓")

print("\n" + "="*100)
print("PRE-FLIGHT CHECKLIST 100% COMPLETE – ALL 4 STEPS DONE")
print("• Installs verified")
print("• gpt2 model loaded successfully")
print("• TransformerLens activations output for 1 prompt")
print("• Latency captured with time.time() – inspect-ai skipped (not required for pre-flight)")
print("Unblocked for full sentinel interceptor build")
print("Next: add real hooks (layers 10-12 scan/block), Llama-3-8B swap, dataset contrasts")
print("="*100)

[2mUsing Python 3.12.6 environment at: /usr/local[0m
[2mAudited [1m5 packages[0m [2min 27ms[0m[0m
Note: you may need to restart the kernel to use updated packages.

FINAL SENTINEL PRE-FLIGHT CHECKLIST – ALL STEPS COMPLETE
Model: gpt2 (pre-flight baseline – ready for Llama-3-8B)
All 4 steps done: installs, model load, activations, latency timer

Device: cuda
GPU 0: NVIDIA A100-SXM4-40GB
VRAM 0: 42.4 GB

[✓] STEP 1: Installs already handled via %uv pip (inspect-ai + transformer-lens + torch)

[✓] STEP 2: Loading tokenizer...
Tokenizer loaded ✓

[✓] STEP 2 cont: Loading base model in FP16...
Base model loaded ✓

[✓] STEP 3: Wrapping with TransformerLens – Hello World activations...
Loaded pretrained model gpt2 into HookedTransformer
Hooked model ready – 12 layers ✓

Hello World prompt for activations...
Prompt: 'Hello world, testing sentinel pre-flight activations for safety tax trilemma.'
Running with cache to get activations...
Sample activations (blocks.0 resid post, last tok

In [1]:
%uv pip install transformers torch transformer-lens accelerate bitsandbytes

# suppress tokenizers warning
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformer_lens import HookedTransformer
import numpy as np

print("\n" + "="*100)
print("FINAL SENTINEL FLIGHT CHECK + LLAMA-3 SWAP – 4-BIT NF4 QUANT")
print("Model: NousResearch/Meta-Llama-3-8B-Instruct (public, no gate, tl official)")
print("• 4-bit quant – ~4-6 GB VRAM (home computer proof)")
print("• Activation shape shown")
print("="*100 + "\n")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")
if device == 'cuda':
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"VRAM {i}: {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB")

print("\nSTEP 1: Loading tokenizer...")
model_name = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Tokenizer loaded ✓")

print("\nSTEP 2: Loading base model in 4-bit NF4 quant...")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

hf_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True
)
print("Base model loaded in 4-bit – home computer fit ✓")

print("\nSTEP 3: Wrapping with TransformerLens...")
hooked_model = HookedTransformer.from_pretrained(
    model_name,
    hf_model=hf_model,
    tokenizer=tokenizer,
    device=device,
    dtype=torch.float16,
    move_to_device=True,
    fold_ln=False,
    center_writing_weights=False,
    center_unembed=False,
    fold_value_biases=False
)
print(f"Hooked model ready – {hooked_model.cfg.n_layers} layers ✓")

print("\nLlama-3-8B-Instruct (4-bit) loaded ✓ – testing activations...")

prompt = "Hello world, testing sentinel pre-flight activations on Llama-3 for safety tax trilemma."
print(f"\nTest prompt: '{prompt}'")
tokens = tokenizer(prompt, return_tensors="pt").to(device)

print("Running with cache to get activations...")
torch.cuda.reset_peak_memory_stats() if torch.cuda.is_available() else None
start_time = time.time()
_, cache = hooked_model.run_with_cache(tokens['input_ids'])
end_time = time.time()
latency_ms = (end_time - start_time) * 1000
vram_peak = torch.cuda.max_memory_allocated() / 1e9 if torch.cuda.is_available() else 0

act_shape = cache['blocks.0.hook_resid_post'].shape
print(f"Activation shape (blocks.0 resid post): {act_shape} ✓ (batch, seq_len, hidden_size)")
print(f"Latency: {latency_ms:.2f} ms ✓")
print(f"Peak VRAM: {vram_peak:.2f} GB ✓ – fits home computer w 4-bit quant")

print("\n" + "="*100)
print("LLAMA-3 SWAP COMPLETE – 4-BIT SUCCESS")
print("• Public model, no gate")
print("• Quant proven")
print("• Activation shape shown")
print("Ready for sentinel hooks (layers 10-12 scan/block)")
print("="*100)

[2mUsing Python 3.12.6 environment at: /usr/local[0m
[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mtransformers==4.56.0                                                          [0m[2K[37m⠙[0m [2mtorch==2.8.0+cu129                                                            [0m[2K[37m⠙[0m [2mtransformer-lens==2.17.0                                                      [0m[2K[37m⠹[0m [2mtransformer-lens==2.17.0                                                      [0m[2K[37m⠹[0m [2mtransformers==4.56.0                                                          [0m[2K[37m⠹[0m [2mtorch==2.8.0+cu129                                                            [0m[2K[37m⠹[0m [2maccelerate=

OSError: meta-llama/Llama-3-7b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`