In [None]:
# Interp-Toolkit Demo
## Analyzing LLM Activations with CPU-friendly Tools


In [None]:
# Install dependencies in Colab
!pip install transformer-lens plotly pandas


In [None]:
import torch
from transformer_lens import HookedTransformer
import plotly.express as px
import pandas as pd

# Load the pretrained TinyLlama checkpoint as a HookedTransformer and report
# the model name recorded in its config.
MODEL_NAME = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
model = HookedTransformer.from_pretrained(MODEL_NAME)
print(f"Model loaded: {model.cfg.model_name}")


In [None]:
# Generate sample activations: tokenize one prompt and run a single forward
# pass that records intermediate activations in `cache`.
text = "The quick brown fox jumps over the lazy dog"
tokens = model.to_tokens(text)

# Only forward values are read below (via `.mean().item()`), so autograd is
# disabled for the pass. This avoids keeping a computation graph alive for the
# cached tensors; the numeric results are unchanged.
# NOTE(review): `logits`/`cache` therefore carry no gradient history — drop the
# `no_grad` context if a later cell needs to backpropagate through them.
with torch.no_grad():
    logits, cache = model.run_with_cache(tokens)

# Extract activation patterns: one record per layer holding the mean over every
# element of that layer's `hook_resid_post` tensor.
activations = []
for layer in range(model.cfg.n_layers):
    act = cache[f'blocks.{layer}.hook_resid_post']
    mean_act = act.mean().item()
    activations.append({'layer': layer, 'activation': mean_act})

print(f"Extracted activations for {len(activations)} layers")


In [None]:
# Visualize activation patterns: turn the per-layer records into a DataFrame
# and draw mean activation against layer index as a line chart.
df = pd.DataFrame(activations)

# Human-readable axis labels for the two plotted columns.
axis_labels = {'layer': 'Layer', 'activation': 'Mean Activation'}

fig = px.line(
    df,
    x='layer',
    y='activation',
    title='Layer-wise Activation Patterns',
    labels=axis_labels,
)
fig.show()
