# Multi-Scale Analysis End-to-End Test

End-to-end test for multi-scale analysis integrated with the full Delphi pipeline.

This runs a complete pipeline (cache -> construct -> explain -> score) and then
performs multi-scale analysis on the cached activations.

## Imports

In [1]:
import asyncio
import time
from pathlib import Path

import torch

from delphi.__main__ import run
from delphi.config import (
    CacheConfig,
    ConstructorConfig,
    MultiScaleConfig,
    RunConfig,
    SamplerConfig,
)
from delphi.utils import base_path_cfg_aware
from delphi.latents import LatentDataset
from delphi.latents.latents import ActivationData
from delphi.latents.loader import TensorBuffer
from delphi.latents.multi_scale_analysis import compare_scales, summarize_multi_scale
from delphi.latents.multi_scale_constructors import multi_scale_constructor
from delphi.log.result_analysis import get_agg_metrics, load_data

INFO 11-04 16:23:23 [__init__.py:216] Automatically detected platform cuda.


## Configuration

In [2]:
# Activation-cache settings: which corpus is read and how much of it is cached.
cache_cfg = CacheConfig(
    # Corpus selection
    dataset_repo="EleutherAI/fineweb-edu-dedup-10b",
    dataset_column="text",
    dataset_split="train[:5%]",
    # Cache geometry; the context length is large enough to test multiple scales
    cache_ctx_len=512,
    batch_size=8,
    n_tokens=2_500_000,
    n_splits=5,
)

In [3]:
# Sampler settings: both the train and the test split use "quantiles" sampling.
sampler_cfg = SamplerConfig(
    n_quantiles=10,
    # Train split
    train_type="quantiles",
    n_examples_train=40,
    # Test split
    test_type="quantiles",
    n_examples_test=50,
)

In [4]:
# Example-constructor settings.
# NOTE(review): min_examples=90 equals n_examples_train (40) + n_examples_test (50)
# from the sampler cell — presumably intentional; confirm.
constructor_cfg = ConstructorConfig(
    example_ctx_len=32,
    min_examples=90,
    # Non-activating (contrastive) examples
    n_non_activating=50,
    non_activating_source="random",
    # FAISS embedding cache
    faiss_embedding_cache_dir=".embedding_cache",
    faiss_embedding_cache_enabled=True,
)

In [5]:
# Configure multi-scale analysis.
# `context_sizes` is not passed here, so the MultiScaleConfig default applies; the
# analysis cell reads it back through `multi_scale_cfg.context_sizes`.
multi_scale_cfg = MultiScaleConfig(
    # Example override: context_sizes=[16, 32, 64, 128]
    # NOTE(review): the sizes presumably must all divide cache_ctx_len, which is 512
    # in this notebook (an earlier version of this note said 256) — confirm.
    n_examples_per_scale=50,
    min_examples=10,  # the analysis cell also skips latents with fewer activations than this
    variance_threshold=0.1,
)

In [6]:
# Hookpoints under test, sampled across the depth of the model.
# pythia-160m has 12 layers (0-11): every second layer is probed, plus the final one.
layer_indices = (0, 2, 4, 6, 8, 10, 11)
hookpoints_to_test = [f"layers.{layer}.mlp" for layer in layer_indices]

# Per-hookpoint results, keyed by hookpoint name and filled in by the cells below.
all_hookpoint_results = dict()

## Run Pipeline for Each Hookpoint

In [None]:
# Loop through each hookpoint
for hookpoint in hookpoints_to_test:
    print(f"\n{'='*60}")
    print(f"Testing hookpoint: {hookpoint}")
    print(f"{'='*60}\n")
    
    # Configure run for this hookpoint
    run_cfg = RunConfig(
        name=f"test_multi_scale_{hookpoint.replace('.', '_')}",
        overwrite=["cache", "scores"],
        model="EleutherAI/pythia-160m",
        sparse_model="EleutherAI/sae-pythia-160m-32k",
        hookpoints=[hookpoint],
        explainer_model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
        explainer_model_max_len=4208,
        max_latents=100,
        seed=22,
        num_gpus=torch.cuda.device_count(),
        filter_bos=True,
        verbose=False,
        sampler_cfg=sampler_cfg,
        constructor_cfg=constructor_cfg,
        cache_cfg=cache_cfg,
        multi_scale_cfg=multi_scale_cfg,
    )
    
    # Run the full pipeline
    print("Running full Delphi pipeline...")
    start_time = time.time()
    await run(run_cfg)
    base_path = base_path_cfg_aware(run_cfg)
    pipeline_time = time.time() - start_time
    print(f"Pipeline completed in {pipeline_time:.2f} seconds")
    
    # Store the run config and base path for later analysis
    all_hookpoint_results[hookpoint] = {
        "run_cfg": run_cfg,
        "base_path": base_path,
        "pipeline_time": pipeline_time,
    }


Testing hookpoint: layers.0.mlp

Running full Delphi pipeline...


`torch_dtype` is deprecated! Use `dtype` instead!


Fetching 50 files:   0%|          | 0/50 [00:00<?, ?it/s]

Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}


Resolving path for hookpoint: layers.0.mlp
Overwriting results from /root/delphi/tests/results/test_multi_scale_layers_0_mlp/latents


Resolving data files:   0%|          | 0/97 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/97 [00:00<?, ?it/s]

Caching latents: 100%|██████████| 610/610 [00:10<00:00, 55.46it/s, Total Tokens=2,498,560]


Skipping neighbour creation
Overwriting results from /root/delphi/tests/results/test_multi_scale_layers_0_mlp/scores
INFO 11-04 16:24:04 [utils.py:233] non-default args: {'max_model_len': 4208, 'enable_prefix_caching': True, 'disable_log_stats': True, 'model': 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4'}
INFO 11-04 16:24:05 [model.py:547] Resolved architecture: LlamaForCausalLM
INFO 11-04 16:24:05 [model.py:1510] Using max model len 4208
INFO 11-04 16:24:05 [awq_marlin.py:119] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-04 16:24:05 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 11-04 16:24:09 [__init__.py:216] Automatically detected platform cuda.
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:11 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:11 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='hugg

Loading safetensors checkpoint shards:   0% Completed | 0/9 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  11% Completed | 1/9 [00:00<00:04,  1.88it/s]
Loading safetensors checkpoint shards:  22% Completed | 2/9 [00:01<00:04,  1.64it/s]
Loading safetensors checkpoint shards:  33% Completed | 3/9 [00:01<00:03,  1.58it/s]
Loading safetensors checkpoint shards:  44% Completed | 4/9 [00:02<00:03,  1.55it/s]
Loading safetensors checkpoint shards:  56% Completed | 5/9 [00:03<00:02,  1.53it/s]
Loading safetensors checkpoint shards:  67% Completed | 6/9 [00:03<00:01,  1.52it/s]
Loading safetensors checkpoint shards:  78% Completed | 7/9 [00:04<00:01,  1.51it/s]
Loading safetensors checkpoint shards:  89% Completed | 8/9 [00:05<00:00,  1.64it/s]
Loading safetensors checkpoint shards: 100% Completed | 9/9 [00:05<00:00,  1.95it/s]
Loading safetensors checkpoint shards: 100% Completed | 9/9 [00:05<00:00,  1.69it/s]
[1;36m(EngineCore_DP0 pid=10608)[0;0m 


[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:19 [default_loader.py:267] Loading weights took 5.38 seconds
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:22 [gpu_model_runner.py:2653] Model loading took 37.0899 GiB and 9.263342 seconds
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:33 [backends.py:548] Using cache directory: /root/.cache/vllm/torch_compile_cache/0bb9f9cc9a/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:33 [backends.py:559] Dynamo bytecode transform time: 11.09 s
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:38 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.325 s
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:45 [monitor.py:34] torch.compile takes 11.09 s in total
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:24:47 [gpu_worker.py:298] Available KV cache memory: 83.63 GiB
[1;36m(EngineCore_DP0 pid=10608)[0;0m I

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:10<00:00,  6.40it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 67/67 [00:10<00:00,  6.49it/s]


[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:25:09 [gpu_model_runner.py:3480] Graph capturing finished in 22 secs, took 1.69 GiB
[1;36m(EngineCore_DP0 pid=10608)[0;0m INFO 11-04 16:25:10 [core.py:210] init engine (profile, create kv cache, warmup model) took 47.89 seconds
INFO 11-04 16:25:11 [llm.py:306] Supported_tasks: ['generate']


Processing items: 0it [00:00, ?it/s]Not enough examples to explain the latent: 44
Not enough examples to explain the latent: 16
Processing items: 2it [01:47, 44.76s/it] Not enough examples to explain the latent: 64
Processing items: 46it [05:18,  5.48s/it]Not enough examples to explain the latent: 32
Processing items: 68it [07:06,  5.11s/it]Not enough examples to explain the latent: 88
Processing items: 95it [07:46,  4.91s/it]


Pipeline completed in 559.66 seconds

Testing hookpoint: layers.2.mlp

Running full Delphi pipeline...


Fetching 50 files:   0%|          | 0/50 [00:00<?, ?it/s]

Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}
Dropping extra args {'signed': False}


Resolving path for hookpoint: layers.2.mlp
Overwriting results from /root/delphi/tests/results/test_multi_scale_layers_2_mlp/latents


Resolving data files:   0%|          | 0/97 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/97 [00:00<?, ?it/s]

Caching latents: 100%|██████████| 610/610 [00:10<00:00, 59.11it/s, Total Tokens=2,498,560]


Skipping neighbour creation
Overwriting results from /root/delphi/tests/results/test_multi_scale_layers_2_mlp/scores
INFO 11-04 16:33:20 [utils.py:233] non-default args: {'max_model_len': 4208, 'enable_prefix_caching': True, 'disable_log_stats': True, 'model': 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4'}
INFO 11-04 16:33:21 [model.py:547] Resolved architecture: LlamaForCausalLM
INFO 11-04 16:33:21 [model.py:1510] Using max model len 4208
INFO 11-04 16:33:21 [awq_marlin.py:119] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-04 16:33:21 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 11-04 16:33:25 [__init__.py:216] Automatically detected platform cuda.
[1;36m(EngineCore_DP0 pid=15529)[0;0m INFO 11-04 16:33:27 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=15529)[0;0m INFO 11-04 16:33:27 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='hugg

[1;36m(EngineCore_DP0 pid=15529)[0;0m Process EngineCore_DP0:
[1;36m(EngineCore_DP0 pid=15529)[0;0m Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=15529)[0;0m   File "/root/.local/share/uv/python/cpython-3.10.19-linux-x86_64-gnu/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
[1;36m(EngineCore_DP0 pid=15529)[0;0m     self.run()
[1;36m(EngineCore_DP0 pid=15529)[0;0m   File "/root/.local/share/uv/python/cpython-3.10.19-linux-x86_64-gnu/lib/python3.10/multiprocessing/process.py", line 108, in run
[1;36m(EngineCore_DP0 pid=15529)[0;0m     self._target(*self._args, **self._kwargs)
[1;36m(EngineCore_DP0 pid=15529)[0;0m   File "/root/delphi/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
[1;36m(EngineCore_DP0 pid=15529)[0;0m     raise e
[1;36m(EngineCore_DP0 pid=15529)[0;0m   File "/root/delphi/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
[1;36m(EngineCore_

## Multi-Scale Analysis for Each Hookpoint

In [None]:
# Multi-scale analysis of the cached activations, one hookpoint at a time.
# At most `n_latents_to_analyze` latents are recorded per hookpoint.
n_latents_to_analyze = 50  # Analyze more latents for better statistics

for hookpoint, result_data in all_hookpoint_results.items():
    print(f"\n{'='*60}")
    print(f"Multi-scale analysis for hookpoint: {hookpoint}")
    print(f"{'='*60}\n")

    run_cfg = result_data["run_cfg"]
    base_path = result_data["base_path"]

    start_time = time.time()
    latents_path = base_path / "latents"

    # LatentDataset reads the cached data written under <base_path>/latents.
    dataset = LatentDataset(
        raw_dir=latents_path,
        sampler_cfg=sampler_cfg,
        constructor_cfg=constructor_cfg,
    )

    tokens = dataset.load_tokens()
    print(f"Loaded tokens: shape {tokens.shape}")

    multi_scale_results = []

    for buffer in dataset.buffers:
        # Stop before loading another buffer once the quota is reached.
        if len(multi_scale_results) >= n_latents_to_analyze:
            break

        latent_ids, per_latent_locations, per_latent_activations = (
            buffer.load_data_per_latent()
        )

        for latent_idx, locations, activations in zip(
            latent_ids, per_latent_locations, per_latent_activations
        ):
            if len(multi_scale_results) >= n_latents_to_analyze:
                break

            activation_data = ActivationData(locations, activations)

            # Skip latents with fewer activations than the configured minimum.
            if len(activation_data.activations) < multi_scale_cfg.min_examples:
                print(f"NOT ENOUGH ACTIVATION DATA in latent_idx {latent_idx}")
                continue

            multi_scale_data = multi_scale_constructor(
                activation_data=activation_data,
                tokens=tokens,
                context_sizes=multi_scale_cfg.context_sizes,
                cache_ctx_len=cache_cfg.cache_ctx_len,
                n_examples_per_scale=multi_scale_cfg.n_examples_per_scale,
                min_examples=multi_scale_cfg.min_examples,
            )

            # Only latents with examples at two or more context sizes are recorded.
            non_empty_scales = [
                ctx for ctx in multi_scale_cfg.context_sizes if multi_scale_data[ctx]
            ]
            if len(non_empty_scales) < 2:
                continue

            # `comparison` is not used further in this cell; the call is kept so
            # that compare_scales is still run for every recorded latent.
            comparison = compare_scales(multi_scale_data)
            summary = summarize_multi_scale(multi_scale_data)

            record = {"latent_idx": int(latent_idx)}
            for key in (
                "scale_type",
                "dominant_scale",
                "activation_variance",
                "max_growth_ratio",
                "max_correlation",
            ):
                record[key] = summary[key]
            multi_scale_results.append(record)

    multi_scale_time = time.time() - start_time
    print(f"Multi-scale analysis completed in {multi_scale_time:.2f} seconds")
    print(f"Analyzed {len(multi_scale_results)} latents")

    # Attach the results and timing to this hookpoint's entry for the cells below.
    result_data["multi_scale_results"] = multi_scale_results
    result_data["multi_scale_time"] = multi_scale_time

## Compare Hookpoints by Scale Type Distribution

In [None]:
# Analyze which hookpoints are better at detecting longer-scale features
import pandas as pd
from collections import Counter

# Scale type hierarchy: token < phrase < sentence < paragraph
scale_hierarchy = {
    "token": 1,
    "phrase": 2,
    "sentence": 3,
    "paragraph": 4,
    "unknown": 0,
}

# Column schema of the statistics table. It is passed to the DataFrame
# constructor so the columns exist even when no hookpoint produced results;
# without it the sort below raised KeyError on an empty table.
stats_columns = [
    "hookpoint",
    "n_latents",
    "avg_dominant_scale",
    "long_features_pct",
    "avg_scale_score",
    "token_count",
    "phrase_count",
    "sentence_count",
    "paragraph_count",
    "unknown_count",
    "avg_activation_variance",
    "avg_growth_ratio",
    "avg_correlation",
]


def summarize_hookpoint(hookpoint, multi_scale_results):
    """Aggregate the per-latent multi-scale records of one hookpoint.

    Args:
        hookpoint: Hookpoint name, stored in the "hookpoint" field.
        multi_scale_results: Non-empty list of per-latent dicts with the keys
            "scale_type", "dominant_scale", "activation_variance",
            "max_growth_ratio" and "max_correlation".

    Returns:
        A dict with one entry per name in `stats_columns`.

    Raises:
        KeyError: If a record's scale type is not in `scale_hierarchy`.
    """
    n_latents = len(multi_scale_results)

    def mean_of(key):
        return sum(r[key] for r in multi_scale_results) / n_latents

    scale_types = [r["scale_type"] for r in multi_scale_results]
    scale_counter = Counter(scale_types)

    # "Long" features are those classified as sentence or paragraph scale.
    long_features = scale_counter["sentence"] + scale_counter["paragraph"]

    return {
        "hookpoint": hookpoint,
        "n_latents": n_latents,
        "avg_dominant_scale": mean_of("dominant_scale"),
        "long_features_pct": 100 * long_features / n_latents,
        "avg_scale_score": sum(scale_hierarchy[st] for st in scale_types) / n_latents,
        "token_count": scale_counter.get("token", 0),
        "phrase_count": scale_counter.get("phrase", 0),
        "sentence_count": scale_counter.get("sentence", 0),
        "paragraph_count": scale_counter.get("paragraph", 0),
        "unknown_count": scale_counter.get("unknown", 0),
        "avg_activation_variance": mean_of("activation_variance"),
        "avg_growth_ratio": mean_of("max_growth_ratio"),
        "avg_correlation": mean_of("max_correlation"),
    }


# Collect statistics for each hookpoint
hookpoint_stats = []

for hookpoint, result_data in all_hookpoint_results.items():
    # .get() mirrors the validation cell: a hookpoint whose analysis never ran
    # is treated the same as one that produced no results.
    multi_scale_results = result_data.get("multi_scale_results", [])

    if not multi_scale_results:
        print(f"Warning: No multi-scale results for {hookpoint}")
        continue

    hookpoint_stats.append(summarize_hookpoint(hookpoint, multi_scale_results))

# Create DataFrame for easier analysis, ranked by share of long features
stats_df = pd.DataFrame(hookpoint_stats, columns=stats_columns)
stats_df = stats_df.sort_values("long_features_pct", ascending=False)

print("\n" + "="*80)
print("HOOKPOINT COMPARISON: BETTER FOR LONGER FEATURES")
print("="*80)
print("\nRanked by percentage of sentence/paragraph features:\n")
print(stats_df[["hookpoint", "n_latents", "long_features_pct", "avg_dominant_scale", "avg_scale_score"]].to_string(index=False))

print("\n" + "="*80)
print("DETAILED SCALE TYPE DISTRIBUTION")
print("="*80 + "\n")
print(stats_df[["hookpoint", "token_count", "phrase_count", "sentence_count", "paragraph_count", "unknown_count"]].to_string(index=False))

print("\n" + "="*80)
print("ADDITIONAL METRICS")
print("="*80 + "\n")
print(stats_df[["hookpoint", "avg_activation_variance", "avg_growth_ratio", "avg_correlation"]].to_string(index=False))

## Visualize Results Across Hookpoints

In [None]:
# Create visualizations: a 2x2 figure comparing hookpoints, in the order of
# `stats_df` (sorted by long-feature percentage in the statistics cell).
import matplotlib.pyplot as plt
import numpy as np

# Set up the plotting style
plt.style.use('seaborn-v0_8-darkgrid')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

hookpoints_ordered = stats_df["hookpoint"].values
x_pos = np.arange(len(hookpoints_ordered))
# Short tick labels, e.g. "layers.4.mlp" -> "L4.mlp"
short_labels = [h.replace('layers.', 'L') for h in hookpoints_ordered]

# Plot 1: Stacked bar chart of scale type distribution
ax1 = axes[0, 0]
stack_segments = [
    ("Token", "token_count", "#e74c3c"),
    ("Phrase", "phrase_count", "#f39c12"),
    ("Sentence", "sentence_count", "#3498db"),
    ("Paragraph", "paragraph_count", "#2ecc71"),
]
# Each segment's height is its own count and its bottom is the running total of
# the segments below it. (The sentence segment previously used phrase + sentence
# as its height, which over-counted and overlapped the paragraph segment.)
stack_bottom = np.zeros(len(hookpoints_ordered))
for segment_label, count_column, segment_color in stack_segments:
    segment_heights = stats_df[count_column].values
    ax1.bar(x_pos, segment_heights, bottom=stack_bottom,
            label=segment_label, color=segment_color)
    stack_bottom = stack_bottom + segment_heights

ax1.set_xlabel('Hookpoint', fontsize=12, fontweight='bold')
ax1.set_ylabel('Number of Latents', fontsize=12, fontweight='bold')
ax1.set_title('Scale Type Distribution Across Hookpoints', fontsize=14, fontweight='bold')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(short_labels, rotation=45)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Percentage of long features (sentence + paragraph)
ax2 = axes[0, 1]
long_pct = stats_df["long_features_pct"].values
# Green when more than half of the latents are long features, red otherwise
colors = ['#2ecc71' if pct > 50 else '#e74c3c' for pct in long_pct]
ax2.barh(x_pos, long_pct, color=colors)
ax2.set_yticks(x_pos)
ax2.set_yticklabels(short_labels)
ax2.set_xlabel('Long Features (%)', fontsize=12, fontweight='bold')
ax2.set_title('Percentage of Sentence/Paragraph Features\n(Higher = Better for Long Features)',
              fontsize=14, fontweight='bold')
ax2.axvline(x=50, color='gray', linestyle='--', linewidth=2, alpha=0.5)
ax2.grid(True, alpha=0.3, axis='x')

# Add percentage labels just right of each bar
for row_pos, pct in enumerate(long_pct):
    ax2.text(pct + 1, row_pos, f'{pct:.1f}%', va='center', fontsize=10)

# Plot 3: Average dominant scale
ax3 = axes[1, 0]
ax3.plot(x_pos, stats_df["avg_dominant_scale"].values,
         marker='o', linewidth=2, markersize=8, color='#3498db')
ax3.set_xticks(x_pos)
ax3.set_xticklabels(short_labels, rotation=45)
ax3.set_xlabel('Hookpoint', fontsize=12, fontweight='bold')
ax3.set_ylabel('Average Dominant Scale (tokens)', fontsize=12, fontweight='bold')
ax3.set_title('Average Context Size of Dominant Scale', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3)

# Plot 4: Average scale score (weighted by hierarchy)
ax4 = axes[1, 1]
ax4.plot(x_pos, stats_df["avg_scale_score"].values,
         marker='s', linewidth=2, markersize=8, color='#9b59b6')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(short_labels, rotation=45)
ax4.set_xlabel('Hookpoint', fontsize=12, fontweight='bold')
ax4.set_ylabel('Average Scale Score', fontsize=12, fontweight='bold')
ax4.set_title('Weighted Scale Score\n(1=token, 2=phrase, 3=sentence, 4=paragraph)',
              fontsize=14, fontweight='bold')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('hookpoint_scale_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nVisualization saved as 'hookpoint_scale_analysis.png'")

## Summary: Best Hookpoints for Long Features

In [None]:
# Print final summary. Relies on `stats_df` being sorted by long_features_pct in
# descending order, as done in the statistics cell.
print("\n" + "="*80)
print("FINAL SUMMARY: BEST HOOKPOINTS FOR LONG FEATURES")
print("="*80 + "\n")

# Top 3 hookpoints by long features percentage
top_3 = stats_df.head(3)

print("TOP 3 HOOKPOINTS (by % of sentence/paragraph features):\n")
for _, row in top_3.iterrows():
    print(f"{row['hookpoint']}:")
    print(f"  - Long features: {row['long_features_pct']:.1f}%")
    print(f"  - Avg dominant scale: {row['avg_dominant_scale']:.1f} tokens")
    print(f"  - Scale score: {row['avg_scale_score']:.2f}")
    print(f"  - Distribution: {row['sentence_count']} sentence, {row['paragraph_count']} paragraph, " +
          f"{row['phrase_count']} phrase, {row['token_count']} token")
    print()

# Layer analysis: bucket hookpoints by layer index.
# The patterns are raw strings so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
hookpoint_names = stats_df["hookpoint"].str
early_layers = stats_df[hookpoint_names.contains(r"layers\.[0-3]\.")]
middle_layers = stats_df[hookpoint_names.contains(r"layers\.[4-7]\.")]
late_layers = stats_df[hookpoint_names.contains(r"layers\.[8-9]\.") |
                       hookpoint_names.contains(r"layers\.1[0-1]\.")]

# The depth comparison is only printed when every bucket has at least one hookpoint.
if len(early_layers) > 0 and len(middle_layers) > 0 and len(late_layers) > 0:
    print("\n" + "-"*80)
    print("LAYER DEPTH ANALYSIS:")
    print("-"*80 + "\n")
    print(f"Early layers (0-3): avg long features = {early_layers['long_features_pct'].mean():.1f}%")
    print(f"Middle layers (4-7): avg long features = {middle_layers['long_features_pct'].mean():.1f}%")
    print(f"Late layers (8-11): avg long features = {late_layers['long_features_pct'].mean():.1f}%")
    print()

# Key insights
print("\n" + "="*80)
print("KEY INSIGHTS:")
print("="*80 + "\n")

# First and last rows of the sorted table are the best and worst hookpoints.
best_hookpoint = stats_df.iloc[0]
worst_hookpoint = stats_df.iloc[-1]

print(f"✓ BEST: {best_hookpoint['hookpoint']} has {best_hookpoint['long_features_pct']:.1f}% long features")
print(f"✗ WORST: {worst_hookpoint['hookpoint']} has {worst_hookpoint['long_features_pct']:.1f}% long features")
print(f"📊 DIFFERENCE: {best_hookpoint['long_features_pct'] - worst_hookpoint['long_features_pct']:.1f} percentage points")

# Correlation insights
high_variance_hp = stats_df.nlargest(1, "avg_activation_variance").iloc[0]
high_growth_hp = stats_df.nlargest(1, "avg_growth_ratio").iloc[0]

print(f"\n🔥 HIGHEST activation variance: {high_variance_hp['hookpoint']} ({high_variance_hp['avg_activation_variance']:.4f})")
print(f"📈 HIGHEST growth ratio: {high_growth_hp['hookpoint']} ({high_growth_hp['avg_growth_ratio']:.4f})")

print("\n" + "="*80)

## Validation: Basic Checks

In [None]:
# Basic validation checks on the state built by the previous cells. Any failed
# assertion stops the cell before the "all passed" banner is printed.
print("\n" + "="*80)
print("VALIDATION CHECKS")
print("="*80 + "\n")

# Check that every requested hookpoint has an entry (i.e. its pipeline run finished)
assert len(all_hookpoint_results) == len(hookpoints_to_test), \
    f"Expected {len(hookpoints_to_test)} hookpoints, got {len(all_hookpoint_results)}"
print(f"✓ All {len(hookpoints_to_test)} hookpoints were successfully analyzed")

# Report (without asserting) how many hookpoints produced multi-scale results
hookpoints_with_results = sum(
    1 for data in all_hookpoint_results.values()
    if len(data.get("multi_scale_results", [])) > 0
)
print(f"✓ {hookpoints_with_results}/{len(hookpoints_to_test)} hookpoints have multi-scale results")

# Verify DataFrame was created correctly
assert len(stats_df) > 0, "Statistics DataFrame is empty"
print(f"✓ Statistics DataFrame created with {len(stats_df)} rows")

# Verify all expected columns exist, including the metric columns that the
# summary cell reads.
expected_columns = [
    "hookpoint", "n_latents", "avg_dominant_scale", "long_features_pct",
    "avg_scale_score", "token_count", "phrase_count", "sentence_count",
    "paragraph_count", "unknown_count",
    "avg_activation_variance", "avg_growth_ratio", "avg_correlation",
]
for col in expected_columns:
    assert col in stats_df.columns, f"Missing column: {col}"
print("✓ All expected columns present in statistics")

print("\n" + "="*80)
print("✅ ALL VALIDATION CHECKS PASSED!")
print("="*80)

# Calculate total time across all hookpoints
total_pipeline_time = sum(data["pipeline_time"] for data in all_hookpoint_results.values())
total_analysis_time = sum(data["multi_scale_time"] for data in all_hookpoint_results.values())
print(f"\nTotal pipeline time: {total_pipeline_time:.2f} seconds")
print(f"Total analysis time: {total_analysis_time:.2f} seconds")
print(f"Grand total: {total_pipeline_time + total_analysis_time:.2f} seconds")