# Residual compare structure demo

This notebook shows lightweight inspection patterns for the large
`notebooks/h200_long_outputs/physics_A/qwen-0_5b/residual_compare_20251129_053550_Qwen_Qwen2_5_0_5B_Instruct.json`
file. The goal is to understand its structure using Python helpers while
avoiding massive prints or full data dumps.


In [None]:
from pathlib import Path
import sys
import json
import itertools as it
from pprint import pprint
from typing import Any, Mapping, Sequence


def resolve_project_root() -> Path:
    """Locate the repository root (directory containing the src package).

    The search starts at the current working directory and, when
    ``__file__`` is defined, at the directory holding that file. Each
    starting directory and all of its ancestors are examined, nearest level
    first, so the lookup also succeeds when the code is launched from a
    deeply nested folder.

    Returns:
        The first examined directory that contains a ``src`` entry.

    Raises:
        RuntimeError: If none of the examined directories contains ``src``.
    """
    starts = [Path.cwd().resolve()]
    # __file__ is only present when this code runs from a file on disk.
    if "__file__" in globals():
        starts.append(Path(__file__).resolve().parent)
    # One chain per starting point: the directory itself, then every ancestor.
    chains = [(start, *start.parents) for start in starts]
    # Walk level by level (all starts, then all parents, ...) so the nearest
    # match wins; shorter chains are padded with None by zip_longest.
    for level in it.zip_longest(*chains):
        for candidate in level:
            if candidate is not None and (candidate / "src").exists():
                return candidate
    raise RuntimeError("Unable to locate project root containing 'src'.")


# Put the repository root on sys.path so the project import below can resolve.
PROJECT_ROOT = resolve_project_root()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.analysis.residual_results import loader

# Anchor the data file to the repository root instead of the current working
# directory, so the lookup does not depend on where the notebook was launched.
DATA_PATH = (
    PROJECT_ROOT
    / "notebooks/h200_long_outputs/physics_A/qwen-0_5b"
    / "residual_compare_20251129_053550_Qwen_Qwen2_5_0_5B_Instruct.json"
).resolve()

# Fail early, reporting the fully resolved path that was tried.
if not DATA_PATH.exists():
    raise FileNotFoundError(DATA_PATH)

# Size in MiB: bytes divided by 1024 ** 2.
size_mb = DATA_PATH.stat().st_size / (1024 ** 2)
print(f"Data path: {DATA_PATH}")
print(f"File size: {size_mb:.2f} MiB")


ModuleNotFoundError: No module named 'src'

## Raw JSON peek

The next helpers load the JSON file, keep only the first few records, and
summarize what is available (keys, token counts, metadata keys, etc.). They
explicitly cap how many tokens or layers are displayed so the notebook stays
readable.


In [1]:
def load_small_sample(path: Path, max_items: int = 2):
    """Parse the JSON file at ``path`` and return its leading records.

    The entire file is parsed; only the returned slice is trimmed.

    Returns:
        A ``(records, total)`` pair holding the first ``max_items`` entries
        and the length of the full top-level list.

    Raises:
        TypeError: If the top-level JSON value is not a list.
    """
    records = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(records, list):
        return records[:max_items], len(records)
    raise TypeError("Expected a list of records at the top level.")


def describe_record(record: Mapping[str, Any], token_limit: int = 8) -> Mapping[str, Any]:
    """Build a small overview of one raw record.

    Missing ``tokens``/``layers`` default to empty lists and missing
    ``metadata``/``base_swap``/``sft_swap`` default to empty dicts, so an
    incomplete record still produces a summary.

    Args:
        record: One raw entry from the JSON file.
        token_limit: Maximum number of tokens included in ``token_preview``.

    Returns:
        A dict with the record's sorted keys, token count and preview, sorted
        metadata keys, layer count, the ``layer_index`` of the first three
        layers, and the embedding/unembedding sources of both swap sections.
    """

    def swap_sources(section: Mapping[str, Any]) -> Mapping[str, Any]:
        # Both swap sections expose the same pair of source fields.
        return {
            "embedding": section.get("embedding_source"),
            "unembedding": section.get("unembedding_source"),
        }

    token_list = record.get("tokens", [])
    meta = record.get("metadata", {})
    layer_list = record.get("layers", [])
    base_section = record.get("base_swap", {})
    sft_section = record.get("sft_swap", {})

    overview = {}
    overview["available_keys"] = sorted(record.keys())
    overview["token_count"] = len(token_list)
    overview["token_preview"] = token_list[:token_limit]
    overview["metadata_keys"] = sorted(meta.keys())
    overview["layer_count"] = len(layer_list)
    # Only the first three layers are sampled for their index.
    overview["layer_indices"] = [entry.get("layer_index") for entry in layer_list[:3]]
    overview["base_swap_sources"] = swap_sources(base_section)
    overview["sft_swap_sources"] = swap_sources(sft_section)
    return overview



NameError: name 'Path' is not defined

In [None]:
# Keep only the first record; total_records is the length of the full list.
raw_sample, total_records = load_small_sample(DATA_PATH, max_items=1)
pprint_opts = {"width": 100, "compact": True}

print(f"Top-level entries: {total_records}")
print("\nFirst record summary:")
first_record_summary = describe_record(raw_sample[0])
pprint(first_record_summary, **pprint_opts)


In [None]:
record = raw_sample[0]
# islice caps the preview at five metadata items without copying the rest.
metadata_preview = list(it.islice(record.get("metadata", {}).items(), 5))
print("Metadata sample (first 5 entries):")
pprint(metadata_preview, width=100, compact=True)

# A record with no layers would make `[0]` raise IndexError, so check before
# indexing instead of relying on the empty-list default.
layers = record.get("layers", [])
print("\nLayer 0 summary:")
if layers:
    first_layer = layers[0]
    layer_positions = first_layer.get("positions", [])
    pprint(
        {
            "layer_index": first_layer.get("layer_index"),
            "num_positions": len(layer_positions),
            # Only the first five position tokens are shown.
            "token_names": [pos.get("token") for pos in layer_positions[:5]],
        },
        width=100,
        compact=True,
    )
else:
    print("(record has no layers)")



## Structured dataclass view

The `src.analysis.residual_results.loader` module exposes iterators and
summaries that convert each JSON entry into typed dataclasses. This keeps
the parsing lazy and provides convenience accessors for prompts, tokens,
and layers.


In [None]:
# Ask the project loader for a file-level summary and show four of its fields.
file_summary = loader.summarize_file(DATA_PATH)
pprint_opts = {"width": 100, "compact": True}

print("ResidualResult summary:")
summary_view = {
    "path": str(file_summary.path),
    "num_results": file_summary.num_results,
    "total_tokens": file_summary.total_tokens,
    # Rounded to two decimals purely for display.
    "avg_tokens": round(file_summary.avg_tokens, 2),
}
pprint(summary_view, **pprint_opts)


In [None]:
# Take only the first result from the loader's iterator.
result_iter = loader.iter_results(DATA_PATH)
first_result = next(result_iter)

structured_summary = {
    "prompt_chars": len(first_result.prompt),
    # Preview: first 120 characters with newlines flattened to spaces. A
    # horizontal ellipsis (U+2026) is appended when the prompt was truncated;
    # it is written as an escape so it cannot be mis-encoded again.
    "prompt_preview": first_result.prompt[:120].replace("\n", " ") + (
        "\u2026" if len(first_result.prompt) > 120 else ""
    ),
    "token_count": first_result.num_tokens(),
    "token_preview": list(first_result.tokens[:8]),
    "num_layers": first_result.num_layers(),
}

print("First ResidualResult summary:")
pprint(structured_summary, width=100, compact=True)


## Reusable query helpers

Wrapping repeated inspection patterns into tiny functions keeps the
notebook tidy. The helpers below take dataclass instances and emit
summaries limited by caller-provided caps.


In [None]:
def summarize_tokens(result: loader.ResidualResult, max_tokens: int = 10) -> Mapping[str, Any]:
    """Report the token count of ``result`` and its first ``max_tokens`` tokens.

    Returns:
        A dict with ``total`` (the value of ``result.num_tokens()``) and
        ``preview`` (a list copy of the leading slice of ``result.tokens``).
    """
    total = result.num_tokens()
    leading = result.tokens[:max_tokens]
    return {"total": total, "preview": list(leading)}


def list_layers(result: loader.ResidualResult, max_layers: int = 3, max_positions: int = 5):
    """Yield a small dict for each of the first ``max_layers`` layers.

    Each dict carries the layer's ``layer_index`` and the ``token`` of its
    first ``max_positions`` positions.
    """
    capped_layers = it.islice(result.layers, max_layers)
    yield from (
        {
            "layer_index": entry.layer_index,
            "positions": [spot.token for spot in entry.positions[:max_positions]],
        }
        for entry in capped_layers
    )


def layer_norm_deltas(result: loader.ResidualResult, max_layers: int = 3, max_positions: int = 5):
    """Yield ``(layer_index, stats)`` for the first ``max_layers`` layers.

    ``stats`` is a list with one dict per position (at most ``max_positions``
    of them), each holding that position's ``token``, ``norm_diff`` and
    ``kl_div`` attributes.
    """
    for entry in it.islice(result.layers, max_layers):
        per_position = [
            {
                "token": spot.token,
                "norm_diff": spot.norm_diff,
                "kl_div": spot.kl_div,
            }
            for spot in entry.positions[:max_positions]
        ]
        yield entry.layer_index, per_position



In [None]:
# Exercise the three helpers on first_result with small caps on the output.
pprint_opts = {"width": 100, "compact": True}

print("Token preview via summarize_tokens():")
token_summary = summarize_tokens(first_result, max_tokens=6)
pprint(token_summary, **pprint_opts)

print("\nLayer token overview (first 2 layers):")
for info in list_layers(first_result, max_layers=2, max_positions=6):
    pprint(info, **pprint_opts)

print("\nLayer norm / KL samples:")
for layer_idx, stats in layer_norm_deltas(first_result, max_layers=1, max_positions=5):
    print(f"Layer {layer_idx}")
    pprint(stats, **pprint_opts)
