In [1]:
"""
Decompose steering direction into SAE features.

For every direction version (V1, V3) and every layer in 1-24, find which
SAE features (from llama_scope) are most aligned and most anti-aligned with
the self/other MMS steering direction at that layer.
"""
import json
import os

import numpy as np
import torch
from sae_lens import SAE


In [2]:
# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
TOP_K = 20
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

VERSIONS = ["V1", "V3"]
LAYERS = range(1, 25)
SAE_RELEASE = "llama_scope_lxr_8x"  # r = residual stream
# Root folder holding one sub-folder per direction version. The default is the
# original location; set SAD_DIRECTIONS_DIR to run on another machine.
DIRECTIONS_DIR = os.environ.get(
    "SAD_DIRECTIONS_DIR",
    "/home/snfiel01/projects/sad_implementation/directions/llama3.1_8b_base_instruct_directions",
)
EPS = 1e-12  # guards the normalisations below against zero-norm vectors


def _feature_url(layer, feature_id):
    """Return the Neuronpedia page URL for one SAE feature of the given layer."""
    return f"https://www.neuronpedia.org/llama3.1-8b/{layer}-llamascope-res-32k/{feature_id}"


def _top_features(cosine_sims, k, layer, aligned):
    """Return the k most aligned (or anti-aligned) features as JSON-ready dicts.

    Args:
        cosine_sims: 1-D tensor with one similarity score per SAE feature.
        k: number of features to return (clamped to the number of features).
        layer: layer index, used only to build the Neuronpedia URL.
        aligned: True selects the largest scores, False the most negative ones.

    Returns:
        A list of ``{"feature_id", "cosine_sim", "url"}`` dicts, ordered from
        the most extreme score to the least extreme one.
    """
    k = min(k, cosine_sims.numel())
    ranking_scores = cosine_sims if aligned else -cosine_sims
    indices = torch.topk(ranking_scores, k).indices
    values = cosine_sims[indices]  # report the signed similarity, not the ranking key
    return [
        {"feature_id": idx, "cosine_sim": sim, "url": _feature_url(layer, idx)}
        for idx, sim in zip(indices.tolist(), values.tolist())
    ]


def _print_feature_table(title_lines, records):
    """Print a ranked table of feature records under the given title lines."""
    print(f"\n{'='*60}")
    for title in title_lines:
        print(title)
    print(f"{'='*60}")
    print(f"{'Rank':<6}{'Feature ID':<12}{'Cosine Sim':<12}{'Neuronpedia URL'}")
    print("-" * 60)
    for rank, rec in enumerate(records, start=1):
        print(f"{rank:<6}{rec['feature_id']:<12}{rec['cosine_sim']:<12.4f}{rec['url']}")


# Each version's direction file covers all layers, so parse it once up front
# instead of once per (version, layer) pair.
direction_paths = {}
direction_files = {}
for version in VERSIONS:
    direction_paths[version] = os.path.join(DIRECTIONS_DIR, version, "mms_balanced_shared.json")
    print(f"Loading steering directions for version {version}...")
    with open(direction_paths[version]) as f:
        direction_files[version] = json.load(f)

# The SAE depends only on the layer, so the layer loop is the outer one: each
# SAE is loaded once and reused for every direction version.
for layer in LAYERS:
    print(f"\nLoading SAE for layer {layer} residual stream...")

    # NOTE(review): the saved cell output shows a warning pointing at this call;
    # newer sae_lens releases may prefer a different loading API. Confirm
    # against the installed version before changing the unpacking.
    sae, cfg_dict, _sparsity = SAE.from_pretrained(
        release=SAE_RELEASE,
        sae_id=f"l{layer}r_8x",  # residual-stream SAE of the current layer
        device=DEVICE,
    )
    print(f"SAE config: {cfg_dict.get('d_sae', 'unknown')} features")

    # Decoder weights map features -> residual stream: (n_features, d_model).
    # Cast to float32 so the matmul with the direction never mixes dtypes.
    W_dec = sae.W_dec.detach().to(torch.float32)
    print(f"Decoder shape: {W_dec.shape}")

    # Unit-normalise every decoder row so a dot product is a cosine similarity.
    W_dec_normalized = W_dec / W_dec.norm(dim=1, keepdim=True).clamp_min(EPS)

    for version in VERSIONS:
        print(f"Processing layer {layer} for version {version}")
        DIRECTION_PATH = direction_paths[version]
        data = direction_files[version]
        output_path = f"outputs/{version}/sae_decomposition_layer{layer}.json"

        direction = torch.tensor(
            data["shared_directions"][str(layer)], dtype=torch.float32, device=DEVICE
        )
        direction_norm = direction.norm()
        print(f"Direction shape: {direction.shape}, norm: {direction_norm.item():.4f}")
        # Normalise the direction as well; otherwise the scores below are only
        # cosine similarities when the stored vector happens to be unit-norm.
        direction = direction / direction_norm.clamp_min(EPS)

        # direction: (d_model,), W_dec_normalized: (n_features, d_model)
        cosine_sims = W_dec_normalized @ direction

        cos_min = cosine_sims.min().item()
        cos_max = cosine_sims.max().item()
        cos_mean = cosine_sims.mean().item()
        cos_std = cosine_sims.std().item()

        print(f"\nCosine similarity stats:")
        print(f"  Min: {cos_min:.4f}")
        print(f"  Max: {cos_max:.4f}")
        print(f"  Mean: {cos_mean:.4f}")
        print(f"  Std: {cos_std:.4f}")

        # Positive alignment = same direction as "self"; negative = opposite.
        top_self_aligned = _top_features(cosine_sims, TOP_K, layer, aligned=True)
        top_other_aligned = _top_features(cosine_sims, TOP_K, layer, aligned=False)

        _print_feature_table(
            [f"TOP {TOP_K} FEATURES ALIGNED WITH SELF-DIRECTION (Layer {layer})"],
            top_self_aligned,
        )
        _print_feature_table(
            [
                f"TOP {TOP_K} FEATURES ALIGNED WITH OTHER-DIRECTION (layer {layer})",
                f"(Negative alignment = opposite to self)",
            ],
            top_other_aligned,
        )

        results = {
            "layer": layer,
            "direction_source": DIRECTION_PATH,
            "sae_release": SAE_RELEASE,
            "cosine_sim_stats": {
                "min": cos_min,
                "max": cos_max,
                "mean": cos_mean,
                "std": cos_std,
            },
            "top_self_aligned": top_self_aligned,
            "top_other_aligned": top_other_aligned,
        }

        # Create outputs/<version>/ on demand so a fresh checkout does not fail.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\nResults saved to {output_path}")



Using device: cpu
Processing layer 1 for version V1

Loading SAE for layer 1 residual stream...
Loading steering direction...
Direction shape: torch.Size([4096]), norm: 1.0000

Loading SAE for layer 1 residual stream...


  sae, cfg_dict, sparsity = SAE.from_pretrained(


SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.2728
  Max: 0.2614
  Mean: 0.0024
  Std: 0.0162

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 1)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     16858       0.2614      https://www.neuronpedia.org/llama3.1-8b/1-llamascope-res-32k/16858
2     27791       0.2129      https://www.neuronpedia.org/llama3.1-8b/1-llamascope-res-32k/27791
3     24736       0.1926      https://www.neuronpedia.org/llama3.1-8b/1-llamascope-res-32k/24736
4     13320       0.1785      https://www.neuronpedia.org/llama3.1-8b/1-llamascope-res-32k/13320
5     14476       0.1718      https://www.neuronpedia.org/llama3.1-8b/1-llamascope-res-32k/14476
6     7191        0.1567      https://www.neuronpedia.org/llama3.1-8b/1-llamascope-res-32k/7191
7     11674       0.1552      https://www.neuronpedia.org/llama3.1-8b/1-llamascope-res-32k/11674
8     10

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L9R-8x/checkpoints/fina(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1016
  Max: 0.2620
  Mean: 0.0026
  Std: 0.0185

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 9)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     2529        0.2620      https://www.neuronpedia.org/llama3.1-8b/9-llamascope-res-32k/2529
2     31971       0.1512      https://www.neuronpedia.org/llama3.1-8b/9-llamascope-res-32k/31971
3     16789       0.1304      https://www.neuronpedia.org/llama3.1-8b/9-llamascope-res-32k/16789
4     3255        0.1177      https://www.neuronpedia.org/llama3.1-8b/9-llamascope-res-32k/3255
5     10615       0.1076      https://www.neuronpedia.org/llama3.1-8b/9-llamascope-res-32k/10615
6     18868       0.0956      https://www.neuronpedia.org/llama3.1-8b/9-llamascope-res-32k/18868
7     25664       0.0952      https://www.neuronpedia.org/llama3.1-8b/9-llamascope-res-32k/25664
8     330

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L10R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1856
  Max: 0.1960
  Mean: 0.0052
  Std: 0.0188

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 10)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     31907       0.1960      https://www.neuronpedia.org/llama3.1-8b/10-llamascope-res-32k/31907
2     10750       0.1879      https://www.neuronpedia.org/llama3.1-8b/10-llamascope-res-32k/10750
3     8925        0.1245      https://www.neuronpedia.org/llama3.1-8b/10-llamascope-res-32k/8925
4     18806       0.1240      https://www.neuronpedia.org/llama3.1-8b/10-llamascope-res-32k/18806
5     8784        0.1212      https://www.neuronpedia.org/llama3.1-8b/10-llamascope-res-32k/8784
6     3564        0.1186      https://www.neuronpedia.org/llama3.1-8b/10-llamascope-res-32k/3564
7     14433       0.1184      https://www.neuronpedia.org/llama3.1-8b/10-llamascope-res-32k/14433
8 

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L11R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1490
  Max: 0.1497
  Mean: 0.0042
  Std: 0.0189

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 11)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     7549        0.1497      https://www.neuronpedia.org/llama3.1-8b/11-llamascope-res-32k/7549
2     13437       0.1319      https://www.neuronpedia.org/llama3.1-8b/11-llamascope-res-32k/13437
3     23477       0.1264      https://www.neuronpedia.org/llama3.1-8b/11-llamascope-res-32k/23477
4     22508       0.1151      https://www.neuronpedia.org/llama3.1-8b/11-llamascope-res-32k/22508
5     25843       0.1114      https://www.neuronpedia.org/llama3.1-8b/11-llamascope-res-32k/25843
6     31817       0.1074      https://www.neuronpedia.org/llama3.1-8b/11-llamascope-res-32k/31817
7     27335       0.1073      https://www.neuronpedia.org/llama3.1-8b/11-llamascope-res-32k/27335


hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L12R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.2587
  Max: 0.2747
  Mean: 0.0069
  Std: 0.0203

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 12)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     604         0.2747      https://www.neuronpedia.org/llama3.1-8b/12-llamascope-res-32k/604
2     13579       0.1659      https://www.neuronpedia.org/llama3.1-8b/12-llamascope-res-32k/13579
3     1184        0.1590      https://www.neuronpedia.org/llama3.1-8b/12-llamascope-res-32k/1184
4     23610       0.1506      https://www.neuronpedia.org/llama3.1-8b/12-llamascope-res-32k/23610
5     5157        0.1422      https://www.neuronpedia.org/llama3.1-8b/12-llamascope-res-32k/5157
6     6649        0.1408      https://www.neuronpedia.org/llama3.1-8b/12-llamascope-res-32k/6649
7     7334        0.1395      https://www.neuronpedia.org/llama3.1-8b/12-llamascope-res-32k/7334
8    

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L13R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1330
  Max: 0.1722
  Mean: 0.0029
  Std: 0.0198

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 13)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     18574       0.1722      https://www.neuronpedia.org/llama3.1-8b/13-llamascope-res-32k/18574
2     25290       0.1637      https://www.neuronpedia.org/llama3.1-8b/13-llamascope-res-32k/25290
3     27222       0.1535      https://www.neuronpedia.org/llama3.1-8b/13-llamascope-res-32k/27222
4     27502       0.1467      https://www.neuronpedia.org/llama3.1-8b/13-llamascope-res-32k/27502
5     13174       0.1328      https://www.neuronpedia.org/llama3.1-8b/13-llamascope-res-32k/13174
6     30540       0.1324      https://www.neuronpedia.org/llama3.1-8b/13-llamascope-res-32k/30540
7     31844       0.1301      https://www.neuronpedia.org/llama3.1-8b/13-llamascope-res-32k/31844

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L14R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.2159
  Max: 0.2253
  Mean: 0.0043
  Std: 0.0201

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 14)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     11029       0.2253      https://www.neuronpedia.org/llama3.1-8b/14-llamascope-res-32k/11029
2     31519       0.1519      https://www.neuronpedia.org/llama3.1-8b/14-llamascope-res-32k/31519
3     20102       0.1398      https://www.neuronpedia.org/llama3.1-8b/14-llamascope-res-32k/20102
4     9400        0.1247      https://www.neuronpedia.org/llama3.1-8b/14-llamascope-res-32k/9400
5     8331        0.1196      https://www.neuronpedia.org/llama3.1-8b/14-llamascope-res-32k/8331
6     5198        0.1161      https://www.neuronpedia.org/llama3.1-8b/14-llamascope-res-32k/5198
7     3954        0.1070      https://www.neuronpedia.org/llama3.1-8b/14-llamascope-res-32k/3954
8  

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L15R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1553
  Max: 0.1511
  Mean: 0.0013
  Std: 0.0203

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 15)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     29184       0.1511      https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-32k/29184
2     27979       0.1346      https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-32k/27979
3     4577        0.1335      https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-32k/4577
4     11205       0.1241      https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-32k/11205
5     22610       0.1229      https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-32k/22610
6     6056        0.1176      https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-32k/6056
7     26550       0.1175      https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-32k/26550
8

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L16R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1397
  Max: 0.1343
  Mean: 0.0010
  Std: 0.0193

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 16)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     21971       0.1343      https://www.neuronpedia.org/llama3.1-8b/16-llamascope-res-32k/21971
2     21189       0.1315      https://www.neuronpedia.org/llama3.1-8b/16-llamascope-res-32k/21189
3     5830        0.1294      https://www.neuronpedia.org/llama3.1-8b/16-llamascope-res-32k/5830
4     12049       0.1164      https://www.neuronpedia.org/llama3.1-8b/16-llamascope-res-32k/12049
5     24204       0.1039      https://www.neuronpedia.org/llama3.1-8b/16-llamascope-res-32k/24204
6     692         0.1015      https://www.neuronpedia.org/llama3.1-8b/16-llamascope-res-32k/692
7     11625       0.0999      https://www.neuronpedia.org/llama3.1-8b/16-llamascope-res-32k/11625
8 

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L17R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1982
  Max: 0.1614
  Mean: 0.0014
  Std: 0.0183

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 17)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     15837       0.1614      https://www.neuronpedia.org/llama3.1-8b/17-llamascope-res-32k/15837
2     12679       0.1216      https://www.neuronpedia.org/llama3.1-8b/17-llamascope-res-32k/12679
3     20537       0.1207      https://www.neuronpedia.org/llama3.1-8b/17-llamascope-res-32k/20537
4     10781       0.1192      https://www.neuronpedia.org/llama3.1-8b/17-llamascope-res-32k/10781
5     19935       0.1132      https://www.neuronpedia.org/llama3.1-8b/17-llamascope-res-32k/19935
6     18715       0.1104      https://www.neuronpedia.org/llama3.1-8b/17-llamascope-res-32k/18715
7     25028       0.1052      https://www.neuronpedia.org/llama3.1-8b/17-llamascope-res-32k/25028

hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L18R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1756
  Max: 0.2141
  Mean: -0.0007
  Std: 0.0180

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 18)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     22498       0.2141      https://www.neuronpedia.org/llama3.1-8b/18-llamascope-res-32k/22498
2     22144       0.1990      https://www.neuronpedia.org/llama3.1-8b/18-llamascope-res-32k/22144
3     10894       0.1582      https://www.neuronpedia.org/llama3.1-8b/18-llamascope-res-32k/10894
4     25075       0.1439      https://www.neuronpedia.org/llama3.1-8b/18-llamascope-res-32k/25075
5     7239        0.1420      https://www.neuronpedia.org/llama3.1-8b/18-llamascope-res-32k/7239
6     1036        0.1412      https://www.neuronpedia.org/llama3.1-8b/18-llamascope-res-32k/1036
7     12518       0.1279      https://www.neuronpedia.org/llama3.1-8b/18-llamascope-res-32k/12518


hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L19R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1527
  Max: 0.2309
  Mean: -0.0008
  Std: 0.0181

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 19)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     13082       0.2309      https://www.neuronpedia.org/llama3.1-8b/19-llamascope-res-32k/13082
2     24322       0.1386      https://www.neuronpedia.org/llama3.1-8b/19-llamascope-res-32k/24322
3     1975        0.1374      https://www.neuronpedia.org/llama3.1-8b/19-llamascope-res-32k/1975
4     6375        0.1342      https://www.neuronpedia.org/llama3.1-8b/19-llamascope-res-32k/6375
5     30717       0.1309      https://www.neuronpedia.org/llama3.1-8b/19-llamascope-res-32k/30717
6     24620       0.1259      https://www.neuronpedia.org/llama3.1-8b/19-llamascope-res-32k/24620
7     11462       0.1172      https://www.neuronpedia.org/llama3.1-8b/19-llamascope-res-32k/11462


hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L20R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1480
  Max: 0.2085
  Mean: -0.0011
  Std: 0.0182

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 20)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     10468       0.2085      https://www.neuronpedia.org/llama3.1-8b/20-llamascope-res-32k/10468
2     21333       0.1717      https://www.neuronpedia.org/llama3.1-8b/20-llamascope-res-32k/21333
3     26888       0.1527      https://www.neuronpedia.org/llama3.1-8b/20-llamascope-res-32k/26888
4     4162        0.1471      https://www.neuronpedia.org/llama3.1-8b/20-llamascope-res-32k/4162
5     7185        0.1450      https://www.neuronpedia.org/llama3.1-8b/20-llamascope-res-32k/7185
6     25888       0.1448      https://www.neuronpedia.org/llama3.1-8b/20-llamascope-res-32k/25888
7     17217       0.1414      https://www.neuronpedia.org/llama3.1-8b/20-llamascope-res-32k/17217


hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L22R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1704
  Max: 0.2555
  Mean: -0.0009
  Std: 0.0186

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 22)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     24115       0.2555      https://www.neuronpedia.org/llama3.1-8b/22-llamascope-res-32k/24115
2     24290       0.1926      https://www.neuronpedia.org/llama3.1-8b/22-llamascope-res-32k/24290
3     9722        0.1591      https://www.neuronpedia.org/llama3.1-8b/22-llamascope-res-32k/9722
4     25821       0.1500      https://www.neuronpedia.org/llama3.1-8b/22-llamascope-res-32k/25821
5     19069       0.1450      https://www.neuronpedia.org/llama3.1-8b/22-llamascope-res-32k/19069
6     2584        0.1417      https://www.neuronpedia.org/llama3.1-8b/22-llamascope-res-32k/2584
7     12970       0.1406      https://www.neuronpedia.org/llama3.1-8b/22-llamascope-res-32k/12970


hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L23R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1526
  Max: 0.2444
  Mean: -0.0025
  Std: 0.0186

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 23)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     30357       0.2444      https://www.neuronpedia.org/llama3.1-8b/23-llamascope-res-32k/30357
2     16250       0.1609      https://www.neuronpedia.org/llama3.1-8b/23-llamascope-res-32k/16250
3     3384        0.1400      https://www.neuronpedia.org/llama3.1-8b/23-llamascope-res-32k/3384
4     25046       0.1299      https://www.neuronpedia.org/llama3.1-8b/23-llamascope-res-32k/25046
5     25161       0.1273      https://www.neuronpedia.org/llama3.1-8b/23-llamascope-res-32k/25161
6     3540        0.1231      https://www.neuronpedia.org/llama3.1-8b/23-llamascope-res-32k/3540
7     25381       0.1129      https://www.neuronpedia.org/llama3.1-8b/23-llamascope-res-32k/25381


hyperparams.json: 0.00B [00:00, ?B/s]

Llama3_1-8B-Base-L24R-8x/checkpoints/fin(…):   0%|          | 0.00/537M [00:00<?, ?B/s]

SAE config: 32768 features
Decoder shape: torch.Size([32768, 4096])

Cosine similarity stats:
  Min: -0.1591
  Max: 0.2475
  Mean: -0.0032
  Std: 0.0183

TOP 20 FEATURES ALIGNED WITH SELF-DIRECTION (Layer 24)
Rank  Feature ID  Cosine Sim  Neuronpedia URL
------------------------------------------------------------
1     2962        0.2475      https://www.neuronpedia.org/llama3.1-8b/24-llamascope-res-32k/2962
2     8222        0.1663      https://www.neuronpedia.org/llama3.1-8b/24-llamascope-res-32k/8222
3     24854       0.1435      https://www.neuronpedia.org/llama3.1-8b/24-llamascope-res-32k/24854
4     22583       0.1344      https://www.neuronpedia.org/llama3.1-8b/24-llamascope-res-32k/22583
5     1731        0.1223      https://www.neuronpedia.org/llama3.1-8b/24-llamascope-res-32k/1731
6     26371       0.1149      https://www.neuronpedia.org/llama3.1-8b/24-llamascope-res-32k/26371
7     32734       0.1140      https://www.neuronpedia.org/llama3.1-8b/24-llamascope-res-32k/32734
8