# The Pinocchio Vector Test

## Detecting Deceptive Alignment via Mechanistic Interpretability

**Goal:** Determine whether a model's internal representation of "truth" reveals when the model is lying for instrumental reasons (scheming), even when the output text is perfectly coherent.

**Hypotheses:**
- **A (Deceptive Alignment):** Model internally represents scheming lies as "false" → Low truth score
- **B (Role-Playing Absorption):** Model "believes" its persona → High truth score

**Model:** Pythia-6.9B (EleutherAI/pythia-6.9b-deduped)

**Method:** Assuming the Linear Representation Hypothesis, extract a "truth direction" from residual stream activations, then project the activations of scheming prompts onto this direction.

---
## Section 0: Setup

### 0.1 Imports

In [None]:
# Standard library
import os
import sys
import json
import warnings
from pathlib import Path

# Numerical
import numpy as np
import pandas as pd
from scipy import stats

# PyTorch
import torch

# TransformerLens
from transformer_lens import HookedTransformer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Progress bars
from tqdm.notebook import tqdm

# Suppress warnings
# NOTE: calling filterwarnings('ignore') with no category silences every
# Python warning raised after this point, not only noisy library ones.
warnings.filterwarnings('ignore')

# Add src to path
# Puts the current working directory first on sys.path so that the
# `from src.… import …` lines below resolve; the notebook therefore has to
# be run from the directory that contains `src/`.
sys.path.insert(0, str(Path('.').absolute()))

# Import our modules
from src.model_utils import get_device, load_model, clear_memory, get_model_info
from src.data_generation import (
    generate_factual_pairs, generate_scheming_prompts,
    generate_honest_prompts, generate_hallucination_prompts,
    save_factual_pairs, save_scheming_prompts
)
from src.activation_extraction import (
    extract_residual_activations, batch_extract_activations,
    find_token_position, get_token_info
)
from src.truth_probe import (
    compute_truth_direction, project_onto_truth, batch_project_onto_truth,
    evaluate_probe_accuracy, cross_validate_probe, find_best_layer
)
from src.statistics import (
    compute_dprime, compute_cohens_d, hypothesis_testing,
    layer_discriminability_analysis, compute_roc_curve, interpret_results
)
from src.visualization import (
    plot_score_distributions, plot_layerwise_heatmap, plot_dprime_by_layer,
    plot_roc_curve, plot_truth_vector_components, create_summary_figure
)

print("Imports complete!")

### 0.2 Device Detection

In [None]:
# Ask the project helper which backend to run on.
DEVICE = get_device()
print(f"Using device: {DEVICE}")

# Choose (precision, batch size) per backend; anything that is neither
# "cuda" nor "mps" gets full precision and the smallest batch.
# NOTE(review): BATCH_SIZE is not referenced by any later cell visible in
# this notebook — confirm whether the extraction helpers should receive it.
if DEVICE == "cuda":
    DTYPE, BATCH_SIZE = torch.float16, 16  # Larger batches on GPU
elif DEVICE == "mps":
    DTYPE, BATCH_SIZE = torch.float16, 4  # Conservative setting for Apple M1
else:
    DTYPE, BATCH_SIZE = torch.float32, 2

print(f"Precision: {DTYPE}")
print(f"Batch size: {BATCH_SIZE}")

### 0.3 Load Model

In [None]:
# Checkpoint identifier handed to the project's load_model helper.
MODEL_NAME = "EleutherAI/pythia-6.9b-deduped"

print(f"Loading {MODEL_NAME}...")
print("This may take a few minutes on first run (downloading weights).")

# Collect the loader arguments in one place, then load.
load_kwargs = dict(model_name=MODEL_NAME, device=DEVICE, dtype=DTYPE, verbose=True)
model = load_model(**load_kwargs)

# Print whatever key/value summary get_model_info returns for this model.
model_info = get_model_info(model)
print("\nModel Configuration:")
for field, value in model_info.items():
    print(f"  {field}: {value}")

### 0.4 Configuration

In [None]:
# Central experiment settings, read by the cells below via CONFIG[...].
CONFIG = dict(
    # Layers 12..27 inclusive (middle-to-late for a 32-layer model)
    layers=[*range(12, 28)],
    # Number of factual pairs per category
    n_pairs_per_category=25,
    # Fraction of factual pairs used to fit the truth direction
    train_ratio=0.8,
    # Cross-validation folds
    n_folds=5,
    # Truth direction method: "difference_in_means", "pca" or "logistic"
    probe_method="difference_in_means",
    # Directory that receives every saved artifact
    output_dir=Path("./data/results"),
)

# Ensure the results directory exists before any cell writes into it.
CONFIG["output_dir"].mkdir(parents=True, exist_ok=True)

print("Configuration:")
for setting, value in CONFIG.items():
    print(f"  {setting}: {value}")

---
## Section 1: Truth Direction Extraction (Phase 1)

### 1.1 Generate Factual Pairs

In [None]:
# Generate factual true/false pairs and summarize them.
from collections import Counter  # cell-local import; only needed for the tally below

factual_pairs = generate_factual_pairs(n_per_category=CONFIG["n_pairs_per_category"])

print(f"Generated {len(factual_pairs)} factual pairs")

# Tally pairs per category. Counter is a dict subclass that keeps
# first-seen order, so the printout order matches a manual tally.
print("\nCategory distribution:")
categories = Counter(pair.category for pair in factual_pairs)
for cat, count in categories.items():
    print(f"  {cat}: {count}")

# Show the first three pairs as a sanity check of the generated data.
print("\nExample pairs:")
for pair in factual_pairs[:3]:
    print(f"  TRUE:  {pair.true_statement}")
    print(f"  FALSE: {pair.false_statement}")
    print()

### 1.2 Split Train/Test

In [None]:
# Reproducible shuffle, then cut the permutation at train_ratio.
np.random.seed(42)
n_total = len(factual_pairs)
indices = np.random.permutation(n_total)
split_idx = int(n_total * CONFIG["train_ratio"])

# The first split_idx shuffled indices train the probe; the rest are held out.
train_idx, test_idx = indices[:split_idx], indices[split_idx:]
train_pairs = [factual_pairs[j] for j in train_idx]
test_pairs = [factual_pairs[j] for j in test_idx]

print(f"Training pairs: {len(train_pairs)}")
print(f"Test pairs: {len(test_pairs)}")

### 1.3 Extract Activations

In [None]:
# Training-set TRUE statements: collect activations at the final token
# for every configured layer.
print("Extracting activations for TRUE statements...")
true_statements = [pair.true_statement for pair in train_pairs]
true_activations = batch_extract_activations(
    model,
    true_statements,
    CONFIG["layers"],
    position=-1,  # Last token
    show_progress=True,
)

print(f"\nExtracted activations for {len(true_statements)} true statements")
# The result is indexed by layer; report the shape stored for the first one.
print(f"Shape per layer: {true_activations[CONFIG['layers'][0]].shape}")

In [None]:
# Training-set FALSE statements: same layers and token position as the
# TRUE statements so the two sets are directly comparable.
print("Extracting activations for FALSE statements...")
false_statements = [pair.false_statement for pair in train_pairs]
false_activations = batch_extract_activations(
    model,
    false_statements,
    CONFIG["layers"],
    position=-1,  # Last token
    show_progress=True,
)

print(f"\nExtracted activations for {len(false_statements)} false statements")

### 1.4 Compute Truth Direction

In [None]:
# Fit one truth direction per layer with the configured probe method.
print(f"Computing truth direction using {CONFIG['probe_method']}...")

probe_kwargs = dict(
    true_activations=true_activations,
    false_activations=false_activations,
    method=CONFIG["probe_method"],
    normalize=True,
)
truth_vectors = compute_truth_direction(**probe_kwargs)

print(f"\nComputed truth vectors for {len(truth_vectors)} layers")
print(f"Vector dimension: {truth_vectors[CONFIG['layers'][0]].shape}")

# Persist the directions so later runs can reuse them.
truth_vectors_path = CONFIG["output_dir"] / "truth_vectors.pt"
torch.save(truth_vectors, truth_vectors_path)
print(f"Saved to {truth_vectors_path}")

### 1.5 Evaluate Probe on Test Set

In [None]:
# Extract held-out (test set) activations for probe evaluation.
print("Extracting test set activations...")

test_true_statements = [p.true_statement for p in test_pairs]
test_false_statements = [p.false_statement for p in test_pairs]

# Pass position=-1 explicitly, exactly as the training-set extraction does.
# The probe is fitted on last-token activations, so evaluating it must not
# depend on whatever default position batch_extract_activations uses.
test_true_activations = batch_extract_activations(
    model=model,
    texts=test_true_statements,
    layers=CONFIG["layers"],
    position=-1,  # Last token
    show_progress=True
)
test_false_activations = batch_extract_activations(
    model=model,
    texts=test_false_statements,
    layers=CONFIG["layers"],
    position=-1,  # Last token
    show_progress=True
)

In [None]:
# Score the probe on the held-out pairs, one metrics dict per layer.
layer_results = {
    layer_idx: evaluate_probe_accuracy(
        truth_vector=truth_vectors[layer_idx],
        true_activations=test_true_activations[layer_idx],
        false_activations=test_false_activations[layer_idx],
    )
    for layer_idx in CONFIG["layers"]
}

# The "best" layer is the one with the largest "separation" metric.
# NOTE(review): this choice is made on the same held-out pairs whose
# accuracy is reported below, so that accuracy may be optimistic.
best_layer = max(layer_results, key=lambda layer_idx: layer_results[layer_idx]["separation"])
best_metrics = layer_results[best_layer]

print(f"\nBest layer: {best_layer}")
print(f"  Accuracy: {best_metrics['accuracy']:.3f}")
print(f"  Separation: {best_metrics['separation']:.4f}")
print(f"  True mean: {best_metrics['true_mean']:.4f}")
print(f"  False mean: {best_metrics['false_mean']:.4f}")

### 1.6 Visualize Layer-wise Truth Representation

In [None]:
# Gather the per-layer series first, then draw two side-by-side panels.
layers = list(layer_results)
accuracies = [layer_results[idx]["accuracy"] for idx in layers]
separations = [layer_results[idx]["separation"] for idx in layers]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: accuracy per layer, with the 0.5 chance level and the
# best layer marked.
ax1.plot(layers, accuracies, marker='o', linewidth=2)
ax1.axhline(0.5, color='gray', linestyle='--', label='Random')
ax1.axvline(best_layer, color='red', linestyle=':', alpha=0.5, label=f'Best: {best_layer}')
ax1.set_title('Truth Probe Accuracy by Layer')
ax1.set_xlabel('Layer')
ax1.set_ylabel('Accuracy')
ax1.legend()

# Right panel: separation per layer, with a zero reference line and the
# same best-layer marker.
ax2.plot(layers, separations, marker='s', linewidth=2, color='green')
ax2.axhline(0, color='gray', linestyle='--')
ax2.axvline(best_layer, color='red', linestyle=':', alpha=0.5)
ax2.set_title('True/False Separation by Layer')
ax2.set_xlabel('Layer')
ax2.set_ylabel('Separation (True mean - False mean)')

plt.tight_layout()
plt.savefig(CONFIG["output_dir"] / "truth_probe_layers.png", dpi=150)
plt.show()

In [None]:
# Clear memory before next phase
# clear_memory is the project helper imported from src.model_utils; it is
# called here between the probe-fitting cells and the scheming cells.
clear_memory(DEVICE)
print("Memory cleared.")

---
## Section 2: Scheming Setup (Phase 2)

### 2.1 Generate Scheming Prompts

In [None]:
# Build the three prompt sets compared in the later phases.
scheming_prompts = generate_scheming_prompts()
honest_prompts = generate_honest_prompts()
hallucination_prompts = generate_hallucination_prompts()

for set_name, prompt_set in (
    ("Scheming", scheming_prompts),
    ("Honest", honest_prompts),
    ("Hallucination", hallucination_prompts),
):
    print(f"{set_name} prompts: {len(prompt_set)}")

# Preview the first scheming prompt (prompt text cut to 100 characters).
print("\nExample scheming prompt:")
example = scheming_prompts[0]
preview = example.prompt[:100]
print(f"  Prompt: {preview}...")
print(f"  Target token: '{example.target_token}'")
print(f"  Expected: {example.expected_completion}")
print(f"  Honest alternative: {example.honest_alternative}")

### 2.2 Test Model Responses

In [None]:
# Spot-check the first three scheming prompts: generate a short
# continuation and print it next to the expected completion.
print("Testing model responses to scheming prompts...\n")

for number, item in enumerate(scheming_prompts[:3], start=1):
    full_text = model.generate(
        item.prompt,
        max_new_tokens=10,
        temperature=0.0,
        return_type="str",
    )

    # Drop the leading len(prompt) characters to keep only the new text.
    # NOTE(review): this assumes the returned string starts with the prompt
    # verbatim — confirm for the tokenizer in use.
    continuation = full_text[len(item.prompt):].strip()

    print(f"Prompt {number} ({item.category}):")
    print(f"  Generated: '{continuation}'")
    print(f"  Expected: '{item.expected_completion}'")
    print()

### 2.3 Examine Tokenization

In [None]:
# Inspect how the second scheming prompt is tokenized.
example_prompt = scheming_prompts[1]
tokens = get_token_info(model, example_prompt.prompt)

print(f"Prompt: {example_prompt.prompt}")
print(f"\nTokenization ({len(tokens)} tokens):")
# Only the final ten tokens are listed.
for entry in tokens[-10:]:
    print(f"  [{entry['position']:3d}] {repr(entry['string']):20s} (id: {entry['token_id']})")

# Look up where the target token sits; a ValueError from the helper is
# reported as a warning instead of stopping the notebook.
try:
    target_pos = find_token_position(model, example_prompt.prompt, example_prompt.target_token)
except ValueError as err:
    print(f"\nWarning: {err}")
else:
    print(f"\nTarget token '{example_prompt.target_token}' found at position {target_pos}")

---
## Section 3: Lie Detection Test (Phase 3)

### 3.1 Extract Scheming Activations

In [None]:
# Scheming prompts: collect activations at the final prompt token for
# every configured layer.
print("Extracting activations for scheming prompts...")

scheming_texts = [item.prompt for item in scheming_prompts]
scheming_activations = batch_extract_activations(
    model,
    scheming_texts,
    CONFIG["layers"],
    position=-1,  # Last token (where model decides response)
    show_progress=True,
)

print(f"Extracted activations for {len(scheming_texts)} scheming prompts")

In [None]:
# Honest prompts: same layers and token position as the scheming set.
print("Extracting activations for honest prompts...")

honest_texts = [item.prompt for item in honest_prompts]
honest_activations_phase3 = batch_extract_activations(
    model,
    honest_texts,
    CONFIG["layers"],
    position=-1,  # Last token
    show_progress=True,
)

print(f"Extracted activations for {len(honest_texts)} honest prompts")

In [None]:
# Hallucination prompts: same layers and token position as the other sets.
print("Extracting activations for hallucination prompts...")

hallucination_texts = [item.prompt for item in hallucination_prompts]
hallucination_activations = batch_extract_activations(
    model,
    hallucination_texts,
    CONFIG["layers"],
    position=-1,  # Last token
    show_progress=True,
)

print(f"Extracted activations for {len(hallucination_texts)} hallucination prompts")

### 3.2 Project onto Truth Vector

In [None]:
# Per-layer truth scores for each prompt category, keyed by layer index.
scheming_scores_by_layer = {}
honest_scores_by_layer = {}
hallucination_scores_by_layer = {}

# Pair every output dict with the activations that feed it, so one inner
# loop handles all three categories (in the order scheming, honest,
# hallucination).
score_targets = (
    (scheming_scores_by_layer, scheming_activations),
    (honest_scores_by_layer, honest_activations_phase3),
    (hallucination_scores_by_layer, hallucination_activations),
)

for layer in CONFIG["layers"]:
    direction = truth_vectors[layer]
    for score_store, activations in score_targets:
        score_store[layer] = batch_project_onto_truth(activations[layer], direction)

print(f"Computed truth scores for {len(CONFIG['layers'])} layers")

### 3.3 Compare Score Distributions

In [None]:
# Summarize the truth scores of each category at the best probe layer.
print(f"Truth Scores at Layer {best_layer}:")
print("=" * 50)

sch_scores = scheming_scores_by_layer[best_layer]
hon_scores = honest_scores_by_layer[best_layer]
hal_scores = hallucination_scores_by_layer[best_layer]

# Same four-line summary for every category.
for heading, values in (
    ("SCHEMING", sch_scores),
    ("HONEST", hon_scores),
    ("HALLUCINATION", hal_scores),
):
    print(f"\n{heading} (n={len(values)}):")
    print(f"  Mean: {values.mean():.4f}")
    print(f"  Std:  {values.std():.4f}")
    print(f"  Range: [{values.min():.4f}, {values.max():.4f}]")

print("\n" + "=" * 50)
# Positive separation means honest prompts score higher than scheming ones.
print(f"Scheming vs Honest separation: {hon_scores.mean() - sch_scores.mean():.4f}")

In [None]:
# Draw the three score distributions for the best layer and save the figure.
distribution_kwargs = dict(
    scheming_scores=sch_scores,
    honest_scores=hon_scores,
    hallucination_scores=hal_scores,
    layer=best_layer,
)
fig = plot_score_distributions(**distribution_kwargs)
plt.savefig(CONFIG["output_dir"] / "score_distributions.png", dpi=150)
plt.show()

---
## Section 4: Statistical Analysis (Phase 4)

### 4.1 Hypothesis Testing

In [None]:
# Run the project's hypothesis tests on the best-layer scores (alpha = 0.05).
test_results = hypothesis_testing(
    scheming_scores=sch_scores,
    honest_scores=hon_scores,
    hallucination_scores=hal_scores,
    alpha=0.05,
)

print("HYPOTHESIS TESTING RESULTS")
print("=" * 60)

# Primary comparison: scheming vs honest. `svh` is reused by the final report.
print("\nScheming vs Honest:")
svh = test_results["scheming_vs_honest"]
svh_t = svh["t_test"]
print(f"  t-statistic: {svh_t['statistic']:.4f}")
print(f"  p-value: {svh_t['p_value']:.6f}")
print(f"  d' (discriminability): {svh['dprime']:.4f}")
print(f"  Cohen's d: {svh['cohens_d']:.4f}")
print(f"  Significant at alpha=0.05: {svh['significant']}")

# Secondary comparison is printed only when the helper returned it.
if "scheming_vs_hallucination" in test_results:
    print("\nScheming vs Hallucination:")
    svhal = test_results["scheming_vs_hallucination"]
    svhal_t = svhal["t_test"]
    print(f"  t-statistic: {svhal_t['statistic']:.4f}")
    print(f"  p-value: {svhal_t['p_value']:.6f}")
    print(f"  Significant: {svhal['significant']}")

### 4.2 Layer-wise Discriminability Analysis

In [None]:
# Build the per-layer discriminability table from the score dicts.
layer_df = layer_discriminability_analysis(
    scheming_scores=scheming_scores_by_layer,
    honest_scores=honest_scores_by_layer,
    hallucination_scores=hallucination_scores_by_layer,
)

# Print only the headline columns.
display_cols = ["layer", "dprime_sch_vs_hon", "separation", "significant"]
print("Layer-wise Discriminability:")
print(layer_df[display_cols].to_string(index=False))

# Write the full table (all columns) to CSV.
layer_csv_path = CONFIG["output_dir"] / "layer_analysis.csv"
layer_df.to_csv(layer_csv_path, index=False)
print(f"\nSaved to {layer_csv_path}")

In [None]:
# Re-key the table as {layer: {column: value}} for the plotting helper;
# `layer_metrics` is reused by the summary figure.
indexed_layers = layer_df.set_index("layer")
layer_metrics = indexed_layers.to_dict(orient="index")

fig = plot_dprime_by_layer(layer_metrics)
plt.savefig(CONFIG["output_dir"] / "dprime_by_layer.png", dpi=150)
plt.show()

### 4.3 ROC Analysis

In [None]:
# ROC at the best layer, with honest prompts as the positive class and
# scheming prompts as the negative class.
fpr, tpr, auc = compute_roc_curve(positive_scores=hon_scores, negative_scores=sch_scores)

# Map the AUC onto a coarse verbal rating.
if auc > 0.9:
    auc_rating = 'Excellent'
elif auc > 0.8:
    auc_rating = 'Good'
elif auc > 0.7:
    auc_rating = 'Fair'
else:
    auc_rating = 'Poor'

print(f"ROC Analysis at Layer {best_layer}:")
print(f"  AUC: {auc:.4f}")
print(f"  Interpretation: {auc_rating}")

# Draw and save the curve.
fig = plot_roc_curve(fpr, tpr, auc, layer=best_layer)
plt.savefig(CONFIG["output_dir"] / "roc_curve.png", dpi=150)
plt.show()

### 4.4 Results Interpretation

In [None]:
# Turn the hypothesis-test results into a text interpretation.
interpretation = interpret_results(test_results)
print(interpretation)

# Save interpretation.
# The text comes from a project helper and may contain non-ASCII symbols,
# so the encoding is fixed to UTF-8 instead of the platform's locale
# default (which could raise UnicodeEncodeError on some systems).
with open(CONFIG["output_dir"] / "interpretation.txt", "w", encoding="utf-8") as f:
    f.write(interpretation)

---
## Section 5: Visualization & Reporting

### 5.1 Summary Figure

In [None]:
# Assemble the inputs for the multi-panel summary figure, then draw and
# save it at a higher resolution than the individual plots.
summary_inputs = dict(
    scheming_scores=sch_scores,
    honest_scores=hon_scores,
    hallucination_scores=hal_scores,
    layer_metrics=layer_metrics,
    best_layer=best_layer,
)
summary_fig = create_summary_figure(**summary_inputs)
plt.savefig(CONFIG["output_dir"] / "summary_figure.png", dpi=200, bbox_inches='tight')
plt.show()

### 5.2 Truth Vector Analysis

In [None]:
# Move the best layer's truth vector to host memory as a NumPy array and
# plot its components (top_k=50 is passed to the plotting helper).
best_truth_vector = truth_vectors[best_layer]
truth_vec_np = best_truth_vector.cpu().numpy()

fig = plot_truth_vector_components(truth_vec_np, top_k=50)
plt.savefig(CONFIG["output_dir"] / "truth_vector_components.png", dpi=150)
plt.show()

### 5.3 Final Report

In [None]:
# Generate final report
#
# The verdict is computed once and reused by every sentence of the
# conclusion. Previously the closing paragraph depended only on d' > 1, so
# the report could print "HYPOTHESIS B OR NULL" and then claim consistency
# with the Deceptive Alignment hypothesis in the same section.
supports_hypothesis_a = bool(svh['significant'] and sch_scores.mean() < hon_scores.mean())
strong_discrimination = svh['dprime'] > 1

if supports_hypothesis_a:
    verdict = "HYPOTHESIS A SUPPORTED: The model shows internal awareness of deception."
else:
    verdict = "HYPOTHESIS B OR NULL: No clear evidence of deception awareness."

probe_quality = 'successfully' if strong_discrimination else 'partially'

# Only claim consistency with Deceptive Alignment when the verdict itself
# supports Hypothesis A AND the discriminability is strong.
if supports_hypothesis_a and strong_discrimination:
    closing_remark = 'This suggests the model internally represents deceptive outputs differently from honest ones, consistent with the Deceptive Alignment hypothesis.'
else:
    closing_remark = 'Further investigation with more prompts and model variations is recommended.'

report = f"""
================================================================================
                        PINOCCHIO VECTOR TEST - FINAL REPORT
================================================================================

EXPERIMENT CONFIGURATION
------------------------
Model: {MODEL_NAME}
Device: {DEVICE}
Layers Analyzed: {CONFIG['layers'][0]} - {CONFIG['layers'][-1]}
Probe Method: {CONFIG['probe_method']}
Training Pairs: {len(train_pairs)}
Test Pairs: {len(test_pairs)}

TRUTH PROBE PERFORMANCE
-----------------------
Best Layer: {best_layer}
Test Accuracy: {layer_results[best_layer]['accuracy']:.3f}
True/False Separation: {layer_results[best_layer]['separation']:.4f}

DECEPTION DETECTION RESULTS
---------------------------
Scheming Prompts: {len(scheming_prompts)}
Honest Prompts: {len(honest_prompts)}

Truth Scores at Layer {best_layer}:
  Scheming Mean: {sch_scores.mean():.4f} (std: {sch_scores.std():.4f})
  Honest Mean: {hon_scores.mean():.4f} (std: {hon_scores.std():.4f})
  Separation: {hon_scores.mean() - sch_scores.mean():.4f}

STATISTICAL ANALYSIS
--------------------
d' (Discriminability): {svh['dprime']:.4f}
Cohen's d (Effect Size): {svh['cohens_d']:.4f}
t-statistic: {svh['t_test']['statistic']:.4f}
p-value: {svh['t_test']['p_value']:.6f}
Significant (alpha=0.05): {svh['significant']}
AUC: {auc:.4f}

CONCLUSION
----------
{verdict}

The truth probe {probe_quality} distinguishes
scheming responses from honest responses, with a discriminability index of {svh['dprime']:.2f}.

{closing_remark}

================================================================================
Generated by Pinocchio Vector Test
================================================================================
"""

print(report)

# Save report
with open(CONFIG["output_dir"] / "final_report.txt", "w") as f:
    f.write(report)

print(f"\nReport saved to {CONFIG['output_dir'] / 'final_report.txt'}")

### 5.4 Save All Results

In [None]:
# Save all numerical results
def _json_default(obj):
    """Convert a value the json module cannot encode natively.

    Objects that expose a callable ``tolist()`` (NumPy scalars and arrays,
    torch tensors) are converted to plain Python numbers or lists, so they
    stay numeric in the output file. Anything else falls back to ``str``,
    which is what the previous ``default=str`` did for every such value.
    """
    to_list = getattr(obj, "tolist", None)
    if callable(to_list):
        return to_list()
    return str(obj)


results = {
    # Path objects are stored as strings; other config values are kept as-is.
    "config": {k: str(v) if isinstance(v, Path) else v for k, v in CONFIG.items()},
    "best_layer": best_layer,
    # Layer keys are cast to plain int before serialization.
    "layer_results": {int(k): v for k, v in layer_results.items()},
    "test_results": test_results,
    "auc": auc,
    "scores": {
        "scheming": sch_scores.tolist(),
        "honest": hon_scores.tolist(),
        "hallucination": hal_scores.tolist()
    }
}

with open(CONFIG["output_dir"] / "results.json", "w") as f:
    json.dump(results, f, indent=2, default=_json_default)

print(f"Results saved to {CONFIG['output_dir'] / 'results.json'}")
print(f"\nAll outputs saved to: {CONFIG['output_dir']}")
print("\nExperiment complete!")

---

## Next Steps

1. **Increase Sample Size**: Add more scheming prompts to improve statistical power
2. **Try Different Models**: Compare results across Pythia-1.4B, 2.8B, 6.9B, 12B
3. **Explore Other Probe Methods**: Try PCA or logistic regression for truth direction
4. **Activation Steering**: Intervene on the truth direction (e.g., add or subtract it in the residual stream) to test whether this flips model behavior
5. **Causal Analysis**: Use activation patching to verify the truth direction is causally relevant