In [None]:
import sys
from pathlib import Path

# add project root to path
# NOTE(review): the root is derived from the kernel's working directory (two levels up),
# so it is only correct when the kernel is started from the notebook's own folder.
project_root = Path.cwd().parent.parent
root_entry = str(project_root)
# keep the root as the first entry exactly once, so re-running this cell
# does not pile up duplicate sys.path entries
sys.path[:] = [root_entry] + [entry for entry in sys.path if entry != root_entry]

import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from scipy import stats

# echo the resolved root so a wrong working directory is easy to spot
print(f"project root: {project_root}")

In [None]:
# configuration: every directory hangs off the project root
RESULTS_DIR = project_root.joinpath('results')
DATA_DIR = project_root.joinpath('data')
FIGURES_DIR = RESULTS_DIR.joinpath('figures')

# create the figure directory (and any missing parents) up front
FIGURES_DIR.mkdir(exist_ok=True, parents=True)

# fix numpy's global RNG so repeated runs draw the same numbers
np.random.seed(42)

## 1. Load All Results

In [None]:
# where each experiment's JSON output is looked for
result_paths = {
    'probing': RESULTS_DIR.joinpath('probing', 'probing_results.json'),
    'patching': RESULTS_DIR.joinpath('patching', 'patching_results.json'),
    'cross_dataset': RESULTS_DIR.joinpath('generalization', 'cross_dataset_results.json'),
}

# read every file that exists; a missing file is recorded as None so that
# later cells can test results[name] for truthiness
results = {}

for name, path in result_paths.items():
    if not path.exists():
        print(f"not found: {path}")
        results[name] = None
        continue

    with open(path, 'r') as f:
        results[name] = json.load(f)
    print(f"loaded {name}: {path}")

## 2. Results Aggregation

In [None]:
from src.utils import ResultsAggregator, HypothesisTester

# aggregator that gathers every experiment's output under one experiment name
aggregator = ResultsAggregator(
    output_dir=str(RESULTS_DIR),
    experiment_name='pd_interpretability_main',
)

probing_json = results['probing']
patching_json = results['patching']
cross_dataset_json = results['cross_dataset']

# layer-wise probing: JSON object keys are strings, so convert them to int layer indices
if probing_json and 'layerwise_probing' in probing_json:
    probing_data = {
        int(layer_key): layer_entry
        for layer_key, layer_entry in probing_json['layerwise_probing'].items()
    }
    aggregator.add_probing_results(probing_data)
    print(f"added probing results for {len(probing_data)} layers")

# layer-wise patching: same string -> int key conversion
if patching_json and 'layer_patching' in patching_json:
    patching_data = {
        int(layer_key): layer_entry
        for layer_key, layer_entry in patching_json['layer_patching'].items()
    }
    aggregator.add_patching_results(patching_data)
    print(f"added patching results for {len(patching_data)} layers")

# clinical probing lives inside the probing file and is passed through unchanged
if probing_json and 'clinical_probing' in probing_json:
    aggregator.add_clinical_probing_results(probing_json['clinical_probing'])
    print("added clinical probing results")

# cross-dataset results are passed through as loaded
if cross_dataset_json:
    aggregator.add_cross_dataset_results(cross_dataset_json)
    print("added cross-dataset results")

In [None]:
# build the aggregator's summary report, then print it
summary_report = aggregator.generate_summary_report()
print(summary_report)

## 3. Hypothesis Testing

In [None]:
# the tester works on whatever the aggregator has collected
tester = HypothesisTester(aggregator)

# evaluate every hypothesis; later cells index into this result by hypothesis key
hypothesis_results = tester.run_all_hypothesis_tests()

# text report of the outcomes
hypothesis_report = tester.generate_hypothesis_report()
print(hypothesis_report)

In [None]:
# detailed hypothesis 1 analysis
h1 = hypothesis_results['hypothesis_1']

print("HYPOTHESIS 1: CLINICAL FEATURE ENCODING")
print("=" * 50)

# best layer and r² per feature, one listing per feature family;
# a family missing from h1 prints only its heading
family_headings = (
    ('phonatory', "\nphonatory features (jitter, shimmer):"),
    ('prosodic', "\nprosodic features (f0, hnr):"),
)
for family, heading in family_headings:
    print(heading)
    for feature_name, best in h1.get(family, {}).items():
        print(f"  {feature_name}: layer {best['best_layer']} (r² = {best['r2']:.3f})")

# predicted layer band versus the measured summary for phonatory features
print("\nprediction: phonatory in early layers (2-4)")
phonatory_mean_layer = h1.get('phonatory_mean_layer', 'N/A')
print(f"  actual mean layer: {phonatory_mean_layer}")
phonatory_early_fraction = h1.get('phonatory_early_fraction', 0)
print(f"  fraction in early: {phonatory_early_fraction:.1%}")

# same comparison for prosodic features
print("\nprediction: prosodic in middle layers (5-8)")
prosodic_mean_layer = h1.get('prosodic_mean_layer', 'N/A')
print(f"  actual mean layer: {prosodic_mean_layer}")
prosodic_middle_fraction = h1.get('prosodic_middle_fraction', 0)
print(f"  fraction in middle: {prosodic_middle_fraction:.1%}")

In [None]:
# detailed hypothesis 2 analysis
h2 = hypothesis_results['hypothesis_2']

print("HYPOTHESIS 2: CAUSAL FEATURE DEPENDENCY")
print("=" * 50)

causal_layers = h2['causal_layers']
print(f"\ncausal layers (patching recovery > 0.1): {causal_layers}")

# the correlation entry may be empty/None, in which case its section is skipped
corr = h2['probing_patching_correlation']
if corr:
    print("\nprobing-patching correlation:")
    # (label, key, format spec) for each reported statistic; an empty spec prints the value as-is
    stat_rows = (
        ('spearman r', 'spearman_r', '.3f'),
        ('p-value', 'spearman_p', '.4f'),
        ('n layers', 'n_layers', ''),
    )
    for label, key, spec in stat_rows:
        print(f"  {label}: {corr[key]:{spec}}")

verdict = 'SUPPORTED' if h2['supported'] else 'NOT SUPPORTED'
print(f"\nverdict: {verdict}")

## 4. Generate Publication Figures

In [None]:
from src.utils import FigureGenerator, set_publication_style

# set publication style
# called before the generator is built; whether FigureGenerator depends on the
# style at construction time is not visible here, so the order is kept as-is
set_publication_style()

# create figure generator
# the output directory is handed over as a plain string rather than a Path
fig_gen = FigureGenerator(output_dir=str(FIGURES_DIR))

# tell the reader where the figure files are expected to land
print(f"figures will be saved to: {FIGURES_DIR}")

In [None]:
# figure 1: overview
# placeholder accuracy, used only when the probing results carry no 'model_accuracy' entry
FALLBACK_MODEL_ACCURACY = 0.85

if aggregator.probing_results and aggregator.patching_results:
    # flatten the aggregator's per-layer result objects into the plain dicts
    # passed to the figure generator
    probing_dict = {k: {'mean': v.mean_score, 'std': v.std_score}
                    for k, v in aggregator.probing_results.items()}
    patching_dict = {k: {'mean_recovery': v.mean_recovery, 'std_recovery': v.std_recovery}
                     for k, v in aggregator.patching_results.items()}

    # NOTE(review): the abstract-statistics cell reads 'best_accuracy' for the model's
    # accuracy while this cell reads 'model_accuracy' -- confirm which key the probing
    # file really contains.
    probing_meta = results['probing'] or {}
    model_accuracy = probing_meta.get('model_accuracy')
    if model_accuracy is None:
        # never let a made-up number slip into a publication figure unnoticed
        print(f"warning: 'model_accuracy' not found in probing results; "
              f"figure 1 uses placeholder value {FALLBACK_MODEL_ACCURACY}")
        model_accuracy = FALLBACK_MODEL_ACCURACY

    fig1 = fig_gen.figure_1_overview(
        probing_dict,
        patching_dict,
        model_accuracy=model_accuracy
    )
    plt.show()
else:
    print("insufficient data for figure 1")

In [None]:
# figure 2: clinical encoding heatmap
if not aggregator.clinical_results:
    print("no clinical results for figure 2")
else:
    # feature -> layer -> {'mean', 'std'}: the plain-dict form passed to the figure generator
    clinical_dict = {}
    for feat, layer_results in aggregator.clinical_results.items():
        per_layer = {}
        for layer, r in layer_results.items():
            per_layer[layer] = {'mean': r.r2_score, 'std': r.std_score}
        clinical_dict[feat] = per_layer

    fig2 = fig_gen.figure_2_clinical_encoding(clinical_dict)
    plt.show()

In [None]:
# figure 3: hypothesis summary
# built from hypothesis_results, the output of tester.run_all_hypothesis_tests()
fig3 = fig_gen.figure_3_hypothesis_summary(hypothesis_results)
# display the figure in the notebook
plt.show()

## 5. Save Results and Reports

In [None]:
# save aggregated results
# save() returns a value that is printed as the output location
results_file = aggregator.save('aggregated_results.json')
print(f"saved results to: {results_file}")

# save text report
# save_report() likewise returns the value printed as the report location
report_file = aggregator.save_report('experiment_report.txt')
print(f"saved report to: {report_file}")

In [None]:
# save hypothesis test results
hypothesis_file = RESULTS_DIR / 'hypothesis_results.json'


def _json_default(value):
    """Convert objects the json module cannot serialize by itself.

    NumPy scalars become the matching Python scalar and arrays become nested
    lists, so numbers and booleans stay typed in the output file instead of
    being written as strings. Anything else falls back to ``str(value)``.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


with open(hypothesis_file, 'w') as f:
    json.dump(hypothesis_results, f, indent=2, default=_json_default)

print(f"saved hypothesis results to: {hypothesis_file}")

## 6. Abstract Summary Statistics

Key numbers needed for the abstract.

In [None]:
banner = "=" * 60

print("ABSTRACT STATISTICS")
print(banner)

# model performance (only when the probing file was loaded)
if results['probing']:
    model_acc = results['probing'].get('best_accuracy', 'N/A')
    print(f"\nmodel classification accuracy: {model_acc}")

# layer with the best probing score
if aggregator.probing_results:
    best_layer, best_acc = aggregator.get_best_probing_layer()
    print(f"best probing layer: {best_layer} (accuracy = {best_acc:.3f})")

# layer with the best patching recovery
if aggregator.patching_results:
    best_patch, best_rec = aggregator.get_best_patching_layer()
    print(f"best patching layer: {best_patch} (recovery = {best_rec:.3f})")

# layers flagged by both methods; requested as soon as either result set exists
if aggregator.probing_results or aggregator.patching_results:
    important = aggregator.get_important_layers()
    print(f"important layers (both methods): {important['both']}")

# probing-patching correlation needs both result sets
if aggregator.probing_results and aggregator.patching_results:
    corr = aggregator.compute_probing_patching_correlation()
    if corr:
        spearman_r = corr['spearman_r']
        spearman_p = corr['spearman_p']
        print(f"probing-patching correlation: r = {spearman_r:.2f}, p = {spearman_p:.4f}")

# one verdict line per hypothesis, in h1..h3 order
print("\nhypothesis support:")
hypothesis_labels = (
    ('h1 (clinical encoding)', 'hypothesis_1'),
    ('h2 (causal dependency)', 'hypothesis_2'),
    ('h3 (generalization)', 'hypothesis_3'),
)
for label, key in hypothesis_labels:
    verdict = 'SUPPORTED' if hypothesis_results[key]['supported'] else 'NOT SUPPORTED'
    print(f"  {label}: {verdict}")

print("\n" + banner)

In [None]:
# generate abstract text snippet
# the correlation sentence only claims support when the data back it up
SIGNIFICANCE_LEVEL = 0.05        # p-value below which the correlation is called significant
STRONG_CORRELATION = 0.7         # r at or above this is worded as "strong"
MODERATE_CORRELATION = 0.4       # r at or above this (but below strong) is worded as "moderate"

print("\nSUGGESTED ABSTRACT SNIPPET:")
print("-" * 60)

if aggregator.probing_results:
    best_layer, best_acc = aggregator.get_best_probing_layer()

    # NOTE(review): "middle transformer layers" is fixed wording -- check it against
    # best_layer before pasting this text into the paper.
    snippet = f"""We found that Parkinson's disease information is primarily encoded 
in middle transformer layers, with layer {best_layer} achieving {best_acc:.1%} probing accuracy.
"""

    if aggregator.patching_results:
        best_patch, best_rec = aggregator.get_best_patching_layer()
        snippet += f"""Activation patching confirmed these layers are causally important, 
with layer {best_patch} recovering {best_rec:.1%} of the logit difference.
"""

        # probing results are known to exist here, so the correlation needs no second check
        corr = aggregator.compute_probing_patching_correlation()
        if corr:
            spearman_r = corr['spearman_r']
            spearman_p = corr['spearman_p']

            # a NaN r or p fails both comparisons and falls through to the neutral wording
            if spearman_r > 0 and spearman_p < SIGNIFICANCE_LEVEL:
                if spearman_r >= STRONG_CORRELATION:
                    strength = 'strong'
                elif spearman_r >= MODERATE_CORRELATION:
                    strength = 'moderate'
                else:
                    strength = 'weak'
                snippet += f"""The {strength} correlation between probing accuracy and causal importance 
(r = {spearman_r:.2f}, p = {spearman_p:.4f}) supports our hypothesis that 
clinically-meaningful representations drive the model's predictions.
"""
            else:
                # report the numbers without claiming support the data do not give
                snippet += f"""Probing accuracy and causal importance were not significantly positively 
correlated (r = {spearman_r:.2f}, p = {spearman_p:.4f}), so this analysis does not by itself 
support the hypothesis that clinically-meaningful representations drive the model's predictions.
"""

    print(snippet)
else:
    print("insufficient results for abstract generation")