# phase 1 demonstration: data infrastructure

comprehensive demonstration of the phase 1 implementation, including:
- dataset loading and preprocessing
- clinical feature extraction
- subject-wise data splitting
- data quality validation

In [None]:
import sys
from pathlib import Path

# the project root is taken to be two levels above the current working directory
# NOTE(review): this only holds when the notebook kernel is started from the
# notebook's own folder — confirm for other launch locations
project_root = Path.cwd().parent.parent
# put the project root first on sys.path so the `src.*` imports below resolve
sys.path.insert(0, str(project_root))

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data import (
    ItalianPVSDataset,
    MDVRKCLDataset,
    ArkansasDataset,
    AudioPreprocessor,
    load_audio
)

from src.features import (
    ClinicalFeatureExtractor,
    extract_clinical_features,
    get_clinical_feature_names,
    get_pd_discriminative_features
)

# ipython magic selecting matplotlib's inline backend
%matplotlib inline
# seaborn 'whitegrid' style and a 12x6 default figure size; cells that pass an
# explicit figsize override the default
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. dataset loading

demonstration of loading pd voice datasets with proper configuration.

In [None]:
# raw corpora are looked up under <project_root>/data/raw/<corpus name>
data_root = project_root.joinpath('data', 'raw')

italian_pvs_path = data_root.joinpath('italian_pvs')
mdvr_kcl_path = data_root.joinpath('mdvr_kcl')
arkansas_path = data_root.joinpath('arkansas')

if not italian_pvs_path.exists():
    # corpus missing: report the expected location and where to obtain it
    print(f"italian pvs dataset not found at {italian_pvs_path}")
    print("download from: ieee dataport")
else:
    # loader settings gathered in one place so they are easy to tweak
    italian_pvs_config = {
        'root_dir': italian_pvs_path,
        'task': 'vowel_a',
        'target_sr': 16000,
        'max_duration': 10.0,
        'normalize_audio': True,
    }
    dataset = ItalianPVSDataset(**italian_pvs_config)

    # quick overview of what was loaded
    print(f"dataset loaded: {len(dataset)} samples")
    print(f"subjects: {dataset.get_subject_count()}")
    print(f"label distribution: {dataset.get_label_distribution()}")
    print(f"task distribution: {dataset.get_task_distribution()}")

## 2. sample inspection

examine individual samples to verify correct loading and preprocessing.

In [None]:
if italian_pvs_path.exists():
    # first item of the dataset, as returned by its __getitem__
    sample = dataset[0]

    print("sample structure:")
    for key, value in sample.items():
        # tensors are summarised by shape, everything else is printed as-is
        line = (
            f"  {key}: tensor shape {value.shape}"
            if isinstance(value, torch.Tensor)
            else f"  {key}: {value}"
        )
        print(line)

    waveform = sample['input_values']

    # plot only the first 5000 entries to keep the figure readable
    fig, ax = plt.subplots(figsize=(14, 4))
    ax.plot(waveform.numpy()[:5000])
    ax.set_title(f"audio waveform - {sample['diagnosis']} subject")
    ax.set_xlabel('sample')
    ax.set_ylabel('amplitude')
    plt.tight_layout()
    plt.show()

## 3. subject-wise splitting

demonstrate proper subject-wise train/val/test splitting to prevent data leakage.

In [None]:
if italian_pvs_path.exists():
    # split by subject and then verify that no subject appears in more than
    # one split and that the label balance of each split is reported
    train_ds, val_ds, test_ds = dataset.get_subject_split(
        test_size=0.2,
        val_size=0.1,
        random_state=42,
        stratify=True
    )

    print("split sizes:")
    print(f"  train: {len(train_ds)} samples")
    print(f"  val: {len(val_ds)} samples")
    print(f"  test: {len(test_ds)} samples")

    def _subjects_and_labels(split):
        """return (set of subject ids, list of labels) for one split.

        every item is fetched from `dataset` exactly once; fetching it
        separately for the subject id and for the label doubles the work
        done by the dataset's item access.
        """
        subjects, labels = set(), []
        for i in split.indices:
            item = dataset[i]
            subjects.add(item['subject_id'])
            labels.append(item['label'])
        return subjects, labels

    def _pd_share(labels):
        """format 'positives/total pd (percent%)'; safe for an empty split."""
        total = len(labels)
        if total == 0:
            # a small val_size over few subjects can leave a split empty;
            # report that instead of dividing by zero
            return "0/0 pd (n/a)"
        positives = sum(labels)
        return f"{positives}/{total} pd ({positives / total * 100:.1f}%)"

    train_subjects, train_labels = _subjects_and_labels(train_ds)
    val_subjects, val_labels = _subjects_and_labels(val_ds)
    test_subjects, test_labels = _subjects_and_labels(test_ds)

    print(f"\nsubject counts:")
    print(f"  train: {len(train_subjects)} subjects")
    print(f"  val: {len(val_subjects)} subjects")
    print(f"  test: {len(test_subjects)} subjects")

    # any subject present in two splits means leakage between splits
    overlap = (
        (train_subjects & val_subjects)
        | (train_subjects & test_subjects)
        | (val_subjects & test_subjects)
    )
    print(f"\nsubject overlap: {len(overlap)} (should be 0)")

    print(f"\nlabel distribution:")
    print(f"  train: {_pd_share(train_labels)}")
    print(f"  val: {_pd_share(val_labels)}")
    print(f"  test: {_pd_share(test_labels)}")

## 4. clinical feature extraction

extract clinical voice biomarkers using parselmouth/praat.

In [None]:
if italian_pvs_path.exists():
    # extractor configured with a 75-600 hz pitch search range
    extractor = ClinicalFeatureExtractor(f0_min=75.0, f0_max=600.0)

    # run the extractor on the first recording of the dataset
    sample_path = dataset.samples[0]['path']
    features = extractor.extract(sample_path)

    print("extracted clinical features:")
    for name, value in features.items():
        # nan values are printed literally, everything else with 4 decimals
        if np.isnan(value):
            print(f"  {name}: nan")
        else:
            print(f"  {name}: {value:.4f}")

## 5. batch clinical feature extraction

extract features from the first 50 samples and analyze distributions.

In [None]:
if italian_pvs_path.exists():
    from src.features import batch_extract_features

    # only the first 50 recordings are processed to keep the demo fast
    subset = dataset.samples[:50]
    audio_paths = [s['path'] for s in subset]
    labels = [s['label'] for s in subset]

    features_list = batch_extract_features(
        audio_paths,
        f0_min=75.0,
        f0_max=600.0,
        verbose=True
    )

    # pair every result with its label *before* dropping failed extractions.
    # taking labels[:n] after filtering shifts labels onto the wrong rows as
    # soon as a None entry is not at the very end of the list.
    # assumes features_list is in the same order as audio_paths — TODO confirm
    kept = [
        (feats, label)
        for feats, label in zip(features_list, labels)
        if feats is not None
    ]

    features_df = pd.DataFrame([feats for feats, _ in kept])
    features_df['label'] = [label for _, label in kept]

    print(f"\nfeature matrix shape: {features_df.shape}")
    print(f"\nfeature summary:")
    print(features_df.describe())

## 6. clinical feature visualization

visualize distribution of clinical features for pd vs healthy controls.

In [None]:
if italian_pvs_path.exists() and len(features_df) > 0:
    discriminative_features = get_pd_discriminative_features()

    # keep only the features that were actually extracted
    available_features = [f for f in discriminative_features if f in features_df.columns]

    if not available_features:
        print("no discriminative features available in extracted features")
    else:
        # 2x4 grid: one panel per feature, at most 8 features shown
        fig, axes = plt.subplots(2, 4, figsize=(16, 8))
        axes = axes.flatten()

        # label 0 is plotted as 'healthy', label 1 as 'parkinson'
        groups = (
            ('healthy', features_df[features_df['label'] == 0]),
            ('parkinson', features_df[features_df['label'] == 1]),
        )

        for ax, feature in zip(axes, available_features[:8]):
            # overlay both groups' histograms, ignoring missing values
            for group_name, rows in groups:
                ax.hist(rows[feature].dropna(), alpha=0.5, label=group_name, bins=15)
            ax.set_xlabel(feature)
            ax.set_ylabel('count')
            ax.legend()
            ax.set_title(f"{feature} distribution")

        # hide panels that received no feature
        for unused_ax in axes[len(available_features):]:
            unused_ax.axis('off')

        plt.tight_layout()
        plt.show()

## 7. statistical comparison

compare clinical features between pd and hc groups using independent two-sample t-tests.

In [None]:
if italian_pvs_path.exists() and len(features_df) > 0:
    from scipy import stats

    available_features = [f for f in get_pd_discriminative_features() if f in features_df.columns]

    # rows with label 0 form the hc group, rows with label 1 the pd group
    hc_rows = features_df[features_df['label'] == 0]
    pd_rows = features_df[features_df['label'] == 1]

    results = []

    for feature in available_features:
        hc_vals = hc_rows[feature].dropna()
        pd_vals = pd_rows[feature].dropna()

        # skip features with fewer than 3 valid values in either group
        if len(hc_vals) <= 2 or len(pd_vals) <= 2:
            continue

        t_stat, p_val = stats.ttest_ind(hc_vals, pd_vals)

        row = {
            'feature': feature,
            'hc_mean': hc_vals.mean(),
            'hc_std': hc_vals.std(),
            'pd_mean': pd_vals.mean(),
            'pd_std': pd_vals.std(),
            't_statistic': t_stat,
            'p_value': p_val,
            'significant': p_val < 0.05
        }
        results.append(row)

    if results:
        results_df = pd.DataFrame(results)
        print("\nclinical feature comparison (hc vs pd):")
        print(results_df.to_string(index=False))

        # number of features whose p-value is below 0.05
        sig_count = sum(results_df['significant'])
        print(f"\nsignificant features (p < 0.05): {sig_count}/{len(results_df)}")

## 8. audio preprocessing pipeline

demonstrate audio preprocessing with vad and filtering.

In [None]:
if italian_pvs_path.exists():
    # preprocessor with every optional stage switched on
    preprocessor = AudioPreprocessor(
        target_sr=16000,
        remove_silence=True,
        apply_filters=True,
        normalize=True,
        check_quality=True
    )

    # load the first recording un-normalized so the preprocessor sees the raw signal
    sample_path = dataset.samples[0]['path']
    waveform, sr = load_audio(sample_path, target_sr=16000, normalize=False)

    processed, metrics = preprocessor(waveform, sr)

    print("preprocessing metrics:")
    for key, value in metrics.items():
        print(f"  {key}: {value}")

    # before/after comparison, first 10000 entries of each signal
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8))
    panels = (
        (ax1, waveform, 'original waveform'),
        (ax2, processed, 'preprocessed waveform (filtered + normalized)'),
    )
    for ax, signal, title in panels:
        ax.plot(signal.squeeze().numpy()[:10000])
        ax.set_title(title)
        ax.set_xlabel('sample')
        ax.set_ylabel('amplitude')

    plt.tight_layout()
    plt.show()

## 9. phase 1 summary

phase 1 deliverables:
- comprehensive dataset loading infrastructure for 6 pd voice corpora
- robust audio preprocessing with vad, filtering, and normalization
- clinical feature extraction using parselmouth (jitter, shimmer, hnr, formants)
- proper subject-wise data splitting to prevent leakage
- comprehensive unit tests for all components

ready to proceed to phase 2: wav2vec2 fine-tuning

In [None]:
# closing message: completion notice followed by the planned next phases
summary_lines = (
    "phase 1 implementation complete",
    "\nnext steps:",
    "  - phase 2: fine-tune wav2vec2 on pd classification",
    "  - phase 3: extract activations from all transformer layers",
    "  - phase 4: probing experiments to identify clinical feature encoding",
    "  - phase 5: activation patching to establish causal circuits",
)
for summary_line in summary_lines:
    print(summary_line)