In [9]:
import sys
from pathlib import Path

# The notebook is expected to be launched two directory levels below the
# repository root; that root is placed first on sys.path so that the `src`
# package imported further down resolves.
_launch_dir = Path.cwd()
project_root = _launch_dir.parent.parent
sys.path[:0] = [str(project_root)]

import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, LeaveOneGroupOut, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

from src.data.datasets import ItalianPVSDataset
from src.features.clinical import ClinicalFeatureExtractor, get_clinical_feature_names

# Echo the resolved root so a wrong working directory is visible immediately.
print(f"project root: {project_root}")

project root: /Volumes/usb drive/pd-interpretability


In [10]:
# Force reload of clinical features module to pick up the fix.
# importlib.reload re-executes the already-imported module inside the running
# kernel, so edits to its source take effect without restarting.
import importlib
import src.features.clinical
importlib.reload(src.features.clinical)
# Re-import the class so the notebook-level name is bound to the reloaded
# definition instead of the one imported in the first cell.
# NOTE(review): get_clinical_feature_names is not re-imported here, so that
# name still points at the pre-reload function — confirm this is intended.
from src.features.clinical import ClinicalFeatureExtractor
print("Module reloaded successfully")

Module reloaded successfully


In [11]:
# configuration: every path is derived from project_root
DATA_ROOT = project_root / 'data'
RAW_DATA = DATA_ROOT / 'raw'
CLINICAL_FEATURES_DIR = DATA_ROOT / 'clinical_features'
RESULTS_DIR = project_root / 'results'

# Create the output folders up front; a no-op when they already exist.
for _output_dir in (CLINICAL_FEATURES_DIR, RESULTS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Single seed shared by numpy's global RNG and the estimators defined later.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. load dataset

In [12]:
# Location of the raw Italian PVS recordings.
italian_pvs_path = RAW_DATA / 'italian_pvs'

dataset = ItalianPVSDataset(root_dir=str(italian_pvs_path), task=None, max_duration=10.0)

print(f"dataset loaded: {len(dataset)} samples")

# Cohort overview taken from the per-sample records held by the dataset.
# The labels are summed to count pd samples; the remainder is counted as hc.
labels = [record['label'] for record in dataset.samples]
n_subjects = len({record['subject_id'] for record in dataset.samples})
n_pd = sum(labels)
n_hc = len(labels) - n_pd

print(f"subjects: {n_subjects}")
print(f"class distribution: {n_hc} hc, {n_pd} pd")

dataset loaded: 831 samples
subjects: 61
class distribution: 394 hc, 437 pd


## 2. extract clinical features for all samples

extracting jitter, shimmer, hnr, and f0 statistics using parselmouth (praat interface).

In [14]:
# Feature table cache: extraction only runs when the CSV is not on disk yet;
# otherwise the stored table is loaded as-is.
features_csv_path = CLINICAL_FEATURES_DIR / 'italian_pvs_features.csv'

if not features_csv_path.exists():
    print("extracting clinical features for all samples...")
    print("this may take several minutes.")

    extractor = ClinicalFeatureExtractor(f0_min=75.0, f0_max=600.0)

    features_list = []    # one record per successfully processed sample
    failed_samples = []   # (sample index, error message) for each failure

    for idx in tqdm(range(len(dataset)), desc="extracting features"):
        record = dataset.samples[idx]

        try:
            audio_path = str(record['path'])
            features = extractor.extract(audio_path)

            # Attach bookkeeping columns next to the extracted measures.
            features['sample_idx'] = idx
            features['path'] = audio_path
            features['subject_id'] = record['subject_id']
            features['label'] = record['label']
            features['diagnosis'] = 'pd' if record['label'] == 1 else 'hc'

            features_list.append(features)

        except Exception as err:
            # Best effort: remember the failure and continue with the rest.
            failed_samples.append((idx, str(err)))

    features_df = pd.DataFrame(features_list)

    print(f"\nextracted features for {len(features_df)} samples")
    print(f"failed: {len(failed_samples)} samples")

    if failed_samples:
        print(f"first 5 failures: {failed_samples[:5]}")

    # Write the cache so later runs take the loading branch below.
    features_df.to_csv(features_csv_path, index=False)
    print(f"saved to {features_csv_path}")
else:
    print(f"loading existing features from {features_csv_path}")
    features_df = pd.read_csv(features_csv_path)
    print(f"loaded {len(features_df)} samples")

extracting clinical features for all samples...
this may take several minutes.


extracting features: 100%|██████████| 831/831 [05:20<00:00,  2.60it/s]



extracted features for 831 samples
failed: 0 samples
saved to /Volumes/usb drive/pd-interpretability/data/clinical_features/italian_pvs_features.csv


In [None]:
# feature summary
# Clinical voice measures this notebook expects to find in the feature table.
clinical_feature_cols = [
    'f0_mean', 'f0_std', 'f0_min', 'f0_max', 'f0_range',
    'jitter_local', 'jitter_rap', 'jitter_ppq5', 'jitter_ddp',
    'shimmer_local', 'shimmer_apq3', 'shimmer_apq5', 'shimmer_apq11', 'shimmer_dda',
    'hnr_mean', 'hnr_std'
]

# Only columns actually present in the table are used downstream. Expected
# columns that are absent are reported instead of being dropped silently, so a
# shrunken feature set cannot go unnoticed.
available_features = [f for f in clinical_feature_cols if f in features_df.columns]
missing_features = [f for f in clinical_feature_cols if f not in features_df.columns]

print(f"available clinical features: {len(available_features)}")
if missing_features:
    print(f"WARNING: expected features not found in the feature table "
          f"(excluded from all later steps): {missing_features}")

# describe() raises an opaque error on a frame without columns; fail with a
# message that names the actual problem instead.
if not available_features:
    raise ValueError(
        "none of the expected clinical feature columns are present in features_df"
    )

print(features_df[available_features].describe().T)

In [16]:
# Count gaps per clinical feature and list only the features that have any.
print("missing values per feature:")
missing = features_df[available_features].isna().sum()
print(missing.loc[missing > 0])

# Keep only rows in which every clinical feature is present.
features_clean = features_df.dropna(axis=0, how='any', subset=available_features)
print(f"\nsamples after removing missing: {len(features_clean)} / {len(features_df)}")

missing values per feature:
shimmer_apq11    2
dtype: int64

samples after removing missing: 829 / 831


## 3. prepare data for classification

In [17]:
# Pull the modelling arrays out of the cleaned table:
#   X      - clinical feature values, one row per sample
#   y      - value of the 'label' column
#   groups - subject id of each sample (used for subject-wise splitting)
X = features_clean.loc[:, available_features].values
y = features_clean.loc[:, 'label'].values
groups = features_clean.loc[:, 'subject_id'].values

print(f"feature matrix shape: {X.shape}")
print(f"labels shape: {y.shape}")
print(f"unique subjects: {len(np.unique(groups))}")
print(f"class distribution: {np.bincount(y)}")

feature matrix shape: (829, 14)
labels shape: (829,)
unique subjects: 61
class distribution: [394 435]


## 4. leave-one-subject-out cross-validation

leave-one-subject-out (loso) cv is a standard evaluation protocol for medical ml with few subjects.
each fold holds out every recording from one subject, so the model is always tested on a speaker it has never seen.
this prevents subject-level leakage between train and test sets.

In [18]:
# Splitter that holds out one group (here: one subject id) per fold.
logo = LeaveOneGroupOut()

n_loso_folds = logo.get_n_splits(X, y, groups)
print(f"number of folds (subjects): {n_loso_folds}")

number of folds (subjects): 61


In [19]:
# svm baseline: standardise features, then an rbf-kernel SVC.
# The scaler lives inside the pipeline, so it is re-fitted on the training
# portion of every fold.
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SVC(C=1.0, kernel='rbf', gamma='scale', random_state=RANDOM_STATE)),
])

print("running svm with loso cv...")
svm_scores = cross_val_score(
    svm_pipeline,
    X,
    y,
    groups=groups,
    scoring='accuracy',
    cv=logo,
)

# One accuracy value per fold; summarise the spread across folds.
print(f"\nsvm accuracy: {svm_scores.mean():.3f} +/- {svm_scores.std():.3f}")
print(f"min: {svm_scores.min():.3f}, max: {svm_scores.max():.3f}")

running svm with loso cv...

svm accuracy: 0.824 +/- 0.209
min: 0.000, max: 1.000


In [20]:
# random forest baseline, evaluated with the same splitter and metric as the svm
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', rf_classifier),
])

print("running random forest with loso cv...")
rf_scores = cross_val_score(
    rf_pipeline,
    X,
    y,
    groups=groups,
    scoring='accuracy',
    cv=logo,
)

# One accuracy value per fold; summarise the spread across folds.
print(f"\nrandom forest accuracy: {rf_scores.mean():.3f} +/- {rf_scores.std():.3f}")
print(f"min: {rf_scores.min():.3f}, max: {rf_scores.max():.3f}")

running random forest with loso cv...

random forest accuracy: 0.850 +/- 0.191
min: 0.000, max: 1.000


In [21]:
# Out-of-fold predictions for whichever baseline has the higher mean fold
# accuracy (the random forest is chosen on a tie), then pooled metrics.
print("generating detailed metrics using best model...")

if svm_scores.mean() > rf_scores.mean():
    best_model, best_name = svm_pipeline, 'svm'
else:
    best_model, best_name = rf_pipeline, 'random forest'

# Every sample is predicted by a model fitted on the folds that exclude it.
y_pred = cross_val_predict(best_model, X, y, cv=logo, groups=groups)

print(f"\n{best_name} classification report:")
print(classification_report(y, y_pred, target_names=['healthy', 'parkinson']))

print("\nconfusion matrix:")
cm = confusion_matrix(y, y_pred)
print("           predicted")
print("            hc    pd")
# Rows are the true class, columns the predicted class.
for row_caption, row_counts in zip(("actual hc", "       pd"), cm):
    print(f"{row_caption}  {row_counts[0]:4d}  {row_counts[1]:4d}")

generating detailed metrics using best model...

random forest classification report:
              precision    recall  f1-score   support

     healthy       0.84      0.83      0.84       394
   parkinson       0.85      0.86      0.85       435

    accuracy                           0.85       829
   macro avg       0.85      0.85      0.85       829
weighted avg       0.85      0.85      0.85       829


confusion matrix:
           predicted
            hc    pd
actual hc   328    66
       pd    61   374


## 5. per-subject analysis

In [22]:
# analyze per-subject accuracy
# For every subject, compare the out-of-fold predictions with the true labels
# of that subject's samples and collect one summary row.
unique_subjects = np.unique(groups)
subject_results = []

for subject in unique_subjects:
    mask = groups == subject
    subject_true = y[mask]
    subject_pred = y_pred[mask]

    subject_acc = accuracy_score(subject_true, subject_pred)
    # Diagnosis is read from the subject's first sample.
    subject_label = 'pd' if subject_true[0] == 1 else 'hc'
    n_samples = int(mask.sum())
    # Count matches directly. Deriving the count as int(accuracy * n) truncates
    # a floating-point product and can under-count by one when the product
    # lands just below the true integer.
    n_correct = int((subject_true == subject_pred).sum())

    subject_results.append({
        'subject_id': subject,
        'diagnosis': subject_label,
        'n_samples': n_samples,
        'accuracy': subject_acc,
        'correct': n_correct,
        'total': n_samples
    })

subject_df = pd.DataFrame(subject_results)

print("per-subject accuracy distribution:")
print(f"  mean: {subject_df['accuracy'].mean():.3f}")
print(f"  median: {subject_df['accuracy'].median():.3f}")
print(f"  subjects with 100% accuracy: {(subject_df['accuracy'] == 1.0).sum()}")
print(f"  subjects with 0% accuracy: {(subject_df['accuracy'] == 0.0).sum()}")

print("\naccuracy by diagnosis:")
print(subject_df.groupby('diagnosis')['accuracy'].agg(['mean', 'std', 'min', 'max']))

per-subject accuracy distribution:
  mean: 0.850
  median: 0.875
  subjects with 100% accuracy: 21
  subjects with 0% accuracy: 1

accuracy by diagnosis:
               mean       std       min  max
diagnosis                                   
hc         0.844205  0.212955  0.000000  1.0
pd         0.858271  0.161247  0.285714  1.0


## 6. feature importance analysis

In [23]:
# Fit one forest on ALL samples (no hold-out) purely to read off its feature
# importances; the resulting ranking is descriptive, not a cross-validated
# result.
rf_full = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=RANDOM_STATE)

# Standardise on the full matrix, mirroring the pipeline's preprocessing step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
rf_full.fit(X_scaled, y)

importances = rf_full.feature_importances_
importance_df = (
    pd.DataFrame({'feature': available_features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print("feature importance ranking:")
print(importance_df.to_string(index=False))

feature importance ranking:
      feature  importance
 shimmer_apq5    0.132821
  shimmer_dda    0.121524
shimmer_apq11    0.115200
 shimmer_apq3    0.102394
shimmer_local    0.088838
  jitter_ppq5    0.076277
 jitter_local    0.063677
   jitter_ddp    0.057934
   jitter_rap    0.054022
       f0_max    0.052473
      f0_mean    0.041560
     f0_range    0.034807
       f0_std    0.029296
       f0_min    0.029178


## 7. statistical comparison: pd vs hc

In [24]:
from scipy import stats

# Split the cleaned table by label once; every feature reuses the two groups.
hc_rows = features_clean[features_clean['label'] == 0]
pd_rows = features_clean[features_clean['label'] == 1]

# NOTE(review): each row is a recording and subjects contribute several rows,
# so the test below treats recordings from one subject as independent
# observations — confirm this is acceptable for the reported p-values.
stat_results = []

for feature in available_features:
    hc_values = hc_rows[feature]
    pd_values = pd_rows[feature]

    hc_mean, hc_std = hc_values.mean(), hc_values.std()
    pd_mean, pd_std = pd_values.mean(), pd_values.std()

    # Independent two-sample t-test with scipy's default settings.
    t_stat, p_val = stats.ttest_ind(hc_values, pd_values)

    # Cohen's d (pd minus hc) over the root of the mean of the two variances;
    # defined as 0 when that denominator is not positive.
    pooled_std = np.sqrt((hc_std**2 + pd_std**2) / 2)
    if pooled_std > 0:
        cohens_d = (pd_mean - hc_mean) / pooled_std
    else:
        cohens_d = 0

    stat_results.append({
        'feature': feature,
        'hc_mean': hc_mean,
        'hc_std': hc_std,
        'pd_mean': pd_mean,
        'pd_std': pd_std,
        't_statistic': t_stat,
        'p_value': p_val,
        'cohens_d': cohens_d,
        'significant': p_val < 0.05,
    })

stat_df = pd.DataFrame(stat_results)

print("statistical comparison (pd vs hc):")
summary_columns = ['feature', 'hc_mean', 'pd_mean', 'p_value', 'cohens_d', 'significant']
print(stat_df[summary_columns].to_string(index=False))

# Number of features below the uncorrected 0.05 threshold.
n_sig = stat_df['significant'].sum()
print(f"\nsignificant features (p < 0.05): {n_sig} / {len(stat_df)}")

statistical comparison (pd vs hc):
      feature    hc_mean    pd_mean      p_value  cohens_d  significant
      f0_mean 160.742396 157.214900 1.552615e-01 -0.099071        False
       f0_std  28.358798  16.314699 9.934741e-12 -0.474039         True
       f0_min 104.664480 112.084145 6.084628e-03  0.191354         True
       f0_max 327.872990 265.079022 2.020530e-07 -0.364051         True
     f0_range 223.208510 152.994877 1.612838e-07 -0.367048         True
 jitter_local   0.013715   0.010967 1.844817e-04 -0.260806         True
   jitter_rap   0.006565   0.004915 5.828475e-06 -0.317014         True
  jitter_ppq5   0.007286   0.004994 2.053376e-10 -0.444103         True
   jitter_ddp   0.019694   0.014746 5.828475e-06 -0.317014         True
shimmer_local   0.086921   0.050745 1.929831e-28 -0.792726         True
 shimmer_apq3   0.039901   0.018900 6.362333e-41 -0.969389         True
 shimmer_apq5   0.053709   0.025031 7.443816e-41 -0.968453         True
shimmer_apq11   0.083906   0.

## 8. save baseline results

In [25]:
import json
from datetime import datetime


def _fold_score_summary(scores):
    """Return mean/std/min/max and the raw per-fold values of a score array as plain Python types."""
    return {
        'accuracy_mean': float(scores.mean()),
        'accuracy_std': float(scores.std()),
        'accuracy_min': float(scores.min()),
        'accuracy_max': float(scores.max()),
        'per_fold_scores': scores.tolist(),
    }


# Everything needed to compare later phases against this baseline, collected
# into a single JSON-serialisable record.
baseline_results = {
    'timestamp': datetime.now().isoformat(),
    'dataset': 'italian_pvs',
    'n_samples': len(features_clean),
    'n_subjects': len(np.unique(groups)),
    'n_features': len(available_features),
    'features_used': available_features,
    'cv_method': 'leave_one_subject_out',
    'n_folds': logo.get_n_splits(X, y, groups),
    'svm': _fold_score_summary(svm_scores),
    'random_forest': _fold_score_summary(rf_scores),
    'best_model': best_name,
    'feature_importance': importance_df.to_dict('records'),
    'statistical_comparison': stat_df.to_dict('records'),
}

# save results
baseline_path = RESULTS_DIR / 'clinical_baseline_results.json'
with open(baseline_path, 'w') as results_file:
    json.dump(baseline_results, results_file, indent=2)

print(f"baseline results saved to {baseline_path}")

baseline results saved to /Volumes/usb drive/pd-interpretability/results/clinical_baseline_results.json


In [26]:
# Write the per-subject summary table next to the JSON results.
subject_results_path = RESULTS_DIR / 'clinical_baseline_subjects.csv'
subject_df.to_csv(path_or_buf=subject_results_path, index=False)
print(f"subject results saved to {subject_results_path}")

subject results saved to /Volumes/usb drive/pd-interpretability/results/clinical_baseline_subjects.csv


## 9. summary

In [27]:
# Final recap of the baseline run, built from values computed in earlier cells.
banner = "=" * 60
print(banner)
print("PHASE 2: CLINICAL BASELINE - SUMMARY")
print(banner)
print("\ndataset: italian pvs")
print(f"samples: {len(features_clean)}")
print(f"subjects: {len(np.unique(groups))}")
print(f"features: {len(available_features)} clinical biomarkers")
print(f"\ncross-validation: leave-one-subject-out ({logo.get_n_splits(X, y, groups)} folds)")
print("\nBASELINE RESULTS:")
print(f"  svm (rbf):       {svm_scores.mean()*100:.1f}% +/- {svm_scores.std()*100:.1f}%")
print(f"  random forest:   {rf_scores.mean()*100:.1f}% +/- {rf_scores.std()*100:.1f}%")
print("\ntarget range: 70-85%")

# Place the better mean accuracy (in percent) relative to the 70-85 target
# band; both band edges count as inside.
best_acc = max(svm_scores.mean(), rf_scores.mean()) * 100
if best_acc > 85:
    status_line = "status: ABOVE TARGET (excellent clinical features)"
elif best_acc >= 70:
    status_line = "status: WITHIN TARGET RANGE"
else:
    status_line = "status: BELOW TARGET (may need feature engineering)"
print(status_line)

print("\ntop 5 most important features:")
top_features = importance_df.head(5)
for feature_name, feature_weight in zip(top_features['feature'], top_features['importance']):
    print(f"  {feature_name}: {feature_weight:.4f}")

print(f"\nsignificant features (p < 0.05): {n_sig} / {len(stat_df)}")
print("\nphase 2 complete. ready to proceed to phase 3 (wav2vec2 fine-tuning).")

PHASE 2: CLINICAL BASELINE - SUMMARY

dataset: italian pvs
samples: 829
subjects: 61
features: 14 clinical biomarkers

cross-validation: leave-one-subject-out (61 folds)

BASELINE RESULTS:
  svm (rbf):       82.4% +/- 20.9%
  random forest:   85.0% +/- 19.1%

target range: 70-85%
status: WITHIN TARGET RANGE

top 5 most important features:
  shimmer_apq5: 0.1328
  shimmer_dda: 0.1215
  shimmer_apq11: 0.1152
  shimmer_apq3: 0.1024
  shimmer_local: 0.0888

significant features (p < 0.05): 13 / 14

phase 2 complete. ready to proceed to phase 3 (wav2vec2 fine-tuning).
