# Quick Smoke Test

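This notebook is an end-to-end smoke test of the PCA -> QSVM pipeline on the small `parkinsons.data` voice dataset: it converts the table into the pipeline's input files, runs the pipeline, and prints the resulting summary.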

In [1]:
# Standard library
import os
import sys

# Third-party
import numpy as np
import pandas as pd

# Put ../src (resolved against the kernel's working directory) on the module
# search path before the local import below -- presumably that is where
# pca_qs_pipeline lives.
sys.path.append('../src')

from pca_qs_pipeline import run_pipeline

In [2]:
# Load data
# Parse the voice-measurement table with pandas' CSV reader (comma is already
# the default separator) and print its first five rows as a quick sanity check.
data_path = '../parkinsons/parkinsons.data'
df = pd.read_csv(data_path, sep=',')
print(df.head(5))

             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     HNR  status      RPDE       DFA   spread1  \
0      0.0654

In [3]:
# Prepare data for pipeline
# The pipeline call in the next cell is pointed at two files: a .npy feature
# matrix ('embeddings_subjects.npy') and a CSV manifest
# ('subjects_manifest.csv') that carries a 'label' column.

# Feature matrix: every column other than the identifier ('name') and the
# target ('status'), cast to float32.
features = df.drop(columns=['name', 'status']).to_numpy().astype(np.float32)
labels = df['status'].to_numpy()

# Dummy manifest: one row per dataset row, pairing its name with its label.
# NOTE(review): the previewed names (phon_R01_S01_1 ... _5) look like
# per-recording IDs rather than per-subject ones -- confirm how the pipeline
# uses 'subject_alias'.
manifest = (
    df[['name']]
    .rename(columns={'name': 'subject_alias'})
    .assign(label=labels)
)

# Write both artifacts into the smoke-test output folder, creating it if needed.
output_dir = '../outputs/smoke_test'
os.makedirs(output_dir, exist_ok=True)

np.save(
    os.path.join(output_dir, 'embeddings_subjects.npy'),
    features,
)
manifest.to_csv(
    os.path.join(output_dir, 'subjects_manifest.csv'),
    index=False,
)

print("Data prepared.")

Data prepared.


In [4]:
# Run Pipeline
# PCA dims are kept small because the voice dataset has only ~22 features.
# NOTE(review): the captured log reports "nystrom=False" for every fold, so
# nystrom_m may have no effect at this sample size -- confirm in pca_qs_pipeline.
pipeline_kwargs = dict(
    embeddings_path=os.path.join(output_dir, 'embeddings_subjects.npy'),
    manifest_path=os.path.join(output_dir, 'subjects_manifest.csv'),
    output_dir=output_dir,
    pca_dims=[4, 8, 16],
    n_splits=3,
    n_qubits=4,
    nystrom_m=50,  # Small dataset, small Nystrom
)
run_pipeline(**pipeline_kwargs)


=== Running for PCA dim: 4 ===
  Fold 1/3
    Running QSVM (n_qubits=4, nystrom=False)...
Computing kernel of shape (130, 130) with 4 qubits...
Computing kernel of shape (65, 130) with 4 qubits...
  Fold 2/3
    Running QSVM (n_qubits=4, nystrom=False)...
Computing kernel of shape (130, 130) with 4 qubits...
Computing kernel of shape (65, 130) with 4 qubits...
  Fold 3/3
    Running QSVM (n_qubits=4, nystrom=False)...
Computing kernel of shape (130, 130) with 4 qubits...
Computing kernel of shape (65, 130) with 4 qubits...

=== Running for PCA dim: 8 ===
  Fold 1/3
    Running QSVM (n_qubits=4, nystrom=False)...
Computing kernel of shape (130, 130) with 4 qubits...
Computing kernel of shape (65, 130) with 4 qubits...
  Fold 2/3
    Running QSVM (n_qubits=4, nystrom=False)...
Computing kernel of shape (130, 130) with 4 qubits...
Computing kernel of shape (65, 130) with 4 qubits...
  Fold 3/3
    Running QSVM (n_qubits=4, nystrom=False)...
Computing kernel of shape (130, 130) with 4 qub

In [5]:
# Check results
# Read the summary CSV from the smoke-test output folder (presumably written by
# run_pipeline) and print the whole table.
results = pd.read_csv(
    os.path.join(output_dir, 'results_summary.csv'),
)
print(results)

      model  pca_dim  acc_mean   acc_std  auc_mean   auc_std
0        LR        4  0.825641  0.032026  0.877976  0.027247
1        LR        8  0.861538  0.055470  0.903912  0.031820
2        LR       16  0.866667  0.064051  0.906888  0.030157
3      QSVM        4  0.815385  0.040704  0.789541  0.106892
4      QSVM        8  0.815385  0.040704  0.789541  0.106892
5      QSVM       16  0.805128  0.032026  0.803997  0.104983
6        RF        4  0.892308  0.015385  0.951318  0.004100
7        RF        8  0.876923  0.015385  0.954932  0.008421
8        RF       16  0.887179  0.035529  0.951105  0.020865
9   SVM-RBF        4  0.841026  0.008882  0.821429  0.066179
10  SVM-RBF        8  0.861538  0.026647  0.909014  0.052935
11  SVM-RBF       16  0.861538  0.026647  0.909864  0.047217
