In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# set project path
import os
import sys

PROJECT_ROOT = '/Volumes/usb drive/pd-interpretability'
os.chdir(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT)

print(f'working directory: {os.getcwd()}')
print(f'project files: {os.listdir(".")}')

FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/usb drive/pd-interpretability'

In [None]:
# install requirements
!pip install -q -r requirements-colab.txt

[0m[31mERROR: Cannot install -r requirements-colab.txt (line 1), -r requirements-colab.txt (line 12), -r requirements-colab.txt (line 2), -r requirements-colab.txt (line 7), numpy>=2.4.0 and scikit-learn==1.3.2 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

In [None]:
# verify gpu availability
import torch

print(f'pytorch version: {torch.__version__}')
print(f'cuda available: {torch.cuda.is_available()}')

if torch.cuda.is_available():
    print(f'gpu device: {torch.cuda.get_device_name(0)}')
    print(f'gpu memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB')
else:
    print('warning: no gpu detected. enable gpu runtime: Runtime -> Change runtime type -> GPU')

pytorch version: 2.9.0+cu126
cuda available: True
gpu device: Tesla T4
gpu memory: 15.83 GB


In [None]:
# verify imports
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torchaudio
import librosa
import parselmouth

print('all core packages imported successfully')

all core packages imported successfully


In [None]:
# verify project module imports
from src.data.datasets import ItalianPVSDataset, MDVRKCLDataset, ArkansasDataset
from src.data.preprocessing import segment_audio, normalize_audio, AudioPreprocessor
from src.features.clinical import ClinicalFeatureExtractor
from src.models.classifier import Wav2Vec2PDClassifier, DataCollatorWithPadding

print('all project modules imported successfully')

all project modules imported successfully


## data verification

In [None]:
# check available datasets
from pathlib import Path

data_root = Path(PROJECT_ROOT) / 'data' / 'raw'

datasets_available = {
    'italian_pvs': (data_root / 'italian_pvs').exists(),
    'mdvr_kcl': (data_root / 'mdvr-kcl').exists(),
    'arkansas (figshare)': (data_root / 'arkansas (figshare)').exists(),
    'uci_oxford_parkinsons': (data_root / 'uci_oxford_parkinsons').exists()
}

print('dataset availability:')
for name, available in datasets_available.items():
    status = 'available' if available else 'not found'
    print(f'  {name}: {status}')

dataset availability:
  italian_pvs: available
  mdvr_kcl: available
  arkansas (figshare): available
  uci_oxford_parkinsons: available


In [None]:
# load italian pvs dataset for testing
try:
    italian_dataset = ItalianPVSDataset(
        root_dir=str(data_root / 'italian_pvs'),
        task=None,
        max_duration=10.0
    )
    print(f'italian pvs dataset loaded: {len(italian_dataset)} samples')

    # get class distribution
    labels = [s['label'] for s in italian_dataset.samples]
    n_pd = sum(labels)
    n_hc = len(labels) - n_pd
    print(f'class distribution: {n_hc} hc, {n_pd} pd')

except Exception as e:
    print(f'failed to load italian pvs: {e}')

italian pvs dataset loaded: 831 samples
class distribution: 394 hc, 437 pd


In [None]:
# test sample loading
import sys

# install torchcodec
!pip install -q torchcodec

# This code will only execute after the restart (if one was triggered manually or by previous cells)
sample = italian_dataset[0]

print(f'sample keys: {sample.keys()}')
print(f'input_values shape: {sample["input_values"].shape}')
print(f'label: {sample["label"]}')
print(f'subject_id: {sample["subject_id"]}')

[0msample keys: dict_keys(['input_values', 'label', 'subject_id', 'task', 'path', 'diagnosis'])
input_values shape: torch.Size([160000])
label: 0
subject_id: HC_young_Alberto_R


In [None]:
# test subject-wise split
print('generating subject-wise splits...')
train_subset, val_subset, test_subset = italian_dataset.get_subject_split(
    test_size=0.2,
    val_size=0.1,
    random_state=42
)

print(f'train samples: {len(train_subset)} (total samples in this split)')
print(f'validation samples: {len(val_subset)} (total samples in this split)')
print(f'test samples: {len(test_subset)} (total samples in this split)')

# verify no subject overlap
print('verifying for subject leakage between splits...')
# access subject_id directly from the dataset's metadata using the subset indices
train_subjects = set(train_subset.dataset.samples[i]['subject_id'] for i in train_subset.indices)
val_subjects = set(val_subset.dataset.samples[i]['subject_id'] for i in val_subset.indices)
test_subjects = set(test_subset.dataset.samples[i]['subject_id'] for i in test_subset.indices)

print(f'unique subjects in train split: {len(train_subjects)}')
print(f'unique subjects in validation split: {len(val_subjects)}')
print(f'unique subjects in test split: {len(test_subjects)}')

assert len(train_subjects & val_subjects) == 0, 'subject leakage detected: train and validation splits share subjects.'
assert len(train_subjects & test_subjects) == 0, 'subject leakage detected: train and test splits share subjects.'
assert len(val_subjects & test_subjects) == 0, 'subject leakage detected: validation and test splits share subjects.'

print('no subject leakage detected - splits are valid and distinct.')

generating subject-wise splits...
train samples: 565 (total samples in this split)
validation samples: 65 (total samples in this split)
test samples: 201 (total samples in this split)
verifying for subject leakage between splits...
unique subjects in train split: 42
unique subjects in validation split: 6
unique subjects in test split: 13
no subject leakage detected - splits are valid and distinct.


In [None]:
# ensure parselmouth.undefined exists for clinicalfeatureextractor
import parselmouth
import numpy as np # added explicitly for clarity

if not hasattr(parselmouth, 'UNDEFINED'):
    parselmouth.UNDEFINED = np.nan

# test clinical feature extraction on sample
extractor = ClinicalFeatureExtractor()

# extract from first sample
sample_path = italian_dataset.samples[0]['path']
features = extractor.extract(str(sample_path))

print('extracted clinical features:')
for key, value in features.items():
    if value is not None:
        print(f'  {key}: {value:.4f}')

extracted clinical features:
  f0_mean: 132.5603
  f0_std: 85.6724
  f0_min: 69.4267
  f0_max: 597.5883
  f0_median: 111.0828
  f0_range: 528.1616
  voicing_fraction: 0.6061
  jitter_local: 0.0300
  jitter_rap: 0.0121
  jitter_ppq5: 0.0152
  jitter_ddp: 0.0363
  shimmer_local: 0.1281
  shimmer_apq3: 0.0673
  shimmer_apq5: 0.0760
  shimmer_apq11: 0.1145
  shimmer_dda: 0.2019
  hnr_mean: 10.0523
  hnr_std: 5.1828
  f1_mean: 716.9532
  f1_std: 438.6613
  f2_mean: 1810.4948
  f2_std: 575.4603
  f3_mean: 3044.5919
  f3_std: 444.7081
  f4_mean: 4102.5912
  f4_std: 444.9910
  total_duration: 38.2955
  voiced_duration: 23.2115
  unvoiced_duration: 15.0840


## model verification

In [None]:
# test model loading
classifier = Wav2Vec2PDClassifier(
    model_name='facebook/wav2vec2-base-960h',
    num_labels=2,
    freeze_feature_extractor=True,
    device='cuda'
)

params = classifier.count_parameters()
print('model parameters:')
print(f'  total: {params["total"]:,}')
print(f'  trainable: {params["trainable"]:,}')
print(f'  frozen: {params["frozen"]:,}')
print(f'  trainable %: {params["trainable_percent"]:.2f}%')

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model parameters:
  total: 94,569,090
  trainable: 90,368,642
  frozen: 4,200,448
  trainable %: 95.56%


In [None]:
# test forward pass
sample_input = sample['input_values'].unsqueeze(0).to('cuda')

with torch.no_grad():
    logits = classifier.forward(sample_input)

print(f'input shape: {sample_input.shape}')
print(f'output logits shape: {logits.shape}')
print(f'output logits: {logits}')

input shape: torch.Size([1, 160000])
output logits shape: torch.Size([1, 2])
output logits: tensor([[0.0011, 0.0350]], device='cuda:0')


In [None]:
# test data collator
from torch.utils.data import Subset

# initialize DataCollatorWithPadding with a specified max_length
# the max_length is derived from the dataset's max_duration (10.0s) and sampling rate (16000 hz)
collator = DataCollatorWithPadding(
    classifier.feature_extractor,
    max_length=160000 # 10 seconds * 16000 samples/second
)

# create small batch
batch_samples = [italian_dataset[i] for i in range(4)]
batch = collator(batch_samples)

print(f'batch keys: {batch.keys()}')
print(f'input_values shape: {batch["input_values"].shape}')
print(f'attention_mask shape: {batch["attention_mask"].shape}')
print(f'labels: {batch["labels"]}')

batch keys: KeysView({'input_values': tensor([[[-4.6985e-04, -4.6985e-04, -4.6985e-04,  ..., -3.2339e-01,
          -3.1667e-01, -8.1201e-02],
         [-4.6985e-04, -4.6985e-04,  2.2726e-03,  ..., -9.2194e-01,
          -8.1224e-01, -8.7806e-01],
         [-4.6985e-04, -4.6985e-04, -4.6985e-04,  ...,  1.5676e-01,
           2.3181e-01,  2.5325e-01],
         [-4.6985e-04, -4.6985e-04, -4.6985e-04,  ...,  2.7230e+00,
           1.8186e+00,  1.5370e+00]]]), 'attention_mask': tensor([[1, 1, 1, 1]], dtype=torch.int32), 'labels': tensor([0, 0, 0, 0])})
input_values shape: torch.Size([1, 4, 160000])
attention_mask shape: torch.Size([1, 4])
labels: tensor([0, 0, 0, 0])


## environment saved

environment is verified and ready for training.
proceed to notebook 02 for fine-tuning.

In [None]:
# save environment info for reproducibility
import json
from datetime import datetime

env_info = {
    'timestamp': datetime.now().isoformat(),
    'pytorch_version': torch.__version__,
    'cuda_available': torch.cuda.is_available(),
    'gpu_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
    'datasets': {
        'italian_pvs': len(italian_dataset) if 'italian_dataset' in dir() else 0
    },
    'model_params': params
}

env_path = Path(PROJECT_ROOT) / 'results' / 'env_info.json'
env_path.parent.mkdir(parents=True, exist_ok=True)

with open(env_path, 'w') as f:
    json.dump(env_info, f, indent=2)

print(f'environment info saved to {env_path}')

environment info saved to /content/drive/MyDrive/pd-interpretability/results/env_info.json


In [None]:
%cd /content/drive/MyDrive/pd-interpretability

!git status

!git config user.name "smayan-gowda"
!git config user.email "smayan-gowda@users.noreply.github.com"

!git add -A

!git commit -m "sync colab-tested notebook and requirements fixes"

!git push origin main

/content
Refresh index: 100% (69/69), done.
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   data/activations/activations[m
	[31mmodified:   data/activations/activations.bak[m
	[31mmodified:   notebooks/colab/01_setup_and_data.ipynb[m
	[31mmodified:   requirements-colab.txt[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m=4.41.0[m
	[31mresults/env_info.json[m

no changes added to commit (use "git add" and/or "git commit -a")
[main 846a7cf] sync colab-tested notebook and requirements fixes
 6 files changed, 21 insertions(+), 346 deletions(-)
 create mode 100644 =4.41.0
 rewrite notebooks/colab/01_setup_and_data.ipynb (100%)
 create mode 100644 results/env_info.json
Enumerating objects: 20, done.
Counting objects: 100% (20/20), done.
Delta comp