In [None]:
# mount google drive so files stored on it are reachable from the colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# set project path - adjust this to your drive location
import os
import sys

PROJECT_ROOT = '/content/drive/MyDrive/pd-interpretability'

# fail early with an actionable message instead of the bare chdir error
if not os.path.isdir(PROJECT_ROOT):
    raise FileNotFoundError(
        f'project root not found: {PROJECT_ROOT} - '
        'mount google drive first and adjust PROJECT_ROOT to your drive location'
    )

# relative paths used from here on resolve against the project root
os.chdir(PROJECT_ROOT)

# put the project root on the import path; skip the insert when the cell is
# re-run so sys.path does not accumulate duplicate entries
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print(f'working directory: {os.getcwd()}')
print(f'project files: {os.listdir(".")}')

In [None]:
# install requirements
!pip install -q -r requirements-colab.txt

In [None]:
# verify gpu availability
import torch

# query cuda once and reuse the answer for both the report and the branch
has_cuda = torch.cuda.is_available()

print(f'pytorch version: {torch.__version__}')
print(f'cuda available: {has_cuda}')

if not has_cuda:
    print('warning: no gpu detected. enable gpu runtime: Runtime -> Change runtime type -> GPU')
else:
    print(f'gpu device: {torch.cuda.get_device_name(0)}')
    # total_memory is divided by 1e9 to report the size in GB
    total_memory = torch.cuda.get_device_properties(0).total_memory
    print(f'gpu memory: {total_memory / 1e9:.2f} GB')

In [None]:
# verify imports: each core package must import without raising
from transformers import (
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor,
)
import torchaudio
import librosa
import parselmouth

# only reached when every import above succeeded
print('all core packages imported successfully')

In [None]:
# verify project module imports: datasets, preprocessing, clinical features, model
from src.data.datasets import (
    ItalianPVSDataset,
    MDVRKCLDataset,
    ArkansasDataset,
)
from src.data.preprocessing import (
    segment_audio,
    normalize_audio,
    AudioPreprocessor,
)
from src.features.clinical import ClinicalFeatureExtractor
from src.models.classifier import (
    Wav2Vec2PDClassifier,
    DataCollatorWithPadding,
)

# only reached when every import above succeeded
print('all project modules imported successfully')

## data verification

In [None]:
# check available datasets
from pathlib import Path

data_root = Path(PROJECT_ROOT) / 'data' / 'raw'

# dataset name -> directory name expected under data/raw
dataset_dirs = {
    'italian_pvs': 'italian_pvs',
    'mdvr_kcl': 'mdvr-kcl',
    'arkansas': 'arkansas (figshare)',
}

# a dataset counts as available when its directory path exists
datasets_available = {
    dataset: (data_root / dirname).exists()
    for dataset, dirname in dataset_dirs.items()
}

print('dataset availability:')
for dataset, found in datasets_available.items():
    print(f"  {dataset}: {'available' if found else 'not found'}")

In [None]:
# load italian pvs dataset for testing
try:
    # only the constructor is inside the try, so the failure message below is
    # printed for load errors and not for errors in the statistics code
    italian_dataset = ItalianPVSDataset(
        root_dir=str(data_root / 'italian_pvs'),
        task=None,
        max_duration=10.0
    )
except Exception as e:
    # best-effort: report the failure and let the notebook continue; later
    # cells that use `italian_dataset` depend on this load succeeding
    print(f'failed to load italian pvs: {e}')
else:
    print(f'italian pvs dataset loaded: {len(italian_dataset)} samples')

    # get class distribution
    # NOTE(review): sum() yields the pd count only if labels are 0 (hc) / 1 (pd) - confirm
    labels = [s['label'] for s in italian_dataset.samples]
    n_pd = sum(labels)
    n_hc = len(labels) - n_pd
    print(f'class distribution: {n_hc} hc, {n_pd} pd')

In [None]:
# test sample loading: fetch the first item and show its fields
sample = italian_dataset[0]

print(f'sample keys: {sample.keys()}')
print(f'input_values shape: {sample["input_values"].shape}')

# the remaining fields are printed as they are
for field in ('label', 'subject_id'):
    print(f'{field}: {sample[field]}')

In [None]:
# test subject-wise split
split_kwargs = {'test_size': 0.2, 'val_size': 0.1, 'random_state': 42}
train_idx, val_idx, test_idx = italian_dataset.get_subject_split(**split_kwargs)

for split_name, split_indices in (('train', train_idx), ('val', val_idx), ('test', test_idx)):
    print(f'{split_name} samples: {len(split_indices)}')


def _subjects_in(indices):
    """Return the set of subject ids of the dataset samples at `indices`."""
    return {italian_dataset.samples[i]['subject_id'] for i in indices}


# verify no subject overlap: every pair of splits must share no subject id
train_subjects = _subjects_in(train_idx)
val_subjects = _subjects_in(val_idx)
test_subjects = _subjects_in(test_idx)

assert train_subjects.isdisjoint(val_subjects), 'subject leakage: train-val'
assert train_subjects.isdisjoint(test_subjects), 'subject leakage: train-test'
assert val_subjects.isdisjoint(test_subjects), 'subject leakage: val-test'

print('no subject leakage detected - splits are valid')

In [None]:
# test clinical feature extraction on sample
extractor = ClinicalFeatureExtractor()

# extract from first sample; the path is handed over as a plain string
sample_path = italian_dataset.samples[0]['path']
features = extractor.extract(str(sample_path))

print('extracted clinical features:')
for feature_name, feature_value in features.items():
    # features without a value are left out of the listing
    if feature_value is None:
        continue
    print(f'  {feature_name}: {feature_value:.4f}')

## model verification

In [None]:
# test model loading
# use the gpu when one is present, otherwise fall back to cpu so this check
# also runs on a runtime without cuda
# NOTE(review): assumes Wav2Vec2PDClassifier accepts 'cpu' for `device` - confirm
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'loading model on: {device}')

classifier = Wav2Vec2PDClassifier(
    model_name='facebook/wav2vec2-base-960h',
    num_labels=2,
    freeze_feature_extractor=True,
    device=device
)

# count_parameters() is read below as a mapping with the keys
# total / trainable / frozen / trainable_percent
params = classifier.count_parameters()
print('model parameters:')
print(f'  total: {params["total"]:,}')
print(f'  trainable: {params["trainable"]:,}')
print(f'  frozen: {params["frozen"]:,}')
print(f'  trainable %: {params["trainable_percent"]:.2f}%')

In [None]:
# test forward pass
# place the input on cuda when it is available, otherwise on cpu, instead of
# assuming a gpu is present
# NOTE(review): the input must end up on the same device as `classifier` - confirm
input_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# unsqueeze(0) adds a leading dimension (presumably the batch dimension)
sample_input = sample['input_values'].unsqueeze(0).to(input_device)

# gradients are not needed for this check
with torch.no_grad():
    logits = classifier.forward(sample_input)

print(f'input shape: {sample_input.shape}')
print(f'output logits shape: {logits.shape}')
print(f'output logits: {logits}')

In [None]:
# test data collator
from torch.utils.data import Subset

collator = DataCollatorWithPadding(classifier.feature_extractor)

# create small batch from the first four dataset items
batch_samples = []
for item_index in range(4):
    batch_samples.append(italian_dataset[item_index])
batch = collator(batch_samples)

print(f'batch keys: {batch.keys()}')
for tensor_key in ('input_values', 'attention_mask'):
    print(f'{tensor_key} shape: {batch[tensor_key].shape}')
print(f'labels: {batch["labels"]}')

## environment saved

if every cell above ran without errors, the environment is verified and ready for training.
run the cell below to save the environment info, then proceed to notebook 02 for fine-tuning.

In [None]:
# save environment info for reproducibility
import json
from datetime import datetime
from pathlib import Path

# objects created by earlier cells may be missing when those cells failed or
# were skipped; look them up by name and record a placeholder instead of
# raising NameError
loaded_dataset = globals().get('italian_dataset')
model_params = globals().get('params')

cuda_available = torch.cuda.is_available()

env_info = {
    'timestamp': datetime.now().isoformat(),
    'pytorch_version': torch.__version__,
    'cuda_available': cuda_available,
    'gpu_name': torch.cuda.get_device_name(0) if cuda_available else None,
    'datasets': {
        'italian_pvs': len(loaded_dataset) if loaded_dataset is not None else 0
    },
    'model_params': model_params
}

# results/ is created on demand so the first run does not fail on a missing directory
env_path = Path(PROJECT_ROOT) / 'results' / 'env_info.json'
env_path.parent.mkdir(parents=True, exist_ok=True)

with open(env_path, 'w', encoding='utf-8') as f:
    json.dump(env_info, f, indent=2)

print(f'environment info saved to {env_path}')