In [None]:
# ============================================================================
# CELL 1: SETUP AND INSTALLATION (WITH AUTO-RESTART)
# ============================================================================
print("="*80)
print("SAND CHALLENGE - HUBERT BASELINE (STATE-OF-THE-ART)")
print("="*80)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install required packages
print("\n[1/7] Installing required packages...")
print("‚è± This will take 2-3 minutes...")

# Uninstall conflicting packages first
!pip uninstall -y transformers huggingface_hub tokenizers

# Install fresh compatible versions
!pip install transformers==4.40.0 -q
!pip install torch torchaudio -q
!pip install scikit-learn pandas numpy -q
!pip install librosa soundfile -q
!pip install optuna -q
!pip install matplotlib seaborn -q
!pip install imbalanced-learn -q  # ‚≠ê ADDED FOR SMOTE

print("‚úÖ Installation complete!")
print("üîÑ Restarting runtime...")

# Automatic restart
import os
os.kill(os.getpid(), 9)

SAND CHALLENGE - HUBERT BASELINE (STATE-OF-THE-ART)
Mounted at /content/drive

[1/7] Installing required packages...
‚è± This will take 2-3 minutes...
Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: huggingface-hub 0.36.0
Uninstalling huggingface-hub-0.36.0:
  Successfully uninstalled huggingface-hub-0.36.0
Found existing installation: tokenizers 0.22.1
Uninstalling tokenizers-0.22.1:
  Successfully uninstalled tokenizers-0.22.1
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m137.6/137.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [1]:
# ============================================================================
# CELL 2: IMPORTS (RUN AFTER RESTART)
# ============================================================================
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
import torchaudio
import librosa
import soundfile as sf
from scipy.stats import skew, kurtosis

from transformers import (
    HubertModel,
    AutoFeatureExtractor
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, cohen_kappa_score
)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm.auto import tqdm
import joblib
import json

# Mount Drive again after restart
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úÖ Using device: {device}")

# Set random seeds
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

print("‚úÖ All imports successful!")

Mounted at /content/drive
‚úÖ Using device: cuda
‚úÖ All imports successful!


In [27]:
# ============================================================================
# CELL 3: CONFIGURATION
# ============================================================================
class Config:
    """Configuration for HuBERT baseline"""

    # Paths
    BASE_PATH = '/content/drive/MyDrive/SAND_Project_Data'
    TRAINING_PATH = os.path.join(BASE_PATH, 'training')
    EXCEL_PATH = os.path.join(BASE_PATH, 'sand_task_1.xlsx')
    OUTPUT_PATH = os.path.join(BASE_PATH, 'hubert_baseline_results')

    # Sheet names
    SHEET_TRAINING_BASELINE = 'Training Baseline - Task 1'
    SHEET_VALIDATION_BASELINE = 'Validation Baseline - Task 1'

    # Audio types
    AUDIO_TYPES = ['phonationA', 'phonationE', 'phonationI', 'phonationO', 'phonationU',
                   'rhythmKA', 'rhythmPA', 'rhythmTA']

    # HuBERT Model Configuration
    HUBERT_MODEL = 'facebook/hubert-base-ls960'
    MAX_AUDIO_LENGTH = 16000 * 17  # 17 seconds max
    SAMPLING_RATE = 16000

    # Feature extraction strategy
    POOLING_STRATEGY = 'mean'
    USE_ALL_LAYERS = False
    LAYER_TO_USE = -1

    # Classifier Configuration
    CLASSIFIER = 'svm'

    # Training parameters
    BATCH_SIZE = 8
    N_FOLDS = 5
    RANDOM_SEED = 42

    # ‚≠ê SMOTE Configuration
    USE_SMOTE = True  # Set to True to use SMOTE
    SMOTE_K_NEIGHBORS = 3  # Number of neighbors for SMOTE (must be < smallest class size)
    SMOTE_SAMPLING_STRATEGY = 'auto'  # 'auto' or dict like {0: 50, 1: 50}

    # Class names (0-indexed)
    CLASS_NAMES = {
        0: 'Severe Dysarthria',
        1: 'Moderate Dysarthria',
        2: 'Mild Dysarthria',
        3: 'No Dysarthria (ALS)',
        4: 'Healthy'
    }

config = Config()

# Create output directory
os.makedirs(config.OUTPUT_PATH, exist_ok=True)
print(f"\n‚úÖ Output directory: {config.OUTPUT_PATH}")

# Verify paths
print(f"\nüîç Verifying paths...")
print(f"   Base path exists: {os.path.exists(config.BASE_PATH)}")
print(f"   Training path exists: {os.path.exists(config.TRAINING_PATH)}")
print(f"   Excel file exists: {os.path.exists(config.EXCEL_PATH)}")

if not all([os.path.exists(config.BASE_PATH),
            os.path.exists(config.TRAINING_PATH),
            os.path.exists(config.EXCEL_PATH)]):
    raise FileNotFoundError("Required paths not found!")

print(f"\nüìã Configuration:")
print(f"   HuBERT Model: {config.HUBERT_MODEL}")
print(f"   Pooling Strategy: {config.POOLING_STRATEGY}")
print(f"   Classifier: {config.CLASSIFIER}")
print(f"   Use SMOTE: {config.USE_SMOTE}")  # ‚≠ê ADDED
print(f"   Device: {device}")


‚úÖ Output directory: /content/drive/MyDrive/SAND_Project_Data/hubert_baseline_results

üîç Verifying paths...
   Base path exists: True
   Training path exists: True
   Excel file exists: True

üìã Configuration:
   HuBERT Model: facebook/hubert-base-ls960
   Pooling Strategy: mean
   Classifier: svm
   Use SMOTE: True
   Device: cuda


In [28]:
# ============================================================================
# CELL 4: LOAD DATA
# ============================================================================
print("\n[2/7] Loading dataset...")

def load_data(excel_path, sheet_name):
    """Load dataset from Excel"""
    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    print(f"   ‚úÖ Loaded {len(df)} samples from: '{sheet_name}'")

    # Convert to 0-indexed classes
    df['Class'] = df['Class'] - 1

    # Display distribution
    class_dist = df['Class'].value_counts().sort_index()
    print(f"   üìä Class distribution:")
    for cls, count in class_dist.items():
        pct = (count / len(df)) * 100
        print(f"      Class {cls} ({config.CLASS_NAMES[cls]}): {count:3d} ({pct:5.2f}%)")

    return df

# Load datasets
print("\nüìä Loading data sheets...")
df_train = load_data(config.EXCEL_PATH, config.SHEET_TRAINING_BASELINE)
print()
df_val = load_data(config.EXCEL_PATH, config.SHEET_VALIDATION_BASELINE)

print(f"\nüìà Dataset Summary:")
print(f"   Training samples: {len(df_train):3d}")
print(f"   Validation samples: {len(df_val):3d}")
print(f"   Total audio files: {(len(df_train) + len(df_val)) * len(config.AUDIO_TYPES)}")

# Verify no overlap
train_ids = set(df_train['ID'].values)
val_ids = set(df_val['ID'].values)
overlap = train_ids.intersection(val_ids)
print(f"   Overlap check: {'‚úÖ No overlap' if len(overlap) == 0 else f'‚ö† {len(overlap)} overlaps'}")



[2/7] Loading dataset...

üìä Loading data sheets...
   ‚úÖ Loaded 219 samples from: 'Training Baseline - Task 1'
   üìä Class distribution:
      Class 0 (Severe Dysarthria):   4 ( 1.83%)
      Class 1 (Moderate Dysarthria):  22 (10.05%)
      Class 2 (Mild Dysarthria):  45 (20.55%)
      Class 3 (No Dysarthria (ALS)):  62 (28.31%)
      Class 4 (Healthy):  86 (39.27%)

   ‚úÖ Loaded 53 samples from: 'Validation Baseline - Task 1'
   üìä Class distribution:
      Class 0 (Severe Dysarthria):   2 ( 3.77%)
      Class 1 (Moderate Dysarthria):   4 ( 7.55%)
      Class 2 (Mild Dysarthria):  12 (22.64%)
      Class 3 (No Dysarthria (ALS)):  14 (26.42%)
      Class 4 (Healthy):  21 (39.62%)

üìà Dataset Summary:
   Training samples: 219
   Validation samples:  53
   Total audio files: 2176
   Overlap check: ‚úÖ No overlap


In [29]:
# ============================================================================
# CELL 5: LOAD HUBERT MODEL
# ============================================================================
print(f"\n[3/7] Loading HuBERT model: {config.HUBERT_MODEL}")
print("‚è± This may take 1-2 minutes for first-time download...")

# Load feature extractor and model
feature_extractor = AutoFeatureExtractor.from_pretrained(config.HUBERT_MODEL)
model = HubertModel.from_pretrained(config.HUBERT_MODEL)
model = model.to(device)
model.eval()  # Set to evaluation mode

print(f"‚úÖ Model loaded successfully!")
print(f"   Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
print(f"   Hidden size: {model.config.hidden_size}")
print(f"   Number of layers: {model.config.num_hidden_layers}")
print(f"   Sampling rate: {feature_extractor.sampling_rate} Hz")



[3/7] Loading HuBERT model: facebook/hubert-base-ls960
‚è± This may take 1-2 minutes for first-time download...


Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

‚úÖ Model loaded successfully!
   Model size: 94.4M parameters
   Hidden size: 768
   Number of layers: 12
   Sampling rate: 16000 Hz


In [56]:
# ============================================================================
# CELL 6: AUDIO PROCESSING FUNCTIONS
# ============================================================================
def load_audio(audio_path, target_sr=16000, max_length=None):
    """
    Load and preprocess audio file
    Args:
        audio_path: Path to audio file
        target_sr: Target sampling rate
        max_length: Maximum audio length in samples
    Returns:
        audio: Audio waveform as numpy array
        sr: Sampling rate
    """
    try:
        # Load audio
        audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)

        # Trim silence
        audio, _ = librosa.effects.trim(audio, top_db=20)

        # Limit length if specified
        if max_length and len(audio) > max_length:
            audio = audio[:max_length]

        # Ensure minimum length (1 second)
        min_length = target_sr
        if len(audio) < min_length:
            audio = np.pad(audio, (0, min_length - len(audio)), mode='constant')

        return audio, sr

    except Exception as e:
        print(f"  ‚ö† Error loading {audio_path}: {str(e)}")
        # Return silence
        return np.zeros(target_sr), target_sr

def extract_spectral_features(audio, sr=16000):
    """
    Extract comprehensive spectral features from audio

    Args:
        audio: Audio waveform
        sr: Sampling rate

    Returns:
        features: Array of spectral features
    """
    features = []

    # 1. Original spectral features (6 features)
    spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    features.extend([np.mean(spectral_centroids), np.std(spectral_centroids)])

    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    features.extend([np.mean(spectral_rolloff), np.std(spectral_rolloff)])

    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]
    features.extend([np.mean(spectral_bandwidth), np.std(spectral_bandwidth)])

    # 2. Spectral Flatness (3 features) - HIGHLY RECOMMENDED
    spectral_flatness = librosa.feature.spectral_flatness(y=audio)[0]
    features.extend([
        np.mean(spectral_flatness),
        np.std(spectral_flatness),
        np.max(spectral_flatness)
    ])

    return np.array(features)

def extract_hubert_features(audio, sampling_rate=16000, pooling='mean'):
    """
    Extract HuBERT features from audio

    Args:
        audio: Audio waveform (numpy array)
        sampling_rate: Sampling rate
        pooling: Pooling strategy ('mean', 'max', 'mean+max', 'last')

    Returns:
        features: Extracted features as numpy array
    """
    try:
        # Preprocess audio
        inputs = feature_extractor(
            audio,
            sampling_rate=sampling_rate,
            return_tensors="pt",
            padding=True
        )

        # Move to device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Extract features
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        # Get hidden states
        if config.USE_ALL_LAYERS:
            # Use all hidden layers
            hidden_states = outputs.hidden_states
            features = torch.stack(hidden_states, dim=0).mean(dim=0)
        else:
            # Use specific layer
            features = outputs.hidden_states[config.LAYER_TO_USE]

        # Apply pooling
        if pooling == 'mean':
            features = features.mean(dim=1)
        elif pooling == 'max':
            features = features.max(dim=1)[0]
        elif pooling == 'mean+max':
            mean_pool = features.mean(dim=1)
            max_pool = features.max(dim=1)[0]
            features = torch.cat([mean_pool, max_pool], dim=-1)
        elif pooling == 'last':
            features = features[:, -1, :]
        else:
            features = features.mean(dim=1)

        # Convert to numpy
        features = features.cpu().numpy().squeeze()

        return features

    except Exception as e:
        print(f"  ‚ö† Error extracting features: {str(e)}")
        # Return zero features
        feature_dim = model.config.hidden_size
        if pooling == 'mean+max':
            feature_dim *= 2
        return np.zeros(feature_dim)

print("‚úÖ Audio processing functions defined!")

‚úÖ Audio processing functions defined!


In [57]:
# ============================================================================
# CELL 7: EXTRACT FEATURES FROM ALL AUDIO FILES
# ============================================================================
print(f"\n[4/7] Extracting HuBERT features...")
print(f"  Strategy: {config.POOLING_STRATEGY} pooling")
print(f"  Processing {len(df_train)} training + {len(df_val)} validation samples")
print(f"  Total: {(len(df_train) + len(df_val)) * len(config.AUDIO_TYPES)} audio files")

def extract_features_for_dataset(df, dataset_path, audio_types):
    """
    Extract HuBERT + Spectral features for entire dataset

    Args:
        df: DataFrame with patient IDs and labels
        dataset_path: Path to audio files
        audio_types: List of audio types to process

    Returns:
        X: Feature matrix
        y: Labels
        ids: Patient IDs
    """
    features_list = []
    labels_list = []
    ids_list = []

    print(f"\n  Processing {len(df)} patients...")

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="  Extracting"):
        patient_id = row['ID']
        patient_class = row['Class']

        patient_features = []

        # Process all audio types for this patient
        for audio_type in audio_types:
            audio_file = f"{patient_id}_{audio_type}.wav"
            audio_path = os.path.join(dataset_path, audio_type, audio_file)

            if os.path.exists(audio_path):
                # Load audio
                audio, sr = load_audio(
                    audio_path,
                    target_sr=config.SAMPLING_RATE,
                    max_length=config.MAX_AUDIO_LENGTH
                )

                # Extract HuBERT features
                hubert_features = extract_hubert_features(
                    audio,
                    sampling_rate=sr,
                    pooling=config.POOLING_STRATEGY
                )

                # Extract spectral features
                spectral_features = extract_spectral_features(audio, sr)

                # Concatenate HuBERT + Spectral features
                combined_features = np.concatenate([hubert_features, spectral_features])

                patient_features.append(combined_features)
            else:
                # Use zero features for missing files
                feature_dim = model.config.hidden_size
                if config.POOLING_STRATEGY == 'mean+max':
                    feature_dim *= 2
                # Add 9 for spectral features (6 + 3)
                feature_dim += 9
                patient_features.append(np.zeros(feature_dim))

        # Concatenate features from all audio types
        combined_features = np.concatenate(patient_features)
        features_list.append(combined_features)
        labels_list.append(patient_class)
        ids_list.append(patient_id)

    X = np.array(features_list)
    y = np.array(labels_list)
    ids = np.array(ids_list)

    return X, y, ids

# Extract features for training set
print("\nüéµ Extracting TRAINING features...")
X_train, y_train, ids_train = extract_features_for_dataset(
    df_train,
    config.TRAINING_PATH,
    config.AUDIO_TYPES
)

# Extract features for validation set
print("\nüéµ Extracting VALIDATION features...")
X_val, y_val, ids_val = extract_features_for_dataset(
    df_val,
    config.TRAINING_PATH,
    config.AUDIO_TYPES
)

print(f"\n‚úÖ Feature extraction complete!")
print(f"  Training: X={X_train.shape}, y={y_train.shape}")
print(f"  Validation: X={X_val.shape}, y={y_val.shape}")

# Save raw features
features_path = os.path.join(config.OUTPUT_PATH, 'hubert_features.npz')
np.savez(
    features_path,
    X_train=X_train, y_train=y_train, ids_train=ids_train,
    X_val=X_val, y_val=y_val, ids_val=ids_val
)
print(f"  üíæ Features saved to: {features_path}")

# Clear GPU memory
torch.cuda.empty_cache()


[4/7] Extracting HuBERT features...
  Strategy: mean pooling
  Processing 219 training + 53 validation samples
  Total: 2176 audio files

üéµ Extracting TRAINING features...

  Processing 219 patients...


  Extracting:   0%|          | 0/219 [00:00<?, ?it/s]


üéµ Extracting VALIDATION features...

  Processing 53 patients...


  Extracting:   0%|          | 0/53 [00:00<?, ?it/s]


‚úÖ Feature extraction complete!
  Training: X=(219, 6216), y=(219,)
  Validation: X=(53, 6216), y=(53,)
  üíæ Features saved to: /content/drive/MyDrive/SAND_Project_Data/hubert_baseline_results/hubert_features.npz


In [58]:
# ============================================================================
# CELL 8: PREPROCESS FEATURES
# ============================================================================
print("\n[5/7] Preprocessing features...")

# Check for problematic values
print(f"   üîç Data quality check:")
print(f"      NaN in X_train: {np.isnan(X_train).sum()}")
print(f"      Inf in X_train: {np.isinf(X_train).sum()}")
print(f"      NaN in X_val: {np.isnan(X_val).sum()}")
print(f"      Inf in X_val: {np.isinf(X_val).sum()}")

# Clean data
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_val = np.nan_to_num(X_val, nan=0.0, posinf=0.0, neginf=0.0)

# Standardize features
print(f"\n   üîß Standardizing features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

print(f"   ‚úÖ Features standardized")
print(f"      Training - Mean: {X_train_scaled.mean():.6f}, Std: {X_train_scaled.std():.6f}")
print(f"      Validation - Mean: {X_val_scaled.mean():.6f}, Std: {X_val_scaled.std():.6f}")

# Save scaler
scaler_path = os.path.join(config.OUTPUT_PATH, 'scaler.pkl')
joblib.dump(scaler, scaler_path)
print(f"   üíæ Scaler saved to: {scaler_path}")

# Display feature statistics
print(f"\n   üìä Feature Statistics:")
print(f"      Feature dimension: {X_train_scaled.shape[1]}")
print(f"      Min value: {X_train_scaled.min():.4f}")
print(f"      Max value: {X_train_scaled.max():.4f}")
print(f"      Mean: {X_train_scaled.mean():.4f}")
print(f"      Std: {X_train_scaled.std():.4f}")


[5/7] Preprocessing features...
   üîç Data quality check:
      NaN in X_train: 0
      Inf in X_train: 0
      NaN in X_val: 0
      Inf in X_val: 0

   üîß Standardizing features...
   ‚úÖ Features standardized
      Training - Mean: 0.000000, Std: 1.000000
      Validation - Mean: 0.019462, Std: 8.833868
   üíæ Scaler saved to: /content/drive/MyDrive/SAND_Project_Data/hubert_baseline_results/scaler.pkl

   üìä Feature Statistics:
      Feature dimension: 6216
      Min value: -6.1489
      Max value: 14.7648
      Mean: 0.0000
      Std: 1.0000


In [59]:
# ============================================================================
# CELL 8.5: APPLY SMOTE TO BALANCE CLASSES
# ============================================================================
if config.USE_SMOTE:
    print("\n[5.5/7] Applying SMOTE for class balancing...")

    # Show original distribution
    print(f"\n üìä Original class distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    for cls, count in zip(unique, counts):
        print(f"   Class {cls} ({config.CLASS_NAMES[cls]:25s}): {count:3d} samples")

    # Determine k_neighbors based on smallest class
    min_class_size = min(counts)
    k_neighbors = min(config.SMOTE_K_NEIGHBORS, min_class_size - 1)

    if k_neighbors < 1:
        print(f"\n ‚ö†Ô∏è  WARNING: Smallest class has only {min_class_size} samples.")
        print(f"    SMOTE requires at least 2 samples per class. Skipping SMOTE.")
        X_train_resampled = X_train_scaled
        y_train_resampled = y_train
    else:
        print(f"\n üîß Applying SMOTE with k_neighbors={k_neighbors}...")

        try:
            smote = SMOTE(
                sampling_strategy=config.SMOTE_SAMPLING_STRATEGY,
                k_neighbors=k_neighbors,
                random_state=config.RANDOM_SEED
            )

            X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

            # Show new distribution
            print(f"\n ‚úÖ SMOTE applied successfully!")
            print(f"\n üìä New class distribution after SMOTE:")
            unique_new, counts_new = np.unique(y_train_resampled, return_counts=True)
            for cls, count in zip(unique_new, counts_new):
                print(f"   Class {cls} ({config.CLASS_NAMES[cls]:25s}): {count:3d} samples")

            print(f"\n üìà Dataset size change:")
            print(f"   Before SMOTE: {len(y_train)} samples")
            print(f"   After SMOTE:  {len(y_train_resampled)} samples")
            print(f"   Increase:     +{len(y_train_resampled) - len(y_train)} samples (+{(len(y_train_resampled)/len(y_train) - 1)*100:.1f}%)")

        except Exception as e:
            print(f"\n ‚ö†Ô∏è  Error applying SMOTE: {str(e)}")
            print(f"    Using original unbalanced data.")
            X_train_resampled = X_train_scaled
            y_train_resampled = y_train
else:
    print("\n[5.5/7] SMOTE disabled - using original class distribution")
    X_train_resampled = X_train_scaled
    y_train_resampled = y_train

# Keep validation set unchanged
X_val_scaled_final = X_val_scaled
y_val_final = y_val

print(f"\n ‚úÖ Ready for training!")
print(f"   Training samples: {len(y_train_resampled)}")
print(f"   Validation samples: {len(y_val_final)}")


[5.5/7] Applying SMOTE for class balancing...

 üìä Original class distribution:
   Class 0 (Severe Dysarthria        ):   4 samples
   Class 1 (Moderate Dysarthria      ):  22 samples
   Class 2 (Mild Dysarthria          ):  45 samples
   Class 3 (No Dysarthria (ALS)      ):  62 samples
   Class 4 (Healthy                  ):  86 samples

 üîß Applying SMOTE with k_neighbors=3...

 ‚úÖ SMOTE applied successfully!

 üìä New class distribution after SMOTE:
   Class 0 (Severe Dysarthria        ):  86 samples
   Class 1 (Moderate Dysarthria      ):  86 samples
   Class 2 (Mild Dysarthria          ):  86 samples
   Class 3 (No Dysarthria (ALS)      ):  86 samples
   Class 4 (Healthy                  ):  86 samples

 üìà Dataset size change:
   Before SMOTE: 219 samples
   After SMOTE:  430 samples
   Increase:     +211 samples (+96.3%)

 ‚úÖ Ready for training!
   Training samples: 430
   Validation samples: 53


In [60]:
# ============================================================================
# CELL 9: TRAIN CLASSIFIER
# ============================================================================
print(f"\n[6/7] Training classifier: {config.CLASSIFIER.upper()}")

def get_classifier(classifier_type):
    """Get classifier based on type"""
    if classifier_type == 'logistic':
        return LogisticRegression(
            max_iter=1000,
            random_state=config.RANDOM_SEED,
            class_weight='balanced',
            C=1.0
        )
    elif classifier_type == 'svm':
        return SVC(
            kernel='rbf',
            C=10.0,
            gamma='scale',
            random_state=config.RANDOM_SEED,
            class_weight='balanced',
            probability=True
        )
    elif classifier_type == 'rf':
        return RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            random_state=config.RANDOM_SEED,
            class_weight='balanced',
            n_jobs=-1
        )
    elif classifier_type == 'gb':
        return GradientBoostingClassifier(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            random_state=config.RANDOM_SEED
        )
    else:
        return LogisticRegression(
            max_iter=1000,
            random_state=config.RANDOM_SEED,
            class_weight='balanced'
        )

# Initialize and train classifier
print(f"\n üéØ Training {config.CLASSIFIER.upper()} classifier...")
classifier = get_classifier(config.CLASSIFIER)

# ‚≠ê Train on SMOTE-resampled data
classifier.fit(X_train_resampled, y_train_resampled)
print(f"   ‚úÖ Training complete!")

# Cross-validation score on training set
print(f"\n üìä Cross-validation on training set...")
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    classifier,
    X_train_resampled,  # ‚≠ê Use resampled data
    y_train_resampled,  # ‚≠ê Use resampled labels
    cv=config.N_FOLDS,
    scoring='f1_macro',
    n_jobs=-1
)

print(f"   CV F1 Scores: {cv_scores}")
print(f"   Mean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Save model
model_path = os.path.join(config.OUTPUT_PATH, 'classifier.pkl')
joblib.dump(classifier, model_path)
print(f"\n üíæ Classifier saved to: {model_path}")


[6/7] Training classifier: SVM

 üéØ Training SVM classifier...
   ‚úÖ Training complete!

 üìä Cross-validation on training set...
   CV F1 Scores: [0.81408835 0.84808769 0.76733997 0.87047619 0.86630411]
   Mean CV F1: 0.8333 (+/- 0.0385)

 üíæ Classifier saved to: /content/drive/MyDrive/SAND_Project_Data/hubert_baseline_results/classifier.pkl


In [61]:
# ============================================================================
# CELL 10: EVALUATE ON VALIDATION SET
# ============================================================================
print("\n[7/7] Evaluating on validation set...")

# ‚≠ê Predict on original (non-SMOTE) validation set
y_val_pred = classifier.predict(X_val_scaled_final)
y_val_pred_proba = classifier.predict_proba(X_val_scaled_final)

# Compute metrics
accuracy = accuracy_score(y_val_final, y_val_pred)
f1_macro = f1_score(y_val_final, y_val_pred, average='macro')
f1_weighted = f1_score(y_val_final, y_val_pred, average='weighted')
precision = precision_score(y_val_final, y_val_pred, average='macro', zero_division=0)
recall = recall_score(y_val_final, y_val_pred, average='macro', zero_division=0)
kappa = cohen_kappa_score(y_val_final, y_val_pred)

print("\n" + "="*80)
print("üéØ VALIDATION SET RESULTS - HUBERT BASELINE" + (" (WITH SMOTE)" if config.USE_SMOTE else ""))
print("="*80)
print(f"Accuracy:              {accuracy:.4f}")
print(f"F1 Score (Macro):      {f1_macro:.4f} ‚≠ê Main SAND metric")
print(f"F1 Score (Weighted):   {f1_weighted:.4f}")
print(f"Precision (Macro):     {precision:.4f}")
print(f"Recall (Macro):        {recall:.4f}")
print(f"Cohen's Kappa:         {kappa:.4f}")

# Per-class metrics
print("\n" + "-"*80)
print("PER-CLASS METRICS")
print("-"*80)
available_classes = sorted(np.unique(y_val_final))
class_labels = [config.CLASS_NAMES[int(i)] for i in available_classes]
print(classification_report(y_val_final, y_val_pred, target_names=class_labels, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_val_final, y_val_pred)
print("\nConfusion Matrix:")
print(cm)

# Per-class F1
f1_per_class = f1_score(y_val_final, y_val_pred, average=None, zero_division=0)
print("\nF1 Score per Class:")
for cls, f1_val in zip(available_classes, f1_per_class):
    label_name = config.CLASS_NAMES[int(cls)]
    print(f"   Class {cls} ({label_name:25s}): {f1_val:.4f}")

# Comparison with baselines
print("\n" + "="*80)
print("üìä COMPARISON WITH BASELINES")
print("="*80)
sand_vit = 0.606
xgboost_baseline = 0.326
hubert_no_smote = 0.5324  # Your previous result without SMOTE

print(f"HuBERT + SMOTE (This model): {f1_macro:.4f} üöÄ")
print(f"HuBERT (No SMOTE):           {hubert_no_smote:.4f}")
print(f"SAND ViT Baseline:           {sand_vit:.4f}")
print(f"XGBoost + OpenSMILE:         {xgboost_baseline:.4f}")

if f1_macro > sand_vit:
    improvement = f1_macro - sand_vit
    print(f"\n‚úÖ BEATS SAND baseline by +{improvement:.4f} (+{improvement/sand_vit*100:.1f}%)!")
else:
    gap = sand_vit - f1_macro
    print(f"\nGap to SAND: -{gap:.4f}")

if f1_macro > hubert_no_smote:
    improvement_smote = f1_macro - hubert_no_smote
    print(f"SMOTE Improvement: +{improvement_smote:.4f} (+{improvement_smote/hubert_no_smote*100:.1f}%)")

print("="*80)

# Save results
results = {
    'model': 'HuBERT + SMOTE' if config.USE_SMOTE else 'HuBERT',
    'hubert_model': config.HUBERT_MODEL,
    'pooling_strategy': config.POOLING_STRATEGY,
    'classifier': config.CLASSIFIER,
    'use_smote': config.USE_SMOTE,
    'smote_k_neighbors': k_neighbors if config.USE_SMOTE else None,
    'n_train_original': int(len(y_train)),
    'n_train_after_smote': int(len(y_train_resampled)),
    'accuracy': float(accuracy),
    'f1_macro': float(f1_macro),
    'f1_weighted': float(f1_weighted),
    'precision': float(precision),
    'recall': float(recall),
    'kappa': float(kappa),
    'f1_per_class': {int(cls): float(f1_val) for cls, f1_val in zip(available_classes, f1_per_class)},
    'cv_scores': cv_scores.tolist(),
    'cv_mean': float(cv_scores.mean()),
    'cv_std': float(cv_scores.std()),
    'confusion_matrix': cm.tolist(),
    'feature_dim': int(X_train.shape[1]),
    'n_val': int(len(y_val_final)),
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}

results_path = os.path.join(config.OUTPUT_PATH, 'results_with_smote.json' if config.USE_SMOTE else 'results.json')
with open(results_path, 'w') as f:
    json.dump(results, f, indent=4)

print(f"\nüíæ Results saved to: {results_path}")


[7/7] Evaluating on validation set...

üéØ VALIDATION SET RESULTS - HUBERT BASELINE (WITH SMOTE)
Accuracy:              0.6981
F1 Score (Macro):      0.5821 ‚≠ê Main SAND metric
F1 Score (Weighted):   0.6719
Precision (Macro):     0.5767
Recall (Macro):        0.6071
Cohen's Kappa:         0.5656

--------------------------------------------------------------------------------
PER-CLASS METRICS
--------------------------------------------------------------------------------
                     precision    recall  f1-score   support

  Severe Dysarthria       0.00      0.00      0.00         2
Moderate Dysarthria       0.80      1.00      0.89         4
    Mild Dysarthria       0.75      0.75      0.75        12
No Dysarthria (ALS)       0.67      0.43      0.52        14
            Healthy       0.67      0.86      0.75        21

           accuracy                           0.70        53
          macro avg       0.58      0.61      0.58        53
       weighted avg       0.6