In [None]:
# ============================================================================
# CELL 1: SETUP AND INSTALLATION
# ============================================================================
print("="*80)
print("Whisper \nRandom Forest Classifier \nBalanced Dataset")
print("="*80)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install required packages
print("\n[Installing packages...]")
!pip uninstall -y transformers huggingface_hub tokenizers -q
!pip install transformers==4.40.0 -q
!pip install torch torchaudio -q
!pip install scikit-learn pandas numpy -q
!pip install librosa soundfile -q
!pip install imbalanced-learn -q
!pip install matplotlib seaborn -q
print("‚úÖ Installation complete!")

print("\nüìÑ Restarting runtime...")
import os
os.kill(os.getpid(), 9)

Whisper 
Random Forest Classifier 
Balanced Dataset
Mounted at /content/drive

[Installing packages...]
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m137.6/137.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m566.1/566.1 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.6/3.6 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that

In [22]:
# ============================================================================
# CELL 2: IMPORTS (RUN AFTER RESTART)
# ============================================================================
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
import librosa
import soundfile as sf
from scipy.stats import skew, kurtosis

# Replace HuBERT with Whisper
from transformers import WhisperModel, WhisperFeatureExtractor, WhisperProcessor

# Replace SVM with Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, cohen_kappa_score
)

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm.auto import tqdm
import joblib
import json

# Mount Drive again after restart
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úÖ Using device: {device}")

# Set random seeds for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

print("‚úÖ All imports successful!")



Mounted at /content/drive
‚úÖ Using device: cuda
‚úÖ All imports successful!


In [51]:
# ============================================================================
# CELL 3: OPTIMIZED CONFIGURATION
# ============================================================================
class Config:
    """Optimized Configuration for Maximum Performance"""

    # Paths
    BASE_PATH = '/content/drive/MyDrive/SAND_Project_Data'
    TRAINING_PATH = os.path.join(BASE_PATH, 'training')
    EXCEL_PATH = os.path.join(BASE_PATH, 'sand_task_1.xlsx')
    OUTPUT_PATH = os.path.join(BASE_PATH, 'optimized_whisper_rf_v2')

    # Sheet names
    SHEET_TRAINING = 'Training Baseline - Task 1'
    SHEET_VALIDATION = 'Validation Baseline - Task 1'

    # Audio types
    AUDIO_TYPES = ['phonationA', 'phonationE', 'phonationI', 'phonationO', 'phonationU',
                   'rhythmKA', 'rhythmPA', 'rhythmTA']

    # ========================================================================
    # OPTIMIZED WHISPER CONFIGURATION
    # ========================================================================
    WHISPER_MODEL = 'openai/whisper-base'
    SAMPLING_RATE = 16000
    # CHANGED: Maximum audio length set to 10 seconds (no padding beyond actual audio)
    MAX_AUDIO_LENGTH_SEC = 17  # Maximum 10 seconds
    MAX_AUDIO_LENGTH = 16000 * MAX_AUDIO_LENGTH_SEC  # 10 seconds in samples

    # Multi-layer feature extraction (CRITICAL FOR PERFORMANCE)
    USE_MULTI_LAYER = True
    LAYERS_TO_USE = [-4, -3, -2, -1]  # Last 4 layers

    # Multi-pooling strategy (CRITICAL FOR PERFORMANCE)
    USE_MULTI_POOLING = True
    POOLING_STRATEGIES = ['mean', 'std', 'max', 'min']  # 4 pooling methods

    # Statistical features (HELPS WITH DYSARTHRIA PATTERNS)
    USE_STATISTICAL_FEATURES = True

    # CHANGED: Audio processing - no padding, slice to actual content only
    AUDIO_TRIM_DB = 15  # Lower threshold for dysarthric speech
    AUDIO_NORMALIZE = True
    MIN_AUDIO_LENGTH_SEC = 2  # Minimum 1 second
    # ADDED: Flag to disable padding
    USE_PADDING = False  # Changed from implicit True to explicit False

    # ========================================================================
    # PREPROCESSING OPTIMIZATIONS
    # ========================================================================
    USE_ROBUST_SCALING = True  # Better for outliers

    # PCA Configuration
    USE_PCA = True
    PCA_VARIANCE = 0.97  # Retain 97% variance

    # ========================================================================
    # DATA BALANCING (CRITICAL FOR MINORITY CLASSES)
    # ========================================================================
    USE_SMOTE = True
    SMOTE_STRATEGY = 'not majority'  # Oversample all except majority
    SMOTE_K_NEIGHBORS = 2  # Lower for small classes

    # ========================================================================
    # OPTIMIZED RANDOM FOREST HYPERPARAMETERS
    # ========================================================================
    RF_CONFIG = {
        'n_estimators': 500,
        'max_depth': 20,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'sqrt',
        'bootstrap': True,
        'class_weight': 'balanced',
        'random_state': RANDOM_SEED,
        'n_jobs': -1
    }

    # Cross-validation
    N_FOLDS = 5
    RANDOM_SEED = 42

    # Class names
    CLASS_NAMES = {
        0: 'Severe Dysarthria',
        1: 'Moderate Dysarthria',
        2: 'Mild Dysarthria',
        3: 'No Dysarthria (ALS)',
        4: 'Healthy'
    }

config = Config()
os.makedirs(config.OUTPUT_PATH, exist_ok=True)

print("\n" + "="*80)
print("üìã OPTIMIZED CONFIGURATION")
print("="*80)
print(f"Whisper Model: {config.WHISPER_MODEL}")
print(f"Multi-Layer Extraction: {config.USE_MULTI_LAYER} ({len(config.LAYERS_TO_USE)} layers)")
print(f"Multi-Pooling: {config.USE_MULTI_POOLING} ({len(config.POOLING_STRATEGIES)} strategies)")
print(f"Statistical Features: {config.USE_STATISTICAL_FEATURES}")
print(f"PCA: {config.USE_PCA} (variance={config.PCA_VARIANCE})")
print(f"SMOTE: {config.USE_SMOTE}")
print(f"Random Forest Estimators: {config.RF_CONFIG['n_estimators']}")
# CHANGED: Display 10 seconds instead of 15
print(f"Max Audio Length: {config.MAX_AUDIO_LENGTH_SEC} seconds (NO PADDING)")
print(f"Use Padding: {config.USE_PADDING}")
print(f"Device: {device}")
print("="*80)

# Verify paths
print(f"\nüìÇ Verifying paths...")
assert os.path.exists(config.BASE_PATH), "Base path not found!"
assert os.path.exists(config.TRAINING_PATH), "Training path not found!"
assert os.path.exists(config.EXCEL_PATH), "Excel file not found!"
print("‚úÖ All paths verified!")


üìã OPTIMIZED CONFIGURATION
Whisper Model: openai/whisper-base
Multi-Layer Extraction: True (4 layers)
Multi-Pooling: True (4 strategies)
Statistical Features: True
PCA: True (variance=0.97)
SMOTE: True
Random Forest Estimators: 500
Max Audio Length: 17 seconds (NO PADDING)
Use Padding: False
Device: cuda

üìÇ Verifying paths...
‚úÖ All paths verified!


In [52]:
# ============================================================================
# CELL 4: LOAD DATASETS
# ============================================================================
print("\n" + "="*80)
print("[1/9] LOADING DATASETS")
print("="*80)

def load_data(excel_path, sheet_name):
    """Load dataset from Excel"""
    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    print(f"\n‚úÖ Loaded {len(df)} samples from: '{sheet_name}'")

    # Convert to 0-indexed classes
    df['Class'] = df['Class'] - 1

    # Display distribution
    print(f"üìä Class distribution:")
    class_dist = df['Class'].value_counts().sort_index()
    for cls, count in class_dist.items():
        pct = (count / len(df)) * 100
        print(f"  Class {cls} ({config.CLASS_NAMES[cls]:25s}): {count:3d} ({pct:5.2f}%)")

    return df

# Load training and validation sets
df_train = load_data(config.EXCEL_PATH, config.SHEET_TRAINING)
df_val = load_data(config.EXCEL_PATH, config.SHEET_VALIDATION)

print(f"\nüìà Summary:")
print(f"  Training samples: {len(df_train)}")
print(f"  Validation samples: {len(df_val)}")
print(f"  Total audio files: {(len(df_train) + len(df_val)) * len(config.AUDIO_TYPES)}")

# Verify no overlap
train_ids = set(df_train['ID'].values)
val_ids = set(df_val['ID'].values)
overlap = train_ids.intersection(val_ids)
print(f"  Overlap check: {'‚úÖ No overlap' if len(overlap) == 0 else f'‚ö† {len(overlap)} overlapping IDs'}")




[1/9] LOADING DATASETS

‚úÖ Loaded 219 samples from: 'Training Baseline - Task 1'
üìä Class distribution:
  Class 0 (Severe Dysarthria        ):   4 ( 1.83%)
  Class 1 (Moderate Dysarthria      ):  22 (10.05%)
  Class 2 (Mild Dysarthria          ):  45 (20.55%)
  Class 3 (No Dysarthria (ALS)      ):  62 (28.31%)
  Class 4 (Healthy                  ):  86 (39.27%)

‚úÖ Loaded 53 samples from: 'Validation Baseline - Task 1'
üìä Class distribution:
  Class 0 (Severe Dysarthria        ):   2 ( 3.77%)
  Class 1 (Moderate Dysarthria      ):   4 ( 7.55%)
  Class 2 (Mild Dysarthria          ):  12 (22.64%)
  Class 3 (No Dysarthria (ALS)      ):  14 (26.42%)
  Class 4 (Healthy                  ):  21 (39.62%)

üìà Summary:
  Training samples: 219
  Validation samples: 53
  Total audio files: 2176
  Overlap check: ‚úÖ No overlap


In [53]:
# ============================================================================
# CELL 5: LOAD WHISPER MODEL
# ============================================================================
print("\n" + "="*80)
print("[2/9] LOADING WHISPER MODEL")
print("="*80)
print(f"Model: {config.WHISPER_MODEL}")
print("‚è± This may take 1-2 minutes for first-time download...")

# Load feature extractor and model
processor = WhisperProcessor.from_pretrained(config.WHISPER_MODEL)
whisper_model = WhisperModel.from_pretrained(config.WHISPER_MODEL)
whisper_model = whisper_model.to(device)
whisper_model.eval()  # Set to evaluation mode

print(f"\n‚úÖ Whisper loaded successfully!")
print(f"  Model size: {sum(p.numel() for p in whisper_model.parameters()) / 1e6:.1f}M parameters")
print(f"  Hidden size: {whisper_model.config.d_model}")
print(f"  Number of encoder layers: {whisper_model.config.encoder_layers}")
print(f"  Sampling rate: {processor.feature_extractor.sampling_rate} Hz")




[2/9] LOADING WHISPER MODEL
Model: openai/whisper-base
‚è± This may take 1-2 minutes for first-time download...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



‚úÖ Whisper loaded successfully!
  Model size: 72.6M parameters
  Hidden size: 512
  Number of encoder layers: 6
  Sampling rate: 16000 Hz


In [54]:
# ============================================================================
# CELL 6: ENHANCED AUDIO PROCESSING FUNCTIONS (NO PADDING VERSION)
# ============================================================================
print("\n" + "="*80)
print("[3/9] DEFINING ENHANCED AUDIO PROCESSING (NO PADDING)")
print("="*80)

def load_audio_enhanced(audio_path, target_sr=16000, max_length=None):
    """
    Enhanced audio loading - EXTRACTS ONLY ACTUAL AUDIO CONTENT (NO PADDING)

    Args:
        audio_path: Path to audio file
        target_sr: Target sampling rate
        max_length: Maximum audio length in samples (will slice if longer)

    Returns:
        audio: Preprocessed audio waveform (actual content only)
        sr: Sampling rate
    """
    try:
        # Load audio
        audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)

        # Advanced trimming with lower threshold for dysarthric speech
        audio, _ = librosa.effects.trim(audio, top_db=config.AUDIO_TRIM_DB)

        # Normalize amplitude
        if config.AUDIO_NORMALIZE:
            audio = librosa.util.normalize(audio)

        # CHANGED: Slice to max_length instead of padding
        if max_length and len(audio) > max_length:
            audio = audio[:max_length]

        # CHANGED: Only pad if audio is extremely short (< 1 second)
        min_length = target_sr * config.MIN_AUDIO_LENGTH_SEC
        if len(audio) < min_length:
            # Pad only to minimum viable length (1 second)
            audio = np.pad(audio, (0, min_length - len(audio)), mode='constant')

        # REMOVED: No padding to 15 seconds - use actual audio length only

        return audio, sr

    except Exception as e:
        print(f"‚ö† Error loading {audio_path}: {str(e)}")
        # Return minimum silence as fallback
        return np.zeros(target_sr * config.MIN_AUDIO_LENGTH_SEC), target_sr

def extract_statistical_features(audio, sr=16000):
    """
    Extract statistical and acoustic features from audio
    These help capture dysarthria-specific patterns

    Args:
        audio: Audio waveform
        sr: Sampling rate

    Returns:
        features: Array of statistical features
    """
    features = []

    # Time-domain statistics
    features.extend([
        np.mean(audio),           # Mean amplitude
        np.std(audio),            # Standard deviation
        skew(audio),              # Skewness
        kurtosis(audio),          # Kurtosis
        np.max(np.abs(audio)),    # Peak amplitude
        np.median(audio),         # Median
        np.percentile(audio, 25), # 25th percentile
        np.percentile(audio, 75)  # 75th percentile
    ])

    # Zero crossing rate (voice quality indicator)
    zcr = librosa.feature.zero_crossing_rate(audio)[0]
    features.extend([np.mean(zcr), np.std(zcr), np.max(zcr)])

    # Energy features
    energy = np.sum(audio**2) / len(audio)
    features.append(energy)

    # RMS energy
    rms = librosa.feature.rms(y=audio)[0]
    features.extend([np.mean(rms), np.std(rms)])

    # Spectral features
    spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    features.extend([np.mean(spectral_centroids), np.std(spectral_centroids)])

    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    features.extend([np.mean(spectral_rolloff), np.std(spectral_rolloff)])

    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]
    features.extend([np.mean(spectral_bandwidth), np.std(spectral_bandwidth)])

    return np.array(features)

def extract_whisper_features_multilayer(audio, sampling_rate=16000):
    """
    Extract Whisper features with multi-layer and multi-pooling strategy
    CHANGED: Now handles variable-length audio without padding

    Args:
        audio: Audio waveform (actual content only, no padding)
        sampling_rate: Sampling rate

    Returns:
        features: Concatenated feature vector
    """
    try:
        # Preprocess audio for Whisper
        inputs = processor(
            audio,
            sampling_rate=sampling_rate,
            return_tensors="pt"
        )

        # Move to device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Extract features with all hidden states
        with torch.no_grad():
            outputs = whisper_model.encoder(**inputs, output_hidden_states=True)

        all_features = []

        # Extract from multiple layers
        if config.USE_MULTI_LAYER:
            layers_to_process = config.LAYERS_TO_USE
        else:
            layers_to_process = [-1]  # Only last layer

        for layer_idx in layers_to_process:
            hidden_state = outputs.hidden_states[layer_idx]  # Shape: [batch, time, hidden]

            # Apply multiple pooling strategies
            if config.USE_MULTI_POOLING:
                pooling_methods = config.POOLING_STRATEGIES
            else:
                pooling_methods = ['mean']  # Only mean pooling

            for pool_strategy in pooling_methods:
                if pool_strategy == 'mean':
                    pooled = hidden_state.mean(dim=1)
                elif pool_strategy == 'max':
                    pooled = hidden_state.max(dim=1)[0]
                elif pool_strategy == 'std':
                    pooled = hidden_state.std(dim=1)
                elif pool_strategy == 'min':
                    pooled = hidden_state.min(dim=1)[0]
                else:
                    pooled = hidden_state.mean(dim=1)

                all_features.append(pooled.cpu().numpy().squeeze())

        # Concatenate all features
        combined_features = np.concatenate(all_features)

        return combined_features

    except Exception as e:
        print(f"‚ö† Feature extraction error: {str(e)}")
        # Return zero features as fallback
        n_layers = len(config.LAYERS_TO_USE) if config.USE_MULTI_LAYER else 1
        n_pooling = len(config.POOLING_STRATEGIES) if config.USE_MULTI_POOLING else 1
        feature_dim = whisper_model.config.d_model * n_layers * n_pooling
        return np.zeros(feature_dim)

print("‚úÖ Audio processing functions defined!")
print(f"  Enhanced audio loading (NO PADDING): ‚úì")
print(f"  Statistical features ({20} features): ‚úì")
print(f"  Multi-layer Whisper extraction: ‚úì")




[3/9] DEFINING ENHANCED AUDIO PROCESSING (NO PADDING)
‚úÖ Audio processing functions defined!
  Enhanced audio loading (NO PADDING): ‚úì
  Statistical features (20 features): ‚úì
  Multi-layer Whisper extraction: ‚úì


In [55]:
# ============================================================================
# CELL 7: EXTRACT FEATURES FROM ALL AUDIO FILES (NO PADDING VERSION)
# ============================================================================
print("\n" + "="*80)
print("[4/9] EXTRACTING FEATURES (NO PADDING - ACTUAL AUDIO ONLY)")
print("="*80)
print(f"Configuration:")
print(f"  Layers: {len(config.LAYERS_TO_USE)}")
print(f"  Pooling strategies: {len(config.POOLING_STRATEGIES)}")
print(f"  Statistical features: {config.USE_STATISTICAL_FEATURES}")
print(f"  Feature multiplication factor: {len(config.LAYERS_TO_USE) * len(config.POOLING_STRATEGIES)}x")
print(f"  Max audio length: {config.MAX_AUDIO_LENGTH_SEC}s (NO PADDING BEYOND ACTUAL CONTENT)")

def extract_features_for_dataset(df, dataset_path, audio_types, desc="Processing"):
    """
    Extract enhanced features for entire dataset
    CHANGED: Now processes only actual audio content without padding

    Args:
        df: DataFrame with patient IDs and labels
        dataset_path: Path to audio files
        audio_types: List of audio types
        desc: Progress bar description

    Returns:
        X: Feature matrix
        y: Labels
        ids: Patient IDs
    """
    features_list = []
    labels_list = []
    ids_list = []

    print(f"\nüéµ {desc} {len(df)} patients...")

    for idx, row in tqdm(df.iterrows(), total=len(df), desc=desc):
        patient_id = row['ID']
        patient_class = row['Class']

        patient_features = []

        # Process each audio type
        for audio_type in audio_types:
            audio_file = f"{patient_id}_{audio_type}.wav"
            audio_path = os.path.join(dataset_path, audio_type, audio_file)

            if os.path.exists(audio_path):
                # CHANGED: Load audio with NO padding (actual content only)
                audio, sr = load_audio_enhanced(
                    audio_path,
                    target_sr=config.SAMPLING_RATE,
                    max_length=config.MAX_AUDIO_LENGTH
                )

                # Extract Whisper features from actual audio
                whisper_features = extract_whisper_features_multilayer(audio, sr)

                # Extract statistical features if enabled
                if config.USE_STATISTICAL_FEATURES:
                    stat_features = extract_statistical_features(audio, sr)
                    combined_features = np.concatenate([whisper_features, stat_features])
                else:
                    combined_features = whisper_features

                patient_features.append(combined_features)
            else:
                # Zero features for missing files
                n_layers = len(config.LAYERS_TO_USE) if config.USE_MULTI_LAYER else 1
                n_pooling = len(config.POOLING_STRATEGIES) if config.USE_MULTI_POOLING else 1
                feature_dim = whisper_model.config.d_model * n_layers * n_pooling

                if config.USE_STATISTICAL_FEATURES:
                    feature_dim += 20  # Statistical features count

                patient_features.append(np.zeros(feature_dim))

        # Concatenate features from all audio types
        combined = np.concatenate(patient_features)
        features_list.append(combined)
        labels_list.append(patient_class)
        ids_list.append(patient_id)

    X = np.array(features_list)
    y = np.array(labels_list)
    ids = np.array(ids_list)

    return X, y, ids

# Extract training features
X_train, y_train, ids_train = extract_features_for_dataset(
    df_train,
    config.TRAINING_PATH,
    config.AUDIO_TYPES,
    desc="Training"
)

# Extract validation features
X_val, y_val, ids_val = extract_features_for_dataset(
    df_val,
    config.TRAINING_PATH,
    config.AUDIO_TYPES,
    desc="Validation"
)

print(f"\n‚úÖ Feature extraction complete!")
print(f"  Training shape: {X_train.shape}")
print(f"  Validation shape: {X_val.shape}")
print(f"  Feature dimension per patient: {X_train.shape[1]}")

# Clean data (handle any NaN/Inf)
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_val = np.nan_to_num(X_val, nan=0.0, posinf=0.0, neginf=0.0)

print(f"  Data cleaned: ‚úì")

# Save raw features
features_path = os.path.join(config.OUTPUT_PATH, 'features_raw_no_padding.npz')
np.savez(
    features_path,
    X_train=X_train, y_train=y_train, ids_train=ids_train,
    X_val=X_val, y_val=y_val, ids_val=ids_val
)
print(f"  üíæ Raw features saved: {features_path}")

# Clear GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()



[4/9] EXTRACTING FEATURES (NO PADDING - ACTUAL AUDIO ONLY)
Configuration:
  Layers: 4
  Pooling strategies: 4
  Statistical features: True
  Feature multiplication factor: 16x
  Max audio length: 17s (NO PADDING BEYOND ACTUAL CONTENT)

üéµ Training 219 patients...


Training:   0%|          | 0/219 [00:00<?, ?it/s]


üéµ Validation 53 patients...


Validation:   0%|          | 0/53 [00:00<?, ?it/s]


‚úÖ Feature extraction complete!
  Training shape: (219, 65696)
  Validation shape: (53, 65696)
  Feature dimension per patient: 65696
  Data cleaned: ‚úì
  üíæ Raw features saved: /content/drive/MyDrive/SAND_Project_Data/optimized_whisper_rf_v2/features_raw_no_padding.npz


In [56]:
# ============================================================================
# CELL 8: ADVANCED PREPROCESSING PIPELINE
# ============================================================================
print("\n" + "="*80)
print("[5/9] ADVANCED PREPROCESSING")
print("="*80)

# Step 1: Scaling
if config.USE_ROBUST_SCALING:
    print("üìä Applying RobustScaler (better for outliers)...")
    scaler = RobustScaler()
else:
    print("üìä Applying StandardScaler...")
    scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

print(f"  ‚úÖ Scaling complete")
print(f"     Train: mean={X_train_scaled.mean():.6f}, std={X_train_scaled.std():.6f}")
print(f"     Val:   mean={X_val_scaled.mean():.6f}, std={X_val_scaled.std():.6f}")

# Step 2: PCA (Dimensionality Reduction)
if config.USE_PCA:
    print(f"\nüìâ Applying PCA (variance threshold={config.PCA_VARIANCE})...")
    pca = PCA(n_components=config.PCA_VARIANCE, random_state=config.RANDOM_SEED)
    X_train_scaled = pca.fit_transform(X_train_scaled)
    X_val_scaled = pca.transform(X_val_scaled)

    explained_var = pca.explained_variance_ratio_.sum()
    print(f"  ‚úÖ PCA complete")
    print(f"     Original dimensions: {X_train.shape[1]}")
    print(f"     Reduced dimensions: {X_train_scaled.shape[1]}")
    print(f"     Explained variance: {explained_var*100:.2f}%")
    print(f"     Dimensionality reduction: {X_train.shape[1]/X_train_scaled.shape[1]:.1f}x")
else:
    pca = None
    print("  ‚Ñπ PCA disabled")

# Step 3: SMOTE (Handle Class Imbalance)
if config.USE_SMOTE:
    print(f"\n‚öñÔ∏è Applying SMOTE (strategy='{config.SMOTE_STRATEGY}')...")
    print(f"  Original class distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    for cls, count in zip(unique, counts):
        print(f"    Class {cls}: {count:3d}")

    try:
        smote = SMOTE(
            sampling_strategy=config.SMOTE_STRATEGY,
            k_neighbors=config.SMOTE_K_NEIGHBORS,
            random_state=config.RANDOM_SEED
        )
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

        print(f"\n  ‚úÖ SMOTE complete")
        print(f"     Samples: {len(y_train)} ‚Üí {len(y_train_resampled)}")
        print(f"  Resampled class distribution:")
        unique, counts = np.unique(y_train_resampled, return_counts=True)
        for cls, count in zip(unique, counts):
            print(f"    Class {cls}: {count:3d}")

    except Exception as e:
        print(f"  ‚ö† SMOTE failed: {e}")
        print(f"  Continuing without SMOTE...")
        X_train_resampled = X_train_scaled
        y_train_resampled = y_train
else:
    X_train_resampled = X_train_scaled
    y_train_resampled = y_train
    print("  ‚Ñπ SMOTE disabled")

print("\n‚úÖ Preprocessing pipeline complete!")




[5/9] ADVANCED PREPROCESSING
üìä Applying RobustScaler (better for outliers)...
  ‚úÖ Scaling complete
     Train: mean=0.008359, std=0.784949
     Val:   mean=0.007654, std=0.856875

üìâ Applying PCA (variance threshold=0.97)...
  ‚úÖ PCA complete
     Original dimensions: 65696
     Reduced dimensions: 190
     Explained variance: 97.00%
     Dimensionality reduction: 345.8x

‚öñÔ∏è Applying SMOTE (strategy='not majority')...
  Original class distribution:
    Class 0:   4
    Class 1:  22
    Class 2:  45
    Class 3:  62
    Class 4:  86

  ‚úÖ SMOTE complete
     Samples: 219 ‚Üí 430
  Resampled class distribution:
    Class 0:  86
    Class 1:  86
    Class 2:  86
    Class 3:  86
    Class 4:  86

‚úÖ Preprocessing pipeline complete!


In [57]:
# ============================================================================
# CELL 9: TRAIN OPTIMIZED RANDOM FOREST CLASSIFIER
# ============================================================================
print("\n" + "="*80)
print("[6/9] TRAINING OPTIMIZED RANDOM FOREST")
print("="*80)
print(f"Configuration:")
for key, value in config.RF_CONFIG.items():
    if key != 'n_jobs':  # Don't print n_jobs as it's -1
        print(f"  {key}: {value}")

print(f"\nüéØ Training Random Forest on {len(y_train_resampled)} samples...")

classifier = RandomForestClassifier(**config.RF_CONFIG)
classifier.fit(X_train_resampled, y_train_resampled)

print(f"‚úÖ Random Forest training complete!")
print(f"  Number of trees: {classifier.n_estimators}")
print(f"  Feature importance sum: {classifier.feature_importances_.sum():.4f}")



[6/9] TRAINING OPTIMIZED RANDOM FOREST
Configuration:
  n_estimators: 500
  max_depth: 20
  min_samples_split: 2
  min_samples_leaf: 1
  max_features: sqrt
  bootstrap: True
  class_weight: balanced
  random_state: 42

üéØ Training Random Forest on 430 samples...
‚úÖ Random Forest training complete!
  Number of trees: 500
  Feature importance sum: 1.0000


In [58]:
# ============================================================================
# CELL 10: CROSS-VALIDATION ON TRAINING SET
# ============================================================================
print("\n" + "="*80)
print("[7/9] CROSS-VALIDATION")
print("="*80)
print(f"Performing {config.N_FOLDS}-fold stratified cross-validation...")
print("‚è± This may take a few minutes...")

# Cross-validation on ORIGINAL training set (before SMOTE)
cv_classifier = RandomForestClassifier(**config.RF_CONFIG)
cv_scores = cross_val_score(
    cv_classifier,
    X_train_scaled,
    y_train,
    cv=config.N_FOLDS,
    scoring='f1_macro',
    n_jobs=-1
)

print(f"\n‚úÖ Cross-validation complete!")
print(f"  Fold scores: {[f'{s:.4f}' for s in cv_scores]}")
print(f"  Mean CV F1: {cv_scores.mean():.4f}")
print(f"  Std CV F1:  {cv_scores.std():.4f}")
print(f"  Min CV F1:  {cv_scores.min():.4f}")
print(f"  Max CV F1:  {cv_scores.max():.4f}")

# Detailed k-fold cross-validation metrics
print(f"\nüìä DETAILED {config.N_FOLDS}-FOLD CROSS-VALIDATION METRICS:")
print("="*50)

# Perform detailed cross-validation with multiple metrics
kfold = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.RANDOM_SEED)

cv_accuracy_scores = []
cv_f1_scores = []
cv_precision_scores = []
cv_recall_scores = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_scaled, y_train), 1):
    X_fold_train, X_fold_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

    # Train on fold
    fold_classifier = RandomForestClassifier(**config.RF_CONFIG)
    fold_classifier.fit(X_fold_train, y_fold_train)

    # Predict on validation fold
    y_fold_pred = fold_classifier.predict(X_fold_val)

    # Calculate metrics
    accuracy = accuracy_score(y_fold_val, y_fold_pred)
    f1 = f1_score(y_fold_val, y_fold_pred, average='macro')
    precision = precision_score(y_fold_val, y_fold_pred, average='macro', zero_division=0)
    recall = recall_score(y_fold_val, y_fold_pred, average='macro', zero_division=0)

    cv_accuracy_scores.append(accuracy)
    cv_f1_scores.append(f1)
    cv_precision_scores.append(precision)
    cv_recall_scores.append(recall)

    print(f"Fold {fold}:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  Samples:   Train={len(y_fold_train)}, Val={len(y_fold_val)}")
    print()

print("üìà CROSS-VALIDATION SUMMARY:")
print("="*50)
print(f"Accuracy:  {np.mean(cv_accuracy_scores):.4f} ¬± {np.std(cv_accuracy_scores):.4f}")
print(f"F1 Score:  {np.mean(cv_f1_scores):.4f} ¬± {np.std(cv_f1_scores):.4f}")
print(f"Precision: {np.mean(cv_precision_scores):.4f} ¬± {np.std(cv_precision_scores):.4f}")
print(f"Recall:    {np.mean(cv_recall_scores):.4f} ¬± {np.std(cv_recall_scores):.4f}")




[7/9] CROSS-VALIDATION
Performing 5-fold stratified cross-validation...
‚è± This may take a few minutes...

‚úÖ Cross-validation complete!
  Fold scores: ['0.1115', '0.1133', '0.1115', '0.1637', '0.1417']
  Mean CV F1: 0.1283
  Std CV F1:  0.0211
  Min CV F1:  0.1115
  Max CV F1:  0.1637

üìä DETAILED 5-FOLD CROSS-VALIDATION METRICS:
Fold 1:
  Accuracy:  0.4091
  F1 Score:  0.1161
  Precision: 0.0818
  Recall:    0.2000
  Samples:   Train=175, Val=44

Fold 2:
  Accuracy:  0.3864
  F1 Score:  0.1115
  Precision: 0.0773
  Recall:    0.2000
  Samples:   Train=175, Val=44

Fold 3:
  Accuracy:  0.3864
  F1 Score:  0.1115
  Precision: 0.0773
  Recall:    0.2000
  Samples:   Train=175, Val=44

Fold 4:
  Accuracy:  0.3864
  F1 Score:  0.1115
  Precision: 0.0773
  Recall:    0.2000
  Samples:   Train=175, Val=44

Fold 5:
  Accuracy:  0.3721
  F1 Score:  0.1404
  Precision: 0.1000
  Recall:    0.2353
  Samples:   Train=176, Val=43

üìà CROSS-VALIDATION SUMMARY:
Accuracy:  0.3881 ¬± 0.0119
F1 

In [59]:
# ============================================================================
# CELL 11: EVALUATE ON TRAINING AND VALIDATION SETS
# ============================================================================
print("\n" + "="*80)
print("[8/9] MODEL EVALUATION")
print("="*80)

# ============================================================================
# 1. TRAINING SET EVALUATION (Check for overfitting)
# ============================================================================
print("\n" + "="*80)
print("üìä TRAINING SET EVALUATION (Overfitting Check)")
print("="*80)

# Predict on training set
y_train_pred = classifier.predict(X_train_resampled)
y_train_pred_proba = classifier.predict_proba(X_train_resampled)

# Compute training metrics
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_f1_macro = f1_score(y_train_resampled, y_train_pred, average='macro')
train_f1_weighted = f1_score(y_train_resampled, y_train_pred, average='weighted')
train_precision = precision_score(y_train_resampled, y_train_pred, average='macro', zero_division=0)
train_recall = recall_score(y_train_resampled, y_train_pred, average='macro', zero_division=0)
train_kappa = cohen_kappa_score(y_train_resampled, y_train_pred)
train_f1_per_class = f1_score(y_train_resampled, y_train_pred, average=None, zero_division=0)

print(f"Training Accuracy:      {train_accuracy:.4f}")
print(f"Training F1 (Macro):    {train_f1_macro:.4f}")
print(f"Training F1 (Weighted): {train_f1_weighted:.4f}")
print(f"Training Precision:     {train_precision:.4f}")
print(f"Training Recall:        {train_recall:.4f}")
print(f"Training Kappa:         {train_kappa:.4f}")

# Training set classification report
print("\n" + "-"*80)
print("TRAINING CLASSIFICATION REPORT:")
print("-"*80)
available_classes_train = sorted(np.unique(y_train_resampled))
class_labels_train = [config.CLASS_NAMES[i] for i in available_classes_train]
print(classification_report(y_train_resampled, y_train_pred, target_names=class_labels_train, zero_division=0))

# Training confusion matrix
train_cm = confusion_matrix(y_train_resampled, y_train_pred)
print("\nTRAINING CONFUSION MATRIX:")
print(train_cm)

# ============================================================================
# 2. VALIDATION SET EVALUATION
# ============================================================================
print("\n" + "="*80)
print("üìä VALIDATION SET EVALUATION")
print("="*80)

# Predict on validation set
y_val_pred = classifier.predict(X_val_scaled)
y_val_pred_proba = classifier.predict_proba(X_val_scaled)

# Compute validation metrics
accuracy = accuracy_score(y_val, y_val_pred)
f1_macro = f1_score(y_val, y_val_pred, average='macro')
f1_weighted = f1_score(y_val, y_val_pred, average='weighted')
precision = precision_score(y_val, y_val_pred, average='macro', zero_division=0)
recall = recall_score(y_val, y_val_pred, average='macro', zero_division=0)
kappa = cohen_kappa_score(y_val, y_val_pred)
f1_per_class = f1_score(y_val, y_val_pred, average=None, zero_division=0)

# Display results
print("\n" + "="*80)
print("üéØ FINAL RESULTS - OPTIMIZED WHISPER + RANDOM FOREST (NO PADDING)")
print("="*80)
print(f"Validation Accuracy:      {accuracy:.4f}")
print(f"F1 Score (Macro):         {f1_macro:.4f} ")
print(f"F1 Score (Weighted):      {f1_weighted:.4f}")
print(f"Precision (Macro):        {precision:.4f}")
print(f"Recall (Macro):           {recall:.4f}")
print(f"Cohen's Kappa:            {kappa:.4f}")
print(f"CV F1 (Mean ¬± Std):       {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
print("="*80)

# ============================================================================
# 3. OVERFITTING ANALYSIS
# ============================================================================
print("\n" + "="*80)
print("üîç OVERFITTING ANALYSIS")
print("="*80)

# Calculate performance gaps
accuracy_gap = train_accuracy - accuracy
f1_gap = train_f1_macro - f1_macro
precision_gap = train_precision - precision
recall_gap = train_recall - recall

print(f"Performance Gaps (Training - Validation):")
print(f"  Accuracy:  {train_accuracy:.4f} - {accuracy:.4f} = {accuracy_gap:+.4f}")
print(f"  F1 Score:  {train_f1_macro:.4f} - {f1_macro:.4f} = {f1_gap:+.4f}")
print(f"  Precision: {train_precision:.4f} - {precision:.4f} = {precision_gap:+.4f}")
print(f"  Recall:    {train_recall:.4f} - {recall:.4f} = {recall_gap:+.4f}")

# Overfitting assessment
if f1_gap > 0.15:
    print(f"\n‚ö†Ô∏è SIGNIFICANT OVERFITTING DETECTED!")
    print(f"   F1 gap: {f1_gap:.4f} > 0.15")
elif f1_gap > 0.10:
    print(f"\nüìà MODERATE OVERFITTING DETECTED")
    print(f"   F1 gap: {f1_gap:.4f} > 0.10")
elif f1_gap > 0.05:
    print(f"\nüìä SLIGHT OVERFITTING DETECTED")
    print(f"   F1 gap: {f1_gap:.4f} > 0.05")
else:
    print(f"\n‚úÖ GOOD GENERALIZATION!")
    print(f"   F1 gap: {f1_gap:.4f} ‚â§ 0.05")

# ============================================================================
# 4. PER-CLASS PERFORMANCE (Validation)
# ============================================================================
print("\nüìä PER-CLASS PERFORMANCE (Validation):")
available_classes = sorted(np.unique(y_val))
for cls, f1_val in zip(available_classes, [f1_per_class[i] for i in available_classes]):
    label = config.CLASS_NAMES[cls]
    bar = '‚ñà' * int(f1_val * 40)
    print(f"  Class {cls} ({label:25s}): {f1_val:.4f} {bar}")

# Classification report
print("\n" + "-"*80)
print("CLASSIFICATION REPORT (Validation):")
print("-"*80)
class_labels = [config.CLASS_NAMES[i] for i in available_classes]
print(classification_report(y_val, y_val_pred, target_names=class_labels, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_val, y_val_pred)
print("\nCONFUSION MATRIX (Validation):")
print(cm)

# ============================================================================
# 5. ACHIEVEMENT STATUS
# ============================================================================
print("\n" + "="*80)
baseline_f1 = 0.4919
target_f1 = 0.70
improvement = f1_macro - baseline_f1

if f1_macro >= target_f1:
    print(f"üéâüéâüéâ TARGET ACHIEVED! üéâüéâüéâ")
    print(f"F1 Score: {f1_macro:.4f} >= {target_f1:.4f}")
    print(f"Improvement over baseline: +{improvement:.4f} (+{improvement/baseline_f1*100:.1f}%)")
elif f1_macro >= 0.65:
    print(f"üöÄ EXCELLENT PROGRESS!")
    print(f"F1 Score: {f1_macro:.4f}")
    print(f"Gap to target: -{target_f1 - f1_macro:.4f}")
    print(f"Improvement over baseline: +{improvement:.4f} (+{improvement/baseline_f1*100:.1f}%)")
else:
    print(f"üìà STRONG IMPROVEMENT!")
    print(f"F1 Score: {f1_macro:.4f}")
    print(f"Improvement over baseline: +{improvement:.4f} (+{improvement/baseline_f1*100:.1f}%)")
print("="*80)



[8/9] MODEL EVALUATION

üìä TRAINING SET EVALUATION (Overfitting Check)
Training Accuracy:      0.9977
Training F1 (Macro):    0.9977
Training F1 (Weighted): 0.9977
Training Precision:     0.9977
Training Recall:        0.9977
Training Kappa:         0.9971

--------------------------------------------------------------------------------
TRAINING CLASSIFICATION REPORT:
--------------------------------------------------------------------------------
                     precision    recall  f1-score   support

  Severe Dysarthria       1.00      1.00      1.00        86
Moderate Dysarthria       1.00      1.00      1.00        86
    Mild Dysarthria       1.00      1.00      1.00        86
No Dysarthria (ALS)       0.99      1.00      0.99        86
            Healthy       1.00      0.99      0.99        86

           accuracy                           1.00       430
          macro avg       1.00      1.00      1.00       430
       weighted avg       1.00      1.00      1.00     

In [60]:
# ============================================================================
# CELL 12: SAVE MODEL AND RESULTS
# ============================================================================
print("\n" + "="*80)
print("[9/9] SAVING MODEL AND RESULTS")
print("="*80)

# Save model
model_path = os.path.join(config.OUTPUT_PATH, 'random_forest_model_no_padding.joblib')
joblib.dump(classifier, model_path)
print(f"‚úÖ Model saved: {model_path}")

# Save scaler
scaler_path = os.path.join(config.OUTPUT_PATH, 'scaler_no_padding.joblib')
joblib.dump(scaler, scaler_path)
print(f"‚úÖ Scaler saved: {scaler_path}")

# Save PCA
if pca is not None:
    pca_path = os.path.join(config.OUTPUT_PATH, 'pca_no_padding.joblib')
    joblib.dump(pca, pca_path)
    print(f"‚úÖ PCA saved: {pca_path}")

# Save results
results = {
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model': 'Random Forest',
    'feature_extractor': 'Whisper (openai/whisper-base)',
    'audio_processing': 'No padding - actual content only (max 15s)',
    'validation_metrics': {
        'accuracy': float(accuracy),
        'f1_macro': float(f1_macro),
        'f1_weighted': float(f1_weighted),
        'precision': float(precision),
        'recall': float(recall),
        'kappa': float(kappa)
    },
    'cross_validation': {
        'mean_f1': float(cv_scores.mean()),
        'std_f1': float(cv_scores.std()),
        'fold_scores': [float(s) for s in cv_scores]
    },
    'training_metrics': {
        'accuracy': float(train_accuracy),
        'f1_macro': float(train_f1_macro),
        'precision': float(train_precision),
        'recall': float(train_recall)
    },
    'overfitting_gaps': {
        'accuracy_gap': float(accuracy_gap),
        'f1_gap': float(f1_gap),
        'precision_gap': float(precision_gap),
        'recall_gap': float(recall_gap)
    },
    'per_class_f1': {config.CLASS_NAMES[i]: float(f1_per_class[i]) for i in available_classes},
    'confusion_matrix': cm.tolist(),
    'config': {
        'max_audio_length_sec': config.MAX_AUDIO_LENGTH_SEC,
        'use_padding': config.USE_PADDING,
        'use_multi_layer': config.USE_MULTI_LAYER,
        'use_multi_pooling': config.USE_MULTI_POOLING,
        'use_statistical_features': config.USE_STATISTICAL_FEATURES,
        'use_pca': config.USE_PCA,
        'use_smote': config.USE_SMOTE,
        'rf_n_estimators': config.RF_CONFIG['n_estimators']
    }
}

results_path = os.path.join(config.OUTPUT_PATH, 'results_no_padding.json')
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)
print(f"‚úÖ Results saved: {results_path}")

# Save predictions
predictions_df = pd.DataFrame({
    'ID': ids_val,
    'True_Class': y_val,
    'Predicted_Class': y_val_pred,
    'True_Label': [config.CLASS_NAMES[c] for c in y_val],
    'Predicted_Label': [config.CLASS_NAMES[c] for c in y_val_pred],
    'Correct': y_val == y_val_pred
})

# Add prediction probabilities
for i in range(len(config.CLASS_NAMES)):
    predictions_df[f'Prob_Class_{i}'] = y_val_pred_proba[:, i]

predictions_path = os.path.join(config.OUTPUT_PATH, 'validation_predictions_no_padding.csv')
predictions_df.to_csv(predictions_path, index=False)
print(f"‚úÖ Predictions saved: {predictions_path}")

print("\n" + "="*80)
print("üéä ALL TASKS COMPLETED SUCCESSFULLY!")
print("="*80)
print(f"\nSummary:")
print(f"  Validation F1 Score: {f1_macro:.4f}")
print(f"  Validation Accuracy: {accuracy:.4f}")
print(f"  CV F1 Score: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
print(f"  Overfitting (F1 gap): {f1_gap:.4f}")
print(f"\nAll outputs saved to: {config.OUTPUT_PATH}")
print("="*80)


[9/9] SAVING MODEL AND RESULTS
‚úÖ Model saved: /content/drive/MyDrive/SAND_Project_Data/optimized_whisper_rf_v2/random_forest_model_no_padding.joblib
‚úÖ Scaler saved: /content/drive/MyDrive/SAND_Project_Data/optimized_whisper_rf_v2/scaler_no_padding.joblib
‚úÖ PCA saved: /content/drive/MyDrive/SAND_Project_Data/optimized_whisper_rf_v2/pca_no_padding.joblib
‚úÖ Results saved: /content/drive/MyDrive/SAND_Project_Data/optimized_whisper_rf_v2/results_no_padding.json
‚úÖ Predictions saved: /content/drive/MyDrive/SAND_Project_Data/optimized_whisper_rf_v2/validation_predictions_no_padding.csv

üéä ALL TASKS COMPLETED SUCCESSFULLY!

Summary:
  Validation F1 Score: 0.4834
  Validation Accuracy: 0.5660
  CV F1 Score: 0.1283 ¬± 0.0211
  Overfitting (F1 gap): 0.5142

All outputs saved to: /content/drive/MyDrive/SAND_Project_Data/optimized_whisper_rf_v2
