In [2]:
# Cell 1: Install dependencies and mount drive
!pip install transformers torchaudio pandas openpyxl tqdm matplotlib scikit-learn
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Cell 2: Import libraries and setup paths
import pandas as pd
import os
import torch
import torchaudio
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from torch.optim import AdamW
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Configure paths
BASE_PATH = '/content/drive/MyDrive/SAND_Project_Data'
TRAINING_PATH = os.path.join(BASE_PATH, 'training')
EXCEL_PATH = os.path.join(BASE_PATH, 'sand_task_1.xlsx')
OUTPUT_PATH = os.path.join(BASE_PATH, 'wav2vec2_finetuned')

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Audio configuration
AUDIO_TYPES = ["phonationA", "phonationE", "phonationI", "phonationO",
               "phonationU", "rhythmKA", "rhythmPA", "rhythmTA"]
SAMPLE_RATE = 16000
AUDIO_DURATION = 5
TARGET_LENGTH = SAMPLE_RATE * AUDIO_DURATION

# Training configuration
BATCH_SIZE = 8  # Reduced for CPU training
EPOCHS = 5
LEARNING_RATE = 5e-5

print(f"Base Path: {BASE_PATH}")
print(f"Training Path: {TRAINING_PATH}")
print(f"Excel Path: {EXCEL_PATH}")
print(f"Output Path: {OUTPUT_PATH}")

# Cell 3: Load and prepare data
print("Loading data from Excel...")
df = pd.read_excel(EXCEL_PATH)
print(f"Loaded {len(df)} subjects from Excel")
print(f"\nClass distribution:")
print(df['Class'].value_counts().sort_index())

# Create audio entries for each subject and audio type
audio_entries = []
missing_files = []

for _, row in df.iterrows():
    subj_id = row['ID']
    label = int(row['Class'])

    for typ in AUDIO_TYPES:
        audio_path = os.path.join(TRAINING_PATH, typ, f"{subj_id}_{typ}.wav")

        # Check if file exists
        if os.path.exists(audio_path):
            entry = {
                "ID": subj_id,
                "Category": typ,
                "audio_path": audio_path,
                "label": label
            }
            audio_entries.append(entry)
        else:
            missing_files.append(audio_path)

audio_df = pd.DataFrame(audio_entries)
print(f"\nTotal audio samples: {len(audio_df)}")
print(f"Missing files: {len(missing_files)}")

if missing_files and len(missing_files) < 20:
    print("\nFirst few missing files:")
    for f in missing_files[:5]:
        print(f"  - {f}")

Base Path: /content/drive/MyDrive/SAND_Project_Data
Training Path: /content/drive/MyDrive/SAND_Project_Data/training
Excel Path: /content/drive/MyDrive/SAND_Project_Data/sand_task_1.xlsx
Output Path: /content/drive/MyDrive/SAND_Project_Data/wav2vec2_finetuned
Loading data from Excel...
Loaded 272 subjects from Excel

Class distribution:
Class
1      6
2     26
3     57
4     76
5    107
Name: count, dtype: int64

Total audio samples: 2176
Missing files: 0


In [4]:
# Cell 4: Audio preprocessing function
def load_and_preprocess_wav(path, target_length=TARGET_LENGTH):
    """Load and preprocess audio file"""
    try:
        waveform, sr = torchaudio.load(path)

        # Resample if necessary
        if sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
            waveform = resampler(waveform)

        # Convert to mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        waveform = waveform.squeeze(0)

        # Pad or truncate
        if waveform.shape[0] > target_length:
            waveform = waveform[:target_length]
        elif waveform.shape[0] < target_length:
            pad_len = target_length - waveform.shape[0]
            waveform = torch.nn.functional.pad(waveform, (0, pad_len))

        # Normalize
        waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-8)

        return waveform
    except Exception as e:
        print(f"Error loading {path}: {e}")
        return torch.zeros(target_length)

# Cell 5: Dataset class
class SANDPhonationRhythmDataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        audio_path = row['audio_path']
        label = int(row['label']) - 1  # Convert to 0-indexed
        waveform = load_and_preprocess_wav(audio_path)
        return waveform, label

# Cell 6: Split data and create dataloaders
print("\nSplitting data into train and validation sets...")
train_df, val_df = train_test_split(
    audio_df,
    test_size=0.2,
    stratify=audio_df['label'],
    random_state=42
)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

train_dataset = SANDPhonationRhythmDataset(train_df)
val_dataset = SANDPhonationRhythmDataset(val_df)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=False
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=False
)

# Cell 7: Compute class weights
print("\nComputing class weights for balanced training...")
num_classes = len(np.unique(audio_df['label']))
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(1, num_classes + 1),
    y=audio_df['label']
)

# Convert to 0-indexed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

print(f"Using device: {device}")
print(f"Number of classes: {num_classes}")
print(f"Class weights: {class_weights}")

# Cell 8: Load model and processor
print("\nLoading Wav2Vec2 model and processor...")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_classes,
    problem_type="single_label_classification"
).to(device)

# Freeze backbone for faster training
print("Freezing Wav2Vec2 backbone (only training classifier head)...")
for param in model.wav2vec2.parameters():
    param.requires_grad = False

# Count trainable parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Cell 9: Training function
def train_epoch(model, loader, optimizer, device, processor, class_weights):
    model.train()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    pbar = tqdm(loader, desc="Training")
    for batch_waveforms, batch_labels in pbar:
        optimizer.zero_grad()

        # Process audio
        inputs = processor(
            batch_waveforms.numpy(),
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            padding=True
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        batch_labels = batch_labels.to(device)

        # Forward pass
        outputs = model(**inputs)
        loss = torch.nn.functional.cross_entropy(
            outputs.logits,
            batch_labels,
            weight=class_weights
        )

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = outputs.logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch_labels.cpu().numpy())

        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_loss = total_loss / len(loader)
    f1 = f1_score(all_labels, all_preds, average='macro')

    return avg_loss, f1

# Cell 10: Validation function
def validate(model, loader, device, processor, class_weights):
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        pbar = tqdm(loader, desc="Validating")
        for batch_waveforms, batch_labels in pbar:
            # Process audio
            inputs = processor(
                batch_waveforms.numpy(),
                sampling_rate=SAMPLE_RATE,
                return_tensors="pt",
                padding=True
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            batch_labels = batch_labels.to(device)

            # Forward pass
            outputs = model(**inputs)
            loss = torch.nn.functional.cross_entropy(
                outputs.logits,
                batch_labels,
                weight=class_weights
            )

            total_loss += loss.item()
            preds = outputs.logits.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch_labels.cpu().numpy())

            pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_loss = total_loss / len(loader)
    f1 = f1_score(all_labels, all_preds, average='macro')

    return avg_loss, f1, all_preds, all_labels

# Cell 11: Training loop
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
best_val_f1 = 0.0
history = {
    'train_loss': [], 'train_f1': [],
    'val_loss': [], 'val_f1': []
}

print("\n" + "="*50)
print("Starting Training")
print("="*50)

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 50)

    # Train
    train_loss, train_f1 = train_epoch(
        model, train_loader, optimizer, device, processor, class_weights_tensor
    )

    # Validate
    val_loss, val_f1, val_preds, val_labels = validate(
        model, val_loader, device, processor, class_weights_tensor
    )

    # Save history
    history['train_loss'].append(train_loss)
    history['train_f1'].append(train_f1)
    history['val_loss'].append(val_loss)
    history['val_f1'].append(val_f1)

    print(f"\nResults:")
    print(f"  Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f}")
    print(f"  Val Loss:   {val_loss:.4f} | Val F1:   {val_f1:.4f}")

    # Save best model
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        model_path = os.path.join(OUTPUT_PATH, 'best_model.pth')
        torch.save(model.state_dict(), model_path)
        print(f"  ✓ Best model saved! (F1: {best_val_f1:.4f})")

print("\n" + "="*50)
print(f"Training Complete! Best Val F1: {best_val_f1:.4f}")
print("="*50)

# Cell 12: Final evaluation and report
print("\n" + "="*50)
print("Final Evaluation on Validation Set")
print("="*50)

# Load best model
model.load_state_dict(torch.load(os.path.join(OUTPUT_PATH, 'best_model.pth')))
_, final_f1, final_preds, final_labels = validate(
    model, val_loader, device, processor, class_weights_tensor
)

# Classification report
class_names = [f"Class {i+1}" for i in range(num_classes)]
print("\nClassification Report:")
print(classification_report(
    final_labels,
    final_preds,
    target_names=class_names,
    digits=4
))

# Confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(final_labels, final_preds)
print(cm)

# Save results
results = {
    'best_val_f1': best_val_f1,
    'final_f1': final_f1,
    'history': history,
    'classification_report': classification_report(
        final_labels, final_preds, target_names=class_names, output_dict=True
    ),
    'confusion_matrix': cm.tolist()
}

import json
results_path = os.path.join(OUTPUT_PATH, 'training_results.json')
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✓ Results saved to: {results_path}")
print(f"✓ Model saved to: {OUTPUT_PATH}")


Splitting data into train and validation sets...
Training samples: 1740
Validation samples: 436

Computing class weights for balanced training...
Using device: cuda
Number of classes: 5
Class weights: [9.06666667 2.09230769 0.95438596 0.71578947 0.50841121]

Loading Wav2Vec2 model and processor...


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing Wav2Vec2 backbone (only training classifier head)...
Total parameters: 94,569,861
Trainable parameters: 198,149

Starting Training

Epoch 1/5
--------------------------------------------------


Training:   0%|          | 0/218 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Validating:   0%|          | 0/55 [00:00<?, ?it/s]


Results:
  Train Loss: 1.5578 | Train F1: 0.2099
  Val Loss:   1.5471 | Val F1:   0.1952
  ✓ Best model saved! (F1: 0.1952)

Epoch 2/5
--------------------------------------------------


Training:   0%|          | 0/218 [00:00<?, ?it/s]

Validating:   0%|          | 0/55 [00:00<?, ?it/s]


Results:
  Train Loss: 1.4892 | Train F1: 0.2789
  Val Loss:   1.5058 | Val F1:   0.2557
  ✓ Best model saved! (F1: 0.2557)

Epoch 3/5
--------------------------------------------------


Training:   0%|          | 0/218 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa33d7649a0>
Exception ignored in: Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa33d7649a0>self._shutdown_workers()

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
        if w.is_alive():
self._shutdown_workers() 
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
       if w.is_alive(): 
    ^ ^   ^^ ^^^^^^^^^^^^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
  ^  ^^  
   File "/usr/lib

Validating:   0%|          | 0/55 [00:00<?, ?it/s]


Results:
  Train Loss: 1.4350 | Train F1: 0.3174
  Val Loss:   1.4692 | Val F1:   0.2605
  ✓ Best model saved! (F1: 0.2605)

Epoch 4/5
--------------------------------------------------


Training:   0%|          | 0/218 [00:00<?, ?it/s]

Validating:   0%|          | 0/55 [00:00<?, ?it/s]


Results:
  Train Loss: 1.3833 | Train F1: 0.3536
  Val Loss:   1.4359 | Val F1:   0.3413
  ✓ Best model saved! (F1: 0.3413)

Epoch 5/5
--------------------------------------------------


Training:   0%|          | 0/218 [00:00<?, ?it/s]

Validating:   0%|          | 0/55 [00:00<?, ?it/s]


Results:
  Train Loss: 1.3574 | Train F1: 0.3688
  Val Loss:   1.4080 | Val F1:   0.3211

Training Complete! Best Val F1: 0.3413

Final Evaluation on Validation Set


Validating:   0%|          | 0/55 [00:00<?, ?it/s]


Classification Report:
              precision    recall  f1-score   support

     Class 1     0.5000    0.2000    0.2857        10
     Class 2     0.2235    0.4524    0.2992        42
     Class 3     0.2704    0.4725    0.3440        91
     Class 4     0.4923    0.2623    0.3422       122
     Class 5     0.5203    0.3743    0.4354       171

    accuracy                         0.3670       436
   macro avg     0.4013    0.3523    0.3413       436
weighted avg     0.4313    0.3670    0.3737       436


Confusion Matrix:
[[ 2  6  1  0  1]
 [ 2 19 16  0  5]
 [ 0 20 43  9 19]
 [ 0 18 38 32 34]
 [ 0 22 61 24 64]]

✓ Results saved to: /content/drive/MyDrive/SAND_Project_Data/wav2vec2_finetuned/training_results.json
✓ Model saved to: /content/drive/MyDrive/SAND_Project_Data/wav2vec2_finetuned


In [5]:
# Cell 13: Evaluation on Training Data
print("\n" + "="*50)
print("Evaluation on Training Data")
print("="*50)

# Evaluate on training data
train_loss, train_f1, train_preds, train_labels = validate(
    model, train_loader, device, processor, class_weights_tensor
)

# Classification report for training data
print("\nTraining Data Classification Report:")
print(classification_report(
    train_labels,
    train_preds,
    target_names=class_names,
    digits=4
))

# Confusion matrix for training data
print("\nTraining Data Confusion Matrix:")
train_cm = confusion_matrix(train_labels, train_preds)
print(train_cm)

# Compare training vs validation performance
print("\n" + "="*50)
print("Performance Comparison")
print("="*50)
print(f"Training F1:   {train_f1:.4f}")
print(f"Validation F1: {final_f1:.4f}")
print(f"Difference:    {abs(train_f1 - final_f1):.4f}")

# Check for overfitting
if train_f1 - final_f1 > 0.1:  # Threshold for significant overfitting
    print("⚠️  Warning: Potential overfitting detected (large gap between train and val performance)")
elif train_f1 - final_f1 > 0.05:
    print("ℹ️  Moderate gap between train and val performance")
else:
    print("✓ Good generalization (small gap between train and val performance)")

# Update results with training data evaluation
results['train_f1'] = train_f1
results['train_classification_report'] = classification_report(
    train_labels, train_preds, target_names=class_names, output_dict=True
)
results['train_confusion_matrix'] = train_cm.tolist()
results['performance_gap'] = abs(train_f1 - final_f1)

# Save updated results
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✓ Training evaluation completed and saved to results!")


Evaluation on Training Data


Validating:   0%|          | 0/218 [00:00<?, ?it/s]


Training Data Classification Report:
              precision    recall  f1-score   support

     Class 1     0.2593    0.1842    0.2154        38
     Class 2     0.2582    0.5241    0.3459       166
     Class 3     0.2925    0.4904    0.3664       365
     Class 4     0.4065    0.2819    0.3329       486
     Class 5     0.5948    0.3708    0.4568       685

    accuracy                         0.3816      1740
   macro avg     0.3623    0.3703    0.3435      1740
weighted avg     0.4394    0.3816    0.3874      1740


Training Data Confusion Matrix:
[[  7  29   1   1   0]
 [ 14  87  46   6  13]
 [  1  78 179  62  45]
 [  0  64 170 137 115]
 [  5  79 216 131 254]]

Performance Comparison
Training F1:   0.3435
Validation F1: 0.3413
Difference:    0.0022
✓ Good generalization (small gap between train and val performance)

✓ Training evaluation completed and saved to results!
