# Frame-Level Speech Recognition Training

This notebook demonstrates how to train the MLP model for frame-level speech recognition.

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from speech_recognition.models.mlp import SpeechMLP
from speech_recognition.data.dataset import AudioDataset, AudioTestDataset
from speech_recognition.utils.training import (
    train_epoch, validate, predict,
    create_optimizer, create_scheduler,
    setup_wandb, log_metrics
)

## Configuration

In [None]:
# Hyperparameters and runtime settings read by the cells below.
# Fall back to the CPU when torch reports no usable CUDA device.
use_cuda = torch.cuda.is_available()

config = dict(
    data_root='/path/to/data',  # Update this
    context=30,
    batch_size=256,
    num_epochs=25,
    learning_rate=1e-3,
    dropout_rate=0.05,
    device='cuda' if use_cuda else 'cpu',
)

# Hand the full configuration to the wandb setup helper.
setup_wandb(config)

## Data Loading

In [None]:
# Create datasets.
# The training set does not pass `partition`, so it uses whatever default
# AudioDataset defines (not visible here); augmentation is enabled only for it.
train_data = AudioDataset(
    root=config['data_root'],
    context=config['context'],
    augment=True
)

# Validation reads the 'dev-clean' partition without augmentation.
val_data = AudioDataset(
    root=config['data_root'],
    partition='dev-clean',
    context=config['context'],
    augment=False
)

test_data = AudioTestDataset(
    root=config['data_root'],
    context=config['context']
)

# Pinned host memory only speeds up host-to-GPU copies. On a CPU-only run it
# buys nothing and makes DataLoader emit a warning, so follow the configured
# device instead of hard-coding True.
pin_memory = str(config['device']).startswith('cuda')

# Create dataloaders. Only the training loader shuffles; validation and test
# keep dataset order so results stay aligned with their inputs.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=8,
    pin_memory=pin_memory
)

val_loader = DataLoader(
    dataset=val_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=4,
    pin_memory=pin_memory
)

test_loader = DataLoader(
    dataset=test_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=4,
    pin_memory=pin_memory
)

## Model Setup

In [None]:
# The flattened input width is (2 * context + 1) * 28.
# NOTE(review): presumably `context` frames on each side of the target frame
# with 28 features per frame -- confirm against AudioDataset.
window_frames = 2 * config['context'] + 1
input_size = window_frames * 28

# One output unit per entry in the training set's phoneme list.
output_size = len(train_data.phonemes)

# Build the network, then move it to the configured device.
model = SpeechMLP(
    input_size=input_size,
    output_size=output_size,
    dropout_rate=config['dropout_rate'],
)
model = model.to(config['device'])

# Loss, optimizer and a cosine schedule spanning the whole training run.
criterion = nn.CrossEntropyLoss()
optimizer = create_optimizer(model, lr=config['learning_rate'])
scheduler = create_scheduler(optimizer, mode='cosine', T_max=config['num_epochs'])

## Training Loop

In [None]:
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; the prediction cell loads 'best_model.pt' unconditionally.
best_val_acc = float('-inf')

for epoch in range(config['num_epochs']):
    print(f"\nEpoch {epoch+1}/{config['num_epochs']}")

    # Training
    train_loss, train_acc = train_epoch(
        model, train_loader, criterion, optimizer, config['device']
    )

    # Validation
    val_loss, val_acc = validate(
        model, val_loader, criterion, config['device']
    )

    # Read the learning rate BEFORE stepping the scheduler, so the logged
    # value is the rate this epoch actually trained with rather than the
    # next epoch's rate.
    current_lr = optimizer.param_groups[0]['lr']

    # Update learning rate. ReduceLROnPlateau needs the monitored metric;
    # every other scheduler steps without arguments.
    if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(val_loss)
    else:
        scheduler.step()

    # Log metrics
    log_metrics({
        'train_loss': train_loss,
        'train_acc': train_acc,
        'val_loss': val_loss,
        'val_acc': val_acc,
        'learning_rate': current_lr
    }, epoch)

    # Save best model (weights only) whenever validation accuracy improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"New best validation accuracy: {val_acc:.2f}%")

## Generate Predictions

In [None]:
import numpy as np

# Load best model. map_location remaps the stored tensors onto the configured
# device, so a checkpoint written during a CUDA run still loads on a CPU-only
# machine.
best_state = torch.load('best_model.pt', map_location=config['device'])
model.load_state_dict(best_state)

# Generate predictions
predictions = predict(model, test_loader, config['device'])

# Save predictions
np.save('predictions.npy', predictions)