# Speech Fluency Analysis Model Training

**Purpose**: Analyze mock interview speech for fluency scoring

**Architecture**: wav2vec2 + GRU

**Outputs**:
- Fluency score (0-10)
- Hesitation count
- Filler word detection (not produced by the model defined in this notebook, which predicts only the two outputs above)

## 1. Setup & Dependencies

In [None]:
# Install dependencies (run once)
# !pip install torch torchaudio transformers librosa soundfile datasets pandas scipy scikit-learn

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
import numpy as np
import pandas as pd
import os
import random
from pathlib import Path

# Seed every random number generator used in this notebook so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 2. Data Loading & Preprocessing

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model

# Load wav2vec2 processor and model
# NOTE(review): Wav2Vec2Processor bundles a feature extractor and a tokenizer.
# Confirm this checkpoint ships tokenizer files; if it does not, load
# Wav2Vec2FeatureExtractor instead. `processor` is not used by any code
# visible in this notebook.
MODEL_NAME = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
wav2vec2 = Wav2Vec2Model.from_pretrained(MODEL_NAME)
# eval() only switches the module to inference mode; it does NOT freeze the
# weights. Freezing (requires_grad = False) is done in SpeechFluencyModel.
wav2vec2.eval()

print(f'Loaded {MODEL_NAME}')
print(f'Hidden size: {wav2vec2.config.hidden_size}')

In [None]:
class SpeechFluencyDataset(Dataset):
    """
    Dataset for speech fluency analysis.

    Expected CSV format:
    - audio_path: path to audio file (relative to ``audio_dir``)
    - fluency_score: 0-10 score
    - hesitation_count: number of hesitations (optional column, defaults to 0)

    Args:
        csv_path: Path to the label CSV described above.
        audio_dir: Directory that the ``audio_path`` entries are relative to.
        max_length_seconds: Every clip is truncated or zero-padded to this
            duration. May be fractional (e.g. ``7.5``).
        sample_rate: Target sample rate in Hz; clips recorded at another rate
            are resampled to it.

    Each item is a dict with:
        - ``waveform``: 1-D float tensor of exactly ``max_length`` samples
        - ``fluency_score``: scalar tensor, label divided by 10 (0-1 range)
        - ``hesitation_count``: scalar float tensor
    """

    def __init__(self, csv_path, audio_dir, max_length_seconds=30, sample_rate=16000):
        self.df = pd.read_csv(csv_path)
        self.audio_dir = Path(audio_dir)
        self.sample_rate = sample_rate
        # Must be an int: it is used as a slice bound and as a pad width, and
        # both reject floats (which a fractional duration would produce).
        self.max_length = int(max_length_seconds * sample_rate)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Load audio; torchaudio returns (channels, samples) plus the file's rate
        audio_path = self.audio_dir / row['audio_path']
        waveform, sr = torchaudio.load(audio_path)

        # Resample if needed
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            waveform = resampler(waveform)

        # Convert to mono by averaging channels
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Truncate or right-pad with zeros so every item has the same length
        waveform = waveform.squeeze(0)
        num_samples = waveform.shape[0]
        if num_samples > self.max_length:
            waveform = waveform[:self.max_length]
        else:
            waveform = torch.nn.functional.pad(waveform, (0, self.max_length - num_samples))

        # Labels
        fluency_score = float(row['fluency_score']) / 10.0  # Normalize to 0-1
        hesitation_count = float(row.get('hesitation_count', 0))

        return {
            'waveform': waveform,
            'fluency_score': torch.tensor(fluency_score, dtype=torch.float),
            'hesitation_count': torch.tensor(hesitation_count, dtype=torch.float)
        }

In [None]:
def generate_synthetic_dataset(n_samples=500, output_dir='../datasets/speech_samples'):
    """
    Write a toy dataset of noisy sine-wave clips plus a ``labels.csv`` file.

    Intended for demos only; real training should use labeled interview
    recordings. Each clip's tone frequency and noise level are derived from a
    randomly drawn fluency score, and the hesitation label is derived from the
    same score plus a small random offset.

    Args:
        n_samples: Number of clips to generate.
        output_dir: Directory that receives the ``.wav`` files and the CSV
            (created if missing).

    Returns:
        The label table as a pandas DataFrame (also saved as ``labels.csv``).
    """
    os.makedirs(output_dir, exist_ok=True)

    sample_rate = 16000
    records = []
    for index in range(n_samples):
        # Draw the labels first: score in [4, 10), hesitations tied to it
        score = np.random.uniform(4, 10)
        jitter = np.random.randint(0, 3)
        n_hesitations = int(max(0, 10 - score) + jitter)

        # Time axis for a clip lasting between 5 and 15 seconds
        seconds = np.random.uniform(5, 15)
        timeline = np.linspace(0, seconds, int(sample_rate * seconds))

        # Higher score -> higher tone and less additive noise
        tone_hz = 200 + score * 10
        signal = np.sin(2 * np.pi * tone_hz * timeline) * 0.3
        noise_scale = 0.05 + (10 - score) * 0.01
        signal += np.random.randn(len(signal)) * noise_scale

        # Persist the clip as a single-channel float waveform
        file_name = f'sample_{index:04d}.wav'
        clip = torch.tensor(signal).unsqueeze(0).float()
        torchaudio.save(os.path.join(output_dir, file_name), clip, sample_rate)

        records.append({
            'audio_path': file_name,
            'fluency_score': round(score, 1),
            'hesitation_count': n_hesitations
        })

    # Write the label table next to the audio files
    labels = pd.DataFrame(records)
    labels.to_csv(os.path.join(output_dir, 'labels.csv'), index=False)

    print(f'Generated {n_samples} samples')
    print(f'Saved to: {output_dir}')

    return labels

# Generate synthetic data (comment out if using real data)
# df = generate_synthetic_dataset()

## 3. Model Definition

In [None]:
class SpeechFluencyModel(nn.Module):
    """
    Speech Fluency Analysis Model

    Architecture:
    - wav2vec2 for audio feature extraction (frozen by default)
    - Bidirectional GRU for temporal modeling
    - MLP heads for fluency score and hesitation count

    Args:
        wav2vec2_model: Backbone exposing ``config.hidden_size`` and returning
            an object with ``last_hidden_state`` when called on raw waveforms.
        hidden_size: GRU hidden size per direction.
        num_gru_layers: Number of stacked GRU layers.
        dropout: Dropout used in the heads and, when there is more than one
            GRU layer, between GRU layers.
        freeze_wav2vec: If True, backbone parameters get
            ``requires_grad = False`` and the backbone is kept in eval mode.

    While every backbone parameter is frozen, the backbone stays in eval mode
    even after ``model.train()``. Otherwise ``train()`` would switch on the
    backbone's train-time behaviour (dropout and, for Hugging Face wav2vec2,
    feature masking), making the "fixed" features noisy during training and
    different from those used at validation time. Setting
    ``requires_grad = True`` on backbone parameters later restores normal
    train/eval switching on the next ``model.train()`` call.
    """

    def __init__(
        self,
        wav2vec2_model,
        hidden_size=256,
        num_gru_layers=2,
        dropout=0.3,
        freeze_wav2vec=True
    ):
        super().__init__()

        self.wav2vec2 = wav2vec2_model
        wav2vec_hidden = wav2vec2_model.config.hidden_size  # 768 for wav2vec2-base

        # Freeze wav2vec2 initially and pin it to inference behaviour
        if freeze_wav2vec:
            for param in self.wav2vec2.parameters():
                param.requires_grad = False
            self.wav2vec2.eval()

        # Bidirectional GRU
        self.gru = nn.GRU(
            input_size=wav2vec_hidden,
            hidden_size=hidden_size,
            num_layers=num_gru_layers,
            batch_first=True,
            bidirectional=True,
            # nn.GRU only applies dropout between layers, so it is meaningless
            # (and warns) for a single layer
            dropout=dropout if num_gru_layers > 1 else 0
        )

        gru_output_size = hidden_size * 2  # Bidirectional

        # Fluency score head (regression: 0-1)
        self.fluency_head = nn.Sequential(
            nn.Linear(gru_output_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

        # Hesitation count head (regression)
        self.hesitation_head = nn.Sequential(
            nn.Linear(gru_output_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.ReLU()  # Count is non-negative
        )

    def _wav2vec_is_frozen(self):
        """Return True when no backbone parameter requires gradients."""
        return not any(p.requires_grad for p in self.wav2vec2.parameters())

    def train(self, mode=True):
        """Set train/eval mode, keeping a frozen backbone in eval mode."""
        super().train(mode)
        if mode and self._wav2vec_is_frozen():
            self.wav2vec2.eval()
        return self

    def forward(self, waveforms):
        """
        Predict fluency and hesitation count for a batch of raw waveforms.

        Args:
            waveforms: Batch of audio passed straight to the backbone.

        Returns:
            Dict with ``fluency_score`` (sigmoid output, 0-1) and
            ``hesitation_count`` (ReLU output, >= 0), each of shape (batch,).
        """
        # Extract wav2vec2 features. Skip autograd bookkeeping whenever the
        # backbone cannot or should not receive gradients.
        if self._wav2vec_is_frozen() or not self.training:
            with torch.no_grad():
                features = self.wav2vec2(waveforms).last_hidden_state
        else:
            features = self.wav2vec2(waveforms).last_hidden_state
        # features: (batch, seq_len, wav2vec_hidden)

        # GRU
        gru_out, _ = self.gru(features)  # (batch, seq_len, hidden*2)

        # Global average pooling over time
        pooled = gru_out.mean(dim=1)  # (batch, hidden*2)

        # Predictions
        fluency_score = self.fluency_head(pooled).squeeze(-1)
        hesitation_count = self.hesitation_head(pooled).squeeze(-1)

        return {
            'fluency_score': fluency_score,
            'hesitation_count': hesitation_count
        }

In [None]:
# Build the network around the pretrained backbone and place it on the target device
model = SpeechFluencyModel(wav2vec2, hidden_size=256, num_gru_layers=2).to(device)

# Tally parameter counts: everything vs. only what the optimizer can update
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
total_params = sum(size for size, _ in param_sizes)

print(f'Trainable parameters: {trainable_params:,}')
print(f'Total parameters: {total_params:,}')

## 4. Training Configuration

In [None]:
# Training config
# Hyperparameters and paths shared by the training cells below.
CONFIG = dict(
    epochs=30,
    batch_size=8,
    learning_rate=1e-4,
    weight_decay=0.01,
    fluency_weight=0.7,  # Weight for fluency loss
    hesitation_weight=0.3,  # Weight for hesitation loss
    early_stopping_patience=5,
    checkpoint_dir='../models/speech_model',
)

# Make sure the checkpoint directory exists before training tries to write to it
os.makedirs(CONFIG['checkpoint_dir'], exist_ok=True)

In [None]:
# Loss and optimizer
mse_loss = nn.MSELoss()
# Only parameters that require gradients at this point are handed to AdamW.
# If the wav2vec2 backbone is unfrozen later, its parameters must be added
# (e.g. by rebuilding the optimizer).
optimizer = optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay']
)

# Learning rate scheduler: halve the LR after 3 epochs without improvement in
# the monitored metric. The `verbose` argument is intentionally not passed: it
# is deprecated in PyTorch and no longer accepted by recent releases. Read the
# current LR from `optimizer.param_groups[0]['lr']` if it needs to be logged.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3
)

## 5. Training Loop

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """
    Run one optimisation pass over ``dataloader``.

    The loss is the CONFIG-weighted sum of the MSE on the fluency score and
    the MSE on the hesitation count. Gradients are clipped to a global norm
    of 1.0 before each optimizer step.

    Returns:
        Dict with per-batch averages of ``loss``, ``fluency_mae`` and
        ``hesitation_mae``.
    """
    model.train()
    running = {'loss': 0.0, 'fluency_mae': 0.0, 'hesitation_mae': 0.0}

    for batch in dataloader:
        audio = batch['waveform'].to(device)
        target_fluency = batch['fluency_score'].to(device)
        target_hesitation = batch['hesitation_count'].to(device)

        optimizer.zero_grad()

        # Forward pass
        predictions = model(audio)
        pred_fluency = predictions['fluency_score']
        pred_hesitation = predictions['hesitation_count']

        # Weighted two-task objective
        loss_fluency = mse_loss(pred_fluency, target_fluency)
        loss_hesitation = mse_loss(pred_hesitation, target_hesitation)
        loss = (
            CONFIG['fluency_weight'] * loss_fluency +
            CONFIG['hesitation_weight'] * loss_hesitation
        )

        # Backward pass with gradient clipping
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Accumulate per-batch metrics
        running['loss'] += loss.item()
        running['fluency_mae'] += torch.abs(pred_fluency - target_fluency).mean().item()
        running['hesitation_mae'] += torch.abs(pred_hesitation - target_hesitation).mean().item()

    n_batches = len(dataloader)
    return {name: total / n_batches for name, total in running.items()}


def validate(model, dataloader, device):
    """
    Score ``model`` on ``dataloader`` without updating any weights.

    Uses the same CONFIG-weighted MSE objective as training, evaluated in
    eval mode with gradient tracking disabled.

    Returns:
        Dict with per-batch averages of ``loss``, ``fluency_mae`` and
        ``hesitation_mae``.
    """
    model.eval()
    running = {'loss': 0.0, 'fluency_mae': 0.0, 'hesitation_mae': 0.0}

    with torch.no_grad():
        for batch in dataloader:
            audio = batch['waveform'].to(device)
            target_fluency = batch['fluency_score'].to(device)
            target_hesitation = batch['hesitation_count'].to(device)

            predictions = model(audio)
            pred_fluency = predictions['fluency_score']
            pred_hesitation = predictions['hesitation_count']

            loss_fluency = mse_loss(pred_fluency, target_fluency)
            loss_hesitation = mse_loss(pred_hesitation, target_hesitation)
            loss = (
                CONFIG['fluency_weight'] * loss_fluency +
                CONFIG['hesitation_weight'] * loss_hesitation
            )

            running['loss'] += loss.item()
            running['fluency_mae'] += torch.abs(pred_fluency - target_fluency).mean().item()
            running['hesitation_mae'] += torch.abs(pred_hesitation - target_hesitation).mean().item()

    n_batches = len(dataloader)
    return {name: total / n_batches for name, total in running.items()}

In [None]:
# Main training function
def train_model(model, train_loader, val_loader, epochs, device):
    """
    Train ``model`` with checkpointing and early stopping.

    Relies on the notebook-level ``optimizer``, ``scheduler`` and ``CONFIG``
    objects. After each epoch the scheduler is stepped on the validation loss;
    whenever that loss improves, a checkpoint is written to
    ``CONFIG['checkpoint_dir']/best_model.pth``. Training stops once the
    validation loss has failed to improve for
    ``CONFIG['early_stopping_patience']`` consecutive epochs.

    Note: the model is left with the weights of the last epoch run; load
    ``best_model.pth`` to get the best-scoring weights.

    Args:
        model: Network to train.
        train_loader: Batches used for optimisation.
        val_loader: Batches used for model selection.
        epochs: Maximum number of epochs.
        device: Device the batches are moved to.

    Returns:
        Dict of per-epoch lists: ``train_loss``, ``val_loss``,
        ``fluency_mae`` and ``hesitation_mae`` (the MAEs are validation MAEs).
    """
    best_val_loss = float('inf')
    patience_counter = 0
    history = {'train_loss': [], 'val_loss': [], 'fluency_mae': [], 'hesitation_mae': []}
    checkpoint_path = os.path.join(CONFIG['checkpoint_dir'], 'best_model.pth')

    for epoch in range(epochs):
        # Train
        train_metrics = train_epoch(model, train_loader, optimizer, device)

        # Validate
        val_metrics = validate(model, val_loader, device)

        # Update scheduler
        scheduler.step(val_metrics['loss'])

        # Log (the LR is printed here so scheduler reductions stay visible)
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Train Loss: {train_metrics['loss']:.4f}")
        print(f"  Val Loss: {val_metrics['loss']:.4f}")
        print(f"  Fluency MAE: {val_metrics['fluency_mae']:.4f}")
        print(f"  Hesitation MAE: {val_metrics['hesitation_mae']:.4f}")
        print(f"  LR: {optimizer.param_groups[0]['lr']:.2e}")

        # History
        history['train_loss'].append(train_metrics['loss'])
        history['val_loss'].append(val_metrics['loss'])
        history['fluency_mae'].append(val_metrics['fluency_mae'])
        history['hesitation_mae'].append(val_metrics['hesitation_mae'])

        # Save best model
        if val_metrics['loss'] < best_val_loss:
            best_val_loss = val_metrics['loss']
            patience_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_metrics['loss']
            }, checkpoint_path)
            print("  ✓ Saved best model")
        else:
            patience_counter += 1
            if patience_counter >= CONFIG['early_stopping_patience']:
                print(f"Early stopping at epoch {epoch+1}")
                break

        print()

    return history

# Note: Uncomment to run training
# history = train_model(model, train_loader, val_loader, CONFIG['epochs'], device)

## 6. Evaluation

In [None]:
def evaluate_model(model, test_loader, device):
    """
    Comprehensive evaluation on test set.

    Runs the model in eval mode over every batch, rescales fluency
    predictions and labels from 0-1 back to 0-10, and reports MAE / RMSE for
    both targets plus the Pearson correlation for fluency.

    Args:
        model: Callable returning a dict with ``fluency_score`` and
            ``hesitation_count`` tensors for a batch of waveforms.
        test_loader: Iterable of batches with ``waveform``, ``fluency_score``
            and ``hesitation_count`` tensors.
        device: Device the waveforms are moved to.

    Returns:
        Dict of floats: ``fluency_mae``, ``fluency_rmse``,
        ``fluency_pearson`` (NaN when fewer than two samples are available),
        ``hesitation_mae`` and ``hesitation_rmse``.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()

    fluency_pred_parts = []
    fluency_true_parts = []
    hesitation_pred_parts = []
    hesitation_true_parts = []

    with torch.no_grad():
        for batch in test_loader:
            waveforms = batch['waveform'].to(device)
            outputs = model(waveforms)

            # Scale fluency back to 0-10 so the errors are in label units
            fluency_pred_parts.append(np.atleast_1d(outputs['fluency_score'].cpu().numpy() * 10))
            fluency_true_parts.append(np.atleast_1d(batch['fluency_score'].numpy() * 10))
            hesitation_pred_parts.append(np.atleast_1d(outputs['hesitation_count'].cpu().numpy()))
            hesitation_true_parts.append(np.atleast_1d(batch['hesitation_count'].numpy()))

    if not fluency_pred_parts:
        raise ValueError('test_loader produced no batches; nothing to evaluate')

    fluency_pred = np.concatenate(fluency_pred_parts).astype(np.float64)
    fluency_true = np.concatenate(fluency_true_parts).astype(np.float64)
    hesitation_pred = np.concatenate(hesitation_pred_parts).astype(np.float64)
    hesitation_true = np.concatenate(hesitation_true_parts).astype(np.float64)

    # MAE / RMSE are computed with NumPy directly, so scikit-learn is not needed
    fluency_error = fluency_true - fluency_pred
    hesitation_error = hesitation_true - hesitation_pred

    fluency_mae = float(np.mean(np.abs(fluency_error)))
    fluency_rmse = float(np.sqrt(np.mean(fluency_error ** 2)))
    hesitation_mae = float(np.mean(np.abs(hesitation_error)))
    hesitation_rmse = float(np.sqrt(np.mean(hesitation_error ** 2)))

    # Pearson correlation is undefined for fewer than two points
    if fluency_true.size >= 2:
        from scipy.stats import pearsonr
        fluency_pearson, _ = pearsonr(fluency_true, fluency_pred)
        fluency_pearson = float(fluency_pearson)
    else:
        fluency_pearson = float('nan')

    print("=" * 50)
    print("EVALUATION RESULTS")
    print("=" * 50)
    print(f"\nFluency Score (0-10):")
    print(f"  MAE: {fluency_mae:.3f}")
    print(f"  RMSE: {fluency_rmse:.3f}")
    print(f"  Pearson r: {fluency_pearson:.3f}")
    print(f"\nHesitation Count:")
    print(f"  MAE: {hesitation_mae:.3f}")
    print(f"  RMSE: {hesitation_rmse:.3f}")

    return {
        'fluency_mae': fluency_mae,
        'fluency_rmse': fluency_rmse,
        'fluency_pearson': fluency_pearson,
        'hesitation_mae': hesitation_mae,
        'hesitation_rmse': hesitation_rmse
    }

## 7. Model Export

In [None]:
def export_model(model, output_dir='../models/speech_model'):
    """
    Export model for production use.

    Writes two files into ``output_dir`` (created if missing):
    - ``speech_fluency_model.pth``: the model's ``state_dict``
    - ``config.json``: the settings needed to rebuild the model and
      preprocess audio

    The GRU settings are read from ``model.gru`` when that attribute exists,
    so the exported config matches the exported weights even if the model was
    built with non-default sizes. The notebook defaults (256 / 2) are used as
    a fallback.

    Args:
        model: Trained model to export.
        output_dir: Destination directory.
    """
    import json

    os.makedirs(output_dir, exist_ok=True)

    # Save PyTorch model
    torch.save(model.state_dict(), os.path.join(output_dir, 'speech_fluency_model.pth'))

    # Save model config
    gru = getattr(model, 'gru', None)
    config = {
        'wav2vec2_model': MODEL_NAME,
        'hidden_size': gru.hidden_size if gru is not None else 256,
        'num_gru_layers': gru.num_layers if gru is not None else 2,
        'sample_rate': 16000,
        'max_length_seconds': 30
    }

    with open(os.path.join(output_dir, 'config.json'), 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

    print(f'Model exported to: {output_dir}')

# export_model(model)

## 8. Inference Demo

In [None]:
def predict_fluency(model, audio_path, device, sample_rate=16000, max_length_seconds=30):
    """
    Predict fluency for a single audio file.

    The clip gets the same preprocessing as SpeechFluencyDataset: resample,
    downmix to mono, then truncate or zero-pad to a fixed length. Matching
    the training-time length matters because the model mean-pools over all
    time steps, padding included. Padding also prevents very short clips from
    being too short for the backbone.

    Args:
        model: Trained model returning ``fluency_score`` (0-1) and
            ``hesitation_count``.
        audio_path: Path to the audio file.
        device: Device to run inference on.
        sample_rate: Sample rate in Hz the model expects.
        max_length_seconds: Fixed clip duration to truncate/pad to. Pass
            ``None`` to feed the clip at its natural length.

    Returns:
        Tuple ``(fluency_score, hesitation_count)`` where the score is scaled
        to 0-10 and the count is rounded to the nearest integer.
    """
    model.eval()

    # Load and preprocess audio
    waveform, sr = torchaudio.load(audio_path)

    # Resample to the model's rate
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Convert to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    waveform = waveform.squeeze(0)

    # Truncate or pad exactly as the training dataset does
    if max_length_seconds is not None:
        max_length = int(max_length_seconds * sample_rate)
        num_samples = waveform.shape[0]
        if num_samples > max_length:
            waveform = waveform[:max_length]
        else:
            waveform = torch.nn.functional.pad(waveform, (0, max_length - num_samples))

    waveform = waveform.to(device)

    with torch.no_grad():
        outputs = model(waveform.unsqueeze(0))
        fluency_score = outputs['fluency_score'].item() * 10  # Scale to 0-10
        hesitation_count = int(round(outputs['hesitation_count'].item()))

    print(f"Audio: {audio_path}")
    print(f"Fluency Score: {fluency_score:.1f}/10")
    print(f"Estimated Hesitations: {hesitation_count}")

    return fluency_score, hesitation_count

# Demo: predict_fluency(model, 'path/to/audio.wav', device)