# Fluency Analysis - Filler Word Detection

This notebook explores the PodcastFillers dataset, prototypes audio feature extraction with PyTorch/torchaudio, and builds rule-based filler word detection and fluency scoring utilities.

## Goals
1. Load and explore the PodcastFillers dataset
2. Analyze filler word patterns (um, uh, like, you know)
3. Calculate baseline WPM distributions
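4. Prototype MFCC feature extraction with PyTorch/torchaudio and a simple rule-based filler detector (training a neural detector is listed under Next Steps)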
5. Export utility functions for backend integration

In [None]:
# Install dependencies if needed (%pip installs into the running kernel's environment)
# %pip install torch torchaudio librosa numpy pandas matplotlib seaborn datasets

In [None]:
import os
import re
import sys
from pathlib import Path

# The project root is taken to be three directory levels above the current
# working directory; it is placed first on sys.path so it is searched first
# when importing.
PROJECT_ROOT = Path.cwd().parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
# Both calls change global plotting defaults for every figure created later in
# the notebook. The matplotlib style sheet is applied first and the seaborn
# palette second; the order is left untouched because the second call may
# override settings made by the first.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Echo the resolved root so a wrong working directory is easy to spot.
print(f"Project root: {PROJECT_ROOT}")

## 1. Load PodcastFillers Dataset

The dataset contains podcast audio clips labeled with filler words (um, uh) and other events.

In [None]:
from datasets import load_dataset, Audio

# Use the copy under the project tree when it exists; otherwise fall back to
# the dataset identifier on the HuggingFace Hub.
DATASET_PATH = PROJECT_ROOT.joinpath("ml", "softskills", "datasets", "fluency", "podcast-fillers-hf")

has_local_copy = DATASET_PATH.exists()
dataset_source = str(DATASET_PATH) if has_local_copy else "ylacombe/podcast_fillers_by_license"
print(f"Loading from local: {DATASET_PATH}" if has_local_copy else "Loading from HuggingFace...")
ds = load_dataset(dataset_source, split="train")

# Quick sanity summary of what was loaded.
print(f"Dataset size: {len(ds)} samples", f"Features: {ds.features}", sep="\n")

In [None]:
# Peek at the first record: list its fields and give a short, type-dependent
# description of each value (dict -> its keys, array -> its shape, anything
# else -> type name plus the first 100 characters of its string form).
sample = ds[0]
print("Sample keys:", sample.keys())

for field, content in sample.items():
    if isinstance(content, dict):
        description = f"{type(content)} - keys: {content.keys()}"
    elif isinstance(content, np.ndarray):
        description = f"array shape {content.shape}"
    else:
        description = f"{type(content).__name__} = {str(content)[:100]}"
    print(f"  {field}: {description}")

## 2. Filler Word Distribution Analysis

In [None]:
# Count filler types
# Fallback vocabulary for samples that only carry a transcript. Each entry is
# compiled with word boundaries so short fillers are matched as whole words
# only: a plain substring test would count 'um' inside "number" and 'like'
# inside "unlikely".
text_filler_patterns = {
    filler: re.compile(rf"\b{re.escape(filler)}\b")
    for filler in ['um', 'uh', 'like', 'you know', 'basically', 'actually']
}

# Maps a label (or fallback filler word) to the number of samples it occurs in.
filler_types = {}

# Only the first 1000 samples are inspected to keep the cell fast.
for i in range(min(1000, len(ds))):
    sample = ds[i]
    if 'label' in sample:
        # Labelled sample: tally the label as-is.
        label = sample['label']
        filler_types[label] = filler_types.get(label, 0) + 1
    elif 'text' in sample:
        # Transcript-only sample: each filler is counted at most once per
        # sample, no matter how often it occurs in the text.
        text = sample['text'].lower()
        for filler, pattern in text_filler_patterns.items():
            if pattern.search(text):
                filler_types[filler] = filler_types.get(filler, 0) + 1

print("Filler word distribution:")
# Most frequent first; ties keep their first-seen order.
for filler, count in sorted(filler_types.items(), key=lambda x: -x[1]):
    print(f"  {filler}: {count}")

In [None]:
# Bar chart of the tallies gathered above; skipped entirely when nothing was
# counted. The figure is also written next to the notebooks as a PNG.
if filler_types:
    fillers = list(filler_types.keys())
    counts = list(filler_types.values())

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(fillers, counts, color='steelblue', edgecolor='navy')
    ax.set_xlabel('Filler Word')
    ax.set_ylabel('Count')
    ax.set_title('Filler Word Distribution in PodcastFillers Dataset')
    # Acts on the current axes, which is `ax`; slanted labels avoid overlap.
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()

    output_png = PROJECT_ROOT / 'ml' / 'softskills' / 'notebooks' / 'filler_distribution.png'
    fig.savefig(output_png, dpi=150)
    plt.show()

## 3. Audio Feature Extraction

In [None]:
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram, MFCC

# Report library versions and CUDA availability in a single write.
environment_lines = [
    f"PyTorch version: {torch.__version__}",
    f"TorchAudio version: {torchaudio.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
]
print("\n".join(environment_lines))

# Pick the compute device: MPS (Apple Metal) first, then CUDA, then CPU.
# PyTorch builds that predate the MPS backend have no `torch.backends.mps`
# attribute, so it is looked up defensively instead of raising AttributeError.
mps_backend = getattr(torch.backends, 'mps', None)
if mps_backend is not None and mps_backend.is_available():
    device = torch.device('mps')
    print("Using MPS (Metal Performance Shaders)")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA")
else:
    device = torch.device('cpu')
    print("Using CPU")

In [None]:
def extract_audio_features(audio_array, sample_rate=16000):
    """
    Extract MFCC features from audio.

    Args:
        audio_array: Audio samples as a numpy array, a torch tensor, or a
            plain Python sequence of numbers. A 1-D input is treated as a
            single channel.
        sample_rate: Audio sample rate in Hz, forwarded to the MFCC transform.

    Returns:
        MFCC features as a torch tensor with 13 coefficients per frame
        (torchaudio documents the layout as (..., n_mfcc, time)).
    """
    # as_tensor accepts arrays, lists and tensors alike and casts to float32.
    # Previously only numpy arrays were converted, so a list crashed on
    # `.dim()` and a non-float32 tensor reached the transform uncast.
    waveform = torch.as_tensor(audio_array, dtype=torch.float32)

    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)  # Add channel dimension

    # 400-sample window with a 160-sample hop (25 ms / 10 ms at 16 kHz),
    # 23 mel bands reduced to 13 cepstral coefficients.
    mfcc_transform = MFCC(
        sample_rate=sample_rate,
        n_mfcc=13,
        melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 23}
    )

    return mfcc_transform(waveform)

# Smoke-test the extractor on the first record, if it has an 'audio' field.
if 'audio' in ds[0]:
    audio_data = ds[0]['audio']
    # A dict entry supplies 'array' and 'sampling_rate' (defaulting to 16000
    # zeros and 16000 Hz); any other entry is used directly as the waveform
    # and assumed to be sampled at 16000 Hz.
    audio_is_dict = isinstance(audio_data, dict)
    waveform = audio_data.get('array', np.zeros(16000)) if audio_is_dict else audio_data
    sr = audio_data.get('sampling_rate', 16000) if audio_is_dict else 16000

    features = extract_audio_features(waveform, sr)
    print(f"MFCC shape: {features.shape}")

## 4. Words Per Minute (WPM) Analysis

In [None]:
def calculate_wpm(text: str, duration_seconds: float) -> float:
    """
    Compute the speaking rate of a transcript in words per minute.

    Words are the whitespace-separated tokens of ``text``.

    Args:
        text: Transcript text
        duration_seconds: Audio duration in seconds

    Returns:
        Words per minute; 0 when the duration is not a positive number
    """
    if duration_seconds <= 0:
        return 0.0

    # str.split() without arguments never yields empty or blank tokens.
    token_total = len(text.split())
    elapsed_minutes = duration_seconds / 60.0

    if elapsed_minutes > 0:
        return token_total / elapsed_minutes
    # Only reached when the comparison above is false for a duration that
    # also passed the first guard (e.g. NaN).
    return 0

# Test
test_text = "Hello this is a test sentence with about ten words in it."
test_duration = 5.0  # seconds
wpm = calculate_wpm(test_text, test_duration)
# The sentence splits into 12 whitespace-separated tokens, so the expected
# rate is 12 / (5 / 60) = 144 WPM.
print(f"Test WPM: {wpm:.1f} (expected ~144 for 12 words in 5 seconds)")

In [None]:
# Optimal WPM ranges for communication
# Kept as one literal so the column alignment of the printed table is preserved.
SPEECH_RATE_GUIDE = """
=== Optimal Speech Rates ===

Speaking Context         | WPM Range  | Notes
-------------------------|------------|---------------------------
Conversational           | 120-150    | Natural, comfortable pace
Presentation/Interview   | 130-160    | Slightly faster, engaging
Audiobook/Narration      | 150-180    | Clear articulation
Speed reading            | 200+       | Not for interviews!

For soft skills evaluation:
- Below 100 WPM: Too slow, may indicate nervousness
- 120-160 WPM: Optimal range
- Above 180 WPM: Too fast, may lose clarity
"""
print(SPEECH_RATE_GUIDE)

## 5. Filler Word Detection Function

In [None]:
import re

# Common filler words and phrases, matched as whole words on lower-cased text.
# Several entries ('like', 'so', 'well', 'right', 'actually', ...) are also
# ordinary words: the patterns match EVERY occurrence, not only filler usage,
# so counts for those entries are an upper bound.
FILLER_PATTERNS = [
    r'\bum\b',
    r'\buh\b',
    r'\blike\b',
    r'\byou know\b',
    r'\bbasically\b',
    r'\bactually\b',
    r'\bliterally\b',
    r'\bso\b',  # Typically a filler at sentence start, but matched anywhere
    r'\bwell\b',
    r'\bi mean\b',
    r'\bkind of\b',
    r'\bsort of\b',
    r'\bright\b',  # Typically a filler when seeking confirmation, but matched anywhere
]

def detect_fillers(text: str) -> dict:
    """
    Detect filler words in text.

    Matching is case-insensitive (the text is lower-cased first) and uses the
    patterns in ``FILLER_PATTERNS``.

    Args:
        text: Input transcript

    Returns:
        Dictionary with:
        - 'count': total number of matches
        - 'fillers': every match, grouped in ``FILLER_PATTERNS`` order
        - 'unique_fillers': distinct matches in first-seen order
        - 'word_count': whitespace-separated tokens in ``text``
        - 'filler_ratio': count / word_count (0.0 for empty text); a
          multi-word filler counts once against several words
    """
    text_lower = text.lower()

    detected = []
    for pattern in FILLER_PATTERNS:
        # findall returns [] when nothing matches, so no guard is needed.
        detected.extend(re.findall(pattern, text_lower))
    total_count = len(detected)

    # Calculate filler ratio
    word_count = len(text.split())
    filler_ratio = total_count / word_count if word_count > 0 else 0.0

    return {
        'count': total_count,
        'fillers': detected,
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the order vary from run to run.
        'unique_fillers': list(dict.fromkeys(detected)),
        'word_count': word_count,
        'filler_ratio': filler_ratio
    }

# Run the detector on a deliberately filler-heavy sample and print a summary.
test_speech = """
So, um, I think that, you know, basically the main point is, like, 
we need to actually focus on, um, the core features. Right?
"""

result = detect_fillers(test_speech)
summary_lines = [
    f"Detected {result['count']} fillers in {result['word_count']} words",
    f"Filler ratio: {result['filler_ratio']:.2%}",
    f"Fillers found: {result['unique_fillers']}",
]
print("\n".join(summary_lines))

## 6. Fluency Score Calculation

In [None]:
def calculate_fluency_score(
    transcript: str,
    duration_seconds: float,
    pause_ratio: float = 0.0,  # Ratio of pauses to total duration
    optimal_wpm_min: float = 120,
    optimal_wpm_max: float = 160,
    filler_penalty: float = 0.05  # Penalty per filler word
) -> dict:
    """
    Score the fluency of a transcript (0-100).

    Three sub-scores, each floored at 0, are blended:
    - Speaking rate (weight 0.4): 100 inside
      [optimal_wpm_min, optimal_wpm_max]; 1.5 points are lost per WPM below
      the band and 1 point per WPM above it
    - Fillers (weight 0.4): ``filler_penalty * 100`` points lost per filler
    - Pauses (weight 0.2): 200 points lost per unit of ``pause_ratio``

    Args:
        transcript: Transcript text
        duration_seconds: Audio duration in seconds
        pause_ratio: Ratio of pauses to total duration
        optimal_wpm_min: Lower edge of the full-marks WPM band
        optimal_wpm_max: Upper edge of the full-marks WPM band
        filler_penalty: Penalty per filler word

    Returns:
        Dictionary with the overall score and the per-component breakdown
    """
    speaking_rate = calculate_wpm(transcript, duration_seconds)

    # Speaking-rate component
    if optimal_wpm_min <= speaking_rate <= optimal_wpm_max:
        rate_points = 100.0
    elif speaking_rate < optimal_wpm_min:
        shortfall = optimal_wpm_min - speaking_rate
        rate_points = max(0, 100 - shortfall * 1.5)
    else:
        excess = speaking_rate - optimal_wpm_max
        rate_points = max(0, 100 - excess * 1.0)

    # Filler component
    filler_report = detect_fillers(transcript)
    filler_total = filler_report['count']
    filler_points = max(0, 100 - filler_total * filler_penalty * 100)

    # Pause component: a pause ratio of 0.5 or more yields 0
    pause_points = max(0, 100 - pause_ratio * 200)

    blended = rate_points * 0.4 + filler_points * 0.4 + pause_points * 0.2

    breakdown = {
        'overall_score': round(blended, 1),
        'wpm': round(speaking_rate, 1),
        'wpm_score': round(rate_points, 1),
        'filler_count': filler_total,
        'filler_score': round(filler_points, 1),
        'fillers_detected': filler_report['unique_fillers'],
        'pause_ratio': round(pause_ratio, 2),
        'pause_score': round(pause_points, 1)
    }
    return breakdown

# Example (transcript, duration in seconds) pairs: a clean sentence, a
# filler-heavy one, and a fast one.
test_cases = [
    ("This is a clear and well articulated sentence without any filler words at all.", 4.0),
    ("Um, so, like, I think, you know, the thing is, basically, um, yeah.", 5.0),
    ("The quick brown fox jumps over the lazy dog near the river bank today.", 3.0),
]

print("Fluency Score Tests:", "-" * 60, sep="\n")
for text, duration in test_cases:
    score = calculate_fluency_score(text, duration)
    # One report per case; only the first 50 characters of the text are shown.
    report_lines = (
        f"\nText: '{text[:50]}...'",
        f"  Overall: {score['overall_score']}/100",
        f"  WPM: {score['wpm']} (score: {score['wpm_score']})",
        f"  Fillers: {score['filler_count']} (score: {score['filler_score']})",
    )
    print(*report_lines, sep="\n")

## 7. Export Utility Functions

In [None]:
# Create utils module for backend integration.
# The module source is a literal string, so it does NOT follow edits made to
# the notebook cells automatically: keep its FILLER_PATTERNS and functions in
# sync with the notebook definitions (the 'right' pattern was previously
# missing here, so the exported module scored differently from the notebook).
# The string is not raw: each doubled backslash below is written to disk as a
# single backslash.
utils_code = '''
"""
Fluency Analysis Utilities

Provides functions for analyzing speech fluency including:
- WPM calculation
- Filler word detection
- Overall fluency scoring

Generated from notebook: 01_fluency_analysis.ipynb
"""

import re
from typing import List, Dict, Optional

# Common filler words and phrases
FILLER_PATTERNS = [
    r"\\bum\\b",
    r"\\buh\\b",
    r"\\blike\\b",
    r"\\byou know\\b",
    r"\\bbasically\\b",
    r"\\bactually\\b",
    r"\\bliterally\\b",
    r"\\bso\\b",
    r"\\bwell\\b",
    r"\\bi mean\\b",
    r"\\bkind of\\b",
    r"\\bsort of\\b",
    r"\\bright\\b",
]


def calculate_wpm(text: str, duration_seconds: float) -> float:
    """Calculate words per minute from text and duration."""
    if duration_seconds <= 0:
        return 0.0
    words = [w for w in text.split() if w.strip()]
    word_count = len(words)
    minutes = duration_seconds / 60.0
    return word_count / minutes if minutes > 0 else 0


def detect_fillers(text: str) -> Dict:
    """Detect filler words in text."""
    text_lower = text.lower()
    detected = []
    total_count = 0

    for pattern in FILLER_PATTERNS:
        matches = re.findall(pattern, text_lower)
        if matches:
            detected.extend(matches)
            total_count += len(matches)

    word_count = len(text.split())
    filler_ratio = total_count / word_count if word_count > 0 else 0

    return {
        "count": total_count,
        "fillers": detected,
        # dict.fromkeys keeps first-seen order, so the result is deterministic.
        "unique_fillers": list(dict.fromkeys(detected)),
        "word_count": word_count,
        "filler_ratio": filler_ratio
    }


def calculate_fluency_score(
    transcript: str,
    duration_seconds: float,
    pause_ratio: float = 0.0,
    optimal_wpm_min: float = 120,
    optimal_wpm_max: float = 160,
    filler_penalty: float = 0.05
) -> Dict:
    """Calculate overall fluency score (0-100)."""
    wpm = calculate_wpm(transcript, duration_seconds)

    # WPM score
    if optimal_wpm_min <= wpm <= optimal_wpm_max:
        wpm_score = 100.0
    elif wpm < optimal_wpm_min:
        wpm_score = max(0, 100 - (optimal_wpm_min - wpm) * 1.5)
    else:
        wpm_score = max(0, 100 - (wpm - optimal_wpm_max) * 1.0)

    # Filler detection
    filler_result = detect_fillers(transcript)
    filler_count = filler_result["count"]
    filler_score = max(0, 100 - filler_count * filler_penalty * 100)

    # Pause penalty
    pause_score = max(0, 100 - pause_ratio * 200)

    # Weighted average
    overall_score = wpm_score * 0.4 + filler_score * 0.4 + pause_score * 0.2

    return {
        "overall_score": round(overall_score, 1),
        "wpm": round(wpm, 1),
        "wpm_score": round(wpm_score, 1),
        "filler_count": filler_count,
        "filler_score": round(filler_score, 1),
        "fillers_detected": filler_result["unique_fillers"],
        "pause_ratio": round(pause_ratio, 2),
        "pause_score": round(pause_score, 1)
    }
'''

# Save to training directory (created on demand).
utils_path = PROJECT_ROOT / 'ml' / 'softskills' / 'training' / 'fluency_utils.py'
utils_path.parent.mkdir(parents=True, exist_ok=True)

# An explicit encoding keeps the written file independent of the platform default.
with open(utils_path, 'w', encoding='utf-8') as f:
    f.write(utils_code)

print(f"Exported utilities to: {utils_path}")

## 8. Summary

### Key Findings
1. PodcastFillers dataset contains labeled filler words (um, uh) from podcast audio
2. Optimal WPM for interviews: 120-160 words per minute
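3. Filler words can be detected in transcripts using regex patterns; context-dependent words (like, so, well, right) are counted on every occurrence, so their counts are an upper bound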
4. Fluency score combines WPM, filler count, and pause ratio

### Next Steps
1. Train a neural network for audio-based filler detection
2. Implement Voice Activity Detection (VAD) for pause analysis
3. Integrate with backend `fluency_analyzer.py` service