In [1]:
# tester.ipynb
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# that the `src` package imported below can be resolved.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder, one level below the project root — confirm.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))


# Alternative analyzer implementation, currently disabled:
# from src.analyzer import analyze_speech
from src.analyzer_raw import analyze_speech

# Audio file to analyze, given relative to the working directory.
audio_path = "../samples/sample1.flac"
# Alternative input file, currently disabled:
# audio_path = "../data/l2arctic_spontaneous/L2A_019.wav"

# Run the analysis and keep the returned value in `result`, which later cells
# read. The bare name on the last line displays it as the cell output.
result = analyze_speech(audio_path)
result

  from .autonotebook import tqdm as notebook_tqdm


Analyzing audio: ../samples/sample1.flac
Context: conversational

[1/5] Transcribing with Whisper (verbatim)...
  Duration: 13.34s
  Words: 41

[2/5] Marking filler words...
  Marked: 0 filler words
  Content words: 41

[3/5] Aligning words with WhisperX...
  Aligned: 41 words

[4/5] Detecting subtle fillers with Wav2Vec2...


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Total events: 3

[5/5] Calculating raw score...


{'raw_transcript': "Going along slushy country roads and speaking to damp audiences in drafty schoolrooms day after day for fortnight, he'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.",
 'wpm': 184.40779610194903,
 'unique_word_count': 36,
 'fillers_per_min': np.float64(1.2000000000000002),
 'stutters_per_min': 0.0,
 'long_pauses_per_min': 0.0,
 'very_long_pauses_per_min': 0.0,
 'pause_frequency': 4.0,
 'pause_time_ratio': np.float64(0.08245877061469256),
 'pause_variability': 0.0,
 'vocab_richness': 0.8780487804878049,
 'repetition_ratio': np.float64(0.08695652173913043),
 'speech_rate_variability': np.float64(0.519918502534728),
 'mean_utterance_length': np.float64(20.5),
 'pause_after_filler_rate': 0.0,
 'mean_word_confidence': np.float64(0.9237252893002054),
 'low_confidence_ratio': np.float64(0.04878048780487805),
 'lexical_density': 0.5609756097560976,
 'audio_duration_sec': 13.34,
 'speaking_time_sec': n

In [3]:
from src.llm_processing import extract_llm_annotations, aggregate_llm_metrics
from src.prosody_extraction import is_monotone_speech

# Annotate the transcript stored under result["raw_transcript"] (`result` is
# produced by the first cell), then aggregate the annotations into metrics.
llm_result = extract_llm_annotations(result["raw_transcript"])
llm_metrics = aggregate_llm_metrics(llm_result)
# Monotone check on the same audio file that was analyzed in the first cell.
is_monotone = is_monotone_speech(audio_path)

# Display both values together as a tuple.
llm_metrics, is_monotone

Prosody Variation (F0 std): 10.25 Hz
Monotone Detected: True


({'coherence_breaks': 0,
  'topic_relevance': True,
  'word_choice_errors': 0,
  'advanced_vocabulary_count': 2,
  'complex_structures_attempted': 2,
  'complex_structures_accurate': 2,
  'grammar_errors': 0,
  'meaning_blocking_error_ratio': 0.0},
 True)

In [6]:
from datetime import datetime

def _as_builtin_float(value):
    """Return ``value`` as a built-in ``float``; ``None`` is passed through unchanged."""
    return None if value is None else float(value)


def build_master_result(
    result: dict,
    llm_metrics: dict,
    is_monotone: bool,
    low_confidence_threshold: float = 0.7,
) -> dict:
    """Assemble the analyzer output, LLM metrics and prosody flag into one nested report.

    Args:
        result: Analyzer output. The keys read here are ``statistics``
            (``total_words_transcribed``, ``content_words``), ``timestamps``
            (``words_timestamps_raw``, ``filler_timestamps``),
            ``audio_duration_sec``, ``speaking_time_sec``,
            ``long_pauses_per_min``, ``pause_variability``, ``wpm``,
            ``speech_rate_variability``, ``fillers_per_min``,
            ``stutters_per_min``, ``repetition_ratio``, ``vocab_richness``,
            ``lexical_density`` and ``mean_utterance_length``. The keys
            ``pause_frequency``, ``unique_word_count``, ``pause_events`` and
            ``stutter_events`` are optional.
        llm_metrics: Mapping with ``coherence_breaks``, ``topic_relevance``,
            ``word_choice_errors``, ``advanced_vocabulary_count``,
            ``complex_structures_attempted``, ``complex_structures_accurate``,
            ``grammar_errors`` and ``meaning_blocking_error_ratio``; the values
            are copied through unchanged.
        is_monotone: Monotone-speech flag; stored as a built-in ``bool``.
        low_confidence_threshold: A word counts as low-confidence when its
            confidence is strictly below this value.

    Returns:
        A dict with the sections ``metadata``, ``fluency_coherence``,
        ``lexical_resource``, ``grammatical_range_accuracy``,
        ``pronunciation`` and ``raw_data``. Numeric values taken from
        ``result`` are converted to built-in ``float``/``int`` so that NumPy
        scalar types do not leak into the report.

    Raises:
        KeyError: If a required key is missing from ``result`` or
            ``llm_metrics``.
    """
    # Local import: the notebook cell only brings `datetime` into scope.
    from datetime import timezone

    # ---- Word counts reported by the analyzer ----
    statistics = result["statistics"]
    total_words = statistics["total_words_transcribed"]
    content_words = statistics["content_words"]

    # ---- Word confidence metrics ----
    word_timestamps = result["timestamps"]["words_timestamps_raw"]
    # Words without a confidence value (missing key or None) are ignored.
    confidences = [
        w["confidence"]
        for w in word_timestamps
        if w.get("confidence") is not None
    ]

    if confidences:
        mean_word_confidence = sum(confidences) / len(confidences)
        low_confidence_ratio = (
            sum(1 for c in confidences if c < low_confidence_threshold)
            / len(confidences)
        )
    else:
        # No usable confidences: report zeros instead of dividing by zero.
        mean_word_confidence = 0.0
        low_confidence_ratio = 0.0

    # ---- Unique word count ----
    # Use the analyzer's own count when it is provided; otherwise fall back to
    # reconstructing it from the diversity ratio and the content word count.
    unique_words = result.get("unique_word_count")
    if unique_words is None:
        unique_words = round(result["vocab_richness"] * content_words)
    unique_word_count = int(unique_words)

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop the tzinfo so the "<iso>Z" text format stays the same.
    analysis_timestamp = (
        datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )

    return {
        "metadata": {
            "audio_duration_sec": round(float(result["audio_duration_sec"]), 2),
            "speaking_time_sec": round(float(result["speaking_time_sec"]), 2),
            "total_words_transcribed": total_words,
            "content_word_count": content_words,
            "analysis_timestamp": analysis_timestamp,
        },

        "fluency_coherence": {
            "pauses": {
                # Optional key: stays None when the analyzer did not supply it.
                "pause_frequency_per_min": _as_builtin_float(
                    result.get("pause_frequency")
                ),
                "long_pause_rate": float(result["long_pauses_per_min"]),
                "pause_variability": float(result["pause_variability"]),
            },
            "rate": {
                "speech_rate_wpm": float(result["wpm"]),
                "speech_rate_variability": float(result["speech_rate_variability"]),
            },
            "disfluency": {
                "filler_frequency_per_min": float(result["fillers_per_min"]),
                "stutter_frequency_per_min": float(result["stutters_per_min"]),
                "repetition_rate": float(result["repetition_ratio"]),
            },
            "coherence": {
                "coherence_breaks": llm_metrics["coherence_breaks"],
                "topic_relevance": llm_metrics["topic_relevance"],
            },
        },

        "lexical_resource": {
            "breadth": {
                "unique_word_count": unique_word_count,
                "lexical_diversity": float(result["vocab_richness"]),
                "lexical_density": round(float(result["lexical_density"]), 3),
                # NOTE(review): this reuses `repetition_ratio`, the same value
                # reported as `repetition_rate` above — confirm it really is
                # the most-frequent-word ratio.
                "most_frequent_word_ratio": float(result["repetition_ratio"]),
            },
            "quality": {
                "word_choice_errors": llm_metrics["word_choice_errors"],
                "advanced_vocabulary_count": llm_metrics["advanced_vocabulary_count"],
            },
        },

        "grammatical_range_accuracy": {
            "complexity": {
                "mean_utterance_length": float(result["mean_utterance_length"]),
                "complex_structures_attempted": llm_metrics["complex_structures_attempted"],
                "complex_structures_accurate": llm_metrics["complex_structures_accurate"],
            },
            "accuracy": {
                "grammar_errors": llm_metrics["grammar_errors"],
                "meaning_blocking_error_ratio": llm_metrics["meaning_blocking_error_ratio"],
            },
        },

        "pronunciation": {
            "intelligibility": {
                "mean_word_confidence": round(float(mean_word_confidence), 3),
                "low_confidence_ratio": round(float(low_confidence_ratio), 3),
            },
            "prosody": {
                # bool() normalises NumPy booleans to a built-in bool.
                "monotone_detected": bool(is_monotone),
            },
        },

        "raw_data": {
            # These lists are passed through by reference, not copied.
            "word_timestamps": word_timestamps,
            "pause_events": result.get("pause_events", []),
            "filler_events": result["timestamps"]["filler_timestamps"],
            "stutter_events": result.get("stutter_events", []),
        },
    }


# Combine the analyzer output, the aggregated LLM metrics and the monotone
# flag into a single report.
master_result = build_master_result(
    result=result,
    llm_metrics=llm_metrics,
    is_monotone=is_monotone,
)

# The bare name on the last line displays the report as the cell output.
master_result


{'metadata': {'audio_duration_sec': 13.34,
  'speaking_time_sec': np.float64(12.24),
  'total_words_transcribed': 41,
  'content_word_count': 41,
  'analysis_timestamp': '2026-01-14T13:24:27.265160Z'},
 'fluency_coherence': {'pauses': {'pause_frequency_per_min': 4.0,
   'long_pause_rate': 0.0,
   'pause_variability': 0.0},
  'rate': {'speech_rate_wpm': 184.40779610194903,
   'speech_rate_variability': 0.519918502534728},
  'disfluency': {'filler_frequency_per_min': 1.2000000000000002,
   'stutter_frequency_per_min': 0.0,
   'repetition_rate': 0.08695652173913043},
  'coherence': {'coherence_breaks': 0, 'topic_relevance': True}},
 'lexical_resource': {'breadth': {'unique_word_count': 36,
   'lexical_diversity': 0.8780487804878049,
   'lexical_density': 0.561,
   'most_frequent_word_ratio': 0.08695652173913043},
  'quality': {'word_choice_errors': 0, 'advanced_vocabulary_count': 2}},
 'grammatical_range_accuracy': {'complexity': {'mean_utterance_length': 20.5,
   'complex_structures_atte