In [None]:
import whisper
import pandas as pd
from dotenv import load_dotenv

# Do not truncate long string cells when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)
# NOTE(review): display.width is set to 0 here — confirm this gives the intended layout.
pd.set_option("display.width", 0)
# Presumably loads variables from a local .env file into the environment — TODO confirm
# which variables the rest of the notebook actually needs (none are read in the visible cells).
load_dotenv()

In [300]:
# Recording to analyse, and the speaking context that later selects scoring tolerances.
audio_file_path = "sample5.flac"
speech_context = "conversational"
# Whisper "base" checkpoint, loaded on CPU.
model = whisper.load_model("base", device="cpu")

# Transcribe with per-word timestamps. fp16 is switched off (the model runs on CPU).
# `result` is read again by later cells (segment table, forced alignment).
result = model.transcribe(
    audio_file_path,
    task="transcribe",
    word_timestamps=True,
    fp16=False,
)
# Flatten Whisper's per-segment word lists into one record per word.
WORD_COLUMNS = ["word", "start", "end", "duration", "confidence"]

words = [
    {
        "word": w["word"].strip(),
        "start": float(w["start"]),
        "end": float(w["end"]),
        "duration": float(w["end"] - w["start"]),
        # Whisper's per-word probability is used as the confidence value.
        "confidence": float(w["probability"]),
    }
    for seg in result["segments"]
    for w in seg.get("words", [])
]

# Explicit columns keep the frame usable (no KeyError on column access) when
# nothing was transcribed. The former mid-cell `sort_values(...).head(5)` was a
# discarded expression (never displayed) and has been dropped.
df_words = pd.DataFrame(words, columns=WORD_COLUMNS)

# One record per Whisper segment, with the mean word probability as a
# segment-level confidence value.
segments = []
for seg in result["segments"]:
    word_probs = [float(w["probability"]) for w in seg.get("words", [])]
    segments.append({
        "text": seg["text"].strip(),
        "start": float(seg["start"]),
        "end": float(seg["end"]),
        "duration": float(seg["end"]) - float(seg["start"]),
        # A segment without words gets 0.0 instead of dividing by zero.
        "avg_word_confidence": (
            sum(word_probs) / len(word_probs) if word_probs else 0.0
        ),
    })

df_segments = pd.DataFrame(segments)

# Recording length is taken as the end time of the last segment (so any
# silence before the first segment counts towards the total).
# An empty transcript yields 0.0 instead of raising IndexError.
total_duration = float(df_segments["end"].iloc[-1]) if len(df_segments) else 0.0
# Guard the rate against a zero-length recording.
words_per_minute = (
    (len(df_words) * 60) / total_duration if total_duration > 0 else 0.0
)



# Silence between consecutive words: each word's start minus the previous word's end.
next_word_starts = df_words["start"].iloc[1:].values
prev_word_ends = df_words["end"].iloc[:-1].values
pauses = next_word_starts - prev_word_ends

# Gaps longer than 1 s and longer than 2 s respectively.
long_pauses = pauses[pauses > 1.0]
very_long_pauses = pauses[pauses > 2.0]

# Lexical statistics computed on the lower-cased word column.
words_clean = df_words['word'].str.lower()
words_unique = words_clean.nunique()
words_total = len(words_clean)
# Share of distinct words among all words; 0 when there are no words at all.
vocab_richness = words_unique / words_total if words_total > 0 else 0
# Five most frequent words (computed here but not printed below).
top_repeats = words_clean.value_counts().head(5)
print(f"Total duration (s): {total_duration:.2f}")
print(f"Total words: {words_total}")
print(f"Words per minute: {words_per_minute:.2f}")
print(f"Unique words: {words_unique}")
print(f"Vocabulary richness: {vocab_richness:.2f}")

# The trailing semicolon suppresses the notebook's display of the frame.
df_words;

Total duration (s): 93.46
Total words: 183
Words per minute: 117.48
Unique words: 98
Vocabulary richness: 0.54


In [301]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf

# Processor and CTC model are loaded from the same checkpoint; the processor
# prepares the model inputs and its tokenizer decodes the predicted ids below.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h") # Audio → numbers → shape the model expects
wav2vec = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h") # The model itself
wav2vec.eval(); # Set to eval mode (we are not training)

# Decode the file as float32 amplitude samples; `sr` is the file's own sample rate.
samples, sr = sf.read(audio_file_path, dtype="float32")

# Two-dimensional input is down-mixed to mono by averaging across axis 1,
# then the samples are handed to torch.
mono_samples = samples.mean(axis=1) if samples.ndim == 2 else samples
waveform = torch.from_numpy(mono_samples)

# The processor call below is made at 16 kHz, so anything else is resampled first.
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    sr = 16000

# Build the model inputs as PyTorch tensors (squeeze drops any singleton dimensions).
inputs = processor(
    waveform.squeeze(),
    sampling_rate=16000,
    return_tensors="pt",
)

# Forward pass through the model, with gradient tracking disabled
with torch.no_grad():
    logits = wav2vec(**inputs).logits # one score vector per time step: time_step_1 -> [score_sound_1, ...]

# For each time slice pick the highest-scoring token id; [0] selects the first batch item
predicted_ids = torch.argmax(logits, dim=-1)[0]
# Convert token IDs to human readable tokens
tokens = processor.tokenizer.convert_ids_to_tokens(predicted_ids.tolist()) # one token per frame; frames are treated as 20 ms each by FRAME_SEC below
# <pad> = nothing meaningful happened in this 20ms slice
# | = word boundary

FRAME_SEC = 0.02  # 20 ms per token

# Collapse the frame-level token sequence into timed events. A run of identical
# consecutive tokens becomes one event; a '<pad>' frame closes the open event.
# An event's "end" is the start time of the first frame that no longer belongs to it.
events = []
current = None

for i, tok in enumerate(tokens):
    t = i * FRAME_SEC

    if tok == '<pad>':
        # Blank frame: close whatever event is open.
        if current:
            current["end"] = t
            events.append(current)
            current = None
        continue

    if current and current["label"] == tok:
        # same sound continuing (elongation)
        continue

    if current:
        # A different token starts: close the previous event at this frame's start.
        current["end"] = t
        events.append(current)

    current = {
        "label": tok,
        "start": t
    }

if current:
    # The open event includes the final frame, so it ends at the END of that
    # frame. Using `t` (the frame's start) cut it one frame short and gave a
    # last-frame event zero duration.
    current["end"] = len(tokens) * FRAME_SEC
    events.append(current)

# One row per collapsed event (label, start, end).
df_wav2vec = pd.DataFrame(events)

df_wav2vec["duration"] = df_wav2vec["end"] - df_wav2vec["start"]
# Copy of "label" under the name "labels", which the later merge/classify helpers read.
df_wav2vec["labels"] = df_wav2vec["label"]


# Fuse consecutive events that carry the same label and start no later than
# 40 ms after the previous one ended.
merged = []
current = None

for _, row in df_wav2vec.iterrows():
    event = row.to_dict()

    continues_current = (
        current is not None
        and event["label"] == current["label"]
        and event["start"] <= current["end"] + 0.04
    )
    if continues_current:
        # same sound, extend it
        current["end"] = event["end"]
        continue

    if current is not None:
        merged.append(current)
    current = event

if current:
    merged.append(current)

# Durations are recomputed from the (possibly extended) boundaries.
df_wav2vec_merged = pd.DataFrame(merged)
df_wav2vec_merged["duration"] = df_wav2vec_merged["end"] - df_wav2vec_merged["start"]

# Gaps of at least 300 ms between the end of one event and the start of the next.
pauses = []

for prev_event, next_event in zip(events, events[1:]):
    gap = next_event["start"] - prev_event["end"]
    if gap >= 0.3:  # 300 ms
        pauses.append({
            "type": "pause",
            "start": prev_event["end"],
            "end": next_event["start"],
            "duration": gap
        })

pauses;


import soundfile as sf
import torchaudio

def load_audio(path, target_sr=16000):
    """Read an audio file as mono float32 samples at ``target_sr``.

    Args:
        path: Location of the audio file, passed straight to ``sf.read``.
        target_sr: Sample rate of the returned audio. Defaults to 16000,
            the rate this notebook uses for wav2vec and alignment.

    Returns:
        ``(audio, sr)``: a NumPy array of samples and the sample rate,
        which equals ``target_sr`` whenever resampling was applied.
    """
    audio, sr = sf.read(path, dtype="float32")
    if audio.ndim == 2:
        audio = audio.mean(axis=1)  # stereo → mono
    if sr != target_sr:
        # torchaudio resamples tensors, so round-trip through torch.
        audio = torchaudio.functional.resample(
            torch.from_numpy(audio), sr, target_sr
        ).numpy()
        sr = target_sr
    return audio, sr

# Reload the recording through load_audio for the alignment step.
audio, sr = load_audio(audio_file_path)

import whisperx
import torch

# Use the GPU when torch reports one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the alignment language is hard-coded to English; confirm this
# matches the recording.
align_model, metadata = whisperx.load_align_model(
    language_code="en",  # or result["language"] if available
    device=device
)

# Align the earlier Whisper segments against the audio. `result` must still
# hold the Whisper transcription produced in the first cell.
aligned = whisperx.align(
    result["segments"],  # <-- your existing Whisper output
    align_model,
    metadata,
    audio,
    device
)

# Collect aligned words that carry both timestamps. Words the aligner could not
# time may lack the "start"/"end" keys entirely (or hold None), so they are read
# with .get() and skipped instead of raising KeyError.
aligned_words = []

for seg in aligned["segments"]:
    for w in seg.get("words", []):
        w_start, w_end = w.get("start"), w.get("end")
        if w_start is None or w_end is None:
            continue
        aligned_words.append({
            "word": w["word"].strip().lower(),
            "start": float(w_start),
            "end": float(w_end)
        })

# Explicit columns keep sort_values("start") valid when no word was aligned.
df_aligned_words = (
    pd.DataFrame(aligned_words, columns=["word", "start", "end"])
    .sort_values("start")
    .reset_index(drop=True)
)
# df_aligned_words

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [302]:
import re
import pandas as pd
import whisper

# ==============================
# CONFIG
# ==============================

# Collapsed wav2vec label -> filler spelling reported downstream.
# Vowel labels are reported as "uh", nasal labels as "um".
FILLER_MAP = {
    **dict.fromkeys(("A", "E", "U"), "uh"),
    **dict.fromkeys(("M", "N", "MM", "NN"), "um"),
}

# Maximum lead (seconds) between an event's end and a following word start.
WORD_ONSET_WINDOW = 0.12       # window BEFORE word start
# Events shorter than this are never treated as fillers.
MIN_FILLER_DURATION = 0.02
# Single-letter labels that may count as a stutter.
STUTTER_CONSONANTS = set("BCDFGHJKLPQRSTVWXYZ")

# ==============================
# HELPERS
# ==============================

def overlaps_any_word_relaxed(start, end, words, tol_before=0.02, tol_after=0.02):
    """Tell whether the interval [start, end] touches any word in ``words``.

    Each word's span is widened by ``tol_before`` seconds at its start and
    ``tol_after`` seconds at its end before the comparison.

    Args:
        start, end: Interval boundaries in seconds.
        words: DataFrame with "start" and "end" columns (one row per word).
        tol_before, tol_after: Widening applied to each word span, in seconds.

    Returns:
        True if at least one widened word span overlaps the interval,
        otherwise False (including when ``words`` has no rows).
    """
    # An empty frame may have no columns at all, so bail out before indexing.
    if words.empty:
        return False
    # One vectorized comparison over all words instead of a Python-level row loop.
    hits = (start < words["end"] + tol_after) & (end > words["start"] - tol_before)
    return bool(hits.any())


def is_word_initial_candidate(row, word_starts, max_lead=WORD_ONSET_WINDOW):
    """Tell whether a word begins shortly after the event in ``row`` ends.

    Args:
        row: Event record with an "end" time.
        word_starts: DataFrame with a "start" column of word onset times.
        max_lead: Largest allowed distance between the event end and a
            following word start.

    Returns:
        True if some word starts at or after the event end and no more than
        ``max_lead`` later; False otherwise.
    """
    event_end = row["end"]
    starts = word_starts["start"]
    lead = starts - event_end
    in_window = (starts >= event_end) & (lead <= max_lead)
    return bool(in_window.any())


def merge_adjacent_events(df, max_gap=0.05):
    """Fuse neighbouring events that share a label and are close in time.

    Events are visited in order of "start". An event is folded into the
    previous one when both have the same "labels" value and the gap between
    them is at most ``max_gap``; the fused event takes the later "end" and
    the sum of both "duration" values (the gap itself is not counted).

    Args:
        df: DataFrame with "labels", "start", "end" and "duration" columns.
        max_gap: Largest gap (same unit as the timestamps) that still merges.

    Returns:
        A new DataFrame with one row per fused event.
    """
    fused = []

    for _, row in df.sort_values("start").iterrows():
        event = row.to_dict()
        tail = fused[-1] if fused else None

        if (
            tail is not None
            and event["labels"] == tail["labels"]
            and event["start"] - tail["end"] <= max_gap
        ):
            tail["end"] = event["end"]
            tail["duration"] += event["duration"]
        else:
            fused.append(event)

    return pd.DataFrame(fused)


def looks_like_filler(norm, duration):
    """Tell whether a collapsed label has the shape of a hesitation sound.

    Args:
        norm: Upper-cased label with repeated characters collapsed.
        duration: Event length; anything below MIN_FILLER_DURATION is rejected.

    Returns:
        True for labels made only of vowels/H, or only of M, or only of N;
        False otherwise.
    """
    if duration < MIN_FILLER_DURATION:
        return False

    # vowel hesitations (uh, ah, eh, h)
    vowel_like = re.fullmatch(r"[AEIOUH]+", norm) is not None
    # nasal hums (mm, nn)
    nasal_hum = re.fullmatch(r"M+|N+", norm) is not None

    return vowel_like or nasal_hum


def should_suppress_word_initial(row):
    """Decide whether a word-initial event should be discarded as noise.

    Filler-shaped sounds are always kept; anything else is dropped only when
    it lasts less than 0.03.

    Args:
        row: Event record with "labels" and "duration".

    Returns:
        Truthy when the event should be suppressed.
    """
    # Upper-case the label and collapse runs of a repeated character.
    collapsed = re.sub(r"(.)\1+", r"\1", row["labels"].upper())
    filler_shaped = looks_like_filler(collapsed, row["duration"])

    # Never suppress filler-shaped sounds; otherwise suppress only ultra-short junk.
    return (not filler_shaped) and row["duration"] < 0.03


# ==============================
# STEP 1: REMOVE WORD-OVERLAPS
# ==============================

# Work on a copy so the flag column below is added to a fresh frame.
df_wav2vec_merged = df_wav2vec_merged.copy()

# Flag every wav2vec event that overlaps an aligned word (with the helper's default tolerances).
df_wav2vec_merged["overlaps_word"] = df_wav2vec_merged.apply(
    lambda r: overlaps_any_word_relaxed(
        r["start"], r["end"], df_aligned_words
    ),
    axis=1
)

# Keep only events that fall outside all aligned words.
df_non_word = df_wav2vec_merged.loc[
    ~df_wav2vec_merged["overlaps_word"]
].copy()

# ==============================
# STEP 2: WORD-START HANDLING (FIXED)
# ==============================

# Only the onset times are needed to test for a word starting right after an event.
word_starts = df_aligned_words[["start"]].copy()

df_non_word["is_word_initial"] = df_non_word.apply(
    lambda r: is_word_initial_candidate(r, word_starts),
    axis=1
)

# Suppress an event only when it is word-initial AND the helper classifies it as junk.
df_non_word["suppress"] = df_non_word.apply(
    lambda r: r["is_word_initial"] and should_suppress_word_initial(r),
    axis=1
)

df_non_word = df_non_word.loc[
    ~df_non_word["suppress"]
].reset_index(drop=True)

# ==============================
# STEP 3: MERGE MICRO EVENTS
# ==============================

# Fuse same-label events separated by at most 50 ms.
df_non_word = merge_adjacent_events(df_non_word, max_gap=0.05)

# ==============================
# STEP 4: CLASSIFY WAV2VEC EVENTS
# ==============================

def classify_non_word_event(row):
    """Label a non-word wav2vec event as a filler, a stutter, or nothing.

    Args:
        row: Event record with "labels" and "duration".

    Returns:
        A dict with "type", "raw_label" and "text" for fillers and stutters,
        or None when the event matches neither.
    """
    label = row["labels"].upper()
    duration = row["duration"]
    # Collapse runs of a repeated character before matching.
    norm = re.sub(r"(.)\1+", r"\1", label)

    if looks_like_filler(norm, duration):
        # ---- FILLERS ---- (labels missing from the map are reported as "uh")
        kind, text = "filler", FILLER_MAP.get(norm, "uh")
    elif norm in STUTTER_CONSONANTS and norm not in FILLER_MAP and duration < 0.15:
        # ---- STUTTERS ---- (a short, single consonant)
        kind, text = "stutter", norm.lower()
    else:
        return None

    return {"type": kind, "raw_label": label, "text": text}


# Classify every remaining non-word event and keep the ones recognised as a
# filler or a stutter.
converted = []

for _, row in df_non_word.iterrows():
    # Deliberately NOT named `result`: that name holds the Whisper transcription,
    # which the alignment cell reads via result["segments"]. Rebinding it here
    # broke any re-run of that cell.
    classified = classify_non_word_event(row)
    if classified:
        converted.append({
            "type": classified["type"],
            "text": classified["text"],
            "raw_label": classified["raw_label"],
            "start": row["start"],
            "end": row["end"],
            "duration": row["duration"],
        })

df_filler_events = pd.DataFrame(converted)

# ==============================
# STEP 5: WHISPER VERBATIM FILLERS
# ==============================

# A second Whisper "base" instance on CPU, used only for the verbatim pass below.
filler_model = whisper.load_model("base", device="cpu")

# Second transcription of the same file, prompted to keep disfluencies.
# temperature=0 and condition_on_previous_text=False are passed explicitly here,
# unlike in the first transcription.
verbatim_result = filler_model.transcribe(
    audio_file_path,
    task="transcribe",
    temperature=0,
    word_timestamps=True,
    condition_on_previous_text=False,
    initial_prompt=(
        "Transcribe verbatim. Include filler words like um, uh, er, "
        "false starts, repetitions, and hesitations."
    ),
    fp16=False,
)

# Whole-token match for um/uh/erm/er/ah/eh with the final letter repeated any
# number of times, case-insensitive.
FILLER_PATTERN = re.compile(
    r"^(um+|uh+|erm+|er+|ah+|eh+)$",
    re.IGNORECASE
)

def normalize_whisper_token(token):
    """Lower-case a Whisper word and strip whitespace and edge punctuation.

    Args:
        token: Raw word text as emitted by Whisper.

    Returns:
        The cleaned word; non-word characters are removed only at the very
        start and very end, so inner characters such as apostrophes survive.
    """
    edge_non_word = re.compile(r"^[^\w]+|[^\w]+$")
    lowered = token.lower().strip()
    return edge_non_word.sub("", lowered)


# Scan every word of the verbatim transcription and keep those whose cleaned
# text matches FILLER_PATTERN.
verbatim_words = (
    w
    for seg in verbatim_result.get("segments", [])
    for w in seg.get("words", [])
)

whisper_fillers = []

for w in verbatim_words:
    norm = normalize_whisper_token(w["word"])
    if not (norm and FILLER_PATTERN.match(norm)):
        continue
    whisper_fillers.append({
        "style": "clear",
        "type": "filler",
        "text": norm,
        "raw_label": w["word"],
        "start": float(w["start"]),
        "end": float(w["end"]),
        "duration": float(w["end"] - w["start"]),
        "confidence": float(w["probability"]),
    })

df_whisper_fillers = pd.DataFrame(whisper_fillers)

# ==============================
# STEP 6: MERGE (WHISPER FIRST, WAV2VEC BACKFILL)
# ==============================

def overlaps_time(a_start, a_end, b_start, b_end, tol=0.05):
    """Tell whether interval A overlaps interval B widened by ``tol`` on both sides.

    Args:
        a_start, a_end: Boundaries of interval A.
        b_start, b_end: Boundaries of interval B.
        tol: Amount by which B is extended at each end before comparing.

    Returns:
        True when A starts before B's widened end and ends after B's widened start.
    """
    starts_before_b_ends = a_start < b_end + tol
    ends_after_b_starts = a_end > b_start - tol
    return starts_before_b_ends and ends_after_b_starts


# Fixed column layout for the merged table. "confidence" is filled only for
# Whisper detections; wav2vec events leave it empty (NaN).
FINAL_FILLER_COLUMNS = [
    "style", "type", "text", "raw_label",
    "start", "end", "duration", "confidence",
]

# Whisper detections are accepted as they are.
final_fillers = [wf.to_dict() for _, wf in df_whisper_fillers.iterrows()]

# A wav2vec event is added only if it does not overlap (±50 ms) anything
# already accepted — Whisper fillers and previously added wav2vec events alike.
for _, row in df_filler_events.iterrows():
    duplicate = any(
        overlaps_time(row["start"], row["end"], af["start"], af["end"])
        for af in final_fillers
    )

    if not duplicate:
        final_fillers.append({
            "style": "subtle",
            "type": row["type"],
            "text": row["text"],
            "raw_label": row["raw_label"],
            "start": row["start"],
            "end": row["end"],
            "duration": row["duration"],
        })

# Explicit columns mean an event-free recording still produces a frame with
# "type", "start", ... so the following cells can filter it without KeyError.
df_final_fillers = (
    pd.DataFrame(final_fillers, columns=FINAL_FILLER_COLUMNS)
    .sort_values("start")
    .reset_index(drop=True)
)

df_final_fillers


Unnamed: 0,style,type,text,raw_label,start,end,duration,confidence
0,subtle,filler,uh,A,16.06,16.08,0.02,
1,clear,filler,um,"um,",18.88,20.06,1.18,0.016718
2,subtle,filler,uh,A,23.72,23.74,0.02,
3,subtle,filler,um,N,23.88,23.92,0.04,
4,subtle,stutter,d,D,24.06,24.08,0.02,
5,clear,filler,uh,"uh,",24.26,24.38,0.12,0.674564
6,subtle,filler,uh,O,24.88,24.9,0.02,
7,clear,filler,um,"um,",26.0,26.22,0.22,0.88938
8,subtle,filler,uh,U,26.56,26.6,0.04,
9,subtle,filler,uh,A,32.3,32.32,0.02,


In [303]:
# Split the merged events by type; copies keep later edits off the source frame.
is_stutter = df_final_fillers["type"] == "stutter"
is_filler = df_final_fillers["type"] == "filler"

df_stutters = df_final_fillers[is_stutter].copy()
df_fillers = df_final_fillers[is_filler].copy()

GROUP_GAP_SEC = 0.15  # max gap between repetitions

# ==============================
# GROUP STUTTERS
# ==============================

# Group repeated stutters: consecutive events with the same raw label whose gap
# is at most GROUP_GAP_SEC collapse into one record carrying a repetition count.
grouped = []
current = None

for _, row in df_stutters.sort_values("start").iterrows():
    candidate = row.to_dict()

    extends_current = (
        current is not None
        and candidate["raw_label"] == current["raw_label"]
        and candidate["start"] - current["end"] <= GROUP_GAP_SEC
    )
    if extends_current:
        current["end"] = candidate["end"]
        current["duration"] = current["end"] - current["start"]
        current["count"] += 1
        continue

    if current is not None:
        grouped.append(current)
    current = candidate
    current["count"] = 1

if current:
    grouped.append(current)

df_grouped_stutters = pd.DataFrame(grouped)


# Recombine fillers with the grouped stutters, ordered by start time.
df_final_clean = (
    pd.concat([df_fillers, df_grouped_stutters], ignore_index=True)
    .sort_values("start")
    .reset_index(drop=True)
)

# "count" only exists when at least one stutter was grouped. Add it as an empty
# column otherwise, so the filter below does not raise KeyError.
if "count" not in df_final_clean.columns:
    df_final_clean["count"] = float("nan")

# --- HARD STUTTER FILTER ---
# Drop stutters that were not repeated (count < 2) and last under 0.15.
is_isolated_short_stutter = (
    (df_final_clean["type"] == "stutter")
    & (df_final_clean["count"].fillna(1) < 2)
    & (df_final_clean["duration"] < 0.15)
)
df_final_clean = df_final_clean[~is_isolated_short_stutter].reset_index(drop=True)

df_final_clean

Unnamed: 0,style,type,text,raw_label,start,end,duration,confidence,count
0,subtle,filler,uh,A,16.06,16.08,0.02,,
1,clear,filler,um,"um,",18.88,20.06,1.18,0.016718,
2,subtle,filler,uh,A,23.72,23.74,0.02,,
3,subtle,filler,um,N,23.88,23.92,0.04,,
4,clear,filler,uh,"uh,",24.26,24.38,0.12,0.674564,
5,subtle,filler,uh,O,24.88,24.9,0.02,,
6,clear,filler,um,"um,",26.0,26.22,0.22,0.88938,
7,subtle,filler,uh,U,26.56,26.6,0.04,,
8,subtle,filler,uh,A,32.3,32.32,0.02,,
9,subtle,filler,uh,A,33.5,33.52,0.02,,


In [None]:
# %%
def filler_weight(duration):
    """Map a filler's duration to a weight reflecting how noticeable it is.

    Args:
        duration: Filler length (same unit as the event timestamps).

    Returns:
        0.2 below 0.08 (micro hesitation), 0.6 below 0.3 (subtle filler),
        and 1.0 otherwise (real filler).
    """
    if duration < 0.08:
        return 0.2      # micro hesitation
    if duration < 0.3:
        return 0.6      # subtle filler
    return 1.0          # real filler
    
def overlaps_filler(start, end, fillers, tol=0.05):
    """Tell whether [start, end] overlaps any event in ``fillers``.

    Args:
        start, end: Interval boundaries.
        fillers: DataFrame with "start" and "end" columns (one row per event).
        tol: Amount by which each event is widened on both sides.

    Returns:
        True if at least one widened event overlaps the interval, otherwise
        False (including when ``fillers`` has no rows).
    """
    # An empty frame may have no columns at all, so bail out before indexing.
    if fillers.empty:
        return False
    # One vectorized comparison over all events instead of a Python-level row loop.
    hits = (start < fillers["end"] + tol) & (end > fillers["start"] - tol)
    return bool(hits.any())
    
# ---------- ENHANCED NORMALIZATION ----------

# Recording length in minutes, floored at half a minute so short clips do not
# inflate the per-minute rates.
duration_min = max(total_duration / 60.0, 0.5)

# Fillers & stutters
# Use cleaned events only (grouped + filtered)
event_type = df_final_clean["type"]
filler_events = df_final_clean[event_type == "filler"]
stutter_events = df_final_clean[event_type == "stutter"]

# Fillers contribute their duration-based weight; stutters are simply counted.
weighted_filler_total = filler_events["duration"].apply(filler_weight).sum()
fillers_per_min = weighted_filler_total / duration_min
stutters_per_min = len(stutter_events) / duration_min

# Pauses
# Inter-word gaps: each word's start minus the previous word's end.
word_gap_starts = df_words["end"].to_numpy()[:-1]
word_gap_ends = df_words["start"].to_numpy()[1:]
word_gaps = word_gap_ends - word_gap_starts

# Clean pauses: gaps over 300 ms that are not explained by a detected
# filler/stutter event. The float dtype keeps the empty case numeric.
clean_pauses = pd.Series(
    [
        gap
        for gap_start, gap_end, gap in zip(word_gap_starts, word_gap_ends, word_gaps)
        if gap > 0.3 and not overlaps_filler(gap_start, gap_end, df_final_clean)
    ],
    dtype="float64",
)

long_pauses = clean_pauses[clean_pauses > 1.0]
very_long_pauses = clean_pauses[clean_pauses > 2.0]

long_pauses_per_min = len(long_pauses) / duration_min
very_long_pauses_per_min = len(very_long_pauses) / duration_min
pause_time_ratio = (
    clean_pauses.sum() / total_duration
    if len(clean_pauses) and total_duration > 0
    else 0.0
)

# Lexical (both ratios fall back to 0.0 for an empty transcript)
words_clean = df_words["word"].str.lower()
n_words = len(words_clean)
vocab_richness = words_clean.nunique() / n_words if n_words else 0.0
repetition_ratio = (
    words_clean.value_counts().iloc[0] / n_words if n_words else 0.0
)

# Temporal fluency instability: spread of ALL inter-word gaps.
# This is computed from the gaps derived above rather than the notebook-level
# `pauses` name, which the wav2vec cell rebinds to a list of dicts (no .std()).
pause_variability = float(word_gaps.std()) if len(word_gaps) > 5 else 0.0

normalized_metrics = {
    "wpm": words_per_minute,
    "fillers_per_min": fillers_per_min,
    "stutters_per_min": stutters_per_min,
    "long_pauses_per_min": long_pauses_per_min,
    "very_long_pauses_per_min": very_long_pauses_per_min,
    "pause_time_ratio": pause_time_ratio,
    "pause_variability": pause_variability,
    "vocab_richness": vocab_richness,
    "repetition_ratio": repetition_ratio,
}

normalized_metrics


{'wpm': 117.48341536486198,
 'fillers_per_min': 10.913759897282262,
 'stutters_per_min': 14.765675155146589,
 'long_pauses_per_min': 3.2099293815536063,
 'very_long_pauses_per_min': 0.6419858763107212,
 'pause_time_ratio': np.float64(0.13931093515942625),
 'pause_variability': np.float64(0.284486902987219),
 'vocab_richness': 0.5355191256830601,
 'repetition_ratio': np.float64(0.08196721311475409)}

In [None]:
# Per-context tolerance multipliers: (context, pause tolerance, pause-variability tolerance).
# Values above 1.0 are more lenient than "conversational", values below are stricter.
_CONTEXT_TOLERANCES = (
    ("conversational", 1.0, 1.0),
    ("narrative", 1.4, 1.3),       # allow more long pauses
    ("presentation", 1.2, 1.1),
    ("interview", 0.9, 0.9),       # stricter
)

CONTEXT_CONFIG = {
    context_name: {
        "pause_tolerance": pause_tol,
        "pause_variability_tolerance": variability_tol,
    }
    for context_name, pause_tol, variability_tol in _CONTEXT_TOLERANCES
}

# Tolerances for the configured speech context; unknown contexts fall back to "conversational".
context = CONTEXT_CONFIG.get(speech_context, CONTEXT_CONFIG["conversational"])

# %%
# ---------- CONSISTENT FLUENCY SCORING ----------

import math

def clamp01(x):
    """Limit ``x`` to the closed interval [0.0, 1.0].

    Args:
        x: Any number.

    Returns:
        0.0 for values at or below zero, 1.0 for values at or above one,
        and ``x`` itself otherwise.
    """
    capped = min(1.0, x)
    return max(0.0, capped)


# ---- Subscores (0–1, higher = better) ----

# Speech rate (ASYMMETRIC)
# Speech rate (ASYMMETRIC): full marks inside 110–170 wpm, a steep penalty
# below the band and a gentler one above it.
wpm = normalized_metrics["wpm"]

if wpm < 110:  # too slow hurts fluency hard
    speech_rate_score = clamp01((wpm - 70) / 40)
elif wpm <= 170:  # optimal band
    speech_rate_score = 1.0
else:  # too fast hurts gently
    speech_rate_score = clamp01(1 - (wpm - 170) / 120)


# Pauses (structural): long pauses per minute against a context-scaled limit.
long_pause_limit = 4.0 * context["pause_tolerance"]
pause_score = clamp01(
    1 - (normalized_metrics["long_pauses_per_min"] / long_pause_limit)
)

# Fillers (structural): weighted fillers per minute against a fixed limit of 6.
filler_score = clamp01(
    1 - (normalized_metrics["fillers_per_min"] / 6.0)
)

# Stability (structural): pause variability against a context-scaled limit.
variability_limit = 0.7 * context["pause_variability_tolerance"]
stability_score = clamp01(
    1 - (normalized_metrics["pause_variability"] / variability_limit)
)

# Lexical quality (STYLE): vocabulary richness blended with non-repetition.
lexical_score = clamp01(
    0.65 * normalized_metrics["vocab_richness"]
    + 0.35 * (1 - normalized_metrics["repetition_ratio"])
)


# ---- Weighted aggregation ----
# Structural dimensions dominate readiness. The order below is the order in
# which the weighted terms are added.
weighted_subscores = (
    (0.30, pause_score),
    (0.25, filler_score),
    (0.20, stability_score),
    (0.15, speech_rate_score),
    (0.10, lexical_score),
)
raw_score = sum(weight * subscore for weight, subscore in weighted_subscores)

# Final score on a 0–100 integer scale.
fluency_score = int(round(100 * clamp01(raw_score)))
fluency_score



# %%
# ---------- ISSUE DETECTION ----------

# Detected issues are collected here by the checks that follow.
issues = []

def issue(severity, issue_id, root_cause, score_impact):
    """Build one issue record.

    Args:
        severity: Severity label for the issue.
        issue_id: Identifier stored under the "issue" key.
        root_cause: Human-readable explanation.
        score_impact: Points attributed to the issue.

    Returns:
        A dict with the keys "issue", "severity", "root_cause" and
        "score_impact", in that order.
    """
    return dict(
        issue=issue_id,
        severity=severity,
        root_cause=root_cause,
        score_impact=score_impact,
    )


# Structural blockers
# Structural blockers: each low subscore adds an issue whose impact is the
# share of that subscore's weight (30 / 25 / 20 points) that was lost.
if pause_score < 0.6:
    issues.append(issue(
        "high",
        "hesitation_structure",
        "Pauses frequently interrupt sentence flow.",
        int((1 - pause_score) * 30)
    ))

if filler_score < 0.6:
    issues.append(issue(
        "high",
        "filler_dependency",
        "Fillers replace silent planning pauses.",
        int((1 - filler_score) * 25)
    ))

if stability_score < 0.6:
    issues.append(issue(
        "medium",
        "delivery_instability",
        "Speech rhythm varies unpredictably.",
        int((1 - stability_score) * 20)
    ))

# Style issues (never blockers alone)
if speech_rate_score < 0.7:
    # The rate score drops for slow speech (below 110 wpm) as well as for fast
    # speech, so the stated cause has to follow the actual direction.
    pacing_root_cause = (
        "Speech rate is slower than optimal for fluency."
        if wpm < 110
        else "Speech rate is faster than optimal for clarity."
    )
    issues.append(issue(
        "medium",
        "delivery_pacing",
        pacing_root_cause,
        int((1 - speech_rate_score) * 15)
    ))

if lexical_score < 0.5:
    issues.append(issue(
        "low",
        "lexical_simplicity",
        "Frequent reuse of common vocabulary.",
        int((1 - lexical_score) * 10)
    ))


# Largest score impact first.
issues = sorted(issues, key=lambda x: x["score_impact"], reverse=True)
issues


# %%
# ---------- READINESS JUDGMENT ----------
high_issues = [i for i in issues if i["severity"] == "high"]
medium_issues = [i for i in issues if i["severity"] == "medium"]

# The short-sample check must be part of the same chain: as a separate `if`
# its verdict was always overwritten by the branches below.
if total_duration < 30:
    readiness = "insufficient_sample"
elif len(high_issues) >= 2:
    readiness = "not_ready"
elif len(high_issues) == 1:
    readiness = "borderline"
elif len(medium_issues) >= 2:
    readiness = "borderline"
elif fluency_score >= 80:
    readiness = "ready"
else:
    readiness = "borderline"

readiness



# %%
# ---------- BENCHMARKING ----------

# v1 calibrated bands (replace with data later)
# v1 calibrated bands (replace with data later): (minimum score, percentile),
# checked from the highest band down; scores below every band map to 30.
percentile_bands = ((85, 80), (75, 65), (65, 50))
percentile = next(
    (band_percentile for min_score, band_percentile in percentile_bands
     if fluency_score >= min_score),
    30,
)

# Points missing to the target of 80; zero once the verdict is "ready".
score_gap = 0 if readiness == "ready" else max(0, 80 - fluency_score)

benchmarking = {
    "percentile": percentile,
    "target_score": 80,
    "score_gap": score_gap,
    # 0.6 practice hours are estimated per missing point.
    "estimated_guided_practice_hours": score_gap * 0.6,
}


benchmarking


# %%
# ---------- OPINIONS ----------

# Pacing advice has to follow the direction of the problem: the rate score also
# drops for slow speech (below 110 wpm), where "reduce speed" would be wrong.
pacing_instruction = (
    "Increase speed slightly while keeping articulation clear."
    if wpm < 110
    else "Reduce speed slightly while maintaining energy."
)

# One practice instruction per issue id (built once, not on every iteration).
issue_instructions = {
    "hesitation_structure": "Pause only after completing full clauses.",
    "filler_dependency": "Replace fillers with silent pauses under 300ms.",
    "delivery_instability": "Practice steady pacing with metronome drills.",
    "delivery_pacing": pacing_instruction,
    "lexical_simplicity": "Actively substitute repeated words during rehearsal.",
}

# Only the three highest-impact issues make it into the plan.
top_issues = issues[:3]

# Scale the expected gains so that together they cover the gap to the target.
max_gain = sum(item["score_impact"] for item in top_issues) or 1
scale = score_gap / max_gain if score_gap > 0 else 1.0

# The loop variable is deliberately not called `issue`, so the issue() helper
# defined earlier stays callable after this section has run.
action_plan = [
    {
        "priority": rank,
        "focus": item["issue"],
        "instruction": issue_instructions[item["issue"]],
        "expected_score_gain": int(item["score_impact"] * scale),
    }
    for rank, item in enumerate(top_issues, start=1)
]

opinions = {
    "primary_issues": issues,
    "action_plan": action_plan
}

opinions


# %%
# Assemble everything computed above into one response dictionary.
final_response = {
    # Headline result.
    "verdict": {
        "fluency_score": fluency_score,
        "readiness": readiness,
    },

    "benchmarking": benchmarking,

    "normalized_metrics": normalized_metrics,

    "opinions": opinions,

    # DataFrames are exported as lists of per-row dicts.
    "word_timestamps": df_words.to_dict(orient="records"),

    "segment_timestamps": df_segments.to_dict(orient="records"),

    # NOTE(review): this exports df_final_fillers (before stutter grouping and
    # filtering), whereas the metrics were computed from df_final_clean — confirm intended.
    "filler_events": df_final_fillers.to_dict(orient="records"),

    "aligned_words": df_aligned_words.to_dict(orient="records"),
}

final_response


[{'issue': 'filler_dependency',
  'severity': 'high',
  'root_cause': 'Fillers replace silent planning pauses.',
  'score_impact': 25},
 {'issue': 'hesitation_structure',
  'severity': 'high',
  'root_cause': 'Pauses frequently interrupt sentence flow.',
  'score_impact': 24},
 {'issue': 'delivery_instability',
  'severity': 'medium',
  'root_cause': 'Speech rhythm varies unpredictably.',
  'score_impact': 8}]