In [None]:
import whisper
import pandas as pd
from dotenv import load_dotenv

pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 0)
load_dotenv()

In [220]:
# Input clip, plus the speaking context that later selects the scoring tolerances.
audio_file_path = "sample2.flac"
speech_context = "conversational"

# Whisper "base" checkpoint on CPU (half precision is switched off in the call below).
model = whisper.load_model("base", device="cpu")

# word_timestamps=True is what gives each segment the "words" list read by later code.
result = model.transcribe(audio_file_path, task="transcribe", word_timestamps=True, fp16=False)
# Flatten the per-segment word lists into one record per word.
words = [
    {
        "word": entry["word"].strip(),
        "start": float(entry["start"]),
        "end": float(entry["end"]),
        "duration": float(entry["end"] - entry["start"]),
        "confidence": float(entry["probability"]),
    }
    for segment in result["segments"]
    for entry in segment["words"]
]

df_words = pd.DataFrame(words)
# Not the last expression of the cell, so this preview is evaluated but never displayed.
df_words.sort_values("confidence").head(5)

# One record per Whisper segment: text, timing and the mean word probability.
segments = []
for seg in result["segments"]:
    # Tolerate a segment that carries no word-level entries.
    seg_words = seg.get("words") or []
    word_probs = [float(w["probability"]) for w in seg_words]
    segments.append({
        "text": seg["text"].strip(),
        "start": float(seg["start"]),
        "end": float(seg["end"]),
        "duration": float(seg["end"]) - float(seg["start"]),
        # The original fell back to a *divisor* of 0.0 for word-less segments, which
        # raised ZeroDivisionError; the fallback now applies to the whole average.
        "avg_word_confidence": sum(word_probs) / len(word_probs) if word_probs else 0.0,
    })

df_segments = pd.DataFrame(segments)

# Clip length is the end time of the last segment (the first segment's start is not
# subtracted). An empty transcript yields 0.0 instead of an IndexError.
total_duration = float(df_segments["end"].iloc[-1]) if not df_segments.empty else 0.0
# Guarded so a zero-length transcript reports 0 wpm rather than dividing by zero.
words_per_minute = (len(df_words) * 60) / total_duration if total_duration > 0 else 0.0

if df_words.empty:
    # No words were transcribed: keep the names below defined with empty values.
    pauses = pd.Series(dtype="float64").to_numpy()
    words_clean = pd.Series(dtype="object")
else:
    # Gap between consecutive words: next word's start minus the previous word's end.
    pauses = df_words["start"].iloc[1:].values - df_words["end"].iloc[:-1].values
    words_clean = df_words["word"].str.lower()
long_pauses = pauses[pauses > 1.0]
very_long_pauses = pauses[pauses > 2.0]

# Lexical variety: share of distinct (lower-cased) words among all words.
words_unique = words_clean.nunique()
words_total = len(words_clean)
vocab_richness = words_unique / words_total if words_total > 0 else 0
top_repeats = words_clean.value_counts().head(5)
print(f"Total duration (s): {total_duration:.2f}")
print(f"Total words: {words_total}")
print(f"Words per minute: {words_per_minute:.2f}")
print(f"Unique words: {words_unique}")
print(f"Vocabulary richness: {vocab_richness:.2f}")
df_words

Total duration (s): 13.60
Total words: 38
Words per minute: 167.65
Unique words: 32
Vocabulary richness: 0.84


Unnamed: 0,word,start,end,duration,confidence
0,Before,0.0,0.72,0.72,0.732163
1,he,0.72,0.92,0.2,0.990695
2,had,0.92,1.06,0.14,0.998625
3,time,1.06,1.3,0.24,0.97816
4,to,1.3,1.48,0.18,0.998152
5,answer,1.48,1.82,0.34,0.997908
6,a,1.82,2.38,0.56,0.701732
7,much,2.38,2.74,0.36,0.99663
8,encumbered,2.74,3.5,0.76,0.87702
9,Vera,3.5,3.72,0.22,0.663709


In [221]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf

# Both pieces come from the same checkpoint:
#   processor - turns raw audio into the tensors the model is called with
#   wav2vec   - the CTC model whose logits are decoded below
WAV2VEC_CHECKPOINT = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(WAV2VEC_CHECKPOINT)
wav2vec = Wav2Vec2ForCTC.from_pretrained(WAV2VEC_CHECKPOINT)
wav2vec.eval();  # inference only, no training in this notebook

# Read the clip as float32 samples (NumPy array) together with its sample rate.
waveform, sr = sf.read(audio_file_path, dtype="float32")

# A 2-D array means several channels: average them down to a single channel.
if waveform.ndim == 2:
    waveform = waveform.mean(axis=1)

waveform = torch.from_numpy(waveform)

# The processor below is told the audio is 16 kHz, so resample anything else first.
if sr != 16000:
    waveform, sr = torchaudio.functional.resample(waveform, sr, 16000), 16000

# squeeze() drops any size-1 dimensions; return_tensors="pt" asks for PyTorch tensors.
inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")

# Inference-only forward pass (no gradient tracking).
with torch.no_grad():
    logits = wav2vec(**inputs).logits  # one score vector over the vocabulary per time step

# Greedy decoding: highest-scoring token id at every time step of the first batch item.
predicted_ids = logits.argmax(dim=-1)[0]
# Turn the ids into token strings; each token is treated below as one 20 ms slice.
tokens = processor.tokenizer.convert_ids_to_tokens(predicted_ids.tolist())
# <pad> = nothing meaningful happened in this 20ms slice
# | = word boundary

FRAME_SEC = 0.02  # 20 ms per token


def _collapse_token_frames(frame_tokens, frame_sec=FRAME_SEC):
    """Collapse a per-frame token sequence into timed events.

    Consecutive frames carrying the same token become one event. A ``<pad>``
    frame or a different token closes the running event at that frame's start
    time. Each event is ``{"label", "start", "end"}`` with times computed as
    ``frame_index * frame_sec``.

    Args:
        frame_tokens: sequence of token strings, one per frame.
        frame_sec: duration assigned to one frame, in seconds.

    Returns:
        List of event dicts in chronological order.
    """
    collapsed = []
    open_event = None

    for idx, tok in enumerate(frame_tokens):
        t = idx * frame_sec

        if open_event is not None and open_event["label"] == tok:
            # same sound continuing (elongation)
            continue

        if open_event is not None:
            open_event["end"] = t
            collapsed.append(open_event)
            open_event = None

        if tok != "<pad>":
            open_event = {"label": tok, "start": t}

    if open_event is not None:
        # An event still open after the last frame ends where the audio ends.
        # The original reused the loop's last `t` (the START of the final frame),
        # which cut the trailing event one frame short (zero-length if it began
        # on the final frame).
        open_event["end"] = len(frame_tokens) * frame_sec
        collapsed.append(open_event)

    return collapsed


events = _collapse_token_frames(tokens)

# Tabulate the events, add each event's length, and mirror "label" into a
# "labels" column (the classification code further down reads "labels").
df_wav2vec = pd.DataFrame(events).assign(
    duration=lambda frame: frame["end"] - frame["start"],
    labels=lambda frame: frame["label"],
)


# Fuse neighbouring events that carry the same label and are at most 40 ms apart
# (the next start may be no later than the running event's end + 0.04 s).
merged = []
current = None

for _, row in df_wav2vec.iterrows():
    continues_current = (
        current is not None
        and row["label"] == current["label"]
        and row["start"] <= current["end"] + 0.04
    )
    if continues_current:
        # same sound, extend it
        current["end"] = row["end"]
        continue

    if current is not None:
        merged.append(current)
    current = row.to_dict()

# Flush the event that was still being built when the rows ran out.
if current:
    merged.append(current)

df_wav2vec_merged = pd.DataFrame(merged)
# Ends may have moved during merging, so durations are recomputed.
df_wav2vec_merged["duration"] = df_wav2vec_merged["end"] - df_wav2vec_merged["start"]


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [222]:
# Silent stretches between consecutive wav2vec events; only gaps of at least
# 300 ms are recorded as pauses.
pauses = [
    {
        "type": "pause",
        "start": earlier["end"],
        "end": later["start"],
        "duration": later["start"] - earlier["end"],
    }
    for earlier, later in zip(events, events[1:])
    if later["start"] - earlier["end"] >= 0.3  # 300 ms
]

pauses;

In [223]:
import soundfile as sf
import torchaudio

def load_audio(path, target_sr=16000):
    """Load an audio file as mono float32 samples at ``target_sr``.

    Args:
        path: audio file readable by ``soundfile``.
        target_sr: sample rate to return. Defaults to 16000, the rate the rest
            of this notebook passes to its models.

    Returns:
        ``(audio, sr)`` where ``audio`` is a 1-D NumPy array and ``sr`` equals
        ``target_sr`` (the file's own rate is returned untouched when it
        already matches).
    """
    audio, sr = sf.read(path, dtype="float32")
    if audio.ndim == 2:
        audio = audio.mean(axis=1)  # stereo → mono
    if sr != target_sr:
        # torchaudio resamples tensors, so round-trip through torch.
        audio = torchaudio.functional.resample(
            torch.from_numpy(audio), sr, target_sr
        ).numpy()
        sr = target_sr
    return audio, sr

audio, sr = load_audio(audio_file_path)

In [224]:
import whisperx
import torch

# Run the alignment on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# English alignment model; language_code could instead come from result["language"] if available.
align_model, metadata = whisperx.load_align_model(language_code="en", device=device)

# Re-time the existing Whisper segments against the loaded audio samples.
aligned = whisperx.align(result["segments"], align_model, metadata, audio, device)

# Keep only words that actually received timings. WhisperX leaves the "start"/"end"
# keys out entirely for words it could not align, so they are read with .get()
# (indexing with w["start"] raised KeyError for such words).
aligned_words = [
    {
        "word": w["word"].strip().lower(),
        "start": float(w["start"]),
        "end": float(w["end"]),
    }
    for seg in aligned["segments"]
    for w in seg.get("words", [])
    if w.get("start") is not None and w.get("end") is not None
]

# Explicit columns keep the frame usable (and sortable) when no word was aligned.
df_aligned_words = (
    pd.DataFrame(aligned_words, columns=["word", "start", "end"])
    .sort_values("start")
    .reset_index(drop=True)
)
# df_aligned_words

In [None]:
import re

# Conservative label → filler-word table: only sounds we are confident about.
# Vowel-like labels read as "uh", nasal labels as "um".
FILLER_MAP = {
    **dict.fromkeys(("A", "E", "U"), "uh"),
    **dict.fromkeys(("M", "N", "MM", "NN"), "um"),
}

def overlaps_any_word_relaxed(start, end, words, tol=0.05):
    """Tell whether the span ``[start, end]`` touches any row of ``words``.

    Each row's ``start``/``end`` interval is widened by ``tol`` on both sides
    before the comparison. Returns ``False`` when ``words`` has no rows.
    """
    return any(
        start < word["end"] + tol and end > word["start"] - tol
        for _, word in words.iterrows()
    )



df_wav2vec_merged = df_wav2vec_merged.copy()

# Row by row (axis=1): does this wav2vec event coincide with an aligned word?
overlap_flags = df_wav2vec_merged.apply(
    lambda event: overlaps_any_word_relaxed(event["start"], event["end"], df_aligned_words),
    axis=1,
)
df_wav2vec_merged["overlaps_word"] = overlap_flags

# Events that touch no aligned word are the non-word candidates.
is_non_word = ~df_wav2vec_merged["overlaps_word"]
df_non_word = df_wav2vec_merged.loc[is_non_word].copy()


# Word-onset suppression: allow 200 ms of lead-in before a real word.
# The value is expressed in SECONDS, because it is compared with differences of the
# second-based start/end timestamps.
WORD_ONSET_SUPPRESSION_SEC = 0.2
# Legacy name kept for compatibility; despite the "_MS" suffix it has always held seconds.
WORD_ONSET_SUPPRESSION_MS = WORD_ONSET_SUPPRESSION_SEC

word_starts = df_aligned_words[["start"]].copy()


def is_word_initial_noise(row, word_starts, max_lead=WORD_ONSET_SUPPRESSION_SEC):
    """Tell whether an event ends just before a real word begins.

    Args:
        row: event with an ``"end"`` time in seconds.
        word_starts: DataFrame with a ``"start"`` column of word start times.
        max_lead: largest allowed distance (seconds) between the event's end and
            a following word's start.

    Returns:
        ``True`` when at least one word starts at or after ``row["end"]`` and no
        more than ``max_lead`` seconds later.
    """
    event_end = row["end"]
    starts_after = word_starts["start"] >= event_end
    within_lead = (word_starts["start"] - event_end) <= max_lead
    return bool((starts_after & within_lead).any())

# Flag events that are both a filler-like label and sit in the lead-in of a real word.
df_non_word["is_word_initial"] = df_non_word.apply(
    lambda event: (
        is_word_initial_noise(event, word_starts)
        and event["labels"].upper() in FILLER_MAP
    ),
    axis=1,
)

# Kill fake fillers: keep only the rows that were not flagged.
keep_mask = ~df_non_word["is_word_initial"]
df_non_word = df_non_word.loc[keep_mask].reset_index(drop=True)



def classify_non_word_event(row):
    """Label a non-word event as a filler, a stutter, or nothing.

    Reads ``row["labels"]`` (upper-cased here) and ``row["duration"]``.

    Returns:
        ``{"type": "filler", ...}`` when the label, with repeated characters
        collapsed, is a ``FILLER_MAP`` key and the event lasts at least 0.08 s;
        ``{"type": "stutter", ...}`` when the label is alphabetic, is not a
        filler sound and lasts under 0.15 s; otherwise ``None``.
    """
    label = row["labels"].upper()
    duration = row["duration"]

    # Collapse elongations: any run of one repeated character becomes a single one.
    collapsed = re.sub(r"(.)\1+", r"\1", label)
    is_filler_sound = collapsed in FILLER_MAP

    # ---- FILLERS ---- known filler sound that is long enough
    if is_filler_sound and duration >= 0.08:
        return {"type": "filler", "raw_label": label, "text": FILLER_MAP[collapsed]}

    # ---- STUTTERS ---- short alphabetic sound that is not a known filler
    if label.isalpha() and not is_filler_sound and duration < 0.15:
        return {"type": "stutter", "raw_label": label, "text": collapsed.lower()}

    return None



converted = []

for _, row in df_non_word.iterrows():
    # Deliberately NOT named `result`: that name holds the Whisper transcription
    # which the alignment cell reads via result["segments"]; overwriting it here
    # broke re-running that cell.
    classification = classify_non_word_event(row)
    if classification:
        converted.append({
            "type": classification["type"],
            "text": classification["text"],
            "raw_label": classification["raw_label"],
            "start": row["start"],
            "end": row["end"],
            "duration": row["duration"],
        })

# Keep all fillers and stutters
df_filler_events = pd.DataFrame(converted)
# Not the last expression of the cell, so this preview is not displayed.
df_filler_events.head()



# Second Whisper pass used as a filler cross-check: the prompt asks for a verbatim
# transcript so that hesitation words are kept.
filler_model = whisper.load_model("base", device="cpu")
verbatim_prompt = (
    "Transcribe verbatim. Include filler words like um, uh, er, "
    "false starts, repetitions, and hesitations."
)
verbatim_result = filler_model.transcribe(
    audio_file_path,
    task="transcribe",
    temperature=0,
    word_timestamps=True,
    condition_on_previous_text=False,
    initial_prompt=verbatim_prompt,
    fp16=False,
)

import re

# Whole-token match for hesitation words; a trailing letter may repeat ("umm", "uhhh").
FILLER_PATTERN = re.compile(r"^(um+|uh+|erm+|er+|ah+|eh+)$", flags=re.IGNORECASE)


def normalize_whisper_token(token: str) -> str:
    """Lower-case a Whisper word and strip whitespace and punctuation from its edges.

    Examples: ``'um,'`` -> ``'um'``, ``'uh...'`` -> ``'uh'``, ``'erm-'`` -> ``'erm'``.
    Characters inside the word are left alone.
    """
    lowered = token.lower().strip()
    # Remove any run of non-word characters at the very start or very end.
    return re.sub(r"^[^\w]+|[^\w]+$", "", lowered)


# Collect every word of the verbatim transcript that normalizes to a filler.
whisper_fillers = []

for segment in verbatim_result.get("segments", []):
    for entry in segment.get("words", []):
        spoken = entry["word"]
        cleaned = normalize_whisper_token(spoken)

        # Skip tokens that were pure punctuation, and anything that is not a filler.
        if cleaned and FILLER_PATTERN.match(cleaned):
            whisper_fillers.append({
                "type": "filler",
                "text": cleaned,        # canonical filler
                "raw_text": spoken,     # original token, kept for debugging
                "start": float(entry["start"]),
                "end": float(entry["end"]),
                "duration": float(entry["end"] - entry["start"]),
                "confidence": float(entry["probability"]),
            })

# With no hits, still build a frame that has the expected columns.
df_whisper_fillers = (
    pd.DataFrame(whisper_fillers)
    if whisper_fillers
    else pd.DataFrame(
        columns=["type", "text", "raw_text", "start", "end", "duration", "confidence"]
    )
)

df_whisper_fillers

def overlaps_time(a_start, a_end, b_start, b_end, tol=0.08):
    """Tell whether intervals A and B overlap once B is widened by ``tol`` on each side."""
    starts_before_b_ends = a_start < b_end + tol
    ends_after_b_starts = a_end > b_start - tol
    return starts_before_b_ends and ends_after_b_starts

FINAL_FILLER_COLUMNS = [
    "style",
    "type",
    "text",
    "raw_label",
    "start",
    "end",
    "duration",
]

# wav2vec-derived events go in first and are never displaced (treated as ground truth).
final_fillers = [
    {
        "style": "subtle",
        "type": event["type"],
        "text": event["text"],
        "raw_label": event.get("raw_label", None),
        "start": event["start"],
        "end": event["end"],
        "duration": event["duration"],
    }
    for _, event in df_filler_events.iterrows()
]

# A Whisper filler is added only when it overlaps nothing already collected.
for _, wf in df_whisper_fillers.iterrows():
    is_duplicate = any(
        overlaps_time(wf["start"], wf["end"], kept["start"], kept["end"])
        for kept in final_fillers
    )
    if is_duplicate:
        continue

    final_fillers.append({
        "style": "clear",
        "type": "filler",
        "text": wf["text"],
        "raw_label": wf["raw_text"],
        "start": wf["start"],
        "end": wf["end"],
        "duration": wf["duration"],
    })

# Explicit columns so an empty result still has the expected schema.
df_final_fillers = pd.DataFrame(final_fillers, columns=FINAL_FILLER_COLUMNS)

if not df_final_fillers.empty:
    df_final_fillers = df_final_fillers.sort_values("start").reset_index(drop=True)

df_final_fillers

Unnamed: 0,style,type,text,raw_label,start,end,duration


In [226]:
# %%
# ---------- ENHANCED NORMALIZATION ----------

# Per-minute rates use the clip length in minutes, floored at 0.5 min so that very
# short clips are treated as 30 s long.
duration_min = max(total_duration / 60.0, 0.5)

# Fillers & stutters
filler_events = df_final_fillers[df_final_fillers["type"] == "filler"]
stutter_events = df_final_fillers[df_final_fillers["type"] == "stutter"]

fillers_per_min = len(filler_events) / duration_min
stutters_per_min = len(stutter_events) / duration_min

# Pauses and lexical input; an empty transcript gets empty, correctly typed values
# instead of failing on missing columns.
if df_words.empty:
    pauses = pd.Series(dtype="float64").to_numpy()
    words_clean = pd.Series(dtype="object")
else:
    # Gap between consecutive words: next word's start minus the previous word's end.
    pauses = df_words["start"].iloc[1:].values - df_words["end"].iloc[:-1].values
    words_clean = df_words["word"].str.lower()
long_pauses = pauses[pauses > 1.0]
very_long_pauses = pauses[pauses > 2.0]

long_pauses_per_min = len(long_pauses) / duration_min
very_long_pauses_per_min = len(very_long_pauses) / duration_min
# Share of the clip spent in positive gaps; guarded against a zero-length clip.
pause_time_ratio = (
    float(pauses[pauses > 0].sum() / total_duration) if total_duration > 0 else 0.0
)

# Lexical (guarded: with no words both ratios are reported as 0.0)
word_count = len(words_clean)
vocab_richness = words_clean.nunique() / word_count if word_count else 0.0
# Share of all words taken by the single most frequent word.
repetition_ratio = (
    float(words_clean.value_counts().iloc[0] / word_count) if word_count else 0.0
)

# Temporal fluency instability: standard deviation of the gaps (not their variance),
# only computed when there are more than 5 gaps.
pause_variability = float(pauses.std()) if len(pauses) > 5 else 0.0

# Values are cast to built-in floats above so the dict holds no numpy scalars.
normalized_metrics = {
    "wpm": words_per_minute,
    "fillers_per_min": fillers_per_min,
    "stutters_per_min": stutters_per_min,
    "long_pauses_per_min": long_pauses_per_min,
    "very_long_pauses_per_min": very_long_pauses_per_min,
    "pause_time_ratio": pause_time_ratio,
    "pause_variability": pause_variability,
    "vocab_richness": vocab_richness,
    "repetition_ratio": repetition_ratio,
}

normalized_metrics


{'wpm': 167.64705882352942,
 'fillers_per_min': 0.0,
 'stutters_per_min': 0.0,
 'long_pauses_per_min': 2.0,
 'very_long_pauses_per_min': 0.0,
 'pause_time_ratio': np.float64(0.14705882352941171),
 'pause_variability': np.float64(0.19539923731433564),
 'vocab_richness': 0.8421052631578947,
 'repetition_ratio': np.float64(0.07894736842105263)}

In [227]:
# Per-context multipliers applied to the pause and pause-variability allowances.
# Columns: context name, pause_tolerance, pause_variability_tolerance.
CONTEXT_CONFIG = {
    name: {
        "pause_tolerance": pause_tolerance,
        "pause_variability_tolerance": variability_tolerance,
    }
    for name, pause_tolerance, variability_tolerance in (
        ("conversational", 1.0, 1.0),
        ("narrative", 1.4, 1.3),      # allow more long pauses
        ("presentation", 1.2, 1.1),
        ("interview", 0.9, 0.9),      # stricter
    )
}

# An unrecognised speech_context silently falls back to the "conversational" settings.
context = (
    CONTEXT_CONFIG[speech_context]
    if speech_context in CONTEXT_CONFIG
    else CONTEXT_CONFIG["conversational"]
)

In [228]:
# %%
# ---------- CONSISTENT FLUENCY SCORING ----------

import math

def clamp01(x):
    """Clamp ``x`` into the closed range [0.0, 1.0]."""
    capped = min(1.0, x)
    return max(0.0, capped)


# ---- Subscores (0–1, higher = better) ----

# Speech rate (ASYMMETRIC): full marks from 110 to 170 wpm, a steep ramp below
# (reaching 0 at 70 wpm) and a gentler one above (reaching 0 at 290 wpm).
wpm = normalized_metrics["wpm"]

if wpm < 110:
    speech_rate_score = clamp01((wpm - 70) / 40)
elif wpm <= 170:
    speech_rate_score = 1.0
else:
    speech_rate_score = clamp01(1 - (wpm - 170) / 120)

# Pauses (structural): 4 long pauses per minute, scaled by the context, scores 0.
long_pause_budget = 4.0 * context["pause_tolerance"]
pause_score = clamp01(1 - normalized_metrics["long_pauses_per_min"] / long_pause_budget)

# Fillers (structural): 6 fillers per minute scores 0.
filler_score = clamp01(1 - normalized_metrics["fillers_per_min"] / 6.0)

# Stability (structural): a gap spread of 0.7 s, scaled by the context, scores 0.
variability_budget = 0.7 * context["pause_variability_tolerance"]
stability_score = clamp01(1 - normalized_metrics["pause_variability"] / variability_budget)

# Lexical quality (STYLE): richness weighted 0.65, non-repetition weighted 0.35.
lexical_score = clamp01(
    0.65 * normalized_metrics["vocab_richness"]
    + 0.35 * (1 - normalized_metrics["repetition_ratio"])
)

# ---- Weighted aggregation ----
# Structural dimensions (pauses, fillers, stability) carry 75% of the weight.
raw_score = (
    0.30 * pause_score
    + 0.25 * filler_score
    + 0.20 * stability_score
    + 0.15 * speech_rate_score
    + 0.10 * lexical_score
)

# Final score on a 0-100 integer scale.
fluency_score = int(round(100 * clamp01(raw_score)))
fluency_score


78

In [229]:
# %%
# ---------- ISSUE DETECTION ----------

issues = []

def issue(severity, issue_id, root_cause, score_impact):
    """Build one issue record; ``issue_id`` is stored under the ``"issue"`` key."""
    return dict(
        issue=issue_id,
        severity=severity,
        root_cause=root_cause,
        score_impact=score_impact,
    )


# Structural blockers: each subscore below 0.6 raises an issue whose impact is the
# lost share of that subscore's weight (30 / 25 / 20 points).
if pause_score < 0.6:
    issues.append(issue(
        "high",
        "hesitation_structure",
        "Pauses frequently interrupt sentence flow.",
        int((1 - pause_score) * 30)
    ))

if filler_score < 0.6:
    issues.append(issue(
        "high",
        "filler_dependency",
        "Fillers replace silent planning pauses.",
        int((1 - filler_score) * 25)
    ))

if stability_score < 0.6:
    issues.append(issue(
        "medium",
        "delivery_instability",
        "Speech rhythm varies unpredictably.",
        int((1 - stability_score) * 20)
    ))

# Style issues (never blockers alone)
if speech_rate_score < 0.7:
    # The rate score is reduced for speech below 110 wpm as well as above 170 wpm,
    # so report the direction that actually applies (the original always said
    # "faster", even for slow speakers).
    if normalized_metrics["wpm"] < 110:
        pacing_cause = "Speech rate is slower than optimal for fluency."
    else:
        pacing_cause = "Speech rate is faster than optimal for clarity."
    issues.append(issue(
        "medium",
        "delivery_pacing",
        pacing_cause,
        int((1 - speech_rate_score) * 15)
    ))

if lexical_score < 0.5:
    issues.append(issue(
        "low",
        "lexical_simplicity",
        "Frequent reuse of common vocabulary.",
        int((1 - lexical_score) * 10)
    ))


# Largest score impact first.
issues = sorted(issues, key=lambda x: x["score_impact"], reverse=True)
issues


[{'issue': 'hesitation_structure',
  'severity': 'high',
  'root_cause': 'Pauses frequently interrupt sentence flow.',
  'score_impact': 15}]

In [230]:
# %%
# ---------- READINESS JUDGMENT ----------

high_issues = [entry for entry in issues if entry["severity"] == "high"]
medium_issues = [entry for entry in issues if entry["severity"] == "medium"]

# Two or more high-severity issues block readiness outright. "ready" needs no high
# issue, fewer than two medium issues and a score of at least 80; everything else
# is borderline.
if len(high_issues) >= 2:
    readiness = "not_ready"
elif not high_issues and len(medium_issues) < 2 and fluency_score >= 80:
    readiness = "ready"
else:
    readiness = "borderline"

readiness


'borderline'

In [231]:
# %%
# ---------- BENCHMARKING ----------

# v1 calibrated bands (replace with data later): (minimum score, percentile),
# checked from the highest band down; anything below 65 maps to 30.
percentile = next(
    (band_percentile
     for band_floor, band_percentile in ((85, 80), (75, 65), (65, 50))
     if fluency_score >= band_floor),
    30,
)

# Points missing to reach the target of 80; zero once the verdict is "ready".
if readiness != "ready":
    score_gap = max(0, 80 - fluency_score)
else:
    score_gap = 0

benchmarking = {
    "percentile": percentile,
    "target_score": 80,
    "score_gap": score_gap,
    # 0.6 practice hours per missing point.
    "estimated_guided_practice_hours": score_gap * 0.6,
}


benchmarking

{'percentile': 65,
 'target_score': 80,
 'score_gap': 2,
 'estimated_guided_practice_hours': 1.2}

In [232]:
# %%
# ---------- OPINIONS ----------

action_plan = []

# The pacing issue fires for speech that is too slow as well as too fast (the rate
# score drops below 110 wpm and above 170 wpm), so the advice follows the direction.
pacing_instruction = (
    "Increase speed slightly while keeping articulation clear."
    if normalized_metrics["wpm"] < 110
    else "Reduce speed slightly while maintaining energy."
)

# Issue id -> coaching instruction; built once instead of on every loop iteration.
instructions = {
    "hesitation_structure": "Pause only after completing full clauses.",
    "filler_dependency": "Replace fillers with silent pauses under 300ms.",
    "delivery_instability": "Practice steady pacing with metronome drills.",
    "delivery_pacing": pacing_instruction,
    "lexical_simplicity": "Actively substitute repeated words during rehearsal.",
}

# The loop variable is deliberately not called `issue`: that name is the helper
# function used by the issue-detection cell, and rebinding it to a dict made that
# cell fail when re-run.
for idx, found_issue in enumerate(issues[:3]):
    action_plan.append({
        "priority": idx + 1,
        "focus": found_issue["issue"],
        "instruction": instructions[found_issue["issue"]],
        "expected_score_gain": found_issue["score_impact"],
    })

opinions = {
    "primary_issues": issues,
    "action_plan": action_plan
}

opinions


{'primary_issues': [{'issue': 'hesitation_structure',
   'severity': 'high',
   'root_cause': 'Pauses frequently interrupt sentence flow.',
   'score_impact': 15}],
 'action_plan': [{'priority': 1,
   'focus': 'hesitation_structure',
   'instruction': 'Pause only after completing full clauses.',
   'expected_score_gain': 15}]}

In [233]:
# %%
# Assemble everything computed above into one response payload.
final_response = {
    "verdict": {
        "fluency_score": fluency_score,
        "readiness": readiness,
        # Fixed constant, not derived from the data.
        "confidence": 0.92,
    },
    "benchmarking": benchmarking,
    "normalized_metrics": normalized_metrics,
    "opinions": opinions,
    # Each frame is exported as a list of per-row dicts.
    **{
        key: frame.to_dict(orient="records")
        for key, frame in (
            ("word_timestamps", df_words),
            ("segment_timestamps", df_segments),
            ("filler_events", df_final_fillers),
            ("aligned_words", df_aligned_words),
        )
    },
}

final_response


{'verdict': {'fluency_score': 78,
  'readiness': 'borderline',
  'confidence': 0.92},
 'benchmarking': {'percentile': 65,
  'target_score': 80,
  'score_gap': 2,
  'estimated_guided_practice_hours': 1.2},
 'normalized_metrics': {'wpm': 167.64705882352942,
  'fillers_per_min': 0.0,
  'stutters_per_min': 0.0,
  'long_pauses_per_min': 2.0,
  'very_long_pauses_per_min': 0.0,
  'pause_time_ratio': np.float64(0.14705882352941171),
  'pause_variability': np.float64(0.19539923731433564),
  'vocab_richness': 0.8421052631578947,
  'repetition_ratio': np.float64(0.07894736842105263)},
 'opinions': {'primary_issues': [{'issue': 'hesitation_structure',
    'severity': 'high',
    'root_cause': 'Pauses frequently interrupt sentence flow.',
    'score_impact': 15}],
  'action_plan': [{'priority': 1,
    'focus': 'hesitation_structure',
    'instruction': 'Pause only after completing full clauses.',
    'expected_score_gain': 15}]},
 'word_timestamps': [{'word': 'Before',
   'start': 0.0,
   'end': 0.