In [None]:
import whisper
import pandas as pd
from dotenv import load_dotenv

In [None]:
# Notebook-wide configuration: pandas display options, environment loading,
# and the two inputs the rest of the notebook reads.
pd.set_option("display.max_colwidth", None)  # do not truncate long text cells
pd.set_option("display.width", 0)
# NOTE(review): no environment variable is read in the visible cells — confirm
# load_dotenv() is still needed.
load_dotenv()
audio_file_path = "sample7.flac"   # audio file analysed by every later cell
speech_context = "conversational"  # key used later to select a CONTEXT_CONFIG entry

In [95]:
# Whisper "base" checkpoint on CPU; used for the main transcription below.
model = whisper.load_model("base", device="cpu")

In [96]:
# Disabled variant: verbatim-prompted transcription. The same call (same
# arguments) is executed later in this notebook as `verbatim_result`, so this
# commented copy is kept only for reference.
# result = model.transcribe(
#     audio_file_path,
#     task="transcribe",
#     temperature=0,
#     word_timestamps=True,
#     condition_on_previous_text=False,
#     initial_prompt=(
#         "Transcribe verbatim. Include filler words like um, uh, er, "
#         "false starts, repetitions, and hesitations."
#     ),
#     fp16=False,
# )

# Main transcription. word_timestamps=True is required because later cells
# read seg["words"] with per-word "start", "end" and "probability".
result = model.transcribe(
    audio_file_path,
    task="transcribe",
    word_timestamps=True,
    fp16=False,
)

In [97]:
# Flatten Whisper's per-segment word list into one record per word.
# "confidence" is Whisper's per-word "probability" field.
words = [
    {
        "word": w["word"].strip(),
        "start": float(w["start"]),
        "end": float(w["end"]),
        "duration": float(w["end"] - w["start"]),
        "confidence": float(w["probability"]),
    }
    for seg in result["segments"]
    for w in seg.get("words", [])
]

# Explicit columns keep the schema stable even when no words were recognised,
# so later cells can still index df_words["start"] / ["end"] / ["word"].
df_words = pd.DataFrame(
    words,
    columns=["word", "start", "end", "duration", "confidence"],
)

# Only the last expression of a cell is displayed; the previous
# `df_words.sort_values("confidence").head(5)` line was computed and discarded.
# To inspect the least confident words, run that expression in its own cell.
df_words.head()

Unnamed: 0,word,start,end,duration,confidence
0,She,0.0,0.94,0.94,0.745907
1,poured,0.94,1.2,0.26,0.989556
2,concrete,1.2,1.84,0.64,0.999496
3,straight,1.84,2.32,0.48,0.974266
4,into,2.32,2.74,0.42,0.968895


In [98]:
# One record per Whisper segment, with the mean per-word probability.
segments = []
for seg in result["segments"]:
    word_probs = [float(w["probability"]) for w in seg.get("words", [])]
    segments.append({
        "text": seg["text"].strip(),
        "start": float(seg["start"]),
        "end": float(seg["end"]),
        "duration": float(float(seg["end"]) - float(seg["start"])),
        # A segment without words previously divided by 0.0 and raised
        # ZeroDivisionError; report 0.0 confidence for it instead.
        "avg_word_confidence": (
            sum(word_probs) / len(word_probs) if word_probs else 0.0
        ),
    })

# Explicit columns keep the schema stable when there are no segments.
df_segments = pd.DataFrame(
    segments,
    columns=["text", "start", "end", "duration", "avg_word_confidence"],
)
with pd.option_context('display.max_colwidth', None):
    print(df_segments.head())

                                                                                                text  \
0       She poured concrete straight into the storm pipe at the edge of her yard, and the wet cement   
1                                  hardened inside, sealing the pipe so water couldn't flow through.   
2  You see, that pipe was part of the neighborhood's draining system, carrying rainwater underground   
3                                                           to a nearby pond where it could collect.   
4            But the woman claimed that the drain was eating away at her yard and causing sinkholes.   

   start    end  duration  avg_word_confidence  
0   0.00   6.16      6.16             0.954545  
1   6.16  10.76      4.60             0.971571  
2  11.12  17.04      5.92             0.977924  
3  17.04  19.64      2.60             0.990735  
4  20.08  25.32      5.24             0.977607  


In [99]:
# Total duration is taken as the end time of the last segment (measured from
# t=0, not from the first segment's start).
# Guard the empty-transcript case: .iloc[-1] raises IndexError on an empty
# frame and a zero duration would raise ZeroDivisionError below.
total_duration = float(df_segments["end"].iloc[-1]) if len(df_segments) > 0 else 0.0
words_per_minute = (
    (len(df_words) * 60) / total_duration if total_duration > 0 else 0.0
)

print(f"Total duration (s): {total_duration:.2f}")
print(f"Number of words: {len(df_words)}")
print(f"Words per minute: {words_per_minute:.2f}")

Total duration (s): 31.26
Number of words: 83
Words per minute: 159.31


In [100]:
# Gap between consecutive words: start of word i+1 minus end of word i.
# shift(-1) lines each word up with its successor; the last row has no
# successor and is dropped.
pauses = (df_words["start"].shift(-1) - df_words["end"]).iloc[:-1].to_numpy()
very_long_pauses = pauses[pauses > 2.0]  # gaps longer than 2 s
long_pauses = pauses[pauses > 1.0]       # gaps longer than 1 s
pauses

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.28, 0.  , 0.  , 0.  , 0.  , 0.  , 0.56, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.36, 0.  , 0.24, 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.26, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.44, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.26, 0.  , 0.  , 0.  , 0.  , 0.  , 0.34,
       0.28, 0.  , 0.  , 0.  , 0.  ])

In [101]:
# Lexical statistics on lower-cased word tokens (punctuation is not removed).
words_clean = df_words["word"].str.lower()
words_total = len(words_clean)
words_unique = words_clean.nunique()
top_repeats = words_clean.value_counts().head(5)

# Type/token ratio; falls back to 0 when there are no words.
if words_total > 0:
    vocab_richness = words_unique / words_total
else:
    vocab_richness = 0

print(f"Total words: {words_total}")
print(f"Unique words: {words_unique}")
print(f"Vocabulary richness: {vocab_richness:.2f}")

Total words: 83
Unique words: 61
Vocabulary richness: 0.73


In [102]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf

# Load the wav2vec2 CTC checkpoint and its matching processor.
# The processor turns raw audio into the input tensors the model is called with.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h") #Audio → numbers → shape the model expects
wav2vec = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h") #The model itself
wav2vec.eval(); #Set to eval mode (we are not training); trailing ';' suppresses the cell output

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
# Read the audio file as float32 samples; sr is the file's sample rate.
waveform, sr = sf.read(audio_file_path, dtype="float32") # waveform: np.ndarray of amplitudes, sr: int

# Convert stereo → mono by averaging across axis 1 when the array is 2-D
if waveform.ndim == 2:
    waveform = waveform.mean(axis=1)

waveform = torch.from_numpy(waveform) # Convert to torch tensor

# Resample to 16 kHz if needed (the processor below is called with sampling_rate=16000)
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    sr = 16000

# Prepare input for the model
inputs = processor(
    waveform.squeeze(), # Remove any extra dimensions
    sampling_rate=16000,
    return_tensors="pt", # Return PyTorch tensors
)

# Forward pass through the model; no_grad because this is inference only
with torch.no_grad():
    logits = wav2vec(**inputs).logits # per-frame scores over the vocabulary (unnormalised logits, not probabilities)

# For each frame, choose the highest-scoring vocabulary entry; [0] selects the
# first (only) item of the batch
predicted_ids = torch.argmax(logits, dim=-1)[0]
# Convert token IDs to human-readable tokens
tokens = processor.tokenizer.convert_ids_to_tokens(predicted_ids.tolist())
#tokens # one token per frame; the next cell treats each frame as 20 ms of audio
# <pad> = nothing meaningful happened in this frame
# | = word boundary


In [104]:
FRAME_SEC = 0.02  # 20 ms per token

# Collapse the per-frame token stream into events: a run of identical
# consecutive non-<pad> tokens becomes one {"label", "start", "end"} record.
# An event ends where the next different token (or <pad>) begins.
events = []
current = None

for i, tok in enumerate(tokens):
    t = i * FRAME_SEC

    if tok == '<pad>':
        # silence/blank frame closes whatever event is open
        if current:
            current["end"] = t
            events.append(current)
            current = None
        continue

    if current and current["label"] == tok:
        # same sound continuing (elongation)
        continue

    if current:
        # a different token starts: close the previous event here
        current["end"] = t
        events.append(current)

    current = {
        "label": tok,
        "start": t
    }

# Flush an event still open at the end of the stream. Its end is the end of
# the last frame (len(tokens) * FRAME_SEC). Using `t` here would be the START
# of the last frame, making the trailing event one frame too short (and
# zero-length when it consists of a single frame).
if current:
    current["end"] = len(tokens) * FRAME_SEC
    events.append(current)

events;

In [105]:
MIN_GAP = 0.06       # 60 ms: events closer than this are merged into one blob
MIN_EVENT = 0.12     # 120 ms: blobs shorter than this are discarded

# Merge neighbouring letter events into "acoustic blobs". A blob is closed by
# a word separator ("|") or by a gap larger than MIN_GAP, and is kept only if
# it lasts at least MIN_EVENT seconds.
merged = []
current = None

for e in events:
    label = e["label"]

    # hard boundary: word separator
    if label == "|":
        if current:
            # the blob is extended up to the start of the separator
            current["end"] = e["start"]
            if current["end"] - current["start"] >= MIN_EVENT:
                merged.append(current)
            current = None
        continue

    if current is None:
        current = {
            "labels": [label],
            "start": e["start"],
            "end": e["end"]
        }
        continue

    gap = e["start"] - current["end"]

    if gap <= MIN_GAP:
        # same acoustic blob
        current["labels"].append(label)
        current["end"] = e["end"]
    else:
        # close previous blob
        if current["end"] - current["start"] >= MIN_EVENT:
            merged.append(current)
        current = {
            "labels": [label],
            "start": e["start"],
            "end": e["end"]
        }

# flush last
if current and (current["end"] - current["start"] >= MIN_EVENT):
    merged.append(current)

# Explicit columns: without them an empty `merged` list produces a frame with
# no columns, and the later overlap cell fails on ["start"] / ["end"].
df_wav2vec_merged = pd.DataFrame(
    [
        {
            "labels": "".join(m["labels"]),
            "start": m["start"],
            "end": m["end"],
            "duration": m["end"] - m["start"]
        }
        for m in merged
    ],
    columns=["labels", "start", "end", "duration"],
)

df_wav2vec_merged.head()


Unnamed: 0,labels,start,end,duration
0,POURED,1.04,1.28,0.24
1,CON,1.42,1.58,0.16
2,CRE,1.7,1.82,0.12
3,STRAIGHT,2.14,2.48,0.34
4,STORM,3.06,3.38,0.32


In [106]:
# Silences of at least 300 ms between consecutive wav2vec events.
# NOTE(review): this rebinds `pauses`, which earlier held the Whisper word-gap
# array and is recomputed as an array again in the metrics cell.
pauses = [
    {
        "type": "pause",
        "start": prev_event["end"],
        "end": next_event["start"],
        "duration": next_event["start"] - prev_event["end"],
    }
    for prev_event, next_event in zip(events, events[1:])
    if next_event["start"] - prev_event["end"] >= 0.3  # 300 ms
]

pauses;

In [107]:
import soundfile as sf
import torchaudio

def load_audio(path, target_sr=16000):
    """Read an audio file as a mono float32 array at ``target_sr`` Hz.

    Args:
        path: Audio file path, passed to ``soundfile.read``.
        target_sr: Sample rate of the returned audio. Defaults to 16000,
            the rate previously hard-coded here.

    Returns:
        ``(audio, sr)``: a 1-D float32 numpy array and its sample rate, which
        equals ``target_sr``.
    """
    audio, sr = sf.read(path, dtype="float32")
    if audio.ndim == 2:
        audio = audio.mean(axis=1)  # stereo → mono
    if sr != target_sr:
        # torchaudio resamples tensors, so round-trip through torch
        audio = torchaudio.functional.resample(
            torch.from_numpy(audio), sr, target_sr
        ).numpy()
        sr = target_sr
    return audio, sr

audio, sr = load_audio(audio_file_path)

In [108]:
import whisperx
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Forced alignment: refine Whisper's word timings with WhisperX.
align_model, metadata = whisperx.load_align_model(
    language_code="en",  # or result["language"] if available
    device=device
)

aligned = whisperx.align(
    result["segments"],  # <-- your existing Whisper output
    align_model,
    metadata,
    audio,
    device
)

aligned_words = []

for seg in aligned["segments"]:
    for w in seg.get("words", []):
        # Words the aligner could not place may carry no "start"/"end" key at
        # all, so use .get(): indexing w["start"] would raise KeyError there.
        w_start, w_end = w.get("start"), w.get("end")
        if w_start is None or w_end is None:
            continue
        aligned_words.append({
            "word": w["word"].strip().lower(),
            "start": float(w_start),
            "end": float(w_end)
        })

# Explicit columns so sort_values("start") also works when nothing aligned.
df_aligned_words = pd.DataFrame(aligned_words, columns=["word", "start", "end"])
df_aligned_words = df_aligned_words.sort_values("start").reset_index(drop=True)
df_aligned_words.head()

Unnamed: 0,word,start,end
0,she,0.0,0.966
1,poured,1.047,1.308
2,concrete,1.409,2.013
3,straight,2.174,2.496
4,into,2.637,2.919


In [109]:
def overlaps_any_word_relaxed(start, end, words, tol=0.05):
    """Tell whether [start, end] overlaps any row of ``words``.

    Each word interval is widened by ``tol`` seconds on both sides before the
    comparison. ``words`` is a DataFrame with "start" and "end" columns.
    Returns False when ``words`` has no rows.
    """
    return any(
        start < word["end"] + tol and end > word["start"] - tol
        for _, word in words.iterrows()
    )

# Flag every wav2vec blob that coincides (within tolerance) with an aligned
# word. Built from a plain list and an explicitly boolean Series: a row-wise
# DataFrame.apply on an EMPTY frame does not return a boolean Series, which
# breaks both this assignment and the `~` mask below.
overlap_flags = [
    overlaps_any_word_relaxed(blob_start, blob_end, df_aligned_words)
    for blob_start, blob_end in zip(
        df_wav2vec_merged["start"], df_wav2vec_merged["end"]
    )
]
df_wav2vec_merged["overlaps_word"] = pd.Series(
    overlap_flags, index=df_wav2vec_merged.index, dtype=bool
)

# Blobs that match no aligned word: candidate fillers / stutters.
df_non_word = df_wav2vec_merged[~df_wav2vec_merged["overlaps_word"]]
df_non_word.head()

Unnamed: 0,labels,start,end,duration,overlaps_word


In [110]:
import re

# conservative mapping: only sounds we are confident about
FILLER_MAP = {
    # single-sound hesitations
    "A": "uh",
    "E": "uh",
    "U": "uh",
    "M": "um",
    "N": "um",
    # Never matched by classify_non_word_event (it looks up the label with
    # repeated letters collapsed); kept so the mapping stays a superset.
    "MM": "um",
    "NN": "um",
    # Spelled-out fillers, mirroring the Whisper-side filler pattern
    # (um/uh/erm/er/ah/eh). Without these, "UM" or "UH" fell through to the
    # stutter branch below and was counted as a stutter.
    "UH": "uh",
    "AH": "uh",
    "EH": "uh",
    "ER": "uh",
    "UM": "um",
    "ERM": "um",
}


def classify_non_word_event(row):
    """Classify a non-word acoustic blob as a filler, a stutter, or noise.

    Args:
        row: Mapping with a "labels" string (the concatenated wav2vec letters
            of the blob).

    Returns:
        A dict with "type" ("filler" or "stutter"), "raw_label",
        "normalized" and "text", or None when the blob should be ignored.
    """
    label = row["labels"].upper()

    # Normalize repeated letters (e.g. MM → M, UMM → UM)
    norm = re.sub(r"(.)\1+", r"\1", label)

    # Case 1: canonical filler sound
    if norm in FILLER_MAP:
        return {
            "type": "filler",
            "raw_label": label,
            "normalized": norm,
            "text": FILLER_MAP[norm],
        }

    # Case 2: looks like a real word fragment → stutter
    if len(label) >= 2 and label.isalpha():
        return {
            "type": "stutter",
            "raw_label": label,
            "normalized": label.lower(),
            "text": label.lower()
        }

    # Case 3: everything else → ignore
    return None


# Turn classified non-word blobs into filler/stutter event records.
converted = []

for _, row in df_non_word.iterrows():
    # Do NOT name this `result`: that global holds the Whisper transcription
    # (read by the alignment cell) and would be overwritten on the first
    # iteration, breaking any re-run of cells that use it.
    classified = classify_non_word_event(row)
    if classified is None:
        continue
    converted.append({
        "type": classified["type"],
        "text": classified["text"],
        "raw_label": classified["raw_label"],
        "start": row["start"],
        "end": row["end"],
        "duration": row["duration"]
    })

# Explicit columns keep the schema stable when no event was found.
df_filler_events = pd.DataFrame(
    converted,
    columns=["type", "text", "raw_label", "start", "end", "duration"],
)
df_filler_events.head()

In [111]:
# Second transcription pass, prompted to keep fillers and disfluencies.
# Reuse the Whisper model loaded at the top of the notebook: it is the same
# "base" checkpoint on CPU, so loading it again only costs time and memory.
filler_model = model
verbatim_result = filler_model.transcribe(
    audio_file_path,
    task="transcribe",
    temperature=0,
    word_timestamps=True,
    condition_on_previous_text=False,
    initial_prompt=(
        "Transcribe verbatim. Include filler words like um, uh, er, "
        "false starts, repetitions, and hesitations."
    ),
    fp16=False,
)

import re

# Matches a whole normalized token that is a filler: um, uh, erm, er, ah, eh,
# each with its final letter optionally repeated (e.g. "umm", "uhh").
# Case-insensitive.
FILLER_PATTERN = re.compile(
    r"^(um+|uh+|erm+|er+|ah+|eh+)$",
    re.IGNORECASE
)

def normalize_whisper_token(token: str) -> str:
    """
    Lower-case a Whisper word token and trim surrounding whitespace and
    punctuation, leaving inner characters untouched:
    'um,' -> 'um'
    'uh...' -> 'uh'
    'erm-' -> 'erm'
    """
    lowered = token.lower().strip()
    # drop any run of non-word characters at either end
    return re.sub(r"^[^\w]+|[^\w]+$", "", lowered)


# Collect every word of the verbatim transcription whose normalized form is a
# filler according to FILLER_PATTERN.
whisper_fillers = []

for seg in verbatim_result.get("segments", []):
    for w in seg.get("words", []):
        raw = w["word"]
        norm = normalize_whisper_token(raw)

        # skip tokens that normalize to nothing or are not fillers
        if norm and FILLER_PATTERN.match(norm):
            whisper_fillers.append({
                "type": "filler",
                "text": norm,                 # canonical filler
                "raw_text": raw,              # keep original for debugging
                "start": float(w["start"]),
                "end": float(w["end"]),
                "duration": float(w["end"] - w["start"]),
                "confidence": float(w["probability"]),
            })

if not whisper_fillers:
    # no fillers: still expose the full schema to downstream cells
    df_whisper_fillers = pd.DataFrame(
        columns=[
            "type",
            "text",
            "raw_text",
            "start",
            "end",
            "duration",
            "confidence",
        ]
    )
else:
    df_whisper_fillers = pd.DataFrame(whisper_fillers)
    # Optional quality filter, currently disabled:
    #   keep rows with duration >= 0.25 and confidence >= 0.5,
    #   then reset_index(drop=True)

df_whisper_fillers

Unnamed: 0,type,text,raw_text,start,end,duration,confidence


In [112]:
def overlaps_time(a_start, a_end, b_start, b_end, tol=0.08):
    """
    Check whether interval A overlaps interval B once B has been widened by
    ``tol`` seconds on each side.
    """
    a_starts_before_b_ends = a_start < b_end + tol
    a_ends_after_b_starts = a_end > b_start - tol
    return a_starts_before_b_ends and a_ends_after_b_starts

FINAL_FILLER_COLUMNS = [
    "style",
    "type",
    "text",
    "raw_label",
    "start",
    "end",
    "duration",
]

# Step 1: every wav2vec event is kept as-is (treated as ground truth).
final_fillers = [
    {
        "style": "subtle",
        "type": row["type"],
        "text": row["text"],
        "raw_label": row.get("raw_label", None),
        "start": row["start"],
        "end": row["end"],
        "duration": row["duration"],
    }
    for _, row in df_filler_events.iterrows()
]

# Step 2: add a Whisper filler only when it does not overlap an event that is
# already in the list (including Whisper fillers added earlier in this loop).
for _, wf in df_whisper_fillers.iterrows():
    already_covered = any(
        overlaps_time(wf["start"], wf["end"], kept["start"], kept["end"])
        for kept in final_fillers
    )
    if already_covered:
        continue

    final_fillers.append({
        "style": "clear",
        "type": "filler",
        "text": wf["text"],
        "raw_label": wf["raw_text"],
        "start": wf["start"],
        "end": wf["end"],
        "duration": wf["duration"],
    })

df_final_fillers = pd.DataFrame(final_fillers, columns=FINAL_FILLER_COLUMNS)

# Chronological order; nothing to sort when there are no rows.
if len(df_final_fillers.index) > 0:
    df_final_fillers = df_final_fillers.sort_values("start").reset_index(drop=True)

df_final_fillers

Unnamed: 0,style,type,text,raw_label,start,end,duration


In [113]:
# %%
# ---------- ENHANCED NORMALIZATION ----------

# Rates are per minute; the duration is floored at 0.5 min so very short clips
# do not inflate per-minute counts.
duration_min = max(total_duration / 60.0, 0.5)

# Fillers & stutters
filler_events = df_final_fillers[df_final_fillers["type"] == "filler"]
stutter_events = df_final_fillers[df_final_fillers["type"] == "stutter"]

fillers_per_min = len(filler_events) / duration_min
stutters_per_min = len(stutter_events) / duration_min

# Pauses: gap between the end of each word and the start of the next
pauses = df_words["start"].iloc[1:].values - df_words["end"].iloc[:-1].values
long_pauses = pauses[pauses > 1.0]
very_long_pauses = pauses[pauses > 2.0]

long_pauses_per_min = len(long_pauses) / duration_min
very_long_pauses_per_min = len(very_long_pauses) / duration_min
# Share of the recording spent in inter-word gaps; 0.0 for a zero duration.
pause_time_ratio = (
    float(pauses[pauses > 0].sum() / total_duration) if total_duration > 0 else 0.0
)

# Lexical — guarded against an empty transcript, like the earlier vocabulary
# cell; previously this divided by zero / indexed an empty value_counts().
words_clean = df_words["word"].str.lower()
n_words = len(words_clean)
vocab_richness = words_clean.nunique() / n_words if n_words > 0 else 0.0
repetition_ratio = (
    float(words_clean.value_counts().iloc[0] / n_words) if n_words > 0 else 0.0
)

# Temporal fluency instability (standard deviation of gaps); needs more than
# 5 gaps to be considered meaningful.
pause_variability = float(pauses.std()) if len(pauses) > 5 else 0.0

# All values are plain Python floats (numpy scalars are converted above) so
# the dict has a uniform type and a clean repr.
normalized_metrics = {
    "wpm": words_per_minute,
    "fillers_per_min": fillers_per_min,
    "stutters_per_min": stutters_per_min,
    "long_pauses_per_min": long_pauses_per_min,
    "very_long_pauses_per_min": very_long_pauses_per_min,
    "pause_time_ratio": pause_time_ratio,
    "pause_variability": pause_variability,
    "vocab_richness": vocab_richness,
    "repetition_ratio": repetition_ratio,
}

normalized_metrics


{'wpm': 159.30902111324374,
 'fillers_per_min': 0.0,
 'stutters_per_min': 0.0,
 'long_pauses_per_min': 0.0,
 'very_long_pauses_per_min': 0.0,
 'pause_time_ratio': np.float64(0.09660908509277018),
 'pause_variability': np.float64(0.1099210143115021),
 'vocab_richness': 0.7349397590361446,
 'repetition_ratio': np.float64(0.12048192771084337)}

In [114]:
# Per-context multipliers applied to the pause thresholds in the scoring cell.
# Values above 1.0 are more lenient, values below 1.0 stricter.
CONTEXT_CONFIG = {
    "conversational": {"pause_tolerance": 1.0, "pause_variability_tolerance": 1.0},
    # narrative: allow more long pauses
    "narrative": {"pause_tolerance": 1.4, "pause_variability_tolerance": 1.3},
    "presentation": {"pause_tolerance": 1.2, "pause_variability_tolerance": 1.1},
    # interview: stricter
    "interview": {"pause_tolerance": 0.9, "pause_variability_tolerance": 0.9},
}

# Unknown contexts fall back to the conversational profile.
if speech_context in CONTEXT_CONFIG:
    context = CONTEXT_CONFIG[speech_context]
else:
    context = CONTEXT_CONFIG["conversational"]

In [115]:
# %%
# ---------- CONSISTENT FLUENCY SCORING ----------

import math

def clamp01(x):
    """Clamp ``x`` to the closed interval [0.0, 1.0]."""
    capped = x if x < 1.0 else 1.0          # upper bound
    return capped if capped > 0.0 else 0.0  # lower bound


# ---- Subscores (0–1, higher = better) ----

# Speech rate (ASYMMETRIC)
# Speech rate: full marks inside 110–170 wpm, a steep penalty below the band
# (reaching 0 at 70 wpm) and a gentle one above it (reaching 0 at 290 wpm).
wpm = normalized_metrics["wpm"]

if 110 <= wpm <= 170:  # optimal band
    speech_rate_score = 1.0
elif wpm < 110:  # too slow hurts fluency hard
    speech_rate_score = clamp01((wpm - 70) / 40)
else:  # too fast hurts gently
    speech_rate_score = clamp01(1 - (wpm - 170) / 120)

# Pauses (structural): long pauses per minute against a context-scaled budget of 4
pause_score = clamp01(
    1 - (normalized_metrics["long_pauses_per_min"] / (4.0 * context["pause_tolerance"]))
)

# Fillers (structural): 6 fillers per minute or more scores 0
filler_score = clamp01(1 - (normalized_metrics["fillers_per_min"] / 6.0))

# Stability (structural): gap variability against a context-scaled 0.7 s
stability_score = clamp01(
    1 - (
        normalized_metrics["pause_variability"]
        / (0.7 * context["pause_variability_tolerance"])
    )
)

# Lexical quality (style): richer vocabulary and less repetition score higher
lexical_score = clamp01(
    0.65 * normalized_metrics["vocab_richness"]
    + 0.35 * (1 - normalized_metrics["repetition_ratio"])
)

# Weighted aggregation — structural dimensions dominate; weights sum to 1.0.
raw_score = sum(
    weight * subscore
    for weight, subscore in (
        (0.30, pause_score),
        (0.25, filler_score),
        (0.20, stability_score),
        (0.15, speech_rate_score),
        (0.10, lexical_score),
    )
)

fluency_score = int(round(100 * clamp01(raw_score)))
fluency_score


95

In [116]:
# %%
# ---------- ISSUE DETECTION ----------

# Detected issues accumulate here; the checks below append to it.
issues = []

def issue(severity, issue_id, root_cause, score_impact):
    """Build one issue record with a fixed key order."""
    return dict(
        issue=issue_id,
        severity=severity,
        root_cause=root_cause,
        score_impact=score_impact,
    )


# Structural blockers. score_impact is the share of each dimension's weight
# (30 / 25 / 20 / 15 / 10 points) that was lost.
if pause_score < 0.6:
    issues.append(issue(
        "high",
        "hesitation_structure",
        "Pauses frequently interrupt sentence flow.",
        int((1 - pause_score) * 30)
    ))

if filler_score < 0.6:
    issues.append(issue(
        "high",
        "filler_dependency",
        "Fillers replace silent planning pauses.",
        int((1 - filler_score) * 25)
    ))

if stability_score < 0.6:
    issues.append(issue(
        "medium",
        "delivery_instability",
        "Speech rhythm varies unpredictably.",
        int((1 - stability_score) * 20)
    ))

# Style issues (never blockers alone)
if speech_rate_score < 0.7:
    # The speech-rate score drops for BOTH slow (< 110 wpm) and fast
    # (> 170 wpm) speech, so the root cause must say which one applies;
    # it previously always claimed the speech was too fast.
    if normalized_metrics["wpm"] < 110:
        pacing_cause = "Speech rate is slower than optimal for fluency."
    else:
        pacing_cause = "Speech rate is faster than optimal for clarity."
    issues.append(issue(
        "medium",
        "delivery_pacing",
        pacing_cause,
        int((1 - speech_rate_score) * 15)
    ))

if lexical_score < 0.5:
    issues.append(issue(
        "low",
        "lexical_simplicity",
        "Frequent reuse of common vocabulary.",
        int((1 - lexical_score) * 10)
    ))


# Largest score impact first.
issues = sorted(issues, key=lambda x: x["score_impact"], reverse=True)
issues


[]

In [117]:
# %%
# ---------- READINESS JUDGMENT ----------

# Split the detected issues by severity.
high_issues = list(filter(lambda entry: entry["severity"] == "high", issues))
medium_issues = list(filter(lambda entry: entry["severity"] == "medium", issues))

# Two or more high issues block readiness outright; a single high issue or two
# or more medium ones cap the verdict at borderline; otherwise the fluency
# score decides (80 or above is ready).
if len(high_issues) >= 2:
    readiness = "not_ready"
elif len(high_issues) == 1 or len(medium_issues) >= 2:
    readiness = "borderline"
else:
    readiness = "ready" if fluency_score >= 80 else "borderline"

readiness


'ready'

In [118]:
# %%
# ---------- BENCHMARKING ----------

# v1 calibrated bands (replace with data later)
# (minimum score, percentile) bands, checked from the highest down;
# anything below the lowest band maps to the 30th percentile.
percentile = next(
    (
        band_percentile
        for band_floor, band_percentile in ((85, 80), (75, 65), (65, 50))
        if fluency_score >= band_floor
    ),
    30,
)

# Points missing to reach the target of 80; zero once the verdict is "ready".
score_gap = 0 if readiness == "ready" else max(0, 80 - fluency_score)

benchmarking = dict(
    percentile=percentile,
    target_score=80,
    score_gap=score_gap,
    estimated_guided_practice_hours=score_gap * 0.6,  # 0.6 h per missing point
)


benchmarking

{'percentile': 80,
 'target_score': 80,
 'score_gap': 0,
 'estimated_guided_practice_hours': 0.0}

In [119]:
# %%
# ---------- OPINIONS ----------

# Coaching instruction per issue id (built once, not once per loop iteration).
ISSUE_INSTRUCTIONS = {
    "hesitation_structure": "Pause only after completing full clauses.",
    "filler_dependency": "Replace fillers with silent pauses under 300ms.",
    "delivery_instability": "Practice steady pacing with metronome drills.",
    "delivery_pacing": "Reduce speed slightly while maintaining energy.",
    "lexical_simplicity": "Actively substitute repeated words during rehearsal."
}

action_plan = []

# Top three issues (the list is already sorted by score impact).
# The loop variable must not be called `issue`: that name is the issue()
# factory function defined in the issue-detection cell, and rebinding it to a
# dict breaks any re-run of that cell's checks.
for idx, top_issue in enumerate(issues[:3]):
    instruction = ISSUE_INSTRUCTIONS[top_issue["issue"]]
    # A pacing issue is also raised for slow speech (< 110 wpm), where
    # "reduce speed" would be the wrong advice.
    if top_issue["issue"] == "delivery_pacing" and normalized_metrics["wpm"] < 110:
        instruction = "Increase pace slightly while keeping articulation clear."
    action_plan.append({
        "priority": idx + 1,
        "focus": top_issue["issue"],
        "instruction": instruction,
        "expected_score_gain": top_issue["score_impact"]
    })

opinions = {
    "primary_issues": issues,
    "action_plan": action_plan
}

opinions


{'primary_issues': [], 'action_plan': []}

In [120]:
# %%
# Assemble the final payload from the results computed in the cells above.
# DataFrames are exported as lists of row dicts (orient="records").
final_response = {
    "verdict": {
        "fluency_score": fluency_score,
        "readiness": readiness,
        # NOTE(review): constant, not derived from any computed value in this
        # notebook — confirm what this confidence is meant to represent.
        "confidence": 0.92
    },

    "benchmarking": benchmarking,

    "normalized_metrics": normalized_metrics,

    "opinions": opinions,

    "word_timestamps": df_words.to_dict(orient="records"),

    "segment_timestamps": df_segments.to_dict(orient="records"),

    "filler_events": df_final_fillers.to_dict(orient="records"),

    "aligned_words": df_aligned_words.to_dict(orient="records"),
}

final_response


{'verdict': {'fluency_score': 95, 'readiness': 'ready', 'confidence': 0.92},
 'benchmarking': {'percentile': 80,
  'target_score': 80,
  'score_gap': 0,
  'estimated_guided_practice_hours': 0.0},
 'normalized_metrics': {'wpm': 159.30902111324374,
  'fillers_per_min': 0.0,
  'stutters_per_min': 0.0,
  'long_pauses_per_min': 0.0,
  'very_long_pauses_per_min': 0.0,
  'pause_time_ratio': np.float64(0.09660908509277018),
  'pause_variability': np.float64(0.1099210143115021),
  'vocab_richness': 0.7349397590361446,
  'repetition_ratio': np.float64(0.12048192771084337)},
 'opinions': {'primary_issues': [], 'action_plan': []},
 'word_timestamps': [{'word': 'She',
   'start': 0.0,
   'end': 0.94,
   'duration': 0.94,
   'confidence': 0.7459069490432739},
  {'word': 'poured',
   'start': 0.94,
   'end': 1.2,
   'duration': 0.26,
   'confidence': 0.9895564913749695},
  {'word': 'concrete',
   'start': 1.2,
   'end': 1.84,
   'duration': 0.6400000000000001,
   'confidence': 0.9994962215423584},
  