## TRANSCRIPT QUALITY EVALUATION

**IMPORTS**

In [None]:
!pip install -q jiwer

In [None]:
import json
import os
import re
from typing import Optional

import pandas as pd
from jiwer import wer

**TEXT NORMALIZATION**

In [None]:
# Normalize text for evaluation
def normalize_text(text: str) -> str:
    """Canonicalize a transcript so that WER compares words, not formatting.

    The text is lowercased, every punctuation character is deleted (not
    replaced by a space, so "don't" becomes "dont"), and each run of
    whitespace is collapsed to a single space.

    Args:
        text: Raw transcript text.

    Returns:
        The normalized text with no leading or trailing whitespace.
    """
    text = text.lower()
    # `\w` also matches "_", so the underscore has to be listed explicitly
    # or it would survive the punctuation removal.
    text = re.sub(r"[^\w\s]|_", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

**LOAD RAW TRANSCRIPTS**

In [None]:
import pandas as pd

# CSV with the reference transcript lines.
TRANS_RAW = "/content/drive/MyDrive/podcast-project/data/transcripts_raw_truncated/lines_clean_200.csv"
ref_df = pd.read_csv(TRANS_RAW)

# Print the column names to confirm the expected schema was loaded.
print(list(ref_df.columns))

['index', 'act_name', 'episode_id', 'line_text', 'speaker', 'speaker_class', 'timestamp']


**GET REFERENCE TRANSCRIPT FOR ONE EPISODE**

In [None]:
# Concatenate all transcript lines for a given episode_id.
def get_reference_text(df: pd.DataFrame, episode_id: int) -> str:
    """Build the reference transcript of one episode as a single string.

    Rows whose "episode_id" equals ``episode_id`` are selected, missing
    "line_text" values are dropped, and the remaining lines are joined
    with single spaces in the order they appear in ``df``.

    Returns:
        The joined text, or "" when no row matches.
    """
    episode_mask = df["episode_id"] == episode_id
    spoken_lines = df.loc[episode_mask, "line_text"].dropna()
    as_strings = spoken_lines.astype(str)
    return " ".join(as_strings)

**LOAD WHISPER TRANSCRIPT**

In [None]:
# Load full transcript text from Whisper JSON.
def load_whisper_text(episode_id: int, base_dir: str) -> Optional[str]:
    """Read the Whisper transcript text for one episode.

    The transcript is expected at
    ``<base_dir>/episode_<episode_id>_whisper.json``.

    Args:
        episode_id: Episode number used to build the file name.
        base_dir: Directory containing the Whisper JSON files.

    Returns:
        The value stored under the "text" key, "" when that key is absent,
        or None when the file does not exist.
    """
    path = os.path.join(base_dir, f"episode_{episode_id}_whisper.json")

    # Open directly instead of checking existence first: one filesystem
    # access, and no window in which the file can vanish between the two.
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return None

    return data.get("text", "")

**COMPARE FIRST 15 EPISODES (WER)**

In [None]:
# Directory searched for the per-episode Whisper JSON transcripts.
TRANS_PROC = "/content/drive/MyDrive/podcast-project/data/transcripts_processed"

# Evaluate only the NUM_EPISODES smallest episode ids present in the reference data.
NUM_EPISODES = 15
episodes_to_compare = sorted(ref_df["episode_id"].unique())[:NUM_EPISODES]

# One dict per successfully scored episode; turned into a DataFrame at the end.
results = []

for ep in episodes_to_compare:
    print(f"\n Evaluating Episode {ep}")

    # Reference transcript first, Whisper hypothesis second.
    ref_text = get_reference_text(ref_df, ep)
    hyp_text = load_whisper_text(ep, TRANS_PROC)

    # An empty or missing transcript on either side cannot be scored.
    if not (ref_text and hyp_text):
        print(" Missing reference or Whisper transcript")
        continue

    # Apply the same normalization to both sides before comparing them.
    ref_norm = normalize_text(ref_text)
    hyp_norm = normalize_text(hyp_text)

    # Word Error Rate of the hypothesis against the reference.
    error = wer(ref_norm, hyp_norm)

    results.append(
        {
            "episode_id": ep,
            "wer": round(error, 3),
            "reference_words": len(ref_norm.split()),
            "whisper_words": len(hyp_norm.split()),
        }
    )

    print(f"WER: {round(error, 3)}")

results_df = pd.DataFrame(results)



 Evaluating Episode 1
WER: 0.551

 Evaluating Episode 2
WER: 0.349

 Evaluating Episode 3
WER: 0.389

 Evaluating Episode 4
WER: 0.458

 Evaluating Episode 5
WER: 0.356

 Evaluating Episode 6
WER: 0.609

 Evaluating Episode 7
WER: 0.494

 Evaluating Episode 8
WER: 0.459

 Evaluating Episode 9
WER: 0.462

 Evaluating Episode 10
WER: 0.237

 Evaluating Episode 11
WER: 0.255

 Evaluating Episode 12
WER: 0.563

 Evaluating Episode 13
WER: 0.482

 Evaluating Episode 14
WER: 0.5

 Evaluating Episode 15
WER: 0.428


**DEFINE A WER QUALITY SCALE**

In [None]:
# Map WER value to a qualitative interpretation.
def wer_quality_label(wer_value: float) -> str:
    """Return the quality band that a WER score falls into.

    Each band is defined by an inclusive upper bound; the first band whose
    bound is not exceeded wins. Scores above every bound (and values that
    compare false against all of them, such as NaN) are labelled
    "Very Challenging".
    """
    upper_bounds = (
        (0.25, "Very Good"),
        (0.35, "Good"),
        (0.45, "Moderate"),
        (0.55, "Difficult"),
    )
    for limit, label in upper_bounds:
        if wer_value <= limit:
            return label
    return "Very Challenging"

**VIEW RESULTS**

In [None]:
# Attach the qualitative label for each episode's WER, then display the table.
results_df["quality"] = results_df["wer"].map(wer_quality_label)
results_df

Unnamed: 0,episode_id,wer,reference_words,whisper_words,quality
0,1,0.551,8402,8365,Very Challenging
1,2,0.349,9352,9251,Good
2,3,0.389,8226,8545,Moderate
3,4,0.458,8765,8534,Difficult
4,5,0.356,7965,8684,Moderate
5,6,0.609,8618,8213,Very Challenging
6,7,0.494,7077,7206,Difficult
7,8,0.459,9081,8626,Difficult
8,9,0.462,9248,9048,Difficult
9,10,0.237,7441,7854,Very Good
