# In [None]:
import whisper
from pathlib import Path

# Input audio and the two transcript outputs (all relative to the working directory)
audio_path = Path("../data/raw/commentary.wav")
full_txt_path = Path("../data/processed/commentary_full.txt")
segments_txt_path = Path("../data/processed/commentary_segments.txt")

# Create the parent directory of every output file up front;
# directories that already exist are left untouched.
for output_path in (full_txt_path, segments_txt_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Fail fast on a missing input: without this check the problem only surfaces
# after the model has been loaded, as an error raised from inside transcribe().
if not audio_path.is_file():
    raise FileNotFoundError(f"Audio file not found: {audio_path}")

# Load the "medium" Whisper model
model = whisper.load_model("medium")

# Transcribe the audio; the language is given explicitly as English.
# The result is indexed below by "text" (full transcript) and "segments".
result = model.transcribe(str(audio_path), language="en")

# Write the complete transcript to disk as UTF-8 in a single call
full_txt_path.write_text(result["text"], encoding="utf-8")
print(f"Full transcript saved to {full_txt_path}")

# Save segmented transcript: one "[start -> end] text" line per segment,
# with start/end formatted to two decimal places.
with open(segments_txt_path, "w", encoding="utf-8") as f:
    for segment in result["segments"]:
        start = segment['start']
        end = segment['end']
        # Segment text typically carries leading whitespace; strip it so the
        # line does not get a double space after the timestamp bracket.
        text = segment['text'].strip()
        f.write(f"[{start:.2f}s -> {end:.2f}s] {text}\n")
print(f"Segmented transcript saved to {segments_txt_path}")


