In [None]:
from faster_whisper import WhisperModel
from pathlib import Path

In [None]:
def _format_srt_timestamp(seconds):
    """Render a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``)."""
    # Work in whole milliseconds. Deriving the fraction via float subtraction
    # and truncating turns e.g. 1.001 s into ",000" instead of ",001".
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def extract_audio_text(
    video_path,
    model_size="base",
    device="cpu",
    compute_type="int8",
    save_outputs=False,
):
    """Transcribe the speech in a media file with faster-whisper.

    A new ``WhisperModel`` is created on every call, so repeated calls pay the
    model loading cost each time.

    Args:
        video_path: Input passed to ``WhisperModel.transcribe``. A
            ``pathlib.Path`` is converted to ``str`` first.
        model_size: Whisper model name handed to ``WhisperModel``.
        device: Device string handed to ``WhisperModel`` (default ``"cpu"``).
        compute_type: Compute type handed to ``WhisperModel`` (default ``"int8"``).
        save_outputs: When true, also write ``<video without extension>.txt``
            (plain transcript) and ``.srt`` (numbered, timestamped segments)
            next to the input. Requires ``video_path`` to be a filesystem path.

    Returns:
        str: The text of all segments concatenated in order, without any
        separator and without stripping.
    """
    if isinstance(video_path, Path):
        video_path = str(video_path)

    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    segments, info = model.transcribe(
        video_path,
        beam_size=5,
        vad_filter=True,  # voice-activity detection: skip non-speech audio
        vad_parameters=dict(min_silence_duration_ms=500),
        language=None,  # None = auto-detect
        condition_on_previous_text=True,  # better coherence across segments
    )

    print(f"Detected language: {info.language} (prob={info.language_probability:.2f})")

    # Single pass over the segments: collect the plain text and, only when the
    # caller asked for files, the SRT entries.
    text_parts = []
    srt_lines = []
    for index, seg in enumerate(segments, start=1):
        text_parts.append(seg.text)
        if save_outputs:
            srt_lines += [
                str(index),
                f"{_format_srt_timestamp(seg.start)} --> {_format_srt_timestamp(seg.end)}",
                seg.text.strip(),
                "",
            ]

    transcript = "".join(text_parts)

    if save_outputs:
        # Drop only the final extension and append the new one as text, so a
        # stem that itself contains dots is not truncated further.
        out_base = Path(video_path).with_suffix("")
        Path(f"{out_base}.txt").write_text(transcript.strip(), encoding="utf-8")
        Path(f"{out_base}.srt").write_text("\n".join(srt_lines), encoding="utf-8")
        print(f"\nSaved:\n- {out_base}.txt\n- {out_base}.srt")

    print("\nTRANSCRIPT:")
    return transcript

In [None]:
# Recording to transcribe. NOTE(review): this is an absolute, machine-specific
# path; point it at your own file before running the cell.
video_path = (
    "/home/ssever/SilentSpeak/data/input_video/vlc-record-2025-09-02-01h09m01s-How To Talk To Camera_ The 3 FUNDAMENTALS.mp4-.mp4"
)

# Left as the cell's last expression so the notebook shows the returned transcript.
extract_audio_text(video_path)