In [67]:
from faster_whisper import WhisperModel
from pathlib import Path
import subprocess
import json
import subprocess
import webrtcvad

In [61]:
def has_audio(video_path: str) -> bool:
    """
    Report whether ffprobe finds at least one audio stream in ``video_path``.

    ffprobe is asked to list only audio streams (``-select_streams a``) as
    JSON. The result is True when that listing contains a non-empty
    ``streams`` array.

    Args:
        video_path: Path of the media file to inspect.

    Returns:
        True if one or more audio streams are listed, otherwise False. False
        is also returned when ffprobe prints nothing or prints something that
        is not a JSON object (e.g. the file could not be probed), instead of
        letting a JSON decoding error escape.

    Raises:
        FileNotFoundError: If the ``ffprobe`` executable itself is missing.
    """
    probe = subprocess.run(
        [
            "ffprobe", "-v", "error",
            "-select_streams", "a",
            "-show_entries", "stream=codec_type",
            "-of", "json", video_path
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )

    # A failed probe can leave stdout empty or malformed; treat that as
    # "no audio" rather than crashing the caller.
    try:
        info = json.loads(probe.stdout or "")
    except json.JSONDecodeError:
        return False

    if not isinstance(info, dict):
        return False
    return bool(info.get("streams"))

In [64]:
def _pcm_stream(input_media: str, sr: int = 16000, bandpass: bool = True):
    """
    Decode ``input_media`` with ffmpeg and yield it as raw PCM byte chunks.

    The ffmpeg command drops video (``-vn``), downmixes to one channel,
    resamples to ``sr`` Hz and writes signed 16-bit little-endian samples to
    its stdout, which is read here in pieces of up to 4096 bytes.

    When ``bandpass`` is true, a 100 Hz high-pass and a 3800 Hz low-pass
    filter are chained before the output; otherwise ffmpeg's pass-through
    ``anull`` filter is used.
    """
    # Narrow speech band: trims low rumble/aircon and bright SFX before VAD.
    filters = ["highpass=f=100", "lowpass=f=3800"] if bandpass else []
    filter_arg = ",".join(filters) or "anull"

    ffmpeg_cmd = [
        "ffmpeg", "-nostdin", "-hide_banner", "-loglevel", "error",
        "-i", input_media, "-vn", "-ac", "1", "-ar", str(sr),
        "-af", filter_arg,
        "-f", "s16le", "pipe:1",
    ]
    proc = subprocess.Popen(ffmpeg_cmd, stdout=subprocess.PIPE)
    try:
        # read() returns b"" at end of stream, which ends the iteration.
        for block in iter(lambda: proc.stdout.read(4096), b""):
            yield block
    finally:
        # Runs on normal exhaustion and when the consumer stops early.
        if proc.stdout:
            proc.stdout.close()
        proc.wait()

def detect_speech(
    path: str,
    aggressiveness: int = 3,     # most strict
    frame_ms: int = 10,          # 10 ms frames reduce false positives
    min_speech_ms: int = 1200,    # require sustained speech
    min_speech_ratio: float = 0.05,
    min_consec_frames: int = 12,  # 12 x 10 ms = 120 ms continuous speech
    sample_rate: int = 16000,
    min_consec_runs: int = 2,     # how many sustained runs must be seen
):
    """
    Decide whether a media file contains speech, using webrtcvad with
    stricter rules intended to reject sound effects.

    Audio is streamed from ``_pcm_stream`` (band-pass enabled), cut into
    exact ``frame_ms`` frames of 16-bit samples and classified frame by
    frame. A trailing partial frame is ignored.

    Args:
        path: Media file to analyse.
        aggressiveness: Mode passed to ``webrtcvad.Vad``.
        frame_ms: Frame length in milliseconds; must be 10, 20 or 30.
        min_speech_ms: Minimum total duration of speech frames.
        min_speech_ratio: Minimum fraction of all frames flagged as speech.
        min_consec_frames: Number of back-to-back speech frames that make up
            one "sustained run".
        sample_rate: Sample rate requested from ffmpeg and given to the VAD.
        min_consec_runs: Minimum number of sustained runs required. Defaults
            to 2, the threshold that used to be hard-coded.

    Returns:
        dict with ``has_speech`` (bool), ``speech_frames``, ``total_frames``,
        ``speech_ratio`` and ``consecutive_runs``.

    Raises:
        ValueError: If ``frame_ms`` is not 10, 20 or 30.
    """
    if frame_ms not in (10, 20, 30):
        raise ValueError("frame_ms must be 10, 20, or 30.")

    vad = webrtcvad.Vad(aggressiveness)

    bytes_per_sample = 2  # s16le
    # Integer arithmetic keeps the frame size exact (no float rounding).
    samples_per_frame = int(sample_rate * frame_ms // 1000)
    frame_bytes = samples_per_frame * bytes_per_sample

    total_frames = 0
    speech_frames = 0
    streak = 0          # length of the current run of speech frames
    sustained_runs = 0  # runs that reached min_consec_frames

    buffer = bytearray()
    for chunk in _pcm_stream(path, sr=sample_rate, bandpass=True):
        buffer.extend(chunk)
        # Chunks are arbitrary sizes; peel off exact frames as they fill.
        while len(buffer) >= frame_bytes:
            frame = bytes(buffer[:frame_bytes])
            del buffer[:frame_bytes]

            total_frames += 1
            if vad.is_speech(frame, sample_rate):
                speech_frames += 1
                streak += 1
                # '==' (not '>=') so each run is counted exactly once.
                if streak == min_consec_frames:
                    sustained_runs += 1
            else:
                streak = 0  # a non-speech frame ends the run

    speech_ratio = (speech_frames / total_frames) if total_frames else 0.0
    min_speech_frames = int(min_speech_ms / frame_ms)

    # Final decision must pass ALL gates.
    has_speech = (
        speech_frames >= min_speech_frames and
        speech_ratio >= min_speech_ratio and
        # each run is min_consec_frames * frame_ms long (120 ms by default)
        sustained_runs >= min_consec_runs
    )

    return {
        "has_speech": has_speech,
        "speech_frames": speech_frames,
        "total_frames": total_frames,
        "speech_ratio": speech_ratio,
        "consecutive_runs": sustained_runs
    }

In [None]:
def _srt_timestamp(seconds: float) -> str:
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``)."""
    # Convert to whole milliseconds first: splitting the float directly
    # truncates values such as 1.001 s down to ",000".
    total_ms = max(0, int(round(seconds * 1000)))
    hours, rest = divmod(total_ms, 3600000)
    minutes, rest = divmod(rest, 60000)
    secs, millis = divmod(rest, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def extract_audio_text(video_path: str, save_outputs: bool = False):
    """
    Transcribe the speech in a media file with faster-whisper.

    The file is first checked with ``has_audio`` and ``detect_speech``; the
    Whisper model is only loaded when both checks pass.

    Args:
        video_path: Path of the media file to transcribe.
        save_outputs: When true, also write ``<video stem>.txt`` (stripped
            transcript) and ``<video stem>.srt`` (numbered, timestamped
            segments) next to the input file. Defaults to False, which
            writes nothing.

    Returns:
        The concatenated segment texts, or one of the messages
        ``"Video file has no audio!"`` / ``"Video file doesn't contain speech!"``
        when a pre-check fails (these are returned, not raised).
    """
    if not has_audio(video_path):
        return "Video file has no audio!"

    if not detect_speech(video_path)["has_speech"]:
        return "Video file doesn't contain speech!"

    # A fresh "base" model is created on every call, on CPU with int8 weights.
    model = WhisperModel("base", device="cpu", compute_type="int8")

    segments, info = model.transcribe(
        video_path,
        beam_size=5,
        vad_filter=True,  # let faster-whisper skip non-speech stretches
        vad_parameters=dict(min_silence_duration_ms=500),
        language=None,            # None = auto-detect
        condition_on_previous_text=True,  # better coherence
    )

    print(f"Detected language: {info.language} (prob={info.language_probability:.2f})")

    # Collect the plain text and the matching SRT entries in one pass.
    text_parts = []
    srt_lines = []
    for index, seg in enumerate(segments, start=1):
        text_parts.append(seg.text)
        srt_lines += [
            str(index),
            f"{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}",
            seg.text.strip(),
            "",
        ]

    transcript = "".join(text_parts)

    if save_outputs:
        txt_path = Path(video_path).with_suffix(".txt")
        srt_path = Path(video_path).with_suffix(".srt")
        txt_path.write_text(transcript.strip(), encoding="utf-8")
        srt_path.write_text("\n".join(srt_lines), encoding="utf-8")
        print(f"\nSaved:\n- {txt_path}\n- {srt_path}")

    print("\nTRANSCRIPT:")
    return transcript

In [66]:
# Try the pipeline on a single clip (a sound-effect video, going by its file
# name — presumably a no-speech case; confirm).
# NOTE(review): absolute, machine-specific path; point this at a local file
# before re-running on another machine.
video_path = "/home/ssever/SilentSpeak/data/input_video/Riser - Sound Effect (Free).mp4"
# Bare call as the cell's last expression, so the returned text is shown as
# the cell output.
extract_audio_text(video_path)

"Video file doesn't contain speech"

## **Backup**

In [None]:
def _read_pcm_stream(input_media: str, sample_rate: int = 16000):
    """
    Stream ``input_media`` through ffmpeg as raw audio, unfiltered.

    Yields byte chunks (up to 4096 bytes each) of signed 16-bit
    little-endian, single-channel PCM at ``sample_rate`` Hz.
    """
    ffmpeg_cmd = [
        "ffmpeg",
        "-nostdin",                # never wait on stdin
        "-hide_banner", "-loglevel", "error",
        "-i", input_media,
        "-vn",                     # drop video
        "-ac", "1",                # mono
        "-ar", str(sample_rate),   # resample
        "-f", "s16le", "pipe:1",   # raw PCM to stdout
    ]
    proc = subprocess.Popen(ffmpeg_cmd, stdout=subprocess.PIPE)
    try:
        data = proc.stdout.read(4096)
        while data:
            yield data
            data = proc.stdout.read(4096)
    finally:
        # Release the pipe and reap ffmpeg, even if the consumer stops early.
        proc.stdout.close()
        proc.wait()

def detect_speech_in_media(
    path: str,
    aggressiveness: int = 3,         # mode handed to webrtcvad.Vad
    frame_ms: int = 10,              # must be 10, 20 or 30
    min_speech_ms: int = 2000,       # minimum total speech duration
    min_speech_ratio: float = 0.05,  # at least 5% of frames must be speech
    sample_rate: int = 16000
):
    """
    Run webrtcvad over a media file and report whether it contains speech.

    Audio comes from ``_read_pcm_stream`` and is classified in exact
    ``frame_ms`` frames of 16-bit samples; a trailing partial frame is
    ignored. Speech is reported only when both the absolute duration gate
    (``min_speech_ms``) and the relative gate (``min_speech_ratio``) pass.

    Returns a dict with:
      - has_speech: bool
      - speech_frames: int
      - total_frames: int
      - speech_ratio: float (0.0 when no frame was read)

    Raises ValueError when ``frame_ms`` is not 10, 20 or 30.
    """
    if frame_ms not in (10, 20, 30):
        raise ValueError("webrtcvad supports frame sizes of 10, 20, or 30 ms")

    detector = webrtcvad.Vad(aggressiveness)

    # 2 bytes per 16-bit sample
    frame_len = int(sample_rate * (frame_ms / 1000.0)) * 2

    n_frames = 0
    n_speech = 0
    pending = bytearray()

    for piece in _read_pcm_stream(path, sample_rate=sample_rate):
        pending += piece
        consumed = 0
        # Walk over every complete frame now available, then drop them all
        # from the front of the buffer in one go.
        while len(pending) - consumed >= frame_len:
            window = bytes(pending[consumed:consumed + frame_len])
            consumed += frame_len
            n_frames += 1
            if detector.is_speech(window, sample_rate):
                n_speech += 1
        del pending[:consumed]

    ratio = n_speech / n_frames if n_frames else 0.0
    required_frames = int(min_speech_ms / frame_ms)
    enough_speech = n_speech >= required_frames
    dense_enough = ratio >= min_speech_ratio

    return {
        "has_speech": enough_speech and dense_enough,
        "speech_frames": n_speech,
        "total_frames": n_frames,
        "speech_ratio": ratio
    }