In [1]:
# !pip install requests tqdm ffmpeg-python

In [2]:
import ffmpeg
from pathlib import Path

def extract_audio(video_path, output_audio):

    video_path = Path(video_path)
    output_audio = Path(output_audio)

    (
        ffmpeg
        .input(str(video_path))
        .output(
            str(output_audio),
            ac=1,
            ar=16000,
            acodec="flac"
        )
        .overwrite_output()
        .run()
    )


In [3]:
import requests

COLAB_API = "https://afc5-35-240-224-210.ngrok-free.app/transcribe_audio"


def transcribe_gpu(wav_file):

    with open(wav_file, "rb") as f:

        response = requests.post(
            COLAB_API,
            files={"file": f},
            timeout=3600
        )

    response.raise_for_status()

    return response.json()["segments"]


In [4]:
def transcribe_gpu(audio_file):

    with open(audio_file, "rb") as f:

        response = requests.post(
            COLAB_API,
            files={"file": f},
            timeout=3600
        )

    # ðŸ‘‡ SHOW server error details
    if response.status_code != 200:
        print("\n----- SERVER ERROR -----")
        print(response.text)
        print("------------------------\n")
        response.raise_for_status()

    return response.json()["segments"]


In [5]:
def transcribe_gpu(wav_file):
    with open(wav_file, "rb") as f:
        response = requests.post(
            COLAB_API,
            files={"file": f},
            timeout=3600
        )

    if response.status_code != 200:
        print("\n----- SERVER ERROR -----")
        print(response.text)
        print("------------------------\n")

    response.raise_for_status()
    return response.json()["segments"]


In [6]:
def format_time(seconds):

    hrs = int(seconds // 3600)
    mins = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds - int(seconds)) * 1000)

    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"


def write_srt(segments, output_file):

    with open(output_file, "w", encoding="utf-8") as f:

        for i, seg in enumerate(segments, 1):

            f.write(f"{i}\n")
            f.write(
                f"{format_time(seg['start'])} --> "
                f"{format_time(seg['end'])}\n"
            )
            f.write(seg["text"] + "\n\n")


In [7]:
import os

def process_video(video_path):

    base = os.path.splitext(video_path)[0]

    wav_file = base + ".wav"
    srt_file = base + ".srt"

    print("Extracting audio...")
    extract_audio(video_path, wav_file)

    print("Sending audio to GPU...")
    segments = transcribe_gpu(wav_file)

    print("Writing subtitles...")
    write_srt(segments, srt_file)

    os.remove(wav_file)

    print("Done:", srt_file)



In [9]:
from pathlib import Path

video = Path(r"videos\[Anime Time] Attack On Titan - 12_with_subs.mkv")

process_video(video)

Extracting audio...


Error: ffmpeg error (see stderr output for detail)