<a href="https://colab.research.google.com/github/pneuly/whisper-asr-colab/blob/develop/whisper_asr_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import logging
import threading
import locale
from joblib import Parallel, delayed
from google.colab import files

# Force a UTF-8 preferred encoding for this session. The replacement keeps the
# optional ``do_setlocale`` parameter of the stdlib function so that callers
# using ``locale.getpreferredencoding(False)`` do not fail with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# Settings for the "uv pip install" commands run later in this notebook.
os.environ["UV_PRERELEASE"] = "if-necessary-or-explicit"
os.environ["UV_SYSTEM_PYTHON"] = "true"
os.environ["UV_NO_PROGRESS"] = "true"
# Remove these constraint variables if the environment already defines them.
for var in ["UV_BUILD_CONSTRAINT", "UV_CONSTRAINT"]:
    os.environ.pop(var, None)

# Avoid unnecessary downgrades: list the NVIDIA CUDA packages (without version
# pins) in an override file and point uv at it through UV_OVERRIDE.
with open("override.txt", "w", encoding="utf-8") as f:
    # A single write() call; writelines() on a plain string would iterate it
    # one character at a time.
    f.write("""nvidia-cudnn-cu12
nvidia-cublas-cu12
nvidia-cuda-cupti-cu12
nvidia-cuda-nvrtc-cu12
nvidia-cuda-runtime-cu12
nvidia-cufft-cu12
nvidia-curand-cu12
nvidia-cusolver-cu12
nvidia-cusparse-cu12
nvidia-nvjitlink-cu12
nvidia-nccl-cu12
""")
os.environ["UV_OVERRIDE"] = "override.txt"

# @title Faster-Whisper Implementation on Google Colab{ display-mode: "form" }
audio = 'https://www.youtube.com/live/UaDDdgpTch0'  # @param {type:"string"}
download_format = "mp3"  # @param ["", "mp3", "m4a", "aac", "vorbis", "opus", "wav"] {allow-input: true}
model_size = "large-v3-turbo" # @param ["large-v3-turbo", "large-v3", "large-v2", "large", "medium", "small", "base", "tiny"] {allow-input: true}
diarization = True  # @param {type:"boolean"}
HUGGING_FACE_TOKEN = "" # @param {type:"string"}
password = ""  # @param {type:"string"}
start_time = ""  # @param {type:"string"}
end_time = ""  # @param {type:"string"}
timestamp_offset = ""  # @param {type:"string"}
skip_silence = True  # @param {type:"boolean"}
# The settings below are not exposed as form fields; edit them here directly.
hotwords = "次に、これです。"
batch_size = 1
realtime = False
initial_prompt = ""
prefix = None
vad_filter = False
# Paths of the result files gathered during the run and downloaded at the end.
files_to_download = []

# ----- main routine ------
# This notebook's own logger reports at DEBUG level, while the root logger is
# limited to WARNING.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.WARNING)
# Add a stream handler only when no handler is reachable from this logger.
if logger.hasHandlers() is False:
    logger.addHandler(logging.StreamHandler())

# Diarization needs a Hugging Face token; fail before anything is installed
# or downloaded. A whitespace-only value counts as missing.
if diarization and not HUGGING_FACE_TOKEN.strip():
    raise ValueError(
        "Input your token to HUGGING_FACE_TOKEN "
        "(https://huggingface.co/pyannote/speaker-diarization)"
    )

# No path or URL given: ask for an upload and use the first uploaded file name.
if audio == "":
    uploaded = list(files.upload())
    if not uploaded:
        # Nothing was uploaded (e.g. the dialog was cancelled).
        raise ValueError("No audio file was uploaded.")
    audio = uploaded[0]

!uv pip install git+https://github.com/pneuly/whisper-asr-colab.git@develop
from whisper_asr_colab.audio import Audio # noqa: E402
from faster_whisper.utils import download_model

# Replace the raw path/URL string with an Audio object built from it.
audio = Audio().from_path_or_url(audio)
# Copy over only the options that were actually filled in; empty values leave
# the Audio object's own settings untouched.
for _option, _value in (
    ("start_time", start_time),
    ("end_time", end_time),
    ("download_format", download_format),
    ("password", password),
):
    if _value:
        setattr(audio, _option, _value)
audio.verify_upload = False
audio._load_audio()

def _transcribe():
    """Transcribe ``audio`` with a ``Worker`` and return its results.

    The worker is configured from the notebook-level settings (model size,
    prompts, hotwords, batch size, ...); the language is fixed to Japanese.
    ``timestamp_offset`` is applied only when it is non-empty.

    Returns:
        Whatever ``Worker.run()`` returns; the caller extends
        ``files_to_download`` with it.
    """
    # Imported here rather than at the top of the notebook so the import
    # happens wherever this function is actually executed.
    from whisper_asr_colab.worker import Worker
    worker = Worker(
        audio=audio,
        model_size=model_size,
        language="ja",
        diarization=diarization,
        password=password,
        initial_prompt=initial_prompt,
        realtime=realtime,
        batch_size=batch_size,
        hugging_face_token=HUGGING_FACE_TOKEN,
        hotwords=hotwords,
        prefix=prefix,
        # Honour the notebook-level setting instead of a hard-coded False.
        vad_filter=vad_filter,
        skip_silence=skip_silence,
    )
    if timestamp_offset:
        worker.timestamp_offset = timestamp_offset
    results = worker.run()
    # gc GPU RAM
    del worker
    return results

def _diarize():
    !uv pip install pyannote.audio
    from whisper_asr_colab.worker import Diarizer # noqa: E402
    from torch.cuda import empty_cache
    empty_cache()
    diarizer = Diarizer(
        audio=audio,
        hugging_face_token = HUGGING_FACE_TOKEN,
        )
    diarizer.diarize()
    return diarizer.integrate()

def _convert_audio_if_needed():
    input_path = audio.file_path
    if not input_path.endswith(download_format):
        !uv pip install ffmpeg-python
        import ffmpeg
        print(f"Converting {input_path} to mp3 format.")
        base, _ = os.path.splitext(input_path)
        output_path = base + '.' + download_format
        ffmpeg.run(
            ffmpeg.input(input_path).output(output_path),
            overwrite_output=True
        )
        return output_path
    return input_path

# Set by the main routine to make the log-tailing thread exit.
stop_event = threading.Event()


def _tail_log_file():
    """Echo lines appended to ``diarization_progress.txt`` until stopped.

    While the file does not exist yet, a waiting message is printed every five
    seconds. Once it exists, it is read line by line from the beginning; when
    no new line is available the reader sleeps for half a second and retries.
    The function returns as soon as ``stop_event`` is observed to be set.
    """
    log_name = "diarization_progress.txt"
    while True:
        if stop_event.is_set():
            return
        if os.path.exists(log_name):
            with open(log_name, "r") as handle:
                while not stop_event.is_set():
                    text = handle.readline()
                    if not text:
                        # Nothing new yet; poll again shortly.
                        time.sleep(0.5)
                        continue
                    print(text.strip(), flush=True)
        else:
            print("Waiting for transcribing to begin.")
            time.sleep(5)

if __name__ == "__main__":
    # Initializing FasterWhisper and pyannote.audio in the same
    # namespace causes a crash or errors.
    # To isolate them, FasterWhisper is launched in a separate
    # process using joblib.
    # Since multiprocessing is unreliable in Jupyter environments,
    # joblib's 'loky' backend is required.
    # However, loky suppresses stdout/stderr from child processes,
    # so some workarounds are implemented.

    # Fetch the model in this process before the worker process starts.
    download_model(model_size)
    # Workaround for the suppressed child output: echo the progress file from
    # a background thread while the transcription runs.
    threading.Thread(target=_tail_log_file, daemon=True).start()
    try:
        result = Parallel(n_jobs=2, backend="loky", verbose=5)([delayed(_transcribe)()])
    finally:
        # Always stop the tailing thread, even when transcription fails;
        # otherwise it would keep polling and printing in the live kernel.
        stop_event.set()
    # Parallel returns one entry per submitted task; only one task was queued.
    files_to_download.extend(result[0])

    if diarization:
        result_files = _diarize()
        files_to_download.extend(result_files)

    if audio.url:
        # Add audio file to files_to_download
        audio_file = _convert_audio_if_needed()
        files_to_download.append(audio_file)

    # Download files
    from whisper_asr_colab.utils import download_from_colab
    for file in files_to_download:
        print(f"Downloading {file}")
        download_from_colab(file)
