In [None]:
import os
import json
from tqdm import tqdm
import soundfile as sf
from moviepy import VideoFileClip, AudioFileClip
from transformers import pipeline

VIDEO_DIR = "/home/BOS_Videos"
OUTPUT_JSON = "all_transcription.json"

def to_wav(input_video, output_wav):
    """Extract the audio track of a video into a 16 kHz, 16-bit, mono WAV file.

    Args:
        input_video: Path of the video file to read.
        output_wav: Path of the WAV file to write.

    Raises:
        RuntimeError: If the video cannot be opened, has no audio track, or
            the audio cannot be written. The original exception is attached
            as ``__cause__``.
    """
    try:
        clip = VideoFileClip(input_video)
        try:
            audio = clip.audio
            if audio is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise ValueError("video has no audio track")
            audio.write_audiofile(
                output_wav,
                fps=16000,           # 16kHz
                nbytes=2,            # 16-bit
                buffersize=2000,
                codec='pcm_s16le',   # WAV format
                ffmpeg_params=["-ac", "1"]  # Mono channel
            )
        finally:
            # Always release the clip, even when extraction fails midway.
            clip.close()
    except Exception as e:
        raise RuntimeError(f"Failed to extract audio from {input_video}: {e}") from e

# Build the speech-recognition pipeline once; it is reused for every video.
_asr_options = {
    "model": "openai/whisper-large-v3-turbo",
    "chunk_length_s": 30,
    "batch_size": 4,
    "device": 0,  # set to -1 if you want to use CPU
    "generate_kwargs": {"task": "transcribe"},
}
asr_pipe = pipeline("automatic-speech-recognition", **_asr_options)

VIDEO_EXTENSIONS = (".mp4", ".mkv", ".mov", ".webm")

# Collect every video under VIDEO_DIR up front so a single progress bar
# reflects the real amount of work (non-video files are not counted) and the
# processing order is deterministic.
video_paths = sorted(
    os.path.join(root, fn)
    for root, _, files in os.walk(VIDEO_DIR)
    for fn in files
    if fn.lower().endswith(VIDEO_EXTENSIONS)
)

results = {}
failed = []
for video_path in tqdm(video_paths, desc="Processing videos"):
    # Key by the path relative to VIDEO_DIR: for files directly inside
    # VIDEO_DIR this is just the filename, while same-named files in different
    # subdirectories no longer overwrite each other.
    key = os.path.relpath(video_path, VIDEO_DIR)
    wav_path = os.path.splitext(video_path)[0] + ".wav"

    try:
        # Step 1: Convert to WAV
        to_wav(video_path, wav_path)

        # Step 2: Read WAV into memory using soundfile
        audio_data, sample_rate = sf.read(wav_path)
        if audio_data.ndim > 1:
            # Downmix (frames, channels) data to a single channel.
            audio_data = audio_data.mean(axis=1)

        # Step 3: Transcribe using the pipeline. The sampling rate is passed
        # explicitly instead of letting the pipeline assume one for a bare array.
        out = asr_pipe({"raw": audio_data, "sampling_rate": sample_rate})
        results[key] = out

    except Exception as e:
        # Best effort: record the failure and continue with the next video.
        failed.append(key)
        print(f"⚠️ Error processing {key}: {e}")

# Save transcriptions to JSON
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"✅ Done. Transcribed {len(results)} videos into: {OUTPUT_JSON}")
if failed:
    print(f"⚠️ {len(failed)} video(s) failed: {', '.join(failed)}")


Device set to use cuda:0
Processing videos:   0%|          | 0/49 [00:00<?, ?it/s]

MoviePy - Writing audio in /home/sarmistha/ACM-MM/BOS_Videos/M-QFGiwUCPA_2.wav



chunk:   0%|          | 0/2412 [00:00<?, ?it/s, now=None][A
chunk:  11%|█         | 254/2412 [00:00<00:00, 2535.52it/s, now=None][A
chunk:  21%|██        | 508/2412 [00:00<00:00, 2487.79it/s, now=None][A
chunk:  31%|███▏      | 757/2412 [00:00<00:00, 2445.93it/s, now=None][A
chunk:  42%|████▏     | 1002/2412 [00:00<00:00, 2429.25it/s, now=None][A
chunk:  52%|█████▏    | 1245/2412 [00:00<00:00, 2390.65it/s, now=None][A
chunk:  62%|██████▏   | 1485/2412 [00:00<00:00, 2379.31it/s, now=None][A
chunk:  71%|███████▏  | 1723/2412 [00:00<00:00, 2351.67it/s, now=None][A
chunk:  81%|████████  | 1959/2412 [00:00<00:00, 2335.58it/s, now=None][A
chunk:  91%|█████████ | 2193/2412 [00:00<00:00, 2286.30it/s, now=None][A
Processing videos:   0%|          | 0/49 [00:01<?, ?it/s]             [A

MoviePy - Done.


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing videos:   2%|▏         | 1/49 [00:17<13:55, 17.41s/it]

MoviePy - Writing audio in /home/sarmistha/ACM-MM/BOS_Videos/Izw-xaVkO0g_4.wav



chunk:   0%|          | 0/743 [00:00<?, ?it/s, now=None][A
chunk:  31%|███       | 232/743 [00:00<00:00, 2316.54it/s, now=None][A
chunk:  63%|██████▎   | 468/743 [00:00<00:00, 2339.74it/s, now=None][A
chunk:  94%|█████████▍| 702/743 [00:00<00:00, 2324.97it/s, now=None][A
Processing videos:   2%|▏         | 1/49 [00:17<13:55, 17.41s/it]   [A

MoviePy - Done.


Processing videos:   4%|▍         | 2/49 [00:23<08:16, 10.56s/it]

MoviePy - Writing audio in /home/sarmistha/ACM-MM/BOS_Videos/4VFGSeKelwA_4.wav



chunk:   0%|          | 0/1058 [00:00<?, ?it/s, now=None][A
chunk:  25%|██▌       | 266/1058 [00:00<00:00, 2620.68it/s, now=None][A
chunk:  50%|█████     | 529/1058 [00:00<00:00, 2487.60it/s, now=None][A
chunk:  74%|███████▎  | 779/1058 [00:00<00:00, 2322.48it/s, now=None][A
chunk:  96%|█████████▌| 1013/1058 [00:00<00:00, 2307.32it/s, now=None][A
Processing videos:   4%|▍         | 2/49 [00:23<08:16, 10.56s/it]     [A

MoviePy - Done.


Processing videos:   6%|▌         | 3/49 [00:30<07:04,  9.23s/it]

MoviePy - Writing audio in /home/sarmistha/ACM-MM/BOS_Videos/6IiEoSHw9gY_1.wav



chunk:   0%|          | 0/2894 [00:00<?, ?it/s, now=None][A
chunk:   9%|▉         | 265/2894 [00:00<00:00, 2646.81it/s, now=None][A
chunk:  18%|█▊        | 530/2894 [00:00<00:00, 2522.15it/s, now=None][A
chunk:  27%|██▋       | 783/2894 [00:00<00:00, 2424.38it/s, now=None][A
chunk:  35%|███▌      | 1026/2894 [00:00<00:00, 2379.08it/s, now=None][A
chunk:  44%|████▍     | 1274/2894 [00:00<00:00, 2413.78it/s, now=None][A
chunk:  53%|█████▎    | 1520/2894 [00:00<00:00, 2427.03it/s, now=None][A
chunk:  61%|██████    | 1767/2894 [00:00<00:00, 2423.88it/s, now=None][A
chunk:  70%|██████▉   | 2015/2894 [00:00<00:00, 2437.89it/s, now=None][A
chunk:  78%|███████▊  | 2259/2894 [00:00<00:00, 2415.87it/s, now=None][A
chunk:  86%|████████▋ | 2501/2894 [00:01<00:00, 2381.93it/s, now=None][A
chunk:  95%|█████████▍| 2740/2894 [00:01<00:00, 2350.51it/s, now=None][A
Processing videos:   6%|▌         | 3/49 [00:32<07:04,  9.23s/it]     [A

MoviePy - Done.


Processing videos:   8%|▊         | 4/49 [00:53<10:54, 14.55s/it]

MoviePy - Writing audio in /home/sarmistha/ACM-MM/BOS_Videos/lGHGzU3CtZg_5.wav



chunk:   0%|          | 0/31770 [00:00<?, ?it/s, now=None][A
chunk:   1%|          | 256/31770 [00:00<00:12, 2557.26it/s, now=None][A
chunk:   2%|▏         | 512/31770 [00:00<00:12, 2481.62it/s, now=None][A
chunk:   2%|▏         | 761/31770 [00:00<00:12, 2419.25it/s, now=None][A
chunk:   3%|▎         | 1004/31770 [00:00<00:12, 2380.08it/s, now=None][A
chunk:   4%|▍         | 1243/31770 [00:00<00:13, 2302.72it/s, now=None][A
chunk:   5%|▍         | 1474/31770 [00:00<00:13, 2282.92it/s, now=None][A
chunk:   5%|▌         | 1703/31770 [00:00<00:13, 2277.12it/s, now=None][A
chunk:   6%|▌         | 1931/31770 [00:00<00:13, 2257.55it/s, now=None][A
chunk:   7%|▋         | 2157/31770 [00:00<00:13, 2248.11it/s, now=None][A
chunk:   7%|▋         | 2382/31770 [00:01<00:13, 2231.59it/s, now=None][A
chunk:   8%|▊         | 2606/31770 [00:01<00:13, 2232.57it/s, now=None][A
chunk:   9%|▉         | 2831/31770 [00:01<00:12, 2226.55it/s, now=None][A
chunk:  10%|▉         | 3059/31770 [00:0

MoviePy - Done.
