In [7]:
from pathlib import Path
import subprocess, os, sys
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib
import imageio_ffmpeg as iioff

VIDEO_EXTS = {".mp4", ".mov", ".mkv", ".avi", ".webm", ".m4v"}

In [8]:
FFMPEG = iioff.get_ffmpeg_exe()
def check_ffmpeg():
    try:
        r = subprocess.run([FFMPEG, "-version"], capture_output=True, text=True)
        if r.returncode != 0:
            raise RuntimeError(r.stderr)
    except Exception:
        raise RuntimeError("ffmpeg tidak ditemukan. Install ffmpeg dan pastikan `ffmpeg -version` bisa jalan di terminal.")

check_ffmpeg()
print("ffmpeg OK")


ffmpeg OK


In [14]:
def has_any_video(p: Path) -> bool:
    for ext in VIDEO_EXTS:
        if any(p.rglob(f"*{ext}")):
            return True
    return False

def find_video_dir(split_root: Path) -> Path | None:
    cand = split_root
    if cand.exists() and cand.is_dir() and has_any_video(cand):
        return cand

    # fallback: cari subfolder yang berisi video (skip yang jelas bukan video)
    skip_names = {"annotation", "transcription", "audio", "__macosx"}
    for sub in split_root.iterdir():
        if not sub.is_dir():
            continue
        if sub.name.lower() in skip_names:
            continue
        if has_any_video(sub):
            return sub

    # terakhir: kalau split_root sendiri ada video
    if has_any_video(split_root):
        return split_root

    return None

def safe_outpath(audio_dir: Path, video_path: Path, base_video_dir: Path) -> Path:
    stem = video_path.stem
    out = audio_dir / f"{stem}.wav"
    if not out.exists():
        return out

    rel = str(video_path.relative_to(base_video_dir)).replace("\\", "/")
    h = hashlib.md5(rel.encode("utf-8")).hexdigest()[:8]
    return audio_dir / f"{stem}_{h}.wav"

def extract_one(video_path: Path, out_wav: Path):
    out_wav.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        FFMPEG,
        "-hide_banner",
        "-loglevel", "error",
        "-y",
        "-i", str(video_path),
        "-vn",
        "-ac", "1",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
        str(out_wav),
    ]
    p = subprocess.run(cmd, capture_output=True, text=True)
    if p.returncode == 0 and out_wav.exists() and out_wav.stat().st_size > 0:
        return True, "ok"
    return False, (p.stderr.strip() or "ffmpeg failed")


In [11]:
ROOT = Path.cwd()
print("Root project:", ROOT.parent)

Root project: e:\tugas-akhir-qiqi


In [15]:
# root repo: sesuaikan kalau notebook kamu ada di /code atau root
ROOT = Path.cwd()  # kalau notebook dijalankan dari root project
DATASET_DIR = ROOT.parent / "Dataset"

if not DATASET_DIR.exists():
    raise FileNotFoundError(f"Folder Dataset tidak ditemukan dari {ROOT}. Coba pindah working directory atau ubah ROOT.")

splits = {
    "Train": {"split_root": DATASET_DIR / "Train" / "train" },
    "Val":   {"split_root": DATASET_DIR / "Val" / "val"     },
    "Test":  {"split_root": DATASET_DIR / "Test" / "test"    },
}

jobs = []  # (video_path, out_path)
for split_name, cfg in splits.items():
    split_root = cfg["split_root"]
    if not split_root.exists():
        print(f"[SKIP] {split_name}: {split_root} tidak ada")
        continue

    video_dir = find_video_dir(split_root)
    if video_dir is None:
        print(f"[SKIP] {split_name}: tidak menemukan folder video di {split_root}")
        continue

    audio_dir = split_root / "audio"
    audio_dir.mkdir(parents=True, exist_ok=True)

    videos = []
    for ext in VIDEO_EXTS:
        videos.extend(video_dir.rglob(f"*{ext}"))
    videos = [v for v in videos if v.is_file()]

    print(f"[INFO] {split_name}: video_dir={video_dir} | total={len(videos)} | out={audio_dir}")

    for vp in videos:
        out = safe_outpath(audio_dir, vp, video_dir)
        if out.exists() and out.stat().st_size > 0:
            continue
        jobs.append((vp, out))

print(f"[RUN] total job baru = {len(jobs)}")

max_workers = min(12, (os.cpu_count() or 4) * 2)
ok = fail = 0

with ThreadPoolExecutor(max_workers=max_workers) as ex:
    futures = [ex.submit(extract_one, vp, out) for vp, out in jobs]
    for fut in as_completed(futures):
        success, msg = fut.result()
        if success:
            ok += 1
        else:
            fail += 1
            # kalau mau lebih detail, bisa print path juga (butuh ubah extract_one return video_path)
            print("[FAIL]", msg)

print(f"[DONE] Success={ok} Failed={fail}")
print("Output:")
print(" - Dataset/Train/audio")
print(" - Dataset/Val/audio")
print(" - Dataset/Test/audio")


[INFO] Train: video_dir=e:\tugas-akhir-qiqi\Dataset\Train\train | total=6000 | out=e:\tugas-akhir-qiqi\Dataset\Train\train\audio
[INFO] Val: video_dir=e:\tugas-akhir-qiqi\Dataset\Val\val | total=2000 | out=e:\tugas-akhir-qiqi\Dataset\Val\val\audio
[INFO] Test: video_dir=e:\tugas-akhir-qiqi\Dataset\Test\test | total=2000 | out=e:\tugas-akhir-qiqi\Dataset\Test\test\audio
[RUN] total job baru = 10000
[DONE] Success=10000 Failed=0
Output:
 - Dataset/Train/audio
 - Dataset/Val/audio
 - Dataset/Test/audio
