<a href="https://colab.research.google.com/github/sodonne6/Data_Augmentation_for_AVSR/blob/main/Data_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TCD-TIMIT → AV-HuBERT: Data Prep & Manifests

This notebook:
1) mounts Drive & sets paths  
2) (optional) resamples audio → 16 kHz  
3) extracts 88×88 lip crops from provided `.mat` ROI files  
4) counts audio samples & video frames  
5) builds **A+V 5-column manifests** (train/valid)  
6) ensures `.ltr` labels are aligned with manifests  
7) creates/updates `dict.ltr.txt`  
8) runs consistency checks

You can run it for one speaker as a smoke test, then reuse for larger batches.


In [None]:
# Mount + config
# Mounts Google Drive and defines the paths / parameters shared by the cells below.
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path

# === YOUR ROOTS ===
# Dataset root on the mounted Drive; every speaker folder is resolved as ROOT / <speaker>.
ROOT = Path("/content/drive/MyDrive/tcdtimit")
SPEAKERS = [
    "volunteers/01M",      # add more later
]

# Inputs (sub-paths relative to each speaker folder)
VID25_SUB = "Clips/Video25/straightcam"     # 25 fps MP4s
MAT_SUB   = "Clips/Landmarks/straightcam"   # .mat ROI files (the ROI loader cell reads their 'ROIs' variable)

# Outputs (sub-paths relative to each speaker folder, except DEBUG_DIR)
OUT_MP4_SUB = "Clips/video/straightcam"     # where mouth ROI MP4s go
DEBUG_DIR   = ROOT / "_debug_overlays"      # where overlays go
MANIFEST_SUB= "Clips/manifests"             # where {train|valid|test}.tsv will live (later)

# Cropping params
CROP_SIZE = 88      # side length (pixels) of the square lip crop
FPS      = 25       # target video frame rate

# (fine-tune if needed)
# NOTE(review): the three values below are not referenced by the cells visible in this
# notebook export - confirm whether they are still needed.
MOUTH_Y_BIAS = 14   # pixels to nudge crop downward
EXPAND       = 1.35 # enlarge bbox around polygon
MIN_WH       = 28   # minimum box size before resize

# NOTE(review): later cells also use RAW_VIDEO_SUB, RAW_AUDIO_SUB, AUDIO16_SUB, CROPS_SUB,
# TXT_SUB, PAD, REMOVE_STRAY_JPGS, CLEAN_UTT_DIRS and WRITE_DEBUG_OVERLAYS, none of which is
# assigned in this cell - presumably they are defined elsewhere; verify before running.

print("ROOT:", ROOT)
print("SPEAKERS:", SPEAKERS)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ROOT: /content/drive/MyDrive/tcdtimit
Speakers: ['volunteers/01M']
ROOT: /content/drive/MyDrive/tcdtimit
SPEAKERS: ['volunteers/01M']


In [None]:
# @title Utilities
import os, re, json, math, shutil, subprocess, random
from pathlib import Path
from typing import List, Tuple, Dict, Optional
import numpy as np
import cv2
from scipy.io import loadmat

def run(cmd:list, check=True):
    """Run an external command and return its combined stdout/stderr as text.

    Args:
        cmd: Argument vector. Elements may be strings or path-like objects;
            each one is converted with ``str()`` before the process is spawned.
        check: When true, a non-zero exit status prints the captured output and
            raises ``RuntimeError``. When false, the output is returned regardless.

    Returns:
        The captured output (stderr is merged into stdout).

    Raises:
        RuntimeError: The command exited non-zero and ``check`` is true.
    """
    # Stringify once so the echo, the process arguments and the error message
    # all agree (joining raw Path objects would raise TypeError instead).
    args = [str(part) for part in cmd]
    printable = " ".join(args)
    print("$", printable)
    r = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    if r.returncode != 0 and check:
        print(r.stdout)
        raise RuntimeError(f"command failed: {printable}")
    return r.stdout

def list_utts(dirpath:Path, ext:str) -> Dict[str, Path]:
    """Return dict {utt_id -> file} for files with given ext (case-insensitive).

    Args:
        dirpath: Directory to scan (direct children only, no recursion).
        ext: Extension without the leading dot, e.g. ``"mp4"``. Any mix of
            upper/lower case in the file name matches (``.mp4``, ``.MP4``, ``.Mp4``).

    Returns:
        Mapping from the lower-cased file stem to its path. If two entries share
        a stem after lower-casing, the one that sorts last wins. A missing
        directory yields an empty dict.
    """
    if not dirpath.is_dir():
        return {}
    suffix = "." + ext.lower()
    d = {}
    for p in sorted(dirpath.iterdir()):
        # Compare on the lower-cased name so mixed-case extensions are not missed.
        if p.name.lower().endswith(suffix):
            d[p.stem.lower()] = p
    return d

def ensure_dir(p:Path):
    """Create directory ``p`` together with any missing parents.

    Does nothing when the directory already exists.
    """
    p.mkdir(exist_ok=True, parents=True)

def clean_dir(p:Path):
    """Delete everything inside directory ``p`` while keeping ``p`` itself.

    Real sub-directories are removed recursively. Every other entry - regular
    files and symlinks, including dangling ones or links that point at a
    directory - is unlinked, so a link's target is never touched.
    Does nothing when ``p`` does not exist.
    """
    if not p.exists():
        return
    for c in p.iterdir():
        # shutil.rmtree refuses symlinks, so only recurse into real directories.
        if c.is_dir() and not c.is_symlink():
            shutil.rmtree(c)
        else:
            c.unlink()

def rm_stray_jpgs(base:Path):
    """Remove ``*.jpg`` / ``*.JPG`` files that sit directly inside ``base``.

    Files in sub-folders are left alone. Prints how many files were removed;
    returns silently (without printing) when ``base`` does not exist.
    """
    if base.exists():
        removed = 0
        for pattern in ("*.jpg", "*.JPG"):
            for stray in base.glob(pattern):
                stray.unlink()
                removed += 1
        print(f"[cleanup] removed {removed} stray JPGs in {base}")


In [None]:
# @title Convert videos to 25fps MP4 (ffmpeg)
# Re-encode every raw MP4 of each speaker to a constant 25 fps H.264 MP4.
# Existing non-empty outputs are skipped so the cell can be re-run cheaply.
for spk in SPEAKERS:
    src = ROOT / spk / RAW_VIDEO_SUB
    dst = ROOT / spk / VID25_SUB
    ensure_dir(dst)
    if not src.exists():
        print(f"[WARN] Missing raw video folder: {src}")
        continue

    converted=0; skipped=0
    for mp4 in sorted(src.glob("*.mp4")):
        outp = dst/mp4.name
        if outp.exists() and outp.stat().st_size>0:
            skipped+=1
            continue
        # Encode into a temporary sibling and rename only on success: an
        # interrupted or failed ffmpeg run would otherwise leave a partial,
        # non-empty file that the skip check above accepts on the next run.
        tmp = outp.with_name(outp.name + ".part")
        # re-encode with constant frame rate 25, keep size, sane quality
        cmd = [
            "ffmpeg","-y","-hide_banner","-loglevel","error",
            "-i", str(mp4),
            "-r", str(FPS),
            "-an",        # we don't need the container audio track here
            "-c:v","libx264","-preset","medium","-crf","20",
            "-f","mp4",   # container must be explicit because tmp does not end in .mp4
            str(tmp)
        ]
        try:
            run(cmd)
            tmp.replace(outp)
        finally:
            # Only still present when the encode failed or was interrupted.
            if tmp.exists():
                tmp.unlink()
        converted+=1
    print(f"[video] {spk}: converted={converted} skipped={skipped} -> {dst}")


In [None]:
# @title Convert audio to 16k mono WAV (ffmpeg)
# Convert every raw WAV of each speaker to 16 kHz mono, writing lower-cased
# file names. Existing non-empty outputs are skipped so re-runs are cheap.
for spk in SPEAKERS:
    src = ROOT / spk / RAW_AUDIO_SUB
    dst = ROOT / spk / AUDIO16_SUB
    ensure_dir(dst)
    if not src.exists():
        print(f"[WARN] Missing raw audio folder: {src}")
        continue

    converted=0; skipped=0
    # The two patterns can return the same file on a case-insensitive
    # filesystem; the set union keeps each input exactly once.
    for wav in sorted(set(src.glob("*.wav")) | set(src.glob("*.WAV"))):
        outp = dst/wav.name.lower()  # normalize name
        if outp.exists() and outp.stat().st_size>0:
            skipped+=1; continue
        # Write to a temporary sibling and rename only on success, so an
        # interrupted run never leaves a truncated WAV that would be skipped later.
        tmp = outp.with_name(outp.name + ".part")
        cmd = [
            "ffmpeg","-y","-hide_banner","-loglevel","error",
            "-i", str(wav),
            "-ac","1","-ar","16000",
            "-f","wav",   # format must be explicit because tmp does not end in .wav
            str(tmp)
        ]
        try:
            run(cmd)
            tmp.replace(outp)
        finally:
            # Only still present when the conversion failed or was interrupted.
            if tmp.exists():
                tmp.unlink()
        converted+=1
    print(f"[audio] {spk}: converted={converted} skipped={skipped} -> {dst}")


In [None]:
# @title ROI loader for TCD-TIMIT .mat (mask-per-frame -> bounding boxes)
def load_roi_bboxes(mat_file:Path) -> List[Tuple[int,int,int,int]]:
    """Turn the per-frame masks stored under ``'ROIs'`` into bounding boxes.

    Every entry of ``ROIs`` is treated as a 2D mask whose positive values mark
    the region of interest. For each frame the tight box ``(x, y, w, h)``
    around the positive pixels is returned. A frame with an empty mask repeats
    the previous frame's box, or gets the placeholder ``(0, 0, 1, 1)`` when no
    box has been seen yet.

    Raises:
        ValueError: The file has no ``'ROIs'`` variable.
    """
    contents = loadmat(str(mat_file))
    if 'ROIs' not in contents:
        raise ValueError(f"Missing 'ROIs' in {mat_file.name}; keys: {list(contents.keys())}")

    # Flattening covers both layouts, (1, N) and (N,), in the same frame order.
    masks = list(contents['ROIs'].ravel())

    boxes = []
    previous = None
    for frame_mask in masks:
        binary = (np.asarray(frame_mask) > 0).astype(np.uint8)

        if not binary.any():
            # Nothing marked in this frame: fall back to the last known box.
            boxes.append((0,0,1,1) if previous is None else previous)
            continue

        rows, cols = np.nonzero(binary)
        left, top = int(cols.min()), int(rows.min())
        width = max(1, int(cols.max()) - left + 1)
        height = max(1, int(rows.max()) - top + 1)
        previous = (left, top, width, height)
        boxes.append(previous)
    return boxes


In [None]:
# @title Extract 88x88 lip crops from 25fps video using .mat ROIs

def clamp_bbox(x,y,w,h, W,H):
    """Fit box ``(x, y, w, h)`` inside an image of width ``W`` and height ``H``.

    The origin is moved into ``[0, W-1] x [0, H-1]``; the size is then limited
    to what remains to the right of / below that origin and is never smaller
    than one pixel. Returns the adjusted ``(x, y, w, h)``.
    """
    left = max(0, min(x, W - 1))
    top = max(0, min(y, H - 1))
    width = max(1, min(w, W - left))
    height = max(1, min(h, H - top))
    return left, top, width, height

def write_debug_overlay(vid_first_frame, bboxes, out_jpg):
    """Save a copy of ``vid_first_frame`` with a few boxes drawn on it.

    The first, middle and last entries of ``bboxes`` (each ``(x, y, w, h)``)
    are outlined in red; nothing is drawn when ``bboxes`` is empty. A
    single-channel frame is converted to BGR first. The image is written to
    ``out_jpg``.
    """
    canvas = vid_first_frame.copy()
    if canvas.ndim == 2:
        canvas = cv2.cvtColor(canvas, cv2.COLOR_GRAY2BGR)

    # Sample boxes from the start, middle and end of the clip.
    count = len(bboxes)
    picks = [0, count // 2, count - 1] if bboxes else []
    for pick in picks:
        bx, by, bw, bh = bboxes[pick]
        cv2.rectangle(canvas, (bx, by), (bx + bw, by + bh), (0,0,255), 3)
    cv2.imwrite(str(out_jpg), canvas)

# For every speaker: pair each .mat ROI file with the 25 fps video of the same
# utterance id, crop each frame to its (padded, clamped) ROI box, convert to
# grayscale, resize to CROP_SIZE x CROP_SIZE and save as <crops_root>/<utt>/<frame>.jpg.
# NOTE(review): CROPS_SUB, REMOVE_STRAY_JPGS, CLEAN_UTT_DIRS, PAD and
# WRITE_DEBUG_OVERLAYS are not assigned in this cell - presumably defined elsewhere.
for spk in SPEAKERS:
    mat_dir = ROOT / spk / MAT_SUB
    vid25_dir = ROOT / spk / VID25_SUB
    crops_root = ROOT / spk / CROPS_SUB
    ensure_dir(crops_root)
    if REMOVE_STRAY_JPGS: rm_stray_jpgs(crops_root)

    # Maps to find by id
    vids = list_utts(vid25_dir, "mp4")
    mats = list_utts(mat_dir,  "mat")

    # ok counts utterances that reached the end of processing; bad counts videos
    # that could not be opened.
    ok=bad=0
    for utt, mat_path in mats.items():
        if utt not in vids:
            print(f"[skip] no 25fps video for {utt}")
            continue
        vid_path = vids[utt]

        # per-utter dir
        out_dir = crops_root / utt
        ensure_dir(out_dir)
        if CLEAN_UTT_DIRS:
            # Empty the folder so crops from an earlier run cannot linger.
            clean_dir(out_dir)
            out_dir.mkdir(parents=True, exist_ok=True)

        # load bboxes from .mat
        bboxes = load_roi_bboxes(mat_path)

        cap = cv2.VideoCapture(str(vid_path))
        if not cap.isOpened():
            print(f"[ERR] open video fail: {vid_path}")
            bad+=1; continue

        # Frame size, used to keep the padded box inside the image.
        W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        frame_idx = 0
        wrote = 0
        first_frame_img = None

        while True:
            ret, frame = cap.read()
            if not ret: break
            # Stop when the video has more frames than the .mat has boxes.
            if frame_idx >= len(bboxes): break

            x,y,w,h = bboxes[frame_idx]
            # pad in video coords
            x -= PAD; y -= PAD; w += 2*PAD; h += 2*PAD
            x,y,w,h = clamp_bbox(x,y,w,h,W,H)

            # crop & to gray
            crop = frame[y:y+h, x:x+w]
            gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(gray, (CROP_SIZE, CROP_SIZE), interpolation=cv2.INTER_AREA)

            cv2.imwrite(str(out_dir / f"{frame_idx:05d}.jpg"), resized)
            # Keep the first full frame for the optional debug overlay.
            if first_frame_img is None:
                first_frame_img = frame.copy()
            frame_idx += 1
            wrote += 1

        cap.release()

        if WRITE_DEBUG_OVERLAYS and first_frame_img is not None:
            # Overlays go next to the crops folder; the boxes drawn are the raw
            # (unpadded) ones returned by load_roi_bboxes.
            overlay_dir = crops_root.parent / "_debug_overlays"
            ensure_dir(overlay_dir)
            write_debug_overlay(first_frame_img, bboxes, overlay_dir / f"{utt}_overlay.jpg")

        # NOTE(review): an utterance is reported as OK even when wrote == 0.
        print(f"[OK] {utt}: wrote {wrote} crops -> {out_dir}")
        ok += 1

    print(f"[crops] {spk}: ok={ok} bad={bad} -> {crops_root}")


[cleanup] removed 0 stray JPGs in /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam
[OK] sa1: wrote 129 crops -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/sa1
[OK] sa2: wrote 118 crops -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/sa2
[OK] si1004: wrote 162 crops -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/si1004
[OK] si1092: wrote 92 crops -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/si1092
[OK] si1174: wrote 113 crops -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/si1174
[OK] si1175: wrote 155 crops -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/si1175
[OK] si1178: wrote 153 crops -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/si1178
[OK] si1266: wrote 108 crops -> /content/drive/MyDrive/tcdtimit/volunteers

KeyboardInterrupt: 

In [None]:
# @title Build manifests (tsv, ltr, dict) with audio+video+labels intersection (CORRECT TSV ORDER)
import string, random
from pathlib import Path
from typing import Dict

def ensure_dir(p: Path):
    """Make sure directory ``p`` exists, creating missing parents as needed."""
    p.mkdir(exist_ok=True, parents=True)

def list_utts(folder: Path, ext: str) -> Dict[str, Path]:
    """Map lower-cased utterance id -> file for files in ``folder`` ending in ``.ext``.

    The extension is matched case-insensitively, in line with ``load_txt_labels``
    in this cell, so ``X.WAV`` is found when asking for ``"wav"``. Only direct
    children are scanned. If two entries share an id after lower-casing, the one
    that sorts last wins. A missing folder yields an empty dict.
    """
    if not folder.is_dir():
        return {}
    wanted = "." + ext.lower()
    return {
        p.stem.lower(): p
        for p in sorted(folder.iterdir())
        if p.name.lower().endswith(wanted)
    }

def load_txt_labels(txt_dir: Path) -> Dict[str, str]:
    """Read every ``*.txt`` / ``*.TXT`` file directly inside ``txt_dir``.

    Returns a dict keyed by the lower-cased file stem whose value is the file's
    UTF-8 text with surrounding whitespace stripped.
    """
    label_files = sorted([*txt_dir.glob("*.txt"), *txt_dir.glob("*.TXT")])
    return {
        path.stem.lower(): path.read_text(encoding="utf-8").strip()
        for path in label_files
    }

def letters_to_ltr(s: str) -> str:
    """Convert a transcript into a space-separated letter sequence.

    The text is lower-cased and split on any whitespace. Within each word only
    ``a``-``z`` and the apostrophe are kept; words left empty after filtering
    (digits, stray punctuation) are dropped. Letters are joined with spaces and
    words with ``|``.

    Examples:
        >>> letters_to_ltr("Hello World")
        'h e l l o | w o r l d'
        >>> letters_to_ltr("a - b")
        'a | b'
    """
    allowed = set(string.ascii_lowercase + "'")
    words = []
    # split() treats tabs, newlines and runs of spaces as one separator, so
    # words are never merged and no empty word ("| |") is produced.
    for token in s.lower().split():
        kept = [ch for ch in token if ch in allowed]
        if kept:
            words.append(" ".join(kept))
    return " | ".join(words)

SPLIT_SEED = 1337   # fixed seed so the train/valid split is reproducible
VALID_FRAC = 0.20   # fraction of utterances held out for validation

# For each speaker: intersect the utterances that have audio, lip crops and a
# transcript, split them deterministically into train/valid, and write
# {train,valid}.tsv + .ltr, dict.ltr.txt and video_frame_counts.tsv.
for spk in SPEAKERS:
    base = ROOT / spk
    audio_dir = base / AUDIO16_SUB              # e.g. Clips/Audio16k/Audio/straightcam
    crops_dir = base / CROPS_SUB                # e.g. Clips/Video25fpsLip/straightcam
    txt_dir   = base / TXT_SUB                  # e.g. Clips/Labels/straightcam
    mani_dir  = base / MANIFEST_SUB             # e.g. manifests
    ensure_dir(mani_dir)

    aud = list_utts(audio_dir, "wav")
    # A missing crops folder is treated as "no lip data" so the speaker is
    # skipped with a warning below instead of crashing in iterdir().
    lip = {d.name.lower(): d for d in crops_dir.iterdir() if d.is_dir()} if crops_dir.is_dir() else {}
    txt = load_txt_labels(txt_dir)

    common = sorted(set(aud.keys()) & set(lip.keys()) & set(txt.keys()))
    print(f"[{spk}] audio: {len(aud)}  txt: {len(txt)}  lip: {len(lip)}")
    print(f"[{spk}] overlap(audio∩txt∩lip): {len(common)}  sample:", common[:10])

    if not common:
        print(f"[WARN] No overlapping utterances for {spk}. Skipping.")
        continue

    # Count the JPG crops once per utterance, using the directory as it exists
    # on disk (lip[u]) rather than a path rebuilt from the lower-cased id.
    frame_counts = {u: sum(1 for _ in lip[u].glob("*.jpg")) for u in common}

    # deterministic split
    rng = random.Random(SPLIT_SEED)
    items = common[:]
    rng.shuffle(items)
    n_valid = max(1, int(round(len(items) * VALID_FRAC))) if items else 0
    valid = sorted(items[:n_valid])
    train = sorted(items[n_valid:])

    def write_split(name, utts):
        """Write <name>.tsv and <name>.ltr for the given utterance ids."""
        tsv = mani_dir / f"{name}.tsv"
        ltr = mani_dir / f"{name}.ltr"

        with tsv.open("w", encoding="utf-8") as ftsv, ltr.open("w", encoding="utf-8") as fltr:
            # TSV header line = absolute data root
            ftsv.write(str(base) + "\n")
            for u in utts:
                rel_video_dir = lip[u].relative_to(base)                 # directory of JPG crops
                rel_audio_wav = aud[u].relative_to(base)                 # WAV @ 16 kHz
                nframes = frame_counts[u]                                # number of JPG crops

                # Row layout: <rel_video_dir> <num_frames> <rel_audio_wav>
                # NOTE(review): the notebook intro mentions 5-column manifests but
                # three columns are written here - confirm against the manifest
                # loader of the AV-HuBERT version in use.
                ftsv.write(f"{rel_video_dir}\t{nframes}\t{rel_audio_wav}\n")

                # LTR labels (1:1 with TSV rows, excluding header)
                fltr.write(letters_to_ltr(txt[u]) + "\n")

    write_split("train", train)
    write_split("valid", valid)

    # dict.ltr.txt (create once)
    dict_path = mani_dir / "dict.ltr.txt"
    if not dict_path.exists():
        symbols = list("'" + string.ascii_lowercase) + ["|"]
        with dict_path.open("w", encoding="utf-8") as f:
            f.write("<pad> 1 #fairseq:overwrite\n</s> 1 #fairseq:overwrite\n<unk> 1 #fairseq:overwrite\n")
            for s in symbols:
                f.write(f"{s} 1\n")

    # Convenience: per-utterance frame counts
    out_tsv = mani_dir / "video_frame_counts.tsv"
    with out_tsv.open("w", encoding="utf-8") as f:
        f.write("utt\tframes\n")
        for u in common:
            f.write(f"{u}\t{frame_counts[u]}\n")

    # Quick sanity check of the new TSVs
    def check(tsv_name):
        """Count rows with 3 fields, a numeric frame count and existing paths."""
        lines = (mani_dir/tsv_name).read_text(encoding="utf-8").splitlines()
        root = Path(lines[0]) if lines else base
        ok = bad = 0
        for row in lines[1:]:
            parts = row.split("\t")
            if len(parts) != 3:
                bad += 1; continue
            vdir, nfrm, awav = parts
            if not nfrm.isdigit(): bad += 1; continue
            if not (root / vdir).is_dir(): bad += 1; continue
            if not (root / awav).is_file(): bad += 1; continue
            ok += 1
        print(f"{tsv_name}: ok={ok}, bad={bad}")

    check("train.tsv")
    check("valid.tsv")

    print(f"[OK] wrote manifests in {mani_dir}")
    print(f"  train: {len(train)}  |  valid: {len(valid)}")


[volunteers/01M] audio: 98  txt: 98  lip: 98
[volunteers/01M] overlap(audio∩txt∩lip): 98  sample: ['sa1', 'sa2', 'si1004', 'si1092', 'si1174', 'si1175', 'si1178', 'si1266', 'si1447', 'si1453']
train.tsv: ok=78, bad=0
valid.tsv: ok=20, bad=0
[OK] wrote manifests in /content/drive/MyDrive/tcdtimit/volunteers/01M/manifests
  train: 78  |  valid: 20


In [None]:
# Mouth crops from 25fps videos using fixed 601×601 window + clamp (no black bars)

import cv2, numpy as np
from pathlib import Path

# --- your roots ---
# These assignments rebind ROOT / SPEAKERS / VID25_SUB / MAT_SUB for this cell.
ROOT = Path("/content/drive/MyDrive/tcdtimit")
SPEAKERS = ["volunteers/01M"]  # add more speakers if you want
VID25_SUB = "Clips/Video25/straightcam"        # input: 25 fps MP4s
MAT_SUB   = "Clips/Landmarks/straightcam"      # not used by the code in this cell
OUT_SUB   = "Clips/Video25fpsLip/straightcam"  # per-utt folders with 88x88 jpg

# --- canonical window in 601×601 space (from your .mat) ---
# Left edge / width and top edge / height of the fixed crop window, in canvas pixels.
X0, W_WIN = 168, 243
Y0, H_WIN = 340, 150

def ensure_dir(p: Path):
    """Create ``p`` and any missing parent directories; existing directories are fine."""
    p.mkdir(exist_ok=True, parents=True)

def list_utts(root_dir: Path, ext: str):
    """Recursively collect files ending in ``.ext`` below ``root_dir``.

    Returns a dict whose keys are POSIX-style paths relative to ``root_dir``
    with the extension removed (e.g. ``"sub/sa1"``) and whose values are the
    matching paths. ``root_dir`` may be given as a string.
    """
    top = Path(root_dir)
    tail = len(ext) + 1  # characters taken up by "." + ext
    return {
        hit.relative_to(top).as_posix()[:-tail]: hit
        for hit in top.rglob(f"*.{ext}")
    }

def to_601_canvas(bgr):
    """Scale a frame to 601 pixels wide and centre it vertically on a black 601x601 canvas.

    Returns ``(canvas, top_pad, scaled_height)`` where ``top_pad`` is the row at
    which the scaled frame starts and ``scaled_height`` is its height on the canvas.

    NOTE(review): a frame whose scaled height exceeds 601 would not fit the
    canvas - presumably inputs are landscape; confirm.
    """
    side = 601
    src_h, src_w = bgr.shape[:2]
    scale = side / src_w  # fit to width
    scaled_h = int(round(src_h * scale))
    shrunk = cv2.resize(bgr, (side, scaled_h), interpolation=cv2.INTER_AREA)

    top = (side - scaled_h) // 2
    board = np.zeros((side, side, 3), np.uint8)
    board[top:top + scaled_h, 0:side] = shrunk
    return board, top, scaled_h

# Running totals across all speakers: clips that produced frames / clips that
# failed to open or produced none.
total_ok = total_bad = 0

# For every 25 fps clip of each speaker: place each frame on the 601x601
# canvas, cut the fixed window, convert to grayscale, resize to 88x88 and
# save as <out_root>/<utt>/<frame index>.jpg.
for spk in SPEAKERS:
    vid_dir = ROOT / spk / VID25_SUB
    out_root = ROOT / spk / OUT_SUB
    ensure_dir(out_root)

    vids = list_utts(vid_dir, "mp4")
    if not vids:
        print(f"[WARN] no MP4s found under {vid_dir}")
        continue

    for utt, mp4 in vids.items():
        out_dir = out_root / utt
        ensure_dir(out_dir)

        cap = cv2.VideoCapture(str(mp4))
        if not cap.isOpened():
            print(f"[ERR] open video fail: {mp4}")
            total_bad += 1
            continue

        idx = 0
        wrote = 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break

            canvas, pad_y, new_h = to_601_canvas(frame)

            # clamp vertical window to avoid bottom black pad
            # (the window top is kept between the first image row and the last
            # row from which H_WIN rows still fit inside the image)
            y_min = pad_y
            y_max = pad_y + new_h - H_WIN
            y_clamped = max(y_min, min(Y0, y_max))

            # crop, grayscale, resize → 88×88
            crop = canvas[y_clamped:y_clamped+H_WIN, X0:X0+W_WIN]
            gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
            lip88 = cv2.resize(gray, (88, 88), interpolation=cv2.INTER_AREA)

            # Frames are numbered from 00000 in read order.
            cv2.imwrite(str(out_dir / f"{idx:05d}.jpg"), lip88)
            idx += 1
            wrote += 1

        cap.release()
        if wrote == 0:
            print(f"[WARN] {utt}: wrote 0 frames -> {out_dir}")
            total_bad += 1
        else:
            print(f"[OK]   {utt}: {wrote} frames -> {out_dir}")
            total_ok += 1

print(f"\nDone. ok={total_ok} bad={total_bad}")


[OK]   sa1: 129 frames -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/sa1
[OK]   sa2: 118 frames -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/sa2
[OK]   si1004: 162 frames -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/Video25fpsLip/straightcam/si1004


KeyboardInterrupt: 

In [None]:
# Create an 88×88 grayscale lip-ROI MP4 for sa1

import cv2, numpy as np
from pathlib import Path

# Paths for the single-clip smoke test: one source video and its ROI output.
ROOT = Path("/content/drive/MyDrive/tcdtimit")
SRC_MP4 = ROOT/"volunteers/01M/Clips/Video25/straightcam/sa1.mp4"
OUT_MP4 = ROOT/"volunteers/01M/Clips/video/straightcam/sa1.mp4"  # AV-HuBERT's canonical spot
# Make sure the output folder exists before the writer is opened.
OUT_MP4.parent.mkdir(parents=True, exist_ok=True)

# canonical window in 601×601
# Left edge / width and top edge / height of the fixed crop window, in canvas pixels.
X0, W_WIN = 168, 243
Y0, H_WIN = 340, 150

def to_601_canvas(bgr):
    """Resize a frame to width 601 and paste it, vertically centred, on a black 601x601 canvas.

    Returns ``(canvas, top_pad, scaled_height)``: the canvas, the row where the
    resized frame begins, and the resized frame's height.

    NOTE(review): a frame whose scaled height exceeds 601 would not fit the
    canvas - presumably inputs are landscape; confirm.
    """
    side = 601
    frame_h, frame_w = bgr.shape[:2]
    ratio = side / frame_w               # fit width
    fitted_h = int(round(frame_h * ratio))
    fitted = cv2.resize(bgr, (side, fitted_h), interpolation=cv2.INTER_AREA)

    offset = (side - fitted_h) // 2
    board = np.zeros((side, side, 3), np.uint8)
    board[offset:offset + fitted_h, 0:side] = fitted
    return board, offset, fitted_h

def open_writer(path, fps):
    """Open an 88x88 ``mp4v`` video writer at ``path``.

    A grayscale writer is tried first and a colour writer second, because some
    backends require 3-channel frames. Returns ``(writer, needs_bgr)`` where
    ``needs_bgr`` tells the caller to convert frames to BGR before writing.

    Raises:
        RuntimeError: Neither writer could be opened.
    """
    codec = cv2.VideoWriter_fourcc(*'mp4v')
    for as_color in (False, True):
        candidate = cv2.VideoWriter(str(path), codec, fps, (88,88), isColor=as_color)
        if candidate.isOpened():
            return candidate, as_color
    raise RuntimeError(f"Failed to open writer: {path}")

# Read SRC_MP4 frame by frame and write the fixed-window 88x88 grayscale crop
# of every frame to OUT_MP4.
cap = cv2.VideoCapture(str(SRC_MP4))
if not cap.isOpened():
    raise RuntimeError(f"Cannot open {SRC_MP4}")

# Reuse the source frame rate; fall back to 25 fps when the reported value is
# missing or not above 1.
src_fps = cap.get(cv2.CAP_PROP_FPS)
fps = float(src_fps) if src_fps and src_fps > 1 else 25.0
writer, needs_bgr = open_writer(OUT_MP4, fps)

count=0
while True:
    ok, frame = cap.read()
    if not ok: break
    canvas, pad_y, new_h = to_601_canvas(frame)

    # clamp vertical window so it never hits black padding
    y_min = pad_y
    y_max = pad_y + new_h - H_WIN
    y = max(y_min, min(Y0, y_max))

    crop = canvas[y:y+H_WIN, X0:X0+W_WIN]
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    lip88 = cv2.resize(gray, (88,88), interpolation=cv2.INTER_AREA)

    # The colour fallback writer expects 3-channel frames.
    if needs_bgr:
        lip88 = cv2.cvtColor(lip88, cv2.COLOR_GRAY2BGR)
    writer.write(lip88)
    count += 1

writer.release()
cap.release()
print(f"[OK] wrote {count} frames -> {OUT_MP4} @ {fps:.2f} fps")


[OK] wrote 129 frames -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/sa1.mp4 @ 25.00 fps


In [None]:
# @title Batch crop all 25fps videos to 88x88 mouth-ROI MP4s (no padding, clamped)
import cv2, numpy as np
from pathlib import Path

# Paths for the batch run; these rebind the names for this cell.
ROOT = Path("/content/drive/MyDrive/tcdtimit")
SPEAKERS = ["volunteers/01M"]  # add more if needed
VID25_SUB = "Clips/Video25/straightcam"    # input: 25 fps MP4s
OUT_SUB   = "Clips/video/straightcam"  # AV-HuBERT expects ROI mp4s here

# canonical window in 601×601 canvas (deduced from your .mat)
# Left edge / width and top edge / height of the fixed crop window, in canvas pixels.
X0, W_WIN = 168, 243
Y0, H_WIN = 340, 150

def ensure_dir(p):
    """Create directory ``p`` and any missing parents; a no-op if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def list_utts(root_dir: Path, ext: str):
    """Recursively map extension-less relative paths to files ending in ``.ext``.

    Keys are POSIX-style paths relative to ``root_dir`` with ``.ext`` removed;
    values are the matching paths found under ``root_dir``.
    """
    tail = len(ext) + 1  # characters taken up by "." + ext
    return {
        found.relative_to(root_dir).as_posix()[:-tail]: found
        for found in root_dir.rglob(f"*.{ext}")
    }

def to_601_canvas(bgr):
    """Fit a frame to a width of 601 pixels and centre it vertically on a black 601x601 canvas.

    Returns ``(canvas, top_pad, scaled_height)``: the canvas, the first row
    occupied by the scaled frame, and the scaled frame's height.

    NOTE(review): a frame whose scaled height exceeds 601 would not fit the
    canvas - presumably inputs are landscape; confirm.
    """
    edge = 601
    in_h, in_w = bgr.shape[:2]
    factor = edge / in_w
    out_h = int(round(in_h * factor))
    scaled = cv2.resize(bgr, (edge, out_h), interpolation=cv2.INTER_AREA)

    first_row = (edge - out_h) // 2
    board = np.zeros((edge, edge, 3), np.uint8)
    board[first_row:first_row + out_h, 0:edge] = scaled
    return board, first_row, out_h

def open_writer(path, fps):
    """Open an 88x88 ``mp4v`` video writer at ``path``.

    A grayscale writer is tried first, then a colour one. Returns
    ``(writer, needs_bgr)``; ``needs_bgr`` is true when the colour writer was
    used and frames must be converted to BGR before writing.

    Raises:
        RuntimeError: Neither writer could be opened.
    """
    codec = cv2.VideoWriter_fourcc(*'mp4v')
    for as_color in (False, True):
        candidate = cv2.VideoWriter(str(path), codec, fps, (88,88), isColor=as_color)
        if candidate.isOpened():
            return candidate, as_color
    raise RuntimeError(f"writer failed: {path}")

# Batch version of the single-clip cell: for each 25 fps clip of each speaker,
# write an 88x88 grayscale fixed-window ROI video to <speaker>/<OUT_SUB>/<utt>.mp4.
ok = bad = 0
for spk in SPEAKERS:
    vid_dir = ROOT / spk / VID25_SUB
    out_dir = ROOT / spk / OUT_SUB
    ensure_dir(out_dir)
    vids = list_utts(vid_dir, "mp4")

    for utt, src in vids.items():
        out_mp4 = out_dir / f"{utt}.mp4"
        # utt may contain sub-folders, so create the output's parent as well.
        ensure_dir(out_mp4.parent)

        cap = cv2.VideoCapture(str(src))
        if not cap.isOpened():
            print("[ERR] open:", src)
            bad += 1
            continue

        # Reuse the source frame rate; fall back to 25 fps when it is missing
        # or not above 1.
        src_fps = cap.get(cv2.CAP_PROP_FPS)
        if src_fps and src_fps > 1:
            fps = float(src_fps)
        else:
            fps = 25.0
        wtr, needs_bgr = open_writer(out_mp4, fps)

        wrote = 0
        while True:
            ret, fr = cap.read()
            if not ret:
                break
            canvas, pad_y, new_h = to_601_canvas(fr)

            # Keep the window inside the image rows (never in the black padding).
            y_min = pad_y
            y_max = pad_y + new_h - H_WIN
            y = max(y_min, min(Y0, y_max))

            crop = canvas[y:y+H_WIN, X0:X0+W_WIN]
            gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
            lip88 = cv2.resize(gray, (88,88), interpolation=cv2.INTER_AREA)
            if needs_bgr:
                lip88 = cv2.cvtColor(lip88, cv2.COLOR_GRAY2BGR)
            wtr.write(lip88)
            wrote += 1

        wtr.release()
        cap.release()

        if wrote == 0:
            print("[WARN] 0 frames:", src)
            bad += 1
        else:
            print("[OK]", utt, "->", out_mp4, f"({wrote} fr)")
            ok += 1
print("done: ok=",ok," bad=",bad)


[OK] sa1 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/sa1.mp4 (129 fr)
[OK] sa2 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/sa2.mp4 (118 fr)
[OK] si1004 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/si1004.mp4 (162 fr)
[OK] si1092 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/si1092.mp4 (92 fr)
[OK] si1174 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/si1174.mp4 (113 fr)
[OK] si1175 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/si1175.mp4 (155 fr)
[OK] si1178 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/si1178.mp4 (153 fr)
[OK] si1266 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/si1266.mp4 (108 fr)
[OK] si1447 -> /content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/video/straightcam/si1447.mp4 (162 fr)
[OK] si1453 -> /content/drive/MyDrive/tcdtimit/volunteers/0