# AV-HuBERT — Train Only (CTC / Seq2Seq)

This notebook is **cleaned for training only**. All data prep happens offline; upload the prepared directory that contains `train.tsv`, `valid.tsv`, `dict.wrd.txt` (and optional SPM files if using Seq2Seq).

### What’s inside
1. Runtime & repo setup
2. User config (paths; choose CTC or Seq2Seq)
3. Install Python dependencies (editable fairseq install)
4. Sanity checks on your uploaded manifests
5. **CTC fine-tuning** (low-VRAM defaults; resume-safe)
6. **Resume** (just re-run the same cell)
7. Optional **Seq2Seq** launch stub (if you later want punctuation/casing)

> Tip: Resuming is as simple as running the same training cell again with the same `RUN_DIR` — Fairseq loads `checkpoint_last.pt` automatically.


In [ ]:
## 1) Runtime & repo setup
# Mount Google Drive at /content/drive; the default data, checkpoint and run
# paths in the config cell all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')
# Show the attached GPU. `|| true` keeps the cell from failing when
# nvidia-smi exits non-zero (e.g. no GPU runtime attached).
!nvidia-smi || true

# If you already have the repo somewhere, set REPO_DIR in the next cell.
# Otherwise, clone a fresh copy here (uncomment the next two lines).
# NOTE(review): the install cell runs `pip install -e .` inside REPO_DIR/fairseq,
# so that folder must be populated; upstream appears to ship fairseq as a git
# submodule, hence `--recursive` — confirm against the AV-HuBERT README.
# !git clone --recursive https://github.com/facebookresearch/av_hubert.git /content/av_hubert
# !echo "Cloned AV-HuBERT into /content/av_hubert"


In [ ]:
## 2) User config — edit these
# Every name defined here is read by later cells (sanity checks and both
# training cells), so change the values but keep the names.
REPO_DIR = '/content/av_hubert'  # repo root (contains 'avhubert/' and 'fairseq/' folders)
DATA_DIR = '/content/drive/MyDrive/tcdtimit/volunteers/01M/Clips/433h_data'  # your prepared manifests dir
CKPT     = '/content/drive/MyDrive/AVSR_Colab/models/base_vox_iter5.pt'     # pre-trained checkpoint
RUN_DIR  = '/content/drive/MyDrive/tcdtimit/volunteers/01M/runs/ctc_clean'   # experiment folder (resume-safe)

# Training flavor: 'ctc' (lighter/faster) or 's2s' (Seq2Seq; needs SentencePiece model)
FINETUNE_STYLE = 'ctc'  # 'ctc' or 's2s'

# Modalities: '["audio","video"]' or '["audio"]' if lips are poor
MODALITIES = '["audio","video"]'

# VRAM-friendly defaults
STACK_ORDER_AUDIO = 4     # (16kHz + 25 fps labels) => 26*4=104-dim features
MAX_TOKENS        = 100000 # reduce if you still OOM (e.g., 60000)
UPDATE_FREQ       = 2      # gradient accumulation
NUM_WORKERS       = 1

# Env
import os
import pathlib

os.environ['HYDRA_FULL_ERROR'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Put the avhubert package first on PYTHONPATH exactly once. This cell is
# typically re-run after editing the config, so drop any copy of the entry
# that a previous run added, and skip empty entries so an unset/empty
# PYTHONPATH does not leave a dangling separator behind.
_avhubert_path = f"{REPO_DIR}/avhubert"
_other_paths = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry and entry != _avhubert_path
]
os.environ['PYTHONPATH'] = os.pathsep.join([_avhubert_path, *_other_paths])

# Create the experiment folder up front; exist_ok keeps re-runs (resume) safe.
pathlib.Path(RUN_DIR).mkdir(parents=True, exist_ok=True)
print('Using:\n  REPO_DIR=', REPO_DIR, '\n  DATA_DIR=', DATA_DIR, '\n  CKPT=', CKPT, '\n  RUN_DIR=', RUN_DIR)


In [ ]:
## 3) Install Python deps & make sure fairseq is importable
# Runtime dependencies with pinned hydra-core / omegaconf / opencv ranges;
# pip's output is discarded (>/dev/null) to keep the cell quiet.
%pip -q install tensorboard hydra-core==1.1.* omegaconf==2.1.* sentencepiece opencv-python-headless==4.8.* >/dev/null
# Editable install of the fairseq copy that lives inside the repo; `-e .`
# installs the current directory, so the %cd must come first.
%cd $REPO_DIR/fairseq
%pip -q install -e . >/dev/null
# Back to the repo root before the import smoke test below.
%cd $REPO_DIR
# Import smoke test: an ImportError here stops the notebook before training.
# NOTE: only importability is checked — no version comparison is performed,
# despite the wording of the printed message.
import fairseq; import avhubert
print('fairseq version OK; avhubert import OK')


In [ ]:
## 4) Sanity checks on manifests
# Verify that the prepared data directory holds the files this notebook
# expects, then preview the first lines of the training manifest.
# NOTE(review): the training cells pass task.labels='["ltr"]' while this check
# looks for dict.wrd.txt — confirm which label/dictionary files the task
# actually loads from DATA_DIR.
import os

manifest_paths = [
    f"{DATA_DIR}/train.tsv",
    f"{DATA_DIR}/valid.tsv",
    f"{DATA_DIR}/dict.wrd.txt",
]

# Collect every absent file so a single failure message lists all of them.
absent = []
for candidate in manifest_paths:
    if not os.path.isfile(candidate):
        absent.append(candidate)
assert not absent, f"Missing files: {absent}"

print('Found train/valid/dict ✅')
print('train.tsv head:')
with open(f"{DATA_DIR}/train.tsv") as manifest:
    # Show at most the first six lines.
    for line_no, row in enumerate(manifest):
        if line_no >= 6:
            break
        print(row.rstrip())


In [ ]:
%%bash -s "$REPO_DIR" "$DATA_DIR" "$CKPT" "$RUN_DIR" "$MODALITIES" "$STACK_ORDER_AUDIO" "$MAX_TOKENS" "$UPDATE_FREQ" "$NUM_WORKERS" "$FINETUNE_STYLE"
## 5) TRAIN — CTC (default). Re-run this cell to resume (same RUN_DIR)
#
# `%%bash` is a cell magic and must stay on the FIRST line of the cell, so no
# Python statement can precede it. The FINETUNE_STYLE guard therefore runs
# inside the script: the notebook variable arrives as the 10th argument.
#
# Positional arguments (filled from the config cell's variables):
#   $1 REPO_DIR   $2 DATA_DIR   $3 CKPT   $4 RUN_DIR   $5 MODALITIES
#   $6 STACK_ORDER_AUDIO   $7 MAX_TOKENS   $8 UPDATE_FREQ   $9 NUM_WORKERS
#   ${10} FINETUNE_STYLE
set -euo pipefail
REPO_DIR="$1"; DATA_DIR="$2"; CKPT="$3"; RUN_DIR="$4"; MODALITIES="$5"; SOA="$6"; MAXTOK="$7"; UPF="$8"; NW="$9"
STYLE="${10}"  # parameters past $9 need braces; "$10" would expand as ${1}0

# Refuse to launch when the config cell selects a different training flavor.
if [[ "${STYLE}" != "ctc" ]]; then
  echo 'Switch FINETUNE_STYLE to ctc or use the Seq2Seq cell below.' >&2
  exit 1
fi

export HYDRA_FULL_ERROR=1
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
export PYTHONPATH="${REPO_DIR}/avhubert:${PYTHONPATH:-}"

# Launch fine-tuning on top of the base_vox_30h config with CTC-specific
# overrides. The audio feature dimension is derived as 26 * STACK_ORDER_AUDIO.
# Checkpoints go to ${RUN_DIR}/checkpoints, which is what makes re-running
# this cell with the same RUN_DIR resume the run.
# NOTE(review): best_checkpoint_metric is overridden to `loss`; confirm that
# base_vox_30h does not set checkpoint.maximize_best_checkpoint_metric=true,
# otherwise the "best" checkpoint would track the highest loss.
python -m fairseq_cli.hydra_train \
  --config-dir "${REPO_DIR}/avhubert/conf/finetune" \
  --config-name base_vox_30h \
  common.user_dir="${REPO_DIR}/avhubert" \
  hydra.run.dir="${RUN_DIR}" \
  task.data="${DATA_DIR}" \
  task.label_dir="${DATA_DIR}" \
  task.labels='["ltr"]' \
  task.modalities="${MODALITIES}" \
  task.is_s2s=false \
  task.normalize=true \
  task.image_aug=false \
  task.stack_order_audio="${SOA}" \
  task.max_sample_size=null \
  dataset.train_subset=train \
  dataset.valid_subset=valid \
  dataset.num_workers="${NW}" \
  dataset.max_tokens="${MAXTOK}" \
  model._name=av_hubert_ctc \
  model.w2v_path="${CKPT}" \
  model.freeze_finetune_updates=0 \
  model.decoder_layers=3 \
  +model.w2v_args.task._name=av_hubert_pretraining \
  +model.w2v_args.task.data="${DATA_DIR}" \
  +model.w2v_args.task.label_dir="${DATA_DIR}" \
  +model.w2v_args.task.labels='["ltr"]' \
  +model.w2v_args.task.modalities="${MODALITIES}" \
  +model.w2v_args.task.sample_rate=16000 \
  +model.w2v_args.task.label_rate=25 \
  +model.w2v_args.task.pad_audio=true \
  +model.w2v_args.task.random_crop=false \
  +model.w2v_args.task.single_target=true \
  +model.w2v_args.task.fine_tuning=false \
  +model.w2v_args.task.normalize=true \
  +model.w2v_args.task.image_aug=false \
  +model.w2v_args.task.stack_order_audio="${SOA}" \
  +model.w2v_args.label_rate=25 \
  +model.w2v_args.model._name=av_hubert \
  +model.w2v_args.model.input_modality=audio \
  +model.w2v_args.model.audio_feat_dim=$((26*SOA)) \
  +model.w2v_args.model.label_rate=25 \
  criterion._name=ctc \
  +criterion.zero_infinity=true \
  +criterion.post_process=letter \
  optimization.max_update=150 \
  optimization.lr='[1e-4]' \
  optimization.update_freq="[${UPF}]" \
  optimization.clip_norm=5.0 \
  lr_scheduler.warmup_steps=5 \
  lr_scheduler.hold_steps=5 \
  lr_scheduler.decay_steps=140 \
  checkpoint.save_dir="${RUN_DIR}/checkpoints" \
  checkpoint.save_interval_updates=50 \
  checkpoint.best_checkpoint_metric=loss \
  distributed_training.distributed_world_size=1 \
  common.log_interval=5


### 6) Resume training
Just re-run the **CTC TRAIN** cell with the same `RUN_DIR`. Fairseq will automatically load `checkpoint_last.pt` if present.


In [ ]:
%%bash -s "$REPO_DIR" "$DATA_DIR" "$CKPT" "$RUN_DIR" "$MODALITIES" "$STACK_ORDER_AUDIO" "$MAX_TOKENS" "$UPDATE_FREQ" "$NUM_WORKERS" "$FINETUNE_STYLE"
## 7) OPTIONAL — Seq2Seq variant (needs SentencePiece model)
# Steps before enabling:
#   • Ensure your data dir contains SPM files, e.g. 'spm1000/spm_unigramXXXX.model'.
#   • Set TOKENIZER_MODEL below.
# Then set FINETUNE_STYLE='s2s' in the config cell and run this cell instead of the CTC one.
#
# `%%bash` is a cell magic and must stay on the FIRST line of the cell, so no
# Python statement can precede it. The FINETUNE_STYLE guard and the tokenizer
# path therefore live inside the script.
#
# Positional arguments (filled from the config cell's variables):
#   $1 REPO_DIR   $2 DATA_DIR   $3 CKPT   $4 RUN_DIR   $5 MODALITIES
#   $6 STACK_ORDER_AUDIO   $7 MAX_TOKENS   $8 UPDATE_FREQ   $9 NUM_WORKERS
#   ${10} FINETUNE_STYLE
set -euo pipefail
REPO_DIR="$1"; DATA_DIR="$2"; CKPT="$3"; RUN_DIR="$4"; MODALITIES="$5"; SOA="$6"; MAXTOK="$7"; UPF="$8"; NW="$9"
STYLE="${10}"  # parameters past $9 need braces; "$10" would expand as ${1}0

# Refuse to launch when the config cell selects a different training flavor.
if [[ "${STYLE}" != "s2s" ]]; then
  echo 'Switch FINETUNE_STYLE to "s2s" to run this cell.' >&2
  exit 1
fi

TOKENIZER_MODEL="${DATA_DIR}/spm1000/spm_unigram1000.model"  # edit if different
# Fail fast with a readable message instead of erroring deep inside training.
if [[ ! -f "${TOKENIZER_MODEL}" ]]; then
  echo "SentencePiece model not found: ${TOKENIZER_MODEL}" >&2
  exit 1
fi

export HYDRA_FULL_ERROR=1
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
export PYTHONPATH="${REPO_DIR}/avhubert:${PYTHONPATH:-}"

# Launch fine-tuning on top of the base_vox_30h config with Seq2Seq-specific
# overrides (is_s2s=true, SentencePiece tokenizer, 6 decoder layers,
# label-smoothed cross-entropy). The audio feature dimension is derived as
# 26 * STACK_ORDER_AUDIO. Checkpoints go to ${RUN_DIR}/checkpoints.
# NOTE(review): best_checkpoint_metric is overridden to `loss`; confirm that
# base_vox_30h does not set checkpoint.maximize_best_checkpoint_metric=true,
# otherwise the "best" checkpoint would track the highest loss.
python -m fairseq_cli.hydra_train \
  --config-dir "${REPO_DIR}/avhubert/conf/finetune" \
  --config-name base_vox_30h \
  common.user_dir="${REPO_DIR}/avhubert" \
  hydra.run.dir="${RUN_DIR}" \
  task.data="${DATA_DIR}" \
  task.label_dir="${DATA_DIR}" \
  task.labels='["ltr"]' \
  task.modalities="${MODALITIES}" \
  task.is_s2s=true \
  task.normalize=true \
  task.image_aug=false \
  task.stack_order_audio="${SOA}" \
  task.max_sample_size=null \
  task.tokenizer_bpe_name=sentencepiece \
  task.tokenizer_bpe_model="${TOKENIZER_MODEL}" \
  dataset.train_subset=train \
  dataset.valid_subset=valid \
  dataset.num_workers="${NW}" \
  dataset.max_tokens="${MAXTOK}" \
  model._name=av_hubert_seq2seq \
  model.w2v_path="${CKPT}" \
  model.freeze_finetune_updates=0 \
  model.decoder_layers=6 \
  +model.w2v_args.task._name=av_hubert_pretraining \
  +model.w2v_args.task.data="${DATA_DIR}" \
  +model.w2v_args.task.label_dir="${DATA_DIR}" \
  +model.w2v_args.task.labels='["ltr"]' \
  +model.w2v_args.task.modalities="${MODALITIES}" \
  +model.w2v_args.task.sample_rate=16000 \
  +model.w2v_args.task.label_rate=25 \
  +model.w2v_args.task.pad_audio=true \
  +model.w2v_args.task.random_crop=false \
  +model.w2v_args.task.single_target=true \
  +model.w2v_args.task.fine_tuning=false \
  +model.w2v_args.task.normalize=true \
  +model.w2v_args.task.image_aug=false \
  +model.w2v_args.task.stack_order_audio="${SOA}" \
  +model.w2v_args.label_rate=25 \
  +model.w2v_args.model._name=av_hubert \
  +model.w2v_args.model.input_modality=audio \
  +model.w2v_args.model.audio_feat_dim=$((26*SOA)) \
  +model.w2v_args.model.label_rate=25 \
  criterion._name=label_smoothed_cross_entropy \
  optimization.max_update=150 \
  optimization.lr='[5e-5]' \
  optimization.update_freq="[${UPF}]" \
  optimization.clip_norm=5.0 \
  lr_scheduler.warmup_steps=5 \
  lr_scheduler.hold_steps=5 \
  lr_scheduler.decay_steps=140 \
  checkpoint.save_dir="${RUN_DIR}/checkpoints" \
  checkpoint.save_interval_updates=50 \
  checkpoint.best_checkpoint_metric=loss \
  distributed_training.distributed_world_size=1 \
  common.log_interval=5


## Notes
- **Prepared data expectations:**
  Your `$DATA_DIR` should look like:
  ```
  train.tsv
  valid.tsv
  dict.wrd.txt
  audio/  (WAVs at 16kHz, filenames like <id>.wav)
  video/  (MP4s at 25fps, filenames like <id>.mp4)
  spm1000/ (optional for Seq2Seq)
    ├─ spm_unigramXXXX.model
    └─ spm_unigramXXXX.vocab
  ```
- **Cancel & resume:** Stopping the runtime is safe. Re-run the CTC train cell — it will pick up from `checkpoint_last.pt` under `RUN_DIR/checkpoints`.
- **Low VRAM tips:** Lower `MAX_TOKENS` (e.g. `60000`) and raise `UPDATE_FREQ` (e.g. `3`) to compensate for the smaller batches. `task.image_aug=false` is already set in both training cells.
