# Biodenoising Adaptation Demo (ZIP input)

This notebook lets you:
- Upload a ZIP archive of audio files (e.g., WAV/FLAC)
- Extract into a timestamped folder
- Run domain adaptation on the extracted folder via the installed `biodenoising.adapt` module (`adapt.py`)

Works in Jupyter/Colab. If using Colab, enable GPU for speed.


In [None]:
# Optional: install runtime dependencies on Colab
from __future__ import annotations
import sys
import subprocess

# Colab is detected by checking whether its package is already loaded in this kernel.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    try:
        import torch  # type: ignore
        import torchaudio  # type: ignore
        import soundfile  # type: ignore
        import yaml  # type: ignore
        import pandas  # type: ignore
    except Exception:
        # Any import failure (not only ImportError) takes the install path below.
        import importlib

        # Basic deps; adjust if your Colab image is missing items
        pip_install = [sys.executable, "-m", "pip", "install", "-q"]
        subprocess.run(
            pip_install + ["torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cu121"],
            check=True,
        )
        subprocess.run(
            pip_install + ["soundfile", "pyyaml", "pandas", "numpy", "scipy", "tqdm", "librosa"],
            check=True,
        )
        # The packages were installed while this interpreter was running; the
        # import finders cache directory listings, so refresh them before
        # importing or the new packages may not be noticed.
        importlib.invalidate_caches()
        import torch  # type: ignore
        import torchaudio  # type: ignore
        import soundfile  # type: ignore
        import yaml  # type: ignore
        import pandas  # type: ignore
    print("Colab environment ready.")
else:
    print("Running outside Colab; using local environment.")


In [None]:
# Preflight: ensure biodenoising is installed (GitHub branch install on Colab/Jupyter)
from __future__ import annotations
import sys
import subprocess
import importlib

# pip-installable Git reference of the biodenoising branch this notebook uses.
BRANCH_URL = "git+https://github.com/earthspecies/biodenoising@marius/fixes"


def _forget_package(package: str) -> None:
    """Drop *package* and all of its submodules from ``sys.modules``.

    Also refreshes the import finders' caches so that files installed while
    the interpreter is running are picked up by the next import.
    """
    importlib.invalidate_caches()
    prefix = package + "."
    # Materialise the names first: sys.modules must not change size while iterated.
    for mod_name in [n for n in sys.modules if n == package or n.startswith(prefix)]:
        del sys.modules[mod_name]


try:
    from biodenoising import adapt as adapt  # type: ignore
    print("Found 'biodenoising.adapt'. Ensuring branch version...")
    # Force upgrade to requested branch to avoid stale installs
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "--upgrade", "--no-cache-dir", BRANCH_URL], check=True)
    # importlib.reload(adapt) would re-execute adapt.py only; every other
    # biodenoising submodule already imported would keep its pre-upgrade code.
    # Forget the whole package and import it again instead.
    _forget_package("biodenoising")
    from biodenoising import adapt as adapt  # type: ignore
    print("Ensured 'biodenoising.adapt' comes from marius/fixes branch.")
except Exception:
    # Reached when the package is missing or when anything above failed;
    # a pip failure here propagates because of check=True.
    print("Installing biodenoising from GitHub branch marius/fixes...")
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "--no-cache-dir", BRANCH_URL], check=True)
    _forget_package("biodenoising")
    from biodenoising import adapt as adapt  # type: ignore
    print("Installed and imported 'biodenoising.adapt' from marius/fixes branch.")


In [None]:
# Setup: environment and paths (Colab/Jupyter-friendly)
from __future__ import annotations
import os
import sys
import time
from pathlib import Path
from typing import Optional, List

# Anchor the demo at the current working directory; this is what makes the
# same cell usable on Colab (/content) and in a local Jupyter session.
WORKSPACE = Path.cwd().resolve()

print(f"CWD: {Path.cwd()}")
print("Python:", sys.version)

# Give the workspace top priority on sys.path unless it is already listed.
if not any(entry == str(WORKSPACE) for entry in sys.path):
    sys.path.insert(0, str(WORKSPACE))

# Demo directories: uploads and outputs both live under <workspace>/scripts.
ADAPT_BASE = WORKSPACE.joinpath("scripts")
UPLOADS_DIR = ADAPT_BASE.joinpath("adapt_uploads")
OUTPUTS_DIR = ADAPT_BASE.joinpath("adapt_outputs")
UPLOADS_DIR.mkdir(parents=True, exist_ok=True)
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

print("Uploads dir:", UPLOADS_DIR)
print("Outputs dir:", OUTPUTS_DIR)


In [None]:
# Upload ZIP (Colab or Jupyter)
from __future__ import annotations
import io
import zipfile
from datetime import datetime

def save_upload(data: bytes):
    """Write uploaded ZIP bytes to a timestamped file in UPLOADS_DIR.

    Returns the path of the file that was written.
    """
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    target = UPLOADS_DIR / f"upload_{ts}.zip"
    with open(target, "wb") as f:
        f.write(data)
    print(f"Saved upload to: {target}")
    return target


def first_upload_bytes(value):
    """Return the content of the first file in a FileUpload ``value``, or None.

    Accepts both shapes the widget has used: a dict mapping file name to an
    entry (ipywidgets 7) and a tuple of entries (ipywidgets 8). In either
    case the entry's "content" item is returned as ``bytes``.
    """
    if not value:
        return None
    entries = list(value.values()) if isinstance(value, dict) else list(value)
    return bytes(entries[0]["content"])


# Try Colab uploader first; fall back to ipywidgets
zip_path = None
try:
    from google.colab import files as colab_files  # type: ignore
    print("Detected Colab. Use the chooser to upload a ZIP.")
    uploaded = colab_files.upload()
    if uploaded:
        # Only the first uploaded file is used.
        name = next(iter(uploaded.keys()))
        zip_path = save_upload(uploaded[name])
except Exception as e:
    print("Colab uploader unavailable:", repr(e))

if zip_path is None:
    try:
        import ipywidgets as widgets  # type: ignore
        from IPython.display import display  # type: ignore

        # Reuse the widget left by a previous run of this cell. A newly
        # created FileUpload is always empty, so recreating it on the re-run
        # requested below would throw away the file the user just selected.
        file_uploader = globals().get("file_uploader")
        if not isinstance(file_uploader, widgets.FileUpload):
            file_uploader = widgets.FileUpload(accept=".zip", multiple=False)
        display(file_uploader)
        print("Use the widget above to upload a .zip, then re-run this cell once.")
        content = first_upload_bytes(file_uploader.value)
        if content is not None:
            zip_path = save_upload(content)
    except Exception as e:
        print("ipywidgets uploader unavailable:", repr(e))

if zip_path is None:
    print("As a fallback, set `zip_path` to an existing ZIP on disk and re-run.")
    # Example:
    # zip_path = WORKSPACE / "scripts" / "example_audio.zip"

zip_path


In [None]:
# Extract ZIP to a timestamped folder under scripts/adapt_uploads
from __future__ import annotations
import shutil
import zipfile
from datetime import datetime

# Validate with a real exception: `assert` is removed under `python -O`, and
# the path has to be a regular file for ZipFile to open it.
if zip_path is None or not Path(zip_path).is_file():
    raise FileNotFoundError("zip_path must be set to an existing file")

# Each extraction gets its own second-resolution timestamped folder.
extract_ts = time.strftime("%Y%m%d_%H%M%S")
extract_dir = UPLOADS_DIR / f"unzipped_{extract_ts}"
extract_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zf:
    zf.extractall(extract_dir)

# Find audio files for a quick sanity check
AUDIO_EXTS = {".wav", ".flac", ".mp3", ".ogg", ".m4a", ".aac"}
audio_files = [
    p
    for p in extract_dir.rglob("*")
    # is_file() keeps directories that happen to end in an audio suffix out of the count.
    if p.is_file() and p.suffix.lower() in AUDIO_EXTS
]
print(f"Extracted to: {extract_dir}")
print(f"Found {len(audio_files)} audio files (recursively)")
if not audio_files:
    print("Warning: no audio files found; check the ZIP contents before running adaptation.")

extract_dir


In [None]:
# Configure and run domain adaptation via adapt.py
from __future__ import annotations
import types
from types import SimpleNamespace
from pathlib import Path

import torch  # type: ignore
from biodenoising import adapt as adapt  # type: ignore

# Build args using ConfigParser if available; otherwise fall back to a SimpleNamespace
try:
    from biodenoising.adapt import ConfigParser  # type: ignore
    parser = ConfigParser()
    args = parser.parse_args([])
except (Exception, SystemExit):
    # SystemExit is caught as well: presumably ConfigParser is argparse-based
    # (TODO confirm), and argparse reports problems such as a missing required
    # option by exiting, which `except Exception` alone would not intercept.
    args = SimpleNamespace()

# Required: point to the folder with noisy files (we assume your ZIP has noisy domain audio)
args.noisy_dir = str(extract_dir)

# Outputs: every run writes into its own timestamped directory.
run_ts = time.strftime("%Y%m%d_%H%M%S")
run_out_dir = OUTPUTS_DIR / f"adapt_run_{run_ts}"
run_out_dir.mkdir(parents=True, exist_ok=True)
args.out_dir = str(run_out_dir)

# Fallback values, applied only to attributes that `args` does not have yet.
defaults = {
    # Run control
    "method": "biodenoising16k_dns48",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    # Core processing
    "sample_rate": 16000,
    "highpass": 20,
    "segment": 4,
    "peak_height": 0.008,
    "window_size": 0,
    # Flags and transforms
    "noise_reduce": False,
    "noisy_estimate": False,
    "transform": "none",
    "antialiasing": False,
    "time_scale_factor": 0,
    "revecho": 0.0,
    "amp_scale": False,
    "streaming": False,
    # Training / adaptation control
    "use_top": 1.0,
    "num_valid": 0,
    "epochs": 5,
    "steps": 5,
    "seed": 0,
    # IO and workers
    "processed_dir": None,
    "noisy_dir": None,
    "noise_dir": None,
    "test_dir": None,
    "num_workers": 4,
    "dry": 0.0,
    # Device and SR handling
    "keep_original_sr": False,
    "force_sample_rate": 0,
    # Selection tables / annotations
    "selection_table": False,
    "annotations": False,
    "annotations_begin_column": "Begin",
    "annotations_end_column": "End",
    "annotations_label_column": None,
    "annotations_label_value": None,
    "annotations_extension": ".csv",
    # Config and files
    "cfg": "biodenoising/conf/config_adapt.yaml",
    "checkpoint_file": "checkpoint.th",
    "history_file": "history.json",
    "model_path": "",
    # Model and optimizer
    "model": "biodenoising16k_dns48",
    "nsources": 1,
    "batch_size": 1,
    "epoch_size": 1000,
    "stride": 0.5,
    "pad": False,
    "lr": 3e-4,
    "optim": "adam",
    "beta1": 0.9,
    "beta2": 0.999,
    "show": False,
    "verbose": False,
    # Distributed setup
    "ddp": False,
    "rank": None,
    "world_size": 1,
    # Dataset/exclude controls (strings expected by dataset code)
    "exclude": "",
    "exclude_noise": "",
    "parallel_noise": False,
    "low_snr": 0.0,
    "high_snr": 0.0,
    "use_subset_noise": False,
    "use_subset_clean": False,
    "balance_clean": None,
    # Augment controls
    "repeat_prob": 0.0,
    "random_repeat": False,
    "random_pad": False,
    "silence_prob": 0.0,
    "noise_prob": 0.0,
    "normalize": False,
    "random_gain": 0.0,
    "low_gain": 1.0,
    "high_gain": 1.0,
    "remix": False,
    "flip": False,
    "shift": False,
    "trim": False,
    # demucs sub-config
    "demucs": {"chout": 64, "resample": 0},
    # SWA / training extras
    "swa_scheduler": False,
    "swa_start": 0,
    # Evaluation / checkpointing / logging
    "eval_every": 1,
    "checkpoint": True,
    "best_file": "best_model.th",
    "restart": False,
    "samples_dir": None,
    "num_prints": 5,
    # Loss / STFT settings
    "loss": "l1",
    "stft_loss": False,
    "stft_sc_factor": 0.0,
    "stft_mag_factor": 0.0,
    "stft_mask": False,
    "stft_mask_threshold": 0.0,
    "rms_loss": 0.0,
    "clamp_loss": 30.0,
    "clip_grad_norm": 10.0,
    # Continue / teacher-student toggles
    "continue_best": False,
    "continue_pretrained": "",
    "save_again": False,
    "eval_window_size": 0,
    "other_noise": False,
    # Augment extras used by solver
    "bandmask": 0.0,
    "shift_same": False,
    "timescale": 0.0,
    "mixup": 0.0,
    # Model method flags
    "biodenoising16k_dns48": False,
}
# Values already present on args (from ConfigParser or set above, such as
# noisy_dir) always win over these fallbacks.
for k, v in defaults.items():
    if not hasattr(args, k):
        setattr(args, k, v)

# Derive samples_dir if None
if args.samples_dir is None:
    args.samples_dir = str(Path(args.out_dir) / "samples")
# Ensure args supports membership checks used in package (e.g., "cometml" in args)
class ArgsNS(SimpleNamespace):
    """SimpleNamespace that also answers ``key in args``.

    Membership is true only when the attribute exists and its value is truthy.
    """

    def __contains__(self, key: str) -> bool:
        absent = object()
        value = getattr(self, key, absent)
        if value is absent:
            return False
        try:
            return bool(value)
        except Exception:
            # A value whose truth test raises counts as "not contained".
            return False

# Re-wrap the collected attributes so `"name" in args` works, and make sure
# a `cometml` attribute exists (None when nothing set it before).
args = ArgsNS(**args.__dict__)
args.cometml = getattr(args, "cometml", None)

print("Running adaptation with args:")
summary_fields = ("noisy_dir", "out_dir", "steps", "epochs", "method", "segment", "device")
print({field: getattr(args, field) for field in summary_fields})

# Mirror CLI behavior: set method flag if required by adapt module
if getattr(args, "method", None) == "biodenoising16k_dns48":
    args.biodenoising16k_dns48 = True

# Hand the assembled args to the package entry point.
model = adapt.run_adaptation(args)
print("Adaptation completed.")
run_out_dir


In [None]:
# Inspect outputs (final step only, non-noise)
from __future__ import annotations
from pathlib import Path
import re

out_dir = run_out_dir
method = args.method
print("Output dir:", out_dir)

# is_dir() rather than exists(): iterdir() below needs a directory.
if out_dir.is_dir():
    # Step directories are matched by name: "<method>_step<N>_none_step<N>",
    # where both numbers must be identical (back-reference \1).
    pattern = re.compile(rf"{re.escape(method)}_step(\d+)_none_step\1$")
    candidates = []
    for d in Path(out_dir).iterdir():
        if d.is_dir():
            m = pattern.match(d.name)
            if m:
                candidates.append((int(m.group(1)), d))
    if not candidates:
        print("No final step directory found.")
    else:
        # The highest step number is treated as the final step.
        final_step, final_dir = max(candidates, key=lambda x: x[0])
        # Test the "_noise" marker on the path *below* final_dir only. Testing
        # the absolute path would drop every file whenever a parent directory
        # of the workspace happens to contain "_noise" in its name.
        wavs = sorted(
            p
            for p in final_dir.rglob("*.wav")
            if "_noise" not in str(p.relative_to(final_dir).parent)
        )
        print("Final dir:", final_dir)
        print("Final WAV files:", len(wavs))
        # Sorted above so the ten files shown are the same on every run.
        for p in wavs[:10]:
            print("-", p.relative_to(out_dir))
else:
    print("No outputs found.")