In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import bioacoustics_model_zoo as bmz



In [None]:
# ====== CONFIG ======
# --- input files ---
SITE_CSV = "data/ankafobe_forest_A_KBF01_5s.csv"  # clip manifest of the site to scan
MODEL_JOBLIB = "data/shallow_lr_birdset_effnetB1.joblib"  # fitted classifier (or the resample-base LR)
LABELS_CSV = "data/train_labels_5s_mac_frommeta.csv"  # training labels; supplies the class order

# --- output folder (created if it does not exist yet) ---
OUT_DIR = Path("/Volumes/Expansion/active_learning_candidates")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- embedding throughput knobs ---
BATCH_ROWS = 5_000  # number of 5s clips embedded per outer batch (tune to available memory)
EMBED_BATCH_SIZE = 32
NUM_WORKERS = 0

# --- species to mine candidates for (recommended: rare + mid-frequency classes) ---
TARGET_SPECIES = [
    "Philepitta_castanea", "Treron_australis",
    "Agapornis_canus", "Saxicola_torquatus",
    # add more...
]

TOPK = 10_000  # upper bound on the best-scoring clips retained per species
# ====================

In [None]:
# Fitted classifier plus the class names, taken from the label table's column order.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only point
# MODEL_JOBLIB at model files from a trusted source.
clf = joblib.load(MODEL_JOBLIB)
classes = list(pd.read_csv(LABELS_CSV, index_col=[0, 1, 2]).columns)

# Clip manifest of the site to be scanned.
site = pd.read_csv(SITE_CSV)

# The manifest must provide the three clip-identifying columns; fail early otherwise.
needed = {"file","start_time","end_time"}
if needed - set(site.columns):
    raise ValueError(f"{SITE_CSV} must contain columns {needed}. Found: {site.columns.tolist()}")

# Index the manifest by clip identity and put it in sorted order.
site = site.set_index(["file","start_time","end_time"])
site = site.sort_index()

In [None]:
# All-zero placeholder label frame (one row per site clip, one column per class)
# that is handed to embed().
infer_df = pd.DataFrame(0, index=site.index, columns=classes)

# Embedding model.
m = bmz.BirdSetEfficientNetB1()

# One initially empty candidate table per target species; filled batch by batch.
top_tables = {
    species: pd.DataFrame(columns=["file","start_time","end_time","score"])
    for species in TARGET_SPECIES
}
def update_topk(df_old, df_new, k):
    """Merge two candidate tables and keep the ``k`` highest-scoring rows.

    Parameters
    ----------
    df_old : pandas.DataFrame
        Candidates kept so far. Must have a ``score`` column; may be empty.
    df_new : pandas.DataFrame
        Newly scored candidates with the same columns; may be empty.
    k : int
        Maximum number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        Rows of both inputs sorted by ``score`` in descending order and
        truncated to the first ``k`` rows. Rows with equal scores keep their
        input order (rows of ``df_old`` before rows of ``df_new``).
    """
    # Leave empty frames out of the concat: an empty, object-dtype placeholder
    # table would otherwise trigger pandas' empty-entry FutureWarning and can
    # turn the numeric columns of the result into object dtype.
    frames = [frame for frame in (df_old, df_new) if len(frame) > 0]
    if not frames:
        # Nothing to merge; hand back an empty table with the existing columns.
        return df_old.copy()

    df = pd.concat(frames, ignore_index=True)
    # A stable sort makes the outcome deterministic when scores tie.
    df = df.sort_values("score", ascending=False, kind="stable").head(k)
    return df


In [None]:
# process in row-batches to keep memory bounded

# Clips scoring at or below this probability are not considered candidates;
# this keeps the per-batch candidate tables small.
MIN_SCORE = 0.2

# Resolve the target list against the class list once, up front, instead of
# silently skipping unknown names inside every batch. A misspelled species
# would otherwise only show up as an empty output file.
scored_species = [sp for sp in TARGET_SPECIES if sp in classes]
skipped_species = [sp for sp in TARGET_SPECIES if sp not in classes]
if skipped_species:
    print(f"WARNING: target species not found in class list, skipped: {skipped_species}")

idx = infer_df.index
n = len(idx)
for start in range(0, n, BATCH_ROWS):
    end = min(start + BATCH_ROWS, n)
    batch_df = infer_df.iloc[start:end]

    # NOTE(review): assumes embed() returns a DataFrame with one row per clip,
    # and that predict_proba() yields one column per entry of `classes` in the
    # same order -- confirm against the model zoo / training code.
    X = m.embed(batch_df, batch_size=EMBED_BATCH_SIZE, num_workers=NUM_WORKERS)
    p = clf.predict_proba(X.values.astype(np.float32))
    p_df = pd.DataFrame(p, index=X.index, columns=classes)

    # update candidate tables
    for sp in scored_species:
        s = p_df[sp]
        s = s[s > MIN_SCORE]
        if len(s) == 0:
            continue
        # flatten the clip index into columns next to the score
        new = s.reset_index()
        new.columns = ["file","start_time","end_time","score"]
        top_tables[sp] = update_topk(top_tables[sp], new, TOPK)

    print(f"processed rows {start}:{end} / {n}")

In [None]:
# Write one CSV per target species; file names carry the site manifest stem,
# the species name and the TOPK setting.
site_name = Path(SITE_CSV).stem
for sp in top_tables:
    tbl = top_tables[sp]
    out = OUT_DIR.joinpath(f"{site_name}__{sp}__top{TOPK}.csv")
    tbl.to_csv(out, index=False)

print("done:", site_name)