<a href="https://colab.research.google.com/github/rileyhitthefan/bat-lab/blob/main/MLModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title 1) Mount Google Drive and set your data root
from google.colab import drive
drive.mount('/content/drive')

# Change this to your folder. For your original path:
#   /content/drive/My Drive/Bat Calls/ NYCTHE
# In Colab it's: /content/drive/MyDrive/Bat Calls/NYCTHE  (no space in MyDrive)
# If "NYCTHE" is a species folder, it's better to point ABOVE it:
root_dir = "/content/drive/MyDrive/Bat Calls"  # contains 'NYCTHE' inside
print("root_dir:", root_dir)

Mounted at /content/drive
root_dir: /content/drive/MyDrive/Bat Calls


In [None]:
#@title 2) Install & Imports
!pip -q install librosa==0.10.2.post1 soundfile==0.12.1 pyyaml==6.0.2 tqdm==4.66.5 scikit-learn==1.5.2

import os, io, math, json, yaml, random, time, gc
from dataclasses import dataclass
from typing import Dict
import numpy as np
import pandas as pd
from tqdm import tqdm

import librosa, soundfile as sf
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch:", torch.__version__, "| Device:", DEVICE)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.1/260.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m767.5/767.5 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
umap-learn 0.5.9.post2 requires scikit-learn>=1.6, but you have scikit-learn 1.5.2 which is incompa

In [None]:
#@title 3) Configuration
CONFIG = {
    "sr": 48000,
    "mono": True,
    "duration_s": 1.0,      # fixed window per example
    "hop_length": 512,
    "n_fft": 1024,
    "n_mels": 128,
    "fmin": 1000,
    "fmax": 24000,
    "batch_size": 16,
    "num_workers": 2,
    "lr": 1e-3,
    "epochs": 10,
    "model_dir": "checkpoints",
    "cache_dir": "cache_mels",
    "manifest_csv": "data_manifest.csv",
    "thresholds_yaml": "thresholds.yaml",
    "default_min_conf": 0.99,  # “crazy high” by default
}
os.makedirs(CONFIG["model_dir"], exist_ok=True)
os.makedirs(CONFIG["cache_dir"], exist_ok=True)

SEED = 1337
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if DEVICE == "cuda": torch.cuda.manual_seed_all(SEED)

print(json.dumps(CONFIG, indent=2))

{
  "sr": 48000,
  "mono": true,
  "duration_s": 1.0,
  "hop_length": 512,
  "n_fft": 1024,
  "n_mels": 128,
  "fmin": 1000,
  "fmax": 24000,
  "batch_size": 16,
  "num_workers": 2,
  "lr": 0.001,
  "epochs": 10,
  "model_dir": "checkpoints",
  "cache_dir": "cache_mels",
  "manifest_csv": "data_manifest.csv",
  "thresholds_yaml": "thresholds.yaml",
  "default_min_conf": 0.99
}


In [None]:
#@title 4) Manifest helpers (filepath,label,location)

def infer_species_and_location(path: str):
    parts = path.replace("\\", "/").split("/")
    # Prefer: .../<species>/<location>/<file.wav>
    if len(parts) >= 3:
        species = parts[-3]; location = parts[-2]
    elif len(parts) >= 2:
        species = parts[-2]; location = "unknown"
    else:
        species, location = "unknown", "unknown"
    return species, location

def build_manifest_from_dir(root_dir: str, out_csv: str):
    rows = []
    for dirpath, _, filenames in os.walk(root_dir):
        for fn in filenames:
            if fn.lower().endswith(".wav"):
                full = os.path.join(dirpath, fn)
                label, location = infer_species_and_location(full)
                rows.append({"filepath": full, "label": label, "location": location})
    df = pd.DataFrame(rows).sort_values("filepath")
    df.to_csv(out_csv, index=False)
    print(f"Wrote {out_csv} with {len(df)} rows")
    return df

def load_manifest(csv_path: str):
    df = pd.read_csv(csv_path)
    assert {"filepath","label","location"}.issubset(df.columns)
    return df

# Build manifest from your Drive folder:
df = build_manifest_from_dir(root_dir, CONFIG["manifest_csv"])
print(df.head(3))
print("Rows:", len(df), "| species:", df['label'].nunique(), "| locations:", df['location'].nunique())


Wrote data_manifest.csv with 18 rows
                                            filepath      label location
6  /content/drive/MyDrive/Bat Calls/NYCTHE/00047_...  Bat Calls   NYCTHE
2  /content/drive/MyDrive/Bat Calls/NYCTHE/NYCTHE...  Bat Calls   NYCTHE
0  /content/drive/MyDrive/Bat Calls/NYCTHE/NYCTHE...  Bat Calls   NYCTHE
Rows: 18 | species: 1 | locations: 2


In [None]:
#@title 5) Mel-spectrogram features (with cache) + preview
def load_wav_fixed(path, sr, mono, duration_s):
    y, s = librosa.load(path, sr=sr, mono=mono)
    target_len = int(sr*duration_s)
    if len(y) < target_len: y = np.pad(y, (0, target_len-len(y)))
    elif len(y) > target_len: y = y[:target_len]
    return y, sr

def wav_to_mel(y, sr, n_fft, hop_length, n_mels, fmin, fmax):
    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length,
        n_mels=n_mels, fmin=fmin, fmax=fmax, power=2.0
    )
    S_db = librosa.power_to_db(S, ref=np.max)
    S_min, S_max = S_db.min(), S_db.max()
    S_norm = (S_db - S_min) / (S_max - S_min + 1e-8)
    return S_norm.astype(np.float32)

def cache_mel(path: str, cfg=CONFIG):
    base = os.path.basename(path)
    key = base.replace(".wav", f"_{cfg['sr']}sr_{cfg['n_mels']}mels.npy")
    out_path = os.path.join(cfg["cache_dir"], key)
    if os.path.exists(out_path): return out_path
    y, sr = load_wav_fixed(path, cfg["sr"], cfg["mono"], cfg["duration_s"])
    mel = wav_to_mel(y, sr, cfg["n_fft"], cfg["hop_length"], cfg["n_mels"], cfg["fmin"], cfg["fmax"])
    np.save(out_path, mel); return out_path

def preview_spectrogram(wav_path):
    y, sr = load_wav_fixed(wav_path, CONFIG["sr"], CONFIG["mono"], CONFIG["duration_s"])
    mel = wav_to_mel(y, sr, CONFIG["n_fft"], CONFIG["hop_length"], CONFIG["n_mels"], CONFIG["fmin"], CONFIG["fmax"])
    plt.figure(figsize=(6,3)); plt.imshow(mel, aspect="auto", origin="lower")
    plt.title(os.path.basename(wav_path)); plt.xlabel("Frames"); plt.ylabel("Mel bins"); plt.colorbar(); plt.show()


In [None]:
#@title 6) Dataset & stratified split dataloaders (location-aware)
class BatDataset(Dataset):
    def __init__(self, df: pd.DataFrame, label_to_idx: Dict[str,int], loc_to_idx: Dict[str,int]):
        self.df = df.reset_index(drop=True)
        self.label_to_idx = label_to_idx
        self.loc_to_idx = loc_to_idx
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        row = self.df.iloc[i]
        mel_path = cache_mel(row.filepath, CONFIG)
        mel = np.load(mel_path)                 # [n_mels, T]
        x = torch.tensor(mel).unsqueeze(0)      # [1, n_mels, T]
        y = torch.tensor(self.label_to_idx[row.label], dtype=torch.long)
        loc = torch.tensor(self.loc_to_idx.get(row.location, 0), dtype=torch.long)
        return x, y, loc, row.filepath

def build_loaders_stratified(df: pd.DataFrame, seed=SEED):
    # Stratify by label to avoid leakage
    df_train, df_temp = train_test_split(df, test_size=0.30, random_state=seed, stratify=df['label'])
    df_val, df_test   = train_test_split(df_temp, test_size=0.50, random_state=seed, stratify=df_temp['label'])

    species   = sorted(df['label'].unique())
    locations = sorted(df['location'].unique())
    label_to_idx = {s:i for i,s in enumerate(species)}
    loc_to_idx   = {l:i for i,l in enumerate(locations)}

    ds_train = BatDataset(df_train, label_to_idx, loc_to_idx)
    ds_val   = BatDataset(df_val,   label_to_idx, loc_to_idx)
    ds_test  = BatDataset(df_test,  label_to_idx, loc_to_idx)

    loaders = {
        "train": DataLoader(ds_train, batch_size=CONFIG["batch_size"], shuffle=True,  num_workers=CONFIG["num_workers"]),
        "val":   DataLoader(ds_val,   batch_size=CONFIG["batch_size"], shuffle=False, num_workers=CONFIG["num_workers"]),
        "test":  DataLoader(ds_test,  batch_size=CONFIG["batch_size"], shuffle=False, num_workers=CONFIG["num_workers"]),
    }
    meta = {"species": species, "locations": locations, "label_to_idx": label_to_idx, "loc_to_idx": loc_to_idx}
    return loaders, meta, (df_train, df_val, df_test)

loaders, meta, splits = build_loaders_stratified(df)
print("Species:", meta["species"])
print("Locations:", meta["locations"])


Species: ['Bat Calls']
Locations: ['NYCTHE', 'RHICAP']


In [None]:
#@title 7) Model: Small CNN + Location Embedding
class SmallAudioCNN(nn.Module):
    def __init__(self, n_classes:int, n_locations:int, loc_embed_dim:int=16):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.BatchNorm2d(16), nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(16, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(),
            nn.AdaptiveAvgPool2d((4,4))
        )
        self.loc_embed = nn.Embedding(num_embeddings=max(1,n_locations), embedding_dim=loc_embed_dim)
        self.fc = nn.Sequential(
            nn.Linear(64*4*4 + loc_embed_dim, 256), nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(256, n_classes)
        )
        self.temperature = nn.Parameter(torch.ones(1))  # for calibration
    def forward(self, x, loc_ids):
        z = self.conv(x).flatten(1)             # [B, 1024]
        le = self.loc_embed(loc_ids)            # [B, loc_embed_dim]
        logits = self.fc(torch.cat([z, le], dim=1)) / self.temperature.clamp_min(0.5)
        return logits


In [None]:
#@title 8) Train / Eval
@dataclass
class TrainResult:
    best_path: str
    history: list

def accuracy(logits, y):
    preds = logits.argmax(1)
    return (preds == y).float().mean().item()

def run_epoch(model, loader, opt=None):
    is_train = opt is not None
    model.train() if is_train else model.eval()
    total_loss, total_acc, total_n = 0.0, 0.0, 0
    for x, y, loc, _ in loader:
        x, y, loc = x.to(DEVICE), y.to(DEVICE), loc.to(DEVICE)
        with torch.set_grad_enabled(is_train):
            logits = model(x, loc)
            loss = F.cross_entropy(logits, y)
            if is_train:
                opt.zero_grad(); loss.backward(); opt.step()
        bs = x.size(0)
        total_loss += loss.item()*bs
        total_acc  += accuracy(logits, y)*bs
        total_n    += bs
    return {"loss": total_loss/total_n, "acc": total_acc/total_n}

def train_model(loaders, meta):
    model = SmallAudioCNN(len(meta["species"]), len(meta["locations"])).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=CONFIG["lr"])
    best_val, history = 0.0, []
    best_path = os.path.join(CONFIG["model_dir"], "best.pt")
    for epoch in range(1, CONFIG["epochs"]+1):
        tr = run_epoch(model, loaders["train"], opt)
        va = run_epoch(model, loaders["val"], None)
        history.append({"epoch": epoch, "train": tr, "val": va})
        print(f"Epoch {epoch:02d} | train {tr['loss']:.4f}/{tr['acc']:.3f} | val {va['loss']:.4f}/{va['acc']:.3f}")
        if va["acc"] > best_val:
            best_val = va["acc"]
            torch.save({"model_state": model.state_dict(), "meta": meta}, best_path)
    return TrainResult(best_path=best_path, history=history)

def evaluate_model(model, loader):
    model.eval(); total_acc, total_n = 0.0, 0
    with torch.no_grad():
        for x, y, loc, _ in loader:
            x, y, loc = x.to(DEVICE), y.to(DEVICE), loc.to(DEVICE)
            total_acc += accuracy(model(x, loc), y)*x.size(0)
            total_n   += x.size(0)
    return total_acc/total_n if total_n>0 else 0.0


In [None]:
#@title 9) Temperature Scaling (Validation Calibration)
def calibrate_temperature(model, loader, max_iters=200, lr=5e-2):
    model.eval()
    temp = model.temperature
    opt = torch.optim.LBFGS([temp], lr=lr, max_iter=max_iters)
    def _nll():
        losses = []
        for x, y, loc, _ in loader:
            x, y, loc = x.to(DEVICE), y.to(DEVICE), loc.to(DEVICE)
            loss = F.cross_entropy(model(x, loc), y)
            losses.append(loss)
        return torch.stack(losses).mean()
    def closure():
        opt.zero_grad(); loss = _nll(); loss.backward(); return loss
    before = _nll().item(); opt.step(closure); after = _nll().item()
    print(f"Calibration NLL: {before:.4f} -> {after:.4f}; T={temp.item():.3f}")
    return temp.item()


In [None]:
#@title 10) Thresholds + Inference (Unknown if below min_conf)
def load_thresholds(path: str, default_min_conf: float):
    if os.path.exists(path):
        with open(path, "r") as f: th = yaml.safe_load(f) or {}
    else: th = {}
    th["_default_min_conf"] = th.get("_default_min_conf", default_min_conf)
    return th

def get_min_conf(thresholds: dict, species: str, location: str):
    per = thresholds.get("per", {})
    default = thresholds.get("_default_min_conf", CONFIG["default_min_conf"])
    if species in per and location in per[species]: return float(per[species][location])
    return float(default)

@torch.no_grad()
def predict_batch(model, batch_x, batch_loc, meta, thresholds):
    model.eval()
    logits = model(batch_x.to(DEVICE), batch_loc.to(DEVICE))
    probs = F.softmax(logits, dim=1).cpu().numpy()
    preds = probs.argmax(1)
    out = []
    idx_to_label = {v:k for k,v in meta["label_to_idx"].items()}
    idx_to_loc   = {v:k for k,v in meta["loc_to_idx"].items()}
    for i in range(len(preds)):
        top_idx = int(preds[i]); top_label = idx_to_label[top_idx]; p = float(probs[i, top_idx])
        loc_name = idx_to_loc[int(batch_loc[i].cpu().item())]
        min_conf = get_min_conf(thresholds, top_label, loc_name)
        is_unknown = (p < min_conf)
        reasons = [{"type":"low_conf","top1_conf":p,"min_conf":min_conf}] if is_unknown else []
        out.append({"label": top_label, "prob": p, "location": loc_name, "is_unknown": is_unknown, "reasons": reasons})
    return out

def load_best(model_path: str):
    ckpt = torch.load(model_path, map_location=DEVICE)
    meta = ckpt["meta"]
    model = SmallAudioCNN(len(meta["species"]), len(meta["locations"])).to(DEVICE)
    model.load_state_dict(ckpt["model_state"]); model.eval()
    return model, meta

def predict_file(wav_path: str, location: str, model_path: str, thresholds_yaml: str):
    model, meta = load_best(model_path)
    loc_to_idx = meta["loc_to_idx"]
    if location not in loc_to_idx: location = meta["locations"][0]
    loc_idx = torch.tensor([loc_to_idx[location]], dtype=torch.long)
    mel_path = cache_mel(wav_path, CONFIG)
    mel = np.load(mel_path)
    x = torch.tensor(mel).unsqueeze(0).unsqueeze(0)  # [1,1,n_mels,T]
    thresholds = load_thresholds(thresholds_yaml, CONFIG["default_min_conf"])
    return predict_batch(model, x, loc_idx, meta, thresholds)[0]

def topk_for_file(wav_path: str, location: str, model_path: str, k=3):
    model, meta = load_best(model_path)
    loc_to_idx = meta["loc_to_idx"]
    if location not in loc_to_idx: location = meta["locations"][0]
    loc_idx = torch.tensor([loc_to_idx[location]], dtype=torch.long)
    mel_path = cache_mel(wav_path, CONFIG)
    mel = np.load(mel_path)
    x = torch.tensor(mel).unsqueeze(0).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        logits = model(x, loc_idx.to(DEVICE))
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    idx_to_label = {v:k for k,v in meta["label_to_idx"].items()}
    order = np.argsort(-probs)[:k]
    return [(idx_to_label[i], float(probs[i])) for i in order]


In [None]:
#@title 11) Train → Calibrate → Test
result = train_model(loaders, meta)
print("Best checkpoint:", result.best_path)

best_model, _meta = load_best(result.best_path)
_ = calibrate_temperature(best_model, loaders["val"])
torch.save({"model_state": best_model.state_dict(), "meta": _meta}, result.best_path)

test_acc = evaluate_model(best_model, loaders["test"])
print("Test accuracy:", round(test_acc, 3))


Epoch 01 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 02 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 03 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 04 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 05 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 06 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 07 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 08 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 09 | train 0.0000/1.000 | val 0.0000/1.000
Epoch 10 | train 0.0000/1.000 | val 0.0000/1.000
Best checkpoint: checkpoints/best.pt
Calibration NLL: 0.0000 -> 0.0000; T=1.000
Test accuracy: 1.0


In [None]:
#@title 12) (Optional) thresholds.yaml example (per-species, per-location)
example_thresholds = {
    "_default_min_conf": 0.99,
    "per": {
        # Fill these if you used county subfolders under each species, e.g.:
        # "NYCTHE": {"Tarrant": 0.995, "Denton": 0.992}
    }
}
with open(CONFIG["thresholds_yaml"], "w") as f:
    yaml.safe_dump(example_thresholds, f)
print("Wrote", CONFIG["thresholds_yaml"])


Wrote thresholds.yaml


In [None]:
#@title 13) Predict one file (top-1) + optional top-k
# file from the manifest
test_wav = df['filepath'].iloc[0]

# location: use "unknown" if you didn't create county folders
loc_name = "unknown" if "unknown" in meta["locations"] else meta["locations"][0]

pred = predict_file(test_wav, location=loc_name,
                    model_path=result.best_path,
                    thresholds_yaml=CONFIG["thresholds_yaml"])
print("Top-1:", pred)

print("Top-3:", topk_for_file(test_wav, location=loc_name, model_path=result.best_path, k=3))


Top-1: {'label': 'Bat Calls', 'prob': 1.0, 'location': 'NYCTHE', 'is_unknown': False, 'reasons': []}
Top-3: [('Bat Calls', 1.0)]


In [None]:
#@title 14) Batch predictions for the test set → CSV
model_best, meta2 = load_best(result.best_path)
idx_to_label = {v:k for k,v in meta2["label_to_idx"].items()}
idx_to_loc   = {v:k for k,v in meta2["loc_to_idx"].items()}
thresholds = load_thresholds(CONFIG["thresholds_yaml"], CONFIG["default_min_conf"])

rows = []
model_best.eval()
with torch.no_grad():
    for x, y, loc, paths in loaders["test"]:
        x = x.to(DEVICE); loc = loc.to(DEVICE)
        logits = model_best(x, loc)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
        y = y.numpy(); loc = loc.cpu().numpy()
        for i in range(len(paths)):
            top_idx = int(np.argmax(probs[i]))
            top_label = idx_to_label[top_idx]
            p = float(probs[i, top_idx])
            loc_name = idx_to_loc[int(loc[i])]
            min_conf = get_min_conf(thresholds, top_label, loc_name)
            is_unknown = p < min_conf
            rows.append({
                "filepath": paths[i],
                "true_label": idx_to_label[int(y[i])],
                "pred_label": top_label,
                "pred_prob": round(p, 4),
                "location": loc_name,
                "is_unknown": is_unknown
            })

out_csv = "test_predictions.csv"
pd.DataFrame(rows).to_csv(out_csv, index=False)
print("Wrote:", out_csv)
pd.DataFrame(rows).head(10)


Wrote: test_predictions.csv


Unnamed: 0,filepath,true_label,pred_label,pred_prob,location,is_unknown
0,/content/drive/MyDrive/Bat Calls/RHICAP/00007_...,Bat Calls,Bat Calls,1.0,RHICAP,False
1,/content/drive/MyDrive/Bat Calls/NYCTHE/NYCTHE...,Bat Calls,Bat Calls,1.0,NYCTHE,False
2,/content/drive/MyDrive/Bat Calls/RHICAP/00009_...,Bat Calls,Bat Calls,1.0,RHICAP,False
