# Install & GPU Check

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# If Colab runtime is not already GPU, switch to: Runtime > Change runtime type > T4/V100/A100 GPU
!nvidia-smi -L || true

# Core stack (Colab usually has torch/torchvision preinstalled; timm/torchmetrics we add)
!pip -q install timm torchmetrics

# Albumentations is convenient for NumPy-based augmentation
!pip -q install albumentations==1.4.7

!pip install tqdm

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-b4201d88-77a7-ae4e-41a5-c4c319571b2e)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.7/155.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m


# Config

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Cfg:
    data_dir: str = "/content/wildfire_dataset"
    train_sub: str = "train"
    val_sub:   str = "val"
    test_sub:  str = "test"    # optional
    use_test:  bool = True     # set False if you don't have test yet

    pos_name: str = "wildfire" # label 1
    neg_name: str = "other"    # label 0

    model_name: str = "mobilenetv3_small_075"  # try: 'efficientnet_b0', 'tf_efficientnetv2_b0'
    img_size: int = 64

    batch_size: int = 64
    epochs_stage1: int = 3   # warmup: head-only
    epochs_stage2: int = 15  # unfreeze & finetune
    lr: float = 3e-4
    weight_decay: float = 1e-4
    num_workers: int = 2
    seed: int = 42

    pos_oversample: float = 1.5  # >1.0 oversamples positives; 1.0 disables

    out_dir: str = "/content/checkpoints"
    best_name: str = "best.pt"

cfg = Cfg()
Path(cfg.out_dir).mkdir(parents=True, exist_ok=True)
cfg


Cfg(data_dir='/content/wildfire_dataset', train_sub='train', val_sub='val', test_sub='test', use_test=True, pos_name='wildfire', neg_name='other', model_name='mobilenetv3_small_075', img_size=64, batch_size=64, epochs_stage1=3, epochs_stage2=15, lr=0.0003, weight_decay=0.0001, num_workers=2, seed=42, pos_oversample=1.5, out_dir='/content/checkpoints', best_name='best.pt')

## Datasets

### Get Wildfire Dataset

In [None]:
!cp TO BE FILLED IN .
!unzip -q wildfire_dataset.zip

### Get "Places" dataset from Kaggle

In [17]:
!pip install -q kaggle
!mkdir -p ~/.kaggle

!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [18]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mittalshubham/images256")

Using Colab cache for faster access to the 'images256' dataset.


In [19]:
# === Augment negatives from a Kaggle image dataset (drop-in) ===
import os, hashlib, random
from pathlib import Path
from glob import glob

import numpy as np
from PIL import Image, ImageFile, UnidentifiedImageError
from tqdm import tqdm

# ---- Inputs ----
KAGGLE_PATH = Path(path)  # 'path' returned by kagglehub.dataset_download("mittalshubham/images256")
DATA_ROOT   = Path(cfg.data_dir)           # where your train/val/test live (e.g., /content/data)
NEG_FOLDER  = "other"                      # put new negatives directly into 'other' folders
# NEG_FOLDER = "other_kaggle"              # <- alternatively, keep them separate; still mapped to 0 by your loader

SPLIT = (0.80, 0.10, 0.10)                 # train/val/test split
MAX_IMAGES = None                          # limit (e.g., 5000) or None for all

print("here", KAGGLE_PATH)

# ---- Find images in the Kaggle dataset ----
IMG_EXTS = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
all_imgs = []
# for p in tqdm(glob(str(KAGGLE_PATH / "**/*"), recursive=True)):
    # if os.path.splitext(p)[1].lower() in IMG_EXTS and os.path.isfile(p):
        # all_imgs.append(p)
all_imgs = [p for p in glob(str(KAGGLE_PATH / "**/*"), recursive=True)
            if os.path.splitext(p)[1].lower() in IMG_EXTS and os.path.isfile(p)]

print("here2")

if not all_imgs:
    raise RuntimeError(f"No images found under {KAGGLE_PATH}. Check the dataset contents.")

random.seed(cfg.seed)
random.shuffle(all_imgs)
if MAX_IMAGES is not None:
    all_imgs = all_imgs[:MAX_IMAGES]

print(f"Found {len(all_imgs)} candidate negatives in Kaggle dataset.")

all_imgs = random.sample(all_imgs, k=min(10000, len(all_imgs)))
# all_imgs = all_imgs[:10000]

print(f"Chopped to {len(all_imgs)}.")

# ---- Helper: hash -> unique filename; convert PIL->npy (H,W,3, uint8) ----
def file_sha1(p):
    h = hashlib.sha1()
    with open(p, "rb") as f:
        for chunk in iter(lambda: f.read(1<<20), b""):
            h.update(chunk)
    return h.hexdigest()

def pil_to_npy_rgb_safe(path):
    try:
        with Image.open(path) as im:
            im = im.convert("RGB")  # normalize mode
            return np.array(im)     # your existing conversion
    except (UnidentifiedImageError, OSError) as e:
        return None  # skip this file

# ---- Create target dirs ----
(TR, VA, TE) = (DATA_ROOT / "train" / NEG_FOLDER,
                DATA_ROOT / "val"   / NEG_FOLDER,
                DATA_ROOT / "test"  / NEG_FOLDER)

for d in (TR, VA, TE):
    d.mkdir(parents=True, exist_ok=True)

# ---- Split & save ----
n = len(all_imgs)
n_tr = int(n * SPLIT[0])
n_va = int(n * SPLIT[1])
idx_tr = set(range(0, n_tr))
idx_va = set(range(n_tr, n_tr + n_va))
idx_te = set(range(n_tr + n_va, n))

counts = {"train": 0, "val": 0, "test": 0}
dups_skipped = 0

# Dedup across all splits by SHA1 (prevents leakage)
seen = set()

bad = []

for i, p in tqdm(enumerate(all_imgs), total=len(all_imgs)):
    h = file_sha1(p)
    if h in seen:
        dups_skipped += 1
        continue
    seen.add(h)

    arr = pil_to_npy_rgb_safe(p)
    if arr is None:
        bad.append(p)
        continue
    outname = f"{h}.npy"

    if i in idx_tr:
        np.save(TR / outname, arr, allow_pickle=False)
        counts["train"] += 1
    elif i in idx_va:
        np.save(VA / outname, arr, allow_pickle=False)
        counts["val"] += 1
    else:
        np.save(TE / outname, arr, allow_pickle=False)
        counts["test"] += 1

print(f"Saved negatives → train:{counts['train']}  val:{counts['val']}  test:{counts['test']}  (duplicates skipped: {dups_skipped})")
print(f"Train negatives dir: {TR}")
print(f"Val   negatives dir: {VA}")
print(f"Test  negatives dir: {TE}")

print(f"Skipped {len(bad)} corrupt/truncated files")

# --- Reminder about mapping ---
# If you used NEG_FOLDER="other": they're merged into your existing 'other' class.
# If you used NEG_FOLDER="other_kaggle": your current code maps unknown classes to 0 (negative) automatically.


here /kaggle/input/images256


100%|██████████| 247520/247520 [06:22<00:00, 647.22it/s]


here2
Found 247269 candidate negatives in Kaggle dataset.
Chopped to 10000.


100%|██████████| 10000/10000 [01:51<00:00, 89.79it/s]

Saved negatives → train:8000  val:1000  test:1000  (duplicates skipped: 0)
Train negatives dir: /content/wildfire_dataset/train/other
Val   negatives dir: /content/wildfire_dataset/val/other
Test  negatives dir: /content/wildfire_dataset/test/other
Skipped 0 corrupt/truncated files





In [None]:
!zip -r wildfire_dataset_with_places.zip wildfire_dataset
!cp wildfire_dataset_with_places.zip TO BE FILLED IN

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: wildfire_dataset/train/other/325780e9037dbcafb019e8140d3c15b1d7b311ba.npy (deflated 45%)
  adding: wildfire_dataset/train/other/6c51e7c068268bf146cd39dbc2d5068fa428442b.npy (deflated 18%)
  adding: wildfire_dataset/train/other/00a166abce2593ac6e4eaec914701d2655246db5.npy (deflated 28%)
  adding: wildfire_dataset/train/other/0fb0ccdb8c4f7c369bd0bb0af2045d52d9d8a303.npy (deflated 28%)
  adding: wildfire_dataset/train/other/3f8de830aae3eaa6c08d7442ac4de6c2d58d5089.npy (deflated 14%)
  adding: wildfire_dataset/train/other/6dcb7091a7e59d047699145668db857c656db68f.npy (deflated 16%)
  adding: wildfire_dataset/train/other/595d1481eb91631e1a7b5b814417b5e0a0266719.npy (deflated 11%)
  adding: wildfire_dataset/train/other/151c4dd7334f7f357b601a98ba59c6d93bcf50f2.npy (deflated 11%)
  adding: wildfire_dataset/train/other/988a634db0d8cb954e1333837dd2fc96403850b3.npy (deflated 26%)
  adding: wildfire_dataset/train/other/0884e