In [3]:
#!/usr/bin/env python3
from __future__ import annotations

import os


# =========================
# GPU + CPU LIMITS (set before torch/transformers)
# =========================
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"   # physical GPU 2,3 only
N_THREADS = 16
os.environ["OMP_NUM_THREADS"] = str(N_THREADS)
os.environ["MKL_NUM_THREADS"] = str(N_THREADS)
os.environ["OPENBLAS_NUM_THREADS"] = str(N_THREADS)
os.environ["VECLIB_MAXIMUM_THREADS"] = str(N_THREADS)
os.environ["NUMEXPR_NUM_THREADS"] = str(N_THREADS)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re
import json
from pathlib import Path
from typing import Dict, List
import torch
import pandas as pd
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import tqdm

torch.set_num_threads(N_THREADS)

# =========================
# CONFIG
# =========================
YEARS = [2014, 2016, 2018, 2020, 2022, 2024]
YEARS_STR = ", ".join(str(y) for y in YEARS)

BASE_DIR = Path("/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/data")
FOLDER_TPL = "delhi_airshed_y_{y}_z_17_buf_25m"
BBOX_LABELS_DIR = Path(
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/data/"
    "delhi_airshed_y_2025_z_17_buf_25m_symlink/labels"
)
BBOX_YEAR = 2025
BBOX_CSV = Path(
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/"
    "cosmos_reason2_bbox_pixels_2025.csv"
)

MODEL_ID = "nvidia/Cosmos-Reason2-8B"
OUT_CSV = "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/cosmos_reason2_8b_kiln_change_results_delhi_all_loc_bbox_testing.csv"

MAX_NEW_TOKENS = 4096  # Cosmos needs more tokens for chain-of-thought
N_LOCATIONS = 924
LIMIT = 1 # set e.g. 10 for testing
SAVE_PROMPT_IN_CSV = True
TARGET_LOC = None # e.g. "28.604682_77.471200"


# =========================
# PROMPT
# =========================
PROMPT_TEMPLATE = f"""
You are analyzing multi year satellite image chips of the SAME location across years {YEARS_STR}.
Pixel bboxes from 2025 YOLO labels, format: [x_min, y_min, x_max, y_max].
{{BBOX_LINE}}
Use these exact boxes for all years. Do not invent boxes.
If the list is [[0, 0, 0, 0]], treat as negative (no bbox).
Detect brick kiln like structures and track changes.

Return STRICT JSON only. No markdown. No extra text.

Definitions:
presence: true if kiln like structure is present in the location in any year, false otherwise
kiln_present: whether kiln like structure exists in that year
kiln_shape: one of ["circular_oval","oval_round","rectangular_sharp_edge_corners","none"]
kiln_type: one of ["FCBK","CFCBK","Zigzag","none"]

Infer:
presence: true if any year has kiln_present true, otherwise false
appearance_year: first year kiln_present becomes true
type transition: (FCBK or CFCBK) -> Zigzag
shape transition: circular_oval or oval_round -> rectangular_sharp_edge_corners
demolished_year: first year after being present where kiln_present becomes false and stays absent thereafter (best effort)
negative_sample: true if kiln_present is false for all years

Output JSON schema:
{{
  "presence": <true/false>,
  "appearance_year": <int or 0>,
  "appearance_type": "<kiln_type at appearance or 'none'>",
  "type_transition_year_before": <int or 0>,
  "type_transition_year_after": <int or 0>,
  "type_transition_note": "<short>",
  "shape_transition_year_before": <int or 0>,
  "shape_transition_year_after": <int or 0>,
  "shape_transition_note": "<short>",
  "demolished": <true/false>,
  "demolished_year": <int or 0>,
  "negative_sample": <true/false>,
  "monitoring_note_one_line": "<one line summary of evolution over years>",
  "confidence": "<low|medium|high>"
}}

Be conservative. If unsure set type or shape to "unknown". If not present set to "none".
"""

# =========================
# Build year maps
# =========================
def build_year_maps() -> Dict[int, Dict[str, Path]]:
    year_maps: Dict[int, Dict[str, Path]] = {}
    for y in YEARS:
        folder = BASE_DIR / FOLDER_TPL.format(y=y)
        if not folder.exists():
            raise FileNotFoundError(f"Missing folder: {folder}")

        mapping: Dict[str, Path] = {}
        for f in folder.glob("*.png"):
            # expected: 28.208668_77.420208_2014.png
            parts = f.stem.split("_")
            if len(parts) >= 3:
                key = f"{parts[0]}_{parts[1]}"
                mapping[key] = f
        year_maps[y] = mapping
    return year_maps

def build_locations(n_locations: int) -> List[str]:
    year_maps = build_year_maps()
    common = set.intersection(*(set(year_maps[y].keys()) for y in YEARS))
    common = sorted(list(common))
    if len(common) < n_locations:
        raise ValueError(f"Need {n_locations} common locations, found {len(common)}")
    return common[:n_locations]

def load_images_for_loc(loc: str) -> List[Image.Image]:
    year_maps = build_year_maps()
    imgs: List[Image.Image] = []
    for y in YEARS:
        p = year_maps[y][loc]
        imgs.append(Image.open(p).convert("RGB"))
    return imgs

def load_bboxes_for_loc(loc: str) -> List[List[float]]:
    label_path = BBOX_LABELS_DIR / f"{loc}_{BBOX_YEAR}.txt"
    if not label_path.exists():
        return []
    raw = label_path.read_text().strip()
    if not raw:
        return []
    bboxes: List[List[float]] = []
    for line in raw.splitlines():
        parts = line.strip().split()
        if len(parts) < 4:
            continue
        # YOLO format: class x_center y_center width height (normalized)
        try:
            if len(parts) == 4:
                coords = parts[0:4]
            else:
                coords = parts[1:5]
            x_c, y_c, w, h = (float(coords[0]), float(coords[1]), float(coords[2]), float(coords[3]))
        except ValueError:
            continue
        bboxes.append([x_c, y_c, w, h])
    return bboxes

def yolo_to_pixel_bbox(
    bboxes: List[List[float]],
    img_w: int,
    img_h: int,
) -> List[List[float]]:
    pix: List[List[float]] = []
    for x_c, y_c, w, h in bboxes:
        x_min = (x_c - (w / 2.0)) * img_w
        x_max = (x_c + (w / 2.0)) * img_w
        y_min = (y_c - (h / 2.0)) * img_h
        y_max = (y_c + (h / 2.0)) * img_h
        x_min = max(0.0, min(float(img_w), x_min))
        x_max = max(0.0, min(float(img_w), x_max))
        y_min = max(0.0, min(float(img_h), y_min))
        y_max = max(0.0, min(float(img_h), y_max))
        pix.append([x_min, y_min, x_max, y_max])
    return pix

def build_bbox_csv(locs: List[str]) -> None:
    year_maps = build_year_maps()
    map_2025 = year_maps[BBOX_YEAR]
    rows = []
    for loc in locs:
        img_path = map_2025.get(loc)
        if img_path is None:
            continue
        with Image.open(img_path) as im:
            img_w, img_h = im.size
        norm = load_bboxes_for_loc(loc)
        pix = yolo_to_pixel_bbox(norm, img_w, img_h)
        if not pix:
            pix = [[0.0, 0.0, 0.0, 0.0]]
        lat, lon = loc.split("_")
        rows.append({
            "lat": float(lat),
            "lon": float(lon),
            "lat_lon": loc,
            "bbox_json": json.dumps(pix, ensure_ascii=False),
        })
    df = pd.DataFrame(rows)
    df.to_csv(BBOX_CSV, index=False)

def load_bbox_map(locs: List[str]) -> Dict[str, List[List[float]]]:
    if not BBOX_CSV.exists():
        build_bbox_csv(locs)
    df = pd.read_csv(BBOX_CSV)
    if "lat_lon" not in df.columns or "bbox_json" not in df.columns:
        build_bbox_csv(locs)
        df = pd.read_csv(BBOX_CSV)
    missing = set(locs) - set(df["lat_lon"].astype(str).tolist())
    if missing:
        build_bbox_csv(locs)
        df = pd.read_csv(BBOX_CSV)
    bbox_map: Dict[str, List[List[float]]] = {}
    for _, row in df.iterrows():
        loc = str(row["lat_lon"])
        raw = row.get("bbox_json", "[]")
        try:
            parsed = json.loads(raw) if isinstance(raw, str) else raw
        except Exception:
            parsed = [[0.0, 0.0, 0.0, 0.0]]
        if not parsed:
            parsed = [[0.0, 0.0, 0.0, 0.0]]
        bbox_map[loc] = parsed
    return bbox_map

def format_bboxes_for_prompt(loc: str, bbox_map: Dict[str, List[List[float]]]) -> str:
    bboxes = bbox_map.get(loc, [[0.0, 0.0, 0.0, 0.0]])
    bbox_strs = []
    for bb in bboxes:
        if len(bb) != 4:
            continue
        bbox_strs.append("[" + ", ".join(f"{v:.2f}" for v in bb) + "]")
    if not bbox_strs:
        bbox_strs = ["[0.00, 0.00, 0.00, 0.00]"]
    return "FOCUS ONLY on the region defined by bounding box: [" + ", ".join(bbox_strs) + "]"

# =========================
# JSON parse (robust)
# =========================
def extract_json(text: str):
    text = (text or "").strip()
    # Cosmos may output <think>...</think> reasoning, extract JSON after it
    think_match = re.search(r"</think>\s*(.+)", text, flags=re.DOTALL)
    if think_match:
        text = think_match.group(1).strip()

    try:
        return json.loads(text)
    except Exception:
        pass
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not m:
        return None
    cand = m.group(0)
    try:
        return json.loads(cand)
    except Exception:
        cand2 = re.sub(r"\s+", " ", cand).strip()
        try:
            return json.loads(cand2)
        except Exception:
            return None

# =========================
# Consistency flags (same logic)
# =========================
def _as_bool(v):
    if isinstance(v, bool):
        return v
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return bool(v)
    s = str(v).strip().lower()
    return s in {"true", "1", "yes", "present"}

def presence_sequence_from_output(data):
    roi = data.get("roi_state_by_year", {}) or {}
    seq = []
    for y in YEARS:
        st = roi.get(str(y), {}) or {}
        seq.append(_as_bool(st.get("kiln_present", False)))
    return seq

def has_inconsistent_presence(seq):
    seen_present = False
    seen_absent_after_present = False
    for v in seq:
        if v and not seen_present:
            seen_present = True
        elif (not v) and seen_present:
            seen_absent_after_present = True
        elif v and seen_absent_after_present:
            return True
    return False

def confidence_to_score(conf):
    c = str(conf).strip().lower()
    if c == "high": return 0
    if c == "medium": return 1
    if c == "low": return 2
    return 1

def review_priority(confidence, inconsistent):
    return confidence_to_score(confidence) + (2 if inconsistent else 0)

def safe_int(x):
    x = pd.to_numeric(x, errors="coerce")
    if pd.isna(x):
        return 0
    return int(x)

# =========================
# Model load
# =========================
if torch.cuda.device_count() < 1:
    raise RuntimeError("No visible CUDA GPUs. Check CUDA_VISIBLE_DEVICES.")

model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="sdpa",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# =========================
# Run one location with images
# =========================
def run_one_location(loc: str, bbox_map: Dict[str, List[List[float]]]):
    imgs = load_images_for_loc(loc)
    bbox_text = format_bboxes_for_prompt(loc, bbox_map)
    prompt = PROMPT_TEMPLATE.replace("{BBOX_LINE}", bbox_text)

    content = []
    for img in imgs:
        content.append({"type": "image", "image": img})
    content.append({"type": "text", "text": prompt})

    messages = [{"role": "user", "content": content}]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    with torch.inference_mode():
        out_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out_ids)]
    text = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    data = extract_json(text)
    return text, data, prompt

# =========================
# Main
# =========================
if TARGET_LOC is not None:
    year_maps = build_year_maps()
    common = set.intersection(*(set(year_maps[y].keys()) for y in YEARS))
    if TARGET_LOC not in common:
        raise ValueError(f"TARGET_LOC not found in common locations: {TARGET_LOC}")
    locs = [TARGET_LOC]
else:
    locs = build_locations(N_LOCATIONS)
    if LIMIT is not None:
        locs = locs[:LIMIT]

bbox_map = load_bbox_map(locs)

rows = []
for loc in tqdm.tqdm(locs, desc="Cosmos-Reason2-8B locations"):
    lat, lon = loc.split("_")
    try:
        raw, data, prompt_text = run_one_location(loc, bbox_map)

        if data is None:
            rows.append({
                "lat": float(lat), "lon": float(lon), "lat_lon": loc,
                "status": "parse_fail", "raw_output": raw,
                **({"prompt_text": prompt_text} if SAVE_PROMPT_IN_CSV else {}),
            })
            continue

        seq = presence_sequence_from_output(data)
        inconsistent = has_inconsistent_presence(seq)

        row = {
            "lat": float(lat),
            "lon": float(lon),
            "lat_lon": loc,
            **({"prompt_text": prompt_text} if SAVE_PROMPT_IN_CSV else {}),
            "presence": bool(data.get("presence", False)),
            "appearance_year": safe_int(data.get("appearance_year", 0)),
            "appearance_type": data.get("appearance_type", "none"),

            "type_transition_year_before": safe_int(data.get("type_transition_year_before", 0)),
            "type_transition_year_after": safe_int(data.get("type_transition_year_after", 0)),
            "type_transition_note": data.get("type_transition_note", ""),

            "shape_transition_year_before": safe_int(data.get("shape_transition_year_before", 0)),
            "shape_transition_year_after": safe_int(data.get("shape_transition_year_after", 0)),
            "shape_transition_note": data.get("shape_transition_note", ""),

            "demolished": bool(data.get("demolished", False)),
            "demolished_year": safe_int(data.get("demolished_year", 0)),
            "negative_sample": bool(data.get("negative_sample", False)),

            "confidence": data.get("confidence", "unknown"),
            "inconsistent_presence": bool(inconsistent),
            "review_priority_score": int(review_priority(data.get("confidence", "unknown"), inconsistent)),

            # "presence_seq": json.dumps({str(YEARS[i]): bool(seq[i]) for i in range(len(YEARS))}, ensure_ascii=False),
            "monitoring_note_one_line": data.get("monitoring_note_one_line", ""),

            "raw_output": json.dumps(data, ensure_ascii=False),
            "status": "ok",
        }
        rows.append(row)

    except Exception as e:
        rows.append({
            "lat": float(lat), "lon": float(lon), "lat_lon": loc,
            "status": "exception", "error": repr(e), "raw_output": "",
            **({"prompt_text": ""} if SAVE_PROMPT_IN_CSV else {}),
        })

df = pd.DataFrame(rows).sort_values(["review_priority_score"], ascending=False)
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_CSV)
print(df["status"].value_counts(dropna=False))


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/750 [00:00<?, ?it/s]

Cosmos-Reason2-8B locations: 100%|██████████| 1/1 [00:07<00:00,  7.88s/it]

Saved: /home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/cosmos_reason2_8b_kiln_change_results_delhi_all_loc_bbox_testing.csv
status
ok    1
Name: count, dtype: int64





In [3]:
#!/usr/bin/env python3
from __future__ import annotations

import os


# =========================
# GPU + CPU LIMITS (set before torch/transformers)
# =========================
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"   # physical GPU 2,3 only
N_THREADS = 16
os.environ["OMP_NUM_THREADS"] = str(N_THREADS)
os.environ["MKL_NUM_THREADS"] = str(N_THREADS)
os.environ["OPENBLAS_NUM_THREADS"] = str(N_THREADS)
os.environ["VECLIB_MAXIMUM_THREADS"] = str(N_THREADS)
os.environ["NUMEXPR_NUM_THREADS"] = str(N_THREADS)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re
import json
from pathlib import Path
from typing import Dict, List
import torch
import pandas as pd
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import tqdm
import cv2
import numpy as np
torch.set_num_threads(N_THREADS)

# =========================
# CONFIG
# =========================
YEARS = [2014, 2016, 2018, 2020, 2022, 2024]
YEARS_STR = ", ".join(str(y) for y in YEARS)

BASE_DIR = Path("/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/data")
FOLDER_TPL = "delhi_airshed_y_{y}_z_17_buf_25m"
BBOX_CSV = Path(
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/"
    "qwen3_30b_bbox_pixels_2025.csv"
)

# Video cache directory
VIDEO_CACHE_DIR = Path("/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/video_cache")
VIDEO_CACHE_DIR.mkdir(parents=True, exist_ok=True)

MODEL_ID = "nvidia/Cosmos-Reason2-8B"
OUT_CSV = "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/cosmos_reason2_8b_kiln_change_results_delhi_video_bbox.csv"

MAX_NEW_TOKENS = 4096  # Cosmos needs more tokens for chain-of-thought
N_LOCATIONS = 924
LIMIT = None # set e.g. 10 for testing
SAVE_PROMPT_IN_CSV = False
TARGET_LOC = None # e.g. "28.604682_77.471200"

# Video settings - 1 fps means model sees each year-frame with equal temporal spacing
VIDEO_FPS = 1.0  # 1 frame per second = 1 year per second in video
VIDEO_CODEC = "mp4v"  # or "avc1" for H.264


# =========================
# PROMPT
# =========================
PROMPT_TEMPLATE = f"""
You are analyzing multi year satellite image chips of the SAME location across years {YEARS_STR}.
The video shows temporal progression with 1 frame per year.
Pixel bboxes from labels, format: [x_min, y_min, x_max, y_max].
{{BBOX_LINE}}
Use these exact boxes for all years. Do not invent boxes.
If the list is [[0, 0, 0, 0]], treat as negative (no bbox).
Detect brick kiln like structures and track changes over time.

Return STRICT JSON only. No markdown. No extra text.

Definitions:
presence: true if kiln like structure is present in the location in any year, false otherwise
kiln_present: whether kiln like structure exists in that year
kiln_shape: one of ["circular_oval","oval_round","rectangular_sharp_edge_corners","none"]
kiln_type: one of ["FCBK","CFCBK","Zigzag","none"]

Infer:
presence: true if any year has kiln_present true, otherwise false
appearance_year: first year kiln_present becomes true
type transition: (FCBK or CFCBK) -> Zigzag
shape transition: circular_oval or oval_round -> rectangular_sharp_edge_corners
demolished_year: first year after being present where kiln_present becomes false and stays absent thereafter (best effort)
negative_sample: true if kiln_present is false for all years

Output JSON schema:
{{
  "presence": <true/false>,
  "appearance_year": <int or 0>,
  "appearance_type": "<kiln_type at appearance or 'none'>",
  "type_transition_year_before": <int or 0>,
  "type_transition_year_after": <int or 0>,
  "type_transition_note": "<short>",
  "shape_transition_year_before": <int or 0>,
  "shape_transition_year_after": <int or 0>,
  "shape_transition_note": "<short>",
  "demolished": <true/false>,
  "demolished_year": <int or 0>,
  "negative_sample": <true/false>,
  "monitoring_note_one_line": "<one line summary of evolution over years>",
  "confidence": "<low|medium|high>"
}}

Be conservative. If unsure set type or shape to "unknown". If not present set to "none".
"""

# =========================
# Build year maps
# =========================
def build_year_maps() -> Dict[int, Dict[str, Path]]:
    year_maps: Dict[int, Dict[str, Path]] = {}
    for y in YEARS:
        folder = BASE_DIR / FOLDER_TPL.format(y=y)
        if not folder.exists():
            raise FileNotFoundError(f"Missing folder: {folder}")

        mapping: Dict[str, Path] = {}
        for f in folder.glob("*.png"):
            # expected: 28.208668_77.420208_2014.png
            parts = f.stem.split("_")
            if len(parts) >= 3:
                key = f"{parts[0]}_{parts[1]}"
                mapping[key] = f
        year_maps[y] = mapping
    return year_maps

def build_locations(n_locations: int) -> List[str]:
    year_maps = build_year_maps()
    common = set.intersection(*(set(year_maps[y].keys()) for y in YEARS))
    common = sorted(list(common))
    if len(common) < n_locations:
        raise ValueError(f"Need {n_locations} common locations, found {len(common)}")
    return common[:n_locations]

def load_images_for_loc(loc: str) -> List[Image.Image]:
    year_maps = build_year_maps()
    imgs: List[Image.Image] = []
    for y in YEARS:
        p = year_maps[y][loc]
        imgs.append(Image.open(p).convert("RGB"))
    return imgs

def create_video_from_images(loc: str, imgs: List[Image.Image]) -> Path:
    """Create a video file from image sequence, with caching."""
    # Check cache first
    video_path = VIDEO_CACHE_DIR / f"{loc}.mp4"
    if video_path.exists():
        return video_path

    # Get image dimensions from first image
    img_array = np.array(imgs[0])
    height, width, _ = img_array.shape

    # Create video writer
    fourcc = cv2.VideoWriter_fourcc(*VIDEO_CODEC)
    video_writer = cv2.VideoWriter(
        str(video_path),
        fourcc,
        VIDEO_FPS,
        (width, height)
    )

    # Write frames
    for img in imgs:
        # Convert PIL to numpy array and BGR (OpenCV format)
        frame = np.array(img)
        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        video_writer.write(frame_bgr)

    video_writer.release()
    return video_path

def load_bbox_map(locs: List[str]) -> Dict[str, List[List[float]]]:
    """Load bounding boxes directly from existing CSV (already in pixel coordinates)."""
    if not BBOX_CSV.exists():
        raise FileNotFoundError(f"Bbox CSV not found: {BBOX_CSV}")

    df = pd.read_csv(BBOX_CSV)
    if "lat_lon" not in df.columns or "bbox_json" not in df.columns:
        raise ValueError(f"Bbox CSV missing required columns: {BBOX_CSV}")

    bbox_map: Dict[str, List[List[float]]] = {}
    for _, row in df.iterrows():
        loc = str(row["lat_lon"])
        raw = row.get("bbox_json", "[]")
        try:
            parsed = json.loads(raw) if isinstance(raw, str) else raw
        except Exception:
            parsed = [[0.0, 0.0, 0.0, 0.0]]
        if not parsed:
            parsed = [[0.0, 0.0, 0.0, 0.0]]
        bbox_map[loc] = parsed

    # Check if all requested locations have bboxes
    missing = set(locs) - set(bbox_map.keys())
    if missing:
        print(f"Warning: {len(missing)} locations missing from bbox CSV")
        # Add empty bboxes for missing locations
        for loc in missing:
            bbox_map[loc] = [[0.0, 0.0, 0.0, 0.0]]

    return bbox_map

def format_bboxes_for_prompt(loc: str, bbox_map: Dict[str, List[List[float]]]) -> str:
    bboxes = bbox_map.get(loc, [[0.0, 0.0, 0.0, 0.0]])
    bbox_strs = []
    for bb in bboxes:
        if len(bb) != 4:
            continue
        bbox_strs.append("[" + ", ".join(f"{v:.2f}" for v in bb) + "]")
    if not bbox_strs:
        bbox_strs = ["[0.00, 0.00, 0.00, 0.00]"]
    return "FOCUS ONLY on the region defined by bounding box: [" + ", ".join(bbox_strs) + "]"

# =========================
# JSON parse (robust)
# =========================
def extract_json(text: str):
    text = (text or "").strip()
    # Cosmos may output <think>...</think> reasoning, extract JSON after it
    think_match = re.search(r"</think>\s*(.+)", text, flags=re.DOTALL)
    if think_match:
        text = think_match.group(1).strip()

    try:
        return json.loads(text)
    except Exception:
        pass
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not m:
        return None
    cand = m.group(0)
    try:
        return json.loads(cand)
    except Exception:
        cand2 = re.sub(r"\s+", " ", cand).strip()
        try:
            return json.loads(cand2)
        except Exception:
            return None

# =========================
# Consistency flags (same logic)
# =========================
def _as_bool(v):
    if isinstance(v, bool):
        return v
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return bool(v)
    s = str(v).strip().lower()
    return s in {"true", "1", "yes", "present"}

def presence_sequence_from_output(data):
    roi = data.get("roi_state_by_year", {}) or {}
    seq = []
    for y in YEARS:
        st = roi.get(str(y), {}) or {}
        seq.append(_as_bool(st.get("kiln_present", False)))
    return seq

def has_inconsistent_presence(seq):
    seen_present = False
    seen_absent_after_present = False
    for v in seq:
        if v and not seen_present:
            seen_present = True
        elif (not v) and seen_present:
            seen_absent_after_present = True
        elif v and seen_absent_after_present:
            return True
    return False

def confidence_to_score(conf):
    c = str(conf).strip().lower()
    if c == "high": return 0
    if c == "medium": return 1
    if c == "low": return 2
    return 1

def review_priority(confidence, inconsistent):
    return confidence_to_score(confidence) + (2 if inconsistent else 0)

def safe_int(x):
    x = pd.to_numeric(x, errors="coerce")
    if pd.isna(x):
        return 0
    return int(x)

# =========================
# Model load
# =========================
if torch.cuda.device_count() < 1:
    raise RuntimeError("No visible CUDA GPUs. Check CUDA_VISIBLE_DEVICES.")

model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="sdpa",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# =========================
# Run one location with video
# =========================
def run_one_location(loc: str, bbox_map: Dict[str, List[List[float]]]):
    imgs = load_images_for_loc(loc)
    video_path = create_video_from_images(loc, imgs)

    bbox_text = format_bboxes_for_prompt(loc, bbox_map)
    prompt = PROMPT_TEMPLATE.replace("{BBOX_LINE}", bbox_text)

    # Use video format for Cosmos temporal reasoning
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": f"file://{video_path.resolve()}",
                    "fps": VIDEO_FPS,
                },
                {
                    "type": "text",
                    "text": prompt
                },
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        fps=VIDEO_FPS,
    )
    inputs = inputs.to(model.device)

    with torch.inference_mode():
        out_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out_ids)]
    text = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    data = extract_json(text)
    return text, data, prompt

# =========================
# Main
# =========================
if TARGET_LOC is not None:
    year_maps = build_year_maps()
    common = set.intersection(*(set(year_maps[y].keys()) for y in YEARS))
    if TARGET_LOC not in common:
        raise ValueError(f"TARGET_LOC not found in common locations: {TARGET_LOC}")
    locs = [TARGET_LOC]
else:
    locs = build_locations(N_LOCATIONS)
    if LIMIT is not None:
        locs = locs[:LIMIT]

bbox_map = load_bbox_map(locs)

rows = []
for loc in tqdm.tqdm(locs, desc="Cosmos-Reason2-8B video locations"):
    lat, lon = loc.split("_")
    try:
        raw, data, prompt_text = run_one_location(loc, bbox_map)

        if data is None:
            rows.append({
                "lat": float(lat), "lon": float(lon), "lat_lon": loc,
                "status": "parse_fail", "raw_output": raw,
                **({"prompt_text": prompt_text} if SAVE_PROMPT_IN_CSV else {}),
            })
            continue

        seq = presence_sequence_from_output(data)
        inconsistent = has_inconsistent_presence(seq)

        row = {
            "lat": float(lat),
            "lon": float(lon),
            "lat_lon": loc,
            **({"prompt_text": prompt_text} if SAVE_PROMPT_IN_CSV else {}),
            "presence": bool(data.get("presence", False)),
            "appearance_year": safe_int(data.get("appearance_year", 0)),
            "appearance_type": data.get("appearance_type", "none"),

            "type_transition_year_before": safe_int(data.get("type_transition_year_before", 0)),
            "type_transition_year_after": safe_int(data.get("type_transition_year_after", 0)),
            "type_transition_note": data.get("type_transition_note", ""),

            "shape_transition_year_before": safe_int(data.get("shape_transition_year_before", 0)),
            "shape_transition_year_after": safe_int(data.get("shape_transition_year_after", 0)),
            "shape_transition_note": data.get("shape_transition_note", ""),

            "demolished": bool(data.get("demolished", False)),
            "demolished_year": safe_int(data.get("demolished_year", 0)),
            "negative_sample": bool(data.get("negative_sample", False)),

            "confidence": data.get("confidence", "unknown"),
            "inconsistent_presence": bool(inconsistent),
            "review_priority_score": int(review_priority(data.get("confidence", "unknown"), inconsistent)),

            # "presence_seq": json.dumps({str(YEARS[i]): bool(seq[i]) for i in range(len(YEARS))}, ensure_ascii=False),
            "monitoring_note_one_line": data.get("monitoring_note_one_line", ""),

            "raw_output": json.dumps(data, ensure_ascii=False),
            "status": "ok",
        }
        rows.append(row)

    except Exception as e:
        rows.append({
            "lat": float(lat), "lon": float(lon), "lat_lon": loc,
            "status": "exception", "error": repr(e), "raw_output": "",
            **({"prompt_text": ""} if SAVE_PROMPT_IN_CSV else {}),
        })

df = pd.DataFrame(rows).sort_values(["review_priority_score"], ascending=False)
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_CSV)
print(df["status"].value_counts(dropna=False))
print(f"\nVideo cache directory: {VIDEO_CACHE_DIR}")
print(f"Cached videos can be reused for future runs")


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/750 [00:00<?, ?it/s]

Cosmos-Reason2-8B video locations: 100%|██████████| 924/924 [00:50<00:00, 18.32it/s]


KeyError: 'review_priority_score'