In [None]:
# Directory that holds the per-modality folders ("color", "depth_color",
# "depth_raw") scanned by main().
ROOT_DIR     = r".\Nutrition5K\Nutrition5K\train"
# Label CSV; load_labels() reads its "ID" and "Value" columns.
LABELS_CSV   = r".\Nutrition5K\Nutrition5K\nutrition5k_train.csv"
# Destination of the index CSV written by main().
OUT_CSV      = "./train_index.csv"

from pathlib import Path
import csv

# File suffixes (lower-case, including the dot) that are accepted as images.
IMG_EXTS = {".png"}

def norm_id(x: str) -> str:
    """Return the canonical ``dish_<id>`` form of a raw identifier.

    Surrounding whitespace is removed first. A value that already begins
    with ``dish_`` is returned as-is; a purely numeric value is left-padded
    with zeros to four characters before the prefix is added; anything else
    simply receives the prefix.
    """
    token = x.strip()
    if token.startswith("dish_"):
        return token
    if token.isdigit():
        token = token.zfill(4)
    return "dish_" + token

def first_image(p: Path) -> str:
    """Return the path of the first image file found directly inside ``p``.

    Entries are visited in sorted order; the first regular file whose
    lower-cased suffix is listed in ``IMG_EXTS`` is returned as a string.
    An empty string is returned when ``p`` is not a directory or contains
    no matching file.
    """
    if p.is_dir():
        for entry in sorted(p.iterdir()):
            if not entry.is_file():
                continue
            if entry.suffix.lower() in IMG_EXTS:
                return str(entry)
    return ""

def pick_with_basename(dirpath: Path, basename: str) -> str:
    """Pick the image named ``basename`` in ``dirpath``, else any image.

    For each suffix in ``IMG_EXTS`` the file ``<basename><suffix>`` is
    looked up; the first one that exists is returned as a string. If none
    exists the result of ``first_image(dirpath)`` is returned instead.
    A ``dirpath`` that is not a directory yields an empty string.
    """
    if dirpath.is_dir():
        candidates = (dirpath.joinpath(f"{basename}{ext}") for ext in IMG_EXTS)
        hit = next((c for c in candidates if c.is_file()), None)
        return str(hit) if hit is not None else first_image(dirpath)
    return ""

def load_labels(csv_path: Path) -> dict:
    """Read the label CSV into a ``{dish_id: label_text}`` mapping.

    The file is parsed with ``csv.DictReader``; each row contributes its
    ``"ID"`` column (normalised through ``norm_id``) as key and its
    ``"Value"`` column, converted to ``str`` and stripped, as value.
    A later row with the same normalised id overwrites an earlier one.
    """
    mapping = {}
    with csv_path.open("r", newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            value = str(record["Value"]).strip()
            mapping[norm_id(record["ID"])] = value
    return mapping

def _is_zero_label(s: str) -> bool:
    try:
        return float(s.strip()) == 0.0
    except Exception:
        return False

# 新增：阈值过滤（>1500）
def _is_over_threshold(s: str, thr: float = 1500.0) -> bool:
    try:
        return float(s.strip()) > thr
    except Exception:
        # 非法/空值直接不过滤（由其它逻辑决定是否保留）
        return False

def main():
    """Build the training index CSV, dropping zero and over-threshold labels.

    Dish ids are gathered from the sub-directories of ``color``,
    ``depth_color`` and ``depth_raw`` under ``ROOT_DIR`` and merged with the
    ids found in ``LABELS_CSV``. Dishes whose label parses to exactly 0, or
    to a value above the default threshold of ``_is_over_threshold``, are
    skipped and counted. Labels that are empty or not numeric are kept
    unchanged. Each remaining dish yields one row with its label text and
    one image path per modality (an empty string when no image is found).
    The rows are written to ``OUT_CSV`` and a summary line is printed.
    """
    root = Path(ROOT_DIR)
    color_root = root / "color"
    dcolor_root = root / "depth_color"
    draw_root   = root / "depth_raw"

    # Every directory name under any modality root counts as a dish id.
    dish_ids = set()
    for p in (color_root, dcolor_root, draw_root):
        if p.is_dir():
            for d in p.iterdir():
                if d.is_dir():
                    dish_ids.add(d.name)

    labels = load_labels(Path(LABELS_CSV))

    all_ids = sorted(dish_ids | set(labels.keys()))
    rows = []
    skipped_zero = 0
    skipped_over = 0

    for did in all_ids:
        lbl = labels.get(did, "")

        # Apply the filters that the summary line reports on. Previously
        # both counters were printed but never updated and no row was
        # dropped, so the helpers above were dead code.
        if _is_zero_label(lbl):
            skipped_zero += 1
            continue
        if _is_over_threshold(lbl):
            skipped_over += 1
            continue

        rgb_dir = color_root / did
        dc_dir  = dcolor_root / did
        dr_dir  = draw_root / did

        rgb = pick_with_basename(rgb_dir, "rgb")
        dcp = pick_with_basename(dc_dir, "depth_color")
        drp = pick_with_basename(dr_dir, "depth_raw")

        rows.append({
            "dish_id": did,
            "label": lbl,
            "rgb_path": rgb,
            "depth_color_path": dcp,
            "depth_raw_path": drp
        })

    outp = Path(OUT_CSV)
    # Make sure the destination folder exists before opening the file.
    outp.parent.mkdir(parents=True, exist_ok=True)
    with outp.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["dish_id","label","rgb_path","depth_color_path","depth_raw_path"])
        w.writeheader()
        w.writerows(rows)

    print(f"[OK] wrote {len(rows)} rows -> {outp} "
          f"(skipped label==0: {skipped_zero}, skipped >1500: {skipped_over})")

# Build the index only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


[OK] wrote 3086 rows -> train_index.csv (skipped label==0: 214, skipped >1500: 1)


In [1]:
# -*- coding: utf-8 -*-
# Minimal version: zero labels are kept, no log transform is applied and no
# maximum-value filter is used; only the checks needed to avoid crashes run.

# Directory that holds the per-modality folders ("color", "depth_color",
# "depth_raw") scanned by main().
ROOT_DIR   = r".\Nutrition5K\Nutrition5K\train"
# Label CSV; load_labels() reads its "ID" and "Value" columns.
LABELS_CSV = r".\Nutrition5K\Nutrition5K\nutrition5k_train.csv"
# Destination of the index CSV written by main().
OUT_CSV    = "./train_index_0.csv"

from pathlib import Path
import csv

# File suffixes (lower-case, including the dot) that are accepted as images.
IMG_EXTS = {".png"}

def norm_id(x: str) -> str:
    """Normalise a raw identifier to the ``dish_<id>`` naming scheme.

    The input is stripped. Identifiers already starting with ``dish_`` are
    returned unchanged; all-digit identifiers are zero-padded to a width of
    four and prefixed; any other text is only prefixed.
    """
    cleaned = x.strip()
    if not cleaned.startswith("dish_"):
        body = cleaned.zfill(4) if cleaned.isdigit() else cleaned
        cleaned = "dish_" + body
    return cleaned

def first_image(p: Path) -> str:
    """Return the first image file directly inside ``p`` as a string.

    Directory entries are considered in sorted order and must be regular
    files whose lower-cased suffix is in ``IMG_EXTS``. The result is ``""``
    when ``p`` is not a directory or nothing matches.
    """
    if not p.is_dir():
        return ""
    matches = (
        str(child)
        for child in sorted(p.iterdir())
        if child.is_file() and child.suffix.lower() in IMG_EXTS
    )
    return next(matches, "")

def pick_with_basename(dirpath: Path, basename: str) -> str:
    """Return ``<basename><ext>`` from ``dirpath`` if present, else any image.

    Every suffix in ``IMG_EXTS`` is tried; the first existing file is
    returned as a string. When no such file exists the function falls back
    to ``first_image(dirpath)``. ``""`` is returned if ``dirpath`` is not a
    directory.
    """
    if not dirpath.is_dir():
        return ""
    preferred = ""
    for ext in IMG_EXTS:
        candidate = dirpath / f"{basename}{ext}"
        if candidate.is_file():
            preferred = str(candidate)
            break
    else:
        # No preferred file name matched; fall back to any image present.
        preferred = first_image(dirpath)
    return preferred

def load_labels(csv_path: Path) -> dict:
    """Load ``{normalised dish id: stripped label text}`` from a CSV file.

    Rows are read with ``csv.DictReader``. The ``"ID"`` column is passed
    through ``norm_id`` to form the key, and the ``"Value"`` column is
    converted to ``str`` and stripped to form the value. When an id occurs
    more than once the last row wins.
    """
    labels_by_id = {}
    with csv_path.open("r", newline="", encoding="utf-8") as src:
        reader = csv.DictReader(src)
        for entry in reader:
            text = str(entry["Value"]).strip()
            key = norm_id(entry["ID"])
            labels_by_id[key] = text
    return labels_by_id

def _to_float_or_none(s: str):
    try:
        return float(s.strip())
    except Exception:
        return None

def main():
    """Write the unfiltered index: dishes with a numeric label and an RGB image.

    Dish ids come from the sub-directories of ``color``, ``depth_color`` and
    ``depth_raw`` under ``ROOT_DIR`` plus the ids in ``LABELS_CSV``. A dish
    is skipped (and counted) when its label cannot be parsed as a float or
    when no RGB image is found. Zero labels are kept, and no log transform
    or maximum-value filter is applied. The depth paths may be empty
    strings. Rows go to ``OUT_CSV`` and a summary line is printed.
    """
    base = Path(ROOT_DIR)
    color_root = base / "color"
    dcolor_root = base / "depth_color"
    draw_root = base / "depth_raw"

    # Collect every dish id that has a folder under at least one modality.
    dish_ids = {
        sub.name
        for modality_root in (color_root, dcolor_root, draw_root)
        if modality_root.is_dir()
        for sub in modality_root.iterdir()
        if sub.is_dir()
    }

    labels = load_labels(Path(LABELS_CSV))

    rows = []
    skipped_missing_label = 0
    skipped_missing_rgb = 0

    # Ids known only from the label file are considered as well.
    for did in sorted(dish_ids.union(labels)):
        # 1) The label must parse as a number (0 is allowed).
        y = _to_float_or_none(labels.get(did, ""))
        if y is None:
            skipped_missing_label += 1
            continue

        # 2) An RGB image is mandatory; the depth images are optional.
        rgb = pick_with_basename(color_root / did, "rgb")
        if not rgb:
            skipped_missing_rgb += 1
            continue

        record = {
            "dish_id": did,
            "label": f"{y}",  # written in normalised float form
            "rgb_path": rgb,
            "depth_color_path": pick_with_basename(dcolor_root / did, "depth_color"),  # may be ""
            "depth_raw_path": pick_with_basename(draw_root / did, "depth_raw"),  # may be ""
        }
        rows.append(record)

    outp = Path(OUT_CSV)
    outp.parent.mkdir(parents=True, exist_ok=True)
    columns = ["dish_id", "label", "rgb_path", "depth_color_path", "depth_raw_path"]
    with outp.open("w", newline="", encoding="utf-8") as sink:
        writer = csv.DictWriter(sink, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

    summary = (
        f"[OK] wrote {len(rows)} rows -> {outp} | "
        f"skipped missing_label: {skipped_missing_label}, "
        f"missing_rgb: {skipped_missing_rgb}"
    )
    print(summary)

# Build the index only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


[OK] wrote 3301 rows -> train_index_0.csv | skipped missing_label: 0, missing_rgb: 0


In [2]:
# Directory that holds the per-modality folders ("color", "depth_color",
# "depth_raw") scanned by main().
ROOT_DIR     = r".\Nutrition5K\Nutrition5K\train"
# Label CSV; load_labels() reads its "ID" and "Value" columns.
LABELS_CSV   = r".\Nutrition5K\Nutrition5K\nutrition5k_train.csv"
OUT_CSV      = "./train_index_log.csv"  # renamed: index for the log-label variant

from pathlib import Path
import csv, math

# File suffixes (lower-case, including the dot) that are accepted as images.
IMG_EXTS = {".png"}

def norm_id(x: str) -> str:
    """Convert a raw identifier into its ``dish_<id>`` form.

    Whitespace around the input is dropped. Text that already starts with
    ``dish_`` passes through; digit-only text is zero-filled to four
    characters and prefixed; other text is prefixed without padding.
    """
    raw = x.strip()
    already_prefixed = raw.startswith("dish_")
    if already_prefixed:
        return raw
    suffix = raw.zfill(4) if raw.isdigit() else raw
    return f"dish_{suffix}"

def first_image(p: Path) -> str:
    """Find the first image file located directly in directory ``p``.

    Candidates are the regular files of ``p`` in sorted order; one is
    accepted when its lower-cased suffix belongs to ``IMG_EXTS``. Returns
    the accepted path as a string, or ``""`` if ``p`` is not a directory or
    no candidate is accepted.
    """
    found = ""
    if p.is_dir():
        for item in sorted(p.iterdir()):
            if item.is_file() and item.suffix.lower() in IMG_EXTS:
                found = str(item)
                break
    return found

def pick_with_basename(dirpath: Path, basename: str) -> str:
    """Choose an image from ``dirpath``, preferring the one named ``basename``.

    The file ``<basename><ext>`` is checked for each ``ext`` in
    ``IMG_EXTS`` and returned as a string as soon as one exists. Otherwise
    the value of ``first_image(dirpath)`` is returned. A ``dirpath`` that is
    not a directory gives ``""``.
    """
    if dirpath.is_dir():
        candidates = (dirpath.joinpath(f"{basename}{ext}") for ext in IMG_EXTS)
        existing = [str(c) for c in candidates if c.is_file()][:1]
        return existing[0] if existing else first_image(dirpath)
    return ""

def load_labels(csv_path: Path) -> dict:
    """Parse the label CSV into a dictionary keyed by normalised dish id.

    Each ``csv.DictReader`` row maps ``norm_id(row["ID"])`` to the stripped
    string form of ``row["Value"]``. If the same id appears several times,
    the value from the last row is kept.
    """
    result = {}
    with csv_path.open("r", newline="", encoding="utf-8") as fh:
        for line in csv.DictReader(fh):
            label_text = str(line["Value"]).strip()
            result[norm_id(line["ID"])] = label_text
    return result

def _parse_float(s: str):
    try:
        return float(s.strip())
    except Exception:
        return None

def _is_zero_label(s: str) -> bool:
    try:
        return float(s.strip()) == 0.0
    except Exception:
        return False

# 线性域阈值过滤（>1500）
def _is_over_threshold(s: str, thr: float = 1500.0) -> bool:
    try:
        return float(s.strip()) > thr
    except Exception:
        # 非法/空值在其它逻辑里处理
        return False

def main():
    """Build the log-label index CSV.

    Dish ids are gathered from the sub-directories of ``color``,
    ``depth_color`` and ``depth_raw`` under ``ROOT_DIR`` and merged with the
    ids found in ``LABELS_CSV``. A dish is skipped (and counted) when its
    label is unparseable or NaN, equals 0, is negative, or exceeds 1500.
    For every remaining dish the natural logarithm of the label is written
    with six decimals, together with one image path per modality (an empty
    string when no image is found). Rows go to ``OUT_CSV`` and a summary
    line is printed.
    """
    root = Path(ROOT_DIR)
    color_root = root / "color"
    dcolor_root = root / "depth_color"
    draw_root   = root / "depth_raw"

    # Every directory name under any modality root counts as a dish id.
    dish_ids = set()
    for p in (color_root, dcolor_root, draw_root):
        if p.is_dir():
            for d in p.iterdir():
                if d.is_dir():
                    dish_ids.add(d.name)

    labels = load_labels(Path(LABELS_CSV))

    all_ids = sorted(dish_ids | set(labels.keys()))
    rows = []
    skipped_zero = 0
    skipped_over = 0
    skipped_invalid = 0   # unparseable, empty or NaN
    skipped_nonpos = 0    # negative values (exact zeros are counted separately)

    for did in all_ids:
        lbl_s = labels.get(did, "")
        v = _parse_float(lbl_s)

        # Skip labels that cannot be parsed as a float. NaN is treated the
        # same way: it parses successfully but fails every comparison below,
        # so it would otherwise reach math.log and be written out as "nan".
        if v is None or math.isnan(v):
            skipped_invalid += 1
            continue

        # Skip label == 0 (kept from the earlier logic)
        if v == 0.0:
            skipped_zero += 1
            continue

        # Skip label <= 0 (the logarithm is undefined there)
        if v <= 0.0:
            skipped_nonpos += 1
            continue

        # Skip label > 1500 (threshold in the linear domain)
        if v > 1500.0:
            skipped_over += 1
            continue

        # ---- take the natural log ----
        lbl_log = math.log(v)

        rgb_dir = color_root / did
        dc_dir  = dcolor_root / did
        dr_dir  = draw_root / did

        rgb = pick_with_basename(rgb_dir, "rgb")
        dcp = pick_with_basename(dc_dir, "depth_color")
        drp = pick_with_basename(dr_dir, "depth_raw")

        rows.append({
            "dish_id": did,
            "label": f"{lbl_log:.6f}",     # store ln(label)
            "rgb_path": rgb,
            "depth_color_path": dcp,
            "depth_raw_path": drp
        })

    outp = Path(OUT_CSV)
    # Make sure the destination folder exists before opening the file.
    outp.parent.mkdir(parents=True, exist_ok=True)
    with outp.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["dish_id","label","rgb_path","depth_color_path","depth_raw_path"])
        w.writeheader()
        w.writerows(rows)

    print(f"[OK] wrote {len(rows)} rows -> {outp} "
          f"(skipped invalid: {skipped_invalid}, "
          f"skipped <=0: {skipped_nonpos}, "
          f"skipped label==0: {skipped_zero}, "
          f"skipped >1500: {skipped_over})")

# Build the index only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


[OK] wrote 3086 rows -> train_index_log.csv (skipped invalid: 0, skipped <=0: 0, skipped label==0: 214, skipped >1500: 1)
