In [3]:
# ==============================================
# MediScan — Image Manifest -> PKL / H5 / JSON / YAML  (Final, robust)
# ==============================================

import json
import hashlib
from pathlib import Path
from typing import List, Dict, Any
import numpy as np
import pandas as pd
from PIL import Image

# ----- Optional YAML -----
# PyYAML is treated as an optional dependency: HAVE_YAML records whether the
# import succeeded so that save_all() can skip the YAML output instead of failing.
try:
    import yaml
    HAVE_YAML = True
except Exception:
    # Any failure while importing yaml disables YAML output for this run.
    HAVE_YAML = False
    print("[WARN] PyYAML not installed; YAML output will be skipped.")

# ----- Paths -----
# IN_DIR is the directory tree scanned for images; OUT_DIR receives every
# output file. Both are hard-coded, machine-specific Windows paths.
IN_DIR  = Path(r"C:\Users\sagni\Downloads\MediScan\archive\data")
OUT_DIR = Path(r"C:\Users\sagni\Downloads\MediScan")
# Side effect at module load: create OUT_DIR (and missing parents) if needed.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Shared file stem: the four output files differ only by extension.
BASE = "medi_scan_images"
PKL_PATH  = OUT_DIR / f"{BASE}.pkl"
H5_PATH   = OUT_DIR / f"{BASE}.h5"
JSON_PATH = OUT_DIR / f"{BASE}.json"
YAML_PATH = OUT_DIR / f"{BASE}.yaml"

# ----- Config -----
# Lower-case file suffixes (with leading dot) that list_images() accepts.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}
# Number of leading bytes hashed by fast_sha256(); the resulting digest
# fingerprints only the head of a file, not its full contents.
HASH_BYTES = 1024 * 1024  # 1 MB for fast fingerprinting

def fast_sha256(path: Path, n_bytes: int = HASH_BYTES) -> str:
    """Return the SHA-256 hex digest of the leading bytes of a file.

    Only the first ``n_bytes`` bytes of ``path`` are read and hashed, so the
    result is a quick fingerprint rather than a full-content checksum.

    Args:
        path: File to fingerprint.
        n_bytes: Maximum number of bytes read from the start of the file.

    Returns:
        The hexadecimal digest, or an empty string if anything goes wrong
        while opening, reading or hashing the file (best-effort behaviour).
    """
    try:
        with open(path, "rb") as stream:
            return hashlib.sha256(stream.read(n_bytes)).hexdigest()
    except Exception:
        # Deliberately best-effort: an unreadable file yields "" instead of
        # aborting the whole scan.
        return ""

def safe_image_open(path: Path) -> Dict[str, Any]:
    """Read basic image metadata without letting a bad file raise.

    Args:
        path: Image file to inspect.

    Returns:
        A dict with the keys ``width``, ``height``, ``mode``, ``format``,
        ``dpi_x`` and ``dpi_y``. The DPI entries are ``None`` unless the
        image's ``info["dpi"]`` is a tuple/list with at least two items.
        If the image cannot be opened or inspected, every value is ``None``.
    """
    try:
        with Image.open(path) as picture:
            width, height = picture.size
            resolution = picture.info.get("dpi")
            has_dpi = isinstance(resolution, (tuple, list)) and len(resolution) >= 2
            details = {
                "width": width,
                "height": height,
                "mode": picture.mode,
                "format": picture.format,
                "dpi_x": resolution[0] if has_dpi else None,
                "dpi_y": resolution[1] if has_dpi else None,
            }
    except Exception:
        # Any failure (missing file, unsupported format, ...) maps to an
        # all-None record with the same keys.
        return dict.fromkeys(("width", "height", "mode", "format", "dpi_x", "dpi_y"))
    return details

def list_images(root: Path) -> List[Path]:
    """Collect every image file found anywhere under ``root``.

    A path qualifies when it is a regular file whose lower-cased suffix is
    listed in ``IMG_EXTS``.

    Args:
        root: Directory to search recursively.

    Returns:
        The matching paths in the order ``rglob`` yields them, or an empty
        list (after printing an error) when ``root`` does not exist.
    """
    if not root.exists():
        print(f"[ERROR] Input directory does not exist: {root}")
        return []

    matches: List[Path] = []
    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in IMG_EXTS:
            matches.append(candidate)
    return matches

def build_manifest(root: Path) -> pd.DataFrame:
    """Build a one-row-per-image manifest for everything under ``root``.

    Each row records the file name, its path relative to ``root``, its
    resolved absolute path, lower-cased extension, size in bytes, the
    head-of-file SHA-256 fingerprint and the metadata reported by
    ``safe_image_open``. If processing a file raises, a placeholder row with
    empty values and an extra ``error`` column (the exception text) is
    recorded instead.

    Args:
        root: Directory to scan recursively.

    Returns:
        A DataFrame sorted by ``relpath`` then ``filename`` with a fresh
        integer index. When no images are found, an empty DataFrame with the
        standard columns is returned.
    """
    meta_keys = ("width", "height", "mode", "format", "dpi_x", "dpi_y")

    image_paths = list_images(root)
    if not image_paths:
        print(f"[WARN] No image files found under {root}")
        return pd.DataFrame(columns=[
            "filename", "relpath", "abspath", "ext", "size_bytes", "sha256_head",
            *meta_keys,
        ])

    records: List[Dict[str, Any]] = []
    for count, image_path in enumerate(image_paths, start=1):
        try:
            file_stat = image_path.stat()
            details = safe_image_open(image_path)
            record: Dict[str, Any] = {
                "filename": image_path.name,
                "relpath": str(image_path.relative_to(root)),
                "abspath": str(image_path.resolve()),
                "ext": image_path.suffix.lower(),
                "size_bytes": int(file_stat.st_size),
                "sha256_head": fast_sha256(image_path, HASH_BYTES),
            }
            for key in meta_keys:
                record[key] = details[key]
        except Exception as exc:
            # Keep the file in the manifest, but flag what went wrong.
            record = {
                "filename": image_path.name,
                "relpath": str(image_path.relative_to(root)),
                "abspath": str(image_path.resolve()),
                "ext": image_path.suffix.lower(),
                "size_bytes": None,
                "sha256_head": "",
            }
            for key in meta_keys:
                record[key] = None
            record["error"] = str(exc)
        records.append(record)

        # Periodic progress report for large directories.
        if count % 200 == 0:
            print(f"[INFO] Processed {count} images...")

    frame = pd.DataFrame(records)
    frame = frame.sort_values(["relpath", "filename"])
    frame = frame.reset_index(drop=True)
    print(f"[INFO] Manifest built with {len(frame)} rows.")
    return frame

# ---------- Type coercion helpers ----------
# Columns holding numbers; both helpers below normalise them to float64.
NUMERIC_COLS = ["size_bytes", "width", "height", "dpi_x", "dpi_y"]

def coerce_for_hdf(df: pd.DataFrame) -> pd.DataFrame:
    """Convert to HDF5-friendly dtypes (no Pandas extension dtypes).

    Args:
        df: Manifest frame; it is copied, never modified in place.

    Returns:
        A copy in which every column listed in ``NUMERIC_COLS`` is float64
        (unparseable or missing values become NaN) and every other column
        holds strings, with missing entries left as they were.
    """
    g = df.copy()

    # 1) numeric columns -> float64 (NaNs allowed)
    for c in NUMERIC_COLS:
        if c in g.columns:
            g[c] = pd.to_numeric(g[c], errors="coerce").astype("float64")

    # 2) non-numeric/object columns -> plain Python strings (object dtype)
    for c in g.columns:
        if c not in NUMERIC_COLS:
            # Series.where keeps entries where the condition is True, so
            # missing values stay untouched and the rest are stringified.
            g[c] = g[c].where(g[c].isna(), g[c].astype(str))

    return g

def _json_scalar(value: Any) -> Any:
    """Map one cell value to a JSON-serialisable Python builtin."""
    if value is None or value is pd.NA or value is pd.NaT:
        return None
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    if isinstance(value, (int, np.integer)):
        return int(value)
    if isinstance(value, (float, np.floating)):
        # NaN is not valid JSON; it is this script's "missing" marker.
        return None if np.isnan(value) else float(value)
    if isinstance(value, str):
        return value
    return str(value)

def coerce_for_json(df: pd.DataFrame) -> list[dict]:
    """Return a list of JSON-serializable dicts (NaN/NA -> None; no extension dtypes).

    Missing values are normalised per scalar instead of through
    ``DataFrame.where(..., None)``: on float64 columns that call leaves NaN
    in place, which would then be written as a bare ``NaN`` token (invalid
    JSON) by ``json.dump``.

    Args:
        df: Manifest frame; it is copied, never modified in place.

    Returns:
        One dict per row, in row order. Columns listed in ``NUMERIC_COLS``
        are emitted as floats, missing values of any kind as ``None``, and
        any value that is not a str/int/float/bool is converted with ``str``.
    """
    g = df.copy()

    # numeric as floats
    for c in NUMERIC_COLS:
        if c in g.columns:
            g[c] = pd.to_numeric(g[c], errors="coerce").astype("float64")

    return [
        {key: _json_scalar(value) for key, value in rec.items()}
        for rec in g.to_dict(orient="records")
    ]

def save_all(df: pd.DataFrame):
    """Write the manifest to disk as pickle, HDF5, JSON and (optionally) YAML.

    The pickle and JSON writes are not guarded, so their errors propagate.
    HDF5 and YAML failures are caught and reported as warnings. YAML is
    skipped entirely when PyYAML could not be imported.

    Args:
        df: Manifest frame to persist; it is not modified.
    """
    # ---- PKL ----
    print(f"[WRITE] Pickle  -> {PKL_PATH}")
    df.to_pickle(PKL_PATH)

    # ---- HDF5 ----
    try:
        print(f"[WRITE] HDF5    -> {H5_PATH}")
        hdf_frame = coerce_for_hdf(df)
        hdf_frame.to_hdf(H5_PATH, key="images", mode="w", format="table")
    except Exception as err:
        print(f"[WARN] Could not write HDF5: {err}\n       Hint: pip install tables (64-bit Python)")

    # ---- JSON ----
    print(f"[WRITE] JSON    -> {JSON_PATH}")
    payload = coerce_for_json(df)
    with open(JSON_PATH, "w", encoding="utf-8") as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=2)

    # ---- YAML ----
    if not HAVE_YAML:
        print("[INFO] Skipping YAML (PyYAML not installed).")
        return

    # The YAML file reuses the records already prepared for JSON.
    try:
        print(f"[WRITE] YAML    -> {YAML_PATH}")
        with open(YAML_PATH, "w", encoding="utf-8") as sink:
            yaml.safe_dump(payload, sink, allow_unicode=True, sort_keys=False)
    except Exception as err:
        print(f"[WARN] Could not write YAML: {err}")

# ----- Run -----
# Script entry point: scan IN_DIR, preview the manifest, write every output
# format, then list the destination paths.
if __name__ == "__main__":
    print(f"[INFO] Scanning images under: {IN_DIR}")
    manifest_df = build_manifest(IN_DIR)
    # Quick preview: column names and the first three rows.
    print("[INFO] Columns:", list(manifest_df.columns))
    print("[INFO] Head:\n", manifest_df.head(3))
    save_all(manifest_df)

    # NOTE: the summary lists the paths unconditionally; the HDF5 path is
    # printed even when save_all() only warned that the HDF5 write failed.
    print("\n[DONE] Files saved in:", OUT_DIR)
    print(" -", PKL_PATH)
    print(" -", H5_PATH)
    print(" -", JSON_PATH)
    if HAVE_YAML:
        print(" -", YAML_PATH)


[INFO] Scanning images under: C:\Users\sagni\Downloads\MediScan\archive\data
[INFO] Manifest built with 129 rows.
[INFO] Columns: ['filename', 'relpath', 'abspath', 'ext', 'size_bytes', 'sha256_head', 'width', 'height', 'mode', 'format', 'dpi_x', 'dpi_y']
[INFO] Head:
   filename  relpath                                            abspath   ext  \
0    1.jpg    1.jpg  C:\Users\sagni\Downloads\MediScan\archive\data...  .jpg   
1   10.jpg   10.jpg  C:\Users\sagni\Downloads\MediScan\archive\data...  .jpg   
2  100.jpg  100.jpg  C:\Users\sagni\Downloads\MediScan\archive\data...  .jpg   

   size_bytes                                        sha256_head  width  \
0        3045  33cebf97759ee401a51888d144bdb3772d414c1f65308b...    304   
1       52552  8e9da3ea572989d50af64c660f800d9282c88a575ad044...    960   
2      154430  83073b39b38f80528728bcb07db78670856b6899bee2dc...   1200   

   height mode format dpi_x dpi_y  
0     351    P    GIF  None  None  
1    1280  RGB   JPEG  None  None  
