In [1]:
# ==========================================================
# ArtifyAI — Archive Scanner → H5 / PKL / YAML / JSON
# Scans images, builds a manifest (one row per image), and
# saves robustly with large-text-safe HDF5 + streaming JSON.
# ==========================================================

from pathlib import Path
from typing import List, Dict, Any, Tuple
import hashlib
import json
import math
import os

import numpy as np
import pandas as pd
from PIL import Image, UnidentifiedImageError

# ---------- Optional YAML ----------
try:
    import yaml
except Exception:
    # Any import failure only disables the YAML export; every other output
    # format is still produced.
    HAVE_YAML = False
    print("[WARN] PyYAML not installed; YAML output will be skipped.")
else:
    HAVE_YAML = True

# ---------- Paths ----------
IN_ROOT = Path(r"C:\Users\sagni\Downloads\Artify AI\archive")  # folder that gets scanned
OUT_DIR = Path(r"C:\Users\sagni\Downloads\Artify AI")          # where the manifests are written
OUT_DIR.mkdir(exist_ok=True, parents=True)

# One manifest file per output format, all placed directly in OUT_DIR.
PKL_PATH = OUT_DIR.joinpath("artify_manifest.pkl")
H5_PATH = OUT_DIR.joinpath("artify_manifest.h5")
JSON_PATH = OUT_DIR.joinpath("artify_manifest.json")
YAML_PATH = OUT_DIR.joinpath("artify_manifest.yaml")  # subset by default; see YAML_MAX_ROWS

# ---------- Config ----------
# Lower-case file suffixes (dot included) that are treated as images.
IMG_EXTS = {
    ".jpg", ".jpeg",
    ".png",
    ".bmp",
    ".tif", ".tiff",
    ".webp",
    ".gif",
}
SHA256_HEAD_BYTES = 64 * 1024  # 64KB for quick fingerprint
YAML_MAX_ROWS = 5000           # set to None for full YAML (can be very large!)

# ==========================================================
# Helpers
# ==========================================================
def list_images(root: Path) -> List[Path]:
    """Collect every image file found anywhere beneath *root*.

    A path qualifies when it is a regular file whose lower-cased suffix is a
    member of ``IMG_EXTS``. If *root* does not exist, an error line is printed
    and an empty list is returned.
    """
    if root.exists():
        found: List[Path] = []
        for candidate in root.rglob("*"):
            if not candidate.is_file():
                continue
            if candidate.suffix.lower() in IMG_EXTS:
                found.append(candidate)
        return found

    print(f"[ERROR] Input folder not found: {root}")
    return []

def sha256_head(path: Path, nbytes: int = SHA256_HEAD_BYTES) -> str:
    """Return the hex SHA-256 of the first *nbytes* bytes of *path*.

    This is a quick fingerprint of the file head, not a digest of the whole
    file. Any failure while opening, reading or hashing yields "".
    """
    try:
        with open(path, "rb") as stream:
            digest = hashlib.sha256(stream.read(nbytes)).hexdigest()
    except Exception:
        digest = ""
    return digest

def _dpi_component(value: Any) -> Any:
    """Return one DPI component as a finite built-in float, or None if unusable."""
    if value is None:
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        return None
    return number if np.isfinite(number) else None


def safe_image_info(path: Path) -> Dict[str, Any]:
    """
    Extract lightweight info; do NOT load the whole image into memory.

    Returns a dict with the keys ``width``, ``height``, ``mode``, ``format``,
    ``dpi_x`` and ``dpi_y``. Width/height default to NaN and the other fields
    to None when they cannot be determined.

    DPI values are normalised to plain finite floats (or None). Image plugins
    may report DPI as ints, floats or rational-number objects; passing those
    through unchanged yields mixed-type columns and values that ``json.dumps``
    cannot serialize.

    Never raises for ordinary errors: a file Pillow cannot identify gets
    ``format == "UNREADABLE"``; any other failure leaves whatever fields were
    already filled in and keeps the defaults for the rest.
    """
    info = {
        "width": np.nan,
        "height": np.nan,
        "mode": None,
        "format": None,
        "dpi_x": None,
        "dpi_y": None,
    }
    try:
        with Image.open(path) as im:
            info["width"], info["height"] = im.size
            info["mode"] = im.mode
            info["format"] = im.format
            # DPI (if present): either an (x, y) pair or a single number.
            dpi = im.info.get("dpi")
            if isinstance(dpi, (tuple, list)):
                if len(dpi) >= 2:
                    info["dpi_x"] = _dpi_component(dpi[0])
                    info["dpi_y"] = _dpi_component(dpi[1])
            elif dpi is not None:
                info["dpi_x"] = info["dpi_y"] = _dpi_component(dpi)
    except UnidentifiedImageError:
        info["format"] = "UNREADABLE"
    except Exception:
        # Deliberate best-effort: a damaged or odd file must not abort the
        # scan, so the partially filled record is returned as-is.
        pass
    return info

def scan_archive(root: Path) -> pd.DataFrame:
    """Build the manifest DataFrame: one row per image found under *root*.

    Each row carries the parent-folder name as ``artist``, file name, path
    relative to *root*, resolved absolute path, lower-cased extension, size in
    bytes, the head fingerprint from ``sha256_head`` and the fields returned
    by ``safe_image_info``. If gathering a row fails, a placeholder row with
    empty metrics and an ``error`` message is stored instead. Progress is
    printed every 500 files, and the result is sorted by artist then filename.
    """
    paths = list_images(root)
    total = len(paths)
    records = []
    for count, path in enumerate(paths, 1):
        try:
            file_stat = path.stat()
            record = {
                # Use parent folder as "artist/style" label (common layout)
                "artist": path.parent.name,
                "filename": path.name,
                "relpath": str(path.relative_to(root)),
                "abspath": str(path.resolve()),
                "ext": path.suffix.lower(),
                "size_bytes": file_stat.st_size,
                "sha256_head": sha256_head(path),
            }
            record.update(safe_image_info(path))
        except Exception as exc:
            # Keep the file in the manifest, but with blank measurements and
            # the reason it could not be inspected.
            record = {
                "artist": path.parent.name,
                "filename": path.name,
                "relpath": str(path.relative_to(root)),
                "abspath": str(path.resolve()),
                "ext": path.suffix.lower(),
                "size_bytes": None,
                "sha256_head": "",
                "width": np.nan,
                "height": np.nan,
                "mode": None,
                "format": None,
                "dpi_x": None,
                "dpi_y": None,
                "error": str(exc),
            }
        records.append(record)
        if not count % 500:
            print(f"[SCAN] {count}/{total} files...")

    manifest = pd.DataFrame(records)
    if not manifest.empty:
        manifest = manifest.sort_values(["artist", "filename"]).reset_index(drop=True)
    print(f"[INFO] Scanned: {len(manifest)} images")
    return manifest

def coerce_for_hdf(df_in: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df_in* with column dtypes normalised for HDF5 output.

    Object columns are converted with ``astype(str)``; integer columns become
    int64, float columns float64 and boolean columns bool. The input frame is
    left untouched.
    """
    frame = df_in.copy()

    # Pass 1: object -> str
    text_columns = frame.select_dtypes(include=["object"]).columns
    for name in text_columns:
        frame[name] = frame[name].astype(str)

    # Pass 2: widen numerics to one canonical dtype per kind. The first
    # matching check wins, in the order integer, float, bool.
    dtype_checks = (
        (pd.api.types.is_integer_dtype, "int64"),
        (pd.api.types.is_float_dtype, "float64"),
        (pd.api.types.is_bool_dtype, bool),
    )
    for name in frame.columns:
        column = frame[name]
        for matches, target in dtype_checks:
            if matches(column):
                frame[name] = column.astype(target, copy=False)
                break
    return frame

def build_min_itemsize(df_in: pd.DataFrame, cap: int = 65500, headroom: int = 64) -> Dict[str, int]:
    """Compute a per-column ``min_itemsize`` mapping for object columns.

    For every object-dtype column the longest value is measured after
    ``astype(str)``, *headroom* is added, and the result is clamped to *cap*.
    Columns whose resulting size is not positive are left out.

    Lengths are measured in UTF-8 encoded bytes rather than characters: HDF5
    string columns are fixed-width byte fields, so non-ASCII text (accented
    artist names, for instance) needs more room than its character count.

    Args:
        df_in: Frame to inspect; it is not modified.
        cap: Upper bound for any single column size.
        headroom: Extra bytes added on top of the longest observed value.

    Returns:
        Mapping of column name to item size in bytes.
    """
    sizes: Dict[str, int] = {}
    for column in df_in.select_dtypes(include=["object"]).columns:
        try:
            byte_lengths = df_in[column].astype(str).str.encode("utf-8").str.len()
            longest = byte_lengths.max()
            # An empty column has no maximum (NaN); treat it as length 0.
            max_len = 0 if pd.isna(longest) else int(longest)
        except (AttributeError, TypeError, ValueError):
            # Unmeasurable column (e.g. text that cannot be encoded): fall
            # back to headroom only, as before.
            max_len = 0
        size = min(cap, max_len + headroom)
        if size > 0:
            sizes[column] = size
    return sizes

def to_py_scalar(v):
    """Convert numpy/pandas scalars to plain Python for JSON/YAML.

    * ``None`` stays ``None``.
    * numpy booleans and integers become ``bool`` / ``int``.
    * Any float, numpy or built-in, becomes a plain ``float``, except NaN,
      which becomes ``None``. Built-in floats must be covered as well because
      a row converted with ``to_dict()`` can hold ordinary Python floats, and
      a NaN left in place is written as the non-standard ``NaN`` token.
    * Every other value is returned unchanged.
    """
    if v is None:
        return None
    if isinstance(v, np.bool_):
        return bool(v)
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, (float, np.floating)):
        return None if np.isnan(v) else float(v)
    return v

# ==========================================================
# Run
# ==========================================================
# Scan the archive; every later export step needs a non-empty manifest.
df = scan_archive(IN_ROOT)
if df.empty:
    raise SystemExit("No images found. Check input path or extensions.")

# Short preview of the manifest: column names and the first three rows.
print("[INFO] Columns:", df.columns.tolist())
print("[INFO] Head:\n", df.head(n=3))

# ---------- 1) Pickle ----------
# The pickle stores the DataFrame as scanned, with no dtype coercion.
print(f"[WRITE] Pickle -> {PKL_PATH}")
pd.to_pickle(df, PKL_PATH)

# ---------- 2) HDF5 (with per-column min_itemsize) ----------
# Best-effort step: any failure is reported as a warning so the remaining
# exports still run.
try:
    df_h5 = coerce_for_hdf(df)
    min_itemsize = build_min_itemsize(df_h5)
    print("[INFO] HDF5 min_itemsize:", min_itemsize)

    # Try to remove a previous file; a failed delete is ignored.
    if H5_PATH.exists():
        try:
            H5_PATH.unlink()
        except Exception:
            pass

    print(f"[WRITE] HDF5   -> {H5_PATH}")
    hdf_options = dict(
        key="images",
        mode="w",
        format="table",
        complib="blosc:zstd",
        complevel=5,
        min_itemsize=min_itemsize,
        data_columns=["artist", "ext", "format"],  # queryable columns
    )
    df_h5.to_hdf(H5_PATH, **hdf_options)
except Exception as e:
    print(f"[WARN] Could not write HDF5: {e}\n       Hint: pip install tables (64-bit) and ensure enough disk space.")

# ---------- 3) JSON (row-by-row writer, strict JSON array) ----------
# The DataFrame is already in memory; rows are serialized one at a time so the
# whole document is never built as a single string.
def _json_safe(value):
    """Map one manifest cell to a value that strict JSON can represent."""
    value = to_py_scalar(value)
    if isinstance(value, float):
        # JSON has no NaN/Infinity literal. Missing numeric cells can arrive
        # here as built-in float NaN, so emit null for any non-finite float.
        return value if math.isfinite(value) else None
    if value is None or isinstance(value, (str, int, bool)):
        return value
    # Same policy as the YAML export: anything that is not a plain scalar
    # (such as a rational DPI object) is stored as text, not left to abort
    # the write half-way through the file.
    return str(value)


print(f"[WRITE] JSON   -> {JSON_PATH}")
with open(JSON_PATH, "w", encoding="utf-8") as f:
    f.write("[\n")
    for i, (_, row) in enumerate(df.iterrows()):
        rec = {k: _json_safe(v) for k, v in row.to_dict().items()}
        # Serialize before touching the file so a failure cannot leave a
        # dangling separator behind.
        line = json.dumps(rec, ensure_ascii=False, allow_nan=False)
        if i:
            f.write(",\n")
        f.write(line)
    f.write("\n]\n")

# ---------- 4) YAML (cap rows to keep file reasonable) ----------
# Best-effort step: skipped without PyYAML, and any failure while building or
# writing the document is reported as a warning.
if not HAVE_YAML:
    print("[INFO] Skipping YAML (PyYAML not installed).")
else:
    try:
        if YAML_MAX_ROWS is not None and YAML_MAX_ROWS < len(df):
            subset = df.iloc[:YAML_MAX_ROWS]
            print(f"[INFO] Writing YAML subset of {len(subset)} rows (YAML_MAX_ROWS={YAML_MAX_ROWS}).")
        else:
            subset = df
            print(f"[INFO] Writing full YAML with {len(df)} rows (can be large).")

        subset_records = []
        for _, row in subset.iterrows():
            rec = {}
            for k, v in row.to_dict().items():
                v = to_py_scalar(v)
                # ensure only JSON/YAML-safe scalar types; everything else
                # is stored as its string form
                if v is None or isinstance(v, (str, int, float, bool)):
                    rec[k] = v
                else:
                    rec[k] = str(v)
            subset_records.append(rec)

        print(f"[WRITE] YAML   -> {YAML_PATH}")
        with open(YAML_PATH, "w", encoding="utf-8") as f:
            yaml.safe_dump(subset_records, f, allow_unicode=True, sort_keys=False)
    except Exception as e:
        print(f"[WARN] Could not write YAML: {e}\n       Tip: set YAML_MAX_ROWS lower or write to a larger drive.")

# Final summary: the output folder, then one line per manifest path.
print("\n[DONE] Files saved in:", OUT_DIR)
written = [(PKL_PATH,), (H5_PATH, "(if created)"), (JSON_PATH,)]
if HAVE_YAML:
    written.append((YAML_PATH,))
for parts in written:
    print(" -", *parts)


[INFO] Scanned: 18 images
[INFO] Columns: ['artist', 'filename', 'relpath', 'abspath', 'ext', 'size_bytes', 'sha256_head', 'width', 'height', 'mode', 'format', 'dpi_x', 'dpi_y']
[INFO] Head:
     artist                                     filename  \
0  archive  Camille Pissarro - Boulevard Montmartre.jpg   
1  archive                   Edvard Munch - Anxiety.jpg   
2  archive                Edvard Munch - The Scream.jpg   

                                       relpath  \
0  Camille Pissarro - Boulevard Montmartre.jpg   
1                   Edvard Munch - Anxiety.jpg   
2                Edvard Munch - The Scream.jpg   

                                             abspath   ext  size_bytes  \
0  C:\Users\sagni\Downloads\Artify AI\archive\Cam...  .jpg     3436910   
1  C:\Users\sagni\Downloads\Artify AI\archive\Edv...  .jpg    11238116   
2  C:\Users\sagni\Downloads\Artify AI\archive\Edv...  .jpg    37765818   

                                         sha256_head  width  height mode 