In [1]:
# ==============================================================
# DeepFakeShield — Image Manifest -> PKL / H5 / JSON / YAML
# ==============================================================
# Scans these folders (recursive), labels them, and saves a manifest:
#   Train/Real:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\real
#   Train/Fake:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\fake
#   Sample/Fake:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\Sample_fake_images\Sample_fake_images\fake
#   Test/Fake:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\fake
#   Test/Real:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\real
#
# Outputs (saved to C:\Users\sagni\Downloads\DeepFakeShield):
#   - deepfakeshield_images.pkl
#   - deepfakeshield_images.h5
#   - deepfakeshield_images.json
#   - deepfakeshield_images.yaml
#
# Notes:
# - We store paths + metadata (no image bytes).
# - HDF5 requires `tables` (PyTables, 64-bit Python recommended).
# - For JSON/YAML, NaNs are converted to None and NumPy scalars to native Python types.
# ==============================================================

from pathlib import Path
from typing import List, Dict, Any, Tuple
import hashlib
import json
import numpy as np
import pandas as pd
from PIL import Image

# ---------- Optional YAML ----------
try:
    import yaml
    HAVE_YAML = True
except Exception:
    HAVE_YAML = False
    print("[WARN] PyYAML not installed; YAML output will be skipped.")

# ---------- Paths ----------
# Directory that receives every output file; created at import time if missing.
OUT_DIR = Path(r"C:\Users\sagni\Downloads\DeepFakeShield")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Folders to scan (path, split, label_name) — label: real=0, fake=1
DATA_DIRS: List[Tuple[Path, str, str]] = [
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\real"), "train", "real"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\fake"), "train", "fake"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\Sample_fake_images\Sample_fake_images\fake"), "sample", "fake"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\fake"), "test", "fake"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\real"), "test", "real"),
]

# All four output files share this base name and differ only by extension.
BASE = "deepfakeshield_images"
PKL_PATH  = OUT_DIR / f"{BASE}.pkl"
H5_PATH   = OUT_DIR / f"{BASE}.h5"
JSON_PATH = OUT_DIR / f"{BASE}.json"
YAML_PATH = OUT_DIR / f"{BASE}.yaml"

# ---------- Config ----------
# Accepted image suffixes; list_images compares them against the lower-cased file suffix.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp", ".gif"}
# Default number of leading bytes hashed by fast_sha256 (not the whole file).
HASH_BYTES = 1024 * 1024  # 1 MB fingerprint for quick dedupe

# ---------- Helpers ----------
def fast_sha256(path: Path, n_bytes: int = HASH_BYTES) -> str:
    """Return the SHA-256 hex digest of the first *n_bytes* bytes of *path*.

    Only the head of the file is hashed, so the result is a quick
    fingerprint rather than a full-content checksum. Any failure while
    opening or reading the file yields an empty string instead of raising.
    """
    try:
        with open(path, "rb") as handle:
            head = handle.read(n_bytes)
    except Exception:
        # Best-effort: an unreadable file simply has no fingerprint.
        return ""
    return hashlib.sha256(head).hexdigest()

def safe_image_meta(path: Path) -> Dict[str, Any]:
    """Read basic image metadata from the file header without decoding pixels.

    Args:
        path: Image file to inspect.

    Returns:
        A dict with the keys ``width``, ``height``, ``mode``, ``format``,
        ``dpi_x`` and ``dpi_y``. Every value is ``None`` when the file cannot
        be opened or identified as an image. ``dpi_x``/``dpi_y`` are ``None``
        when the image carries no two-element ``dpi`` entry.

    Note:
        Pixel data is intentionally not decoded, so a file with a valid
        header but damaged image data still reports its header metadata.
    """
    meta: Dict[str, Any] = {
        "width": None, "height": None, "mode": None, "format": None,
        "dpi_x": None, "dpi_y": None,
    }
    try:
        # Image.open() is lazy: it parses the header (size, mode, format,
        # info) and leaves the pixel data unread. Calling im.load() here
        # would decode the whole image, which is what this helper avoids.
        with Image.open(path) as im:
            meta["width"], meta["height"] = im.size
            meta["mode"] = im.mode
            meta["format"] = im.format
            dpi = im.info.get("dpi")
            if isinstance(dpi, (tuple, list)) and len(dpi) >= 2:
                meta["dpi_x"], meta["dpi_y"] = dpi[0], dpi[1]
    except Exception:
        # Pillow raises many unrelated exception types on bad input; treat
        # every failure as "no metadata" and never return a half-filled dict.
        return dict.fromkeys(meta)
    return meta

def list_images(root: Path) -> List[Path]:
    """Collect every image file found anywhere under *root*.

    A file counts as an image when its lower-cased suffix is in IMG_EXTS.
    If *root* does not exist, a warning is printed and an empty list is
    returned.
    """
    if root.exists():
        found: List[Path] = []
        for candidate in root.rglob("*"):
            if not candidate.is_file():
                continue
            if candidate.suffix.lower() in IMG_EXTS:
                found.append(candidate)
        return found

    print(f"[WARN] Missing directory (skipped): {root}")
    return []

def scan_dir(root: Path, split: str, label_name: str) -> List[Dict[str, Any]]:
    """Build one manifest row per image found under *root*.

    Args:
        root: Directory scanned recursively via list_images.
        split: Split name stored on every row (e.g. train / test / sample).
        label_name: Class name stored on every row; "fake" (case-insensitive)
            maps to label 1, anything else to label 0.

    Returns:
        A list of row dicts. Every row carries the path fields, extension and
        source directory. Rows for files that could not be inspected keep
        ``size_bytes``/image metadata as ``None``, an empty ``sha256_head``,
        and gain an ``error`` key holding the exception text.
    """
    label = 1 if label_name.lower() == "fake" else 0
    files = list_images(root)
    rows: List[Dict[str, Any]] = []
    meta_keys = ("width", "height", "mode", "format", "dpi_x", "dpi_y")

    for i, p in enumerate(files, 1):
        # Start from a complete "nothing known yet" row so the success and
        # failure paths share one schema and one key order.
        row: Dict[str, Any] = {
            "split": split,                      # train / test / sample
            "label_name": label_name,            # "real" / "fake"
            "label": int(label),                 # 0 / 1
            "filename": p.name,
            "relpath": str(p.relative_to(root)),
            "abspath": str(p),                   # replaced by the resolved path below
            "ext": p.suffix.lower(),
            "size_bytes": None,
            "sha256_head": "",
            "width": None, "height": None, "mode": None, "format": None,
            "dpi_x": None, "dpi_y": None,
            "source_dir": str(root),
        }
        try:
            # resolve() lives inside the try: if it fails, the error is
            # recorded on the row instead of aborting the whole scan.
            row["abspath"] = str(p.resolve())
            size = int(p.stat().st_size)
            meta = safe_image_meta(p)
            digest = fast_sha256(p)
        except Exception as e:
            # Per-file best effort: keep the row, note what went wrong.
            row["error"] = str(e)
        else:
            row["size_bytes"] = size
            row["sha256_head"] = digest
            for key in meta_keys:
                row[key] = meta[key]
        rows.append(row)

        if i % 200 == 0:
            print(f"[INFO] {split}/{label_name}: processed {i} images...")
    print(f"[INFO] {split}/{label_name}: total {len(files)} images")
    return rows

def build_manifest(dirs: List[Tuple[Path, str, str]]) -> pd.DataFrame:
    """Scan every (root, split, label_name) entry and return one DataFrame.

    When no images are found at all, a warning is printed and an empty
    frame with the full column set is returned. Otherwise the columns are
    put in a fixed order (keeping only those present) and the rows are
    sorted so repeated runs produce the same output.
    """
    columns = [
        "split", "label_name", "label", "filename", "relpath", "abspath", "ext",
        "size_bytes", "sha256_head", "width", "height", "mode", "format",
        "dpi_x", "dpi_y", "source_dir", "error",
    ]

    collected: List[Dict[str, Any]] = [
        row
        for root, split, label_name in dirs
        for row in scan_dir(root, split, label_name)
    ]
    if not collected:
        print("[WARN] No images found in any provided directory.")
        return pd.DataFrame(columns=columns)

    frame = pd.DataFrame(collected)
    # "error" only exists when at least one file failed, so filter by presence.
    present = [name for name in columns if name in frame.columns]
    frame = frame[present]
    # Stable ordering for repeatable outputs
    frame = frame.sort_values(["split", "label_name", "source_dir", "relpath", "filename"])
    frame = frame.reset_index(drop=True)
    print(f"[INFO] Manifest rows: {len(frame)}")
    return frame

# ---------- Coercion for storage ----------
# Columns that are stored as numbers; everything else is treated as text/object.
NUMERIC_COLS = ["label", "size_bytes", "width", "height", "dpi_x", "dpi_y"]

def coerce_for_hdf(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with dtypes normalised for HDF5 storage.

    Columns listed in NUMERIC_COLS are parsed as numbers (unparseable values
    become NaN) and cast to float64. Every other column has its missing
    values replaced via ``where(..., None)`` and is cast to object dtype.
    The input frame is not modified.
    """
    out = df.copy()
    for col in out.columns:
        series = out[col]
        if col in NUMERIC_COLS:
            out[col] = pd.to_numeric(series, errors="coerce").astype("float64")
        else:
            # Ensure object/string for non-numeric
            out[col] = series.where(series.notna(), None).astype(object)
    return out

def _native_scalar(value):
    """Map one cell to a JSON/YAML-safe Python value (missing -> None)."""
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, (float, np.floating)):
        # NaN is not valid JSON; emit None (null) instead.
        return None if np.isnan(value) else float(value)
    if value is pd.NA or value is pd.NaT:
        return None
    return value  # str, None, etc.


def records_for_json_yaml(df: pd.DataFrame) -> list[dict]:
    """Return the frame as a list of plain-Python dicts for json/yaml.

    Columns listed in NUMERIC_COLS are parsed as numbers and cast to float64
    for uniformity, so their values come out as Python floats. Missing values
    (NaN / None / pd.NA / pd.NaT) become ``None`` and NumPy scalars become
    native Python scalars. The input frame is not modified.

    The NaN -> None conversion is done per value rather than with
    ``DataFrame.where(..., None)``: on recent pandas versions that call
    leaves NaN in float64 columns, which ``json.dump`` would then write as
    the invalid token ``NaN``.
    """
    g = df.copy()
    # Convert numeric columns to float64 for uniformity
    for c in NUMERIC_COLS:
        if c in g.columns:
            g[c] = pd.to_numeric(g[c], errors="coerce").astype("float64")

    return [
        {key: _native_scalar(value) for key, value in rec.items()}
        for rec in g.to_dict(orient="records")
    ]

def save_all_formats(df: pd.DataFrame):
    """Write the manifest to pickle, HDF5, JSON and (if available) YAML.

    Pickle and JSON are written unconditionally. HDF5 and YAML failures are
    reported as warnings and do not stop the remaining outputs. YAML is
    skipped with an info message when PyYAML is not installed.
    """
    # 1) PKL
    print(f"[WRITE] Pickle  -> {PKL_PATH}")
    df.to_pickle(PKL_PATH)

    # 2) HDF5
    try:
        hdf_frame = coerce_for_hdf(df)
        print(f"[WRITE] HDF5    -> {H5_PATH}")
        hdf_frame.to_hdf(H5_PATH, key="images", mode="w", format="table")
    except Exception as exc:
        print(f"[WARN] Could not write HDF5: {exc}\n       Hint: ensure `pip install tables` and 64-bit Python.")

    # 3) JSON (the same records are reused for YAML below)
    print(f"[WRITE] JSON    -> {JSON_PATH}")
    records = records_for_json_yaml(df)
    with open(JSON_PATH, "w", encoding="utf-8") as json_file:
        json.dump(records, json_file, ensure_ascii=False, indent=2)

    # 4) YAML
    if not HAVE_YAML:
        print("[INFO] Skipping YAML (PyYAML not installed).")
        return

    try:
        print(f"[WRITE] YAML    -> {YAML_PATH}")
        with open(YAML_PATH, "w", encoding="utf-8") as yaml_file:
            yaml.safe_dump(records, yaml_file, allow_unicode=True, sort_keys=False)
    except Exception as exc:
        print(f"[WARN] Could not write YAML: {exc}")

# ---------- Run ----------
if __name__ == "__main__":
    # Build the manifest, show a short preview, then write every output format.
    print("[INFO] Building DeepFakeShield image manifest...")
    manifest_df = build_manifest(DATA_DIRS)
    print("[INFO] Columns:", list(manifest_df.columns))
    print("[INFO] Head:\n", manifest_df.head(5))
    save_all_formats(manifest_df)

    print("\n[DONE] Files saved in:", OUT_DIR)
    for written_path in (PKL_PATH, H5_PATH, JSON_PATH):
        print(" -", written_path)
    # The YAML path is listed only when PyYAML was importable.
    if HAVE_YAML:
        print(" -", YAML_PATH)


[INFO] Building DeepFakeShield image manifest...
[INFO] train/real: processed 200 images...
[INFO] train/real: total 326 images
[INFO] train/fake: total 153 images
[INFO] sample/fake: total 5 images
[INFO] test/fake: processed 200 images...
[INFO] test/fake: total 389 images
[INFO] test/real: total 110 images
[INFO] Manifest rows: 983
[INFO] Columns: ['split', 'label_name', 'label', 'filename', 'relpath', 'abspath', 'ext', 'size_bytes', 'sha256_head', 'width', 'height', 'mode', 'format', 'dpi_x', 'dpi_y', 'source_dir']
[INFO] Head:
     split label_name  label                 filename                  relpath  \
0  sample       fake      1  IMG-20250106-WA0009.jpg  IMG-20250106-WA0009.jpg   
1  sample       fake      1  IMG-20250106-WA0010.jpg  IMG-20250106-WA0010.jpg   
2  sample       fake      1  IMG-20250106-WA0011.jpg  IMG-20250106-WA0011.jpg   
3  sample       fake      1  IMG-20250106-WA0012.jpg  IMG-20250106-WA0012.jpg   
4  sample       fake      1  IMG-20250106-WA0013.jpg  IM