In [1]:
# Jupyter cell: audit local images vs Cloudflare R2 (multi-prefix support)

import os
from pathlib import Path
import csv
from collections import defaultdict, Counter

import boto3
from botocore.config import Config

# ======== CONFIGURE ME ========
# Local directory whose files are audited; resolved to an absolute path.
LOCAL_IMAGES_ROOT = Path("out/media_files").resolve()   # e.g., Path("UNKNOWN - Copy").resolve()

# Remote prefixes to check. Edit this list directly; when it is left empty the
# env var R2_PREFIXES (comma-separated) is used, and failing that R2_PREFIX.
R2_PREFIXES = [
    "Images",
    "RedGiphys",
    "Gifs",
    "Videos",
]
if not R2_PREFIXES:
    _prefixes_from_env = os.environ.get("R2_PREFIXES")
    if _prefixes_from_env:
        # Drop blank entries, then trim whitespace and surrounding slashes.
        R2_PREFIXES = [
            chunk.strip().strip("/")
            for chunk in _prefixes_from_env.split(",")
            if chunk.strip()
        ]
    else:
        single = (os.environ.get("R2_PREFIX") or "").strip().strip("/")
        # An empty string means "no prefix", i.e. the bucket root.
        R2_PREFIXES = [single] if single else [""]

# R2 connection settings, all read from the environment.
R2_ENDPOINT = os.environ.get("R2_ENDPOINT")            # e.g., "https://<accountid>.r2.cloudflarestorage.com"
R2_ACCESS_KEY_ID = os.environ.get("R2_ACCESS_KEY_ID")
R2_SECRET_ACCESS_KEY = os.environ.get("R2_SECRET_ACCESS_KEY")
R2_REGION = os.environ.get("R2_REGION", "auto")
R2_BUCKET = os.environ.get("R2_BUCKET")                # required

# The audit report is written as a sibling of the local root directory.
OUT_CSV = LOCAL_IMAGES_ROOT.parent / "r2_audit.csv"

# ======== VALIDATE ========
# Collect every unset/empty required setting so one error names them all.
missing = [
    env_name
    for env_name, value in {
        "R2_ENDPOINT": R2_ENDPOINT,
        "R2_ACCESS_KEY_ID": R2_ACCESS_KEY_ID,
        "R2_SECRET_ACCESS_KEY": R2_SECRET_ACCESS_KEY,
        "R2_BUCKET": R2_BUCKET,
    }.items()
    if not value
]
if missing:
    raise ValueError(f"Missing required env vars: {', '.join(missing)}")

# The local root must be an existing directory before anything is scanned.
if not (LOCAL_IMAGES_ROOT.exists() and LOCAL_IMAGES_ROOT.is_dir()):
    raise FileNotFoundError(f"Local images root not found or not a directory: {LOCAL_IMAGES_ROOT}")

# ======== CONNECT TO R2 (S3-compatible) ========
# R2 is addressed through the S3 API: a custom endpoint plus SigV4 signing.
_r2_client_config = Config(signature_version="s3v4")
s3 = boto3.client(
    service_name="s3",
    region_name=R2_REGION,
    endpoint_url=R2_ENDPOINT,
    aws_access_key_id=R2_ACCESS_KEY_ID,
    aws_secret_access_key=R2_SECRET_ACCESS_KEY,
    config=_r2_client_config,
)

# ======== HELPERS ========
def posix_rel(local_path: Path, root: Path) -> str:
    """Return ``local_path`` relative to ``root`` using "/" separators.

    Raises ValueError (from ``Path.relative_to``) when ``local_path`` is not
    located under ``root``.
    """
    relative = local_path.relative_to(root)
    return relative.as_posix()

def make_key(prefix: str, rel: str) -> str:
    """Build an object key from a prefix and a relative path.

    The prefix is trimmed of surrounding whitespace and then of leading and
    trailing slashes. An empty or None prefix yields ``rel`` unchanged.
    """
    cleaned = (prefix or "").strip().strip("/")
    if not cleaned:
        return rel
    return "/".join((cleaned, rel))

def list_remote_objects(bucket: str, prefix: str | None = None):
    """Return ``{key: object_entry}`` for objects listed in ``bucket``.

    Uses the module-level ``s3`` client's ``list_objects_v2`` paginator, so
    every page of results is consumed. When ``prefix`` is falsy no ``Prefix``
    filter is sent and the whole bucket is listed. Each value is the raw
    entry from the page's ``Contents`` list.
    """
    request = {"Bucket": bucket, "Prefix": prefix} if prefix else {"Bucket": bucket}
    pages = s3.get_paginator("list_objects_v2").paginate(**request)
    return {
        entry["Key"]: entry
        for page in pages
        # A page carries no "Contents" entry when it has nothing to report.
        for entry in page.get("Contents", [])
    }

def split_dir_stem_ext(key: str):
    """Split an object key into ``(dir, stem, ext, filename)``.

    * ``dir``      - everything up to and including the last "/", or "" when
                     the key contains no slash.
    * ``stem``     - filename without its final extension.
    * ``ext``      - final extension, lowercased, with its leading dot, or ""
                     when there is none.
    * ``filename`` - the last path component, unchanged.

    A name whose only dot is the leading one (".DS_Store", ".gitkeep") is
    treated as having no extension: the whole name is the stem. Otherwise all
    dotfiles in a folder would share the empty stem and be mistaken for
    same-stem variants of one another.
    """
    head, slash, fname = key.rpartition("/")
    directory = head + slash  # "" when the key has no "/"

    stem, dot, ext = fname.rpartition(".")
    # `dot` is empty when there is no "." at all; `stem` is empty when the
    # only "." is the first character (a dotfile).
    if dot and stem:
        return directory, stem, "." + ext.lower(), fname
    return directory, fname, "", fname

# ======== PULL REMOTE INDEX FOR ALL PREFIXES ========
# remote_by_key:     every listed object, keyed by its full key.
# by_dir_stem:       (folder, stem) -> [(key, ext), ...] for extension-agnostic lookup.
# prefix_key_counts: number of objects listed under each normalized prefix.
remote_by_key = {}               # key -> obj
by_dir_stem = defaultdict(list)  # (dir, stem) -> list[(key, ext)]
prefix_key_counts = Counter()

for pref in R2_PREFIXES:
    pref_norm = pref.strip().strip("/")
    # The listing filter is a plain "key starts with" match, so list
    # "<prefix>/" rather than "<prefix>": otherwise "Images" would also pull
    # in "Images2/..." or "Images_old/...". Expected keys are always
    # "<prefix>/<relative path>", so nothing that could match is lost.
    # An empty prefix means the whole bucket.
    list_prefix = f"{pref_norm}/" if pref_norm else None
    submap = list_remote_objects(R2_BUCKET, list_prefix)
    remote_by_key.update(submap)
    prefix_key_counts[pref_norm or "(root)"] += len(submap)

# Build same-stem map once across all keys
for key in remote_by_key:
    d, st, ex, _ = split_dir_stem_ext(key)
    by_dir_stem[(d, st)].append((key, ex))

# ======== SCAN LOCAL & MATCH ACROSS PREFIXES ========
# For every local file, look for a remote object under each configured prefix:
#   1) an exact key match ("<prefix>/<relative path>"), tried in prefix order;
#   2) otherwise any remote key in the same folder with the same stem.
# Each file produces one CSV row and updates the summary counters.
rows = []
totals = Counter()
per_prefix_exact = Counter()
per_prefix_same_stem = Counter()

# Sorted so the CSV row order is reproducible; directory traversal order is
# not guaranteed to be stable between runs or machines.
for local in sorted(LOCAL_IMAGES_ROOT.rglob("*")):
    if not local.is_file():
        continue

    local_rel = posix_rel(local, LOCAL_IMAGES_ROOT)       # e.g., "abc123/01.jpg"
    local_ext = local.suffix.lower()                      # ".jpg", or "" when there is no suffix

    all_expected_keys = [make_key(pref, local_rel) for pref in R2_PREFIXES]

    matched = False
    match_type = "missing"
    matched_prefix = ""
    matched_key = ""
    remote_ext = ""
    same_ext = False
    note = ""

    # 1) Try exact match in the order of prefixes provided
    for pref, key in zip(R2_PREFIXES, all_expected_keys):
        if key not in remote_by_key:
            continue
        matched = True
        match_type = "exact"
        matched_prefix = pref
        matched_key = key
        _, _, remote_ext, _ = split_dir_stem_ext(key)
        same_ext = (remote_ext == local_ext)
        note = "exact_match"
        totals["exact"] += 1
        per_prefix_exact[pref or "(root)"] += 1
        break

    # 2) If no exact match, try same folder + same stem (any extension) per prefix
    if not matched:
        for pref, key in zip(R2_PREFIXES, all_expected_keys):
            parent_dir, stem, _, _ = split_dir_stem_ext(key)
            # .get() rather than indexing, so probing does not add empty
            # entries to the defaultdict.
            candidates = by_dir_stem.get((parent_dir, stem), [])
            if not candidates:
                continue
            # Prefer a candidate with the local file's extension (e.g. the
            # remote name differs only in extension case); otherwise fall
            # back to the first one listed.
            alt_key, remote_ext = next(
                (cand for cand in candidates if cand[1] == local_ext),
                candidates[0],
            )
            matched = True
            match_type = "same_stem"
            matched_prefix = pref
            matched_key = alt_key
            same_ext = (remote_ext == local_ext)
            note = "found_same_stem_same_ext" if same_ext else "found_same_stem_diff_ext"
            totals["same_stem"] += 1
            per_prefix_same_stem[pref or "(root)"] += 1
            break

    if not matched:
        totals["missing"] += 1
        note = "missing"

    rows.append({
        "local_rel": local_rel,
        "local_ext": local_ext or "",
        "all_expected_keys": " | ".join(all_expected_keys),
        "matched": matched,
        "match_type": match_type,          # exact | same_stem | missing
        "matched_prefix": matched_prefix or "",
        "matched_key": matched_key,
        "remote_ext": remote_ext or "",
        "same_ext": same_ext,
        "note": note,
    })

# ======== WRITE CSV ========
# Column order of the report; each name is a key of the row dicts in `rows`.
header = [
    "local_rel",
    "local_ext",
    "all_expected_keys",
    "matched",
    "match_type",
    "matched_prefix",
    "matched_key",
    "remote_ext",
    "same_ext",
    "note",
]
# newline="" leaves line-ending handling to the csv module.
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=header)
    w.writeheader()
    for row in rows:
        w.writerow(row)

# ======== SUMMARY ========
# Print the run configuration followed by overall and per-prefix tallies.
print(f"Local root: {LOCAL_IMAGES_ROOT}")
print(f"Bucket: {R2_BUCKET}")
print(f"Endpoint: {R2_ENDPOINT}")
print("Prefixes:", [p or "(root)" for p in R2_PREFIXES])
print("\nRemote objects counted per prefix:")
for p in R2_PREFIXES:
    # prefix_key_counts is keyed by the prefix with surrounding whitespace and
    # slashes stripped, so normalize the same way before looking it up;
    # a configured "Images/" would otherwise always report 0.
    count_key = p.strip().strip("/") or "(root)"
    print(f"  {p or '(root)'}: {prefix_key_counts[count_key]}")

total_files = len(rows)
print(f"\nLocal files scanned: {total_files}")
print(f"  Exact matches:      {totals['exact']}")
print(f"  Same-stem matches:  {totals['same_stem']}")
print(f"  Missing:            {totals['missing']}")

print("\nPer-prefix matches:")
for p in R2_PREFIXES:
    # The match counters are keyed by the prefix exactly as configured.
    key = p or "(root)"
    print(f"  {key}: exact={per_prefix_exact[key]}, same_stem={per_prefix_same_stem[key]}")

print(f"\nCSV written: {OUT_CSV}")


Local root: S:\minds\Desktop\Downloader and Reddit System\DOWNLOADERS\SCRIPTS\out\media_files
Bucket: media-archive
Endpoint: https://e42006076fb25fd6e73a881a7bbdebd5.r2.cloudflarestorage.com
Prefixes: ['Images', 'RedGiphys', 'Gifs', 'Videos']

Remote objects counted per prefix:
  Images: 614
  RedGiphys: 265
  Gifs: 63
  Videos: 5

Local files scanned: 17
  Exact matches:      17
  Same-stem matches:  0
  Missing:            0

Per-prefix matches:
  Images: exact=17, same_stem=0
  RedGiphys: exact=0, same_stem=0
  Gifs: exact=0, same_stem=0
  Videos: exact=0, same_stem=0

CSV written: S:\minds\Desktop\Downloader and Reddit System\DOWNLOADERS\SCRIPTS\out\r2_audit.csv
