In [1]:
import os
from pathlib import Path

# Best effort: pull variables from a local .env file. Values that already
# exist in the process environment are kept (override=False). Any failure
# here (including python-dotenv not being installed) is ignored on purpose.
try:
    from dotenv import load_dotenv
    _ = load_dotenv(override=False)
except Exception:
    pass  # it's fine if python-dotenv isn't installed; fall back to system env


def _read_env(var_name, fallback=""):
    """Return the whitespace-stripped value of `var_name`, or `fallback` when unset."""
    return os.getenv(var_name, fallback).strip()


REDDIT_CLIENT_ID = _read_env("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = _read_env("REDDIT_CLIENT_SECRET")
REDDIT_USERNAME = _read_env("REDDIT_USERNAME")
REDDIT_PASSWORD = _read_env("REDDIT_PASSWORD")
REDDIT_USER_AGENT = _read_env("REDDIT_USER_AGENT", "SavedRedditJSON/1.0")

# Warn about every setting that ended up empty. Secrets are reduced to a
# "[set]" marker so their values never travel through this table.
_settings_overview = (
    ("REDDIT_CLIENT_ID", REDDIT_CLIENT_ID),
    ("REDDIT_CLIENT_SECRET", "[set]" if REDDIT_CLIENT_SECRET else ""),
    ("REDDIT_USERNAME", REDDIT_USERNAME),
    ("REDDIT_PASSWORD", "[set]" if REDDIT_PASSWORD else ""),
    ("REDDIT_USER_AGENT", REDDIT_USER_AGENT),
)
for setting_name, setting_value in _settings_overview:
    if setting_value:
        continue
    print(f"⚠️ Missing {setting_name}. Set it via environment or a .env file.")


In [2]:
import time
import base64
import requests
from collections import Counter

# Endpoint that get_oauth_token() POSTs to in order to obtain an access token.
OAUTH_TOKEN_URL = "https://www.reddit.com/api/v1/access_token"

# Module-level cache of the most recent token. "expires_at" is an epoch
# timestamp in seconds that get_oauth_token() compares against time.time().
_token_cache = {"access_token": None, "expires_at": 0}

def _basic_auth_header(client_id, client_secret):
    pair = f"{client_id}:{client_secret}".encode("utf-8")
    return "Basic " + base64.b64encode(pair).decode("ascii")

def get_oauth_token(force=False, scope="read history"):
    """Return an OAuth access token, obtained with the ``password`` grant and cached.

    Args:
        force: when true, skip the cache and always request a new token.
        scope: value sent as the ``scope`` form field of the token request.

    Returns:
        The access token string.

    Raises:
        RuntimeError: if any Reddit credential is empty, or the token
            response carries no ``access_token``.
        requests.HTTPError: if the token endpoint answers with an error status.
    """
    if not force:
        cached = _token_cache["access_token"]
        # Treat the token as expired 30 seconds early.
        if cached and time.time() < _token_cache["expires_at"] - 30:
            return cached

    required = (REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD)
    if not all(required):
        raise RuntimeError("Reddit OAuth env vars are not fully set.")

    response = requests.post(
        OAUTH_TOKEN_URL,
        headers={
            "Authorization": _basic_auth_header(REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET),
            "User-Agent": REDDIT_USER_AGENT or "SavedRedditJSON/1.0",
        },
        data={
            "grant_type": "password",
            "username": REDDIT_USERNAME,
            "password": REDDIT_PASSWORD,
            "scope": scope,  # minimal scopes we need for reading
        },
        timeout=30,
    )
    response.raise_for_status()

    payload = response.json()
    token = payload.get("access_token")
    if not token:
        raise RuntimeError(f"OAuth failed: {payload}")

    # Remember the token together with its absolute expiry time.
    _token_cache["access_token"] = token
    _token_cache["expires_at"] = time.time() + int(payload.get("expires_in", 3600))
    return token


In [3]:
from urllib.parse import urlparse
from tqdm.auto import tqdm
import random, json, csv, os, re, time
from datetime import datetime, timezone

# Shared HTTP session used by request_with_backoff(). Every request carries
# the configured User-Agent, or the default one when the setting is empty.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": REDDIT_USER_AGENT or "SavedRedditJSON/1.0"})

def ensure_bearer():
    """Set the ``Authorization: Bearer <token>`` header on the shared SESSION.

    The token comes from get_oauth_token(force=False), so a cached token is
    reused whenever that function considers it still usable.
    """
    SESSION.headers["Authorization"] = "Bearer {}".format(get_oauth_token(force=False))

def maybe_oauth_url(url: str) -> str:
    """
    Normalize any reddit URL to the OAuth domain with raw_json=1.

    Accepts:
      - https://www.reddit.com/r/.../comments/ID/...
      - https://old.reddit.com/...
      - permalink paths like /r/.../comments/ID/...
      - any of the above already ending in ".json" or "/.json"
    Returns:
      https://oauth.reddit.com/r/.../comments/ID/.../.json?raw_json=1

    Only the path of the input is kept; host, query string and fragment are
    discarded. The function is idempotent: feeding its own output back in
    returns the same URL.

    Raises:
      ValueError: if `url` is empty or whitespace only.
    """
    u = url.strip()
    if not u:
        raise ValueError("Empty URL")
    if not u.startswith("http"):
        # treat it as a permalink path
        u = "https://oauth.reddit.com" + (u if u.startswith("/") else "/" + u)

    path = urlparse(u).path.rstrip("/")
    # Strip an existing ".json" suffix first; otherwise "x.json" or "x/.json"
    # would end up as "x.json/.json" once the suffix is appended below.
    if path.endswith(".json"):
        path = path[: -len(".json")].rstrip("/")
    return f"https://oauth.reddit.com{path}/.json?raw_json=1"

def request_with_backoff(method, url, *, max_retries=5, timeout=30):
    """Send an authenticated request through SESSION, retrying transient failures.

    Retry policy (all retries share one `attempt` counter):
      - network errors (requests.RequestException): exponential backoff with jitter;
      - 401: fetch a brand-new OAuth token, then retry after one second;
      - 429 and 5xx: wait for ``Retry-After`` when present, else exponential backoff.

    Args:
        method: HTTP method name passed to SESSION.request.
        url: absolute URL to request.
        max_retries: number of retries allowed before giving up.
        timeout: per-request timeout in seconds.

    Returns:
        The response object for any status below 400.

    Raises:
        requests.RequestException: network failure after the last retry.
        requests.HTTPError: other 4xx statuses immediately; 401/429/5xx once
            the retries are exhausted.
    """
    ensure_bearer()
    attempt = 0
    while True:
        try:
            resp = SESSION.request(method, url, timeout=timeout)
        except requests.RequestException:
            if attempt >= max_retries:
                raise
            time.sleep(min(60, 2 ** attempt) + random.uniform(0, 0.5))
            attempt += 1
            continue

        # Handle unauthorized → the token was rejected, so bypass the cache.
        # ensure_bearer() would hand back the same cached token again.
        if resp.status_code == 401 and attempt < max_retries:
            SESSION.headers["Authorization"] = f"Bearer {get_oauth_token(force=True)}"
            attempt += 1
            time.sleep(1.0)
            continue

        # Handle rate limit or server errors
        if resp.status_code == 429 or 500 <= resp.status_code < 600:
            if attempt >= max_retries:
                resp.raise_for_status()
            retry_after = resp.headers.get("Retry-After")
            if retry_after:
                try:
                    # Clamp: time.sleep() rejects negative values.
                    sleep = max(0.0, float(retry_after))
                except ValueError:
                    # Header was not a plain number of seconds.
                    sleep = 10.0
            else:
                sleep = min(60, 2 ** attempt) + random.uniform(0, 0.5)
            time.sleep(sleep)
            attempt += 1
            continue

        # Remaining client errors are not retryable.
        if 400 <= resp.status_code < 500:
            resp.raise_for_status()

        return resp


In [4]:
def now_iso():
    """Return the current time in UTC as an ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()

def fetch_post_and_comments_oauth(url: str):
    """Fetch a post and its comment listing through the OAuth domain.

    The URL is normalized with maybe_oauth_url() and requested with
    request_with_backoff(). The response must be a JSON list with at least
    two elements: the post listing and the comments listing.

    Returns:
        (post, comments_listing) where `post` is the ``data`` dict of the first
        child of the post listing and `comments_listing` is the second element
        of the response, untouched.

    Raises:
        RuntimeError: if the JSON is not a list of length >= 2, or the post
            listing has no children.
    """
    response = request_with_backoff("GET", maybe_oauth_url(url), max_retries=5, timeout=30)
    payload = response.json()
    if not isinstance(payload, list) or len(payload) < 2:
        raise RuntimeError("Unexpected Reddit JSON format")

    post_children = payload[0]["data"]["children"]
    if not post_children:
        raise RuntimeError("Post listing empty")

    return post_children[0]["data"], payload[1]

def extract_comments(listing_node, *, max_depth=6, max_count=5000):
    """
    Flatten Reddit's raw 'Listing' tree into a nested, UI-friendly list:
    [{id, author, author_fullname, body, score, created_utc, permalink,
      is_submitter, parent_id, replies:[...]}, ...]

    Only nodes of kind "t1" become entries; other kinds are ignored.
    Children of the top-level Listing sit at depth 1, their replies at depth 2,
    and so on. Nodes deeper than `max_depth` are dropped, and conversion stops
    once `max_count` comments (counted across all levels) have been produced.
    """
    collected = []
    budget = max_count

    def convert(node, level):
        nonlocal budget
        if budget <= 0 or level > max_depth or not isinstance(node, dict):
            return None

        kind = node.get("kind")
        payload = node.get("data", {})

        if kind == "Listing":
            # Children of a Listing stay on the same level and are appended
            # straight to the result list.
            for child in payload.get("children", []):
                converted = convert(child, level)
                if converted is not None:
                    collected.append(converted)
            return None

        if kind != "t1":
            return None

        budget -= 1
        # replies can be "" (empty string), None, or a Listing
        raw_replies = payload.get("replies")
        nested = []
        if isinstance(raw_replies, dict):
            for child in raw_replies.get("data", {}).get("children", []):
                converted = convert(child, level + 1)
                if converted is not None:
                    nested.append(converted)

        return {
            "id": payload.get("id"),
            "author": payload.get("author") or "[deleted]",
            "author_fullname": payload.get("author_fullname"),
            "body": payload.get("body") or "",          # CommentThread uses plain text body
            "score": payload.get("score"),
            "created_utc": payload.get("created_utc"),
            "permalink": "https://www.reddit.com" + (payload.get("permalink") or ""),
            "is_submitter": payload.get("is_submitter"),
            "parent_id": payload.get("parent_id"),
            "replies": nested,
        }

    convert(listing_node, 1)
    return collected

def classify_media_kind(post: dict) -> str:
    """Classify a post as "gallery", "video", "image", "self" or "external".

    Checks run in that order; the first match wins:
      - gallery: ``is_gallery`` is truthy;
      - video: the URL contains "v.redd.it", or the post (or one of its
        crosspost parents) carries a ``reddit_video`` entry / ``is_video`` flag;
      - image: ``post_hint`` is "image" or the domain is a Reddit image host;
      - self: ``is_self`` is truthy;
      - external: everything else.

    A crosspost is only treated as video when a parent actually holds a video.
    Merely being a crosspost is not enough, so crossposted text or image posts
    fall through to the later checks.
    """
    url = (post.get("url_overridden_by_dest") or post.get("url") or "").lower()
    domain = (post.get("domain") or "").lower()
    post_hint = (post.get("post_hint") or "").lower()

    def _has_reddit_video(candidate) -> bool:
        # True when either media container holds a "reddit_video" entry.
        if not isinstance(candidate, dict):
            return False
        for key in ("secure_media", "media"):
            container = candidate.get(key)
            if isinstance(container, dict) and container.get("reddit_video"):
                return True
        return False

    if post.get("is_gallery", False):
        return "gallery"

    crosspost_parents = post.get("crosspost_parent_list") or []
    parent_has_video = any(
        _has_reddit_video(parent) or (isinstance(parent, dict) and parent.get("is_video"))
        for parent in crosspost_parents
    )
    if "v.redd.it" in url or _has_reddit_video(post) or parent_has_video:
        return "video"

    if post_hint == "image" or domain in ("i.redd.it", "i.reddituploads.com"):
        return "image"
    if post.get("is_self", False):
        return "self"
    return "external"

def _count_extracted_comments(comments):
    """Count every comment in an extract_comments() result, nested replies included."""
    total = 0
    pending = list(comments)
    while pending:
        item = pending.pop()
        total += 1
        pending.extend(item.get("replies") or [])
    return total


def _raw_comments_exceed_depth(listing_node, max_depth):
    """Return True if the raw tree holds a "t1" comment nested deeper than `max_depth`.

    Depth is counted like extract_comments() does: children of the top-level
    Listing are depth 1 and each level of replies adds one.
    """
    pending = [(listing_node, 1)]
    while pending:
        node, depth = pending.pop()
        if not isinstance(node, dict):
            continue
        kind = node.get("kind")
        data = node.get("data") or {}
        if kind == "Listing":
            pending.extend((child, depth) for child in data.get("children", []))
        elif kind == "t1":
            if depth > max_depth:
                return True
            replies = data.get("replies")
            if isinstance(replies, dict):
                children = (replies.get("data") or {}).get("children", [])
                pending.extend((child, depth + 1) for child in children)
    return False


def make_archive_object(post, comments_listing, *, include_comments=True, comments_depth=6, comments_limit=5000):
    """Build the JSON-serializable archive record for one post.

    Args:
        post: the post ``data`` dict as returned by Reddit.
        comments_listing: the raw comments Listing that accompanies the post.
        include_comments: when true, store the raw listing under
            ``raw_comments`` and the simplified tree under ``comments``.
        comments_depth: maximum nesting depth handed to extract_comments().
        comments_limit: maximum number of comments handed to extract_comments().

    Returns:
        A dict with selected post fields, the untouched ``raw_post``, and (when
        requested) the comments plus two best-effort truncation flags:
          - ``comments_truncated_by_depth``: the raw tree contains a comment
            nested deeper than `comments_depth`;
          - ``comments_truncated_by_limit``: the number of extracted comments
            (replies included) reached `comments_limit`.
    """
    obj = {
        "archived_at": now_iso(),
        "reddit_fullname": post.get("name"),
        "reddit_id": post.get("id"),
        # `or ""` also covers an explicit null permalink, which would
        # otherwise break the string concatenation.
        "permalink": "https://www.reddit.com" + (post.get("permalink") or ""),
        "title": post.get("title", ""),
        "selftext": post.get("selftext", ""),
        "author": post.get("author"),
        "author_fullname": post.get("author_fullname"),
        "subreddit": post.get("subreddit"),
        "subreddit_id": post.get("subreddit_id"),
        "created_utc": post.get("created_utc"),
        "is_self": post.get("is_self", False),
        "url": post.get("url_overridden_by_dest") or post.get("url"),
        "domain": post.get("domain"),
        "post_hint": post.get("post_hint"),
        "is_gallery": post.get("is_gallery", False),
        "over_18": post.get("over_18", False),
        "spoiler": post.get("spoiler", False),
        "link_flair_text": post.get("link_flair_text"),
        "is_original_content": post.get("is_original_content", False),
        "stickied": post.get("stickied", False),
        "locked": post.get("locked", False),
        "edited": post.get("edited"),
        "num_comments": post.get("num_comments"),
        "score": post.get("score"),
        "upvote_ratio": post.get("upvote_ratio"),
        "media": None,          # schema parity; no downloads
        "external_link": None,  # set if media_kind == "external"
        "raw_post": post,
        "raw_comments": None,   # keep the full Reddit structure for fidelity
        # new meta about truncation
        "comments_truncated_by_depth": False,
        "comments_truncated_by_limit": False,
    }

    if include_comments:
        # always keep raw
        obj["raw_comments"] = comments_listing
        # build the UI-friendly list
        comments = extract_comments(
            comments_listing, max_depth=comments_depth, max_count=comments_limit
        )
        obj["comments"] = comments
        # Best-effort truncation markers, derived from the data rather than
        # from whether a limit was merely configured.
        obj["comments_truncated_by_depth"] = _raw_comments_exceed_depth(comments_listing, comments_depth)
        obj["comments_truncated_by_limit"] = bool(
            comments_limit and _count_extracted_comments(comments) >= comments_limit
        )

    return obj

def select_bucket(media_kind: str) -> str:
    """Map a media kind to its output sub-folder name.

    "image", "gallery" and "video" go to "media"; "external" goes to
    "external"; anything else (including None) goes to "text".
    """
    if media_kind == "external":
        return "external"
    return "media" if media_kind in ("image", "gallery", "video") else "text"

def write_json_file(archive_obj, out_root, jsonl_path=None, skip_existing=True):
    """Write one archive object to ``<out_root>/<bucket>/<reddit_id>.json``.

    Args:
        archive_obj: dict produced by make_archive_object(); its "media_kind"
            selects the bucket and its "reddit_id" the file name ("post" when
            the id is missing).
        out_root: root output directory; the bucket folder is created on demand.
        jsonl_path: optional path of a JSON-lines file to append the object to.
        skip_existing: when true, an existing file that already holds a
            non-empty "comments" list is left untouched.

    Returns:
        (path, bucket, status) where status is:
          - "skipped": an existing file was kept;
          - "updated": an existing file was overwritten;
          - "written": a new file was created.
    """
    import os, json
    rid = archive_obj.get("reddit_id") or "post"
    fname = f"{rid}.json"
    bucket = select_bucket(archive_obj.get("media_kind"))
    outdir_bucket = os.path.join(out_root, bucket)
    os.makedirs(outdir_bucket, exist_ok=True)
    path = os.path.join(outdir_bucket, fname)

    # Record this BEFORE writing: once the file has been written it always
    # exists, so checking afterwards could never report "written".
    existed_before = os.path.exists(path)

    # If skipping existing, only truly skip when the file already has a 'comments' array.
    if skip_existing and existed_before:
        try:
            with open(path, "r", encoding="utf-8") as f:
                existing = json.load(f)
            if isinstance(existing.get("comments"), list) and len(existing["comments"]) > 0:
                return path, bucket, "skipped"
            # else: upgrade this file by writing comments now
        except Exception:
            # if corrupt/unreadable, fall through to rewrite
            pass

    with open(path, "w", encoding="utf-8") as f:
        json.dump(archive_obj, f, ensure_ascii=False, indent=2)
    if jsonl_path:
        with open(jsonl_path, "a", encoding="utf-8") as jf:
            jf.write(json.dumps(archive_obj, ensure_ascii=False) + "\n")
    return path, bucket, ("updated" if existed_before else "written")


In [5]:
# ---- User knobs ----
INPUT_CSV = 'links.csv'            # CSV whose first column holds reddit URLs/permalinks; rows starting with "#" are ignored
SINGLE_URL = None           # when set, only this URL is processed and INPUT_CSV is ignored
OUTDIR = "out"     # root folder for outputs (one sub-folder per bucket)
JSONL_PATH = None           # e.g., "archive.jsonl" to also append every written object as a JSON line
COMMENTS_DEPTH = 1000       # max comment nesting depth passed to extract_comments()
COMMENTS_LIMIT = 100000     # max number of comments passed to extract_comments()
DELAY_BETWEEN = 0.01         # seconds to sleep after each post in batch mode
BATCH_SIZE = 1000           # number of links handled per progress-bar batch
BATCH_PAUSE = 0            # cooldown in seconds between two batches

def process_one(link: str):
    """Fetch, classify and store a single Reddit post.

    Steps: download the post with its comments, build the archive object,
    attach the media kind (and the outbound link for external posts), then
    write the JSON file under OUTDIR.

    Returns:
        A summary dict with keys "id", "bucket", "link", "path" and "status",
        where status is whatever write_json_file() reported.
    """
    post, comments_listing = fetch_post_and_comments_oauth(link)
    archive = make_archive_object(
        post,
        comments_listing,
        include_comments=True,
        comments_depth=COMMENTS_DEPTH,
        comments_limit=COMMENTS_LIMIT,
    )

    kind = classify_media_kind(post)
    archive["media_kind"] = kind
    if kind == "external":
        archive["external_link"] = post.get("url_overridden_by_dest") or post.get("url")

    saved_path, bucket, status = write_json_file(archive, OUTDIR, JSONL_PATH, skip_existing=True)
    summary = {"id": archive.get("reddit_id"), "bucket": bucket}
    summary["link"] = archive.get("permalink")
    summary["path"] = saved_path
    summary["status"] = status   # "written" | "updated" | "skipped"
    return summary

# Build list of links: SINGLE_URL wins; otherwise read the first column of
# INPUT_CSV, skipping empty cells and rows commented out with "#".
links = []
if SINGLE_URL:
    links = [SINGLE_URL.strip()]
elif INPUT_CSV and os.path.exists(INPUT_CSV):
    with open(INPUT_CSV, newline="", encoding="utf-8") as f:
        for row in csv.reader(f):
            first_cell = row[0].strip() if row else ""
            if first_cell and not first_cell.startswith("#"):
                links.append(first_cell)

os.makedirs(OUTDIR, exist_ok=True)
totals = Counter()  # per-status tally, keyed by the status string

def _set_postfix(pbar, info):
    pbar.set_postfix({
        "id": info.get("id") or "n/a",
        "bucket": info.get("bucket") or "-",
        "status": info.get("status") or "-",
    })

def _process_links(pbar, *, delay=0.0):
    """Run process_one() for every link yielded by `pbar`, tallying outcomes in `totals`.

    A failure on one link is counted under "failed" and reported on a single
    line; it never aborts the remaining links. `delay` is the pause in
    seconds after each link.
    """
    for link in pbar:
        try:
            info = process_one(link)
            totals[info["status"]] += 1
            _set_postfix(pbar, info)
        except Exception as e:
            totals["failed"] += 1
            _set_postfix(pbar, {"id": "-", "bucket": "-", "status": "failed"})
            # also print a one-line error without breaking the bar
            tqdm.write(f"[ERROR] {link} :: {type(e).__name__}: {str(e)[:200]}")
        if delay > 0:
            time.sleep(delay)


if links:
    for start in range(0, len(links), BATCH_SIZE):
        batch = links[start:start + BATCH_SIZE]
        pbar = tqdm(batch, desc=f"Batch {start // BATCH_SIZE + 1}", leave=False)
        _process_links(pbar, delay=DELAY_BETWEEN)

        # Cool down between batches, but not after the last one. int() keeps a
        # fractional BATCH_PAUSE from breaking range(), and a zero pause no
        # longer draws an empty "Cooldown" bar.
        pause_seconds = int(BATCH_PAUSE)
        if start + BATCH_SIZE < len(links) and pause_seconds > 0:
            for _ in tqdm(range(pause_seconds), desc="Cooldown", leave=False):
                time.sleep(1)
else:
    # No SINGLE_URL and no usable CSV: ask for one link interactively.
    link = input("Paste a Reddit post URL (or permalink path): ").strip()
    _process_links(tqdm([link], desc="Processing", leave=False))

# Final one-line summary. "updated" is a status of its own, so it is listed
# too; otherwise rewritten files would be missing from the totals.
print(
    f"\nDone. Written: {totals['written']}, Updated: {totals['updated']}, "
    f"Skipped: {totals['skipped']}, Failed: {totals['failed']}."
)


Batch 1:   0%|          | 0/1000 [00:00<?, ?it/s]

[ERROR] https://www.reddit.com/r/CNC_Connect/comments/189a9ms/18_f4m_losangeles_success_story :: HTTPError: 404 Client Error: Not Found for url: https://oauth.reddit.com/r/CNC_Connect/comments/189a9ms/18_f4m_losangeles_success_story/.json?raw_json=1
[ERROR] https://www.reddit.com/r/CNC_Connect/comments/1frx6uh/24f4m_co_success_hotel_bar_flirting_turns_dark :: HTTPError: 404 Client Error: Not Found for url: https://oauth.reddit.com/r/CNC_Connect/comments/1frx6uh/24f4m_co_success_hotel_bar_flirting_turns_dark/.json?raw_json=1
[ERROR] https://www.reddit.com/r/u_Nikaniikaa/comments/unn3cn/verification_video :: HTTPError: 403 Client Error: Forbidden for url: https://oauth.reddit.com/r/u_Nikaniikaa/comments/unn3cn/verification_video/.json?raw_json=1


Cooldown: 0it [00:00, ?it/s]

Batch 2:   0%|          | 0/21 [00:00<?, ?it/s]


Done. Written: 0, Skipped: 10, Failed: 3.


# EXTERNAL LINK EXTRACTION

In [None]:
# === Extract external links and download Redgifs as <post_id>.mp4 ===
import re
import csv
import json
import time
from pathlib import Path
from urllib.parse import urlparse

import requests

try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **kwargs): return x

# Root output folder. The archiving cell names this knob OUTDIR; nothing in the
# notebook defines OUT_DIR, so using that name here raised NameError.
BASE_OUT = Path(OUTDIR)
# Archives of posts classified as "external" live in this sub-folder.
EXTERNAL_DIR = BASE_OUT / "external"
# Hosts treated as Reddit itself: links pointing there are not "external".
REDDITS_OK = {"reddit.com", "www.reddit.com", "old.reddit.com", "np.reddit.com", "oauth.reddit.com", "redd.it"}
# Reddit's own media hosts: also excluded from the external links.
REDDIT_NATIVE_MEDIA = {"i.redd.it", "v.redd.it"}

# ---- 1) Helpers to read archives and extract the outbound link ----
def _domain(url: str) -> str:
    try:
        return urlparse(url).netloc.lower()
    except Exception:
        return ""

def extract_external_url(archive_obj: dict) -> str | None:
    """
    Return the outbound (non-Reddit) link stored in a saved archive object.

    The link is read from ``archive_obj["raw_post"]``, preferring
    'url_overridden_by_dest' over 'url'. None is returned when there is no
    URL, when its host cannot be determined, or when the host is one of the
    Reddit hosts (REDDITS_OK) or Reddit media hosts (REDDIT_NATIVE_MEDIA).
    """
    raw_post = archive_obj.get("raw_post") or {}
    candidate = raw_post.get("url_overridden_by_dest") or raw_post.get("url")
    if not candidate:
        return None

    host = _domain(candidate)
    if not host:
        return None
    # Treat non-Reddit, non-native-media as external
    if host in REDDITS_OK or host in REDDIT_NATIVE_MEDIA:
        return None
    return candidate

# ---- 2) Redgifs normalization & API download ----
# Pattern that pulls the media id out of common Redgifs URL shapes.
# It is case-insensitive and verbose ((?ix)), so the whitespace and the
# "#..." notes inside the pattern are ignored by the regex engine.
# Exactly one of the two capture groups is filled on a match:
#   group 1 -> id after "/watch/" or "/ifr/"
#   group 2 -> id after "/i/"
RE_REDGIFS_ID = re.compile(
    r"""(?ix)
    (?:^|/)(?:watch|ifr)/([a-z0-9]+)     # redgifs.com/watch/<id> or /ifr/<id>
    |                                   # OR
    (?:^|/)(?:i)/([a-z0-9]+)            # i.redgifs.com/i/<id>
    """.strip()
)

def redgifs_id_from_url(url: str) -> str | None:
    """
    Return the lower-cased media ID found in a redgifs-style URL, or None.

    Recognized shapes (matched with RE_REDGIFS_ID):
      - https://redgifs.com/watch/<id>
      - https://www.redgifs.com/watch/<id>
      - https://v3.redgifs.com/watch/<id>
      - https://redgifs.com/ifr/<id>
      - https://i.redgifs.com/i/<id>
    """
    match = RE_REDGIFS_ID.search(url)
    if match is None:
        return None
    # Only one alternative of the pattern can match, so at most one group is set.
    media_id = next((group for group in match.groups() if group), None)
    if not media_id:
        return None
    return media_id.lower()

# Redgifs API: get a temporary token, then resolve mp4 URLs
REDGIFS_AUTH_URL = "https://api.redgifs.com/v2/auth/temporary"
# Template; redgifs_mp4_url() fills in {id} with the media id.
REDGIFS_GIF_URL  = "https://api.redgifs.com/v2/gifs/{id}"

# Session shared by the Redgifs API calls and downloads in this cell.
_SESSION = requests.Session()
# Cached temporary token and the time.time() value at which it was fetched;
# redgifs_token() reuses the token for up to 1200 seconds.
_RG_TOKEN = None
_RG_TOKEN_TS = 0

def redgifs_token(force: bool = False) -> str:
    """Return a temporary Redgifs API token, cached at module level.

    A cached token younger than 1200 seconds is reused unless `force` is true.

    Raises:
        requests.HTTPError: if the auth endpoint answers with an error status.
        RuntimeError: if the response contains no "token" value.
    """
    global _RG_TOKEN, _RG_TOKEN_TS
    now = time.time()

    # Reuse token for ~20 minutes unless forced
    cache_is_fresh = bool(_RG_TOKEN) and (now - _RG_TOKEN_TS) < 1200
    if cache_is_fresh and not force:
        return _RG_TOKEN

    response = _SESSION.get(REDGIFS_AUTH_URL, timeout=30)
    response.raise_for_status()
    _RG_TOKEN, _RG_TOKEN_TS = response.json().get("token"), now
    if _RG_TOKEN:
        return _RG_TOKEN
    raise RuntimeError("Failed to obtain Redgifs token.")

def redgifs_mp4_url(gid: str) -> str:
    """Look up the media URL for a Redgifs id through the Redgifs API.

    A 401/403 answer triggers exactly one retry with a freshly fetched token.
    The first non-empty entry of ``gif.urls`` among "hd", "sd" and "origin"
    is returned; when none is present the result is falsy (e.g. None).

    Raises:
        requests.HTTPError: if the (possibly retried) request fails.
    """
    endpoint = REDGIFS_GIF_URL.format(id=gid)

    def _query(refresh_token):
        token = redgifs_token(force=True) if refresh_token else redgifs_token()
        return _SESSION.get(endpoint, headers={"Authorization": f"Bearer {token}"}, timeout=30)

    response = _query(False)
    # If token expired, refresh once
    if response.status_code in (401, 403):
        response = _query(True)
    response.raise_for_status()

    gif_info = response.json().get("gif") or {}
    sources = gif_info.get("urls") or {}
    # Prefer HD if present, else SD, else fallback to urls.origin
    for quality in ("hd", "sd"):
        chosen = sources.get(quality)
        if chosen:
            return chosen
    return sources.get("origin")

def download_stream(url: str, dest: Path, *, max_retries: int = 4):
    """Stream `url` to `dest`, retrying with exponential backoff.

    The data is first written to ``<dest>.part`` and only moved onto `dest`
    after the whole body has been received. A failed or interrupted download
    therefore never leaves a truncated file at `dest`, which matters because
    callers skip a post once its `dest` exists.

    Args:
        url: address of the file to download.
        dest: final path of the downloaded file; parent folders are created.
        max_retries: maximum number of attempts (at least one is made).

    Raises:
        Exception: whatever the last attempt raised (HTTP or I/O error).
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            with _SESSION.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(partial, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 256):
                        if chunk:
                            f.write(chunk)
            # Replace dest only once the download is complete.
            partial.replace(dest)
            return
        except Exception:
            # Drop the incomplete temp file; cleanup problems must not hide
            # the original error.
            try:
                partial.unlink(missing_ok=True)
            except OSError:
                pass
            if attempt + 1 >= attempts:
                raise
            time.sleep(min(2 ** attempt, 15))

# ---- 3) Walk external posts, export external links CSV, download Redgifs ----
external_json_files = sorted(EXTERNAL_DIR.glob("*.json"))
print(f"Found {len(external_json_files)} external post JSONs in {EXTERNAL_DIR}")

external_rows = []    # one {"id", "link", "domain"} row per archive file
redgifs_failed = []   # {"id", "link", "reason"} rows for anything that went wrong

REDGIFS_OUT = BASE_OUT / "redgifs"
REDGIFS_OUT.mkdir(parents=True, exist_ok=True)

for fp in tqdm(external_json_files, desc="Scanning external posts", unit="post"):
    try:
        data = json.loads(fp.read_text(encoding="utf-8"))
        post = (data or {}).get("raw_post") or {}
        pid  = post.get("id") or fp.stem  # fallback to filename if needed

        ext_url = extract_external_url(data)
        if not ext_url:
            # Still record that this external-typed file has no resolvable URL
            external_rows.append({"id": pid, "link": "", "domain": ""})
            continue

        dom = _domain(ext_url)
        external_rows.append({"id": pid, "link": ext_url, "domain": dom})

        # Redgifs download. Compare the bare host (no userinfo, no port)
        # exactly or by ".redgifs.com" suffix; a substring test would also
        # accept unrelated hosts such as "redgifs.com.example.org".
        host = dom.rsplit("@", 1)[-1].split(":", 1)[0]
        if host == "redgifs.com" or host.endswith(".redgifs.com"):
            gid = redgifs_id_from_url(ext_url)
            if not gid:
                # Sometimes the external URL is a redirect page; skip but log
                redgifs_failed.append({"id": pid, "link": ext_url, "reason": "no_id_from_url"})
                continue

            out_path = REDGIFS_OUT / f"{pid}.mp4"
            if out_path.exists():
                # already downloaded
                continue

            try:
                mp4_url = redgifs_mp4_url(gid)
                if not mp4_url:
                    redgifs_failed.append({"id": pid, "link": ext_url, "reason": "no_mp4_url"})
                    continue
                download_stream(mp4_url, out_path)
                # Show success line
                print(f"[REDGIFS] id={pid} -> {out_path.name}")
            except Exception as e:
                redgifs_failed.append({"id": pid, "link": ext_url, "reason": str(e)})
    except Exception as e:
        # The archive could not be read or processed at all. It is logged in
        # the failure list regardless of whether it pointed at Redgifs.
        redgifs_failed.append({"id": fp.stem, "link": "", "reason": f"read_error: {e}"})

# ---- 4) Write summary CSVs ----
def _dump_csv(csv_path, columns, rows):
    """Write `rows` (a list of dicts) to `csv_path`, preceded by a header row."""
    with csv_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)


# Every scanned archive gets a row here, including those without a link.
ext_csv = BASE_OUT / "external_links.csv"
_dump_csv(ext_csv, ["id", "link", "domain"], external_rows)

# The failure report is only produced when something actually failed.
if redgifs_failed:
    fail_csv = BASE_OUT / "redgifs_failed.csv"
    _dump_csv(fail_csv, ["id", "link", "reason"], redgifs_failed)
    print(f"\nSaved Redgifs download failures to: {fail_csv.resolve()}")

print(f"\nSaved external links to: {ext_csv.resolve()}")
print(f"Redgifs saved (if any) to: {REDGIFS_OUT.resolve()}")


# MEDIA DOWNLOADER

In [None]:
# === Download embedded Reddit-hosted media for posts in out/media/*.json ===
import os
import re
import csv
import json
import time
import html
import mimetypes
from pathlib import Path
from urllib.parse import urlparse

import requests

try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **kwargs): return x

# Root output folder. The archiving cell names this knob OUTDIR; nothing in the
# notebook defines OUT_DIR, so using that name here raised NameError.
BASE_OUT = Path(OUTDIR)
MEDIA_JSON_DIR = BASE_OUT / "media"        # archives of posts classified as media
MEDIA_OUT_DIR = BASE_OUT / "media_files"   # downloaded files are stored here
MEDIA_OUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): this rebinds the module-level name SESSION that the archiving
# cells also use; re-run those cells before archiving again after this one.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "reddit-media-downloader/1.0"})

# ---------- helpers ----------
def _clean_url(u: str | None) -> str | None:
    if not u:
        return None
    # Reddit often returns HTML-escaped URLs inside JSON
    return html.unescape(u)

def _domain(u: str | None) -> str:
    if not u:
        return ""
    try:
        return urlparse(u).netloc.lower()
    except Exception:
        return ""

def _ext_from_url_or_type(url: str | None, content_type: str | None) -> str:
    # Prefer extension from URL, else derive from content-type
    if url:
        path = urlparse(url).path
        ext = os.path.splitext(path)[1].lower()
        if ext in {".jpg", ".jpeg", ".png", ".gif", ".mp4", ".webm"}:
            return ext
    if content_type:
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
        if ext:
            # normalize jpeg
            return ".jpg" if ext == ".jpe" else ext
    # sensible default fallback
    return ".mp4" if (url and ".mp4" in url) else ".jpg"

def _stream_download(url: str, dest: Path, *, max_retries: int = 4, chunk=1024 * 256):
    """Stream `url` to `dest` with retries and return the final path.

    When `dest` has no suffix, one is derived from the URL / Content-Type of
    the response. Data is written to ``<final name>.part`` first and moved
    into place only after the body was received completely, so a failed
    download never leaves a truncated file that later "already exists" checks
    would mistake for a finished one.

    Args:
        url: address of the file to download.
        dest: target path; parent folders are created.
        max_retries: maximum number of attempts (at least one is made).
        chunk: chunk size in bytes used while streaming.

    Returns:
        The path the file was saved to (may carry a refined suffix).

    Raises:
        Exception: whatever the last attempt raised (HTTP or I/O error).
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        partial = None
        try:
            with SESSION.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                ctype = r.headers.get("Content-Type")
                # if dest has no extension yet, refine using content-type
                if dest.suffix == "" and ctype:
                    dest = dest.with_suffix(_ext_from_url_or_type(url, ctype))
                partial = dest.with_name(dest.name + ".part")
                with open(partial, "wb") as f:
                    for part in r.iter_content(chunk_size=chunk):
                        if part:
                            f.write(part)
            # Replace dest only once the download is complete.
            partial.replace(dest)
            return dest  # final path (may include refined suffix)
        except Exception:
            # Drop the incomplete temp file; cleanup problems must not hide
            # the original error.
            if partial is not None:
                try:
                    partial.unlink(missing_ok=True)
                except OSError:
                    pass
            if attempt + 1 >= attempts:
                raise
            time.sleep(min(2 ** attempt, 15))

def _pick_best_preview(post: dict) -> str | None:
    """
    Return the best preview URL of a post, or None when there is none.

    Preference order:
      1. the 'source' URL of the "mp4" (or "reddit_video_preview") entry in
         ``preview["variants"]``;
      2. the 'source' URL of the first entry in ``preview["images"]``.
    The URL is passed through _clean_url() before it is returned.

    NOTE(review): this reads ``variants`` directly under ``preview``; Reddit
    payloads appear to nest it under ``preview["images"][0]`` - confirm against
    real data, otherwise step 1 never triggers.
    """
    preview = post.get("preview") or {}

    # mp4 variant for gifs, etc.
    variant_table = preview.get("variants") or {}
    video_variant = variant_table.get("mp4") or variant_table.get("reddit_video_preview")
    if video_variant:
        video_url = video_variant.get("source", {}).get("url")
        if video_url:
            return _clean_url(video_url)

    # fallback to the image source
    images = preview.get("images") or [{}]
    image_url = images[0].get("source", {}).get("url")
    return _clean_url(image_url) if image_url else None

def _pick_vreddit_urls(post: dict) -> tuple[str | None, str | None]:
    """
    v.redd.it posts: return (preferred_mp4_url, fallback_mp4_url).

    - preferred: ``reddit_video.fallback_url`` from 'media' (or 'secure_media'
      when 'media' is missing), a progressive MP4 that may lack audio;
    - fallback: ``preview.reddit_video_preview.fallback_url`` when present.

    The HLS playlist ('hls_url', an .m3u8 file) is deliberately never returned:
    callers stream the URL straight into ``<id>.mp4`` without ffmpeg, so a
    playlist would be saved as a broken video and then block later retries.
    Either element may be None.
    """
    media = post.get("media") or post.get("secure_media") or {}
    rv = (media.get("reddit_video") if isinstance(media, dict) else None) or {}
    fallback = rv.get("fallback_url")  # often progressive mp4 (may be muted on long vids)

    # If no fallback, sometimes preview.mp4 exists:
    prev_mp4 = None
    prev = post.get("preview") or {}
    pv = prev.get("reddit_video_preview") or {}
    if isinstance(pv, dict) and pv.get("fallback_url"):
        prev_mp4 = pv["fallback_url"]

    return (_clean_url(fallback), _clean_url(prev_mp4))

def _gallery_items(post: dict) -> list[tuple[str, str]]:
    """
    For gallery posts: return a list of (url, extension) in gallery order.

    The order comes from ``gallery_data["items"]``; each media id is looked up
    in ``media_metadata`` and its 's' rendition provides the URL. The
    extension always matches the URL that was actually chosen:
      - 's.mp4' present -> that URL, ".mp4"
      - 's.gif' present -> that URL, ".gif"
      - otherwise 's.u' / 's.url', with the extension derived from the URL or
        the item's mime type ('m').
    Items without any usable URL are skipped.
    """
    items = []
    meta = post.get("media_metadata") or {}
    gdata = post.get("gallery_data") or {}
    order = [e.get("media_id") for e in gdata.get("items", []) if e.get("media_id")]
    for mid in order:
        m = meta.get(mid) or {}
        s = m.get("s") or {}

        # Pick the URL and its extension together, so a GIF-only item is not
        # stored under an ".mp4" name.
        if s.get("mp4"):
            raw_url, ext = s["mp4"], ".mp4"
        elif s.get("gif"):
            raw_url, ext = s["gif"], ".gif"
        else:
            raw_url, ext = (s.get("u") or s.get("url")), None

        url = _clean_url(raw_url)
        if not url:
            continue
        if ext is None:
            # look at mime if present
            ext = _ext_from_url_or_type(url, m.get("m"))
        items.append((url, ext))
    return items

# ---------- main walk ----------
# For every archive in MEDIA_JSON_DIR, try the download strategies in order:
#   A) gallery  B) native v.redd.it video  C) direct i.redd.it image
#   then a preview-based fallback. The first strategy that applies ends the
#   handling of that post (via `continue`), whether or not the download worked.
media_jsons = sorted(MEDIA_JSON_DIR.glob("*.json"))
print(f"Found {len(media_jsons)} media post JSONs in {MEDIA_JSON_DIR}")

fail_rows = []   # {"id", "reason"} rows written to media_failed.csv afterwards
downloaded = 0   # number of files fetched in this run (existing files are not counted)

for fp in tqdm(media_jsons, desc="Downloading embedded media", unit="post"):
    try:
        data = json.loads(fp.read_text(encoding="utf-8"))
        post = (data or {}).get("raw_post") or {}
        pid = post.get("id") or fp.stem

        # Prefer Reddit-hosted URL if present
        url = _clean_url(post.get("url_overridden_by_dest") or post.get("url"))
        dom = _domain(url)

        # Case A: gallery
        if post.get("is_gallery") or (post.get("gallery_data") and post.get("media_metadata")):
            items = _gallery_items(post)
            if not items:
                fail_rows.append({"id": pid, "reason": "gallery_no_items"})
                continue
            # Files are named <id>_g01, <id>_g02, ... in gallery order.
            for idx, (item_url, ext) in enumerate(items, start=1):
                outfile = MEDIA_OUT_DIR / f"{pid}_g{idx:02d}{ext if ext.startswith('.') else ('.' + ext)}"
                if outfile.exists():
                    continue
                try:
                    _stream_download(item_url, outfile)
                    downloaded += 1
                    print(f"[GAL] {pid} -> {outfile.name}")
                except Exception as e:
                    # One failed item does not stop the rest of the gallery.
                    fail_rows.append({"id": pid, "reason": f"gallery_item_fail:{e}"})

            continue  # next post

        # Case B: native video (v.redd.it)
        if (post.get("is_video") or (post.get("media") or {}).get("reddit_video")) and dom.endswith("v.redd.it"):
            main_mp4, alt_mp4 = _pick_vreddit_urls(post)
            target = MEDIA_OUT_DIR / f"{pid}.mp4"
            if target.exists():
                continue
            src = main_mp4 or alt_mp4
            if not src:
                # last chance: look into preview variants
                src = _pick_best_preview(post)
            if not src:
                fail_rows.append({"id": pid, "reason": "vreddit_no_source"})
                continue
            try:
                _stream_download(src, target)
                downloaded += 1
                print(f"[VID] {pid} -> {target.name}")
            except Exception as e:
                fail_rows.append({"id": pid, "reason": f"vreddit_dl_fail:{e}"})
            continue

        # Case C: image / gif via i.redd.it or preview
        if dom.endswith("i.redd.it"):
            # Direct i.redd.it link
            ext = _ext_from_url_or_type(url, None)
            target = MEDIA_OUT_DIR / f"{pid}{ext}"
            if not target.exists():
                try:
                    _stream_download(url, target)
                    downloaded += 1
                    print(f"[IMG] {pid} -> {target.name}")
                except Exception as e:
                    fail_rows.append({"id": pid, "reason": f"ireddit_dl_fail:{e}"})
            continue

        # Fallback: try preview (covers some GIF-to-MP4 conversions)
        # Only preview URLs on Reddit's own media hosts are accepted.
        prev_url = _pick_best_preview(post)
        if prev_url and _domain(prev_url) in {"i.redd.it", "v.redd.it", "preview.redd.it"}:
            ext = _ext_from_url_or_type(prev_url, None)
            target = MEDIA_OUT_DIR / f"{pid}{ext}"
            if not target.exists():
                try:
                    _stream_download(prev_url, target)
                    downloaded += 1
                    print(f"[PREV] {pid} -> {target.name}")
                except Exception as e:
                    fail_rows.append({"id": pid, "reason": f"preview_dl_fail:{e}"})
            continue

        # If we reach here, it looks like a Reddit-hosted "media" without a reliable direct URL
        fail_rows.append({"id": pid, "reason": "no_reddit_media_url"})
    except Exception as e:
        # Any other error for this file (unreadable JSON, unexpected structure, ...)
        # is recorded under the file name and the walk goes on.
        fail_rows.append({"id": fp.stem, "reason": f"read_error:{e}"})

# ---------- write failures ----------
# The report is only written when at least one post or item failed.
if len(fail_rows) > 0:
    fail_csv = BASE_OUT / "media_failed.csv"
    with fail_csv.open("w", newline="", encoding="utf-8") as report:
        report_writer = csv.DictWriter(report, fieldnames=["id", "reason"])
        report_writer.writeheader()
        for failure in fail_rows:
            report_writer.writerow(failure)
    print(f"\nSaved media failures to: {fail_csv.resolve()}")

print(f"\nDone. Downloaded: {downloaded}. Files saved under: {MEDIA_OUT_DIR.resolve()}")
