In [16]:
# --- Reddit script app credentials ---
# Secrets are never stored in the notebook: each value is taken from the
# environment variable of the same name and, when that variable is unset or
# empty, asked for interactively (secret values are read without echo).
# NOTE(review): any credential that was ever committed in this cell should be
# treated as exposed and rotated in the Reddit app / account settings.
import os
from getpass import getpass


def _credential(env_name: str, *, secret: bool = True) -> str:
    """Return environment variable `env_name`, prompting for it if unset or empty."""
    value = os.environ.get(env_name)
    if value:
        return value
    prompt = f"{env_name}: "
    return getpass(prompt) if secret else input(prompt)


REDDIT_CLIENT_ID = _credential("REDDIT_CLIENT_ID", secret=False)
REDDIT_CLIENT_SECRET = _credential("REDDIT_CLIENT_SECRET")
REDDIT_USERNAME = _credential("REDDIT_USERNAME", secret=False)
REDDIT_PASSWORD = _credential("REDDIT_PASSWORD")


# --- IO config ---
CSV_PATH = "links.csv"   # one Reddit URL per line, no header
OUT_DIR  = "out"         # base folder for outputs

# --- polite request pacing ---
REQUEST_DELAY_SEC = 0.5  # pause (seconds) after each processed link

In [17]:
import re
import csv
import json
import time
from pathlib import Path
from typing import Optional, Tuple, List, Dict
from urllib.parse import urlparse

import requests

# User-Agent sent with every Reddit call, plus the HTTP session shared by the API helpers
UA = f"reddit-json-downloader-jupyter/1.0 (by u/{REDDIT_USERNAME})"
SESSION = requests.Session()

# Reddit OAuth endpoints and the query string appended to comment-listing URLs
OAUTH_API_BASE  = "https://oauth.reddit.com"
OAUTH_TOKEN_URL = "https://www.reddit.com/api/v1/access_token"
COMMENTS_QUERY  = "raw_json=1&limit=500&depth=10&showmore=true"  # ask for a fuller comment payload

# Patterns that pull the subreddit name / post id out of a link (case-insensitive)
SUB_RE         = re.compile(r"/r/([^/]+)/comments/", re.I)
SHORTLINK_RE   = re.compile(r"redd\.it/([a-z0-9]{5,8})", re.I)
COMMENTS_ID_RE = re.compile(r"/comments/([a-z0-9]{5,8})", re.I)

def request_with_backoff(method: str, url: str, *, headers=None, data=None, timeout=60, max_retries=5):
    """
    Send a request through the shared SESSION, retrying transient failures.

    Retried (exponential backoff, capped at 30 seconds): HTTP 429, any 5xx
    response, and connection/timeout errors raised by `requests`.

    Returns the last response received:
      - any status below 400 is returned immediately;
      - any other 4xx is returned immediately as a hard error;
      - a 429/5xx is returned once the attempts are used up.
    The caller decides whether to call `raise_for_status()`.

    Raises the underlying connection/timeout error if the final attempt
    fails at the network level.
    """
    # Always make at least one attempt so there is a response to return.
    attempts = max(1, max_retries)
    resp = None
    for attempt in range(attempts):
        is_last = attempt + 1 >= attempts
        try:
            resp = SESSION.request(method, url, headers=headers, data=data, timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            if is_last:
                raise
            time.sleep(min(2 ** attempt, 30))
            continue
        # Success
        if resp.status_code < 400:
            return resp
        # Rate limit or server-side failure: worth another try
        retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
        if not retryable or is_last:
            # Hard error, or nothing left to retry: do not sleep before returning.
            return resp
        time.sleep(min(2 ** attempt, 30))
    return resp

def get_token() -> str:
    """
    Request an OAuth access token with the password grant and return it.

    Raises the HTTP error from `raise_for_status()` on a failed response and
    RuntimeError when the response body carries no `access_token`.
    """
    response = requests.post(
        OAUTH_TOKEN_URL,
        auth=requests.auth.HTTPBasicAuth(REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET),
        data={
            "grant_type": "password",
            "username": REDDIT_USERNAME,
            "password": REDDIT_PASSWORD,
        },
        headers={"User-Agent": UA},
        timeout=30,
    )
    response.raise_for_status()
    token = response.json().get("access_token")
    if token:
        return token
    raise RuntimeError(f"OAuth token missing; resp={response.text}")

def oauth_headers() -> dict:
    """
    Build the Authorization / User-Agent headers for calls to the OAuth API.

    The bearer token is cached on SESSION together with the time it was
    obtained and is requested again once it is older than `max_age_sec`.
    get_token() keeps only the `access_token` field of the token response,
    so the real lifetime is not known here; the token is therefore renewed
    conservatively instead of being reused for the whole run.
    NOTE(review): 45 minutes assumes a token lifetime of at least one hour;
    confirm against the `expires_in` value Reddit returns.
    """
    max_age_sec = 45 * 60
    now = time.time()
    tok = getattr(SESSION, "_oauth_token", None)
    issued_at = getattr(SESSION, "_oauth_token_ts", None)
    expired = issued_at is None or (now - issued_at) >= max_age_sec
    if not tok or expired:
        tok = get_token()
        SESSION._oauth_token = tok
        SESSION._oauth_token_ts = now
    return {"Authorization": f"bearer {tok}", "User-Agent": UA}

def accept_quarantine(subreddit: str) -> bool:
    """
    Opt the authenticated account in to a quarantined subreddit.

    Returns False without any request when `subreddit` is empty; otherwise
    True when the response status is 200, 204 or 409.

    The subreddit name is sent as the `sr_name` form field: without it the
    request cannot identify which community is being accepted.
    NOTE(review): the endpoint and field name follow PRAW's quarantine opt-in
    call (`api/quarantine_optin`); confirm against the Reddit API.
    """
    if not subreddit:
        return False
    url = f"{OAUTH_API_BASE}/api/quarantine_optin"
    r = request_with_backoff(
        "POST",
        url,
        headers=oauth_headers(),
        data={"sr_name": subreddit},
        max_retries=3,
    )
    return r.status_code in (200, 204, 409)  # 409 ~ already accepted

def parse_link(link: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract (post_id, subreddit) from a Reddit link.

    The post id is taken from a `/comments/<id>` segment, or failing that
    from a `redd.it/<id>` shortlink. Either element is None when not found.
    """
    id_match = COMMENTS_ID_RE.search(link) or SHORTLINK_RE.search(link)
    sub_match = SUB_RE.search(link)
    return (
        id_match.group(1) if id_match else None,
        sub_match.group(1) if sub_match else None,
    )

def normalize_comments_url(link: str, fallback_post_id: Optional[str]) -> str:
    """
    Turn a post link into the OAuth API URL of its comments JSON.

    Shortlinks (redd.it) and paths without a `/comments/` segment are
    rewritten to `/comments/<fallback_post_id>/` when an id is available.
    The query string of `link` is not carried over; COMMENTS_QUERY is used.
    """
    parsed = urlparse(link)
    path = parsed.path or ""
    is_shortlink = "redd.it" in (parsed.netloc or "").lower()
    needs_canonical_path = is_shortlink or "/comments/" not in path
    if needs_canonical_path and fallback_post_id:
        path = f"/comments/{fallback_post_id}/"
    elif not path.endswith("/"):
        path = path + "/"
    return f"{OAUTH_API_BASE}{path}.json?{COMMENTS_QUERY}"


In [18]:
def collect_more_ids(listing_node) -> List[str]:
    """
    Gather the child ids of every 'more' placeholder below `listing_node`.

    Listings are followed through their children and comments (t1) through
    their replies; ids come back in traversal order, empty entries skipped.
    """
    def iter_ids(node):
        if not isinstance(node, dict):
            return
        kind = node.get("kind")
        data = node.get("data", {})
        if kind == "more":
            for child_id in data.get("children") or []:
                if child_id:
                    yield child_id
            return
        if kind not in ("Listing", "t1"):
            return
        for child in data.get("children", []):
            yield from iter_ids(child)
        if kind == "t1" and isinstance(data.get("replies"), dict):
            yield from iter_ids(data["replies"])

    return list(iter_ids(listing_node))

def index_comments_by_id(listing_node) -> Dict[str, dict]:
    """
    Map lower-cased comment id -> comment node for every t1 below `listing_node`.

    The mapped values are the original node objects (not copies), so callers
    can attach replies through the index.
    """
    def pairs(node):
        if not isinstance(node, dict):
            return
        kind = node.get("kind")
        data = node.get("data", {})
        if kind == "Listing":
            for child in data.get("children", []):
                yield from pairs(child)
        elif kind == "t1":
            comment_id = (data.get("id") or "").lower()
            if comment_id:
                yield comment_id, node
            if isinstance(data.get("replies"), dict):
                yield from pairs(data["replies"])

    return dict(pairs(listing_node))

def replace_more_with_children(root_listing: dict, parent_lookup: Dict[str, dict], chunk_result: dict):
    """
    Attach the comments of one morechildren response to the comment tree.

    root_listing  : top-level comment Listing; comments whose parent is the
                    submission (parent_id "t3_...") are appended to it.
    parent_lookup : lower-cased comment id -> comment node. It is updated in
                    place with the comments of this response.
    chunk_result  : decoded response, read at ["json"]["data"]["things"].

    Every returned comment is registered in `parent_lookup` before anything
    is attached, because the parent of a returned reply can be another comment
    of the same response. Looking parents up only among comments that were
    already in the tree would silently drop those replies.

    Things that are not comments (kind != "t1") are ignored here. Comments
    whose parent cannot be found are skipped.
    """
    things = ((chunk_result.get("json") or {}).get("data") or {}).get("things") or []
    comments = [t for t in things if isinstance(t, dict) and t.get("kind") == "t1"]

    # Pass 1: make every new comment findable as a parent (existing entries win).
    for thing in comments:
        comment_id = ((thing.get("data") or {}).get("id") or "").lower()
        if comment_id:
            parent_lookup.setdefault(comment_id, thing)

    # Pass 2: hang each comment under its parent comment or under the root.
    for thing in comments:
        data = thing.get("data") or {}
        pid = data.get("parent_id") or ""
        if pid.startswith("t1_"):
            parent = parent_lookup.get(pid[3:].lower())
            if not parent:
                continue
            replies = parent["data"].get("replies")
            if not isinstance(replies, dict):
                # Comments without replies carry a non-dict value here; replace it with an empty Listing.
                replies = {"kind": "Listing", "data": {"children": []}}
                parent["data"]["replies"] = replies
            replies["data"]["children"].append(thing)
        elif pid.startswith("t3_"):
            root_listing["data"]["children"].append(thing)

def strip_more_nodes(node):
    """
    Remove every 'more' placeholder from the tree rooted at `node`, in place.

    A Listing gets a new children list without 'more' entries and is then
    cleaned recursively; a comment (t1) is cleaned through its replies.
    """
    if not isinstance(node, dict):
        return
    kind, data = node.get("kind"), node.get("data", {})
    if kind == "Listing":
        kept = [child for child in data.get("children", []) if child.get("kind") != "more"]
        data["children"] = kept
        for child in kept:
            strip_more_nodes(child)
    elif kind == "t1":
        replies = data.get("replies")
        if isinstance(replies, dict):
            strip_more_nodes(replies)

def fetch_full_post_and_comments(link: str):
    """
    Download one submission together with its expanded comment tree.

    Returns (post_obj, full_comments_listing, raw_array):
      post_obj              - the `data` dict of the submission
      full_comments_listing - the comment Listing, with the comments behind
                              'more' placeholders fetched and attached and
                              all placeholders removed
      raw_array             - the decoded JSON array of the first response;
                              its second element is the same object as
                              full_comments_listing

    Raises ValueError when `link` is not an http(s) URL, the HTTP error from
    `raise_for_status()` for failed requests, and RuntimeError when the
    response does not have the expected shape.
    """
    if not link.startswith(("http://", "https://")):
        raise ValueError(f"Not a URL: {link}")

    post_id, sr_hint = parse_link(link)
    comments_url = normalize_comments_url(link, post_id)

    r = request_with_backoff("GET", comments_url, headers=oauth_headers(), timeout=60)
    if r.status_code == 403:
        # A 403 may mean the subreddit is quarantined: opt in once, then retry.
        sr = sr_hint
        if not sr:
            m = re.search(r"/r/([^/]+)/comments/", comments_url)
            sr = m.group(1) if m else ""
        if sr and accept_quarantine(sr):
            r = request_with_backoff("GET", comments_url, headers=oauth_headers(), timeout=60)
    r.raise_for_status()
    data = r.json()
    if not (isinstance(data, list) and len(data) >= 2):
        raise RuntimeError("Unexpected Reddit JSON format")

    post_listing = data[0]["data"]["children"]
    if not post_listing:
        raise RuntimeError("Post listing empty")
    post = post_listing[0]["data"]
    subreddit = post.get("subreddit") or sr_hint or ""
    link_id = post.get("id")  # base36
    comments_listing = data[1]

    # Resolve all "more" placeholders.
    if link_id:
        more_url = f"{OAUTH_API_BASE}/api/morechildren"
        requested = set()  # ids already queued, so no id is fetched twice
        while True:
            pending = []
            for cid in collect_more_ids(comments_listing):
                if cid not in requested:
                    requested.add(cid)
                    pending.append(cid)
            # The ids are tracked in `pending` now; drop the placeholders
            # from the tree so the returned listing never contains them.
            strip_more_nodes(comments_listing)
            if not pending:
                break

            pos = 0
            while pos < len(pending):  # `pending` can grow while it is processed
                chunk = pending[pos:pos + 100]
                pos += len(chunk)
                form = {
                    "link_id": f"t3_{link_id}",
                    "api_type": "json",
                    "children": ",".join(chunk),
                    "sort": "confidence",
                    "limit_children": False,
                    "raw_json": 1,
                }
                r2 = request_with_backoff("POST", more_url, headers=oauth_headers(), data=form, timeout=60)
                if r2.status_code == 403 and subreddit and accept_quarantine(subreddit):
                    r2 = request_with_backoff("POST", more_url, headers=oauth_headers(), data=form, timeout=60)
                r2.raise_for_status()
                payload = r2.json()
                parent_idx = index_comments_by_id(comments_listing)
                replace_more_with_children(comments_listing, parent_idx, payload)

                # A morechildren response can itself contain 'more' things.
                # They are not attached to the tree, so queue their ids here;
                # otherwise the comments behind them would never be fetched.
                things = ((payload.get("json") or {}).get("data") or {}).get("things") or []
                for thing in things:
                    if not (isinstance(thing, dict) and thing.get("kind") == "more"):
                        continue
                    for cid in (thing.get("data") or {}).get("children") or []:
                        if cid and cid not in requested:
                            requested.add(cid)
                            pending.append(cid)

    return post, comments_listing, data


In [19]:
# ---- Classification tables: an outbound link wins over a "media preview" ----
# Reddit-owned hosts that count as internal. i.redd.it and v.redd.it are left
# out on purpose: they serve native media and are listed separately below, so
# such posts are treated as media rather than as internal text.
INTERNAL_REDDIT_HOSTS = {
    "redd.it",
    "reddit.com",
    "www.reddit.com",
    "old.reddit.com",
    "np.reddit.com",
    "oauth.reddit.com",
}

# Hosts serving Reddit-native media (classified as media, never as external)
NATIVE_MEDIA_HOSTS = {
    "i.redd.it",
    "v.redd.it",
}

def domain_of(url: Optional[str]) -> str:
    """Return the lower-cased network location of `url`, or "" when it is empty or cannot be parsed."""
    if url:
        try:
            return urlparse(url).netloc.lower()
        except Exception:
            pass
    return ""

def classify_post(post_data: dict) -> str:
    """
    Sort a submission into one of: 'external', 'media', 'text'.

    Checked in this order:
      1) external -> a non-self post whose link host is neither a Reddit host
                     nor a native-media host (a preview does not change this)
      2) media    -> native-media host, media/video flags, a preview,
                     gallery data, or an image/video post hint
      3) text     -> everything else
    """
    # url_overridden_by_dest wins over url; the post's own `domain` field is
    # the fallback when the link yields no host.
    link = post_data.get("url_overridden_by_dest") or post_data.get("url")
    host = domain_of(link) or (post_data.get("domain") or "").lower()
    on_native_media_host = host in NATIVE_MEDIA_HOSTS

    # 1) External
    if not post_data.get("is_self"):
        if host and host not in INTERNAL_REDDIT_HOSTS and not on_native_media_host:
            return "external"

    # 2) Media: any single signal is enough
    hint = (post_data.get("post_hint") or "").lower()
    media_signals = (
        on_native_media_host,
        bool(post_data.get("media")),
        bool(post_data.get("is_video")),
        bool(post_data.get("preview")),
        bool(post_data.get("gallery_data")),
        hint in {"image", "hosted:video", "rich:video"},
    )

    # 3) Text otherwise
    return "media" if any(media_signals) else "text"

def extract_comments_full(listing_node):
    """
    Convert a comment Listing into a nested list of plain comment dicts.

    Each comment keeps a fixed set of fields plus a `replies` list built the
    same way, with no depth or count limit. Nodes that are neither a Listing
    nor a comment (t1) are dropped. A Listing yields a list; a single comment
    node yields one dict; anything else yields [].
    """
    copied_fields = ("id", "author", "author_fullname", "body", "body_html", "score", "created_utc")

    def convert(node):
        if not isinstance(node, dict):
            return None
        kind = node.get("kind")
        data = node.get("data", {})
        if kind == "Listing":
            converted = (convert(child) for child in data.get("children", []))
            return [entry for entry in converted if entry]
        if kind != "t1":
            return None

        item = {field: data.get(field) for field in copied_fields}
        item["permalink"] = "https://www.reddit.com" + data.get("permalink", "")
        item["is_submitter"] = data.get("is_submitter")
        item["parent_id"] = data.get("parent_id")

        # `replies` is a Listing dict when there are replies; any other value means none.
        replies = data.get("replies")
        nested = replies.get("data", {}).get("children", []) if isinstance(replies, dict) else []
        item["replies"] = [entry for entry in map(convert, nested) if entry]
        return item

    result = convert(listing_node)
    if isinstance(result, list):
        return result
    return result or []

def make_archive_object(post: dict, comments_listing: dict):
    """
    Assemble the document that gets written to disk for one post.

    Keys, in order:
      'raw_post'     - the submission object as received
      'raw_comments' - the comment Listing as passed in
      'comments'     - nested comment list produced by extract_comments_full
    """
    archive = {"raw_post": post, "raw_comments": comments_listing}
    archive["comments"] = extract_comments_full(comments_listing)
    return archive

def save_archive(doc: dict, base_out: Path, post_id: str, category: str):
    """
    Write `doc` as indented UTF-8 JSON to <base_out>/<category>/<post_id>.json.

    The text is written to a sibling "<post_id>.json.tmp" file first and then
    renamed over the destination. An interrupted write therefore never leaves
    a truncated "<post_id>.json" behind, which a later file-existence check
    would mistake for a finished archive. An existing archive stays untouched
    unless the new one was written completely.
    """
    target_dir = base_out / category
    target_dir.mkdir(parents=True, exist_ok=True)
    final_path = target_dir / f"{post_id}.json"
    tmp_path = target_dir / f"{post_id}.json.tmp"

    # Serialize before touching the disk so a non-serializable doc creates no file at all.
    text = json.dumps(doc, ensure_ascii=False, indent=2)
    try:
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(final_path)
    except BaseException:
        tmp_path.unlink(missing_ok=True)
        raise


In [20]:
# Progress bar + failed-links export + show (id, link, folder) for each success
try:
    from tqdm.auto import tqdm
except Exception:
    # tqdm is optional: without it the loop simply runs without a progress bar
    def tqdm(x, **kwargs): return x

base_out = Path(OUT_DIR)
base_out.mkdir(parents=True, exist_ok=True)

# Load links first so we can show a progress bar.
# "utf-8-sig" reads plain UTF-8 too, but also drops a leading byte-order mark.
# With plain "utf-8" a BOM stays glued to the first link, which then fails the
# http(s) prefix check and ends up in the failure list.
links: list[str] = []
with open(CSV_PATH, newline="", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    for row in reader:
        # only the first column is used; blank rows are ignored
        if row and row[0].strip():
            links.append(row[0].strip())

total = len(links)
ok = failed = skipped = 0

failed_rows = []  # collect failures to export later

def already_archived(pid: str) -> bool:
    """Return True when <pid>.json exists in any of the category folders under base_out."""
    return any((base_out / sub / f"{pid}.json").exists() for sub in ("media", "external", "text"))

for link in tqdm(links, desc="Archiving posts", unit="post"):
    # Resume support: skip links whose id (as parsed from the URL) is already on disk.
    pre_id, _ = parse_link(link)
    if pre_id and already_archived(pre_id):
        skipped += 1
        continue

    try:
        post, comments_listing, raw_array = fetch_full_post_and_comments(link)
        pid = post.get("id")
        if not pid:
            raise RuntimeError("Missing post id")

        category = classify_post(post)
        archive = make_archive_object(post, comments_listing)
        save_archive(archive, base_out, pid, category)
        ok += 1
        # show id, link, folder
        print(f"[OK] id={pid} | folder={category} | link={link}")
    except Exception as e:
        # Best effort: one bad link must not stop the run; it is recorded for failed.csv.
        failed += 1
        # HTTP errors carry a response; other exceptions leave the status empty
        status = getattr(getattr(e, "response", None), "status_code", "")
        failed_rows.append({
            "link": link,
            "guessed_id": pre_id or "",
            "error": str(e),
            "status": status,
        })

    # pacing: runs after every attempted link (skipped links `continue` before this)
    time.sleep(REQUEST_DELAY_SEC)

# Export failed links to CSV in OUT_DIR
if failed_rows:
    fail_path = base_out / "failed.csv"
    with fail_path.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["link", "guessed_id", "status", "error"])
        w.writeheader()
        w.writerows(failed_rows)
    print(f"\nSaved failed links to: {fail_path.resolve()}")

print(f"\nDone. Success: {ok}, Skipped: {skipped}, Failed: {failed}, Total: {total}. Output root: {base_out.resolve()}")


Archiving posts:   0%|          | 0/791 [00:00<?, ?post/s]

[OK] id=y696cb | folder=text | link=https://www.reddit.com/r/SluttyConfessions/comments/y696cb/i_love_letting_guys_who_lack_confidence_pound_my
[OK] id=yf5fyr | folder=text | link=https://www.reddit.com/r/sexstories/comments/yf5fyr/fun_at_a_swingers_house_party
[OK] id=ydqkgu | folder=text | link=https://www.reddit.com/r/sexstories/comments/ydqkgu/my_roommates_and_i_had_sex_with_the_boys_from
[OK] id=y9sqye | folder=text | link=https://www.reddit.com/r/SluttyConfessions/comments/y9sqye/f19_how_i_found_out_my_bf_was_big
[OK] id=y8qvtg | folder=text | link=https://www.reddit.com/r/SluttyConfessions/comments/y8qvtg/my_f_friends_girlfriend_cheated_on_him_with_his
[OK] id=1mnxp43 | folder=media | link=https://www.reddit.com/r/IWantToBeHerHentai2/comments/1mnxp43/i_dont_know_why_you_such_a_big_deal_about_it_its
[OK] id=ya5sk7 | folder=external | link=https://www.reddit.com/r/RealGirls/comments/ya5sk7/your_face_should_be_between_my_thighs_right_now
[OK] id=ya2yvv | folder=external | link=http

# EXTERNAL LINK EXTRACTION

In [None]:
# === Extract external links and download Redgifs as <post_id>.mp4 ===
import re
import csv
import json
import time
from pathlib import Path
from urllib.parse import urlparse

import requests

try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **kwargs): return x

# Output root and the folder holding the archived "external" posts
BASE_OUT = Path(OUT_DIR)
EXTERNAL_DIR = BASE_OUT.joinpath("external")

# Hosts that do NOT count as outbound links: Reddit-hosted media and Reddit itself
REDDIT_NATIVE_MEDIA = {"v.redd.it", "i.redd.it"}
REDDITS_OK = {
    "redd.it",
    "reddit.com",
    "www.reddit.com",
    "old.reddit.com",
    "np.reddit.com",
    "oauth.reddit.com",
}

# ---- 1) Helpers to read archives and extract the outbound link ----
def _domain(url: str) -> str:
    """Return the lower-cased host part of `url`, or "" when parsing fails."""
    host = ""
    try:
        host = urlparse(url).netloc.lower()
    except Exception:
        pass
    return host

def extract_external_url(archive_obj: dict) -> str | None:
    """
    From your saved archive object:
      { "raw_post": {...}, "raw_comments": {...}, "comments": [...] }
    Pull the outbound link for external posts.
    """
    post = archive_obj.get("raw_post") or {}
    # Prefer the 'url_overridden_by_dest' field; fallback to 'url'
    url = post.get("url_overridden_by_dest") or post.get("url")
    if not url:
        return None

    d = _domain(url)
    # Treat non-Reddit, non-native-media as external
    if d and d not in REDDITS_OK and d not in REDDIT_NATIVE_MEDIA:
        return url
    return None

# ---- 2) Redgifs normalization & API download ----
# Accept common Redgifs URL shapes:
RE_REDGIFS_ID = re.compile(
    r"""(?ix)
    (?:^|/)(?:watch|ifr)/([a-z0-9]+)     # redgifs.com/watch/<id> or /ifr/<id>
    |                                   # OR
    (?:^|/)(?:i)/([a-z0-9]+)            # i.redgifs.com/i/<id>
    """.strip()
)

def redgifs_id_from_url(url: str) -> str | None:
    """
    Extract the media ID from redgifs-style URLs:
      - https://redgifs.com/watch/<id>
      - https://www.redgifs.com/watch/<id>
      - https://v3.redgifs.com/watch/<id>
      - https://redgifs.com/ifr/<id>
      - https://i.redgifs.com/i/<id>
    """
    m = RE_REDGIFS_ID.search(url)
    if not m:
        return None
    # One of the two groups will be set
    gid = m.group(1) or m.group(2)
    return gid.lower() if gid else None

# Redgifs API endpoints: a temporary token is requested first, then the gif
# record (which carries the mp4 URLs) is looked up by id
REDGIFS_GIF_URL  = "https://api.redgifs.com/v2/gifs/{id}"
REDGIFS_AUTH_URL = "https://api.redgifs.com/v2/auth/temporary"

# Session used for Redgifs calls, plus the cached token and the time it was fetched
_SESSION = requests.Session()
_RG_TOKEN, _RG_TOKEN_TS = None, 0

def redgifs_token(force: bool = False) -> str:
    """
    Return a temporary Redgifs API token, using a module-level cache.

    A cached token younger than 1200 seconds is reused unless `force` is
    true. Raises the HTTP error from `raise_for_status()` on a failed request
    and RuntimeError when the response contains no token.
    """
    global _RG_TOKEN, _RG_TOKEN_TS
    now = time.time()
    cache_is_fresh = bool(_RG_TOKEN) and (now - _RG_TOKEN_TS) < 1200
    if cache_is_fresh and not force:
        return _RG_TOKEN
    response = _SESSION.get(REDGIFS_AUTH_URL, timeout=30)
    response.raise_for_status()
    _RG_TOKEN, _RG_TOKEN_TS = response.json().get("token"), now
    if _RG_TOKEN:
        return _RG_TOKEN
    raise RuntimeError("Failed to obtain Redgifs token.")

def redgifs_mp4_url(gid: str) -> str:
    """
    Look up the download URL of Redgifs item `gid`.

    The gif record is requested with the cached token; on a 401/403 the token
    is refreshed once and the request repeated. The first non-empty entry of
    urls.hd, urls.sd, urls.origin is returned; the result is falsy (usually
    None) when the record has none of them.
    """
    def fetch(token):
        return _SESSION.get(
            REDGIFS_GIF_URL.format(id=gid),
            headers={"Authorization": f"Bearer {token}"},
            timeout=30,
        )

    response = fetch(redgifs_token())
    if response.status_code in (401, 403):
        # token rejected: force a new one and try exactly once more
        response = fetch(redgifs_token(force=True))
    response.raise_for_status()

    urls = (response.json().get("gif") or {}).get("urls") or {}
    for quality in ("hd", "sd"):
        if urls.get(quality):
            return urls[quality]
    return urls.get("origin")

def download_stream(url: str, dest: Path, *, max_retries: int = 4):
    """
    Stream `url` into the file `dest`, retrying failed attempts.

    The body is written to a sibling "<name>.part" file that is renamed to
    `dest` only after the whole response has been received. A download that
    fails part-way therefore never leaves a truncated file at `dest`, which
    a later "file already exists" check would treat as a finished download.

    At least one attempt is made; between attempts the wait doubles, capped
    at 15 seconds. The exception of the last failed attempt is re-raised.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    part = dest.with_name(dest.name + ".part")
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            with _SESSION.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(part, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 256):
                        if chunk:
                            f.write(chunk)
            part.replace(dest)
            return
        except Exception:
            # Discard whatever was written so far before retrying or giving up.
            part.unlink(missing_ok=True)
            if attempt + 1 >= attempts:
                raise
            time.sleep(min(2 ** attempt, 15))

# ---- 3) Walk external posts, export external links CSV, download Redgifs ----
external_json_files = sorted(EXTERNAL_DIR.glob("*.json"))
print(f"Found {len(external_json_files)} external post JSONs in {EXTERNAL_DIR}")

external_rows = []    # one row per archived external post -> external_links.csv
redgifs_failed = []   # unreadable archives and failed Redgifs downloads -> redgifs_failed.csv

REDGIFS_OUT = BASE_OUT / "redgifs"
REDGIFS_OUT.mkdir(parents=True, exist_ok=True)

for fp in tqdm(external_json_files, desc="Scanning external posts", unit="post"):
    try:
        data = json.loads(fp.read_text(encoding="utf-8"))
        post = (data or {}).get("raw_post") or {}
        pid  = post.get("id") or fp.stem  # fallback to filename if needed

        ext_url = extract_external_url(data)
        if not ext_url:
            # Still record that this external-typed file has no resolvable URL
            external_rows.append({"id": pid, "link": "", "domain": ""})
            continue

        dom = _domain(ext_url)
        external_rows.append({"id": pid, "link": ext_url, "domain": dom})

        # Redgifs download: only for redgifs.com itself or one of its subdomains.
        # An exact/suffix comparison is used because a substring test would also
        # accept unrelated hosts that merely contain "redgifs.com".
        if dom == "redgifs.com" or dom.endswith(".redgifs.com"):
            gid = redgifs_id_from_url(ext_url)
            if not gid:
                # Sometimes the external URL is a redirect page; skip but log
                redgifs_failed.append({"id": pid, "link": ext_url, "reason": "no_id_from_url"})
                continue

            out_path = REDGIFS_OUT / f"{pid}.mp4"
            if out_path.exists():
                # already downloaded
                continue

            try:
                mp4_url = redgifs_mp4_url(gid)
                if not mp4_url:
                    redgifs_failed.append({"id": pid, "link": ext_url, "reason": "no_mp4_url"})
                    continue
                download_stream(mp4_url, out_path)
                # Show success line
                print(f"[REDGIFS] id={pid} -> {out_path.name}")
            except Exception as e:
                redgifs_failed.append({"id": pid, "link": ext_url, "reason": str(e)})
    except Exception as e:
        # Any other error for this file (unreadable or malformed JSON, unexpected
        # structure) is logged in the same failure list, whatever host the post links to.
        redgifs_failed.append({"id": fp.stem, "link": "", "reason": f"read_error: {e}"})

# ---- 4) Write summary CSVs ----
ext_csv = BASE_OUT / "external_links.csv"
with ext_csv.open("w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["id", "link", "domain"])
    w.writeheader()
    w.writerows(external_rows)

if redgifs_failed:
    fail_csv = BASE_OUT / "redgifs_failed.csv"
    with fail_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["id", "link", "reason"])
        w.writeheader()
        w.writerows(redgifs_failed)
    print(f"\nSaved Redgifs download failures to: {fail_csv.resolve()}")

print(f"\nSaved external links to: {ext_csv.resolve()}")
print(f"Redgifs saved (if any) to: {REDGIFS_OUT.resolve()}")


# MEDIA DOWNLOADER

In [None]:
# === Download embedded Reddit-hosted media for posts in out/media/*.json ===
import os
import re
import csv
import json
import time
import html
import mimetypes
from pathlib import Path
from urllib.parse import urlparse

import requests

try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **kwargs): return x

# Folder with the archived media-post JSONs, and the folder the files are saved to
BASE_OUT = Path(OUT_DIR)
MEDIA_JSON_DIR = BASE_OUT.joinpath("media")
MEDIA_OUT_DIR = BASE_OUT.joinpath("media_files")
MEDIA_OUT_DIR.mkdir(parents=True, exist_ok=True)

# HTTP session for the media downloads.
# NOTE(review): SESSION is also the name of the session used by the Reddit API
# helpers earlier in this notebook; running this cell replaces that object,
# including the OAuth token cached on it.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "reddit-media-downloader/1.0"})

# ---------- helpers ----------
def _clean_url(u: str | None) -> str | None:
    if not u:
        return None
    # Reddit often returns HTML-escaped URLs inside JSON
    return html.unescape(u)

def _domain(u: str | None) -> str:
    if not u:
        return ""
    try:
        return urlparse(u).netloc.lower()
    except Exception:
        return ""

def _ext_from_url_or_type(url: str | None, content_type: str | None) -> str:
    # Prefer extension from URL, else derive from content-type
    if url:
        path = urlparse(url).path
        ext = os.path.splitext(path)[1].lower()
        if ext in {".jpg", ".jpeg", ".png", ".gif", ".mp4", ".webm"}:
            return ext
    if content_type:
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
        if ext:
            # normalize jpeg
            return ".jpg" if ext == ".jpe" else ext
    # sensible default fallback
    return ".mp4" if (url and ".mp4" in url) else ".jpg"

def _stream_download(url: str, dest: Path, *, max_retries: int = 4, chunk=1024 * 256):
    """
    Stream `url` to disk and return the path that was finally written.

    When `dest` has no suffix, one is derived from the URL / Content-Type
    header, so the returned path can differ from `dest`.

    The body goes to a sibling "<name>.part" file that is renamed to its final
    name only after the whole response has been received. A download that
    fails part-way therefore never leaves a truncated file behind, which a
    later "file already exists" check would treat as a finished download.

    At least one attempt is made; between attempts the wait doubles, capped
    at 15 seconds. The exception of the last failed attempt is re-raised.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        part = None
        try:
            with SESSION.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                ctype = r.headers.get("Content-Type")
                # if dest has no extension yet, refine using content-type
                if dest.suffix == "" and ctype:
                    dest = dest.with_suffix(_ext_from_url_or_type(url, ctype))
                part = dest.with_name(dest.name + ".part")
                with open(part, "wb") as f:
                    for piece in r.iter_content(chunk_size=chunk):
                        if piece:
                            f.write(piece)
            part.replace(dest)
            return dest  # final path (may include refined suffix)
        except Exception:
            # Discard whatever was written so far before retrying or giving up.
            if part is not None:
                part.unlink(missing_ok=True)
            if attempt + 1 >= attempts:
                raise
            time.sleep(min(2 ** attempt, 15))

def _pick_best_preview(post: dict) -> str | None:
    """
    For image/GIF-like posts where 'preview' exists.
    Prefer MP4 variant (smaller, plays everywhere), else best image 'source'.
    """
    prev = post.get("preview") or {}
    variants = prev.get("variants") or {}
    # mp4 variant for gifs, etc.
    mp4v = variants.get("mp4") or variants.get("reddit_video_preview")
    if mp4v and mp4v.get("source", {}).get("url"):
        return _clean_url(mp4v["source"]["url"])
    # fallback to the image source
    src = (prev.get("images") or [{}])[0].get("source", {})
    if src.get("url"):
        return _clean_url(src["url"])
    return None

def _pick_vreddit_urls(post: dict) -> tuple[str | None, str | None]:
    """
    v.redd.it posts: return (preferred_mp4_url, fallback_mp4_url)
    Try in order: 'hls_url' (m3u8) -> 'fallback_url' (progressive) -> preview mp4.
    We only directly download MP4 (no ffmpeg merge here), so we prefer fallback_url,
    and otherwise try preview mp4.
    """
    media = post.get("media") or {}
    rv = media.get("reddit_video") or {}
    fallback = rv.get("fallback_url")  # often progressive mp4 (may be muted on long vids)
    hls = rv.get("hls_url")            # m3u8 playlist (would require ffmpeg)
    # If no fallback, sometimes preview.mp4 exists:
    prev_mp4 = None
    prev = post.get("preview") or {}
    pv = prev.get("reddit_video_preview") or {}
    if isinstance(pv, dict) and pv.get("fallback_url"):
        prev_mp4 = pv["fallback_url"]
    return (_clean_url(fallback), _clean_url(prev_mp4 or hls))

def _gallery_items(post: dict) -> list[tuple[str, str]]:
    """
    For gallery posts: return a list of (url, extension) in gallery order.

    The order comes from gallery_data.items; the URL of each item is taken
    from the 's' rendition in media_metadata, preferring mp4, then gif, then
    the still image ('u' or 'url'). Items without a URL are skipped.

    The extension always describes the URL that was actually chosen: ".mp4"
    for the mp4 rendition, ".gif" for the gif rendition, otherwise derived
    from the image URL / the item's mime type.
    """
    items = []
    meta = post.get("media_metadata") or {}
    gdata = post.get("gallery_data") or {}
    order = [e.get("media_id") for e in gdata.get("items") or [] if e.get("media_id")]
    for mid in order:
        m = meta.get(mid) or {}
        s = m.get("s") or {}
        if s.get("mp4"):
            url, ext = _clean_url(s["mp4"]), ".mp4"
        elif s.get("gif"):
            # a gif rendition is saved as a gif, not under a video extension
            url, ext = _clean_url(s["gif"]), ".gif"
        else:
            url = _clean_url(s.get("u") or s.get("url"))
            if not url:
                continue
            # still image: look at the URL first, then at the mime type
            ext = _ext_from_url_or_type(url, m.get("m"))
        items.append((url, ext))
    return items

# ---------- main walk ----------
# For every archived post in MEDIA_JSON_DIR, work out which Reddit-hosted file(s)
# belong to it and download them into MEDIA_OUT_DIR. The cases are tried in a fixed
# order (gallery, native video, direct image, preview); the first one that applies
# handles the post and `continue`s to the next file.
media_jsons = sorted(MEDIA_JSON_DIR.glob("*.json"))
print(f"Found {len(media_jsons)} media post JSONs in {MEDIA_JSON_DIR}")

fail_rows = []   # {"id", "reason"} rows written to media_failed.csv at the end
downloaded = 0   # number of files fetched in this run (existing files are not counted)

for fp in tqdm(media_jsons, desc="Downloading embedded media", unit="post"):
    try:
        data = json.loads(fp.read_text(encoding="utf-8"))
        post = (data or {}).get("raw_post") or {}
        pid = post.get("id") or fp.stem  # fall back to the file name when the post has no id

        # Prefer Reddit-hosted URL if present
        url = _clean_url(post.get("url_overridden_by_dest") or post.get("url"))
        dom = _domain(url)

        # Case A: gallery
        if post.get("is_gallery") or (post.get("gallery_data") and post.get("media_metadata")):
            items = _gallery_items(post)
            if not items:
                fail_rows.append({"id": pid, "reason": "gallery_no_items"})
                continue
            # Files are named <pid>_gNN<ext>, numbered from 01 in gallery order.
            for idx, (item_url, ext) in enumerate(items, start=1):
                outfile = MEDIA_OUT_DIR / f"{pid}_g{idx:02d}{ext if ext.startswith('.') else ('.' + ext)}"
                if outfile.exists():
                    continue
                try:
                    _stream_download(item_url, outfile)
                    downloaded += 1
                    print(f"[GAL] {pid} -> {outfile.name}")
                except Exception as e:
                    # one failed item does not stop the remaining items of the gallery
                    fail_rows.append({"id": pid, "reason": f"gallery_item_fail:{e}"})

            continue  # next post

        # Case B: native video (v.redd.it)
        if (post.get("is_video") or (post.get("media") or {}).get("reddit_video")) and dom.endswith("v.redd.it"):
            main_mp4, alt_mp4 = _pick_vreddit_urls(post)
            target = MEDIA_OUT_DIR / f"{pid}.mp4"
            if target.exists():
                continue
            src = main_mp4 or alt_mp4
            if not src:
                # last chance: look into preview variants
                src = _pick_best_preview(post)
            if not src:
                fail_rows.append({"id": pid, "reason": "vreddit_no_source"})
                continue
            try:
                _stream_download(src, target)
                downloaded += 1
                print(f"[VID] {pid} -> {target.name}")
            except Exception as e:
                fail_rows.append({"id": pid, "reason": f"vreddit_dl_fail:{e}"})
            continue

        # Case C: image / gif via i.redd.it or preview
        if dom.endswith("i.redd.it"):
            # Direct i.redd.it link
            ext = _ext_from_url_or_type(url, None)
            target = MEDIA_OUT_DIR / f"{pid}{ext}"
            if not target.exists():
                try:
                    _stream_download(url, target)
                    downloaded += 1
                    print(f"[IMG] {pid} -> {target.name}")
                except Exception as e:
                    fail_rows.append({"id": pid, "reason": f"ireddit_dl_fail:{e}"})
            continue

        # Fallback: try preview (covers some GIF-to-MP4 conversions)
        # Only preview URLs on the three Reddit media hosts listed below are accepted.
        prev_url = _pick_best_preview(post)
        if prev_url and _domain(prev_url) in {"i.redd.it", "v.redd.it", "preview.redd.it"}:
            # the extension is derived from the URL alone (no content type is passed)
            ext = _ext_from_url_or_type(prev_url, None)
            target = MEDIA_OUT_DIR / f"{pid}{ext}"
            if not target.exists():
                try:
                    _stream_download(prev_url, target)
                    downloaded += 1
                    print(f"[PREV] {pid} -> {target.name}")
                except Exception as e:
                    fail_rows.append({"id": pid, "reason": f"preview_dl_fail:{e}"})
            continue

        # If we reach here, it looks like a Reddit-hosted "media" without a reliable direct URL
        fail_rows.append({"id": pid, "reason": "no_reddit_media_url"})
    except Exception as e:
        # Anything not handled above (unreadable/invalid JSON, unexpected structure)
        # is recorded under the file name with a "read_error" reason.
        fail_rows.append({"id": fp.stem, "reason": f"read_error:{e}"})

# ---------- write failures ----------
# The CSV is only (re)written when this run produced at least one failure.
if fail_rows:
    fail_csv = BASE_OUT / "media_failed.csv"
    with fail_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["id", "reason"])
        w.writeheader()
        w.writerows(fail_rows)
    print(f"\nSaved media failures to: {fail_csv.resolve()}")

print(f"\nDone. Downloaded: {downloaded}. Files saved under: {MEDIA_OUT_DIR.resolve()}")
