In [1]:
# --- Reddit script app credentials ---
# Secrets are read from the process environment (optionally populated from a
# local .env file) so they are never stored in the notebook itself.
# SECURITY: earlier revisions of this cell hardcoded the client secret and the
# account password. Any value that was ever saved or shared that way must be
# rotated (regenerate the app secret, change the account password).
import os

try:
    from dotenv import load_dotenv
    load_dotenv(override=False)  # real environment variables win over .env entries
except ImportError:
    pass  # python-dotenv is optional; plain environment variables still work

REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
REDDIT_USERNAME = os.environ.get("REDDIT_USERNAME", "")
REDDIT_PASSWORD = os.environ.get("REDDIT_PASSWORD", "")
REDDIT_USER_AGENT = "SavedRedditJSON/1.0 by u/" + REDDIT_USERNAME

# Report which settings are missing without ever echoing a secret value.
for k, v in {
    "REDDIT_CLIENT_ID": REDDIT_CLIENT_ID,
    "REDDIT_CLIENT_SECRET": "[set]" if REDDIT_CLIENT_SECRET else "",
    "REDDIT_USERNAME": REDDIT_USERNAME,
    "REDDIT_PASSWORD": "[set]" if REDDIT_PASSWORD else "",
}.items():
    if not v:
        print(f"WARNING: missing {k}.")

In [2]:
# --- Notebook Config ---
# Where things are read from / written to
DATA_ROOT = "out"             # archive root; one sub-folder per bucket is created under it
CSV_PATH = "../links.csv"     # input list of post URLs (first CSV column of each row)

# Archiving behaviour
SKIP_EXISTING = True          # leave a post alone when <id>.json already exists in any bucket
COMMENTS_DEPTH = 1_000        # cap on nested reply depth (sent as `depth`, enforced again when flattening)
COMMENTS_LIMIT = 100_000      # cap on total comments per thread (sent as `limit`, enforced again when flattening)

# Politeness / resilience
REQ_MAX_RETRIES = 10          # retry budget for network errors and retryable HTTP statuses
DELAY_BETWEEN = 0.05          # seconds slept after each processed URL
BATCH_PAUSE = 100             # NOTE(review): not referenced by any cell visible here — confirm it is still needed

In [3]:
import os, re, csv, json, time, random, base64, requests
from datetime import datetime, timezone
from urllib.parse import urlparse, urlunparse, urlencode, parse_qsl

# Optional .env support: when python-dotenv is installed, variables from a
# local .env file are loaded without overriding ones already set in the
# environment. Any failure here is deliberately ignored (best effort).
try:
    from dotenv import load_dotenv
    _ = load_dotenv(override=False)
except Exception:
    pass

# Progress bar: prefer the notebook widget flavour, fall back to the plain
# console implementation when the widget import fails.
try:
    from tqdm.notebook import tqdm
except Exception:
    from tqdm import tqdm

# One report CSV per run, named after the local wall-clock start time.
RUN_TS = datetime.now().strftime("%Y%m%d-%H%M%S")
REPORTS_DIR = "reports"
os.makedirs(REPORTS_DIR, exist_ok=True)
REPORTS_CSV = os.path.join(REPORTS_DIR, f"run-{RUN_TS}.csv")

# Shared HTTP session; the User-Agent falls back to a generic value when
# REDDIT_USER_AGENT is empty.
UA = REDDIT_USER_AGENT or "Reddit-Archiver/JSON"
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA})

# Create the three output buckets up front so later writes and lookups can
# assume the directories exist.
for sub in ("media", "external", "text"):
    os.makedirs(os.path.join(DATA_ROOT, sub), exist_ok=True)


In [4]:
class RedditOAuth:
    """Password-grant OAuth client for a Reddit script app, with token caching.

    The access token is fetched lazily on first use and refreshed once 90% of
    its advertised lifetime has elapsed.
    """

    TOKEN_URL = "https://www.reddit.com/api/v1/access_token"

    def __init__(self, client_id, client_secret, username, password, user_agent):
        """Store the credentials; no network traffic happens here."""
        self.client_id = client_id
        self.client_secret = client_secret
        self.username = username
        self.password = password
        self.user_agent = user_agent
        self._token = None  # cached access token, None until the first fetch
        self._exp = 0       # epoch seconds after which the token is refreshed

    def fetch(self):
        """Request a fresh access token and cache it.

        Raises:
            requests.HTTPError: the token endpoint answered with an error status.
            RuntimeError: the response body carried no access token.
        """
        auth = requests.auth.HTTPBasicAuth(self.client_id, self.client_secret)
        data = {"grant_type": "password", "username": self.username, "password": self.password}
        headers = {"User-Agent": self.user_agent}
        r = requests.post(self.TOKEN_URL, auth=auth, data=data, headers=headers, timeout=30)
        r.raise_for_status()
        self._store_token(r.json())

    def _store_token(self, js):
        """Validate a token-endpoint payload and cache the token with its expiry.

        The endpoint can answer with a success status and an ``{"error": ...}``
        body (for example on wrong credentials), so a missing token is reported
        explicitly instead of surfacing as a bare KeyError. The previously
        cached token is left untouched when the payload is rejected.
        """
        if not isinstance(js, dict) or not js.get("access_token"):
            detail = js.get("error") if isinstance(js, dict) else js
            raise RuntimeError(f"Reddit token request failed: {detail!r}")
        self._token = js["access_token"]
        self._exp = time.time() + int(js.get("expires_in", 3600)) * 0.9  # refresh early

    def token(self):
        """Return a valid access token, fetching a new one when missing or expired."""
        if not self._token or time.time() >= self._exp:
            self.fetch()
        return self._token

    def headers(self):
        """Return the Authorization and User-Agent headers for an API request."""
        return {"Authorization": f"bearer {self.token()}", "User-Agent": self.user_agent}

# Module-level OAuth client used by oauth_request(); built from the REDDIT_*
# values defined in the first cell. Constructing it performs no network I/O —
# the token is only fetched on first use.
oauth = RedditOAuth(REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD, REDDIT_USER_AGENT)

def _to_oauth_url(url: str) -> str:
    """Rewrite a public Reddit URL so it targets the OAuth API host.

    For Reddit hosts the ``.json`` suffix is removed from the *path* (so it is
    also dropped when a query string or a trailing slash follows it), the host
    becomes ``oauth.reddit.com`` and the scheme is forced to https. URLs on
    other hosts keep the previous behaviour: only a literal ``.json`` at the
    very end of the string is removed.

    Args:
        url: Absolute URL; surrounding whitespace is ignored.

    Returns:
        The rewritten URL, or the stripped input when nothing needed changing.
    """
    public_hosts = {"www.reddit.com", "reddit.com", "old.reddit.com", "np.reddit.com"}
    cleaned = url.strip()
    parts = urlparse(cleaned)
    host = parts.netloc.lower()

    if host not in public_hosts and host != "oauth.reddit.com":
        return cleaned[:-5] if cleaned.endswith(".json") else cleaned

    # Drop ".json" from the path itself, preserving a trailing slash if present.
    path = parts.path
    stem = path.rstrip("/")
    if stem.endswith(".json"):
        had_trailing_slash = path != stem
        path = stem[: -len(".json")]
        if had_trailing_slash and not path.endswith("/"):
            path += "/"

    rewritten = parts._replace(path=path)
    if host in public_hosts:
        rewritten = rewritten._replace(scheme="https", netloc="oauth.reddit.com")

    if rewritten == parts:
        return cleaned  # avoid a needless parse/unparse round trip
    return urlunparse(rewritten)

def oauth_request(method: str, url: str, *, max_rate_limit_retries: int = 5, **kw):
    """Send an authenticated request to Reddit through the shared session.

    Args:
        method: HTTP verb passed to ``SESSION.request``.
        url: Target URL; public Reddit hosts are rewritten by ``_to_oauth_url``.
        max_rate_limit_retries: How many 429 responses are waited out before
            the 429 response itself is handed back to the caller. Bounding this
            keeps a persistent rate limit from looping forever.
        **kw: Forwarded to ``SESSION.request``. An optional ``headers`` mapping
            is copied, never modified in place.

    Returns:
        The final response object. A 401 triggers one token refresh and one
        retry; the outcome of that retry is returned as-is.
    """
    headers = dict(kw.pop("headers", None) or {})  # copy so the caller's dict stays untouched
    headers.update(oauth.headers())
    url2 = _to_oauth_url(url)
    rate_limit_hits = 0
    while True:
        r = SESSION.request(method, url2, headers=headers, **kw)
        if r.status_code == 401:  # expired token
            oauth._token = None
            headers.update(oauth.headers())
            r = SESSION.request(method, url2, headers=headers, **kw)
        if r.status_code == 429 and rate_limit_hits < max_rate_limit_retries:  # rate limited
            rate_limit_hits += 1
            delay = r.headers.get("retry-after")
            try:
                delay = float(delay) if delay is not None else 2.0
            except Exception:
                delay = 2.0
            r.close()  # release the connection before sleeping
            time.sleep(max(2.0, delay))
            continue
        return r

In [5]:
# Sanity check: confirm the credentials work before archiving anything.
me_resp = oauth_request("GET", "https://oauth.reddit.com/api/v1/me")
me_resp.raise_for_status()  # stop here on an error status instead of parsing an error payload
me = me_resp.json()
print("Auth user:", me.get("name"))

Auth user: Grand_Admiral_Tyken


In [6]:
def now_iso():
    """Return the current UTC time as an ISO-8601 string with an explicit offset."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()

def request_with_backoff(method: str, url: str, *, max_retries=5, timeout=30, stream=False, headers=None, params=None):
    """Call ``oauth_request`` and retry on network errors, 429 and 5xx responses.

    Network failures and retryable statuses share one attempt counter. The wait
    is exponential (capped at 60s) unless the response carries a numeric
    ``retry-after`` header; a little random jitter is always added.

    Returns:
        The first response whose status is not retryable.

    Raises:
        requests.RequestException: the last network error, once retries are spent.
        Whatever ``raise_for_status`` raises for the last 429/5xx response.
    """
    tries = 0
    while True:
        try:
            response = oauth_request(method, url, timeout=timeout, stream=stream, headers=headers, params=params)
        except requests.RequestException as exc:
            if tries >= max_retries:
                raise
            pause = min(60, 2 ** tries) + random.uniform(0, 0.5)
            print(f"Network error {exc}; retrying in {pause:.1f}s …")
            time.sleep(pause)
            tries += 1
            continue

        status = response.status_code
        retryable = status == 429 or 500 <= status < 600
        if not retryable:
            return response

        # Out of retries: surface the failure (or hand back the response if
        # raise_for_status does not raise).
        if tries >= max_retries:
            response.raise_for_status()
            return response

        hinted = response.headers.get("retry-after")
        if hinted is None:
            pause = min(60, 2 ** tries)
        else:
            try:
                pause = float(hinted)
            except Exception:
                pause = min(60, 2 ** tries)  # non-numeric hint: fall back to exponential backoff
        pause += random.uniform(0, 0.5)
        print(f"HTTP {status}; retrying in {pause:.1f}s …")
        time.sleep(pause)
        tries += 1
    
def find_existing_path_and_bucket(root, rid):
    """Locate an already archived ``<rid>.json``.

    Buckets are probed in the order media, external, text.

    Returns:
        ``(path, bucket)`` for the first hit, or ``(None, None)`` when no
        bucket holds the file.
    """
    filename = f"{rid}.json"
    candidates = (
        (os.path.join(root, bucket, filename), bucket)
        for bucket in ("media", "external", "text")
    )
    return next((hit for hit in candidates if os.path.exists(hit[0])), (None, None))

def out_path_any_bucket(root, rid):
    """Return the path of ``<rid>.json`` in the first bucket that has it, else None.

    Buckets are probed in the order media, external, text.
    """
    filename = f"{rid}.json"
    possible = (os.path.join(root, bucket, filename) for bucket in ("media", "external", "text"))
    return next(filter(os.path.exists, possible), None)

def out_path_for(root, bucket, rid):
    """Build the archive path ``<root>/<bucket>/<rid>.json`` (nothing is created)."""
    filename = f"{rid}.json"
    return os.path.join(root, bucket, filename)

def init_report():
    """Create the run report CSV with its header row; an existing file is left untouched."""
    if os.path.exists(REPORTS_CSV):
        return
    header = ["ts","id","status","url","bucket","out_path","reason","http_status"]
    with open(REPORTS_CSV, "w", newline="", encoding="utf-8") as fh:
        csv.writer(fh).writerow(header)

def log_report(*, ts, rid, status, url, bucket=None, out_path=None, reason=None, http_status=None):
    """Append one result row to the run report CSV.

    Falsy optional values (bucket, out_path, reason, http_status) are written
    as empty cells.
    """
    optional_cells = [cell or "" for cell in (bucket, out_path, reason, http_status)]
    with open(REPORTS_CSV, "a", newline="", encoding="utf-8") as fh:
        csv.writer(fh).writerow([ts, rid, status, url, *optional_cells])


In [7]:
def fetch_post_and_comments(url: str, *, max_retries=REQ_MAX_RETRIES):
    """Download one Reddit thread and split it into post and comment listing.

    Args:
        url: Absolute http(s) URL of the post.
        max_retries: Retry budget handed to ``request_with_backoff``.

    Returns:
        ``(post, comments_listing, data)`` — the post's data dict, the comment
        Listing node, and the full decoded JSON response.

    Raises:
        ValueError: ``url`` is not an http(s) URL.
        requests.HTTPError: Reddit answered with an error status (404, 403, ...).
        RuntimeError: the response is not the expected two-element listing.
    """
    if not url.startswith(("http://", "https://")):
        raise ValueError(f"Not a URL: {url}")

    # Normalise the *path* only: the trailing slash must go before any query
    # string (links often carry ?utm_... or ?context=N), and a ".json" suffix
    # is dropped because the request goes to the OAuth API.
    parts = urlparse(url.strip())
    path = parts.path
    if path.endswith(".json"):
        path = path[: -len(".json")]
    if not path.endswith("/"):
        path += "/"
    u = urlunparse(parts._replace(path=path, fragment=""))

    params = {"raw_json": 1, "limit": COMMENTS_LIMIT, "depth": COMMENTS_DEPTH}
    r = request_with_backoff("GET", u, max_retries=max_retries, timeout=30, params=params)
    # Fail with the real HTTP status instead of a generic "unexpected format"
    # error, so callers can record it.
    r.raise_for_status()
    data = r.json()
    if not (isinstance(data, list) and len(data) >= 2):
        raise RuntimeError("Unexpected Reddit JSON format")
    post_listing = data[0]["data"]["children"]
    if not post_listing:
        raise RuntimeError("Post listing empty")
    post = post_listing[0]["data"]
    comments_listing = data[1]
    return post, comments_listing, data


In [8]:
def extract_comments(listing_node, *, max_depth=COMMENTS_DEPTH, max_count=COMMENTS_LIMIT):
    """
    Flatten a Reddit comment 'Listing' into a list of nested comment dicts.

    Each comment looks like:
    { id, author, author_fullname, body, body_html, score, created_utc,
      permalink, is_submitter, parent_id, replies: [] }
    - 'replies' is always a list (Reddit's '' placeholder becomes [])
    - nodes of any kind other than 't1' (comment) and 'Listing' are ignored
    - max_depth bounds nesting, max_count bounds the total number of comments
    """
    top_level = []
    budget = max_count  # comments still allowed; shared by the whole walk

    def visit(node, level):
        nonlocal budget
        if budget <= 0 or level > max_depth:
            return None
        if not isinstance(node, dict):
            return None
        kind = node.get("kind")
        payload = node.get("data", {})

        if kind == "t1":
            budget -= 1
            comment = {
                "id": payload.get("id"),
                "author": payload.get("author"),
                "author_fullname": payload.get("author_fullname"),
                "body": payload.get("body"),
                "body_html": payload.get("body_html"),
                "score": payload.get("score"),
                "created_utc": payload.get("created_utc"),
                "permalink": "https://www.reddit.com" + (payload.get("permalink") or ""),
                "is_submitter": payload.get("is_submitter"),
                "parent_id": payload.get("parent_id"),
                "replies": []
            }
            thread = payload.get("replies")
            # A dict means a nested Listing; an empty string means "no replies".
            if thread and isinstance(thread, dict):
                for child in thread.get("data", {}).get("children", []):
                    if budget <= 0:
                        break
                    nested = visit(child, level + 1)
                    if nested:
                        comment["replies"].append(nested)
            return comment

        if kind == "Listing":
            # Children of a Listing stay on the same depth level and are
            # collected as top-level results.
            for child in node.get("data", {}).get("children", []):
                if budget <= 0:
                    break
                found = visit(child, level)
                if found:
                    top_level.append(found)
        return None

    visit(listing_node, 1)
    return top_level


In [9]:
def classify_media_kind(post: dict) -> str:
    """Classify a post as "gallery", "video", "image", "self" or "external".

    Checks run in that order on the post's own fields. A crosspost that matches
    none of the media/self rules is classified by its parent post (first entry
    of ``crosspost_parent_list``) instead of being treated as a video
    unconditionally — a crossposted text or link post is not a video.

    Args:
        post: The post's ``data`` dict from Reddit's JSON.

    Returns:
        One of the five kind strings listed above.
    """
    url = (post.get("url_overridden_by_dest") or post.get("url") or "").lower()
    domain = (post.get("domain") or "").lower()
    post_hint = (post.get("post_hint") or "").lower()

    def has_reddit_video(field: str) -> bool:
        holder = post.get(field)
        return isinstance(holder, dict) and bool(holder.get("reddit_video"))

    if post.get("is_gallery", False):
        return "gallery"
    if "v.redd.it" in url or has_reddit_video("secure_media") or has_reddit_video("media"):
        return "video"
    if post_hint == "image" or domain in ("i.redd.it", "i.reddituploads.com"):
        return "image"
    if post.get("is_self", False):
        return "self"

    parents = post.get("crosspost_parent_list")
    if isinstance(parents, list) and parents and isinstance(parents[0], dict):
        # Strip the parent's own crosspost list so the recursion is one level deep.
        origin = {k: v for k, v in parents[0].items() if k != "crosspost_parent_list"}
        return classify_media_kind(origin)
    return "external"

def select_bucket(media_kind: str) -> str:
    """Map a media kind to its output bucket.

    image / gallery / video go to "media", external goes to "external", and
    everything else (including "self") goes to "text".
    """
    if media_kind == "external":
        return "external"
    return "media" if media_kind in ("image", "gallery", "video") else "text"


In [10]:
def make_archive_object(post: dict, comments_listing: dict, *,
                        include_comments=True, comments_depth=COMMENTS_DEPTH, comments_limit=COMMENTS_LIMIT):
    """Build the JSON-serialisable archive record for one post.

    The record holds selected post fields, the computed ``media_kind``, the raw
    post, and — when ``include_comments`` is true — both the normalised comment
    tree (``comments``) and the untouched comment listing (``raw_comments``).
    ``external_link`` mirrors ``url`` for posts classified as external.
    Key order is fixed because it determines the layout of the written file.
    """
    # (key, default) pairs copied straight from the post, in output order.
    leading_fields = (
        ("title", ""), ("selftext", ""), ("author", None), ("author_fullname", None),
        ("subreddit", None), ("subreddit_id", None), ("created_utc", None), ("is_self", False),
    )
    trailing_fields = (
        ("domain", None), ("post_hint", None), ("is_gallery", False), ("over_18", False),
        ("spoiler", False), ("link_flair_text", None), ("is_original_content", False),
        ("stickied", False), ("locked", False), ("edited", None), ("num_comments", None),
        ("score", None), ("upvote_ratio", None),
    )

    archive = {
        "archived_at": now_iso(),
        "reddit_fullname": post.get("name"),
        "reddit_id": post.get("id"),
        "permalink": "https://www.reddit.com" + (post.get("permalink") or ""),
    }
    archive.update((key, post.get(key, default)) for key, default in leading_fields)
    archive["url"] = post.get("url_overridden_by_dest") or post.get("url")
    archive.update((key, post.get(key, default)) for key, default in trailing_fields)

    kind = classify_media_kind(post)
    archive["media_kind"] = kind
    archive["media"] = None              # (extend later if you also download assets)
    archive["external_link"] = archive["url"] if kind == "external" else None
    archive["raw_post"] = post
    archive["raw_comments"] = None

    if include_comments:
        archive["comments"] = extract_comments(comments_listing,
                                               max_depth=comments_depth,
                                               max_count=comments_limit)
        archive["raw_comments"] = comments_listing

    return archive

def write_archive_json(archive_obj, root=DATA_ROOT):
    """Write an archive record to ``<root>/<bucket>/<reddit_id>.json``.

    The JSON is written to a temporary sibling file and then moved into place,
    so an interrupted run never leaves a truncated ``<id>.json`` behind that a
    later skip-existing check would mistake for a finished archive.

    Args:
        archive_obj: Record with at least ``reddit_id`` and ``media_kind`` keys
            (the id falls back to "post" when missing).
        root: Output root directory.

    Returns:
        Path of the written JSON file.
    """
    rid = archive_obj.get("reddit_id") or "post"
    bucket = select_bucket(archive_obj.get("media_kind"))
    out_dir = os.path.join(root, bucket)
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f"{rid}.json")
    tmp_path = path + ".tmp"  # not matched by "*.json" globs
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(archive_obj, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # Only still present when writing or renaming failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return path


In [11]:
_POST_ID_IN_URL = re.compile(r"/comments/([0-9a-z]+)", re.IGNORECASE)


def _post_id_from_url(url):
    """Return the lowercase post id found in a ``/comments/<id>`` URL, or None."""
    found = _POST_ID_IN_URL.search(url or "")
    return found.group(1).lower() if found else None


def _process_result(ts, status, rid, url, *, bucket=None, path=None, reason=None, http_status=None):
    """Assemble the result dict returned by process_one()."""
    return {
        "ts": ts,
        "status": status,
        "id": rid,
        "url": url,
        "bucket": bucket,
        "path": path,
        "reason": reason,
        "http_status": http_status,
    }


def process_one(url: str, *, skip_existing=SKIP_EXISTING):
    """Archive one post URL and describe the outcome.

    When the post id can be read from the URL, the skip-existing check runs
    before any network request, so already archived posts cost nothing. URLs
    without an embedded id are fetched first and checked afterwards. On failure
    the id recovered from the URL (if any) is reported.

    Args:
        url: Post URL to archive.
        skip_existing: Skip posts whose JSON already exists in any bucket.

    Returns:
        Dict with keys ts, status ("success" | "skipped" | "failed"), id, url,
        bucket, path, reason and http_status. Never raises for ordinary errors.
    """
    ts = now_iso()
    rid = _post_id_from_url(url)
    try:
        if skip_existing and rid:
            existing_path, existing_bucket = find_existing_path_and_bucket(DATA_ROOT, rid)
            if existing_path:
                return _process_result(ts, "skipped", rid, url,
                                       bucket=existing_bucket, path=existing_path, reason="exists")

        post, comments_listing, _ = fetch_post_and_comments(url, max_retries=REQ_MAX_RETRIES)
        rid = post.get("id") or "post"

        # Second check covers URLs that did not reveal the id up front.
        if skip_existing:
            existing_path, existing_bucket = find_existing_path_and_bucket(DATA_ROOT, rid)
            if existing_path:
                return _process_result(ts, "skipped", rid, url,
                                       bucket=existing_bucket, path=existing_path, reason="exists")

        # Build archive, classify bucket, write file
        obj = make_archive_object(post, comments_listing)
        bucket = select_bucket(obj.get("media_kind"))
        path = write_archive_json(obj)  # writes to DATA_ROOT/<bucket>/<id>.json
        return _process_result(ts, "success", rid, url, bucket=bucket, path=path)

    except Exception as e:
        # Only HTTP errors carry a status code worth reporting.
        code = None
        if isinstance(e, requests.HTTPError):
            code = getattr(e.response, "status_code", None)
        return _process_result(ts, "failed", rid or "", url, reason=str(e), http_status=code)


In [12]:
def read_links(csv_path):
    """Read post URLs from the first column of a CSV file.

    Blank rows, empty first cells and lines starting with ``#`` are skipped.
    The file is decoded as ``utf-8-sig`` so a byte-order mark written by
    spreadsheet tools does not end up glued to the first URL.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        List of stripped URL strings in file order.
    """
    out = []
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        for row in csv.reader(f):
            if not row:
                continue
            u = (row[0] or "").strip()
            if not u or u.startswith("#"):
                continue
            out.append(u)
    return out

# Load the input list. A missing file produces an empty run with a warning
# instead of a FileNotFoundError traceback.
if os.path.exists(CSV_PATH):
    links = read_links(CSV_PATH)
else:
    print(f"WARNING: {CSV_PATH} not found; nothing to archive.")
    links = []

init_report()  # create this run's report CSV (header row) before the first log_report()

# Archive every link, tallying outcomes by status and logging one report row per link.
results = {"success": 0, "skipped": 0, "failed": 0}
with tqdm(total=len(links), desc="Archiving posts", unit="post") as pbar:
    for link in links:
        info = process_one(link)
        results[info["status"]] += 1
        log_report(
            ts=info["ts"], rid=info["id"], status=info["status"], url=info["url"],
            bucket=info.get("bucket"), out_path=info.get("path"),
            reason=info.get("reason"), http_status=info.get("http_status")
        )
        pbar.set_postfix(results=results)
        pbar.update(1)
        if DELAY_BETWEEN:
            time.sleep(DELAY_BETWEEN)  # polite delay between posts

print("Done.", results, "Report:", REPORTS_CSV)


Archiving posts:   0%|          | 0/1 [00:00<?, ?post/s]

Done. {'success': 1, 'skipped': 0, 'failed': 0} Report: reports\run-20251016-145538.csv


# EXTERNAL LINK EXTRACTION

In [13]:
# === Extract external links and download Redgifs as <post_id>.mp4 ===
import re
import csv
import json
import time
from pathlib import Path
from urllib.parse import urlparse

import requests

try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **kwargs): return x

# Output locations for this cell.
BASE_OUT = Path(DATA_ROOT)
EXTERNAL_DIR = BASE_OUT.joinpath("external")

# Hosts that are Reddit itself: a link to one of these is not an outbound link.
REDDITS_OK = {
    "reddit.com",
    "www.reddit.com",
    "old.reddit.com",
    "np.reddit.com",
    "oauth.reddit.com",
    "redd.it",
}
# Reddit's own media hosts: also excluded when extracting outbound links.
REDDIT_NATIVE_MEDIA = {
    "i.redd.it",
    "v.redd.it",
}

# ---- 1) Helpers to read archives and extract the outbound link ----
def _domain(url: str) -> str:
    """Return the lowercased host part of ``url``, or "" when parsing fails."""
    try:
        parsed = urlparse(url)
        return parsed.netloc.lower()
    except Exception:
        return ""

def extract_external_url(archive_obj: dict) -> str | None:
    """
    From your saved archive object:
      { "raw_post": {...}, "raw_comments": {...}, "comments": [...] }
    Pull the outbound link for external posts.
    """
    post = archive_obj.get("raw_post") or {}
    # Prefer the 'url_overridden_by_dest' field; fallback to 'url'
    url = post.get("url_overridden_by_dest") or post.get("url")
    if not url:
        return None

    d = _domain(url)
    # Treat non-Reddit, non-native-media as external
    if d and d not in REDDITS_OK and d not in REDDIT_NATIVE_MEDIA:
        return url
    return None

# ---- 2) Redgifs normalization & API download ----
# Accept common Redgifs URL shapes:
RE_REDGIFS_ID = re.compile(
    r"""(?ix)
    (?:^|/)(?:watch|ifr)/([a-z0-9]+)     # redgifs.com/watch/<id> or /ifr/<id>
    |                                   # OR
    (?:^|/)(?:i)/([a-z0-9]+)            # i.redgifs.com/i/<id>
    """.strip()
)

def redgifs_id_from_url(url: str) -> str | None:
    """
    Extract the media ID from redgifs-style URLs:
      - https://redgifs.com/watch/<id>
      - https://www.redgifs.com/watch/<id>
      - https://v3.redgifs.com/watch/<id>
      - https://redgifs.com/ifr/<id>
      - https://i.redgifs.com/i/<id>
    """
    m = RE_REDGIFS_ID.search(url)
    if not m:
        return None
    # One of the two groups will be set
    gid = m.group(1) or m.group(2)
    return gid.lower() if gid else None

# Redgifs API endpoints: a temporary token is obtained first, then used to
# resolve media URLs.
REDGIFS_AUTH_URL = "https://api.redgifs.com/v2/auth/temporary"
REDGIFS_GIF_URL  = "https://api.redgifs.com/v2/gifs/{id}"

# Session and token cache shared by the Redgifs helpers.
_SESSION = requests.Session()
_RG_TOKEN = None
_RG_TOKEN_TS = 0

def redgifs_token(force: bool = False) -> str:
    """Return a temporary Redgifs API token, reusing a cached one for up to 20 minutes.

    Args:
        force: Ignore the cache and request a new token.

    Raises:
        requests.HTTPError: the auth endpoint answered with an error status.
        RuntimeError: the response carried no token.
    """
    global _RG_TOKEN, _RG_TOKEN_TS
    started = time.time()
    cache_is_fresh = bool(_RG_TOKEN) and (started - _RG_TOKEN_TS) < 1200
    if cache_is_fresh and not force:
        return _RG_TOKEN

    response = _SESSION.get(REDGIFS_AUTH_URL, timeout=30)
    response.raise_for_status()
    _RG_TOKEN, _RG_TOKEN_TS = response.json().get("token"), started
    if _RG_TOKEN:
        return _RG_TOKEN
    raise RuntimeError("Failed to obtain Redgifs token.")

def redgifs_mp4_url(gid: str) -> str:
    """Resolve a Redgifs media id to a downloadable URL.

    The lookup is retried once with a freshly requested token when the first
    answer is 401 or 403. Among the returned renditions "hd" is preferred, then
    "sd", then "origin"; the result may be empty/None when none is present.

    Raises:
        requests.HTTPError: the lookup still fails after the token refresh.
    """
    def ask(force_new_token):
        bearer = redgifs_token(force=True) if force_new_token else redgifs_token()
        return _SESSION.get(
            REDGIFS_GIF_URL.format(id=gid),
            headers={"Authorization": f"Bearer {bearer}"},
            timeout=30,
        )

    reply = ask(False)
    if reply.status_code in (401, 403):
        reply = ask(True)
    reply.raise_for_status()

    gif = reply.json().get("gif") or {}
    renditions = gif.get("urls") or {}
    for quality in ("hd", "sd"):
        if renditions.get(quality):
            return renditions[quality]
    return renditions.get("origin")

def download_stream(url: str, dest: Path, *, max_retries: int = 4):
    """Stream ``url`` to ``dest``, retrying on failure.

    Data is written to ``<dest>.part`` and moved into place only after the
    whole body has arrived, so a failed or interrupted transfer never leaves a
    truncated file at ``dest`` that an "already downloaded" check would accept.

    Args:
        url: Source URL.
        dest: Final file path; parent directories are created.
        max_retries: Total number of attempts (at least one is always made).

    Raises:
        Exception: whatever the last attempt raised once all attempts failed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            with _SESSION.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(partial, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 256):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            partial.replace(dest)
            return
        except Exception:
            partial.unlink(missing_ok=True)  # never keep a half-written file
            if attempt + 1 >= attempts:
                raise
            time.sleep(min(2 ** attempt, 15))

# ---- 3) Walk external posts, export external links CSV, download Redgifs ----
external_json_files = sorted(EXTERNAL_DIR.glob("*.json"))
print(f"Found {len(external_json_files)} external post JSONs in {EXTERNAL_DIR}")

external_rows = []    # one row per external post: id, link, domain
redgifs_failed = []   # one row per failure: id, link, reason

REDGIFS_OUT = BASE_OUT / "redgifs"
REDGIFS_OUT.mkdir(parents=True, exist_ok=True)

for fp in tqdm(external_json_files, desc="Scanning external posts", unit="post"):
    try:
        data = json.loads(fp.read_text(encoding="utf-8"))
        post = (data or {}).get("raw_post") or {}
        pid  = post.get("id") or fp.stem  # fallback to filename if needed

        ext_url = extract_external_url(data)
        if not ext_url:
            # Still record that this external-typed file has no resolvable URL
            external_rows.append({"id": pid, "link": "", "domain": ""})
            continue

        dom = _domain(ext_url)
        external_rows.append({"id": pid, "link": ext_url, "domain": dom})

        # Redgifs download. Match the host exactly or as a subdomain: a plain
        # substring test would also accept unrelated hosts that merely contain
        # "redgifs.com" somewhere in their name.
        if dom == "redgifs.com" or dom.endswith(".redgifs.com"):
            gid = redgifs_id_from_url(ext_url)
            if not gid:
                # Sometimes the external URL is a redirect page; skip but log
                redgifs_failed.append({"id": pid, "link": ext_url, "reason": "no_id_from_url"})
                continue

            out_path = REDGIFS_OUT / f"{pid}.mp4"
            if out_path.exists():
                # already downloaded
                continue

            try:
                mp4_url = redgifs_mp4_url(gid)
                if not mp4_url:
                    redgifs_failed.append({"id": pid, "link": ext_url, "reason": "no_mp4_url"})
                    continue
                download_stream(mp4_url, out_path)
                # Show success line
                print(f"[REDGIFS] id={pid} -> {out_path.name}")
            except Exception as e:
                redgifs_failed.append({"id": pid, "link": ext_url, "reason": str(e)})
    except Exception as e:
        # Any unreadable or malformed JSON file lands in the failure list
        # (whether or not it referred to Redgifs), keyed by its file name.
        redgifs_failed.append({"id": fp.stem, "link": "", "reason": f"read_error: {e}"})

# ---- 4) Write summary CSVs ----
ext_csv = BASE_OUT / "external_links.csv"
with ext_csv.open("w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["id", "link", "domain"])
    w.writeheader()
    w.writerows(external_rows)

if redgifs_failed:
    fail_csv = BASE_OUT / "redgifs_failed.csv"
    with fail_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["id", "link", "reason"])
        w.writeheader()
        w.writerows(redgifs_failed)
    print(f"\nSaved Redgifs download failures to: {fail_csv.resolve()}")

print(f"\nSaved external links to: {ext_csv.resolve()}")
print(f"Redgifs saved (if any) to: {REDGIFS_OUT.resolve()}")


Found 0 external post JSONs in out\external


Scanning external posts: 0post [00:00, ?post/s]


Saved external links to: S:\minds\Desktop\Downloader and Reddit System\Saved-Reddit\SCRIPTS\out\external_links.csv
Redgifs saved (if any) to: S:\minds\Desktop\Downloader and Reddit System\Saved-Reddit\SCRIPTS\out\redgifs


# MEDIA DOWNLOADER

In [14]:
# === Download embedded Reddit-hosted media for posts in out/media/*.json ===
import os
import re
import csv
import json
import time
import html
import mimetypes
from pathlib import Path
from urllib.parse import urlparse

import requests

try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **kwargs): return x

# ---- configurable root (defaults to ./out) ----
# NOTE(review): this rebinds DATA_ROOT, which the config cell already set; the
# two values differ whenever the DATA_ROOT environment variable is defined.
DATA_ROOT = os.environ.get("DATA_ROOT", "out")

# Input: archived post JSONs of the media bucket. Output: downloaded files.
BASE_OUT = Path(DATA_ROOT)
MEDIA_JSON_DIR = BASE_OUT / "media"
MEDIA_OUT_DIR = BASE_OUT / "media_files"
MEDIA_OUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): this rebinds the module-level SESSION that oauth_request()
# looks up at call time, so re-running the archiving cells after this one goes
# through this session object — confirm that is intended.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "reddit-media-downloader/1.1 (preserve-originals)"})

# ---------- helpers ----------
# Characters stripped from titles when building folder names.
WIN_ILLEGAL = set('<>:"/\\|?*')

def _clean_url(u: str | None) -> str | None:
    if not u:
        return None
    # Reddit often returns HTML-escaped URLs inside JSON
    return html.unescape(u)

def _domain(u: str | None) -> str:
    if not u:
        return ""
    try:
        return urlparse(u).netloc.lower()
    except Exception:
        return ""

def _ext_from_url_or_type(url: str | None, content_type: str | None) -> str:
    # Prefer extension from URL, else derive from content-type
    if url:
        path = urlparse(url).path
        ext = os.path.splitext(path)[1].lower()
        if ext in {
            ".jpg", ".jpeg", ".png", ".webp", ".avif",
            ".gif", ".mp4", ".webm", ".mov"
        }:
            return ext
    if content_type:
        # normalize content-type → extension (mimetypes includes many)
        ct = content_type.split(";")[0].strip().lower()
        ext = mimetypes.guess_extension(ct) or ""
        if ext:
            # common normalizations
            if ext == ".jpe":
                return ".jpg"
            if ext == ".apng":
                return ".png"
            return ext.lower()
        # a few manual fallbacks
        if ct == "image/jpg":
            return ".jpg"
        if ct == "image/jpeg":
            return ".jpg"
        if ct == "image/webp":
            return ".webp"
        if ct == "image/avif":
            return ".avif"
        if ct == "image/gif":
            return ".gif"
        if ct == "video/mp4":
            return ".mp4"
        if ct == "video/webm":
            return ".webm"
    # last-resort: try to infer from URL substrings
    if url:
        low = url.lower()
        for marker, e in [
            (".jpg", ".jpg"), (".jpeg", ".jpg"), (".png", ".png"),
            (".webp", ".webp"), (".avif", ".avif"),
            (".gif", ".gif"), (".mp4", ".mp4"), (".webm", ".webm")
        ]:
            if marker in low:
                return e
    # default image if unknown
    return ".jpg"

def _stream_download(url: str, dest: Path, *, max_retries: int = 4, chunk=1024 * 256) -> Path:
    """
    Stream download to `dest` and return the final path.

    If `dest` has no suffix, one is derived from the response's Content-Type,
    so the returned path may differ from `dest`. Data is written to a ".part"
    sibling and moved into place only once the body is complete, so a failed
    transfer never leaves a truncated file that an exists() check would accept.

    Args:
        url: Source URL.
        dest: Target path; parent directories are created.
        max_retries: Total number of attempts (at least one is always made).
        chunk: Chunk size in bytes for streaming.

    Raises:
        Exception: whatever the last attempt raised once all attempts failed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    final_dest = dest
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        partial = None
        try:
            with SESSION.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                ctype = r.headers.get("Content-Type")
                # If dest has no extension yet, refine using content-type
                if final_dest.suffix == "" and ctype:
                    final_dest = final_dest.with_suffix(_ext_from_url_or_type(url, ctype))
                partial = final_dest.with_name(final_dest.name + ".part")
                with open(partial, "wb") as f:
                    for part in r.iter_content(chunk_size=chunk):
                        if part:  # skip keep-alive chunks
                            f.write(part)
            partial.replace(final_dest)
            return final_dest
        except Exception:
            if partial is not None:
                partial.unlink(missing_ok=True)  # never keep a half-written file
            if attempt + 1 >= attempts:
                raise
            time.sleep(min(2 ** attempt, 15))

def _pick_best_preview_original_first(post: dict) -> str | None:
    """
    For image/GIF-like posts where 'preview' exists.
    Prefer GIF (original) over MP4 transcodes; else best image 'source'.
    """
    prev = post.get("preview") or {}
    variants = prev.get("variants") or {}
    # prefer gif over mp4 to keep original
    gifv = variants.get("gif")
    if gifv and gifv.get("source", {}).get("url"):
        return _clean_url(gifv["source"]["url"])
    # image source
    src = (prev.get("images") or [{}])[0].get("source", {})
    if src.get("url"):
        return _clean_url(src["url"])
    # finally, allow mp4 if nothing else available
    mp4v = variants.get("mp4") or variants.get("reddit_video_preview")
    if mp4v and mp4v.get("source", {}).get("url"):
        return _clean_url(mp4v["source"]["url"])
    return None

def _pick_vreddit_urls(post: dict) -> tuple[str | None, str | None]:
    """
    v.redd.it posts: return (preferred_mp4_url, fallback_mp4_url)
    We only download MP4 for native reddit video; there's no "original" other than mp4.
    """
    media = post.get("media") or {}
    rv = media.get("reddit_video") or {}
    fallback = rv.get("fallback_url")  # progressive mp4
    hls = rv.get("hls_url")            # m3u8 (requires ffmpeg; we don't use it here)
    # Sometimes preview mp4 exists:
    prev = post.get("preview") or {}
    pv = prev.get("reddit_video_preview") or {}
    prev_mp4 = pv.get("fallback_url") if isinstance(pv, dict) else None
    return (_clean_url(fallback), _clean_url(prev_mp4 or hls))

def _safe_dirname_from_title(title: str | None, pid: str) -> str:
    """
    Build a Windows-safe folder name from the post title.
    - Strip illegal characters <>:"/\|?*
    - Collapse whitespace
    - Trim trailing dots/spaces
    - Limit length
    Fallback to post id if empty.
    """
    t = html.unescape((title or "").strip())
    # collapse whitespace
    t = re.sub(r"\s+", " ", t)
    # remove illegal chars
    t = "".join(ch for ch in t if ch not in WIN_ILLEGAL and ord(ch) >= 32)
    # trim length generously (Windows path limits are tighter, but this is fine)
    t = t[:120].strip(" .")
    if not t:
        t = pid
    return t

def _gallery_items_with_originals(post: dict) -> list[tuple[str, str]]:
    """
    List the media of a gallery post as ``(url, extension)`` pairs in gallery order.

    For each gallery entry the 's' rendition from media_metadata is used,
    taking the first available of 'gif', 'u', 'url', 'mp4'. Entries without a
    media id or without any URL are skipped. The extension is derived from the
    URL and the entry's 'm' (MIME type) field.
    """
    metadata = post.get("media_metadata") or {}
    gallery = post.get("gallery_data") or {}

    results = []
    for entry in gallery.get("items", []):
        media_id = entry.get("media_id")
        if not media_id:
            continue
        record = metadata.get(media_id) or {}
        best = record.get("s") or {}
        link = _clean_url(best.get("gif") or best.get("u") or best.get("url") or best.get("mp4"))
        if not link:
            continue
        results.append((link, _ext_from_url_or_type(link, record.get("m"))))
    return results

def _num_pad_width(n: int) -> int:
    """Choose padding width for numbering (01, 02 …)."""
    return max(2, len(str(n)))

# ---------- main walk ----------
def _dot_ext(ext: str | None) -> str:
    """Return `ext` with a leading dot; empty/None becomes '' (no extension)."""
    ext = ext or ""
    if not ext or ext.startswith("."):
        return ext
    return "." + ext


def _is_hls_playlist(u: str | None) -> bool:
    """True when `u` looks like an HLS playlist (.m3u8) rather than a directly downloadable file."""
    return bool(u) and ".m3u8" in u.lower()


def _download_or_record(src_url, dest, pid, tag, fail_prefix, failures, shown=None) -> bool:
    """
    Download `src_url` to `dest` via _stream_download().

    On success prints "[tag] pid -> name" and returns True. On any exception
    appends {"id": pid, "reason": "<fail_prefix>:<error>"} to `failures`
    and returns False.
    """
    try:
        _stream_download(src_url, dest)
    except Exception as e:
        failures.append({"id": pid, "reason": f"{fail_prefix}:{e}"})
        return False
    print(f"[{tag}] {pid} -> {shown or dest.name}")
    return True


media_jsons = sorted(MEDIA_JSON_DIR.glob("*.json"))
print(f"Found {len(media_jsons)} media post JSONs in {MEDIA_JSON_DIR}")

fail_rows = []   # one {"id", "reason"} dict per failure; written to CSV at the end
downloaded = 0   # number of files actually fetched in this run (skipped files don't count)

for fp in tqdm(media_jsons, desc="Downloading embedded media", unit="post"):
    try:
        data = json.loads(fp.read_text(encoding="utf-8"))
        post = (data or {}).get("raw_post") or {}
        pid = post.get("id") or fp.stem

        # Prefer Reddit-hosted URL if present
        url = _clean_url(post.get("url_overridden_by_dest") or post.get("url"))
        # Guard against a missing domain so the endswith() checks below cannot raise
        dom = _domain(url) or ""

        # Case A: gallery  (→ save into a folder named after the post id; files 01.ext, 02.ext…)
        if post.get("is_gallery") or (post.get("gallery_data") and post.get("media_metadata")):
            items = _gallery_items_with_originals(post)
            if not items:
                fail_rows.append({"id": pid, "reason": "gallery_no_items"})
                continue

            # The folder is named after the post id (not the title).
            folder = str(pid).strip()
            gal_dir = MEDIA_OUT_DIR / folder
            gal_dir.mkdir(parents=True, exist_ok=True)

            pad = _num_pad_width(len(items))
            for idx, (item_url, ext) in enumerate(items, start=1):
                # 01.ext, 02.ext …
                outfile = gal_dir / f"{str(idx).zfill(pad)}{_dot_ext(ext)}"
                if outfile.exists():
                    continue
                if _download_or_record(item_url, outfile, pid, "GAL", "gallery_item_fail",
                                       fail_rows, shown=f"{folder}/{outfile.name}"):
                    downloaded += 1
            continue  # next post

        # Case B: native video (v.redd.it) → original is MP4
        if (post.get("is_video") or (post.get("media") or {}).get("reddit_video")) and dom.endswith("v.redd.it"):
            main_mp4, alt_mp4 = _pick_vreddit_urls(post)
            target = MEDIA_OUT_DIR / f"{pid}.mp4"
            if target.exists():
                continue
            # The alternate URL may be an HLS playlist; saving that as .mp4 would leave
            # a bogus file that is then skipped forever by the exists() check above.
            src = next((u for u in (main_mp4, alt_mp4) if u and not _is_hls_playlist(u)), None)
            if not src:
                # last chance: look into preview variants (might be mp4)
                src = _pick_best_preview_original_first(post)
            if not src:
                fail_rows.append({"id": pid, "reason": "vreddit_no_source"})
                continue
            if _download_or_record(src, target, pid, "VID", "vreddit_dl_fail", fail_rows):
                downloaded += 1
            continue

        # Case C: direct image/gif via i.redd.it
        if dom.endswith("i.redd.it"):
            target = MEDIA_OUT_DIR / f"{pid}{_dot_ext(_ext_from_url_or_type(url, None))}"
            if not target.exists() and _download_or_record(url, target, pid, "IMG", "ireddit_dl_fail", fail_rows):
                downloaded += 1
            continue

        # Fallback: try preview, preferring original formats (gif/image) before mp4
        prev_url = _pick_best_preview_original_first(post)
        if prev_url and _domain(prev_url) in {"i.redd.it", "v.redd.it", "preview.redd.it"}:
            target = MEDIA_OUT_DIR / f"{pid}{_dot_ext(_ext_from_url_or_type(prev_url, None))}"
            if not target.exists() and _download_or_record(prev_url, target, pid, "PREV", "preview_dl_fail", fail_rows):
                downloaded += 1
            continue

        # If we reach here, it looks like a Reddit-hosted "media" without a reliable direct URL
        fail_rows.append({"id": pid, "reason": "no_reddit_media_url"})
    except Exception as e:
        # NOTE: catch-all — any unexpected error while handling this post (not only a
        # JSON read/parse failure) is recorded under the "read_error" reason.
        fail_rows.append({"id": fp.stem, "reason": f"read_error:{e}"})

# ---------- write failures ----------
# Only written when this run had failures; a CSV from an earlier run is left untouched.
if fail_rows:
    fail_csv = BASE_OUT / "media_failed.csv"
    with fail_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["id", "reason"])
        w.writeheader()
        w.writerows(fail_rows)
    print(f"\nSaved media failures to: {fail_csv.resolve()}")

print(f"\nDone. Downloaded: {downloaded}. Files saved under: {MEDIA_OUT_DIR.resolve()}")


Found 1 media post JSONs in out\media


  """


Downloading embedded media:   0%|          | 0/1 [00:00<?, ?post/s]

[GAL] 1o7yp0r -> 1o7yp0r/01.jpg
[GAL] 1o7yp0r -> 1o7yp0r/02.jpg
[GAL] 1o7yp0r -> 1o7yp0r/03.jpg
[GAL] 1o7yp0r -> 1o7yp0r/04.jpg
[GAL] 1o7yp0r -> 1o7yp0r/05.jpg
[GAL] 1o7yp0r -> 1o7yp0r/06.jpg
[GAL] 1o7yp0r -> 1o7yp0r/07.jpg
[GAL] 1o7yp0r -> 1o7yp0r/08.jpg
[GAL] 1o7yp0r -> 1o7yp0r/09.jpg
[GAL] 1o7yp0r -> 1o7yp0r/10.jpg

Done. Downloaded: 10. Files saved under: S:\minds\Desktop\Downloader and Reddit System\Saved-Reddit\SCRIPTS\out\media_files
