In [1]:
import os, re, csv, sys, subprocess, time, random
import requests
import argparse
from urllib.parse import urlparse, urljoin
import xml.etree.ElementTree as ET

# Shared HTTP session: every request issued through SESSION carries this
# User-Agent string.
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Reddit-Downloader/OptionB"
SESSION = requests.Session()
SESSION.headers["User-Agent"] = UA

In [2]:
def safe_name(s: str) -> str:
    """Turn an arbitrary title into a name that is safe to use as a file or folder.

    Every run of characters other than word characters, ``-``, ``.`` and space
    is replaced by a single ``_``; runs of spaces are collapsed; the result is
    capped at 200 characters.

    Leading and trailing dots/spaces are then removed. Without that step a
    title such as ``".."`` would survive unchanged and, once joined onto the
    output folder, point at the parent directory; trailing dots or spaces are
    also not valid at the end of a Windows path component.

    Returns:
        The cleaned name, or ``"reddit_download"`` when nothing usable is left.
    """
    cleaned = re.sub(r"[^\w\-. ]+", "_", s.strip())
    cleaned = re.sub(r"\s+", " ", cleaned)[:200]
    # Strip after truncating so a cut that lands on a space/dot is cleaned too.
    cleaned = cleaned.strip(" .")
    return cleaned or "reddit_download"

def ensure_dir(path: str):
    """Create directory ``path`` (and any missing parents) unless it is already there."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)

# ---- Backoff-aware HTTP helpers ----

def request_with_backoff(method: str, url: str, *, max_retries=5, timeout=30, stream=False, headers=None):
    """Send an HTTP request through SESSION, retrying transient failures.

    Retries (up to ``max_retries`` times) on network errors, HTTP 429 and HTTP
    5xx. The wait before a retry is the server's ``Retry-After`` value when it
    is a usable number of seconds, otherwise exponential backoff capped at 60s
    plus a little jitter.

    Args:
        method: HTTP verb, e.g. ``"GET"``.
        url: Absolute URL to request.
        max_retries: Number of retries allowed after the first attempt.
        timeout: Per-request timeout in seconds.
        stream: Passed through to the session (stream the body instead of
            loading it eagerly).
        headers: Optional extra headers for this request.

    Returns:
        The response object for any status below 400.

    Raises:
        requests.RequestException: for network errors once retries are
            exhausted, immediately for errors that are also ``ValueError``
            (malformed URL / unsupported scheme), and for HTTP errors raised
            by ``raise_for_status()`` (429/5xx after the last retry, any other
            4xx immediately).
    """
    attempt = 0
    while True:
        try:
            resp = SESSION.request(method, url, timeout=timeout, stream=stream, headers=headers)
        except requests.RequestException as e:
            # requests' URL/schema errors also derive from ValueError; the
            # same request can never succeed, so retrying only wastes time.
            if attempt >= max_retries or isinstance(e, ValueError):
                raise
            sleep = min(60, (2 ** attempt)) + random.uniform(0, 0.5)
            print(f"Network error {e}; retrying in {sleep:.1f}s …")
            time.sleep(sleep)
            attempt += 1
            continue

        if resp.status_code == 429 or 500 <= resp.status_code < 600:
            if attempt >= max_retries:
                resp.raise_for_status()
            retry_after = resp.headers.get("Retry-After")
            if retry_after is not None:
                try:
                    sleep = float(retry_after)
                except ValueError:
                    # Not a plain number of seconds (e.g. an HTTP date).
                    sleep = 10.0
                # time.sleep() rejects negative, NaN and infinite values.
                if not 0 <= sleep < float("inf"):
                    sleep = 10.0
            else:
                sleep = min(60, (2 ** attempt)) + random.uniform(0, 0.5)
            print(f"{resp.status_code} on {url}\nRetrying in {sleep:.1f}s …")
            # Release the connection of the discarded response; with
            # stream=True it would otherwise stay checked out of the pool.
            resp.close()
            time.sleep(sleep)
            attempt += 1
            continue

        # Other 4xx -> raise immediately
        if 400 <= resp.status_code < 500:
            resp.raise_for_status()

        return resp

def download_file(url: str, outpath: str, *, max_retries=5):
    """Stream ``url`` to ``outpath``, printing progress when the size is known.

    The body is written to ``outpath + ".part"`` and only renamed onto
    ``outpath`` once the whole stream has been read, so an interrupted
    transfer never leaves a truncated file under the final name. The part
    file is removed if the transfer fails.

    Args:
        url: Absolute URL to fetch (via ``request_with_backoff``).
        outpath: Destination file path; its directory must already exist.
        max_retries: Forwarded to ``request_with_backoff``.

    Raises:
        Whatever ``request_with_backoff``, the response stream or the file
        system raise; the exception is re-raised after cleanup.
    """
    part_path = outpath + ".part"
    label = os.path.basename(outpath)
    with request_with_backoff("GET", url, max_retries=max_retries, timeout=60, stream=True) as r:
        try:
            total = int(r.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            total = 0  # unusable header: download without a progress display
        done = 0
        try:
            with open(part_path, "wb") as f:
                for part in r.iter_content(chunk_size=1 << 15):
                    if not part:
                        continue
                    f.write(part)
                    done += len(part)
                    if total:
                        pct = done * 100 // total
                        print(f"\r  {label}  {pct}% ({done}/{total} bytes)", end="")
        except BaseException:
            # Includes KeyboardInterrupt: never keep a half-written file.
            try:
                os.remove(part_path)
            except OSError:
                pass
            raise
        finally:
            if total:
                print()  # terminate the \r progress line
    os.replace(part_path, outpath)

# ---- Reddit parsing + media handling ----

def get_post_json(url: str, *, max_retries=5) -> dict:
    """Fetch the JSON description of a Reddit post.

    The JSON endpoint is derived from the post URL: query string and fragment
    are dropped (share links often carry tracking parameters) and ``/.json``
    is appended to the path unless the path already ends in ``.json``.

    Args:
        url: Post URL; must start with ``http://`` or ``https://``.
        max_retries: Forwarded to ``request_with_backoff``.

    Returns:
        For the usual list-shaped response, the ``data`` dict of the first
        child of the first listing. A response that is not a list is returned
        unchanged.

    Raises:
        ValueError: ``url`` is not an http(s) URL.
        RuntimeError: the response is a list but contains no post.
    """
    if not url.startswith(("http://", "https://")):
        raise ValueError(f"Not a URL: {url}")
    parts = urlparse(url)
    path = parts.path
    # Test the suffix on the path itself, before touching slashes, so that
    # ".../post.json" and ".../post/.json" are left alone.
    if not path.endswith(".json"):
        path = path.rstrip("/") + "/.json"
    u = parts._replace(path=path, params="", query="", fragment="").geturl()

    data = request_with_backoff("GET", u, max_retries=max_retries, timeout=30).json()
    if isinstance(data, list):
        try:
            children = data[0]["data"]["children"]
        except (IndexError, KeyError, TypeError):
            children = []
        if not children:
            raise RuntimeError(f"No post data in response for {u}")
        return children[0]["data"]
    return data

def pick_best_from_mpd(mpd_xml: str, manifest_url: str = ""):
    """Pick the highest-bandwidth video and audio stream from a DASH manifest.

    Stream URLs are resolved level by level: ``manifest_url`` first, then any
    ``BaseURL`` child of the MPD root, the Period, the AdaptationSet and
    finally the Representation itself. Absolute URLs are kept as they are.

    A representation's media type is taken from its own ``mimeType``, then the
    AdaptationSet's ``mimeType``, then the AdaptationSet's ``contentType``, so
    manifests that declare the type at either level are understood.

    Args:
        mpd_xml: Manifest document as text (with or without an XML namespace).
        manifest_url: Optional URL the manifest was fetched from, used as the
            starting point for relative stream URLs. When omitted, relative
            URLs are returned relative.

    Returns:
        ``(video_url, audio_url)``; either element is ``None`` when the
        manifest lists no usable stream of that kind.

    Raises:
        xml.etree.ElementTree.ParseError: ``mpd_xml`` is not well-formed XML.
    """
    root = ET.fromstring(mpd_xml)
    # ElementTree spells namespaced tags "{uri}Tag"; reuse the root's prefix
    # for every lookup so namespaced and plain manifests share one code path.
    q = root.tag[: root.tag.index("}") + 1] if root.tag.startswith("{") else ""

    def base_text(elem) -> str:
        node = elem.find(f"{q}BaseURL")
        return (node.text or "").strip() if node is not None else ""

    def resolve(elem, inherited: str) -> str:
        own = base_text(elem)
        return urljoin(inherited, own) if own else inherited

    def to_int(value) -> int:
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    # Start below zero so a lone representation without a bandwidth still wins.
    best = {"video": (-1, None), "audio": (-1, None)}
    root_base = resolve(root, manifest_url)
    for period in root.findall(f"{q}Period") or [root]:
        period_base = root_base if period is root else resolve(period, root_base)
        for aset in period.iter(f"{q}AdaptationSet"):
            aset_base = resolve(aset, period_base)
            for rep in aset.findall(f"{q}Representation"):
                rel = base_text(rep)
                if not rel:
                    continue  # nothing to download for this representation
                kind = (rep.get("mimeType") or aset.get("mimeType")
                        or aset.get("contentType") or "").lower()
                bw = to_int(rep.get("bandwidth"))
                for media in ("video", "audio"):
                    if kind.startswith(media) and bw > best[media][0]:
                        best[media] = (bw, urljoin(aset_base, rel))
    return best["video"][1], best["audio"][1]

def has_ffmpeg() -> bool:
    """Report whether an ``ffmpeg`` executable can be launched.

    Runs ``ffmpeg -version`` with its output discarded. Returns ``False`` only
    when the executable cannot be found; the command's exit status is ignored.
    """
    probe = ["ffmpeg", "-version"]
    try:
        subprocess.run(probe, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False)
    except FileNotFoundError:
        return False
    else:
        return True

def merge_av(video_path: str, audio_path: str, out_path: str):
    """Combine a video file and an audio file into ``out_path`` using FFmpeg.

    Both inputs are stream-copied (``-c copy``) and an existing output file is
    overwritten (``-y``).

    Raises:
        RuntimeError: FFmpeg exited with a non-zero status; the message
            carries FFmpeg's stderr output.
    """
    result = subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-i", audio_path, "-c", "copy", out_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if result.returncode == 0:
        return
    raise RuntimeError(f"FFmpeg failed:\n{result.stderr}")




In [3]:
def handle_image(post: dict, outdir: str, *, max_retries=5) -> str:
    """Download the single image a post points at.

    The file is named after the sanitised post title; its extension comes from
    the URL path, defaulting to ``.jpg``.

    Args:
        post: Post data dict; ``url_overridden_by_dest`` is preferred over
            ``url``.
        outdir: Existing directory to save into.
        max_retries: Forwarded to ``download_file``.

    Returns:
        Path of the saved image.

    Raises:
        RuntimeError: the post carries no URL.
    """
    url = post.get("url_overridden_by_dest") or post.get("url")
    if not url:
        raise RuntimeError("No image URL found.")
    # URLs in the post JSON can arrive with "&" HTML-escaped; undo it the same
    # way handle_gallery does, otherwise signed query strings break.
    url = url.replace("&amp;", "&")
    # `or` also covers a title that is present but None/empty.
    title = safe_name(post.get("title") or "reddit_image")
    ext = os.path.splitext(urlparse(url).path)[1] or ".jpg"
    out = os.path.join(outdir, f"{title}{ext}")
    print(f"Downloading image → {out}")
    download_file(url, out, max_retries=max_retries)
    return out

def handle_gallery(post: dict, outdir: str, *, max_retries=5) -> list:
    """Download every item of a gallery post into ``<outdir>/<title>/``.

    For each gallery item the source media is preferred (``s.u``, or ``s.gif``
    for animated items); the largest preview (last entry of ``p``) is used
    only when no source URL exists. Items without any downloadable URL are
    skipped with a message instead of aborting the whole gallery.

    Files are named by gallery position (``01``, ``02``, …). The extension is
    chosen from the item's MIME type, read from the item's top-level ``m`` key
    and, failing that, from ``s.m``. NOTE(review): the top-level location is
    what Reddit appears to use; confirm against a live response.

    Args:
        post: Post data dict carrying ``media_metadata`` and ``gallery_data``.
        outdir: Parent directory for the gallery folder.
        max_retries: Forwarded to ``download_file``.

    Returns:
        Paths of the files that were downloaded, in gallery order.

    Raises:
        RuntimeError: the post has no gallery metadata.
    """
    media_meta = post.get("media_metadata") or {}
    gallery_data = (post.get("gallery_data") or {}).get("items") or []
    if not media_meta or not gallery_data:
        raise RuntimeError("No gallery metadata found.")
    title = safe_name(post.get("title") or "reddit_gallery")
    gallery_dir = os.path.join(outdir, title)
    ensure_dir(gallery_dir)
    outputs = []
    for i, item in enumerate(gallery_data, 1):
        meta = media_meta.get(item.get("media_id")) or {}
        source = meta.get("s") or {}
        previews = meta.get("p") or []

        candidate = source.get("u") or source.get("gif")
        used_preview = False
        if not candidate and previews:
            candidate = previews[-1].get("u")
            used_preview = True
        if not candidate:
            print(f"Skipping gallery item {i}: no downloadable media (status: {meta.get('status')})")
            continue
        candidate = candidate.replace("&amp;", "&")

        mime = (meta.get("m") or source.get("m") or "").lower()
        if "png" in mime:
            ext = ".png"
        elif "gif" in mime and not used_preview:
            # A preview of an animated item is a still image, not a GIF.
            ext = ".gif"
        else:
            ext = ".jpg"

        out = os.path.join(gallery_dir, f"{i:02d}{ext}")
        print(f"Downloading gallery item {i} → {out}")
        download_file(candidate, out, max_retries=max_retries)
        outputs.append(out)
    return outputs

def handle_video(post: dict, outdir: str, *, max_retries=5) -> str:
    """Download a Reddit-hosted video, merging separate audio when present.

    Steps:
      1. Locate the ``reddit_video`` dict on the post or its crosspost
         parents; for a bare ``v.redd.it`` link, derive the manifest URL.
      2. Fetch the DASH manifest and pick the best video/audio streams.
         Stream URLs are resolved against the manifest URL, because the
         manifest may list them as relative file names.
      3. Download the streams and merge them with FFmpeg, or save the video
         alone when there is no audio stream.

    The ``fallback_url`` MP4 is used when the manifest cannot be fetched (HTTP
    error), when it lists no streams, or as the video source when it lists
    audio only.

    Args:
        post: Post data dict.
        outdir: Directory to save into (created if missing).
        max_retries: Forwarded to the HTTP helpers.

    Returns:
        Path of the final ``<title>.mp4``.

    Raises:
        RuntimeError: no video information on the post, no downloadable video
            stream, FFmpeg missing while audio needs merging (raised before
            anything is downloaded), or FFmpeg failing.
        requests.HTTPError: the manifest request failed and no fallback URL
            exists.
    """
    title = safe_name(post.get("title") or "reddit_video")
    ensure_dir(outdir)
    final_path = os.path.join(outdir, f"{title}.mp4")

    reddit_video = _find_reddit_video(post)
    if not reddit_video:
        url = post.get("url_overridden_by_dest") or post.get("url") or ""
        if "v.redd.it" not in url:
            raise RuntimeError("No reddit_video found on this post.")
        reddit_video = {"dash_url": url.rstrip("/") + "/DASHPlaylist.mpd"}

    # URLs in the post JSON can arrive with "&" HTML-escaped.
    dash_url = (reddit_video.get("dash_url") or "").replace("&amp;", "&")
    fallback = (reddit_video.get("fallback_url") or "").replace("&amp;", "&")

    v_url = a_url = None
    if dash_url:
        print(f"Fetching DASH manifest:\n  {dash_url}")
        try:
            manifest = request_with_backoff("GET", dash_url, max_retries=max_retries, timeout=30).text
        except requests.HTTPError as exc:
            # request_with_backoff raises on 4xx, so a forbidden/missing
            # manifest surfaces here rather than as a status code to inspect.
            if not fallback:
                raise
            print(f"Could not fetch DASH manifest ({exc}); using fallback MP4.")
        else:
            v_url, a_url = pick_best_from_mpd(manifest)
            # urljoin leaves absolute URLs untouched and anchors relative ones.
            v_url = urljoin(dash_url, v_url) if v_url else None
            a_url = urljoin(dash_url, a_url) if a_url else None
            if not v_url and not a_url and fallback:
                print("No representations in MPD; falling back to single MP4.")

    if not v_url:
        if not fallback:
            raise RuntimeError("No downloadable video stream found for this post.")
        if not a_url:
            print(f"Downloading fallback MP4 (may be video-only):\n  {fallback}\n→ {final_path}")
            download_file(fallback, final_path, max_retries=max_retries)
            return final_path
        # Manifest lists audio only: pair it with the fallback video.
        v_url = fallback

    if not a_url:
        print(f"Downloading best video:\n  {v_url}\n→ {final_path}")
        download_file(v_url, final_path, max_retries=max_retries)
        return final_path

    # Check before downloading so a missing FFmpeg costs no bandwidth and
    # leaves no stray part files behind.
    if not has_ffmpeg():
        raise RuntimeError("FFmpeg not found to merge audio+video. Install FFmpeg or add it to PATH.")

    v_path = os.path.join(outdir, f"{title}.video.mp4")
    a_path = os.path.join(outdir, f"{title}.audio.mp4")
    print(f"Downloading best video:\n  {v_url}\n→ {v_path}")
    download_file(v_url, v_path, max_retries=max_retries)
    print(f"Downloading best audio:\n  {a_url}\n→ {a_path}")
    download_file(a_url, a_path, max_retries=max_retries)

    print("Merging A+V with FFmpeg…")
    merge_av(v_path, a_path, final_path)
    for p in (v_path, a_path):
        try:
            os.remove(p)
        except OSError:
            pass  # best-effort cleanup of the intermediate files
    return final_path


def _find_reddit_video(post: dict):
    """Return the first ``reddit_video`` dict on the post or its crosspost parents, else None."""
    for entry in [post] + list(post.get("crosspost_parent_list") or []):
        for key in ("secure_media", "media"):
            media = entry.get(key) or {}
            if media.get("reddit_video"):
                return media["reddit_video"]
    return None

def log_external_link(post: dict, post_url: str, links_csv: str):
    """Append one row describing a post's outbound link to ``links_csv``.

    The row holds the post URL, the link target (``url_overridden_by_dest``
    or ``url``), the title and the lower-cased domain. A header row is written
    first when the CSV file does not exist yet.
    """
    header = ["post_url", "external_url", "title", "domain"]
    external_url = post.get("url_overridden_by_dest") or post.get("url") or ""
    record = [
        post_url,
        external_url,
        post.get("title", ""),
        (post.get("domain") or "").lower(),
    ]
    needs_header = not os.path.exists(links_csv)
    with open(links_csv, "a", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerows(([header] if needs_header else []) + [record])
    print(f"Logged external link → {links_csv}\n  {external_url}")

def classify_and_handle(url: str, outdir: str, links_csv: str, *, max_retries=5) -> list:
    """Fetch a post, decide what kind of media it holds and process it.

    Checked in this order: gallery, Reddit-hosted video, single image; any
    other post is logged to ``links_csv`` instead of being downloaded.

    A post counts as a hosted video when its URL contains ``v.redd.it`` or
    when the post itself, or one of its crosspost parents, carries a
    ``reddit_video`` entry. Merely being a crosspost is not enough, so
    crossposted images and links are classified by their own fields.

    Args:
        url: Reddit post URL.
        outdir: Output directory (created if missing).
        links_csv: CSV file that receives non-downloadable links.
        max_retries: Forwarded to the fetch/download helpers.

    Returns:
        Paths of the downloaded files; empty when the post was only logged.

    Raises:
        RuntimeError: the fetched post data is not a dict.
    """
    ensure_dir(outdir)
    post = get_post_json(url, max_retries=max_retries)
    if not isinstance(post, dict):
        raise RuntimeError(f"Unexpected response shape for {url}")

    def has_reddit_video(entry: dict) -> bool:
        return any((entry.get(key) or {}).get("reddit_video") for key in ("secure_media", "media"))

    is_gallery = post.get("is_gallery", False)
    post_hint = (post.get("post_hint") or "").lower()
    domain = (post.get("domain") or "").lower()
    p_url = (post.get("url_overridden_by_dest") or post.get("url") or "").lower()
    parents = post.get("crosspost_parent_list") or []

    is_reddit_image = post_hint == "image" or domain in ("i.redd.it", "i.reddituploads.com")
    is_reddit_video = (
        "v.redd.it" in p_url
        or has_reddit_video(post)
        or any(has_reddit_video(parent) for parent in parents)
    )

    results = []
    if is_gallery:
        print("Detected gallery.")
        results.extend(handle_gallery(post, outdir, max_retries=max_retries))
    elif is_reddit_video:
        print("Detected hosted video (v.redd.it or reddit_video).")
        results.append(handle_video(post, outdir, max_retries=max_retries))
    elif is_reddit_image:
        print("Detected single image.")
        results.append(handle_image(post, outdir, max_retries=max_retries))
    else:
        print("Detected external link; logging instead of downloading.")
        log_external_link(post, url, links_csv)
    return results

In [4]:
# ---- CLI / batching ----

def main():
    """Command-line entry point: download one post, a CSV batch, or prompt for a URL.

    Order of precedence:
      1. ``--url``: process that single post.
      2. ``--csv`` (default ``posts.csv``), if the file exists: process the
         first column of every row, skipping blank rows and rows starting with
         ``#``, in batches with pauses in between.
      3. Otherwise prompt for a URL on stdin.

    A failure on one post is printed and does not stop the run.
    """
    parser = argparse.ArgumentParser(
        description="Reddit image/gallery/video downloader (no yt-dlp). External link posts are logged to CSV."
    )
    parser.add_argument("-u", "--url", help="Single Reddit post URL")
    parser.add_argument("-c", "--csv", dest="in_csv", default="posts.csv",
                        help="CSV file with one Reddit URL per line")
    parser.add_argument("-o", "--outdir", default="downloads", help="Output folder for saved media")
    parser.add_argument("--links-csv", default="external_links.csv",
                        help="Where to log external (non-Reddit-hosted) links")
    parser.add_argument("--batch-size", type=int, default=25, help="Posts per batch before pausing")
    parser.add_argument("--batch-pause", type=int, default=90, help="Seconds to sleep between batches")
    parser.add_argument("--delay", type=float, default=2.0, help="Seconds to sleep between posts")
    parser.add_argument("--max-retries", type=int, default=5, help="Max HTTP retries on 429/5xx")
    # parse_known_args tolerates unrecognised arguments instead of exiting.
    args, _ = parser.parse_known_args()

    # Clamp values that would otherwise break the loop below: a batch size
    # below 1 never advances, and time.sleep() rejects negative durations.
    batch_size = max(1, args.batch_size)
    batch_pause = max(0, args.batch_pause)
    delay = max(0.0, args.delay)

    ensure_dir(args.outdir)

    def process_one(link: str, idx: int):
        print(f"\n[{idx}] >>> {link}")
        try:
            files = classify_and_handle(link, args.outdir, args.links_csv, max_retries=args.max_retries)
            for p in files:
                print("Saved:", p)
        except Exception as e:
            # Deliberately broad: one bad post must not abort the batch.
            print(f"Failed {link}: {e}")

    if args.url:
        process_one(args.url, 1)
        return

    if os.path.exists(args.in_csv):
        # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which
        # would otherwise be glued onto the first URL.
        with open(args.in_csv, newline="", encoding="utf-8-sig") as f:
            rows = [row[0].strip() for row in csv.reader(f)
                    if row and row[0].strip() and not row[0].strip().startswith("#")]
        total = len(rows)
        for start in range(0, total, batch_size):
            batch = rows[start:start + batch_size]
            print(f"\nProcessing batch {start // batch_size + 1} ({len(batch)} items)…")
            for offset, link in enumerate(batch, start=1):
                idx = start + offset
                process_one(link, idx)
                if idx < total:
                    time.sleep(delay)  # polite delay between posts
            if start + batch_size < total:
                print(f"\nSleeping {batch_pause}s between batches to avoid rate limits…")
                time.sleep(batch_pause)
        return

    # Fallback interactive
    url = input("Paste a Reddit post URL: ").strip()
    process_one(url, 1)

# Entry point: run the CLI only when this code executes as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()


Processing batch 1 (25 items)…

[1] >>> https://www.reddit.com/r/u_Environmental_Sail68/comments/11nfo8t/jackandjill_peachiikitten_lilykawaii_dirtydallas1
Detected external link; logging instead of downloading.
Logged external link → external_links.csv
  https://www.reddit.com/r/u_Environmental_Sail68/comments/11nfo8t/jackandjill_peachiikitten_lilykawaii_dirtydallas1/

[2] >>> https://www.reddit.com/r/SluttyConfessions/comments/tfucs4/how_i_f18_went_from_innocent_college_virgin_to
Detected external link; logging instead of downloading.
Logged external link → external_links.csv
  https://www.reddit.com/r/SluttyConfessions/comments/tfucs4/how_i_f18_went_from_innocent_college_virgin_to/

[3] >>> https://www.reddit.com/r/SluttyConfessions/comments/tex5xg/how_i_f18_went_from_innocent_college_virgin_to
Detected external link; logging instead of downloading.
Logged external link → external_links.csv
  https://www.reddit.com/r/SluttyConfessions/comments/tex5xg/how_i_f18_went_from_innocent_col