In [1]:
import importlib
import re
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

import SCRIPTS.jsonDownloader as jd
from SCRIPTS.redditLinkRetriever import fetch_saved_post_links, save_links_txt
from SCRIPTS.mediaDownloader import download_embedded_media
from SCRIPTS.mediaOrganizer import organize_downloads
from SCRIPTS.redgifDownloader import process_external
from SCRIPTS.cloudflareUploader import upload_media
from SCRIPTS.r2_audit import audit_local_vs_r2

In [2]:
# Dry-run switches, one per pipeline stage. Set a flag to True to preview that
# stage without making any changes.
DRY_RUN_MEDIA = False  # passed as dry_run to process_external (Redgifs cell); NOTE: not passed to download_embedded_media
DRY_RUN_ORGANIZE = False  # passed as dry_run to organize_downloads
DRY_RUN_CLOUDFLARE = False  # passed as dry_run to upload_media
DRY_RUN_FINAL = False  # when True, the last cell does not rewrite ordered_posts.csv

In [3]:
# Retrieve the saved-post links. No user is passed here, so the account is
# presumably configured inside SCRIPTS.redditLinkRetriever -- TODO confirm.
links = fetch_saved_post_links()

In [4]:
# Peek at the first five links as a sanity check on the fetch.
links[:5]

['https://www.reddit.com/r/slutsofsnapchat/comments/1ob2xjm/telegram_creamykikii_cumslut_what_i_offer_gfe',
 'https://www.reddit.com/r/ABGHeavens/comments/1ob1yuu/best_abg',
 'https://www.reddit.com/r/HentaiBullying/comments/1o9n30u/they_use_me_to_make_money_and_their_own',
 'https://www.reddit.com/r/collegesluts/comments/1oaqcv4/freshman_with_biggest_tits_in_class',
 'https://www.reddit.com/r/cumsluts/comments/1oaw7d3/daddys_good_girl_had_made_him_wait']

# NEW POST VALIDATION

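This section validates new posts from Reddit's Saved folder: it extracts each post ID and keeps only the posts that are not already recorded in `ordered_posts.csv`.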

In [5]:
# Post id inside a standard permalink: .../comments/<postid>/...
POST_ID_RE = re.compile(r"/comments/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)
# Post id inside a shortlink: redd.it/<postid>
SHORT_RE = re.compile(r"redd\.it/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)

# Ledger of posts recorded by earlier runs.
csv_path = Path("ordered_posts.csv")
raw_df = pd.read_csv(csv_path)

# Highest order number used so far; new posts are numbered after it.
max_order_num = raw_df.order_num.max()

def strip_trailing_slash(url: str) -> str:
    """Return ``url`` with every ``/`` at its very end removed.

    Only the tail of the string is trimmed; slashes elsewhere are left alone.
    """
    trimmed = url
    while trimmed.endswith("/"):
        trimmed = trimmed[:-1]
    return trimmed

def extract_post_id(url: str) -> str | None:
    """
    Pull the post id out of a Reddit URL.

    The patterns are tried in order and the first hit wins:
      - POST_ID_RE, the standard permalink: .../comments/<postid>/...
      - SHORT_RE, the shortlink: https://redd.it/<postid>

    Returns the captured id exactly as it appears in ``url``, or None when
    neither pattern matches.
    """
    for pattern in (POST_ID_RE, SHORT_RE):
        found = pattern.search(url)
        if found is not None:
            return found.group(1)
    return None

# Post ids already recorded in the CSV, lower-cased so the membership test
# below is case-insensitive. The fallback Series (used when the CSV has no
# "post_id" column) is given an explicit dtype instead of relying on the
# default for an empty Series.
existing_ids = {
    str(value).lower()
    for value in raw_df.get("post_id", pd.Series([], dtype="object")).dropna()
}

# max() of an empty order_num column is NaN, and NaN + 1 would number every
# new row NaN; start at 1 in that case.
next_order = 1 if pd.isna(max_order_num) else int(max_order_num) + 1

# One timestamp for the whole batch. datetime.utcnow() is deprecated, so build
# the same naive-UTC "YYYY-MM-DDTHH:MM:SS" string from an aware datetime.
batch_timestamp = (
    datetime.now(timezone.utc).replace(tzinfo=None).isoformat(timespec="seconds")
)

new_rows = []
seen_in_batch = set()  # avoid duplicates within this run

# Walk the fetched links in reverse so the last fetched link gets the lowest
# new order number.
for raw_link in reversed(links):
    link = strip_trailing_slash(raw_link)
    post_id = extract_post_id(link)
    if not post_id:
        # Not a recognisable post URL; nothing to record.
        continue
    pid = post_id.lower()

    # Only add if NOT already in CSV and not already queued this batch
    if pid in existing_ids or pid in seen_in_batch:
        continue

    new_rows.append({
        "order_num": next_order,
        "link": link,
        "post_id": post_id,
        "date_added": batch_timestamp,
    })
    seen_in_batch.add(pid)
    next_order += 1

# Preview as a DataFrame
new_df = pd.DataFrame(new_rows)
new_df


  "date_added": datetime.utcnow().isoformat(timespec="seconds"),


Unnamed: 0,order_num,link,post_id,date_added
0,1281,https://www.reddit.com/r/IWantToBeHerHentai2/c...,1o6f8je,2025-10-20T02:24:50
1,1282,https://www.reddit.com/r/abelladanger/comments...,1jhxtty,2025-10-20T02:24:50
2,1283,https://www.reddit.com/r/HentaiAndRoleplayy/co...,1981et9,2025-10-20T02:24:50
3,1284,https://www.reddit.com/r/ABGHeavens/comments/1...,1o7yp0r,2025-10-20T02:24:50
4,1285,https://www.reddit.com/r/bangmybully/comments/...,1o8ba01,2025-10-20T02:24:50
5,1286,https://www.reddit.com/r/ABGHeavens/comments/1...,1o97c82,2025-10-20T02:24:50
6,1287,https://www.reddit.com/r/bangmybully/comments/...,1o8bta5,2025-10-20T02:24:50
7,1288,https://www.reddit.com/r/ABGHeavens/comments/1...,1oa6x2t,2025-10-20T02:24:50
8,1289,https://www.reddit.com/r/cumsluts/comments/1o9...,1o9yw0m,2025-10-20T02:24:50
9,1290,https://www.reddit.com/r/bangmybully/comments/...,1o9tk5e,2025-10-20T02:24:50


In [6]:
# Append the new posts to the existing ledger and order it by descending
# order_num, with a fresh 0..n-1 index.
final_df = (
    pd.concat([raw_df, new_df], ignore_index=True)
    .sort_values(by="order_num", ascending=False)
    .reset_index(drop=True)
)

In [7]:
# Re-import the downloader module so edits made to it since the kernel started
# take effect without a restart.
importlib.reload(jd)

jd.configure(
    DATA_ROOT="Out",
    SKIP_EXISTING=False,
    REPORTS_DIR="__reports",
)

# When no new posts were found, new_df is built from an empty list and has no
# "link" column; indexing it would raise KeyError. Fall back to an empty list.
new_links = new_df["link"].tolist() if "link" in new_df.columns else []

summary = jd.process_all(new_links, show_progress=True)
summary

  0%|          | 0/29 [00:00<?, ?post/s]

Done. Success: 29, Skipped: 0, Failed: 0


{'success': 29, 'skipped': 0, 'failed': 0}

# MEDIA DOWNLOADER
Scans the `external` and `media` JSON folders in **Out/** and downloads the media they reference:
- Images
- Gifs
- Videos

In [8]:
folders = ["external", "media"]
download_stats = []

# One timestamp for the whole cell. Each pass also puts its folder name in the
# report file name: previously both passes wrote to "media_report<timestamp>.csv",
# so two passes starting within the same second resolved to the same file.
media_report_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")

# NOTE(review): DRY_RUN_MEDIA is not passed to download_embedded_media here;
# confirm whether that function supports a dry-run mode.
for json_folder in folders:
    folder_stats = download_embedded_media(
        media_json_dir=Path("Out") / json_folder,  # where the *.json files live
        media_out_dir=Path("Media/media_files"),   # where downloads should go
        write_fail_csv_to=Path("__reports") / f"media_report_{json_folder}_{media_report_stamp}.csv",
        show_progress=True,
    )
    download_stats.append(folder_stats)

download_stats

Downloading embedded media:   0%|          | 0/15 [00:00<?, ?post/s]

Downloading embedded media:   0%|          | 0/11 [00:00<?, ?post/s]

[{'downloaded': 0,
  'failed': 15,
  'skipped': 0,
  'fail_rows': [{'id': '1jhxtty', 'reason': 'no_reddit_media_url'},
   {'id': '1o85il3', 'reason': 'no_reddit_media_url'},
   {'id': '1o8ba01', 'reason': 'no_reddit_media_url'},
   {'id': '1o8bta5', 'reason': 'no_reddit_media_url'},
   {'id': '1o8r5ic', 'reason': 'no_reddit_media_url'},
   {'id': '1o9t13k', 'reason': 'no_reddit_media_url'},
   {'id': '1o9tk5e', 'reason': 'no_reddit_media_url'},
   {'id': '1o9yw0m', 'reason': 'no_reddit_media_url'},
   {'id': '1oa631j', 'reason': 'no_reddit_media_url'},
   {'id': '1oa6chl', 'reason': 'no_reddit_media_url'},
   {'id': '1oa90lu', 'reason': 'no_reddit_media_url'},
   {'id': '1oabnlx', 'reason': 'no_reddit_media_url'},
   {'id': '1oaqcv4', 'reason': 'no_reddit_media_url'},
   {'id': '1oaw7d3', 'reason': 'no_reddit_media_url'},
   {'id': '1ob2xjm', 'reason': 'no_reddit_media_url'}],
  'out_dir': WindowsPath('Media/media_files'),
  'json_dir': WindowsPath('Out/external')},
 {'downloaded': 76,

In [9]:
# Sort the freshly downloaded files from the staging folder into the Media tree.
move_stats = organize_downloads(
    input_dir="Media/media_files",   # staging folder the downloader wrote into
    output_dir="Media",              # root holding Images/, Videos/, Gifs/
    dry_run=DRY_RUN_ORGANIZE,        # True previews only (toggle in the config cell)
    strategy="move",
    conflict="rename",
    prune_empty_galleries=True,      # drop source folders left empty after moving
    show_progress=True,
)

move_stats

Organizing media:   0%|          | 0/76 [00:00<?, ?file/s]

{'moved': 76,
 'copied': 0,
 'linked': 0,
 'skipped': 0,
 'unknown': 0,
 'dry_run': False,
 'strategy': 'move',
 'conflict': 'rename',
 'input_dir': 'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\media_files',
 'output_dir': 'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media',
 'errors': [],
 'created_dirs': {'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\Images',
  'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\Images\\1981et9',
  'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\Images\\1o7yp0r',
  'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\Images\\1o97c82',
  'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\Images\\1o9n30u',
  'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\Images\\1oa6s16',
  'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\Images\\1oa6x2t',
  'S:\\mind

# REDGIF DOWNLOADER

Downloads the Redgifs videos linked from the `external` JSON folder in **Out/**.

In [10]:
# Take the timestamp once so the failure report and the links report written
# by this run always carry the same suffix. Two separate datetime.now() calls
# could straddle a second boundary and produce mismatched names.
redgif_report_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")

stats = process_external(
    media_json_dir=Path("Out/external"),
    media_out_dir=Path("Media/RedGiphys"),
    write_fail_csv_to=Path("__reports/redgif_report_" + redgif_report_stamp + ".csv"),
    write_links_csv_to=Path("__reports/external_links" + redgif_report_stamp + ".csv"),
    show_progress=True,
    dry_run=DRY_RUN_MEDIA,
    overwrite_downloads=False,
)

stats

Found 15 external post JSONs in Out\external


Scanning external posts:   0%|          | 0/15 [00:00<?, ?post/s]

Downloading Redgifs:   0%|          | 0/15 [00:00<?, ?file/s]

[REDGIFS] id=1jhxtty -> 1jhxtty.mp4
[REDGIFS] id=1o85il3 -> 1o85il3.mp4
[REDGIFS] id=1o8ba01 -> 1o8ba01.mp4
[REDGIFS] id=1o8bta5 -> 1o8bta5.mp4
[REDGIFS] id=1o8r5ic -> 1o8r5ic.mp4
[REDGIFS] id=1o9t13k -> 1o9t13k.mp4
[REDGIFS] id=1o9tk5e -> 1o9tk5e.mp4
[REDGIFS] id=1o9yw0m -> 1o9yw0m.mp4
[REDGIFS] id=1oa631j -> 1oa631j.mp4
[REDGIFS] id=1oa6chl -> 1oa6chl.mp4
[REDGIFS] id=1oa90lu -> 1oa90lu.mp4
[REDGIFS] id=1oabnlx -> 1oabnlx.mp4
[REDGIFS] id=1oaqcv4 -> 1oaqcv4.mp4
[REDGIFS] id=1oaw7d3 -> 1oaw7d3.mp4
[REDGIFS] id=1ob2xjm -> 1ob2xjm.mp4
Saved external links to: S:\minds\Desktop\Downloader and Reddit System\Saved-Reddit\__reports\external_links20251019-192514.csv


{'external_rows': [{'id': '1jhxtty',
   'link': 'https://www.redgifs.com/watch/indigocraftyhousefly',
   'domain': 'www.redgifs.com'},
  {'id': '1o85il3',
   'link': 'https://www.redgifs.com/watch/sandybrownthosesealion',
   'domain': 'www.redgifs.com'},
  {'id': '1o8ba01',
   'link': 'https://www.redgifs.com/watch/barespitefuljavalina',
   'domain': 'www.redgifs.com'},
  {'id': '1o8bta5',
   'link': 'https://www.redgifs.com/watch/antiquewhitenuttyanemone',
   'domain': 'www.redgifs.com'},
  {'id': '1o8r5ic',
   'link': 'https://redgifs.com/watch/grossambitiouscreature',
   'domain': 'redgifs.com'},
  {'id': '1o9t13k',
   'link': 'https://www.redgifs.com/watch/impressivelinenbasil',
   'domain': 'www.redgifs.com'},
  {'id': '1o9tk5e',
   'link': 'https://www.redgifs.com/watch/greenmedicalqueenbee',
   'domain': 'www.redgifs.com'},
  {'id': '1o9yw0m',
   'link': 'https://v3.redgifs.com/watch/hardtofindlightskyblueelkhound',
   'domain': 'v3.redgifs.com'},
  {'id': '1oa631j',
   'link': 

# CLOUDFLARE VERIFICATION & UPLOAD

In [11]:
raw_output = []     # full result returned by upload_media for each folder
uploadsData = []    # flattened "planned" entries across all folders
upload_errors = []  # one record per folder whose upload raised

for mediaType in ["Images", "Videos", "Gifs", "RedGiphys"]:
    try:
        result = upload_media(
            input_path=Path("Media/" + mediaType + "/"),  # folder with your media or galleries
            r2_prefix=mediaType,                          # one of: Gifs, Images, RedGiphys, Videos
            dry_run=DRY_RUN_CLOUDFLARE,                   # True = preview only, False = actually upload
            overwrite=False,                              # only overwrite existing files if True
        )

        raw_output.append(result)
        uploadsData.extend(result["planned"])

    except Exception as exc:
        # Still best-effort (one failing folder must not stop the others), but
        # the failure is recorded and reported instead of silently dropped.
        upload_errors.append({"media_type": mediaType, "error": repr(exc)})
        print(f"[upload] {mediaType} failed: {exc!r}")
        continue

if upload_errors:
    print(f"[upload] {len(upload_errors)} folder(s) failed; see upload_errors for details")

In [12]:
# Lift pandas' row cap so the complete upload plan is rendered below.
# This is a global display option and stays in effect for later cells.
pd.set_option("display.max_rows", None)

# One row per planned upload collected by the upload cell.
results_df = pd.DataFrame(uploadsData)
results_df

Unnamed: 0,local,r2_key,bytes,content_type
0,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o6f8je.jpeg,173914,image/jpeg
1,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o9eszy.png,7138882,image/png
2,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/01.jpg,90388,image/jpeg
3,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/02.jpg,619484,image/jpeg
4,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/03.jpg,733750,image/jpeg
5,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/04.jpg,675840,image/jpeg
6,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/05.jpg,166419,image/jpeg
7,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/06.jpg,135279,image/jpeg
8,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/07.jpg,109472,image/jpeg
9,S:\minds\Desktop\Downloader and Reddit System\...,Images/1981et9/08.jpg,96953,image/jpeg


# VERIFY UPLOAD

In [13]:
# Timestamped CSV that receives the audit report.
audit_report_path = Path("__reports/r2_audit" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv")

# NOTE(review): local_root points only at Media/Images while r2_prefixes lists
# all four upload folders -- confirm that Videos, Gifs and RedGiphys are
# actually covered by this audit.
audit_results = audit_local_vs_r2(
    local_root=Path("Media/Images"),
    r2_prefixes=["Images", "RedGiphys", "Gifs", "Videos"],
    write_csv_to=audit_report_path,
    show_progress=True,
)

audit_results["totals"]

Auditing: 100%|██████████| 76/76 [00:00<?, ?file/s]


Counter({'exact': 76})

In [14]:
# Number of files the audit flagged as missing. Checking the count rather than
# key presence means a "missing" entry of 0 does not block the update.
# NOTE(review): only the "missing" category gates the write; confirm whether
# other audit categories should block it too.
missing_count = audit_results["totals"].get("missing", 0)

if DRY_RUN_FINAL:
    print(final_df.head(20))
    print("Dry run enabled; ordered_posts not changed")
elif missing_count:
    # Report the real reason instead of claiming a dry run.
    print(final_df.head(20))
    print(f"Audit reported {missing_count} missing file(s); ordered_posts not changed")
else:
    print("Updating ordered_posts.csv with new posts...")
    # csv_path is the same ordered_posts.csv the ledger was loaded from.
    final_df.to_csv(csv_path, index=False)

Updating ordered_posts.csv with new posts...
