In [2]:
import importlib
import re
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

import SCRIPTS.jsonDownloader as jd
from SCRIPTS.redditLinkRetriever import fetch_saved_post_links, save_links_txt
from SCRIPTS.mediaDownloader import download_embedded_media
from SCRIPTS.mediaOrganizer import organize_downloads
from SCRIPTS.redgifDownloader import process_external
from SCRIPTS.cloudflareUploader import upload_media

In [3]:
# Dry-run switches: leave a flag True to preview that stage without making changes.
DRY_RUN_MEDIA = True       # passed as dry_run to process_external
DRY_RUN_ORGANIZE = True    # passed as dry_run to organize_downloads
DRY_RUN_CLOUDFLARE = True  # passed as dry_run to upload_media
DRY_RUN_FINAL = True       # guards the rewrite of ordered_posts.csv

In [4]:
# Retrieve the saved-post links. The call takes no arguments here, so the
# account is presumably resolved inside SCRIPTS.redditLinkRetriever — TODO confirm.
links = fetch_saved_post_links()

In [5]:
# Quick sanity check: show the first five retrieved links.
# NOTE(review): this output lists the account's saved posts — consider clearing
# it before sharing or committing the notebook.
links[:5]

['https://www.reddit.com/r/slutsofsnapchat/comments/1ob41f5/cum_play_with_a_slutty_19yr_old_sexting_sextapes',
 'https://www.reddit.com/r/slutsofsnapchat/comments/1ob2xjm/telegram_creamykikii_cumslut_what_i_offer_gfe',
 'https://www.reddit.com/r/ABGHeavens/comments/1ob1yuu/best_abg',
 'https://www.reddit.com/r/HentaiBullying/comments/1o9n30u/they_use_me_to_make_money_and_their_own',
 'https://www.reddit.com/r/collegesluts/comments/1oaqcv4/freshman_with_biggest_tits_in_class']

# NEW POST VALIDATION

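This section compares the links retrieved from Reddit's saved posts against `ordered_posts.csv` and queues only the posts that are not already recorded there.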

In [6]:
# Load the ledger of posts that have already been recorded.
csv_path = Path("ordered_posts.csv")
raw_df = pd.read_csv(csv_path)

# Capture the post id from a permalink (.../comments/<id>/...) or a redd.it
# shortlink. The id must be followed by '/', '?', '#', or the end of the string.
POST_ID_RE = re.compile(r"/comments/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)
SHORT_RE   = re.compile(r"redd\.it/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)

# Highest order number used so far. On an empty (or all-NaN) column .max()
# returns NaN, which would propagate into every new order_num, so start from 0
# in that case. A missing "order_num" column still raises (KeyError).
_highest_order = raw_df["order_num"].max()
max_order_num = 0 if pd.isna(_highest_order) else int(_highest_order)

def strip_trailing_slash(url: str) -> str:
    """Return ``url`` with every trailing "/" removed; other characters are kept."""
    trimmed = url
    # Peel slashes off the end one at a time until none remain.
    while trimmed.endswith("/"):
        trimmed = trimmed[:-1]
    return trimmed

def extract_post_id(url: str) -> str | None:
    """
    Pull the post id out of a Reddit URL.

    The permalink pattern (.../comments/<postid>/...) is tried first, then the
    shortlink pattern (redd.it/<postid>). Returns the captured id exactly as
    it appears in ``url``, or None when neither pattern matches.
    """
    for pattern in (POST_ID_RE, SHORT_RE):
        found = pattern.search(url)
        if found:
            return found.group(1)
    return None

# Post ids already recorded in the CSV, lower-cased so the membership test is
# case-insensitive. The fallback Series gets an explicit dtype so pandas does
# not have to guess one for an empty Series.
existing_ids = {
    str(value).lower()
    for value in raw_df.get("post_id", pd.Series(dtype="object")).dropna()
}

# Column layout of the preview frame; passed explicitly so an empty batch still
# yields a frame with these columns (new_df["link"] is read in a later cell).
NEW_POST_COLUMNS = ["order_num", "link", "post_id", "date_added"]

# One timestamp for the whole batch. datetime.utcnow() is deprecated; this
# produces the same naive-UTC ISO string (no "+00:00" suffix), so the CSV
# format is unchanged.
batch_timestamp = (
    datetime.now(timezone.utc).replace(tzinfo=None).isoformat(timespec="seconds")
)

new_rows = []
next_order = max_order_num + 1
seen_in_batch = set()  # avoid duplicates within this run

# Walk the links back-to-front so the first entry in `links` ends up with the
# highest order number.
for raw_link in reversed(links):
    link = strip_trailing_slash(raw_link)
    post_id = extract_post_id(link)
    if not post_id:
        continue  # not a recognisable post URL
    pid = post_id.lower()

    # Only add if NOT already in the CSV and not already queued this batch.
    if pid in existing_ids or pid in seen_in_batch:
        continue

    new_rows.append({
        "order_num": next_order,
        "link": link,
        "post_id": post_id,
        "date_added": batch_timestamp,
    })
    seen_in_batch.add(pid)
    next_order += 1

# Preview as a DataFrame
new_df = pd.DataFrame(new_rows, columns=NEW_POST_COLUMNS)
new_df


  "date_added": datetime.utcnow().isoformat(timespec="seconds"),


Unnamed: 0,order_num,link,post_id,date_added
0,1286,https://www.reddit.com/r/bangmybully/comments/...,1o8ba01,2025-10-20T01:28:13
1,1287,https://www.reddit.com/r/ABGHeavens/comments/1...,1o97c82,2025-10-20T01:28:13
2,1288,https://www.reddit.com/r/bangmybully/comments/...,1o8bta5,2025-10-20T01:28:13
3,1289,https://www.reddit.com/r/ABGHeavens/comments/1...,1oa6x2t,2025-10-20T01:28:13
4,1290,https://www.reddit.com/r/cumsluts/comments/1o9...,1o9yw0m,2025-10-20T01:28:13
5,1291,https://www.reddit.com/r/bangmybully/comments/...,1o9tk5e,2025-10-20T01:28:13
6,1292,https://www.reddit.com/r/biosuits/comments/1o9...,1o9eszy,2025-10-20T01:28:13
7,1293,https://www.reddit.com/r/SluttyConfessions/com...,1o9oow5,2025-10-20T01:28:13
8,1294,https://www.reddit.com/r/slutsofsnapchat/comme...,1o9t13k,2025-10-20T01:28:13
9,1295,https://www.reddit.com/r/Rapekink/comments/1o9...,1o9fjdk,2025-10-20T01:28:13


In [15]:
# Append the new rows to the existing ledger, then order the result with the
# highest order_num first and a fresh 0..n-1 index.
final_df = (
    pd.concat([raw_df, new_df], ignore_index=True)
    .sort_values(by="order_num", ascending=False)
    .reset_index(drop=True)
)

In [8]:
# Reload the downloader module so the current version of SCRIPTS.jsonDownloader
# is used even if it was edited after the first import. (importlib is imported
# in the notebook's import cell.)
importlib.reload(jd)

# Point the downloader at the Out/ tree and its report folder.
jd.configure(
    DATA_ROOT="Out",
    SKIP_EXISTING=False,
    REPORTS_DIR="Out/__reports",
)

# NOTE(review): the recorded run reports every post as "skipped" even though
# SKIP_EXISTING=False — confirm what process_all counts as skipped.
summary = jd.process_all(new_df["link"].tolist(), show_progress=True)
summary

  0%|          | 0/26 [00:00<?, ?post/s]

Done. Success: 0, Skipped: 26, Failed: 0


{'success': 0, 'skipped': 26, 'failed': 0}

# MEDIA DOWNLOADER
Scans the `external` and `media` JSON folders in **Out/** and downloads:
- Images
- Gifs
- Videos

In [9]:
folders = ["external", "media"]

# One timestamp for the whole run. The folder name is added to each report
# filename because the stamp only has one-second resolution: without it, both
# iterations can produce the same path and collide.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")

# NOTE(review): DRY_RUN_MEDIA is not passed here, and the recorded run
# downloaded files while the flag was True — confirm whether
# download_embedded_media accepts a dry_run argument.
download_stats = []
for media_type in folders:
    download_stats.append(
        download_embedded_media(
            media_json_dir=Path("Out") / media_type,      # where the *.json files live
            media_out_dir=Path("Media/media_files"),      # where downloads should go
            write_fail_csv_to=Path("Media/__reports") / f"media_report_{media_type}_{run_stamp}.csv",
            show_progress=True,
        )
    )

download_stats

Downloading embedded media:   0%|          | 0/14 [00:00<?, ?post/s]

Downloading embedded media:   0%|          | 0/9 [00:00<?, ?post/s]

[{'downloaded': 0,
  'failed': 14,
  'skipped': 0,
  'fail_rows': [{'id': '1o85il3', 'reason': 'no_reddit_media_url'},
   {'id': '1o8ba01', 'reason': 'no_reddit_media_url'},
   {'id': '1o8bta5', 'reason': 'no_reddit_media_url'},
   {'id': '1o8r5ic', 'reason': 'no_reddit_media_url'},
   {'id': '1o9t13k', 'reason': 'no_reddit_media_url'},
   {'id': '1o9tk5e', 'reason': 'no_reddit_media_url'},
   {'id': '1o9yw0m', 'reason': 'no_reddit_media_url'},
   {'id': '1oa631j', 'reason': 'no_reddit_media_url'},
   {'id': '1oa6chl', 'reason': 'no_reddit_media_url'},
   {'id': '1oa90lu', 'reason': 'no_reddit_media_url'},
   {'id': '1oabnlx', 'reason': 'no_reddit_media_url'},
   {'id': '1oaqcv4', 'reason': 'no_reddit_media_url'},
   {'id': '1oaw7d3', 'reason': 'no_reddit_media_url'},
   {'id': '1ob2xjm', 'reason': 'no_reddit_media_url'}],
  'out_dir': WindowsPath('Media/media_files'),
  'json_dir': WindowsPath('Out/external')},
 {'downloaded': 57,
  'failed': 0,
  'skipped': 0,
  'fail_rows': [],
  'o

In [10]:
# Hand the files written to Media/media_files over to the organizer, which
# places them under Media/. Returns a stats dict that is displayed below.
move_stats = organize_downloads(
    input_dir="Media/media_files",  # where the downloader wrote files
    output_dir="Media",             # where Images/, Videos/, Gifs/ live
    strategy="move",
    conflict="rename",
    show_progress=True,
    dry_run=DRY_RUN_ORGANIZE,       # preview only while the flag is True
    prune_empty_galleries=True,     # remove empty src folders after moving
)

move_stats

Organizing media:   0%|          | 0/57 [00:00<?, ?file/s]

{'moved': 0,
 'copied': 0,
 'linked': 0,
 'skipped': 0,
 'unknown': 0,
 'dry_run': True,
 'strategy': 'move',
 'conflict': 'rename',
 'input_dir': 'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\media_files',
 'output_dir': 'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media',
 'errors': [],
 'created_dirs': set(),
 'pruned_dirs': []}

# REDGIF DOWNLOADER

Downloads Redgifs videos for the posts in the `external` JSON folder in **Out/**.

In [11]:
# Process the external-post JSONs in Out/external and download their Redgifs
# media into Media/RedGiphys; a timestamped failure report and a links CSV are
# written under Media/__reports.
# NOTE(review): the recorded dry run of this cell ended with
# "ValueError: dict contains fields not in fieldnames: 'planned_path'". It looks
# like the dry-run rows carry a 'planned_path' key that the CSV writer inside
# SCRIPTS.redgifDownloader does not list — fix it there; confirm.
stats = process_external(
    media_json_dir=Path("Out/external"),
    media_out_dir=Path("Media/RedGiphys"),
    write_fail_csv_to=Path("Media/__reports/redgif_report_" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv"),
    write_links_csv_to=Path("Media/__reports/external_links.csv"),
    show_progress=True,
    dry_run=DRY_RUN_MEDIA,  # preview only while the flag is True
    overwrite_downloads=False,
)

stats

Found 14 external post JSONs in Out\external


Scanning external posts:   0%|          | 0/14 [00:00<?, ?post/s]

Downloading Redgifs:   0%|          | 0/14 [00:00<?, ?file/s]

[DRY-RUN] would download id=1o85il3 -> 1o85il3.mp4
[DRY-RUN] would download id=1o8ba01 -> 1o8ba01.mp4
[DRY-RUN] would download id=1o8bta5 -> 1o8bta5.mp4
[DRY-RUN] would download id=1o8r5ic -> 1o8r5ic.mp4
[DRY-RUN] would download id=1o9t13k -> 1o9t13k.mp4
[DRY-RUN] would download id=1o9tk5e -> 1o9tk5e.mp4
[DRY-RUN] would download id=1o9yw0m -> 1o9yw0m.mp4
[DRY-RUN] would download id=1oa631j -> 1oa631j.mp4
[DRY-RUN] would download id=1oa6chl -> 1oa6chl.mp4
[DRY-RUN] would download id=1oa90lu -> 1oa90lu.mp4
[DRY-RUN] would download id=1oabnlx -> 1oabnlx.mp4
[DRY-RUN] would download id=1oaqcv4 -> 1oaqcv4.mp4
[DRY-RUN] would download id=1oaw7d3 -> 1oaw7d3.mp4
[DRY-RUN] would download id=1ob2xjm -> 1ob2xjm.mp4


ValueError: dict contains fields not in fieldnames: 'planned_path'

# CLOUDFLARE VERIFICATION & UPLOAD

In [None]:
raw_output = []     # full result dict returned for each media folder
uploadsData = []    # flattened "planned" entries across all folders
upload_errors = []  # (media type, exception) for every folder that failed

for mediaType in ["Images", "Videos", "Gifs", "RedGiphys"]:
    try:
        result = upload_media(
            input_path=Path("Media") / mediaType,  # folder with your media or galleries
            r2_prefix=mediaType,                   # one of: Gifs, Images, RedGiphys, Videos
            dry_run=DRY_RUN_CLOUDFLARE,            # True = preview only, False = actually upload
            overwrite=False,                       # only overwrite existing files if True
        )

        raw_output.append(result)
        uploadsData.extend(result["planned"])

    except Exception as exc:
        # Keep going with the remaining folders, but record and report the
        # failure instead of dropping it silently.
        upload_errors.append((mediaType, exc))
        print(f"[upload] {mediaType} failed: {exc!r}")
        continue

In [None]:
# Lift pandas' row-display cap so every planned upload is shown, then build
# the table from the collected entries.
pd.options.display.max_rows = None
results_df = pd.DataFrame(uploadsData)
results_df

Unnamed: 0,local,r2_key,bytes,content_type
0,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o9eszy.png,7138882,image/png
1,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o97c82/01.jpg,121746,image/jpeg
2,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o97c82/02.jpg,102415,image/jpeg
3,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o97c82/03.jpg,69554,image/jpeg
4,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o97c82/04.jpg,190367,image/jpeg
5,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o97c82/05.jpg,124637,image/jpeg
6,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o9n30u/01.jpg,59141,image/jpeg
7,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o9n30u/02.jpg,137872,image/jpeg
8,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o9n30u/03.jpg,73782,image/jpeg
9,S:\minds\Desktop\Downloader and Reddit System\...,Images/1o9n30u/04.jpg,82437,image/jpeg


In [None]:
# Final step: either preview the merged ledger or write it back to the same
# CSV it was loaded from (csv_path), so the read and write paths cannot drift.
if DRY_RUN_FINAL:
    print(final_df.head(20))
    print("Dry run enabled; ordered_posts not changed")
else:
    print(f"Updating {csv_path} with new posts...")
    final_df.to_csv(csv_path, index=False)

    order_num                                               link  post_id  \
0        1311  https://www.reddit.com/r/slutsofsnapchat/comme...  1ob41f5   
1        1310  https://www.reddit.com/r/slutsofsnapchat/comme...  1ob2xjm   
2        1309  https://www.reddit.com/r/ABGHeavens/comments/1...  1ob1yuu   
3        1308  https://www.reddit.com/r/HentaiBullying/commen...  1o9n30u   
4        1307  https://www.reddit.com/r/collegesluts/comments...  1oaqcv4   
5        1306  https://www.reddit.com/r/cumsluts/comments/1oa...  1oaw7d3   
6        1305  https://www.reddit.com/r/SluttyConfessions/com...  1oao2hs   
7        1304  https://www.reddit.com/r/ABGHeavens/comments/1...  1oawsjr   
8        1303  https://www.reddit.com/r/rape_hentai/comments/...  1oa90lu   
9        1302  https://www.reddit.com/r/bangmybully/comments/...  1oabnlx   
10       1301  https://www.reddit.com/r/cumsluts/comments/1oa...  1oa6chl   
11       1300  https://www.reddit.com/r/ABGHeavens/comments/1...  1oaee68   