In [1]:
import re
import pandas as pd
from pathlib import Path
from datetime import datetime

import SCRIPTS.jsonDownloader as jd
from SCRIPTS.redditLinkRetriever import fetch_saved_post_links, save_links_txt
from SCRIPTS.mediaDownloader import download_embedded_media
from SCRIPTS.mediaOrganizer import organize_downloads
from SCRIPTS.redgifDownloader import process_external
from SCRIPTS.cloudflareUploader import upload_media
from SCRIPTS.r2_audit import audit_local_vs_r2

In [2]:
# Set to True to avoid making any changes
DRY_RUN_MEDIA = False
DRY_RUN_ORGANIZE = False
DRY_RUN_CLOUDFLARE = False
DRY_RUN_FINAL = False

In [3]:
# Retrieve saved post links for the specified user
links = fetch_saved_post_links()

In [4]:
len(links)

130

In [5]:
links[:5]

['https://www.reddit.com/r/IWantToBeHerHentai2/comments/1oew9bz/i_want_strangers_to_find_ways_to_use_me_even_if',
 'https://www.reddit.com/r/cumsluts/comments/1oewrsc/semen_demon',
 'https://www.reddit.com/r/rapefantasies/comments/1of6xb8/my_dad_friend_always_comes_over_whenever_im_home',
 'https://www.reddit.com/r/slutsofsnapchat/comments/1of5cy7/say_hey_if_u_are_straight_21_and_ur_boner_is_3',
 'https://www.reddit.com/r/IWantToBeHerHentai2/comments/1oev893/ugghh_everything_about_this_image_just']

# NEW POST VALIDATION

This section validates new posts from reddits saved folder

In [6]:
csv_path = Path("ordered_posts.csv")
raw_df = pd.read_csv(csv_path)

POST_ID_RE = re.compile(r"/comments/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)
SHORT_RE   = re.compile(r"redd\.it/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)
max_order_num = raw_df.order_num.max()

def strip_trailing_slash(url: str) -> str:
    # remove trailing slashes only at the very end (doesn't touch scheme)
    return url.rstrip("/")

def extract_post_id(url: str) -> str | None:
    """
    Try to extract a post id from:
      - standard permalink: .../comments/<postid>/...
      - shortlink: https://redd.it/<postid>
    """
    m = POST_ID_RE.search(url)
    if m:
        return m.group(1)
    m = SHORT_RE.search(url)
    if m:
        return m.group(1)
    return None

existing_ids = set(str(x).lower() for x in raw_df.get("post_id", pd.Series([])).dropna())

new_rows = []
next_order = max_order_num + 1
seen_in_batch = set()  # avoid duplicates within this run

for raw_link in reversed(links):
    link = strip_trailing_slash(raw_link)
    post_id = extract_post_id(link)
    if not post_id:
        continue
    pid = post_id.lower()

    # Only add if NOT already in CSV and not already queued this batch
    if pid in existing_ids or pid in seen_in_batch:
        continue

    new_rows.append({
        "order_num": next_order,
        "link": link,
        "post_id": post_id,
        "date_added": datetime.utcnow().isoformat(timespec="seconds"),
    })
    seen_in_batch.add(pid)
    next_order += 1

# Preview as a DataFrame
new_df = pd.DataFrame(new_rows)
new_df


  "date_added": datetime.utcnow().isoformat(timespec="seconds"),


Unnamed: 0,order_num,link,post_id,date_added
0,1330,https://www.reddit.com/r/cumsluts/comments/1oe...,1oetkrb,2025-10-24T21:33:02
1,1331,https://www.reddit.com/r/IWantToBeHerHentai2/c...,1oev893,2025-10-24T21:33:02
2,1332,https://www.reddit.com/r/slutsofsnapchat/comme...,1of5cy7,2025-10-24T21:33:02
3,1333,https://www.reddit.com/r/rapefantasies/comment...,1of6xb8,2025-10-24T21:33:02
4,1334,https://www.reddit.com/r/cumsluts/comments/1oe...,1oewrsc,2025-10-24T21:33:02
5,1335,https://www.reddit.com/r/IWantToBeHerHentai2/c...,1oew9bz,2025-10-24T21:33:02


In [7]:
final_df = pd.concat([raw_df, new_df], ignore_index=True)
final_df = final_df.sort_values(by="order_num", ascending=False).reset_index(drop=True)

In [8]:
import importlib
importlib.reload(jd)

jd.configure(
    DATA_ROOT="Out",
    SKIP_EXISTING=False,
    REPORTS_DIR="__reports"
    )

summary = jd.process_all(new_df["link"].tolist(), show_progress=True)
summary

  0%|          | 0/6 [00:00<?, ?post/s]

Done. Success: 6, Skipped: 0, Failed: 0


{'success': 6, 'skipped': 0, 'failed': 0}

# MEDIA DOWNLOADER
Reviews the external and media json folders in **Out/**, downloading:
- Images
- Gifs
- Videos

In [9]:
folders = ["external", "media"]
download_stats = []

# point to your inputs/outputs explicitly
for mediaType in folders:
    download_stats.append(download_embedded_media(
        media_json_dir=Path("Out/" + mediaType),   # where your *.json live
        media_out_dir=Path("Media/media_files"),  # where downloads should go
        write_fail_csv_to=Path("__reports/media_report" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv"),
        show_progress=True,
    ))

download_stats

Downloading embedded media:   0%|          | 0/12 [00:00<?, ?post/s]

Downloading embedded media:   0%|          | 0/4 [00:00<?, ?post/s]

[{'downloaded': 0,
  'failed': 12,
  'skipped': 0,
  'fail_rows': [{'id': '16q5pwe', 'reason': 'no_reddit_media_url'},
   {'id': '17mku0s', 'reason': 'no_reddit_media_url'},
   {'id': '1odgtl4', 'reason': 'no_reddit_media_url'},
   {'id': '1oe3txz', 'reason': 'no_reddit_media_url'},
   {'id': '1oefmkz', 'reason': 'no_reddit_media_url'},
   {'id': '1oehbw8', 'reason': 'no_reddit_media_url'},
   {'id': '1oeilc5', 'reason': 'no_reddit_media_url'},
   {'id': '1oetkrb', 'reason': 'no_reddit_media_url'},
   {'id': '1oew9bz', 'reason': 'no_reddit_media_url'},
   {'id': '1oewrsc', 'reason': 'no_reddit_media_url'},
   {'id': '1of5cy7', 'reason': 'no_reddit_media_url'},
   {'id': '1of6xb8', 'reason': 'no_reddit_media_url'}],
  'out_dir': WindowsPath('Media/media_files'),
  'json_dir': WindowsPath('Out/external')},
 {'downloaded': 19,
  'failed': 0,
  'skipped': 0,
  'fail_rows': [],
  'out_dir': WindowsPath('Media/media_files'),
  'json_dir': WindowsPath('Out/media')}]

In [10]:
move_stats = organize_downloads(
    input_dir="Media/media_files",  # where your downloader wrote files
    output_dir="Media",             # where Images/, Videos/, Gifs/ live
    strategy="move",
    conflict="rename",
    show_progress=True,
    dry_run=DRY_RUN_ORGANIZE,                  # set True to preview
    prune_empty_galleries=True,     # remove empty src folders after moving
)

move_stats

Organizing media:   0%|          | 0/19 [00:00<?, ?file/s]

{'moved': 19,
 'copied': 0,
 'linked': 0,
 'skipped': 0,
 'unknown': 0,
 'dry_run': False,
 'strategy': 'move',
 'conflict': 'rename',
 'input_dir': 'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\media_files',
 'output_dir': 'S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media',
 'errors': [],
 'created_dirs': set(),
 'pruned_dirs': ['S:\\minds\\Desktop\\Downloader and Reddit System\\Saved-Reddit\\Media\\media_files\\1oebbrm']}

# REDGIF DOWNLOADER

Downloads redgifs from external json folder in **Out/**

In [11]:
stats = process_external(
    media_json_dir=Path("Out/external"),
    media_out_dir=Path("Media/RedGiphys"),
    write_fail_csv_to=Path("__reports/redgif_report_" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv"),
    write_links_csv_to=Path("__reports/external_links" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv"),
    show_progress=True,
    dry_run=DRY_RUN_MEDIA,
    overwrite_downloads=False,
)

stats

Found 12 external post JSONs in Out\external


Scanning external posts:   0%|          | 0/12 [00:00<?, ?post/s]

Downloading Redgifs:   0%|          | 0/12 [00:00<?, ?file/s]

[REDGIFS] id=1oetkrb -> 1oetkrb.mp4
[REDGIFS] id=1oew9bz -> 1oew9bz.mp4
[REDGIFS] id=1oewrsc -> 1oewrsc.mp4
[REDGIFS] id=1of5cy7 -> 1of5cy7.mp4
[REDGIFS] id=1of6xb8 -> 1of6xb8.mp4
Saved external links to: S:\minds\Desktop\Downloader and Reddit System\Saved-Reddit\__reports\external_links20251024-143311.csv
Saved Redgifs failures to: S:\minds\Desktop\Downloader and Reddit System\Saved-Reddit\__reports\redgif_report_20251024-143311.csv


{'external_rows': [{'id': '16q5pwe',
   'link': 'https://www.redgifs.com/watch/midnightblueverifiablehoneybadger',
   'domain': 'www.redgifs.com'},
  {'id': '17mku0s',
   'link': 'https://v3.redgifs.com/watch/clearradiantimago',
   'domain': 'v3.redgifs.com'},
  {'id': '1odgtl4',
   'link': 'https://www.redgifs.com/watch/goldpoorverdin',
   'domain': 'www.redgifs.com'},
  {'id': '1oe3txz',
   'link': 'https://redgifs.com/watch/awkwardmediumseagreenduckling',
   'domain': 'redgifs.com'},
  {'id': '1oefmkz',
   'link': 'https://v3.redgifs.com/watch/769309872419882097',
   'domain': 'v3.redgifs.com'},
  {'id': '1oehbw8',
   'link': 'https://www.redgifs.com/watch/jealouswelcomegilamonster',
   'domain': 'www.redgifs.com'},
  {'id': '1oeilc5',
   'link': 'https://www.redgifs.com/watch/blueseparateswallow',
   'domain': 'www.redgifs.com'},
  {'id': '1oetkrb',
   'link': 'https://www.redgifs.com/watch/deafeninguniformcockatiel',
   'domain': 'www.redgifs.com'},
  {'id': '1oew9bz',
   'link': 

# CLOUDFLARE VERIFICATION & UPLOAD

In [12]:
raw_output = []
uploadsData = []

for mediaType in ["Images", "Videos", "Gifs", "RedGiphys"]:
    try:
        result = upload_media(
            input_path=Path("Media/" + mediaType + "/"),  # folder with your media or galleries
            r2_prefix=mediaType,                # one of: Gifs, Images, RedGiphys, Videos
            dry_run=DRY_RUN_CLOUDFLARE,                      # True = preview only, False = actually upload
            overwrite=False                    # only overwrite existing files if True
        )

        raw_output.append(result)
        uploadsData.extend(result["planned"])
        
    except Exception as e:
        continue

In [13]:
results_df = pd.DataFrame(uploadsData)
pd.set_option('display.max_rows', None)
results_df

Unnamed: 0,local,r2_key,bytes,content_type
0,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oedqt7.jpeg,2234365,image/jpeg
1,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oedqt7_1.jpeg,2234365,image/jpeg
2,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oefa6g.jpeg,143237,image/jpeg
3,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oefa6g_1.jpeg,143237,image/jpeg
4,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oev893.png,341785,image/png
5,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oebbrm/01.png,754180,image/png
6,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oebbrm/01_1.png,754180,image/png
7,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oebbrm/02.jpg,261552,image/jpeg
8,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oebbrm/02_1.jpg,261552,image/jpeg
9,S:\minds\Desktop\Downloader and Reddit System\...,Images/1oebbrm/03.jpg,140004,image/jpeg


# VERIFY UPLOAD

In [14]:
audit_results = audit_local_vs_r2(
    local_root=Path("Media/Images"),          # where your images/galleries are
    r2_prefixes=["Images", "RedGiphys", "Gifs", "Videos"],
    write_csv_to=Path("__reports/r2_audit" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv"),
    show_progress=True
)

audit_results["totals"]

Auditing: 100%|██████████| 35/35 [00:00<00:00, 35002.54file/s]


Counter({'exact': 35})

In [15]:
if DRY_RUN_FINAL or ("missing" in audit_results["totals"].keys()):
    print(final_df.head(20))
    print("Dry run enabled; ordered_posts not changed")
else:
    print("Updating ordered_posts.csv with new posts...")
    final_df.to_csv("ordered_posts.csv", index=False)

Updating ordered_posts.csv with new posts...
