In [1]:
import re
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

from SCRIPTS.redditLinkRetriever import fetch_saved_post_links, save_links_txt

In [2]:
# Dry-run switch: set to True to avoid making any changes.
# NOTE(review): none of the cells visible here read this flag — confirm it is honored downstream.
DRY_RUN = False

In [3]:
# Retrieve the saved-post links. No arguments are passed, so the account and credentials
# are presumably resolved inside fetch_saved_post_links — TODO confirm.
links = fetch_saved_post_links()

In [4]:
# Sanity check: show only the first five retrieved links.
# NOTE(review): the rendered output contains personal saved-post URLs — clear outputs before sharing.
links[:5]

['https://www.reddit.com/r/rape_hentai/comments/1oa90lu/going_to_the_beach_was_memorable_but_not_for_the',
 'https://www.reddit.com/r/bangmybully/comments/1oabnlx/your_mother_is_closer_to_your_bully_than_to_your',
 'https://www.reddit.com/r/IWantToBeHerHentai2/comments/1oaaxu4/ive_been_a_toy_and_nothing_else_for_those_men_and',
 'https://www.reddit.com/r/cumsluts/comments/1oa6chl/goals',
 'https://www.reddit.com/r/ABGHeavens/comments/1oaee68/amazing_abg_and_her_friend']

# NEW POST VALIDATION

This section checks the links retrieved from Reddit's saved posts against `ordered_posts.csv` and queues every post that is not yet recorded there.

In [5]:
# Load the CSV of previously recorded posts; it supplies the known post ids
# and the highest order number handed out so far.
csv_path = Path("ordered_posts.csv")
raw_df = pd.read_csv(csv_path)

# Post-id matchers (case-insensitive). The captured id must be followed by
# "/", "?", "#" or the end of the string.
POST_ID_RE = re.compile(r"/comments/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)  # .../comments/<postid>/...
SHORT_RE   = re.compile(r"redd\.it/([a-z0-9]+)(?:[/?#]|$)", re.IGNORECASE)  # redd.it/<postid>

# Highest order number already in use. `.max()` is NaN when the CSV has no
# usable order_num values, which would turn every new order number into NaN,
# so fall back to 0 and let numbering start at 1.
_highest_order = pd.to_numeric(raw_df["order_num"], errors="coerce").max()
max_order_num = 0 if pd.isna(_highest_order) else int(_highest_order)

def strip_trailing_slash(url: str) -> str:
    """Return ``url`` with every trailing ``/`` removed.

    Only the right-hand end of the string is trimmed; slashes anywhere else
    (such as the ``//`` after the scheme of a full URL) are left alone.
    """
    trimmed = url.rstrip("/")
    return trimmed

def extract_post_id(url: str) -> str | None:
    """
    Return the post id embedded in ``url``, or ``None`` when none is found.

    Patterns are tried in this order and the first hit wins:
      - standard permalink: .../comments/<postid>/...
      - shortlink: https://redd.it/<postid>

    The id is returned exactly as captured (its case is not normalized).
    """
    for pattern in (POST_ID_RE, SHORT_RE):
        found = pattern.search(url)
        if found is not None:
            return found.group(1)
    return None

# Column layout of a queued row. Passing it to the DataFrame constructor keeps
# these columns present even when no new posts are found.
NEW_ROW_COLUMNS = ["order_num", "link", "post_id", "date_added"]

# Post ids already recorded in the CSV, lower-cased for case-insensitive comparison.
# A CSV without a "post_id" column is treated as having nothing recorded yet.
existing_ids = {
    str(x).lower()
    for x in raw_df.get("post_id", pd.Series(dtype="object")).dropna()
}

new_rows = []
next_order = max_order_num + 1
seen_in_batch = set()  # avoid duplicates within this run

# One timestamp for the whole batch. datetime.utcnow() is deprecated, so take an
# aware UTC time and drop the tzinfo to keep the same offset-free ISO string format.
batch_timestamp = (
    datetime.now(timezone.utc).replace(tzinfo=None).isoformat(timespec="seconds")
)

# Walk the links in reverse so the last retrieved link gets the lowest new order number.
# NOTE(review): presumably the retriever returns newest-first — confirm.
for raw_link in reversed(links):
    link = strip_trailing_slash(raw_link)
    post_id = extract_post_id(link)
    if not post_id:
        # No recognizable post id in this link; nothing to record.
        continue
    pid = post_id.lower()

    # Only add if NOT already in CSV and not already queued this batch
    if pid in existing_ids or pid in seen_in_batch:
        continue

    new_rows.append({
        "order_num": next_order,
        "link": link,
        "post_id": post_id,
        "date_added": batch_timestamp,
    })
    seen_in_batch.add(pid)
    next_order += 1

# Preview as a DataFrame (explicit columns so an empty batch still exposes "link", etc.)
new_df = pd.DataFrame(new_rows, columns=NEW_ROW_COLUMNS)
new_df


  "date_added": datetime.utcnow().isoformat(timespec="seconds"),


Unnamed: 0,order_num,link,post_id,date_added
0,1285,https://www.reddit.com/r/ABGHeavens/comments/1...,1o7yp0r,2025-10-19T08:54:17
1,1286,https://www.reddit.com/r/bangmybully/comments/...,1o8ba01,2025-10-19T08:54:17
2,1287,https://www.reddit.com/r/ABGHeavens/comments/1...,1o97c82,2025-10-19T08:54:17
3,1288,https://www.reddit.com/r/bangmybully/comments/...,1o8bta5,2025-10-19T08:54:17
4,1289,https://www.reddit.com/r/ABGHeavens/comments/1...,1oa6x2t,2025-10-19T08:54:17
5,1290,https://www.reddit.com/r/cumsluts/comments/1o9...,1o9yw0m,2025-10-19T08:54:17
6,1291,https://www.reddit.com/r/bangmybully/comments/...,1o9tk5e,2025-10-19T08:54:17
7,1292,https://www.reddit.com/r/biosuits/comments/1o9...,1o9eszy,2025-10-19T08:54:17
8,1293,https://www.reddit.com/r/SluttyConfessions/com...,1o9oow5,2025-10-19T08:54:17
9,1294,https://www.reddit.com/r/slutsofsnapchat/comme...,1o9t13k,2025-10-19T08:54:17


In [26]:
import SCRIPTS.jsonDownloader as jd
import importlib
importlib.reload(jd)  # re-import so edits made to the module since the kernel started take effect

# NOTE(review): SKIP_EXISTING=False presumably forces re-processing, yet the recorded
# run output reports every post as skipped — confirm the option's meaning.
jd.configure(
    DATA_ROOT="Out",
    SKIP_EXISTING=False,
    REPORTS_DIR="Out/__reports"
    )

# Guard against a `new_df` without a "link" column (e.g. a frame built from an
# empty list of rows), where `new_df["link"]` would raise KeyError.
post_links = new_df["link"].tolist() if "link" in new_df.columns else []

# NOTE(review): DRY_RUN is not consulted here — confirm whether this step should be
# skipped in dry-run mode.
summary = jd.process_all(post_links, show_progress=True)
summary

  0%|          | 0/21 [00:00<?, ?post/s]

Done. Success: 0, Skipped: 21, Failed: 0


{'success': 0, 'skipped': 21, 'failed': 0}

# MEDIA DOWNLOADER