In [None]:
import importlib
import re
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

import SCRIPTS.jsonDownloader as jd
from SCRIPTS.redditLinkRetriever import fetch_saved_post_links, save_links_txt
from SCRIPTS.mediaDownloader import download_embedded_media
from SCRIPTS.mediaOrganizer import organize_downloads
from SCRIPTS.redgifDownloader import process_external
from SCRIPTS.cloudflareUploader import upload_media
from SCRIPTS.r2_audit import audit_local_vs_r2

In [None]:
# Dry-run switches: set a flag to True to preview that stage without making changes.
# Note: the embedded-media download cell (download_embedded_media) is not given any of these flags.
DRY_RUN_MEDIA = False       # passed as dry_run to process_external (RedGif download cell)
DRY_RUN_ORGANIZE = False    # passed as dry_run to organize_downloads
DRY_RUN_CLOUDFLARE = False  # passed as dry_run to upload_media
DRY_RUN_FINAL = False       # when True, the last cell does not rewrite ordered_posts.csv

In [None]:
# Retrieve the saved-post links that the validation cell below filters against ordered_posts.csv.
# NOTE(review): no user is passed here — presumably the account/credentials are
# configured inside SCRIPTS.redditLinkRetriever; confirm.
links = fetch_saved_post_links()

In [None]:
# Preview the first five retrieved links as a quick sanity check.
links[:5]

# NEW POST VALIDATION

This section validates new posts from Reddit's saved folder.

In [None]:
# Patterns that capture the post id from a link (case-insensitive):
#   - permalink:  .../comments/<postid>/...
#   - shortlink:  redd.it/<postid>
# The id must be followed by "/", "?", "#" or the end of the string.
POST_ID_RE = re.compile(r"/comments/([a-z0-9]+)(?:[/?#]|$)", flags=re.IGNORECASE)
SHORT_RE = re.compile(r"redd\.it/([a-z0-9]+)(?:[/?#]|$)", flags=re.IGNORECASE)

# Load the posts that have already been recorded.
csv_path = Path("ordered_posts.csv")
raw_df = pd.read_csv(csv_path)

# Highest order number in the CSV; new rows are numbered after it.
max_order_num = raw_df["order_num"].max()

def strip_trailing_slash(url: str) -> str:
    """Return ``url`` with every ``/`` at its very end removed.

    Slashes anywhere else in the string are left alone.
    """
    end = len(url)
    # Walk backwards past the run of trailing slashes, then cut there.
    while end > 0 and url[end - 1] == "/":
        end -= 1
    return url[:end]

def extract_post_id(url: str) -> str | None:
    """
    Pull the post id out of a link, or return None when none is found.

    Patterns are tried in this order; the first one that matches wins:
      - standard permalink: .../comments/<postid>/...
      - shortlink: https://redd.it/<postid>

    The id is returned exactly as it appears in the URL (case is not changed).
    """
    for pattern in (POST_ID_RE, SHORT_RE):
        found = pattern.search(url)
        if found is not None:
            return found.group(1)
    return None

# Post ids already recorded in the CSV, lower-cased so comparisons are case-insensitive.
# A CSV without a "post_id" column is treated as having no recorded ids.
existing_ids = set(
    str(x).lower()
    for x in raw_df.get("post_id", pd.Series([], dtype="object")).dropna()
)

new_rows = []
# Continue numbering after the highest existing order_num. max() is NaN when the
# CSV has no rows (and a float when the column contains NaNs), so normalise to a
# plain int and start at 1 for an empty CSV.
next_order = int(max_order_num) + 1 if pd.notna(max_order_num) else 1
seen_in_batch = set()  # avoid duplicates within this run

# Walk the links in reverse so the last link in `links` gets the lowest new order_num.
for raw_link in reversed(links):
    link = strip_trailing_slash(raw_link)
    post_id = extract_post_id(link)
    if not post_id:
        # No recognisable post id in this link: skip it.
        continue
    pid = post_id.lower()

    # Only add if NOT already in CSV and not already queued this batch
    if pid in existing_ids or pid in seen_in_batch:
        continue

    new_rows.append({
        "order_num": next_order,
        "link": link,
        "post_id": post_id,
        # Naive UTC timestamp, same text format datetime.utcnow() produced,
        # without relying on the deprecated utcnow().
        "date_added": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(timespec="seconds"),
    })
    seen_in_batch.add(pid)
    next_order += 1

# Preview as a DataFrame
new_df = pd.DataFrame(new_rows)
new_df


In [None]:
# Stack the new rows under the existing ones, then sort by descending order_num
# and renumber the index from 0.
final_df = (
    pd.concat([raw_df, new_df], ignore_index=True)
    .sort_values(by="order_num", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Reload the downloader module so recent edits to it are picked up in this kernel.
# (importlib is imported in the top import cell.)
importlib.reload(jd)

jd.configure(
    DATA_ROOT="Out",
    SKIP_EXISTING=False,
    REPORTS_DIR="__reports",
)

# When no new posts were found, new_df is built from an empty list and has no
# columns at all, so new_df["link"] would raise KeyError. Fall back to an empty list.
# NOTE(review): assumes jd.process_all accepts an empty list — confirm.
new_links = new_df["link"].tolist() if "link" in new_df.columns else []

summary = jd.process_all(new_links, show_progress=True)
summary

# MEDIA DOWNLOADER
Scans the `external` and `media` JSON folders in **Out/** and downloads:
- Images
- Gifs
- Videos

In [None]:
folders = ["external", "media"]
download_stats = []

# Run the downloader once per JSON folder and collect whatever each call returns.
for mediaType in folders:
    report_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    download_stats.append(download_embedded_media(
        media_json_dir=Path("Out") / mediaType,   # where the *.json files live
        media_out_dir=Path("Media/media_files"),  # where downloads should go
        # The folder name is part of the report name: the timestamp only has
        # one-second resolution, so two quick iterations would otherwise target
        # the same file and the second report could replace the first.
        write_fail_csv_to=Path("__reports") / f"media_report_{mediaType}_{report_stamp}.csv",
        show_progress=True,
    ))

download_stats

In [None]:
# Organize the downloaded files from the staging folder into the Media tree.
move_stats = organize_downloads(
    # source / destination
    input_dir="Media/media_files",   # folder the downloader wrote into
    output_dir="Media",              # parent of Images/, Videos/, Gifs/
    # how files are transferred and what happens on a name clash
    strategy="move",
    conflict="rename",
    prune_empty_galleries=True,      # drop source folders left empty after the move
    # run mode
    dry_run=DRY_RUN_ORGANIZE,        # True = preview only
    show_progress=True,
)

move_stats

# REDGIF DOWNLOADER

Downloads RedGIFs referenced in the external JSON folder in **Out/**.

In [None]:
# Timestamped report files for this stage (each name gets its own timestamp).
redgif_fail_csv = Path(f"__reports/redgif_report_{datetime.now():%Y%m%d-%H%M%S}.csv")
external_links_csv = Path(f"__reports/external_links{datetime.now():%Y%m%d-%H%M%S}.csv")

# Process the external JSON folder; downloads go to Media/RedGiphys.
stats = process_external(
    media_json_dir=Path("Out/external"),
    media_out_dir=Path("Media/RedGiphys"),
    write_fail_csv_to=redgif_fail_csv,
    write_links_csv_to=external_links_csv,
    show_progress=True,
    dry_run=DRY_RUN_MEDIA,          # True = preview only
    overwrite_downloads=False,
)

stats

# CLOUDFLARE VERIFICATION & UPLOAD

In [None]:
raw_output = []     # one upload_media result per media folder that succeeded
uploadsData = []    # flattened "planned" entries from every successful result
upload_errors = []  # (media folder, exception) for every folder that failed

for mediaType in ["Images", "Videos", "Gifs", "RedGiphys"]:
    try:
        result = upload_media(
            input_path=Path("Media/" + mediaType + "/"),  # folder with the media or galleries
            r2_prefix=mediaType,                          # one of: Gifs, Images, RedGiphys, Videos
            dry_run=DRY_RUN_CLOUDFLARE,                   # True = preview only, False = actually upload
            overwrite=False,                              # only overwrite existing files if True
        )

        raw_output.append(result)
        uploadsData.extend(result["planned"])

    except Exception as exc:
        # Still best-effort (the remaining folders are processed), but the failure
        # is recorded and reported instead of being silently dropped.
        upload_errors.append((mediaType, exc))
        print(f"[upload] {mediaType} failed: {exc!r}")
        continue

In [None]:
# Lift the pandas row limit so the full upload plan is displayed.
# This option stays in effect for the rest of the kernel session.
pd.set_option("display.max_rows", None)

# One row per entry collected from the upload results.
results_df = pd.DataFrame(uploadsData)
results_df

# VERIFY UPLOAD

In [None]:
# Timestamped CSV that the audit writes its findings to.
audit_report_csv = Path(f"__reports/r2_audit{datetime.now():%Y%m%d-%H%M%S}.csv")

# NOTE(review): local_root points at Media/Images only, while r2_prefixes lists all
# four media folders — confirm this is what audit_local_vs_r2 expects.
audit_results = audit_local_vs_r2(
    local_root=Path("Media/Images"),
    r2_prefixes=["Images", "RedGiphys", "Gifs", "Videos"],
    write_csv_to=audit_report_csv,
    show_progress=True,
)

audit_results["totals"]

In [None]:
# Persist the merged post list only on a real run AND when the audit reports nothing missing.
# The previous check tested whether a "missing" key existed at all; this one looks at
# its value, so a "missing" entry of 0 (or empty) no longer blocks the update.
# NOTE(review): assumes totals["missing"] is a count or collection that is falsy when
# nothing is missing — confirm against audit_local_vs_r2.
missing_in_r2 = audit_results["totals"].get("missing")

if DRY_RUN_FINAL:
    print(final_df.head(20))
    print("Dry run enabled; ordered_posts not changed")
elif missing_in_r2:
    print(final_df.head(20))
    print("R2 audit reports missing files; ordered_posts not changed")
else:
    print("Updating ordered_posts.csv with new posts...")
    final_df.to_csv("ordered_posts.csv", index=False)