In [5]:
# Smoke test: confirm every dependency used by this notebook is importable.
import openai
import pandas as pd
import requests
import bs4
import readability
import lxml
import praw
import os

print("OK")

OK


In [6]:
# Load .env, then report which credential groups are present.
# Only booleans are printed, never the secret values themselves.
import os

from dotenv import load_dotenv

load_dotenv()

for label, var_names in (
    ("OPENAI", ("OPENAI_API_KEY",)),
    ("SERPAPI", ("SERPAPI_API_KEY",)),
    ("REDDIT", ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT")),
):
    # A group counts as configured only when every one of its variables is non-empty.
    print(f"{label}:", all(os.getenv(var_name) for var_name in var_names))


OPENAI: True
SERPAPI: True
REDDIT: True


In [10]:
# Configuration

# --- config ---
TOPIC = "Private-party used-car buying in SF Bay Area"
# Other candidate subreddits (currently disabled):
#   "MechanicAdvice", "WhatCarShouldIBuy", "cars", "Scams", "AskCarsales"
SUBREDDITS = ["UsedCars"]
QUERY_STRINGS = [
    "Buying a used car",
    # Additional queries (currently disabled):
    # "private party payment cashier check",
    # "private party title lien transfer",
    # "private party escrow payment",
    # "inspection OBD checklist private sale",
    # "wire vs cash private sale",
]


SINCE_DAYS = 730        # recency window in days (~24 months)
THREADS_PER_QUERY = 4   # search results requested per subreddit per query; keep small for first run
TOP_COMMENTS = 4        # comments kept per thread

import os
import json
import time
import math
import datetime as dt
import re       # Python regular expressions
import pathlib  # object-oriented filesystem paths
import praw     # Reddit API


# Load environment variables from .env
from dotenv import load_dotenv

load_dotenv()

# Output directory for exported data (relative to the working directory); created if missing.
EXPORTS = pathlib.Path("../data/exports")
EXPORTS.mkdir(parents=True, exist_ok=True)


# UTC timestamp (minute resolution) identifying this run.
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now" formats identically.
RUN_ID = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%d_%H%M")
print("Run:", RUN_ID)


Run: 20251022_1918


In [12]:
# --- Stage 1: Researcher (Reddit via PRAW search) ---

# Build the Reddit client from the credentials loaded from .env.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    user_agent=os.environ.get("REDDIT_USER_AGENT"),
)

# This notebook only reads from Reddit; make that explicit on the client.
reddit.read_only = True

# Optional connectivity check: fetch one hot post from r/UsedCars.
#sub = reddit.subreddit("UsedCars")
#post = next(sub.hot(limit=1))
#print("OK:", post.title[:80])


def search_threads(query, subreddits, limit_per_sub, since_days):
    """Search Reddit for `query` across `subreddits`, keeping posts newer than `since_days`.

    Uses the module-level `reddit` client. Results are requested by relevance;
    the per-subreddit cap is applied by the search itself, so the recency
    filter can only shrink each subreddit's result set further.

    Args:
        query: Search string (e.g., "private party escrow").
        subreddits: List like ["UsedCars", "MechanicAdvice"].
        limit_per_sub: Max results per subreddit (before filtering).
        since_days: Recency window; older posts are dropped.

    Returns:
        A list of thread dicts, de-duplicated by submission id (first
        occurrence wins), each with the keys: id, subreddit, title, permalink,
        created_utc, created_iso, is_self, url, query.
    """
    cutoff_ts = time.time() - since_days * 24 * 3600
    seen_ids = set()
    results = []

    for sub in subreddits:
        # Reddit search (relevance) with a per-subreddit cap
        matches = reddit.subreddit(sub).search(query, sort="relevance", limit=limit_per_sub)
        for submission in matches:
            if submission.created_utc < cutoff_ts:
                continue
            # De-dupe while collecting: the same submission can surface more than once.
            if submission.id in seen_ids:
                continue
            seen_ids.add(submission.id)

            # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
            # aware UTC datetime, then drop tzinfo so isoformat() emits the same
            # naive text as before and the explicit "Z" suffix stays correct.
            created = dt.datetime.fromtimestamp(submission.created_utc, tz=dt.timezone.utc)
            results.append({
                "id": submission.id,
                "subreddit": str(sub),
                "title": submission.title,
                "permalink": "https://www.reddit.com" + submission.permalink,
                "created_utc": submission.created_utc,
                "created_iso": created.replace(tzinfo=None).isoformat() + "Z",
                "is_self": submission.is_self,
                "url": submission.url,
                "query": query,
            })
    return results


# Run every configured query against every configured subreddit.
all_threads = []
for query_string in QUERY_STRINGS:
    all_threads.extend(
        search_threads(query_string, SUBREDDITS, THREADS_PER_QUERY, SINCE_DAYS)
    )

# Global de-dup: the same submission can match several queries; keep its first hit.
seen_ids = set()
threads = []
for thread in all_threads:
    if thread["id"] not in seen_ids:
        seen_ids.add(thread["id"])
        threads.append(thread)

print(f"Threads found (after recency + dedupe): {len(threads)}")
# Preview the first few threads (titles truncated to 200 characters).
for preview in threads[:5]:
    print(f"- r/{preview['subreddit']} | {preview['created_iso']} | {preview['title'][:200]}…")



Threads found (after recency + dedupe): 4
- r/UsedCars | 2025-08-01T20:49:40Z | Is buying a used car still the way to go?…
- r/UsedCars | 2025-10-11T17:56:53Z | how do you buy a used car?…
- r/UsedCars | 2025-08-07T16:51:57Z | How do people even buy used cars.…
- r/UsedCars | 2025-07-25T15:23:52Z | I sold a classic car to this guy (private sale) and now he's saying I misrepresented the car and he's giving me the opportunity to buy the car back.…


In [None]:
# --- Stage 2: Crawler — expand each thread with post + top comments ---

def fetch_thread(submission_id: str, top_n: int = TOP_COMMENTS) -> dict:
    """Fetch one submission plus its leading comments, using the module-level `reddit` client.

    Args:
        submission_id: Reddit submission id (the "id" field of a search result).
        top_n: How many comments to keep, taken from the front of the comment
            list after requesting "top" sort order.

    Returns:
        A dict with thread_id, subreddit, thread_title, selftext, permalink,
        created_utc, and comments (a list of dicts with author, body,
        permalink, created_utc).
    """
    submission = reddit.submission(id=submission_id)
    # The sort order must be set before the comments are first accessed.
    submission.comment_sort = "top"
    # Resolve "More comments…" placeholders so only real comments remain;
    # presumably limit=0 just drops them without extra requests (confirm in PRAW docs).
    submission.comments.replace_more(limit=0)

    def _comment_record(comment) -> dict:
        # A falsy author (e.g. a removed account) is reported as "deleted".
        author = comment.author
        return {
            "author": str(author) if author else "deleted",
            "body": comment.body,
            "permalink": f"https://www.reddit.com{comment.permalink}",
            "created_utc": comment.created_utc,
        }

    top_comments = [_comment_record(comment) for comment in submission.comments[:top_n]]

    return {
        "thread_id": submission.id,
        "subreddit": str(submission.subreddit),
        "thread_title": submission.title,
        "selftext": submission.selftext or "",
        "permalink": f"https://www.reddit.com{submission.permalink}",
        "created_utc": submission.created_utc,
        "comments": top_comments,
    }

# Expand every thread found in Stage 1 into a bundle (post + top comments).
# Best-effort: a thread that fails to load is reported and skipped.
bundles = []
for thread in threads:
    thread_id = thread["id"]
    try:
        bundle = fetch_thread(thread_id, top_n=TOP_COMMENTS)
    except Exception as err:
        print("crawl error:", thread_id, err)
    else:
        bundles.append(bundle)

print(f"Bundles: {len(bundles)} (each has post + up to {TOP_COMMENTS} top comments)")
# Preview the first bundle, if any were fetched.
if len(bundles) > 0:
    example_bundle = bundles[0]
    print("EXAMPLE:", example_bundle["thread_title"][:90], "| comments:", len(example_bundle["comments"]))
