In [1]:
# Smoke test: fail fast if any of these packages cannot be imported in this kernel.
import openai, pandas as pd, requests, bs4, readability, lxml, praw, os
print("OK")

OK


In [2]:
# Load .env into the process environment, then report per service whether all
# of its credentials are present. Only True/False is printed, never the values.
import os
from dotenv import load_dotenv

load_dotenv()

for label, names in (
    ("OPENAI:", ("OPENAI_API_KEY",)),
    ("SERPAPI:", ("SERPAPI_API_KEY",)),
    ("REDDIT:", ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT")),
):
    print(label, all(os.getenv(name) for name in names))


OPENAI: True
SERPAPI: True
REDDIT: True


In [3]:
# Configuration

# --- config ---
# Research topic label. NOTE(review): not referenced by any later cell shown here.
TOPIC = "Private-party used-car buying in SF Bay Area"
# Subreddits to search. Disabled candidates for later runs:
# "MechanicAdvice", "WhatCarShouldIBuy", "cars", "Scams", "AskCarsales"
SUBREDDITS = ["UsedCars"]
# Search strings; each one is run against every subreddit above.
# NOTE: the list is closed right after the first entry. The lines below it are
# disabled candidates kept for reference, not part of the list.
QUERY_STRINGS = [
    'Buying a used car']
    #'private party payment cashier check',
    #'private party title lien transfer',
    #'private party escrow payment',
    #'inspection OBD checklist private sale',
    #'wire vs cash private sale',
#]


RECENCY_MONTHS = 24  # recency window, in months
SINCE_DAYS = RECENCY_MONTHS * 30  # Stage 1 uses days (a month is approximated as 30 days)
THREADS_PER_QUERY = 4   # keep small for first run; passed to Stage 1 as the per-subreddit search limit
TOP_COMMENTS = 4  # how many comments Stage 2 keeps per thread

import os, json, time, math, datetime as dt, re, pathlib # re = Python regular expressions; pathlib = object-oriented filesystem paths
import praw as praw # Reddit API
from openai import OpenAI # OpenAI API



# Load the variables defined in .env into the process environment
from dotenv import load_dotenv; load_dotenv()

# Directory (relative to the notebook's working directory) that receives every
# exported artifact; created on first use, reused afterwards.
EXPORTS = pathlib.Path("../data/exports")
EXPORTS.mkdir(parents=True, exist_ok=True)

# UTC timestamp (minute resolution) used to tag every file written by this run.
# datetime.utcnow() is deprecated since Python 3.12; a timezone-aware "now"
# formats to exactly the same string.
RUN_ID = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%d_%H%M")
print("Run:", RUN_ID)


Run: 20251023_2127


In [4]:
# --- Stage 1: Researcher (Reddit via PRAW search) ---

# connect to Reddit (uses your .env values)
# Collect the Reddit credentials from the environment, build the client from
# them, then switch the client to read-only mode.
reddit_settings = dict(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
)
reddit = praw.Reddit(**reddit_settings)
reddit.read_only = True

# simple call: fetch one hot post from r/UsedCars
#sub = reddit.subreddit("UsedCars")
#post = next(sub.hot(limit=1))
#print("OK:", post.title[:80])


def search_threads(query, subreddits, limit_per_sub, since_days, client=None):
    """ Search Reddit for `query` across `subreddits`, keeping posts newer than `since_days`.

    Args:
        query: Search string (e.g., "private party escrow").
        subreddits: List like ["UsedCars", "MechanicAdvice"].
        limit_per_sub: Max results per subreddit (before filtering).
        since_days: Recency window; older posts are dropped.
        client: Optional Reddit client exposing `subreddit(name).search(...)`.
            Defaults to the notebook-level `reddit` object, so existing
            four-argument calls behave as before.

    Returns:
        A list of thread dicts (id, subreddit, title, permalink, created_utc,
        created_iso, is_self, url, query), de-duplicated by submission id and
        kept in first-seen order.
    """
    if client is None:
        client = reddit  # notebook-level client created earlier in this cell

    after_ts = time.time() - since_days * 24 * 3600
    seen_ids = set()
    threads = []

    for sub in subreddits:
        # Reddit search (relevance) with a per-subreddit cap
        results = client.subreddit(sub).search(query, sort="relevance", limit=limit_per_sub)
        for submission in results:
            # Skip posts outside the recency window and ids already collected
            # (the same post can surface under more than one subreddit).
            if submission.created_utc < after_ts or submission.id in seen_ids:
                continue
            seen_ids.add(submission.id)

            # utcfromtimestamp() is deprecated since Python 3.12. Build an aware
            # UTC datetime and drop tzinfo so the text stays "...Z" rather than
            # "...+00:00Z".
            created = dt.datetime.fromtimestamp(submission.created_utc, tz=dt.timezone.utc)
            threads.append({
                "id": submission.id,
                "subreddit": str(sub),
                "title": submission.title,
                "permalink": "https://www.reddit.com" + submission.permalink,
                "created_utc": submission.created_utc,
                "created_iso": created.replace(tzinfo=None).isoformat() + "Z",
                "is_self": submission.is_self,
                "url": submission.url,
                "query": query,
            })
    return threads


# run searches over your configured queries/subreddits
# Run every configured query against every configured subreddit and pool the hits.
all_threads = [
    hit
    for q in QUERY_STRINGS
    for hit in search_threads(
        query=q,
        subreddits=SUBREDDITS,
        limit_per_sub=THREADS_PER_QUERY,
        since_days=SINCE_DAYS,
    )
]

# A thread can match several queries: keep its first occurrence only.
seen_ids = set()
threads = []
for hit in all_threads:
    if hit["id"] not in seen_ids:
        seen_ids.add(hit["id"])
        threads.append(hit)

print(f"Threads found (after recency + dedupe): {len(threads)}")
# Show at most five threads as a sanity check.
for t in threads[:5]:
    print(f"- r/{t['subreddit']} | {t['created_iso']} | {t['title'][:200]}…")



Threads found (after recency + dedupe): 4
- r/UsedCars | 2025-10-11T17:56:53Z | how do you buy a used car?…
- r/UsedCars | 2025-08-01T20:49:40Z | Is buying a used car still the way to go?…
- r/UsedCars | 2025-08-07T16:51:57Z | How do people even buy used cars.…
- r/UsedCars | 2025-07-25T15:23:52Z | I sold a classic car to this guy (private sale) and now he's saying I misrepresented the car and he's giving me the opportunity to buy the car back.…


In [5]:
# --- Stage 2: Crawler — expand each thread with post + top comments ---

def fetch_thread(submission_id: str, top_n: int = TOP_COMMENTS) -> dict:
    """Fetch one submission and bundle it with its first `top_n` comments.

    Args:
        submission_id: Reddit submission id (the "id" produced by Stage 1).
        top_n: Number of comments to keep from the head of the comment list.

    Returns:
        A dict with thread_id, subreddit, thread_title, selftext, permalink,
        created_utc and a "comments" list of {author, body, permalink,
        created_utc} dicts.
    """
    submission = reddit.submission(id=submission_id)
    submission.comment_sort = "top"
    # Resolve the "More comments…" placeholders up front so the slice below
    # only contains real comments.
    submission.comments.replace_more(limit=0)

    def as_record(comment) -> dict:
        # A falsy author is recorded as the literal string "deleted".
        author = str(comment.author) if comment.author else "deleted"
        return {
            "author": author,
            "body": comment.body,
            "permalink": f"https://www.reddit.com{comment.permalink}",
            "created_utc": comment.created_utc,
        }

    top_comments = [as_record(comment) for comment in submission.comments[:top_n]]

    return {
        "thread_id": submission.id,
        "subreddit": str(submission.subreddit),
        "thread_title": submission.title,
        "selftext": submission.selftext or "",
        "permalink": f"https://www.reddit.com{submission.permalink}",
        "created_utc": submission.created_utc,
        "comments": top_comments,
    }

# run crawler over the threads we just found
# Best-effort crawl: a thread that fails to load is reported and skipped so the
# remaining threads are still collected.
bundles = []
for thread in threads:
    thread_id = thread["id"]
    try:
        bundle = fetch_thread(thread_id, top_n=TOP_COMMENTS)
    except Exception as err:
        print("crawl error:", thread_id, err)
    else:
        bundles.append(bundle)


def preview_bundles(bundles, max_title=100, max_comment=140):
    """Print a compact, human-readable digest of crawled thread bundles.

    Args:
        bundles: Iterable of bundle dicts with "subreddit", "thread_title",
            "permalink" and "comments" (each comment has a "body").
        max_title: Maximum number of title characters printed.
        max_comment: Maximum number of comment characters printed; longer
            comments are cut and suffixed with "…".
    """
    divider = "-" * 80
    for number, bundle in enumerate(bundles, start=1):
        title = bundle["thread_title"][:max_title]
        print(f"\n[{number}] r/{bundle['subreddit']} | {title}")
        print(f"    Thread: {bundle['permalink']}")
        for rank, comment in enumerate(bundle["comments"], start=1):
            # Collapse runs of whitespace/newlines into single spaces.
            flat = " ".join(comment["body"].split())
            if len(flat) > max_comment:
                flat = flat[:max_comment] + "…"
            print(f"    {rank:02d}. {flat}")
        print(divider)

preview_bundles(bundles)

# Persist the raw bundles as JSON. ensure_ascii=False writes non-ASCII
# characters (curly quotes, emoji, …) verbatim, so the file encoding is pinned
# to UTF-8 instead of depending on the platform's default locale encoding.
out_path = EXPORTS / f"bundles_{RUN_ID}.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(bundles, f, ensure_ascii=False, indent=2)
str(out_path)




[1] r/UsedCars | how do you buy a used car?
    Thread: https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/
    01. Well, if you’ve never bought one or been to a dealership, do not ever go in person. They’d see you coming a mile away and get you to buy som…
    02. You need to find what you want/need. What will you be using it for? How many passengers? Will it be a commuter, a short ride, or what? What'…
    03. find the wealthiest zip code near you. restrict your searches to that. wealthy folks take great care of their cars, and when they sell them …
    04. Always get your own financing. Get pre-approved letter from your bank. Dont use dealer financing. Dealer will lie and give you higher intres…
--------------------------------------------------------------------------------

[2] r/UsedCars | Is buying a used car still the way to go?
    Thread: https://www.reddit.com/r/UsedCars/comments/1mf7v5k/is_buying_a_used_car_still_the_way_to_go/
    01. There are s

'../data/exports/bundles_20251023_2127.json'

In [6]:
# --- Stage 3: Extractor — LLM → structured rows ---
# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Example-shaped description of one extracted row. extract_from_bundle serializes
# it into the request payload; the values are enumerations/placeholders that tell
# the model what each field should contain, not real data.
SCHEMA = {
 "persona":"buyer|seller|specialist|unknown",
 "stage":"discover|schedule|prepare|meetup_inspect|decide|pay_paperwork|aftercare",
 "action":"what they did",
 "feeling":"how they felt",
 "pain":"risk/annoyance",
 "workaround":"what they tried",
 "opportunity":"what we could do",
 "verbatim_quote":"exact sentence from source",
 "source_url":"https://...",
 "permalink":"https://...#comment-id (or thread permalink)",
 "source_type":"reddit",
 "thread_title":"...",
 "posted_at":"ISO8601 or null",
 "location_city":"city name or null",
 "location_state":"US state/region or null",
 "location_hint":"free text if mentioned (dealer/DMV names, banks, etc.)",
 "stage_confidence":0.0,
 "persona_confidence":0.0
}

# Instruction prompt; extract_from_bundle sends it as a separate user message
# after the JSON payload that carries the schema and the source text.
EXTRACT_GUIDE = """
You are extracting structured observations for JOURNEY MAPPING of PRIVATE-PARTY used-car transactions
(person buys a used car directly from a private seller, not a dealership). Return ONLY a JSON array.

Rules (must follow all):
- Include at least ONE verbatim sentence from the SOURCE TEXT as "verbatim_quote". No paraphrasing in that field.
- Always include:
  • "source_url" = the thread permalink
  • "permalink" = the comment's permalink if quoting a comment; otherwise the thread permalink
  • "thread_title" and "source_type" = "reddit"
- Classify "persona" and "stage" using the provided enums. If unclear, use "unknown" for persona.
- Keep "action", "feeling", "pain", "workaround", "opportunity" concise and practical for a journey map.
- Extract location if present:
  • "location_city" and "location_state" when clearly mentioned (e.g., "San Jose, CA"), else null
  • "location_hint" for any free-text place clues (DMV names, banks, neighborhoods)
- Assign "stage_confidence" and "persona_confidence" in [0.0, 1.0].
- If the text does NOT contain concrete, quotable content about a private-party used-car transaction,
  return an empty array [].

Output: JSON only. No prose, no markdown.
"""

def _iso_utc(ts):
    """Format a Unix timestamp as an ISO-8601 UTC string ending in "Z"; None if unusable."""
    try:
        moment = dt.datetime.fromtimestamp(float(ts), tz=dt.timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return None
    return moment.replace(tzinfo=None).isoformat() + "Z"


def _parse_rows(raw_text):
    """Decode the model reply into a list of row dicts.

    Tolerates a reply wrapped in a Markdown code fence and silently drops array
    elements that are not JSON objects. Raises ValueError when the reply is not
    valid JSON or its top level is not an array.
    """
    text = (raw_text or "").strip()
    fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1)
    parsed = json.loads(text)
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON array, got {type(parsed).__name__}")
    return [row for row in parsed if isinstance(row, dict)]


def extract_from_bundle(b):
    """Ask the LLM for structured journey-map rows for one crawled thread bundle.

    Args:
        b: Bundle dict as produced by fetch_thread ("thread_title", "selftext",
            "permalink", "created_utc", "comments").

    Returns:
        A list of row dicts. Each row is stamped with default "source_url",
        "thread_title", "source_type" and "permalink" values when the model
        omitted them, and with "posted_at" whenever the row's permalink matches
        the thread or one of its crawled comments. Returns [] (after printing
        "parse error: …") when the reply cannot be parsed as a JSON array.

    Raises:
        Whatever the OpenAI client raises for a failed request; the API call is
        intentionally not wrapped.
    """
    # compact source: thread title + selftext + top comments
    parts = [f"THREAD: {b['thread_title']}", b.get("selftext", "")]
    # permalink (without trailing slash) -> ISO timestamp, taken from the crawl data
    posted_at_by_url = {b["permalink"].rstrip("/"): _iso_utc(b.get("created_utc"))}
    for c in b.get("comments", []):
        parts.append(f"COMMENT ({c['permalink']}): {c['body']}")
        posted_at_by_url[c["permalink"].rstrip("/")] = _iso_utc(c.get("created_utc"))
    text = "\n\n".join(parts)[:20000]  # cap to avoid token bloat

    resp = client.responses.create(
        model="gpt-4.1-mini",
        temperature=0.2,
        input=[
            {"role":"system","content":"You extract structured observations for private-party used-car journeys."},
            {"role":"user","content": json.dumps({"schema":SCHEMA, "thread_permalink":b["permalink"], "text":text})},
            {"role":"user","content": EXTRACT_GUIDE}
        ]
    )

    try:
        arr = _parse_rows(resp.output_text)
    except (ValueError, TypeError) as e:
        print("parse error:", e)
        return []

    # normalize a few fields and stamp defaults
    for r in arr:
        r.setdefault("source_url", b["permalink"])
        r.setdefault("thread_title", b["thread_title"])
        r.setdefault("source_type", "reddit")
        if not r.get("permalink"):
            r["permalink"] = b["permalink"]
        # The model never sees timestamps, so take posted_at from the crawl
        # data when the permalink identifies the thread or a crawled comment.
        link = r["permalink"]
        stamped = posted_at_by_url.get(link.rstrip("/")) if isinstance(link, str) else None
        if stamped:
            r["posted_at"] = stamped
    return arr

# Run extractor over all bundles
rows = []
for b in bundles:
    rows += extract_from_bundle(b)

print("Raw rows extracted:", len(rows))
# Quick peek at every extracted row. The rows come straight from the model, so
# fields are read with .get(): a row missing "persona", "stage" or
# "verbatim_quote" prints None / an empty quote instead of aborting the cell.
for r in rows:
    quote = str(r.get("verbatim_quote") or "")
    print(r.get("persona"), r.get("stage"), "→", quote[:80], "…", r.get("permalink"))


Raw rows extracted: 17
buyer discover → Take your answers from that, and get online on sites like cargurus,cars,auto tra … https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizb6pt/
buyer prepare → Visually inspect the vehicle. Is it clean and well groomed? Is it ratted out fil … https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizb6pt/
buyer meetup_inspect → Drive it to a reputable shop across town. Pay for an inspection and tell them yo … https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizb6pt/
buyer decide → Look at oilchange sticker on window. Is it overdue for oil change? If its overdu … https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizbjeb/
buyer discover → find the wealthiest zip code near you. restrict your searches to that. … https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nj0hkds/
buyer discover → I think right now the best deal you c

In [7]:
# --- Stage 4: Normalize / Dedupe / Filter ---

def canon_url(u: str) -> str:
    """Return `u` without its query string and fragment, for URL comparison.

    Surrounding whitespace is stripped, then everything from the first "?" or
    "#" to the end is removed, so two links to the same page compare equal.
    A falsy input (None, "") yields "".

    Examples:
        https://site.com/page?utm_source=news#section -> https://site.com/page
        https://reddit.com/r/UsedCars/comments/abc123/?sort=top
            -> https://reddit.com/r/UsedCars/comments/abc123/
    """
    if not u:
        return ""

    trimmed = u.strip()
    tail = re.search(r"[?#].*$", trimmed)
    if tail is None:
        return trimmed
    return trimmed[:tail.start()]

def ok_conf(r: dict, min_conf: float = 0.5) -> bool:
    """Confidence gate: True only if both confidences of row `r` reach `min_conf`.

    Args:
        r: One extracted row (dict) from the LLM, expected to carry
            "stage_confidence" and "persona_confidence".
        min_conf: Threshold both confidences must meet or exceed.

    Returns:
        True if both values are >= min_conf, otherwise False.

    Each value is converted with float(), so numeric strings such as "0.72"
    work. A missing key, an explicit null (None) or a non-numeric value counts
    as 0.0, so a malformed row fails the gate instead of raising and aborting
    the whole filter step.
    """
    def as_score(value) -> float:
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    return (
        as_score(r.get("stage_confidence", 0)) >= min_conf and
        as_score(r.get("persona_confidence", 0)) >= min_conf
    )

def within_months(iso: str, months: int = RECENCY_MONTHS) -> bool:
    """Soft recency gate: keep rows whose timestamp is within the last `months`.

    Args:
        iso: Timestamp in (roughly) ISO-8601 form, e.g. "2024-07-15T12:34:56Z",
            "2024-07-15T12:34:56" or "2024-07-15T12:34:56+00:00".
        months: How many months back to allow (approximated as 30 days each).

    Returns:
        True if the row should be kept, False if it is too old. The filter is
        soft: a missing or unparsable timestamp keeps the row.

    A timestamp without an offset is treated as UTC. A timestamp with an
    offset is compared as an aware datetime, so it is actually filtered rather
    than kept by accident through a naive/aware comparison error.
    """
    if not iso:
        return True
    try:
        text = iso.strip()
        # fromisoformat() only accepts a trailing "Z" from Python 3.11 on.
        if text[-1:] in ("Z", "z"):
            text = text[:-1] + "+00:00"
        stamp = dt.datetime.fromisoformat(text)
        if stamp.tzinfo is None:
            stamp = stamp.replace(tzinfo=dt.timezone.utc)
        cutoff = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=months * 30)
        return stamp >= cutoff
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Not a string, not ISO-8601, or an out-of-range window: keep the row.
        return True

# The steps below use list comprehensions: [new_item for item in iterable if condition]
# reads as "put new_item into a list for each item in iterable if condition".

# 1) must-haves
# Every row needs a real verbatim quote (the evidence) and a URL to verify it.
# r.get(...) is truthy only when the field exists and is non-empty; rows
# missing either field are dropped before dedupe/QA.
rows_clean = [r for r in rows if r.get("verbatim_quote") and r.get("source_url")]

# 2) de-dupe by (verbatim_quote, permalink-or-source_url)
# Keep only the first row for each key. The key is the trimmed quote plus the
# canonicalized URL (permalink if present, else source_url, with the
# query/fragment stripped by canon_url). `seen` gives fast membership checks.
seen = set(); deduped = []
for r in rows_clean:
    key = (r["verbatim_quote"].strip(), canon_url(r.get("permalink") or r.get("source_url")))
    if key in seen:
        continue
    seen.add(key); deduped.append(r)

# 3) confidence gate
filtered = [r for r in deduped if ok_conf(r, min_conf=0.5)]

# 4) optional recency filter if posted_at present
# Uses the configured RECENCY_MONTHS (not a literal) so this stage stays in
# step with the Stage 1 search window when the config changes.
filtered = [r for r in filtered if within_months(r.get("posted_at",""), months=RECENCY_MONTHS)]

print(f"Rows → raw: {len(rows)} | clean: {len(rows_clean)} | deduped: {len(deduped)} | final: {len(filtered)}")


Rows → raw: 17 | clean: 17 | deduped: 17 | final: 17


In [10]:
# --- Stage 5: Review (summary + preview) ---

import pandas as pd
from IPython.display import display

pd.set_option("display.max_colwidth", 160)

df = pd.DataFrame(filtered)

group_cols = ["persona", "stage", "source_type"]
preview_cols = [
    "persona","stage","verbatim_quote","opportunity",
    "permalink","thread_title","location_city","location_state"
]

if df.empty:
    # Nothing survived Stage 4; skip the tables instead of failing on missing columns.
    print("No rows to review.")
else:
    # reindex(columns=...) supplies an all-NaN column for any field the model
    # omitted in every row, so the tables render instead of raising KeyError.
    # `df` itself is left untouched.

    # 1) High-level summary: counts by persona / stage / source
    summary = (
        df.reindex(columns=group_cols)
          .groupby(group_cols, dropna=False)
          .size()
          .reset_index(name="count")
          .sort_values(["persona","stage","count"], ascending=[True, True, False])
    )
    display(summary.head(30))

    # 2) Distribution by stage (quick sanity)
    stage_counts = (
        df.reindex(columns=["stage"])["stage"]
          .value_counts()
          .rename_axis("stage")
          .reset_index(name="count")
    )
    display(stage_counts)

    # 3) Human-friendly preview of the first N rows
    preview = df.reindex(columns=preview_cols).head(12)
    display(preview)


# 4) (Optional) mark-and-drop any rows by index before saving
#    After you scan `preview` or `df`, put indices to drop here:
drop_idx = []   # e.g., [3, 7]
if drop_idx:
    df = df.drop(index=drop_idx).reset_index(drop=True)
    print(f"Dropped {len(drop_idx)} rows; remaining: {len(df)}")
else:
    print(f"No manual drops; rows: {len(df)}")

# Keep `df` as the reviewed set for Stage 6 save.
reviewed_rows = df.to_dict(orient="records")


Unnamed: 0,persona,stage,source_type,count
0,buyer,aftercare,reddit,1
1,buyer,decide,reddit,2
2,buyer,discover,reddit,5
3,buyer,meetup_inspect,reddit,1
4,buyer,prepare,reddit,1
5,seller,pay_paperwork,reddit,2
6,specialist,aftercare,reddit,3
7,specialist,discover,reddit,2


Unnamed: 0,stage,count
0,discover,7
1,aftercare,4
2,decide,2
3,pay_paperwork,2
4,prepare,1
5,meetup_inspect,1


Unnamed: 0,persona,stage,verbatim_quote,opportunity,permalink,thread_title,location_city,location_state
0,buyer,discover,"Take your answers from that, and get online on sites like cargurus,cars,auto trader, heck research your local dealers and filter with your answers from befo...",provide integrated vehicle history and pricing tools with clear explanations for buyers,https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizb6pt/,how do you buy a used car?,,
1,buyer,prepare,Visually inspect the vehicle. Is it clean and well groomed? Is it ratted out filthy dirty full of garbage? If it's old and you're in the northeastern United...,offer guided checklists or virtual inspection assistance for buyers,https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizb6pt/,how do you buy a used car?,,northeastern United States
2,buyer,meetup_inspect,Drive it to a reputable shop across town. Pay for an inspection and tell them you want to know all details of the cars condition and operation as best as th...,connect buyers with trusted inspection services or offer mobile inspections,https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizb6pt/,how do you buy a used car?,,
3,buyer,decide,Look at oilchange sticker on window. Is it overdue for oil change? If its overdue than a major red flag.,simplify access to verified vehicle history and maintenance records,https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nizbjeb/,how do you buy a used car?,,
4,buyer,discover,find the wealthiest zip code near you. restrict your searches to that.,offer location-based recommendations for quality private sellers,https://www.reddit.com/r/UsedCars/comments/1o426it/how_do_you_buy_a_used_car/nj0hkds/,how do you buy a used car?,,
5,buyer,discover,I think right now the best deal you can get on a used car is a 10 year old vehicle with less than 150k miles private sale. A ten year old model has tons of ...,provide inspection services or trusted inspectors for private sales,https://www.reddit.com/r/UsedCars/comments/1mf7v5k/is_buying_a_used_car_still_the_way_to_go/n6fkfk7/,Is buying a used car still the way to go?,,
6,buyer,discover,"QC is worse than it ever has been in my lifetime with vehicles and if these new, complex engines arent properly maintained they just don't last. Maybe I'm j...",offer verified maintenance histories or warranties for used cars,https://www.reddit.com/r/UsedCars/comments/1mf7v5k/is_buying_a_used_car_still_the_way_to_go/n6f23mu/,Is buying a used car still the way to go?,,
7,buyer,discover,"Honestly it's frying my head, I've spend the last week constantly looking, messaging, arranging and nothing.\n\nThey either don't reply, or they do but ghos...",provide guidance on effective messaging and communication strategies,https://www.reddit.com/r/UsedCars/comments/1mk5cyg/how_do_people_even_buy_used_cars/,How do people even buy used cars.,,
8,specialist,discover,"Be direct, don't ask too many questions online, wait until you see it in person.\n\nHey, I'm interested in the car. Does it have XYZ option/feature I'm look...",offer templates or coaching on effective buyer communication,https://www.reddit.com/r/UsedCars/comments/1mk5cyg/how_do_people_even_buy_used_cars/n7g4v7p/,How do people even buy used cars.,,
9,specialist,discover,What I've seen is if you're trying to buy a normal 2008 Honda Accord type of car it's a nightmare because everyone is trying to buy the same $5000 car that'...,help buyers understand market conditions and prepare financially,https://www.reddit.com/r/UsedCars/comments/1mk5cyg/how_do_people_even_buy_used_cars/n7g5wmi/,How do people even buy used cars.,,


No manual drops; rows: 17


In [11]:
# --- Stage 6: Save snapshot(s) ---

# JSON (primary artifact)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly rather than with the platform default encoding.
out_json = EXPORTS / f"rows_{RUN_ID}.json"
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(reviewed_rows, f, ensure_ascii=False, indent=2)
print("Saved JSON →", out_json)

# CSV (handy for quick scanning)
import pandas as pd
out_csv = EXPORTS / f"rows_{RUN_ID}.csv"
pd.DataFrame(reviewed_rows).to_csv(out_csv, index=False)
print("Saved CSV  →", out_csv)


Saved JSON → ../data/exports/rows_20251023_2127.json
Saved CSV  → ../data/exports/rows_20251023_2127.csv


In [None]:
# --- OPTIONAL: Append to Google Sheets (new worksheet per run) ---
import os, datetime as dt, pandas as pd, gspread
from google.oauth2.service_account import Credentials

SPREADSHEET_ID = os.getenv("GOOGLE_SHEETS_SPREADSHEET_ID")
if not SPREADSHEET_ID:
    # Fail with a clear message instead of an opaque API error on a None key.
    raise RuntimeError("GOOGLE_SHEETS_SPREADSHEET_ID is not set; cannot export to Google Sheets.")

# Service-account key file, resolved relative to the notebook's working directory.
creds = Credentials.from_service_account_file(
    "google-credentials.json",
    scopes=["https://www.googleapis.com/auth/spreadsheets"]
)
gc = gspread.authorize(creds)
ss = gc.open_by_key(SPREADSHEET_ID)

# One worksheet per run. RUN_ID is already a formatted timestamp, so it is used
# directly rather than being passed through strftime() as a format string.
title = f"run_{RUN_ID}"

df = pd.DataFrame(reviewed_rows)
# Missing values (NaN/None) cannot be sent as JSON numbers; upload them as empty cells.
sheet_df = df.astype(object).where(df.notna(), "")

# Size the tab to the data (header row included), never smaller than 2000 x 20.
ws = ss.add_worksheet(
    title=title,
    rows=max(2000, len(sheet_df) + 1),
    cols=max(20, len(sheet_df.columns)),
)
ws.update([sheet_df.columns.tolist()] + sheet_df.values.tolist())
print("Appended to Google Sheet tab:", title)
