From d6c7ea16a00cc01525fda7b74e1f13c5f6a57746 Mon Sep 17 00:00:00 2001 From: drewid Date: Thu, 16 Apr 2026 07:05:39 -0400 Subject: [PATCH] feat: add --detect-paywall flag for subscriber-only content classification Queries Substack's public API to add is_paid and audience fields to YAML frontmatter. This lets users with paid subscriptions build guardrails to avoid accidentally sharing or redistributing content that creators intended for paying subscribers only. - New fetch_paywall_status() function hits /api/v1/posts/{slug} - Opt-in via --detect-paywall CLI flag (no breaking changes) - Frontmatter gains is_paid (bool) and audience (str) fields - Graceful fallback to null on API errors - Updated README with feature docs and usage examples --- README.md | 18 ++++++++++++++++++ substack2md.py | 45 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 127163e..05ea533 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Substack doesn't let you bulk-export your reading list or subscriptions in a use - **Obsidian wikilinks** - Auto-converts internal links to existing notes - **Configurable naming** - Map publication slugs to custom directory names - **Transcript cleaning** - Strips timestamps and speaker labels from podcast transcripts +- **Paywall detection** - Optionally tags posts as free or subscriber-only via Substack's public API, so you can avoid accidentally sharing paid content ## Installation @@ -126,6 +127,9 @@ python substack2md.py URL --overwrite # Process from existing markdown export (cleanup only) python substack2md.py --from-md export.md --url https://pub.substack.com/p/slug + +# Tag posts with paywall status (respects creators' rights) +python substack2md.py --urls-file urls.txt --detect-paywall ``` ## URL File Format @@ -170,6 +174,8 @@ canonical: "https://daveshap.substack.com/p/post-slug" slug: "post-slug" tags: [substack, ai, automation] image: "https://substackcdn.com/image.jpg" +is_paid: false +audience: "everyone" links_internal: 3 links_external: 12 source: "substack2md v1.1.0" @@ -178,6 +184,17 @@ source: "substack2md v1.1.0" Content starts here... ``` +## Paywall Detection + +When `--detect-paywall` is passed, substack2md queries Substack's public API to determine whether each post is free or subscriber-only. This adds two fields to the YAML frontmatter: + +- **`is_paid`** (`true`/`false`/`null`) — whether the post requires a paid subscription +- **`audience`** (`"everyone"` or `"only_paid"` or `null`) — the audience scope set by the author + +This is opt-in and requires no additional authentication — the metadata endpoint is public. + +**Why this matters:** If you have a paid subscription, CDP will fetch the full content of subscriber-only posts. The paywall metadata lets you build guardrails in your own workflows to avoid accidentally sharing or redistributing content that creators intended for paying subscribers only. Respect the creators whose work you value enough to pay for. + ## Troubleshooting ### "No CDP connection" @@ -219,6 +236,7 @@ options: --timeout SECONDS Page load timeout (default: 45) --retries N Retry failed URLs N times (default: 2) --sleep-ms MS Delay between requests (default: 150) + --detect-paywall Add is_paid/audience to frontmatter via Substack API ``` ## Contributing diff --git a/substack2md.py b/substack2md.py index 97d550f..ca7998f 100644 --- a/substack2md.py +++ b/substack2md.py @@ -168,6 +168,34 @@ def normalize_tags(tags: List[str]) -> List[str]: out.insert(0, "substack") return out +def fetch_paywall_status(publication: str, slug: str) -> Dict: + """Query Substack's public API for paywall/audience metadata. + + Substack exposes ``/api/v1/posts/{slug}`` on every publication subdomain. + The response includes *is_paid* (bool) and *audience* (str) which indicate + whether the post is behind a paywall. No authentication is required for + this metadata endpoint. + + Returns a dict with ``is_paid`` and ``audience`` keys. On any failure the + values default to ``None`` so that the caller can distinguish "not checked" + from "checked and free". + """ + result: Dict = {"is_paid": None, "audience": None} + api_url = f"https://{publication}.substack.com/api/v1/posts/{slug}" + try: + resp = requests.get(api_url, headers={"Accept": "application/json", + "User-Agent": "substack2md"}, timeout=10) + if resp.status_code == 200: + data = resp.json() + result["is_paid"] = data.get("audience") == "only_paid" + result["audience"] = data.get("audience", "everyone") + else: + print(f"[paywall] API returned {resp.status_code} for {api_url}", file=sys.stderr) + except Exception as exc: + print(f"[paywall] Could not query {api_url}: {exc}", file=sys.stderr) + return result + + def cleanup_url(url: str) -> str: if not url: return url @@ -488,6 +516,8 @@ def with_frontmatter(fields: Dict, body_md: str) -> str: "tags": fields["tags"], "image": fields["image"] or "", "video_url": fields.get("video_url","") or "", + "is_paid": fields.get("is_paid"), + "audience": fields.get("audience"), "links_internal": fields.get("links_internal",0), "links_external": fields.get("links_external",0), "source": fields.get("source","substack2md v1.1.0"), @@ -502,13 +532,20 @@ def with_frontmatter(fields: Dict, body_md: str) -> str: def process_url(url: str, base_dir: Path, pub_mappings: Dict[str, str], also_save_html: bool, overwrite: bool, - cdp_host: str, cdp_port: int, timeout: int, retries: int) -> Optional[Path]: + cdp_host: str, cdp_port: int, timeout: int, retries: int, + detect_paywall: bool = False) -> Optional[Path]: client = CDPClient(cdp_host, cdp_port, timeout=timeout) last_err = None for attempt in range(1, retries+1): try: html = client.fetch_html(url) fields, body_md = extract_article_fields(url, html) + + # Paywall detection via Substack public API + if detect_paywall: + pw = fetch_paywall_status(fields["publication"], fields["slug"]) + fields["is_paid"] = pw["is_paid"] + fields["audience"] = pw["audience"] # Use configurable publication name mapping pub_pretty = get_publication_name(fields["publication"], pub_mappings) @@ -610,6 +647,9 @@ def main(): ap.add_argument("--timeout", type=int, default=45, help="Per-page CDP timeout seconds") ap.add_argument("--retries", type=int, default=2, help="Retries per URL on transient failures") ap.add_argument("--sleep-ms", type=int, default=150, help="Sleep between URLs to be polite") + ap.add_argument("--detect-paywall", action="store_true", + help="Query Substack API to add is_paid/audience to frontmatter. " + "Helps avoid accidentally sharing subscriber-only content.") args = ap.parse_args() # Load configuration @@ -646,7 +686,8 @@ def main(): if "substack.com" not in url: print(f"[warn] Not a substack URL: {url}") process_url(url, base_dir, pub_mappings, args.also_save_html, args.overwrite, - args.cdp_host, args.cdp_port, args.timeout, args.retries) + args.cdp_host, args.cdp_port, args.timeout, args.retries, + detect_paywall=args.detect_paywall) if i < len(url_list) and args.sleep_ms > 0: time.sleep(args.sleep_ms / 1000.0)