Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 39 additions & 2 deletions substack2md.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,52 @@ def fetch_paywall_status(publication: str, slug: str) -> Dict:
values default to ``None`` so that the caller can distinguish "not checked"
from "checked and free".
"""
# Substack classifies every post with an `audience` enum. Authors can
# rename their subscription *tiers* in the UI (e.g. label a paid tier
# "Supporters" or "Founders' Circle"), but the API normalizes those to
# a fixed set of values used by Substack's own gating logic.
#
# Known enum values (verified empirically across 9 publications /
# 120+ posts, plus Substack API documentation):
# "everyone" - public, free to read
# "only_free" - requires any subscription (free OK), not paywalled
# "only_paid" - requires a paid subscription
# "founding" - requires founding-member subscription (paid)
#
# If Substack adds new tiers, we DO NOT assume free. We return
# (is_paid=None, audience=<raw value>) so downstream workflows can
# treat the post as "status unknown" instead of silently publishing
# it as free. That's the safer default for a flag whose purpose is
# to avoid accidental redistribution of subscriber-only content.
known_paid = {"only_paid", "founding"}
known_free = {"everyone", "only_free"}

result: Dict = {"is_paid": None, "audience": None}
api_url = f"https://{publication}.substack.com/api/v1/posts/{slug}"
try:
resp = requests.get(api_url, headers={"Accept": "application/json",
"User-Agent": "substack2md"}, timeout=10)
if resp.status_code == 200:
data = resp.json()
result["is_paid"] = data.get("audience") == "only_paid"
result["audience"] = data.get("audience", "everyone")
audience = data.get("audience")
if audience is None:
# 200 OK but no audience field -- can't tell; stay None/None
# so downstream treats this as "not checked" rather than "free".
print(f"[paywall] 200 but no 'audience' field for {api_url}",
file=sys.stderr)
elif audience in known_paid:
result["audience"] = audience
result["is_paid"] = True
elif audience in known_free:
result["audience"] = audience
result["is_paid"] = False
else:
# Unknown enum value: preserve the raw string for debuggability
# but refuse to guess is_paid. Log so maintainers can widen
# the enum sets if Substack adds new tiers.
result["audience"] = audience
print(f"[paywall] Unknown audience value {audience!r} for "
f"{api_url}; is_paid left as None", file=sys.stderr)
else:
print(f"[paywall] API returned {resp.status_code} for {api_url}", file=sys.stderr)
except Exception as exc:
Expand Down
109 changes: 109 additions & 0 deletions tests/EVALS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# PR #1 Eval Report — `feat: --detect-paywall`

Target: https://github.com/snapsynapse/substack2md/pull/1
Source branch: drewid74:feat/paywall-detection
Base: snapsynapse:main

## How to run

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt -r tests/requirements-dev.txt

# Fetch PR code into a local checkout
git remote add drew https://github.com/drewid74/substack2md.git
git fetch drew feat/paywall-detection
git checkout drew/feat/paywall-detection -- substack2md.py

pytest tests/ -v
# Optional live:
SUBSTACK2MD_LIVE=1 pytest tests/test_live_smoke.py -v -s
```

## Result against PR HEAD

28 passed, 2 failed, 1 skipped (live smoke, opt-in).

## Failures — merge-blockers to discuss with author

### BUG 1. Founding-tier posts misclassified as free

`test_founding_tier_is_paid_behavior` FAILS.

Substack audience values observed in the wild:
`everyone`, `only_free`, `only_paid`, `founding`.

Current code:
```python
result["is_paid"] = data.get("audience") == "only_paid"
```
`founding` posts (paid, founding-member-only) come through as
`is_paid=False`, `audience="founding"`. Defeats the PR's stated goal of
"avoid accidentally sharing paid content."

Proposed fix:
```python
PAID_AUDIENCES = {"only_paid", "founding"}
result["is_paid"] = data.get("audience") in PAID_AUDIENCES
```

### BUG 2. Missing `audience` key silently reported as "everyone"

`test_missing_audience_key_should_return_unknown` FAILS.

Current code:
```python
result["is_paid"] = data.get("audience") == "only_paid" # -> False when missing
result["audience"] = data.get("audience", "everyone") # -> "everyone" when missing
```
If Substack ever returns a 200 without the `audience` field (schema
drift, cached response, edge account type), the post is tagged as free
when the actual status is unknown. Contradicts the PR's own promise of
"graceful fallback to null on API errors."

Proposed fix:
```python
audience = data.get("audience")
if audience is None:
return {"is_paid": None, "audience": None}
result["audience"] = audience
result["is_paid"] = audience in PAID_AUDIENCES
```

## Non-blocking observations (flag, don't block)

- Hardcoded 10s timeout — `--timeout` CLI arg is not threaded through
to `fetch_paywall_status`. On a large batch a slow endpoint can add
10s per URL. `test_timeout_is_finite` currently passes at 10s; bump
the assertion upper bound or plumb the arg before merge if the
maintainers care.
- Custom-domain Substacks (e.g. `stratechery.com`): `publication` slug
is derived from the netloc, so the metadata API URL built from it
will 404 for custom domains. Fails gracefully (is_paid=None) but the
feature is silently inert for these. See
`test_custom_domain_publication_is_wrong_for_api` — documents current
behavior, passes by design.
- `--from-md` path does not support paywall detection. Opt-in and
explicitly scoped; fine. Pinned by
`test_from_md_path_has_no_paywall_fields`.
- No test suite shipped with the PR. Recommend adopting `tests/` and
adding a CI step before merge.

## Coverage map

| Area | File |
|-----------------------------|--------------------------------------|
| API contract + failure modes| test_paywall_fetch.py |
| YAML frontmatter behavior | test_frontmatter.py |
| CLI flag + process_url wiring| test_cli_wiring.py |
| Publication/slug derivation | test_publication_slug_edges.py |
| Real endpoint smoke | test_live_smoke.py (opt-in) |

## Decision matrix

| Outcome | Action |
|------------------------------------|-----------------------------------------|
| Author agrees on BUG 1 + BUG 2 | Request changes; merge after tests pass |
| Author disagrees, wants to ship | Either adjust tests to pin current impl as intended, or reject PR |
| Author wants broader scope | Add follow-up for custom-domain lookup + threading `--timeout` |
Empty file added tests/__init__.py
Empty file.
7 changes: 7 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Make the repo root importable so `import substack2md` works."""
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
2 changes: 2 additions & 0 deletions tests/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pytest>=7.0
responses>=0.24
184 changes: 184 additions & 0 deletions tests/test_cli_wiring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
Evals for the --detect-paywall CLI flag and its wiring through process_url.

Merge-blocking questions:

C1. CLI parse: --detect-paywall is recognized; default is False.
C2. Opt-in: without the flag, fetch_paywall_status is NEVER called.
(Avoids unexpected network traffic for existing users.)
C3. When flag is on, process_url calls fetch_paywall_status exactly once
per URL, with the correct (publication, slug).
C4. When the API call returns None/None (failure mode), the pipeline
still succeeds -- the .md file is written.
C5. --from-md path is unaffected (paywall detection is opt-in on the
live-fetch path only). This is a conscious scope choice; test
pins it so future refactors don't silently break it.
"""
from pathlib import Path
import sys
from unittest import mock

import pytest

import substack2md


# --- C1: parser ------------------------------------------------------------

def test_cli_accepts_detect_paywall(monkeypatch, tmp_path):
"""Argparse must recognise --detect-paywall; default False."""
# Drive main() far enough to hit argparse, then bail.
called = {}

def fake_process_url(url, base_dir, pub_mappings, also_save_html,
overwrite, cdp_host, cdp_port, timeout, retries,
detect_paywall=False):
called["detect_paywall"] = detect_paywall
return None

monkeypatch.setattr(substack2md, "process_url", fake_process_url)
monkeypatch.setattr(sys, "argv", [
"substack2md.py",
"https://examplepub.substack.com/p/hello",
"--base-dir", str(tmp_path),
"--detect-paywall",
])
substack2md.main()
assert called.get("detect_paywall") is True


def test_cli_default_is_false(monkeypatch, tmp_path):
called = {}

def fake_process_url(url, base_dir, pub_mappings, also_save_html,
overwrite, cdp_host, cdp_port, timeout, retries,
detect_paywall=False):
called["detect_paywall"] = detect_paywall
return None

monkeypatch.setattr(substack2md, "process_url", fake_process_url)
monkeypatch.setattr(sys, "argv", [
"substack2md.py",
"https://examplepub.substack.com/p/hello",
"--base-dir", str(tmp_path),
])
substack2md.main()
assert called.get("detect_paywall") is False


# --- C2 / C3: wiring inside process_url -----------------------------------

@pytest.fixture
def fake_cdp(monkeypatch):
"""Stub CDPClient so process_url doesn't need a real browser."""
class FakeClient:
def __init__(self, *args, **kwargs):
pass

def fetch_html(self, url):
# Minimal HTML that extract_article_fields can parse.
return """
<html><head>
<title>Hello</title>
<meta name="author" content="A. Writer">
</head><body>
<h1>Hello</h1>
<h3>Sub</h3>
<article><p>Body text here with enough words to pass
readability's heuristic threshold for article detection,
padding padding padding padding padding padding padding.</p>
</article>
</body></html>
"""
monkeypatch.setattr(substack2md, "CDPClient", FakeClient)
return FakeClient


def test_paywall_not_called_when_flag_off(monkeypatch, tmp_path, fake_cdp):
called = {"count": 0}

def spy(*args, **kwargs):
called["count"] += 1
return {"is_paid": None, "audience": None}

monkeypatch.setattr(substack2md, "fetch_paywall_status", spy)
substack2md.process_url(
"https://examplepub.substack.com/p/hello",
base_dir=tmp_path,
pub_mappings={},
also_save_html=False, overwrite=True,
cdp_host="x", cdp_port=0, timeout=1, retries=1,
detect_paywall=False,
)
assert called["count"] == 0, "No network call without the flag"


def test_paywall_called_once_with_correct_args(monkeypatch, tmp_path, fake_cdp):
seen = []

def spy(publication, slug):
seen.append((publication, slug))
return {"is_paid": True, "audience": "only_paid"}

monkeypatch.setattr(substack2md, "fetch_paywall_status", spy)
out = substack2md.process_url(
"https://examplepub.substack.com/p/hello",
base_dir=tmp_path,
pub_mappings={},
also_save_html=False, overwrite=True,
cdp_host="x", cdp_port=0, timeout=1, retries=1,
detect_paywall=True,
)
assert len(seen) == 1
pub, slug = seen[0]
assert pub == "examplepub"
assert slug == "hello"
assert out is not None and Path(out).exists()
# Produced file carries the paywall fields
text = Path(out).read_text()
assert "is_paid: true" in text
assert "audience: only_paid" in text


# --- C4: API failure must not kill the pipeline ---------------------------

def test_api_failure_still_writes_file(monkeypatch, tmp_path, fake_cdp):
def failing(pub, slug):
return {"is_paid": None, "audience": None}

monkeypatch.setattr(substack2md, "fetch_paywall_status", failing)
out = substack2md.process_url(
"https://examplepub.substack.com/p/hello",
base_dir=tmp_path,
pub_mappings={},
also_save_html=False, overwrite=True,
cdp_host="x", cdp_port=0, timeout=1, retries=1,
detect_paywall=True,
)
assert out is not None and Path(out).exists()
text = Path(out).read_text()
# None-valued keys should be filtered out of frontmatter
assert "is_paid" not in text
assert "audience" not in text


# --- C5: --from-md path unchanged -----------------------------------------

def test_from_md_path_has_no_paywall_fields(tmp_path):
"""process_from_md doesn't take a detect_paywall flag -- pin it."""
src = tmp_path / "raw.md"
src.write_text("# Some Title\n\nBody.\n", encoding="utf-8")
out_dir = tmp_path / "out"
out_dir.mkdir()

out = substack2md.process_from_md(
src,
base_dir=out_dir,
pub_mappings={},
url="https://examplepub.substack.com/p/hello",
overwrite=True,
)
assert out is not None and Path(out).exists()
text = Path(out).read_text()
assert "is_paid" not in text
assert "audience" not in text
Loading