# Chronicling America Scraper - Complete Dataset

This notebook scrapes **ALL 607 newspapers** from `newspapers_all_years_updated.csv` using the Library of Congress API and Gemini API.

## Setup

1. Install dependencies:
   ```bash
   pip install google-generativeai python-dotenv requests pandas tqdm
   ```

2. Create a `.env` file with your Google API key:
   ```
   GOOGLE_API_KEY=your_api_key_here
   GEMINI_MODEL=gemini-2.0-flash-exp
   ```

3. Ensure `.env` is in your `.gitignore` (**DO NOT commit API keys**)

## Workflow

1. **LOC API Scraping**: Fetches essays from Library of Congress
2. **Stage 1 (Manual Checkpoint)**: Extract timelines from essays using Gemini API
   - Output: `data/timelines/*.json`
   - Review these before proceeding
3. **Stage 2 (Manual Checkpoint)**: Convert timelines to structured JSON
   - Output: `data/essay_derived_rows/*.json`
   - Review these before merging
4. **Merge to Master**: Add structured data to `master.csv` with ISSN validation

## Notes

- Processing all 607 newspapers will take **~1-2 minutes** (1500 RPM rate limit with Gemini 2.0 Flash)
- **Cost: $0** (Gemini 2.0 Flash is free for this usage level)
- Intermediate outputs are saved at each stage for review
- Errors are logged to `data/api_errors.log`

In [1]:
# %pip install google-genai python-dotenv

In [1]:
"""
Chronicling America Scraper - Complete Dataset

Processes ALL newspapers in data/newspapers_all_years_updated.csv (not just unmatched),
queries the Library of Congress API for newspaper metadata and essays,
then uses Gemini API to extract structured editor/publisher data.

NOTE: Uses the current loc.gov API exclusively (post-Aug 2025):
  - Item endpoint:      https://www.loc.gov/item/{lccn}/?fo=json
  - Collection search:  https://www.loc.gov/collections/chronicling-america/
  - General search:     https://www.loc.gov/search/?q={query}&fo=json
"""

import csv
import json
import re
import time
import argparse
import logging
import sys
import requests
import math
import os
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from dotenv import load_dotenv
from google import genai
from google.genai import types

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("data/api_errors.log")
    ]
)
log = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Configure Gemini API client (google-genai SDK)
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-flash-latest")
GENERATION_CONFIG = types.GenerateContentConfig(
    temperature=0.1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=65536,
)

# Configuration
MATCHES_CSV = "data/matches.csv"
NEWSPAPERS_CSV = "data/newspapers_all_years_updated.csv"
GEMINI_API_DELAY = float(os.getenv("GEMINI_API_DELAY", "15"))

# LOC API endpoints
LOC_ITEM_URL = "https://www.loc.gov/item/{lccn}/?fo=json"
LOC_COLLECTION_URL = "https://www.loc.gov/collections/chronicling-america/"
LOC_DIRECTORY_URL = "https://www.loc.gov/collections/directory-of-us-newspapers-in-american-libraries/"
LOC_SEARCH_URL = "https://www.loc.gov/search/"

HEADERS = {"User-Agent": "NewspaperLookup/1.0 (research script)"}
REQUEST_TIMEOUT = 60


def _build_session() -> requests.Session:
    """Build a requests session with automatic retries."""
    session = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    session.headers.update(HEADERS)
    return session


SESSION = _build_session()


def query_gemini_api(prompt: str, retry_count: int = 3) -> str:
    """Query Gemini API with retry logic using google-genai SDK."""
    for attempt in range(retry_count):
        try:
            response = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=prompt,
                config=GENERATION_CONFIG,
            )
            return response.text
        except Exception as e:
            if "quota" in str(e).lower() or "rate" in str(e).lower():
                wait_time = (attempt + 1) * 15
                log.warning(f"Rate limit hit, waiting {wait_time}s before retry {attempt + 1}/{retry_count}")
                time.sleep(wait_time)
            else:
                log.error(f"Gemini API error: {e}")
                raise
    raise Exception(f"Failed after {retry_count} retries")


def _html_to_text(raw_html: str) -> str:
    """Convert an HTML string (like the essay field) to clean plain text."""
    import html as html_mod
    text = re.sub(r'</p>', '\n\n', raw_html)
    text = re.sub(r'<br\s*/?>', '\n', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = html_mod.unescape(text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()


def read_csv(filepath: str) -> list[dict]:
    """Read a CSV and return rows as dicts with stripped headers."""
    with open(filepath, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        reader.fieldnames = [h.strip() for h in reader.fieldnames]
        return list(reader)


def build_issn_to_lccn(newspapers_rows: list[dict]) -> dict[str, dict]:
    """
    Build a lookup from ISSN -> {lccn, name, town, state} using
    data/newspapers_all_years_updated.csv.
    """
    lookup = {}
    for row in newspapers_rows:
        issn = row.get("issn", "").strip()
        lccn = row.get("lccn", "").strip()
        if issn and lccn:
            lookup[issn] = {
                "lccn": lccn,
                "name": row.get("name", "").strip(),
                "town": row.get("town", "").strip(),
                "state": row.get("state", "").strip(),
            }
    return lookup


def build_issn_to_master_id(matches_rows: list[dict]) -> dict[str, str]:
    """Build lookup: ISSN -> master_id (empty string if unmatched)"""
    return {row["issn"]: row.get("master_id", "").strip() for row in matches_rows}


def query_loc_item(lccn: str) -> dict | None:
    """PRIMARY METHOD: Fetch newspaper metadata from the loc.gov item endpoint."""
    url = LOC_ITEM_URL.format(lccn=lccn)
    try:
        resp = SESSION.get(url, timeout=REQUEST_TIMEOUT)
        if resp.status_code == 404:
            log.info(f"    loc.gov/item: 404 for LCCN {lccn}")
            return None
        resp.raise_for_status()
        data = resp.json()
        item = data.get("item", {})

        essay_html = item.get("essay", "")
        essay_text = _html_to_text(essay_html) if essay_html else ""
        essay_contributor = item.get("essay_contributor", [])

        return {
            "source": "loc.gov/item",
            "lccn": lccn,
            "url": f"https://www.loc.gov/item/{lccn}/",
            "title": item.get("title", ""),
            "date": item.get("date", ""),
            "dates_of_publication": item.get("dates_of_publication", ""),
            "created_published": item.get("created_published", []),
            "description": item.get("description", []),
            "subject_headings": item.get("subject_headings", []),
            "notes": item.get("notes", []),
            "contributors": item.get("contributor_names", []),
            "location": item.get("location", []),
            "call_number": item.get("call_number", []),
            "medium": item.get("medium", ""),
            "issn": item.get("number_issn", item.get("number", [])),
            "oclc": item.get("number_oclc", []),
            "related_items": item.get("related_items", []),
            "other_title": item.get("other_title", []),
            "publication_frequency": item.get("publication_frequency", []),
            "rights": item.get("rights_advisory", []),
            "essay_html": essay_html,
            "essay": essay_text,
            "essay_contributor": essay_contributor,
        }
    except requests.exceptions.Timeout:
        log.warning(f"    loc.gov/item timed out for LCCN {lccn} (timeout={REQUEST_TIMEOUT}s)")
        return None
    except Exception as e:
        log.warning(f"    loc.gov/item failed for LCCN {lccn}: {e}")
        return None


def query_loc_collection(lccn: str) -> dict | None:
    """FALLBACK #1: Search the Chronicling America collection by LCCN."""
    params = {"fa": f"number_lccn:{lccn}", "fo": "json", "c": 1}
    try:
        resp = SESSION.get(LOC_COLLECTION_URL, params=params, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        results = data.get("results", [])
        if not results:
            log.info(f"    collection search: no results for LCCN {lccn}")
            return None

        r = results[0]
        return {
            "source": "loc.gov/collections/chronicling-america",
            "lccn": lccn,
            "url": r.get("url", r.get("id", "")),
            "title": r.get("title", ""),
            "date": r.get("date", ""),
            "description": r.get("description", []),
            "subjects": r.get("subject", []),
            "location": r.get("location", []),
            "contributors": r.get("contributor", []),
            "total_results": data.get("pagination", {}).get("of", 0),
        }
    except requests.exceptions.Timeout:
        log.warning(f"    collection search timed out for LCCN {lccn}")
        return None
    except Exception as e:
        log.warning(f"    collection search failed for LCCN {lccn}: {e}")
        return None


def query_loc_directory(lccn: str) -> dict | None:
    """FALLBACK #2: Search the US Newspaper Directory collection."""
    params = {"q": lccn, "fo": "json", "c": 5}
    try:
        resp = SESSION.get(LOC_DIRECTORY_URL, params=params, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        results = data.get("results", [])
        if not results:
            log.info(f"    directory search: no results for LCCN {lccn}")
            return None

        r = results[0]
        return {
            "source": "loc.gov/collections/directory",
            "lccn": lccn,
            "url": r.get("url", r.get("id", "")),
            "title": r.get("title", ""),
            "date": r.get("date", ""),
            "description": r.get("description", []),
            "subjects": r.get("subject", []),
            "location": r.get("location", []),
        }
    except requests.exceptions.Timeout:
        log.warning(f"    directory search timed out for LCCN {lccn}")
        return None
    except Exception as e:
        log.warning(f"    directory search failed for LCCN {lccn}: {e}")
        return None


def query_loc_search_fallback(name: str, issn: str) -> dict | None:
    """LAST RESORT: Search loc.gov generally by newspaper name or ISSN."""
    queries = []
    if issn:
        queries.append(issn)
    if name:
        queries.append(name)

    for q in queries:
        params = {"q": q, "fa": "original_format:newspaper", "fo": "json", "c": 5}
        try:
            resp = SESSION.get(LOC_SEARCH_URL, params=params, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            data = resp.json()
            results = data.get("results", [])
            if results:
                r = results[0]
                return {
                    "source": "loc.gov/search (fallback)",
                    "title": r.get("title", ""),
                    "date": r.get("date", ""),
                    "description": r.get("description", []),
                    "subjects": r.get("subject", []),
                    "location": r.get("location", []),
                    "url": r.get("url", r.get("id", "")),
                    "contributors": r.get("contributor", []),
                }
        except Exception as e:
            log.warning(f"    search fallback failed for '{q}': {e}")

    return None


def lookup_newspaper(
    row: dict,
    issn_to_lccn: dict,
    issn_to_master_id: dict,
    delay: float
) -> dict:
    """
    Look up a single newspaper entry using a cascade:
      1. Join on ISSN to get LCCN
      2. Try loc.gov/item/{lccn}/?fo=json          (primary – full record)
      3. Try loc.gov/collections/chronicling-america (fallback – search)
      4. Try loc.gov/collections/directory           (fallback – directory)
      5. Try loc.gov/search by name/ISSN            (last resort)

    Now processes ALL newspapers, tracking master_id for update vs insert logic.
    """
    name = row.get("name", "").strip()
    issn = row.get("issn", "").strip()
    town = row.get("town", "").strip()
    state = row.get("state", "").strip()
    master_id = issn_to_master_id.get(issn, "")

    log.info(f"Looking up: '{name}' (ISSN: {issn})")
    if master_id:
        log.info(f"  Has master_id: {master_id} (will update existing row)")
    else:
        log.info(f"  No master_id (will insert new row)")

    result = {
        "input": {
            "issn": issn,
            "name": name,
            "town": town,
            "state": state,
        },
        "master_id": master_id,
        "lccn": None,
        "lccn_source": None,
        "api_result": None,
    }

    # Step 1: Try to get LCCN from the newspapers CSV
    lccn_info = issn_to_lccn.get(issn)
    if lccn_info:
        lccn = lccn_info["lccn"]
        result["lccn"] = lccn
        result["lccn_source"] = "newspapers_all_years_updated.csv"
        log.info(f"  LCCN from CSV: {lccn}")

        # Step 2: Primary – loc.gov item endpoint
        time.sleep(delay)
        api = query_loc_item(lccn)
        if api:
            result["api_result"] = api
            log.info(f"  ✔ loc.gov/item: {api['title']}")
            return result

        # Step 3: Fallback – Chronicling America collection search
        time.sleep(delay)
        api = query_loc_collection(lccn)
        if api:
            result["api_result"] = api
            log.info(f"  ✔ collection search: {api['title']}")
            return result

        # Step 4: Fallback – US Newspaper Directory
        time.sleep(delay)
        api = query_loc_directory(lccn)
        if api:
            result["api_result"] = api
            log.info(f"  ✔ directory search: {api['title']}")
            return result

    else:
        result["lccn_source"] = "not_found"
        log.info(f"  No LCCN found via ISSN join")

    # Step 5: Last resort – general search
    log.info(f"  Falling back to general search...")
    time.sleep(delay)
    api = query_loc_search_fallback(name, issn)
    if api:
        result["api_result"] = api
        log.info(f"  ✔ search fallback: {api['title']}")
    else:
        log.info(f"  ✗ No results from any source")

    return result


def main(
    newspapers: str = NEWSPAPERS_CSV,
    matches: str = MATCHES_CSV,
    output: str = "data/all_newspapers_results.json",
    delay: float = 2.0,
    limit: int | None = None,
):
    """
    Process ALL newspapers in newspapers_all_years_updated.csv.

    Can be called directly from a notebook:
        main(limit=3)
    Or from the command line:
        python chronicling_america_scraper.py --limit 3
    """
    if not any("jupyter" in arg.lower() or "ipykernel" in arg.lower() for arg in sys.argv):
        parser = argparse.ArgumentParser(
            description="Look up ALL newspaper entries via Library of Congress APIs"
        )
        parser.add_argument("--newspapers", default=newspapers)
        parser.add_argument("--matches", default=matches)
        parser.add_argument("--output", "-o", default=output)
        parser.add_argument("--delay", "-d", type=float, default=delay)
        parser.add_argument("--limit", "-l", type=int, default=limit)
        args = parser.parse_args()
        newspapers, matches, output, delay, limit = (
            args.newspapers, args.matches, args.output, args.delay, args.limit
        )

    log.info(f"Reading {newspapers} ...")
    newspapers_rows = read_csv(newspapers)
    log.info(f"  {len(newspapers_rows)} total rows")

    log.info(f"Reading {matches} ...")
    matches_rows = read_csv(matches)
    log.info(f"  {len(matches_rows)} total rows")

    issn_to_lccn = build_issn_to_lccn(newspapers_rows)
    log.info(f"  Built ISSN→LCCN lookup with {len(issn_to_lccn)} entries")

    issn_to_master_id = build_issn_to_master_id(matches_rows)
    log.info(f"  Built ISSN→master_id lookup with {len(issn_to_master_id)} entries")

    all_newspapers = newspapers_rows
    log.info(f"Processing ALL {len(all_newspapers)} newspapers from newspapers_all_years_updated.csv")

    if limit:
        all_newspapers = all_newspapers[:limit]
        log.info(f"Processing first {len(all_newspapers)} entries (--limit)")

    have_lccn = sum(1 for r in all_newspapers if r.get("issn", "").strip() in issn_to_lccn)
    have_master_id = sum(1 for r in all_newspapers if issn_to_master_id.get(r.get("issn", "").strip(), ""))
    log.info(f"Of {len(all_newspapers)} newspapers, {have_lccn} have LCCNs via ISSN join, "
             f"{len(all_newspapers) - have_lccn} will need search fallback")
    log.info(f"Of {len(all_newspapers)} newspapers, {have_master_id} already in master.csv (will update), "
             f"{len(all_newspapers) - have_master_id} are new (will insert)")

    results = []
    for i, row in enumerate(all_newspapers, 1):
        log.info(f"——— [{i}/{len(all_newspapers)}] ———")
        result = lookup_newspaper(row, issn_to_lccn, issn_to_master_id, delay)
        results.append(result)

    output_path = Path(output)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    lccn_found = sum(1 for r in results if r["lccn"])
    api_found = sum(1 for r in results if r["api_result"])
    with_master_id = sum(1 for r in results if r["master_id"])
    by_source = {}
    for r in results:
        if r["api_result"]:
            src = r["api_result"]["source"]
            by_source[src] = by_source.get(src, 0) + 1

    log.info("══ Summary ══")
    log.info(f"  Total newspapers processed:   {len(results)}")
    log.info(f"  LCCN found via ISSN:          {lccn_found}")
    log.info(f"  API results found:            {api_found}")
    for src, count in sorted(by_source.items()):
        log.info(f"    {src}: {count}")
    log.info(f"  No results at all:            {len(results) - api_found}")
    log.info(f"  With master_id (updates):     {with_master_id}")
    log.info(f"  Without master_id (inserts):  {len(results) - with_master_id}")
    log.info(f"  Results saved to: {output_path}")


def split_results(
    input_file: str = "data/all_newspapers_results.json",
    output_dir: str = "data/batches",
    batch_size: int = 25,
):
    """
    Reads all_newspapers_results.json and splits it into JSON files of 25 entries each,
    keeping issn, master_id, essay, and notes for Gemini processing.
    """
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)

    Path(output_dir).mkdir(exist_ok=True)

    num_batches = math.ceil(len(data) / batch_size)

    for i in range(num_batches):
        batch = data[i * batch_size : (i + 1) * batch_size]

        slim = []
        for entry in batch:
            issn = entry.get("input", {}).get("issn", "")
            name = entry.get("input", {}).get("name", "")
            town = entry.get("input", {}).get("town", "")
            state = entry.get("input", {}).get("state", "")
            master_id = entry.get("master_id", "")
            api = entry.get("api_result") or {}
            essay = api.get("essay", "")
            notes = api.get("notes", [])

            slim.append({
                "issn": issn,
                "name": name,
                "town": town,
                "state": state,
                "master_id": master_id,
                "date": api.get("date", ""),
                "dates_of_publication": api.get("dates_of_publication", ""),
                "created_published": api.get("created_published", []),
                "essay": essay,
                "notes": notes,
            })

        out_path = Path(output_dir) / f"batch_{i + 1}.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(slim, f, indent=2, ensure_ascii=False)

        print(f"Wrote {len(slim)} entries to {out_path}")

    print(f"\nDone: {len(data)} entries across {num_batches} files in {output_dir}/")


## Step 1: Run LOC API Scraper

This cell queries the Library of Congress API for all 607 newspapers.

**Note**: Set `limit=None` to process all newspapers, or use a small limit for testing (e.g., `limit=5`).

In [3]:
# Run the LOC API scraper
main(limit=None)  # Set limit=5 for testing, limit=None for full run
split_results()

2026-02-14 12:47:24,596 [INFO] Reading data/newspapers_all_years_updated.csv ...
2026-02-14 12:47:24,604 [INFO]   606 total rows
2026-02-14 12:47:24,606 [INFO] Reading data/matches.csv ...
2026-02-14 12:47:24,610 [INFO]   606 total rows
2026-02-14 12:47:24,613 [INFO]   Built ISSN→LCCN lookup with 606 entries
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\samwt\AppData\Local\Python\pythoncore-3.14-64\Lib\logging\__init__.py", line 1154, in emit
    stream.write(msg + self.terminator)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\samwt\AppData\Local\Python\pythoncore-3.14-64\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 43: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen 

Wrote 25 entries to data\batches\batch_1.json
Wrote 25 entries to data\batches\batch_2.json
Wrote 25 entries to data\batches\batch_3.json
Wrote 25 entries to data\batches\batch_4.json
Wrote 25 entries to data\batches\batch_5.json
Wrote 25 entries to data\batches\batch_6.json
Wrote 25 entries to data\batches\batch_7.json
Wrote 25 entries to data\batches\batch_8.json
Wrote 25 entries to data\batches\batch_9.json
Wrote 25 entries to data\batches\batch_10.json
Wrote 25 entries to data\batches\batch_11.json
Wrote 25 entries to data\batches\batch_12.json
Wrote 25 entries to data\batches\batch_13.json
Wrote 25 entries to data\batches\batch_14.json
Wrote 25 entries to data\batches\batch_15.json
Wrote 25 entries to data\batches\batch_16.json
Wrote 25 entries to data\batches\batch_17.json
Wrote 25 entries to data\batches\batch_18.json
Wrote 25 entries to data\batches\batch_19.json
Wrote 25 entries to data\batches\batch_20.json
Wrote 25 entries to data\batches\batch_21.json
Wrote 25 entries to da

## Step 2: Gemini Stage 1 - Extract Timelines

This cell processes all batch files and uses Gemini API to extract editor/publisher timelines from the essays.

**Output**: `data/timelines/timeline_*.json`

**Manual Checkpoint**: Review the timeline files before proceeding to Stage 2.

In [2]:
import glob
import json
from pathlib import Path
from tqdm import tqdm
import time

FORCE_REGENERATE_TIMELINES = False  # Set to True to regenerate
GEMINI_API_DELAY = 5  # Seconds between batch requests
MAX_RETRIES = 5
INITIAL_RETRY_DELAY = 30  # Seconds before first retry (doubles each attempt)


def query_gemini_with_retry(prompt, max_retries=MAX_RETRIES):
    """Query Gemini API with exponential backoff on rate limit errors."""
    delay = INITIAL_RETRY_DELAY
    for attempt in range(max_retries):
        try:
            return query_gemini_api(prompt)
        except Exception as e:
            error_str = str(e).lower()
            if "rate" in error_str or "quota" in error_str or "429" in error_str or "resource" in error_str:
                if attempt < max_retries - 1:
                    print(f"    Rate limited (attempt {attempt + 1}/{max_retries}). Retrying in {delay}s...")
                    time.sleep(delay)
                    delay *= 2  # Exponential backoff
                else:
                    raise RuntimeError(f"Rate limit exceeded after {max_retries} retries: {e}")
            else:
                raise  # Non-rate-limit error, don't retry


def build_batch_prompt(batch_entries):
    """Build a single prompt that asks Gemini to extract timelines for all essays in one call."""
    essay_blocks = []
    for i, entry in enumerate(batch_entries):
        issn = entry.get("issn", "")
        name = entry.get("name", "")
        essay = entry.get("essay", "")
        notes = entry.get("notes", [])

        if not essay:
            continue

        essay_blocks.append(
            f"--- NEWSPAPER {i + 1} ---\n"
            f"ISSN: {issn}\n"
            f"Name: {name}\n"
            f"Essay:\n{essay}\n"
            f"Notes: {', '.join(notes) if notes else 'None'}\n"
        )

    if not essay_blocks:
        return None

    all_essays = "\n".join(essay_blocks)

    prompt = f"""You are given {len(essay_blocks)} historical newspaper essays below. For EACH newspaper, extract all editor and publisher information.

For each editor or publisher mentioned, extract:
- Name
- Years of service (start-end or specific years mentioned)
- Role (editor/publisher/both)

Be extremely careful about dates. If you cannot be reasonably confident about who the owner or editor was in a given year, do not report it. Only include information that is clearly supported by the text.

For each newspaper, write a plain text timeline of ownership/editorial changes in chronological order. Separate each newspaper with a line like:

=== ISSN: [issn] | [name] ===

followed by the timeline entries. Do not use JSON or markdown formatting.

{all_essays}"""
    return prompt


def extract_timelines_from_batches(
    batch_dir: str = "data/batches",
    output_dir: str = "data/timelines",
    force: bool = False
):
    """
    Stage 1: Extract editor/publisher timelines from essays using Gemini API.
    Saves the raw Gemini response directly to the output file.
    """
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    batch_files = sorted(glob.glob(f"{batch_dir}/batch_*.json"))
    print(f"Found {len(batch_files)} batch files")

    error_log = output_path / "errors.txt"

    for batch_file in tqdm(batch_files, desc="Processing batches"):
        batch_num = Path(batch_file).stem  # e.g., "batch_1"

        # Skip if output already exists (allows resuming)
        output_file = output_path / f"timeline_{batch_num.replace('batch_', '')}.txt"
        if output_file.exists() and not force:
            print(f"  Skipping {batch_num}: output already exists")
            continue

        with open(batch_file, "r", encoding="utf-8") as f:
            batch_entries = json.load(f)

        # Build one prompt for the entire batch
        prompt = build_batch_prompt(batch_entries)

        if prompt is None:
            print(f"  Skipping {batch_num}: no essays found")
            # Save an empty marker so we know this batch was processed
            with open(output_file, "w", encoding="utf-8") as f:
                f.write("")
            continue

        try:
            print(f"  Sending {batch_num} ({len(batch_entries)} newspapers) as single prompt...")
            response = query_gemini_with_retry(prompt)

            # Save the raw Gemini response directly
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(response)

            print(f"  Saved raw response to {output_file}")
            time.sleep(GEMINI_API_DELAY)

        except Exception as e:
            print(f"  ERROR processing {batch_num}: {e}")
            with open(error_log, "a", encoding="utf-8") as f:
                f.write(f"{batch_num}: {e}\n")

    print(f"\nStage 1 complete! Review timelines in {output_dir}/ before proceeding to Stage 2.")


# Run Stage 1
extract_timelines_from_batches(force=FORCE_REGENERATE_TIMELINES)

Found 25 batch files


Processing batches:   0%|          | 0/25 [00:00<?, ?it/s]2026-02-14 16:30:30,595 [INFO] AFC is enabled with max remote calls: 10.


  Skipping batch_1: output already exists
  Skipping batch_10: output already exists
  Skipping batch_11: output already exists
  Skipping batch_12: output already exists
  Skipping batch_13: output already exists
  Skipping batch_14: output already exists
  Skipping batch_15: output already exists
  Skipping batch_16: output already exists
  Skipping batch_17: output already exists
  Skipping batch_18: output already exists
  Skipping batch_19: output already exists
  Skipping batch_2: output already exists
  Skipping batch_20: output already exists
  Skipping batch_21: output already exists
  Skipping batch_22: output already exists
  Skipping batch_23: output already exists
  Skipping batch_24: output already exists
  Skipping batch_25: output already exists
  Skipping batch_3: output already exists
  Skipping batch_4: output already exists
  Skipping batch_5: output already exists
  Skipping batch_6: output already exists
  Skipping batch_7: output already exists
  Sending batch_8 

2026-02-14 16:31:10,081 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"


  Saved raw response to data\timelines\timeline_8.txt


Processing batches:  96%|█████████▌| 24/25 [00:44<00:01,  1.85s/it]2026-02-14 16:31:15,110 [INFO] AFC is enabled with max remote calls: 10.


  Sending batch_9 (25 newspapers) as single prompt...


2026-02-14 16:36:13,407 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"


  Saved raw response to data\timelines\timeline_9.txt


Processing batches: 100%|██████████| 25/25 [05:47<00:00, 13.92s/it]


Stage 1 complete! Review timelines in data/timelines/ before proceeding to Stage 2.





## Manual Checkpoint 1

**Review the timeline files in `data/timelines/` before proceeding.**

Check that the extracted timelines look reasonable. If you need to regenerate, set `FORCE_REGENERATE_TIMELINES=True` in the cell above and re-run.

## Step 3: Gemini Stage 2 - Structure Timelines into JSON

This cell converts the timeline text into structured year-by-year JSON matching the master.csv schema.

**Output**: `data/essay_derived_rows/essay_derived_rows_*.json`

**Manual Checkpoint**: Review the structured JSON files before merging to master.csv.

In [2]:
import glob
import json
from pathlib import Path
from tqdm import tqdm
import time


FORCE_REGENERATE_STRUCTURED = False  # Set to True to regenerate

def timelines_to_structured_json(
    timeline_dir: str = "data/timelines",
    output_dir: str = "data/essay_derived_rows",
    force: bool = False
):
    """
    Stage 2: Send raw timeline text files to Gemini for structured JSON extraction.
    Processes each .txt file in timeline_dir as a single batch and saves a corresponding .json file in output_dir.
    """
    output_path = Path(output_dir)
    
    if output_path.exists() and not force:
        print(f"Structured rows already exist in {output_dir}. Set FORCE_REGENERATE_STRUCTURED=True to regenerate.")
        return
    
    output_path.mkdir(exist_ok=True)
    
    # Find all timeline text files
    timeline_files = sorted(glob.glob(f"{timeline_dir}/timeline_*.txt"))
    print(f"Found {len(timeline_files)} timeline files in {timeline_dir}")
    
    for timeline_file in tqdm(timeline_files, desc="Processing files"):
        file_stem = Path(timeline_file).stem  # e.g., "timeline_1"
        batch_num = file_stem.replace("timeline_", "")
        
        # Skip if output already exists and not forcing
        output_file = output_path / f"essay_derived_rows_{batch_num}.json"
        if output_file.exists() and not force:
            print(f"  Skipping {file_stem}: output already exists")
            continue
        
        # Read the raw text file
        with open(timeline_file, "r", encoding="utf-8") as f:
            raw_text = f.read()
        
        prompt = f"""Convert these editor/publisher timelines into structured year-by-year data for years 1869-1890.

Each newspaper is delimited by a === ISSN: ... === header line. Return a JSON array with one object per newspaper.

Return ONLY valid JSON (no markdown formatting, no code blocks) in this exact format:
[
  {{
    "newspaper_name": "name from the header",
    "issn": "issn from the header",
    "1869 editor": "editor name or empty string",
    "1869 publisher": "publisher name or empty string",
    "1869 frequency": "Daily/Weekly/etc or empty string",
    ... (repeat for each year 1869-1890)
    "data_source": "essay_derived"
  }},
  ...
]

Timelines:
{raw_text}

Important:
- Use empty strings "" for unknown values, not null or "Unknown"
- Extract the ISSN and newspaper name from each === header line
- Publisher/editor change years should be comma-separated (e.g., "1870,1876,1880")
- Return ONLY the JSON array, nothing else
- only use explicitly stated date ranges; do not infer transitions."""
        
        try:
            response = query_gemini_api(prompt)
            
            # Clean up response (remove markdown code blocks if present)
            response = response.strip()
            if response.startswith("```json"):
                response = response[7:]
            if response.startswith("```"):
                response = response[3:]
            if response.endswith("```"):
                response = response[:-3]
            response = response.strip()
            
            structured_rows = json.loads(response)
            print(f"  {file_stem}: parsed {len(structured_rows)} rows")
            
            # Save structured rows for this batch
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(structured_rows, f, indent=2, ensure_ascii=False)
            
            print(f"  Saved {len(structured_rows)} structured rows to {output_file}")
            time.sleep(GEMINI_API_DELAY)
            
        except json.JSONDecodeError as e:
            log.error(f"  Failed to parse JSON for {file_stem}: {e}")
            log.error(f"  Response was: {response[:200]}...")
        except Exception as e:
            log.error(f"  Failed for {file_stem}: {e}")
    
    print(f"\nDone! Review structured data in {output_dir}/ before merging to master.csv.")

# Run Stage 2
timelines_to_structured_json(force=True)

Found 25 timeline files in data/timelines


Processing files:   0%|          | 0/25 [00:00<?, ?it/s]

2026-02-14 16:45:23,050 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:46:16,808 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_1: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_1.json


Processing files:   4%|▍         | 1/25 [01:08<27:33, 68.90s/it]2026-02-14 16:46:31,956 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:47:14,436 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_10: parsed 22 rows
  Saved 22 structured rows to data\essay_derived_rows\essay_derived_rows_10.json


Processing files:   8%|▊         | 2/25 [02:06<23:50, 62.20s/it]2026-02-14 16:47:29,482 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:47:30,410 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 503 Service Unavailable"
2026-02-14 16:47:30,420 [ERROR] Gemini API error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'This model is currently experiencing high demand. Spikes in demand are usually temporary. Please try again later.', 'status': 'UNAVAILABLE'}}
2026-02-14 16:47:30,423 [ERROR]   Failed for timeline_11: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'This model is currently experiencing high demand. Spikes in demand are usually temporary. Please try again later.', 'status': 'UNAVAILABLE'}}
Processing files:  12%|█▏        | 3/25 [02:07<12:33, 34.24s/it]2026-02-14 16:47:30,441 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:48:10,859 [INFO] HTTP Request: POST

  timeline_12: parsed 24 rows
  Saved 24 structured rows to data\essay_derived_rows\essay_derived_rows_12.json


Processing files:  16%|█▌        | 4/25 [03:02<14:54, 42.61s/it]2026-02-14 16:48:25,884 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:49:06,571 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_13: parsed 24 rows
  Saved 24 structured rows to data\essay_derived_rows\essay_derived_rows_13.json


Processing files:  20%|██        | 5/25 [03:58<15:46, 47.34s/it]2026-02-14 16:49:21,601 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:49:56,034 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_14: parsed 24 rows
  Saved 24 structured rows to data\essay_derived_rows\essay_derived_rows_14.json


Processing files:  24%|██▍       | 6/25 [04:48<15:13, 48.06s/it]2026-02-14 16:50:11,074 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:50:53,784 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_15: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_15.json


Processing files:  28%|██▊       | 7/25 [05:45<15:22, 51.23s/it]2026-02-14 16:51:08,828 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:51:52,768 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_16: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_16.json


Processing files:  32%|███▏      | 8/25 [06:44<15:12, 53.70s/it]2026-02-14 16:52:07,805 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:52:53,192 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_17: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_17.json


Processing files:  36%|███▌      | 9/25 [07:45<14:52, 55.80s/it]2026-02-14 16:53:08,225 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:53:43,065 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_18: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_18.json


Processing files:  40%|████      | 10/25 [08:35<13:29, 53.97s/it]2026-02-14 16:53:58,101 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:54:39,674 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_19: parsed 24 rows
  Saved 24 structured rows to data\essay_derived_rows\essay_derived_rows_19.json


Processing files:  44%|████▍     | 11/25 [09:31<12:46, 54.78s/it]2026-02-14 16:54:54,704 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:55:37,566 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_2: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_2.json


Processing files:  48%|████▊     | 12/25 [10:29<12:04, 55.72s/it]2026-02-14 16:55:52,588 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:56:38,171 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_20: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_20.json


Processing files:  52%|█████▏    | 13/25 [11:30<11:26, 57.20s/it]2026-02-14 16:56:53,189 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:57:25,175 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_21: parsed 20 rows
  Saved 20 structured rows to data\essay_derived_rows\essay_derived_rows_21.json


Processing files:  56%|█████▌    | 14/25 [12:17<09:55, 54.12s/it]2026-02-14 16:57:40,204 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:58:19,655 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_22: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_22.json


Processing files:  60%|██████    | 15/25 [13:11<09:02, 54.23s/it]2026-02-14 16:58:34,685 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 16:59:07,181 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_23: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_23.json


Processing files:  64%|██████▍   | 16/25 [13:59<07:49, 52.21s/it]2026-02-14 16:59:22,210 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:00:04,743 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_24: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_24.json


Processing files:  68%|██████▊   | 17/25 [14:57<07:11, 53.92s/it]2026-02-14 17:00:20,102 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:00:31,146 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_25: parsed 6 rows
  Saved 6 structured rows to data\essay_derived_rows\essay_derived_rows_25.json


Processing files:  72%|███████▏  | 18/25 [15:23<05:18, 45.55s/it]2026-02-14 17:00:46,167 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:01:14,461 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_3: parsed 21 rows
  Saved 21 structured rows to data\essay_derived_rows\essay_derived_rows_3.json


Processing files:  76%|███████▌  | 19/25 [16:06<04:29, 44.88s/it]2026-02-14 17:01:29,489 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:02:18,202 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_4: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_4.json


Processing files:  80%|████████  | 20/25 [17:10<04:12, 50.54s/it]2026-02-14 17:02:33,235 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:03:10,800 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_5: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_5.json


Processing files:  84%|████████▍ | 21/25 [18:02<03:24, 51.16s/it]2026-02-14 17:03:25,844 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:03:33,325 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 503 Service Unavailable"
2026-02-14 17:03:33,331 [ERROR] Gemini API error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'This model is currently experiencing high demand. Spikes in demand are usually temporary. Please try again later.', 'status': 'UNAVAILABLE'}}
2026-02-14 17:03:33,334 [ERROR]   Failed for timeline_6: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'This model is currently experiencing high demand. Spikes in demand are usually temporary. Please try again later.', 'status': 'UNAVAILABLE'}}
Processing files:  88%|████████▊ | 22/25 [18:10<01:54, 38.06s/it]2026-02-14 17:03:33,340 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:04:19,917 [INFO] HTTP Request: POS

  timeline_7: parsed 25 rows
  Saved 25 structured rows to data\essay_derived_rows\essay_derived_rows_7.json


Processing files:  92%|█████████▏| 23/25 [19:11<01:30, 45.12s/it]2026-02-14 17:04:34,950 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:05:18,200 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_8: parsed 24 rows
  Saved 24 structured rows to data\essay_derived_rows\essay_derived_rows_8.json


Processing files:  96%|█████████▌| 24/25 [20:10<00:49, 49.07s/it]2026-02-14 17:05:33,233 [INFO] AFC is enabled with max remote calls: 10.
2026-02-14 17:06:12,737 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


  timeline_9: parsed 23 rows
  Saved 23 structured rows to data\essay_derived_rows\essay_derived_rows_9.json


Processing files: 100%|██████████| 25/25 [21:05<00:00, 50.60s/it]


Done! Review structured data in data/essay_derived_rows/ before merging to master.csv.





## Manual Checkpoint 2

**Review the structured JSON files in `data/essay_derived_rows/` before proceeding.**

Check that the structured data looks correct and matches the master.csv schema. If you need to regenerate, set `FORCE_REGENERATE_STRUCTURED=True` in the cell above and re-run.

## Step 4: Validate master_id Mapping

Before merging, let's verify that master_id in matches.csv correctly maps to master.csv rows.

In [3]:
import pandas as pd

# Validate master_id mapping
matches_sample = pd.read_csv("data/matches.csv", dtype=str).head(10)
master_sample = pd.read_csv("data/master.csv", dtype=str)

print("=== Validating master_id mapping ===\n")

for _, row in matches_sample.iterrows():
    issn = row["issn"]
    master_id = row["master_id"]
    master_name = row["master_name"]

    if master_id:
        # Check if master_id is an index
        try:
            mid = int(master_id)
            if mid < len(master_sample):
                master_row = master_sample.iloc[mid]
                print(f"ISSN: {issn}")
                print(f"  master_name in matches.csv: {master_name}")
                print(f"  newspaper_name at index {mid}: {master_row['newspaper_name']}")
                print(f"  Match: {master_name.lower() == master_row['newspaper_name'].lower()}")
                print()
        except ValueError:
            print(f"master_id '{master_id}' is not numeric")

print("\nValidation complete. master_id appears to be a row index (0-based).")

=== Validating master_id mapping ===

ISSN: 1941-0646
  master_name in matches.csv: Tribune
  newspaper_name at index 35245: Tribune
  Match: True

ISSN: 2331-9968
  master_name in matches.csv: Evening Star
  newspaper_name at index 11632: Evening Star
  Match: True

ISSN: 1946-6153
  master_name in matches.csv: Gazette
  newspaper_name at index 45395: Gazette
  Match: True

ISSN: 2333-8547
  master_name in matches.csv: Intelligencer
  newspaper_name at index 46684: Intelligencer
  Match: True

ISSN: 1941-3033
  master_name in matches.csv: Salt Lake Herald
  newspaper_name at index 45159: Salt Lake Herald
  Match: True

ISSN: 2474-3224
  master_name in matches.csv: Herald
  newspaper_name at index 35086: Herald
  Match: True

ISSN: 2331-3285
  master_name in matches.csv: Herald
  newspaper_name at index 44292: Herald
  Match: True

ISSN: 2168-0116
  master_name in matches.csv: GLOBE
  newspaper_name at index 26503: GLOBE
  Match: True

ISSN: 2157-1260
  master_name in matches.csv: Disp

## Step 6: Final Validation

Verify ISSN matching and matches.csv consistency.

In [3]:
import pandas as pd

# Verify that essay_derived entries can match back to newspapers_all_years_updated.csv
newspapers_df = pd.read_csv("data/newspapers_all_years_updated.csv", dtype=str)
master_df = pd.read_csv("data/master.csv", dtype=str, keep_default_na=False)
matches_df = pd.read_csv("data/matches.csv", dtype=str, keep_default_na=False)

# Check essay-derived rows
essay_derived = master_df[master_df["data_source"].str.contains("essay", na=False)]
unmatched = essay_derived[~essay_derived["issn"].isin(newspapers_df["issn"])]

print("=== ISSN Matching Validation ===")
print(f"Essay-derived rows in master.csv: {len(essay_derived)}")
print(f"Successfully matched to newspapers_all_years_updated.csv by ISSN: {len(essay_derived) - len(unmatched)}")
print(f"Unmatched (need manual review): {len(unmatched)}")

if len(unmatched) > 0:
    unmatched[["issn", "newspaper_name", "town", "state"]].to_csv(
        "data/unmatched_issn_review.csv", index=False
    )
    print("Saved unmatched entries to data/unmatched_issn_review.csv")

# Verify matches.csv updates
print("\n=== matches.csv Consistency Check ===")
# Check if any new newspapers should be added to matches.csv
new_issns = set(essay_derived["issn"]) - set(matches_df["issn"])
print(f"ISSNs in essay_derived but not in matches.csv: {len(new_issns)}")

if len(new_issns) > 0:
    print("\nConsider updating matches.csv to include these new entries:")
    for issn in list(new_issns)[:10]:  # Show first 10
        row = essay_derived[essay_derived["issn"] == issn].iloc[0]
        print(f"  {issn}: {row['newspaper_name']} ({row['town']}, {row['state']})")

print("\n=== Validation Complete ===")

=== ISSN Matching Validation ===
Essay-derived rows in master.csv: 137
Successfully matched to newspapers_all_years_updated.csv by ISSN: 136
Unmatched (need manual review): 1
Saved unmatched entries to data/unmatched_issn_review.csv

=== matches.csv Consistency Check ===
ISSNs in essay_derived but not in matches.csv: 1

Consider updating matches.csv to include these new entries:
  2326-7747: new ulm weekly review (, )

=== Validation Complete ===
