In [None]:
# OCR parser Script 1877 - 1890

import fitz  # PyMuPDF
import os


def split_merged_line(line, midpoint):
    """
    Split a merged line into left and right column content by examining spans.
    """
    
    left_spans = []
    right_spans = []
    
    for span in line['spans']:
        span_bbox = span["bbox"]
        span_x0 = span_bbox[0]
        span_x1 = span_bbox[2]
        span_center = (span_x0 + span_x1) / 2
        span_text = span["text"]
        
        if not span_text.strip():
            continue
        
        # Classify span by its center position
        if span_center < midpoint:
            left_spans.append(span_text)
        else:
            right_spans.append(span_text)
    
    left_text = "".join(left_spans).strip()
    right_text = "".join(right_spans).strip()
    
    return left_text, right_text


def extract_columns_v11(
    pages_to_scan,
    input_dir=r"data\Newspaper Directories",
    output_dir=r"data\Newspaper Directory Text",
    file_to_start_at=None
):
    """
    Two-column extraction with dynamic boundary detection per page.
    """
    
    os.makedirs(output_dir, exist_ok=True)
    years = sorted(pages_to_scan.keys())
    
    print(f"Will process years: {years}")
    
    if file_to_start_at is not None:
        years = [y for y in years if y >= file_to_start_at]
        print(f"Starting from year {file_to_start_at}")
    
    for year in years:
        start_page, end_page = pages_to_scan[year]
        
        pdf_path = os.path.join(input_dir, f"Rowell {year}.pdf")
        if not os.path.exists(pdf_path):
            print(f"WARNING: {pdf_path} not found, skipping...")
            continue
        
        output_path = os.path.join(output_dir, f"Rowell {year} - v13.txt")
        
        if os.path.exists(output_path):
            print(f"SKIPPING {year}: Output already exists")
            continue
        
        print(f"\n{'='*60}")
        print(f"Processing: Rowell {year}.pdf (pages {start_page}-{end_page})")
        print(f"{'='*60}")
        
        pdf = fitz.open(pdf_path)
        all_text = []
        
        for page_num in range(start_page - 1, min(end_page, len(pdf))):
            page = pdf[page_num]
            page_width = page.rect.width
            
            page_text = process_page_v11(page, page_width, page_num + 1)
            all_text.append(page_text)
            
            pages_done = page_num - (start_page - 1) + 1
            total_pages = end_page - start_page + 1
            if pages_done % 50 == 0 or pages_done == total_pages:
                print(f"  {pages_done}/{total_pages} pages ({100*pages_done/total_pages:.1f}%)")
        
        pdf.close()
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(all_text))
        
        print(f"  SAVED: {output_path}")
    
    print(f"\n{'='*60}")
    print("Done!")


def process_page_v11(page, page_width, page_num):
    """
    Process page with dynamic column boundary detection.
    """
    
    text_dict = page.get_text("dict")
    midpoint = page_width / 2
    
    # Collect all lines
    all_lines = []
    for block in text_dict["blocks"]:
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            line_bbox = line["bbox"]
            line_text = "".join(span["text"] for span in line["spans"]).strip()
            if not line_text:
                continue
            all_lines.append({
                'text': line_text,
                'x0': line_bbox[0],
                'x1': line_bbox[2],
                'y': line_bbox[1],
                'spans': line["spans"],  # Keep spans for potential splitting
            })
    
    if not all_lines:
        return f"--- Page {page_num} ---\n"
    
    # Find the column boundary by analyzing line positions
    clear_left_x1 = []
    for line in all_lines:
        if line['x1'] < midpoint * 0.95 and line['x0'] < midpoint * 0.5:
            clear_left_x1.append(line['x1'])
    
    clear_right_x0 = []
    for line in all_lines:
        if line['x0'] > midpoint * 1.05:
            clear_right_x0.append(line['x0'])
    
    if clear_left_x1 and clear_right_x0:
        left_edge = max(clear_left_x1)
        right_edge = min(clear_right_x0)
        if right_edge > left_edge:
            boundary = (left_edge + right_edge) / 2
        else:
            boundary = right_edge - 5
    else:
        boundary = midpoint
    
    # Classify lines using the computed boundary
    left_lines = []
    right_lines = []


    # Threshold for "line extends into right column territory"
    right_extent_threshold = midpoint * 1.4
    
    # Threshold for "short line" - if a line is shorter than the gap between midpoint and right_extent_threshold
    short_line_threshold = right_extent_threshold - midpoint + 10  # e.g., 252.8 - 180.6 = 72.2
    
    for line in all_lines:
        line_width = line['x1'] - line['x0']
        
        # Check if this line spans both columns (merged line)
        # Criteria: starts in left column territory AND extends well past midpoint
        starts_left = line['x0'] < midpoint * 0.8
        extends_right = line['x1'] > midpoint * 1.3
        is_wide = line_width > page_width * 0.5
        
        if starts_left and extends_right and is_wide:
            # This is likely a merged line - split it by examining spans
            left_text, right_text = split_merged_line(line, midpoint)
            
            if left_text:
                left_lines.append({
                    'text': left_text,
                    'y': line['y'],
                })
            if right_text:
                right_lines.append({
                    'text': right_text,
                    'y': line['y'],
                })
        # If line STARTS past boundary, it's right column
        elif line['x0'] >= boundary:
            right_lines.append(line)
        # If line STARTS left of boundary BUT EXTENDS well into right territory,
        # it's likely right column content with a slightly left-shifted start due to slant
        elif line['x1'] > right_extent_threshold and line['x0'] > midpoint * 0.9:
            right_lines.append(line)
        # Short lines that start before boundary but extend past it - likely right column
        # Must have at least 40% of line width past the boundary
        elif (line_width < short_line_threshold 
              and line['x0'] < boundary 
              and line['x1'] > boundary
              and (line['x1'] - boundary) / line_width >= 0.35):
            right_lines.append(line)
        else:
            left_lines.append(line)
    
    # Sort each column by y-position
    left_lines.sort(key=lambda l: l['y'])
    right_lines.sort(key=lambda l: l['y'])

    def fix_same_row_order(lines):
        if len(lines) < 2:
            return lines
        
        i = 0
        while i < len(lines) - 1:
            curr = lines[i]
            next_line = lines[i + 1]
            
            # Skip if either line is missing x0/x1 (e.g., from split lines)
            if 'x0' not in curr or 'x0' not in next_line or 'x1' not in next_line:
                i += 1
                continue
            
            # If lines are on same visual row (within 2 units of Y)
            # and next line starts to the LEFT of current line
            # and they are horizontally adjacent (curr x0 is within 20 units of next_line x1)
            if (abs(curr['y'] - next_line['y']) <= 2 
                and next_line['x0'] < curr['x0']
                and abs(curr['x0'] - next_line['x1']) <= 20):
                # Swap them
                lines[i], lines[i + 1] = lines[i + 1], lines[i]
                # Check if we need to swap backwards too
                if i > 0:
                    i -= 1
                    continue
            i += 1
        
        return lines
    
    left_lines = fix_same_row_order(left_lines)
    right_lines = fix_same_row_order(right_lines)
    
    # Build output
    result = [f"--- Page {page_num} ---"]
    for line in left_lines:
        result.append(line['text'])
    for line in right_lines:
        result.append(line['text'])
    
    return '\n'.join(result)


def debug_page(pdf_path, page_num, search_text=None):
    """Debug: show boundary detection and line classifications."""
    
    pdf = fitz.open(pdf_path)
    page = pdf[page_num - 1]
    page_width = page.rect.width
    midpoint = page_width / 2
    
    text_dict = page.get_text("dict")
    
    all_lines = []
    for block in text_dict["blocks"]:
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            line_bbox = line["bbox"]
            line_text = "".join(span["text"] for span in line["spans"]).strip()
            if not line_text:
                continue
            all_lines.append({
                'text': line_text,
                'x0': line_bbox[0],
                'x1': line_bbox[2],
                'y': line_bbox[1],
                'spans': line["spans"],
            })
    
    # Find boundary
    clear_left_x1 = []
    for line in all_lines:
        if line['x1'] < midpoint * 0.95 and line['x0'] < midpoint * 0.5:
            clear_left_x1.append(line['x1'])
    
    clear_right_x0 = []
    for line in all_lines:
        if line['x0'] > midpoint * 1.05:
            clear_right_x0.append(line['x0'])
    
    if clear_left_x1 and clear_right_x0:
        left_edge = max(clear_left_x1)
        right_edge = min(clear_right_x0)
        if right_edge > left_edge:
            boundary = (left_edge + right_edge) / 2
        else:
            boundary = right_edge - 5
    else:
        boundary = midpoint
    
    # Thresholds
    right_extent_threshold = midpoint * 1.4
    short_line_threshold = right_extent_threshold - midpoint + 10
    
    print(f"Page {page_num}")
    print(f"  Page width: {page_width:.1f}")
    print(f"  Midpoint: {midpoint:.1f}")
    print(f"  Left col max x1: {max(clear_left_x1):.1f}" if clear_left_x1 else "  No clear left lines")
    print(f"  Right col min x0: {min(clear_right_x0):.1f}" if clear_right_x0 else "  No clear right lines")
    print(f"  Computed boundary: {boundary:.1f}")
    print(f"  Right extent threshold: {right_extent_threshold:.1f}")
    print(f"  Short line threshold: {short_line_threshold:.1f}")
    print("="*100)
    
    # Show lines sorted by y
    all_lines.sort(key=lambda l: l['y'])
    
    print(f"{'Y':>6} {'X0':>6} {'X1':>6} {'WIDTH':>6} {'%PAST':>6} {'COL':>6} | Text (first 60 chars)")
    print("-"*100)
    
    for line in all_lines:
        line_width = line['x1'] - line['x0']
        
        # Calculate percent past boundary (if applicable)
        if line['x1'] > boundary and line_width > 0:
            pct_past = (line['x1'] - boundary) / line_width * 100
        else:
            pct_past = 0
        
        # Check for merged line
        starts_left = line['x0'] < midpoint * 0.8
        extends_right = line['x1'] > midpoint * 1.3
        is_wide = line_width > page_width * 0.5
        
        if starts_left and extends_right and is_wide:
            col = "SPLIT"
            left_text, right_text = split_merged_line(line, midpoint)
            text_preview = f"L:[{left_text[:25]}] R:[{right_text[:25]}]"
        elif line['x0'] >= boundary:
            col = "RIGHT"
            text_preview = line['text'][:60]
        elif line['x1'] > right_extent_threshold and line['x0'] > midpoint * 0.9:
            col = "RIGHT"
            text_preview = line['text'][:60]
        elif (line_width < short_line_threshold 
              and line['x0'] < boundary 
              and line['x1'] > boundary
              and (line['x1'] - boundary) / line_width >= 0.35):
            col = "RIGHT*"
            text_preview = line['text'][:60]
        else:
            col = "LEFT"
            text_preview = line['text'][:60]
        
        # Highlight search text if provided
        marker = ">>>" if search_text and search_text.upper() in line['text'].upper() else "   "
        print(f"{marker} {line['y']:6.1f} {line['x0']:6.1f} {line['x1']:6.1f} {line_width:6.1f} {pct_past:5.1f}% {col:>6} | {text_preview}")
    
    print(f"\nTotal lines: {len(all_lines)}")
    
    pdf.close()


def test_page(pdf_path, page_num):
    """Test on a single page."""
    
    pdf = fitz.open(pdf_path)
    page = pdf[page_num - 1]
    
    result = process_page_v11(page, page.rect.width, page_num)
    
    print(result)
    
    pdf.close()


def find_text_in_pdf(pdf_path, search_text, start_page=1, end_page=None):
    """Find which page(s) contain specific text."""
    
    pdf = fitz.open(pdf_path)
    if end_page is None:
        end_page = len(pdf)
    
    for page_num in range(start_page - 1, min(end_page, len(pdf))):
        page = pdf[page_num]
        text = page.get_text("text")
        if search_text.upper() in text.upper():
            print(f"Found '{search_text}' on page {page_num + 1}")
    
    pdf.close()


# USAGE

pages_to_scan = {
    1877: (17, 341),
    1878: (13, 341),
    1879: (26, 374),
    1880: (23, 404),
    1882: (15, 476),
    1883: (15, 452),
    1884: (15, 524),
    1885: (35, 590),
    1890: (63, 731),
}

# Debug specific pages:
debug_page(r"data\Newspaper Directories\Rowell 1890.pdf", 375, "FOLSOM")

# Test specific pages:
test_page(r"data\Newspaper Directories\Rowell 1890.pdf", 375)

# Run full extraction:
# extract_columns_v11(pages_to_scan)

Page 375
  Page width: 374.4
  Midpoint: 187.2
  Left col max x1: 165.7
  Right col min x0: 196.6
  Computed boundary: 181.2
  Right extent threshold: 262.1
  Short line threshold: 84.9
     Y     X0     X1  WIDTH  %PAST    COL | Text (first 60 chars)
----------------------------------------------------------------------------------------------------
      28.2   57.1   70.3   13.2   0.0%   LEFT | 368
      28.5  146.9  241.5   94.6  63.8%   LEFT | GEO . P. ROWELL & CO'S
      43.2  104.2  146.2   42.0   0.0%   LEFT | MINNESOTA.
      45.0  244.1  285.7   41.6 251.3%  RIGHT | MINNESOTA.
      55.9  197.0  333.4  136.4 111.6%  RIGHT | publishers ; circulation-daily H1 *, week-
      56.7   61.9  192.6  130.7   8.7%   LEFT | water power, which is used in manufact-
      61.7  197.3  333.4  136.2 111.8%  RIGHT | ly F 2 **; does not insert advertisements in
      63.6   62.6  127.7   65.0   0.0%   LEFT | ures of various kinds.
      67.1  196.5  244.1   47.6 132.2%  RIGHT | the Minnehaha.


In [None]:
# page scanner 1869 - 1876

# %% Imports and setup
import fitz  # pymupdf
import json
import re
from pathlib import Path
from datetime import datetime

# Page ranges for each year: {year: (start_page, end_page)}
# Note: These are 1-indexed page numbers (as they appear in the PDF)
pages_to_scan = {
    1869: (15, 187),
    1871: (21, 251),
    1872: (15, 290),
    1873: (35, 238),
    1876: (25, 260),
}

# %% Configuration - EDIT THESE
INPUT_FOLDER = Path("data/Newspaper Directories")
OUTPUT_FOLDER = Path("data/Newspaper Directory text")

PROGRESS_FILE = Path("extract_progress.json")

# %% Functions
def load_progress():
    if PROGRESS_FILE.exists():
        return set(json.loads(PROGRESS_FILE.read_text()))
    return set()

def save_progress(completed: set):
    PROGRESS_FILE.write_text(json.dumps(list(completed)))

def clear_progress():
    """Call this if you want to start fresh."""
    if PROGRESS_FILE.exists():
        PROGRESS_FILE.unlink()
        print("Progress cleared. Will reprocess all files.")
    else:
        print("No progress file found.")

def extract_year_from_filename(filename: str) -> int | None:
    """Extract the year (18XX) from a filename like 'Rowell 1871.pdf'."""
    match = re.search(r'18\d{2}', filename)
    if match:
        return int(match.group())
    return None

def extract_text_from_pdf(pdf_path: Path, output_path: Path, start_page: int, end_page: int):
    """Extract embedded text from PDF.
    
    Args:
        pdf_path: Path to the PDF file
        output_path: Path for the output text file
        start_page: First page to process (1-indexed)
        end_page: Last page to process (1-indexed, inclusive)
    """
    print(f"\nProcessing: {pdf_path.name}")
    
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    
    # Convert to 0-indexed and clamp to valid range
    start_idx = max(0, start_page - 1)
    end_idx = min(total_pages, end_page)
    
    print(f"  Total pages in PDF: {total_pages}")
    print(f"  Extracting pages {start_idx + 1} to {end_idx} ({end_idx - start_idx} pages)")
    
    all_text = []
    
    for page_num in range(start_idx, end_idx):
        text = doc[page_num].get_text()
        all_text.append(f"--- Page {page_num + 1} ---\n{text}")
    
    doc.close()
    
    # Save output
    output_path.write_text("\n\n".join(all_text), encoding="utf-8")
    print(f"  Saved: {output_path}")

# %% Run extraction - THIS IS THE MAIN CELL
INPUT_FOLDER.mkdir(exist_ok=True)
OUTPUT_FOLDER.mkdir(exist_ok=True)

# Find PDFs and check progress
pdfs = sorted(INPUT_FOLDER.glob("*.pdf"))
completed = load_progress()

# Filter to only PDFs we have page ranges for
valid_pdfs = []
skipped_no_range = []

for p in pdfs:
    year = extract_year_from_filename(p.name)
    if year and year in pages_to_scan:
        if p.name not in completed:
            valid_pdfs.append((p, year))
    else:
        skipped_no_range.append(p.name)

skipped_completed = [p.name for p in pdfs if p.name in completed]

print(f"Found {len(pdfs)} PDF(s) total")
print(f"  Already completed: {len(skipped_completed)}")
print(f"  Pending (with page ranges): {len(valid_pdfs)}")
if skipped_no_range:
    print(f"  Skipped (no page range defined): {len(skipped_no_range)}")
    for name in skipped_no_range:
        print(f"    - {name}")

if not valid_pdfs:
    print("\nNo files to process. Run clear_progress() if you want to start over.")
else:
    print(f"\nWill process:")
    for p, year in valid_pdfs:
        start, end = pages_to_scan[year]
        print(f"  - {p.name} (pages {start}-{end})")
    
    # Process all pending PDFs
    start_time = datetime.now()
    
    for i, (pdf_path, year) in enumerate(valid_pdfs, 1):
        print(f"\n{'='*60}")
        print(f"[{i}/{len(valid_pdfs)}]", end=" ")
        
        output_path = OUTPUT_FOLDER / f"{pdf_path.stem}.txt"
        start_page, end_page = pages_to_scan[year]
        
        extract_text_from_pdf(pdf_path, output_path, start_page=start_page, end_page=end_page)
        
        # Mark as completed and save progress immediately
        completed.add(pdf_path.name)
        save_progress(completed)
        print(f"  ✓ Progress saved")
    
    # Summary
    elapsed = datetime.now() - start_time
    print(f"\n{'='*60}")
    print(f"Complete! Processed {len(valid_pdfs)} file(s) in {elapsed}")
    print(f"Output saved to '{OUTPUT_FOLDER}' folder")

# %% Utility: Clear progress and start over (run manually if needed)
# clear_progress()

Found 14 PDF(s) total
  Already completed: 0
  Pending (with page ranges): 13
  Skipped (no page range defined): 1
    - Rowell 1869.pdf

Will process:
  - Rowell 1871.pdf (pages 21-251)
  - Rowell 1872.pdf (pages 15-290)
  - Rowell 1873.pdf (pages 35-238)
  - Rowell 1876.pdf (pages 25-260)
  - Rowell 1877.pdf (pages 17-341)
  - Rowell 1878.pdf (pages 13-341)
  - Rowell 1879.pdf (pages 26-374)
  - Rowell 1880.pdf (pages 23-404)
  - Rowell 1882.pdf (pages 15-476)
  - Rowell 1883.pdf (pages 15-452)
  - Rowell 1884.pdf (pages 15-524)
  - Rowell 1885.pdf (pages 35-590)
  - Rowell 1890.pdf (pages 63-731)

[1/13] 
Processing: Rowell 1871.pdf
  Total pages in PDF: 593
  Extracting pages 21 to 251 (231 pages)
  Saved: data\Newspaper Directory text\Rowell 1871.txt
  ✓ Progress saved

[2/13] 
Processing: Rowell 1872.pdf
  Total pages in PDF: 689
  Extracting pages 15 to 290 (276 pages)
  Saved: data\Newspaper Directory text\Rowell 1872.txt
  ✓ Progress saved

[3/13] 
Processing: Rowell 1873.pdf


In [None]:
# 1869 data extraction

import re
import csv
from pathlib import Path
import time

def clean_text(text):
    """Clean OCR artifacts and normalize text."""
    replacements = {
        'Î': 'A', 'Î•': 'E', 'Îœ': 'M', 'Î': 'N', 'Ð¡': 'C', 'Ð¢': 'T',
        'Ã‰': 'E', 'Ñ': 'c', 'Ðµ': 'e', 'Ñ€': 'p', 'Ð': 'N',
        '`': "'", "'": "'", '"': '"', '"': '"',
        '\xad': '', '­': '',
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

def normalize_for_matching(text):
    """Normalize text for pattern matching - removes extra spaces."""
    return re.sub(r'\s+', ' ', text)

def normalize_editor_publisher_text(text):
    """
    Normalize text specifically for editor/publisher extraction.
    Handles OCR artifacts like hyphenated line breaks and missing spaces.
    """
    normalized = text
    
    # Remove hyphenated line breaks (e.g., "edi- tors" -> "editors", "pub- lisher" -> "publisher")
    normalized = re.sub(r'-\s+', '', normalized)
    
    # Fix common OCR run-together patterns
    normalized = re.sub(r'(editors?)(and)(pub)', r'\1 \2 \3', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(editors?)(and)(prop)', r'\1 \2 \3', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(publishers?)(and)(prop)', r'\1 \2 \3', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(publishers?)(and)', r'\1 \2', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(proprietors?)(and)', r'\1 \2', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(and)(publishers?)', r'\1 \2', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(and)(proprietors?)', r'\1 \2', normalized, flags=re.IGNORECASE)
    
    # Normalize multiple spaces
    normalized = re.sub(r'\s+', ' ', normalized)
    
    return normalized

def extract_circulation(text):
    """Extract circulation number from text."""
    patterns = [
        r'circulation[:\s]+(?:about\s+)?(\d[\d,\.]+)',
        r'claims?\s+(?:about\s+)?(\d[\d,\.]+)\s+circulation',
        r'circ(?:ulation|\'?l?n)[:\s\.]+(?:about\s+)?(\d[\d,\.]+)',
        r'(\d[\d,\.]+)\s+circ(?:ulation|\'?l?n)',
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).replace(',', '').replace('.', '')
    return ''

def extract_political_affiliation(text):
    """Extract political affiliation from text."""
    affiliations = ['democratic', 'republican', 'independent', 'neutral',
                    'whig', 'conservative', 'liberal', 'radical']
    text_lower = text.lower()
    for affiliation in affiliations:
        if re.search(rf';\s*{affiliation}\b', text_lower):
            return affiliation.capitalize()
    return ''

def extract_subscription_details(text):
    """Extract detailed subscription info."""
    daily_match = re.search(r'subscription[-\s]+daily\s+\$(\d+(?:\s+\d{2})?)', text, re.IGNORECASE)
    if daily_match:
        return f"${daily_match.group(1).replace(' ', '.')}"
    
    weekly_match = re.search(r'(?:subscription[-\s]+)?weekly\s+\$(\d+(?:\s+\d{2})?)', text, re.IGNORECASE)
    if weekly_match:
        return f"${weekly_match.group(1).replace(' ', '.')}"
    
    std_match = re.search(r'subscription[:\s]+\$(\d+(?:\s+\d{2})?)', text, re.IGNORECASE)
    if std_match:
        return f"${std_match.group(1).replace(' ', '.')}"
    
    cents_match = re.search(r'subscription\s+(\d+)\s+cents', text, re.IGNORECASE)
    if cents_match:
        return f"${int(cents_match.group(1))/100:.2f}"
    
    return ''

def extract_frequency(text):
    """Extract publication frequency from text."""
    text_lower = text.lower()
    
    if re.search(r'every\s+(?:morning|evening|day)', text_lower):
        return 'Daily & Weekly' if 'and weekly' in text_lower or 'weekly,' in text_lower else 'Daily'
    if re.search(r'tri-?weekly', text_lower):
        return 'Tri-weekly & Weekly' if 'and weekly' in text_lower else 'Tri-weekly'
    if re.search(r'semi-?weekly', text_lower):
        return 'Semi-weekly & Weekly' if 'and weekly' in text_lower else 'Semi-weekly'
    if re.search(r'semi-?monthly', text_lower):
        return 'Semi-monthly'
    if 'quarterly' in text_lower:
        return 'Quarterly'
    if 'monthly' in text_lower:
        return 'Monthly'
    
    days = ['sundays', 'mondays', 'tuesdays', 'wednesdays', 'thursdays', 'fridays', 'saturdays']
    for day in days:
        if day in text_lower:
            return 'Weekly'
    return ''

def extract_established(text):
    """Extract establishment year from text."""
    patterns = [
        r'establish[e]?d\s+(\d{4})',
        r'estab[-\s]*lished\s+(\d{4})',
        r'es[-\s]*tablished\s+(\d{4})',
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            year = match.group(1)
            if 1700 <= int(year) <= 1900:
                return year
    return ''

def clean_name(name):
    """Clean and validate an extracted name."""
    if not name:
        return None
    
    name = name.strip().strip(',;:.')
    
    if len(name) < 3:
        return None
    
    false_positives = ['Four', 'Eight', 'The', 'And', 'Weekly', 'Daily', 'Semi', 
                       'Tri', 'Monthly', 'Sunday', 'Saturday', 'Friday', 'Thursday',
                       'Wednesday', 'Tuesday', 'Monday', 'About', 'Claims', 'Size',
                       'Subscription', 'Established', 'Circulation', 'Pages',
                       'Democratic', 'Republican', 'Independent', 'Neutral',
                       'Temperance', 'Association']
    if name in false_positives:
        return None
    
    if name.replace(',', '').replace('.', '').isdigit():
        return None
    
    if re.search(r'\b(editor|publisher|proprietor|and)\s*$', name, re.IGNORECASE):
        return None
    
    if name[0].islower():
        return None
    
    return name

def add_name_if_unique(name, name_list):
    """Add a name to the list if it's not a duplicate."""
    cleaned = clean_name(name)
    if not cleaned:
        return False
    
    cleaned_lower = cleaned.lower()
    
    for existing in name_list:
        if cleaned_lower == existing.lower() or cleaned_lower in existing.lower():
            return False
    
    to_remove = [e for e in name_list if e.lower() in cleaned_lower]
    for item in to_remove:
        name_list.remove(item)
    
    name_list.append(cleaned)
    return True

def extract_editor_publisher(text):
    """Extract editor and publisher names from text."""
    editors, publishers = [], []
    
    normalized = normalize_editor_publisher_text(text)
    normalized = normalize_for_matching(normalized)
    
    normalized = re.sub(r'(\w)(and)(\w)', r'\1 \2 \3', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(editor[s]?)(and)', r'\1 \2', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(and)(pub)', r'\1 \2', normalized, flags=re.IGNORECASE)
    normalized = re.sub(r'(and)(prop)', r'\1 \2', normalized, flags=re.IGNORECASE)
    
    # Split by semicolons to process each segment separately
    segments = re.split(r';', normalized)
    
    for segment in segments:
        segment = segment.strip()
        if not segment:
            continue
        
        name_pattern = r'([A-Z][A-Za-z\.\s&,]+?)'
        
        combined_patterns = [
            re.compile(name_pattern + r',?\s+editors?\s+and\s+publishers?', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+editors?\s+and\s+proprietors?', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+editors?andpublishers?', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+editors?andproprietors?', re.IGNORECASE),
        ]
        
        editor_patterns = [
            re.compile(name_pattern + r',?\s+editors?\s*$', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+editors?\s*,', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+editor-in-chief', re.IGNORECASE),
        ]
        
        publisher_patterns = [
            re.compile(name_pattern + r',?\s+publishers?\s*$', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+proprietors?\s*$', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+publishers?\s+and\s+proprietors?', re.IGNORECASE),
            re.compile(name_pattern + r',?\s+publishersandproprietors?', re.IGNORECASE),
        ]
        
        matched_combined = False
        for pattern in combined_patterns:
            match = pattern.search(segment)
            if match:
                add_name_if_unique(match.group(1), editors)
                add_name_if_unique(match.group(1), publishers)
                matched_combined = True
                break
        
        if matched_combined:
            continue
        
        for pattern in editor_patterns:
            match = pattern.search(segment)
            if match:
                add_name_if_unique(match.group(1), editors)
                break
        
        for pattern in publisher_patterns:
            match = pattern.search(segment)
            if match:
                add_name_if_unique(match.group(1), publishers)
                break
    
    return {'editor': '; '.join(editors), 'publisher': '; '.join(publishers)}

def is_state_header(line):
    """Check if a line is a state header (e.g., 'ALABAMA.' or 'ALABAMA .')"""
    # List of valid US states and territories for the period
    valid_states = [
        'ALABAMA', 'ARKANSAS', 'ARIZONA', 'CALIFORNIA', 'COLORADO', 
        'CONNECTICUT', 'DAKOTA', 'DELAWARE', 'DISTRICT OF COLUMBIA',
        'FLORIDA', 'GEORGIA', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA',
        'KANSAS', 'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND',
        'MASSACHUSETTS', 'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI',
        'MISSOURI', 'MONTANA', 'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE',
        'NEW JERSEY', 'NEW MEXICO', 'NEW YORK', 'NORTH CAROLINA',
        'OHIO', 'OREGON', 'PENNSYLVANIA', 'RHODE ISLAND',
        'SOUTH CAROLINA', 'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT',
        'VIRGINIA', 'WASHINGTON', 'WEST VIRGINIA', 'WISCONSIN', 'WYOMING',
        # Territories and other regions
        'INDIAN TERRITORY', 'DOMINION OF CANADA', 'BRITISH COLONIES',
        'CANADA', 'NEWFOUNDLAND', 'NOVA SCOTIA', 'NEW BRUNSWICK',
        'PRINCE EDWARD ISLAND', 'MANITOBA', 'ONTARIO', 'QUEBEC',
        'BRITISH COLUMBIA'
    ]
    
    # Clean the line: remove periods, extra spaces, and strip
    # This handles both "ALABAMA." and "ALABAMA ." patterns
    cleaned = re.sub(r'\s*\.\s*', '', line).strip().upper()
    
    # Check if it matches a state name
    if cleaned in valid_states:
        return cleaned
    
    return None

def is_valid_town_name(town):
    """Check if the extracted town name is a valid town (not an index/header entry)."""
    invalid_patterns = [
        r'^A\s+LIST', r'DOMINION', r'CANADA', r'BRITISH', r'COLONIES',
        r'UNITED\s+STATES', r'TERRITORIES', r'^INDEX', r'^PAGE\s*\d*',
        r'NEWSPAPERS?', r'PERIODICALS?', r'ALPHABETICALLY', r'ARRANGED',
        r'GIVING\s+NAME', r'DAYS\s+OF\s+ISSUE', r'SUBSCRIPTION\s+PRICE',
        r'EDITOR.?S?\s+AND\s+PUBLISHER', r'CIRCULATION', r'ADVERTISEMENTS?',
        r'PRINTING\s+MATERIAL', r'IN\s+WHICH', r'ARE\s+PUBLISHED',
        r'^NOTE', r'^\d+$', r'^THE\s+', r'ALIST\s+OF',
    ]
    
    town_upper = town.upper().strip()
    for pattern in invalid_patterns:
        if re.search(pattern, town_upper):
            return False
    
    if len(town) > 50 or len(town.split()) > 4:
        return False
    
    return True

def is_valid_entry_text(entry_text):
    """Check if the entry text looks like a valid newspaper entry."""
    invalid_patterns = [
        r'ALPHABETICALLY\s+BY\s+TOWNS', r'DAYS\s+OF\s+ISSUE',
        r'POLITICS\s+OR\s+GENERAL\s+CHARACTER', r'DATE\s+OF\s+ESTABLISHMENT',
        r'EDITOR.?S?\s+AND\s+PUBLISHER.?S?\s+NAMES', r'GIV-?\s*ING\s+NAME',
        r'DOMINION\s+OF\s+CANADA', r'BRITISH\s+COLONIES',
        r'UNITED\s+STATES\s+AND\s+TERRITORIES',
        r'A\s*LIST\s+OF\s+THE\s+NEWSPAPERS',
    ]
    
    text_upper = entry_text.upper()
    for pattern in invalid_patterns:
        if re.search(pattern, text_upper):
            return False
    return True

def parse_newspaper_entries(text):
    """Parse the text into individual newspaper entries with state tracking."""
    text = clean_text(text)
    lines = text.split('\n')
    entries = []
    current_entry = []
    current_town = None
    current_state = None
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Check if this line is a state header
        state_match = is_state_header(line)
        if state_match:
            # Update state if it's different from current (handles repeated page headers)
            if state_match != current_state:
                # Save any pending entry before changing state
                if current_entry and current_town:
                    entries.append((current_state, current_town, ' '.join(current_entry)))
                current_state = state_match
                current_entry = []
                current_town = None
            # Skip the state header line either way
            continue
        
        new_entry_match = re.match(
            r'^([A-Z][A-Z\s,\.]+?)(?:,\s*|\s+)([A-Z][a-zA-Z\s&\'\.\-]+?)\s*[;:]',
            line
        )
        
        if new_entry_match:
            if current_entry and current_town:
                entries.append((current_state, current_town, ' '.join(current_entry)))
            current_town = new_entry_match.group(1).strip().rstrip(',')
            current_entry = [line]
        elif current_entry:
            current_entry.append(line)
    
    if current_entry and current_town:
        entries.append((current_state, current_town, ' '.join(current_entry)))
    
    return entries

def parse_entry_details(state, town, entry_text):
    """Extract structured data from a single entry."""
    result = {
        'state': state if state else '',
        'town': town.strip().title(),
        'newspaper_name': '',
        'frequency': extract_frequency(entry_text),
        'political_affiliation': extract_political_affiliation(entry_text),
        'subscription_price': extract_subscription_details(entry_text),
        'established': extract_established(entry_text),
        'circulation': extract_circulation(entry_text),
        'raw_text': entry_text[:300] + '...' if len(entry_text) > 300 else entry_text
    }
    
    name_match = re.match(
        rf'^{re.escape(town)}[,\s]+([A-Za-z][A-Za-z\s&\'\.\-,]+?)\s*[;:]',
        entry_text, re.IGNORECASE
    )
    if name_match:
        result['newspaper_name'] = name_match.group(1).strip().rstrip(',;:')
    
    people = extract_editor_publisher(entry_text)
    result['editor'] = people['editor']
    result['publisher'] = people['publisher']
    
    return result

def process_file(input_path, output_path=None):
    """Process the input file and write results to CSV."""
    start_time = time.time()
    
    print(f"Reading file: {input_path}")
    with open(input_path, 'r', encoding='utf-8', errors='replace') as f:
        text = f.read()
    
    print("Parsing entries...")
    raw_entries = parse_newspaper_entries(text)
    total_entries = len(raw_entries)
    print(f"Found {total_entries} raw entries")
    
    results = []
    
    for i, (state, town, entry_text) in enumerate(raw_entries):
        if (i + 1) % 100 == 0 or i == total_entries - 1:
            elapsed = time.time() - start_time
            pct = (i + 1) / total_entries * 100
            print(f"Processing: {i + 1}/{total_entries} ({pct:.1f}%) - {elapsed:.1f}s elapsed")
        
        if not is_valid_town_name(town):
            continue
        if not is_valid_entry_text(entry_text):
            continue
        
        details = parse_entry_details(state, town, entry_text)
        if details['newspaper_name'] and len(details['newspaper_name']) > 1:
            results.append(details)
    
    if output_path is None:
        output_path = Path(input_path).stem + '_extracted.csv'
    
    fieldnames = ['state', 'town', 'newspaper_name', 'frequency', 'political_affiliation', 
                  'subscription_price', 'established', 'editor', 'publisher',
                  'circulation', 'raw_text']
    
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    
    elapsed = time.time() - start_time
    print(f"\n{'='*50}")
    print(f"Completed in {elapsed:.1f} seconds")
    print(f"Processed {len(results)} valid entries")
    print(f"Output written to: {output_path}")
    print(f"{'='*50}")
    print(f"Entries with state: {sum(1 for r in results if r['state'])}")
    print(f"Entries with frequency: {sum(1 for r in results if r['frequency'])}")
    print(f"Entries with political affiliation: {sum(1 for r in results if r['political_affiliation'])}")
    print(f"Entries with subscription price: {sum(1 for r in results if r['subscription_price'])}")
    print(f"Entries with established date: {sum(1 for r in results if r['established'])}")
    print(f"Entries with editor: {sum(1 for r in results if r['editor'])}")
    print(f"Entries with publisher: {sum(1 for r in results if r['publisher'])}")
    print(f"Entries with circulation: {sum(1 for r in results if r['circulation'])}")
    
    return results

# USAGE

import os
for file in os.listdir("data/Newspaper Directory Text/")[:1]:
    input = "data/Newspaper Directory Text/" + file
    output = "data/Newspaper Directory Excel/" + file[:-3] + 'csv'
    results = process_file(input, output)

Reading file: data/Newspaper Directory Text/Rowell 1869.txt
Parsing entries...
Found 3079 raw entries
Processing: 100/3079 (3.2%) - 0.4s elapsed
Processing: 200/3079 (6.5%) - 0.6s elapsed
Processing: 300/3079 (9.7%) - 0.9s elapsed
Processing: 400/3079 (13.0%) - 1.1s elapsed
Processing: 500/3079 (16.2%) - 1.3s elapsed
Processing: 600/3079 (19.5%) - 1.5s elapsed
Processing: 700/3079 (22.7%) - 1.7s elapsed
Processing: 800/3079 (26.0%) - 1.8s elapsed
Processing: 900/3079 (29.2%) - 1.9s elapsed
Processing: 1000/3079 (32.5%) - 2.1s elapsed
Processing: 1100/3079 (35.7%) - 2.2s elapsed
Processing: 1200/3079 (39.0%) - 2.4s elapsed
Processing: 1300/3079 (42.2%) - 2.5s elapsed
Processing: 1400/3079 (45.5%) - 2.7s elapsed
Processing: 1500/3079 (48.7%) - 2.8s elapsed
Processing: 1600/3079 (52.0%) - 3.0s elapsed
Processing: 1700/3079 (55.2%) - 3.2s elapsed
Processing: 1800/3079 (58.5%) - 3.4s elapsed
Processing: 1900/3079 (61.7%) - 3.6s elapsed
Processing: 2000/3079 (65.0%) - 3.8s elapsed
Processing

In [7]:
# 1871 - 1876 extraction
import csv
import re
from typing import Tuple

# Known US states and territories from that era
STATES = {
    "ALABAMA", "ARKANSAS", "ARIZONA", "CALIFORNIA", "COLORADO", "CONNECTICUT",
    "DELAWARE", "DISTRICT OF COLUMBIA", "FLORIDA", "GEORGIA", "IDAHO", "ILLINOIS",
    "INDIANA", "IOWA", "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND",
    "MASSACHUSETTS", "MICHIGAN", "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA",
    "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY", "NEW MEXICO", "NEW YORK",
    "NORTH CAROLINA", "OHIO", "OREGON", "PENNSYLVANIA", "RHODE ISLAND",
    "SOUTH CAROLINA", "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA",
    "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING",
    "INDIAN TERRITORY", "DAKOTA", "DOMINION OF CANADA", "BRITISH COLONIES"
}


def extract_frequency(text: str) -> str:
    """Extract publication frequency."""
    freq_map = [
        (r'every\s*morning', 'Daily'),
        (r'every\s*evening', 'Daily'),
        (r'every\s*afternoon', 'Daily'),
        (r'every\s*day', 'Daily'),
        (r'semi-weekly', 'Semi-weekly'), (r'tri-weekly', 'Tri-weekly'),
        (r'bi-weekly', 'Bi-weekly'), (r'bi-monthly', 'Bi-monthly'),
        (r'semi-month', 'Semi-monthly'),
        (r'sundays?', 'Sundays'), (r'mondays?', 'Mondays'), (r'tuesdays?', 'Tuesdays'),
        (r'wednesdays?', 'Wednesdays'), (r'thursdays?', 'Thursdays'),
        (r'fridays?', 'Fridays'), (r'saturdays?', 'Saturdays'),
        (r'\bdaily\b', 'Daily'), (r'\bweekly\b', 'Weekly'),
        (r'\bmonthly\b', 'Monthly'), (r'\bquarterly\b', 'Quarterly'),
    ]
    text_lower = text.lower()
    for pattern, freq in freq_map:
        if re.search(pattern, text_lower):
            return freq
    return ""


def extract_political(text: str) -> str:
    """Extract political affiliation."""
    affil_map = [
        (r'\bdemocrat', 'Democratic'), (r'\brepublican', 'Republican'),
        (r'\bindependent', 'Independent'), (r'\bneutral', 'Neutral'),
        (r'\bliberal', 'Liberal'), (r'\bconservative', 'Conservative'),
        (r'\bgreenback', 'Greenback'), (r'\bprohibition', 'Prohibition'),
        (r'\bbaptist', 'Baptist'), (r'\bcongregational', 'Congregational'),
        (r'\bmethodist', 'Methodist'), (r'\buniversalist', 'Universalist'),
        (r'\breligious', 'Religious'), (r'\bagricultural', 'Agricultural'),
        (r'\bliterary', 'Literary'), (r'\bgerman', 'German'),
        (r'\bcomic', 'Comic'),
    ]
    text_lower = text.lower()
    for pattern, affil in affil_map:
        if re.search(pattern, text_lower):
            return affil
    return ""


def extract_price(text: str) -> str:
    """Extract subscription price."""
    patterns = [
        r'subscription\s*\$\s*(\d+)\s+(\d+)',
        r'subscription\s*\$\s*(\d+\.\d+)',
        r'subscription\s*\$\s*(\d+)',
    ]
    for pattern in patterns:
        match = re.search(pattern, text.lower())
        if match:
            groups = match.groups()
            if len(groups) == 2:
                return f"${groups[0]}.{groups[1]}"
            return f"${groups[0]}"
    return ""


def extract_established(text: str) -> str:
    """Extract year established."""
    patterns = [
        r'estab-?\s*lished\s*(\d{4})',
        r'established\s*(\d{4})',
        r're-established\s*(\d{4})',
    ]
    for pattern in patterns:
        match = re.search(pattern, text.lower())
        if match:
            return match.group(1)
    return ""


def extract_editor_publisher(text: str) -> Tuple[str, str]:
    """Extract editor and publisher names."""
    editor = ""
    publisher = ""
    
    # Pattern for "X, editor(s) and publisher(s)" (same person/people)
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+?),?\s*ed[-\s]*i[-\s]*t[-\s]*o[-\s]*r[-\s]*s?\s*and\s+pub[-\s]*l[-\s]*i[-\s]*s[-\s]*h[-\s]*e[-\s]*r[-\s]*s?', text)
    if match:
        name = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
        return name, name
    
    # Pattern for "X, editor(s) and proprietor(s)" (same person/people)
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+?),?\s*ed[-\s]*i[-\s]*t[-\s]*o[-\s]*r[-\s]*s?\s*and\s+pro[-\s]*p[-\s]*r[-\s]*i[-\s]*e[-\s]*t[-\s]*o[-\s]*r[-\s]*s?', text)
    if match:
        name = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
        return name, name
    
    # Pattern for editor
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+?),?\s*ed[-\s]*i[-\s]*t[-\s]*o[-\s]*r[-\s]*s?[;,:\s]', text)
    if match:
        editor = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
    
    # Pattern for publisher
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+(?:\s+Co\.)?),?\s*pub[-\s]*l[-\s]*i[-\s]*s[-\s]*h[-\s]*e[-\s]*r[-\s]*s?[;,:\s\.]', text)
    if match:
        publisher = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
    
    # Pattern for proprietor (extracted as publisher)
    if not publisher:
        match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+(?:\s+Co\.)?),?\s*pro[-\s]*p[-\s]*r[-\s]*i[-\s]*e[-\s]*t[-\s]*o[-\s]*r[-\s]*s?[;,:\s\.]', text)
        if match:
            publisher = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
    
    return editor, publisher


def extract_circulation(text: str) -> str:
    """Extract circulation number."""
    # Normalize line-break hyphens first
    text = re.sub(r'-\s+', '', text)
    
    # Pattern explanation:
    # - circ?(?:ulation|'n|e'n) : matches circulation, crculation, circ'n, cire'n
    # - [\s\-]* : optional whitespace or hyphens after the word
    # - (?:about|approximately|nearly|over|around)? : optional qualifier words
    # - [\s\-]* : optional whitespace or hyphens
    # - (?:\w+\s+)? : optional word like "daily" before the number
    # - (\d[\d,]*) : the circulation number
    pattern = r"circ?(?:ulation|'n|e'n)[\s\-]*(?:about|approximately|nearly|over|around)?[\s\-]*(?:\w+\s+)?(\d[\d,]*)"
    
    # Also check for "claims XXX" pattern
    claims_pattern = r"claims[\s\-]+(\d[\d,]*)"
    
    match = re.search(pattern, text.lower())
    if not match:
        match = re.search(claims_pattern, text.lower())
    
    if match:
        circ = match.group(1).replace(',', '')
        if 'estimated' in text.lower() or "est'd" in text.lower():
            return f"{circ} (estimated)"
        return circ
    return ""


def process_file(input_file, output_file):
    """Process a newspaper directory file and extract entries to CSV."""
    
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Remove page markers
    content = re.sub(r'---\s*Page\s+\d+\s*---', ' ', content)
    
    # Fix common OCR Greek letter substitutions (uppercase)
    content = content.replace('Α', 'A')  # Greek Alpha -> A
    content = content.replace('Β', 'B')  # Greek Beta -> B
    content = content.replace('Ε', 'E')  # Greek Epsilon -> E
    content = content.replace('Η', 'H')  # Greek Eta -> H
    content = content.replace('Ι', 'I')  # Greek Iota -> I
    content = content.replace('Κ', 'K')  # Greek Kappa -> K
    content = content.replace('Μ', 'M')  # Greek Mu -> M
    content = content.replace('Ν', 'N')  # Greek Nu -> N
    content = content.replace('Ο', 'O')  # Greek Omicron -> O
    content = content.replace('Ρ', 'P')  # Greek Rho -> P
    content = content.replace('Τ', 'T')  # Greek Tau -> T
    content = content.replace('Χ', 'X')  # Greek Chi -> X
    content = content.replace('Ζ', 'Z')  # Greek Zeta -> Z
    content = content.replace('Θ', 'O')  # Greek Theta -> O (visually similar)
    content = content.replace('Φ', 'O')  # Greek Phi -> O (visually similar)
    
    # Fix common OCR Cyrillic letter substitutions
    content = content.replace('С', 'C')  # Cyrillic Es -> C
    content = content.replace('О', 'O')  # Cyrillic O -> O
    content = content.replace('Р', 'P')  # Cyrillic Er -> P
    content = content.replace('Ф', 'O')  # Cyrillic Ef -> O
    content = content.replace('А', 'A')  # Cyrillic A -> A
    content = content.replace('Е', 'E')  # Cyrillic Ie -> E
    content = content.replace('Н', 'H')  # Cyrillic En -> H
    content = content.replace('В', 'B')  # Cyrillic Ve -> B
    content = content.replace('К', 'K')  # Cyrillic Ka -> K
    content = content.replace('М', 'M')  # Cyrillic Em -> M
    content = content.replace('Т', 'T')  # Cyrillic Te -> T
    
    # Fix lowercase Greek/Cyrillic
    content = content.replace('ο', 'o')  # Greek lowercase omicron -> o
    content = content.replace('а', 'a')  # Cyrillic lowercase a -> a
    content = content.replace('е', 'e')  # Cyrillic lowercase ie -> e
    content = content.replace('о', 'o')  # Cyrillic lowercase o -> o
    content = content.replace('р', 'p')  # Cyrillic lowercase er -> p
    content = content.replace('с', 'c')  # Cyrillic lowercase es -> c
    
    # Fix OCR diacritical errors
    content = content.replace('Ü', 'U')
    content = content.replace('Ö', 'O')
    content = content.replace('Ä', 'A')
    content = content.replace('É', 'E')
    content = content.replace('È', 'E')
    content = content.replace('Ñ', 'N')
    content = content.replace('Ç', 'C')
    
    # Remove OCR artifacts: sequences of O-like characters before town names
    # Matches patterns like "OOO ", "OOD ", "COO ", "CODO ", etc.
    content = re.sub(r'\b[OoCcDd0ΘΦ]{2,}\s+([A-Z]{2,})', r'\1', content)
    
    # Fix missing space between ALL CAPS town and Capitalized newspaper name
    # e.g., "VAN BURENArgus" -> "VAN BUREN Argus"
    content = re.sub(r'([A-Z]{4})([A-Z][a-z])', r'\1 \2', content)
    
    # Normalize whitespace
    text = ' '.join(content.split())
    
    results = []
    
    # Build state position index
    state_positions = []
    
    for state in STATES:
        # Pattern allows optional space before period: "ARKANSAS ." or "ARKANSAS."
        pattern = re.compile(r'\b' + re.escape(state) + r'\s*\.', re.IGNORECASE)
        for m in pattern.finditer(text):
            state_positions.append((m.start(), state))
    
    state_positions.sort()
    
    # Remove duplicate state entries at same/nearby positions
    filtered_positions = []
    for pos, state in state_positions:
        if not filtered_positions or pos - filtered_positions[-1][0] > 10:
            filtered_positions.append((pos, state))
    state_positions = filtered_positions
    
    # Main pattern for newspaper entries
    pattern = re.compile(
        r'\b'
        r'([A-Z][A-Z\'\-]+(?:\s+[A-Z][A-Z\'\-]+)*)'  # Group 1: Town (ALL CAPS words)
        r'\s*[,.\s]\s*'                              # Separator
        r'([A-Z][a-z][^;:†]*?)'                      # Group 2: Newspaper name
        r'\s*[;:†]'                                  # Delimiter
    )
    
    matches = list(pattern.finditer(text))
    
    # First pass: identify valid entries
    valid_matches = []
    for match in matches:
        pos = match.start()
        
        # Determine current state based on position
        match_state = None
        for sp, st in reversed(state_positions):
            if sp < pos:
                match_state = st
                break
        
        if not match_state:
            continue
        
        town = match.group(1).strip().rstrip(' ,.')
        newspaper = match.group(2).strip().rstrip(' ,.')
        
        # Skip index/header content
        if any(kw in newspaper.lower() for kw in ['list of', 'index', 'page']):
            continue
            
        # Skip if newspaper contains what looks like a page header
        if re.search(r'\b\d+\s+[A-Z]{4,}\.', newspaper):
            continue
        
        if len(town) >= 2 and len(newspaper) >= 2:
            valid_matches.append((match, match_state, town, newspaper))
    
    # Second pass: build results
    for i, (match, match_state, town, newspaper) in enumerate(valid_matches):
        if i + 1 < len(valid_matches):
            raw_text = text[match.start():valid_matches[i + 1][0].start()].strip()
        else:
            raw_text = text[match.start():].strip()
        
        # Extract additional fields from raw_text
        frequency = extract_frequency(raw_text)
        political = extract_political(raw_text)
        editor, publisher = extract_editor_publisher(raw_text)
        circulation = extract_circulation(raw_text)
        
        results.append({
            'state': match_state,
            'town': town.title(),
            'newspaper': newspaper,
            'frequency': frequency,
            'political': political,
            'editor': editor,
            'publisher': publisher,
            'circulation': circulation,
            'raw_text': raw_text
        })
    
    # Deduplicate
    seen = set()
    unique = []
    for r in results:
        key = (r['state'], r['town'], r['newspaper'])
        if key not in seen:
            seen.add(key)
            unique.append(r)
    
    # Remove known false positive entries from document header
    false_positives = {
        ("NEW YORK", "York", "January 1, 1869. ee ~~ CONTENTS"),
        ("NEW YORK", "Xiv", "Newspaper Directory Advertiser. XV. A circular to Advertisers, containing the names of more than one thousand newspapers, among which will be found the best advertising mediums in America"),
    }
    unique = [r for r in unique if (r['state'], r['town'], r['newspaper']) not in false_positives]
    
    print(f"{input_file}: {len(unique)} entries found")
    print(f"  States detected at positions: {state_positions[:10]}...")  # Debug
    
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['state', 'town', 'newspaper', 'frequency', 'political', 'editor', 'publisher', 'circulation', 'raw_text'])
        writer.writeheader()
        writer.writerows(unique)
    
    return unique

import os
for file in os.listdir("data/Newspaper Directory Text/")[1:5]:
    input_file = "data/Newspaper Directory Text/" + file
    output_file = "data/Newspaper Directory Excel/" + file[:-3] + 'csv'
    results = process_file(input_file, output_file)

data/Newspaper Directory Text/Rowell 1871.txt: 5878 entries found
  States detected at positions: [(1645, 'ALABAMA'), (6202, 'ALABAMA'), (6936, 'ALABAMA'), (10647, 'ALABAMA'), (14436, 'ALABAMA'), (16217, 'ARKANSAS'), (17901, 'ARKANSAS'), (21657, 'ARKANSAS'), (25257, 'CALIFORNIA'), (28971, 'CALIFORNIA')]...
data/Newspaper Directory Text/Rowell 1872.txt: 6241 entries found
  States detected at positions: [(3031, 'ALABAMA'), (5793, 'ALABAMA'), (8365, 'ALABAMA'), (9568, 'ALABAMA'), (9807, 'ALABAMA'), (15944, 'ALABAMA'), (17629, 'ARKANSAS'), (19444, 'ARKANSAS'), (23221, 'ARKANSAS'), (26938, 'ARKANSAS')]...
data/Newspaper Directory Text/Rowell 1873.txt: 6550 entries found
  States detected at positions: [(4234, 'ALABAMA'), (6070, 'ALABAMA'), (8785, 'ALABAMA'), (9804, 'ALABAMA'), (16230, 'ALABAMA'), (18164, 'ARKANSAS'), (21031, 'ARKANSAS'), (23281, 'ARKANSAS'), (23656, 'ARKANSAS'), (27325, 'ARKANSAS')]...
data/Newspaper Directory Text/Rowell 1876.txt: 7825 entries found
  States detected at p

In [5]:
# post 1877 data extraction

import re
import csv
from typing import List, Dict, Tuple, Optional

# US states and territories (1877 era)
US_STATES = {
    'ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 'COLORADO',
    'CONNECTICUT', 'DELAWARE', 'DISTRICT OF COLUMBIA', 'FLORIDA', 'GEORGIA',
    'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 'KENTUCKY', 'LOUISIANA',
    'MAINE', 'MARYLAND', 'MASSACHUSETTS', 'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI',
    'MISSOURI', 'MONTANA', 'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY',
    'NEW MEXICO', 'NEW YORK', 'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO',
    'OKLAHOMA', 'OREGON', 'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA',
    'SOUTH DAKOTA', 'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 'VIRGINIA',
    'WASHINGTON', 'WEST VIRGINIA', 'WISCONSIN', 'WYOMING', 'DAKOTA',
    'INDIAN TERRITORY', 'MONTANA TERRITORY', 'NEW MEXICO TERRITORY',
    'UTAH TERRITORY', 'WASHINGTON TERRITORY', 'WYOMING TERRITORY',
    'ONTARIO', 'QUEBEC', 'NOVA SCOTIA', 'NEW BRUNSWICK', 'MANITOBA',
    'BRITISH COLUMBIA', 'PRINCE EDWARD ISLAND', 'NEWFOUNDLAND',
}


def normalize_state(state: str) -> str:
    """Normalize state names (handle OCR issues)."""
    replacements = {'Μ': 'M', 'Α': 'A', 'Ε': 'E', 'Ο': 'O', 'Ι': 'I', 
                    'Îœ': 'M', 'Î': 'A', 'œ': 'M', 'Ñ€Ð¾Ñ€': 'pop'}
    for old, new in replacements.items():
        state = state.replace(old, new)
    state = ''.join(c for c in state if ord(c) < 128 or c.isalpha())
    return state.strip().rstrip('. ').strip()


def is_state_header(line: str) -> bool:
    """Check if a line is a state header."""
    line = line.strip()
    if not line or len(line) < 4:
        return False
    cleaned = normalize_state(line)
    alpha_chars = [c for c in cleaned if c.isalpha()]
    if not alpha_chars or len(alpha_chars) < 4:
        return False
    if sum(1 for c in alpha_chars if c.isupper()) / len(alpha_chars) < 0.8:
        return False
    return cleaned in US_STATES


def is_town_line(line: str) -> bool:
    """Check if line starts a town entry."""
    line = line.strip()
    if not line:
        return False
    
    # Normalize "ST . " to "ST. " (OCR artifact)
    line = re.sub(r'^(ST|MT|FT|PT)\s+\.', r'\1.', line)
    
    if not re.match(r'^(?:(?:ST|MT|FT|PT)\s?\.| EL|LA|LE|DE|[A-Z]{3})', line):
        return False
    
    # Town: all caps, may include hyphens/periods, AND may have space-separated words
    town = r'[A-Z][A-Z\-\.]*(?:\s+[A-Z][A-Z\-\.]*)*'
    
    # County: Initial cap word(s) like "Mariposa" or "Contra Costa"
    county = r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*'
    
    patterns = [
        # === County seat patterns (C. H.) ===
        rf'^{town}\s?,\s*C\.\s*H\.\s*,',
        rf'^{town}\s?,\s*C\.H\.\s*,',
        rf'^{town}\s?,\s*C\.\s*H\.\s*,\s*{county}\s+Co\.?\s*[,\s]',
        rf'^{town}\s?,\s*C\.H\.\s*,\s*{county}\s+Co\.?\s*[,\s]',
        rf'^{town}\s?,\s*C\.\s*H\.\s*$',
        rf'^{town}\s?,\s*C\.H\.\s*$',
        rf'^{town}\s?,\s*c\.\s*h\.\s*,\s*{county}\s+Co\.?\s*[,\s]',
        
        # === Standard "County Co." patterns ===
        rf'^{town}\s?,\s*{county}\s+Co\.\s*,',
        rf'^{town}\s?,\s*{county}\s+Co\.\s*$',
        rf'^{town}\s?,\s*{county}\s+Co\.\s*;',
        rf'^{town}\s?,\s*{county}\s+Co\s*,',
        rf'^{town}\s?,\s*{county}\s+Co\s*$',
        rf'^{town}\s?,\s*{county}\s+Co\.?\s*,?\s*[\d,]+.*pop',
        
        # === OCR variation patterns ===
        rf'^{town}\s?,\s*{county}\s+Co\.\s+,',
        rf'^{town}\s?,\s*C\.\s*H\.\s*,\s*{county}\s+Co\.\s+,',
        rf'^{town}\s?,\s*{county}\s+Co\.\s*\d',
        rf'^{town}\s?,\s*{county}\s+Co,\.',
        rf'^{town}\s?,\s*C\.\s*H\.\s*,\s*{county}\s+Co,\.',
        rf'^{town}\s?,\s*{county}\s+Co\.\s*,\s*[a-z]',
    ]
    
    for pattern in patterns:
        if re.match(pattern, line):
            return True
    return False


def extract_town_name(line: str) -> str:
    """Extract town name from the start of a town line."""
    # Normalize "ST . " to "ST. " at start (OCR artifact)
    line = re.sub(r'^(ST|MT|FT|PT)\s+\.', r'\1.', line)

    match = re.match(r'^([A-Z][A-Z\s\-\.]+?)\s?,', line)
    return match.group(1).strip() if match else ""


def find_newspapers_in_block(block_text: str, town_name: str) -> List[Tuple[str, str]]:
    """Find all newspaper entries in a block of text."""
    newspapers = []
    
    # Frequency words
    freq_words = r'(?:every\s*(?:morning|evening|afternoon|day)|Sundays?|Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|daily|weekly|semi-weekly|tri-weekly|monthly|bi-monthly|semi-monthly|quarterly)'
    
    # Political/type affiliations
    political_words = r'(?:democratic|republican|independent|neutral|liberal|conservative|greenback|prohibition|baptist|congregational|methodist|universalist|religious|agricultural|literary|german|french|spanish|comic)'
    
    # Combined indicator pattern
    indicator_pattern = rf'(?:{freq_words}|{political_words})'
    
    # Find all potential newspaper names (ALL CAPS sequences)
    name_pattern = r'([A-Z][A-Z\s\-\&\.\']+[A-Z])\s*[;:]'
    
    valid_newspapers = []
    
    for name_match in re.finditer(name_pattern, block_text):
        name = name_match.group(1).strip()
        name_end = name_match.end()
        
        if name == town_name:
            continue
        
        clean_name = name.replace('.', '').replace(' ', '').replace('-', '').replace('&', '').replace("'", '')
        if len(clean_name) < 3:
            continue
        if name.strip('. ') in ['CO', 'RD', 'POP', 'THE', 'AND', 'FOR', 'N', 'S', 'E', 'W', 'C', 'H']:
            continue
        if name.strip('. ') in US_STATES:
            continue
        
        lookahead_text = block_text[name_end:name_end + 200]
        
        next_caps_match = re.search(r'[;:]\s*([A-Z][A-Z\s\-\&\.\']{2,}[A-Z])\s*[;:]', lookahead_text)
        if next_caps_match:
            lookahead_text = lookahead_text[:next_caps_match.start()]
        
        sections = re.split(r'\s*;\s*', lookahead_text)[:3]
        lookahead_limited = ' ; '.join(sections)
        
        if re.search(indicator_pattern, lookahead_limited, re.IGNORECASE):
            valid_newspapers.append((name_match.start(), name_end, name))
    
    for i, (name_start, name_end, name) in enumerate(valid_newspapers):
        if i + 1 < len(valid_newspapers):
            entry_end = valid_newspapers[i + 1][0]
        else:
            entry_end = len(block_text)
        
        entry_text = block_text[name_start:entry_end].strip()
        entry_text = re.sub(r'\s+[A-Z]{3,}[A-Z\s\-\.]*,\s*(?:C\.\s*H\.|[A-Z][a-z]+\s+Co).*$', '', entry_text, flags=re.DOTALL)
        
        newspapers.append((name, entry_text))
    
    suspended_pattern = r'([A-Z][A-Z\s\-\&\.\']+[A-Z])\s*[\.:]?\s*(?:††|â€\s*â€|‡‡|\.\s*â€)'
    for match in re.finditer(suspended_pattern, block_text):
        name = match.group(1).strip()
        clean_name = name.replace('.', '').replace(' ', '')
        if name == town_name or len(clean_name) < 3:
            continue
        if name.strip('. ') in US_STATES:
            continue
        if not any(n[0] == name for n in newspapers):
            newspapers.append((name, f"{name} †† (suspended publication)"))
    
    return newspapers


def extract_frequency(text: str) -> str:
    """Extract publication frequency."""
    freq_map = [
        (r'every\s*morning', 'Daily'),
        (r'every\s*evening', 'Daily'),
        (r'every\s*afternoon', 'Daily'),
        (r'every\s*day', 'Daily'),
        (r'semi-weekly', 'Semi-weekly'), (r'tri-weekly', 'Tri-weekly'),
        (r'bi-weekly', 'Bi-weekly'), (r'bi-monthly', 'Bi-monthly'),
        (r'semi-month', 'Semi-monthly'),
        (r'sundays?', 'Sundays'), (r'mondays?', 'Mondays'), (r'tuesdays?', 'Tuesdays'),
        (r'wednesdays?', 'Wednesdays'), (r'thursdays?', 'Thursdays'),
        (r'fridays?', 'Fridays'), (r'saturdays?', 'Saturdays'),
        (r'\bdaily\b', 'Daily'), (r'\bweekly\b', 'Weekly'),
        (r'\bmonthly\b', 'Monthly'), (r'\bquarterly\b', 'Quarterly'),
    ]
    text_lower = text.lower()
    for pattern, freq in freq_map:
        if re.search(pattern, text_lower):
            return freq
    return ""


def extract_political(text: str) -> str:
    """Extract political affiliation."""
    affil_map = [
        (r'\bdemocrat', 'Democratic'), (r'\brepublican', 'Republican'),
        (r'\bindependent', 'Independent'), (r'\bneutral', 'Neutral'),
        (r'\bliberal', 'Liberal'), (r'\bconservative', 'Conservative'),
        (r'\bgreenback', 'Greenback'), (r'\bprohibition', 'Prohibition'),
        (r'\bbaptist', 'Baptist'), (r'\bcongregational', 'Congregational'),
        (r'\bmethodist', 'Methodist'), (r'\buniversalist', 'Universalist'),
        (r'\breligious', 'Religious'), (r'\bagricultural', 'Agricultural'),
        (r'\bliterary', 'Literary'), (r'\bgerman', 'German'),
        (r'\bcomic', 'Comic'),
    ]
    text_lower = text.lower()
    for pattern, affil in affil_map:
        if re.search(pattern, text_lower):
            return affil
    return ""


def extract_price(text: str) -> str:
    """Extract subscription price."""
    patterns = [
        r'subscription\s*\$\s*(\d+)\s+(\d+)',
        r'subscription\s*\$\s*(\d+\.\d+)',
        r'subscription\s*\$\s*(\d+)',
    ]
    for pattern in patterns:
        match = re.search(pattern, text.lower())
        if match:
            groups = match.groups()
            if len(groups) == 2:
                return f"${groups[0]}.{groups[1]}"
            return f"${groups[0]}"
    return ""


def extract_established(text: str) -> str:
    """Extract year established."""
    patterns = [
        r'estab-?\s*lished\s*(\d{4})',
        r'established\s*(\d{4})',
        r're-established\s*(\d{4})',
    ]
    for pattern in patterns:
        match = re.search(pattern, text.lower())
        if match:
            return match.group(1)
    return ""


def extract_editor_publisher(text: str) -> Tuple[str, str]:
    """Extract editor and publisher names."""
    editor = ""
    publisher = ""
    
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+?),?\s*ed[-\s]*i[-\s]*t[-\s]*o[-\s]*r[-\s]*s?\s*and\s+pub[-\s]*l[-\s]*i[-\s]*s[-\s]*h[-\s]*e[-\s]*r[-\s]*s?', text)
    if match:
        name = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
        return name, name
    
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+?),?\s*ed[-\s]*i[-\s]*t[-\s]*o[-\s]*r[-\s]*s?\s*and\s+pro[-\s]*p[-\s]*r[-\s]*i[-\s]*e[-\s]*t[-\s]*o[-\s]*r[-\s]*s?', text)
    if match:
        name = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
        return name, name
    
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+?),?\s*ed[-\s]*i[-\s]*t[-\s]*o[-\s]*r[-\s]*s?[;,:\s]', text)
    if match:
        editor = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
    
    match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+(?:\s+Co\.)?),?\s*pub[-\s]*l[-\s]*i[-\s]*s[-\s]*h[-\s]*e[-\s]*r[-\s]*s?[;,:\s\.]', text)
    if match:
        publisher = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
    
    if not publisher:
        match = re.search(r'([A-Z][A-Za-z\.\s\&\-,]+(?:\s+Co\.)?),?\s*pro[-\s]*p[-\s]*r[-\s]*i[-\s]*e[-\s]*t[-\s]*o[-\s]*r[-\s]*s?[;,:\s\.]', text)
        if match:
            publisher = re.sub(r'\s+', ' ', match.group(1).strip().rstrip(',;'))
    
    return editor, publisher


def extract_circulation_early(text: str) -> str:
    """Extract circulation number for 1877-1878 (numeric values)."""
    # Normalize line-break hyphens first
    text = re.sub(r'-\s+', '', text)
    
    # Pattern for circulation with numeric values
    pattern = r"circ?(?:ulation|'n|e'n)[\s\-]*(?:about|approximately|nearly|over|around)?[\s\-]*(?:\w+\s+)?(\d[\d,]*)"
    
    # Also check for "claims XXX" pattern
    claims_pattern = r"claims[\s\-]+(\d[\d,]*)"
    
    match = re.search(pattern, text.lower())
    if not match:
        match = re.search(claims_pattern, text.lower())
    
    if match:
        circ = match.group(1).replace(',', '')
        if 'estimated' in text.lower() or "est'd" in text.lower():
            return f"{circ} (estimated)"
        return circ
    return ""


CIRCULATION_RATINGS = {
    "A1": 100000, "A": 75000,
    "B1": 50000, "B": 37500,
    "C1": 25000, "C": 22500,
    "D1": 20000, "D": 17500,
    "E1": 15000, "E": 12500,
    "F1": 10000, "F": 7500,
    "G1": 5000, "G": 4000,
    "H1": 3000, "H": 2500,
    "I1": 2000, "I": 1500,
    "J1": 1000, "J": 750,
    "K1": 500, "K": 250,
    "X1": None,
}


def extract_circulation_late(text: str) -> Optional[int]:
    """Extract circulation for 1879+ (letter rating codes)."""
    # First try letter codes (e.g., "circulation K 1", "circulationK", "circulation G")
    match = re.search(r"circ(?:ulation|'n)\s*([A-KX])\s*([1])?", text, re.IGNORECASE)
    if match:
        letter = match.group(1).upper()
        suffix = match.group(2) or ""
        code = letter + suffix
        return CIRCULATION_RATINGS.get(code)
    
    # Then try numeric values (e.g., "circulation 350", "circ'n 800")
    match = re.search(r"circ(?:ulation|'n)\s*(\d[\d,]*)", text.lower())
    if match:
        circ = match.group(1).replace(',', '')
        return int(circ)
    
    return None


def parse_newspaper_entry(name: str, text: str, state: str, town: str, year: int) -> Dict:
    """Parse a newspaper entry and return structured data."""
    editor, publisher = extract_editor_publisher(text)
    
    # Use different circulation extraction based on year
    if year <= 1878:
        circulation = extract_circulation_early(text)
    else:
        circulation = extract_circulation_late(text)
    
    return {
        'state': state,
        'town': town,
        'newspaper_name': name,
        'frequency': extract_frequency(text),
        'political_affiliation': extract_political(text),
        'subscription_price': extract_price(text),
        'established': extract_established(text),
        'editor': editor,
        'publisher': publisher,
        'circulation': circulation,
        'raw_text': re.sub(r'\s+', ' ', text).strip()
    }


def preprocess_text(content: str) -> str:
    """Preprocess text to handle formatting issues."""
    content = re.sub(r'---\s*Page\s*\d+\s*---', '\n<<PAGE_BREAK>>\n', content)
    
    removals = [
        r'^GEO\s*\.\s*P\.?\s*ROWELL.*$',
        r'^P\.\s*ROWELL\s*&.*$',
        r'^AMERICAN NEWSPAPER DIRECTORY\s*\.?\s*$',
        r'^LIBRARY\s*$', r'^UNIVERSITY\s+OF\s*$',
        r'^EXPLANATIONS\s*\.?\s*$', r'^POPULATION\s*\.?\s*$',
        r'^CIRCULATION\s*\.?\s*$', r'^ITALIC WORDS\s*\.?\s*$',
        r'^\d+\s*$', r"^CO'S\s*$", r'^GEO\s*\.\s*$',
    ]
    for pattern in removals:
        content = re.sub(pattern, '', content, flags=re.MULTILINE)
    
    for state in US_STATES:
        pattern = r'(<<PAGE_BREAK>>)\s*\n?\s*' + re.escape(state) + r'\s*\.?\s*$'
        content = re.sub(pattern, r'\1', content, flags=re.MULTILINE | re.IGNORECASE)
    
    content = re.sub(r'<<PAGE_BREAK>>', '\n', content)
    content = re.sub(r'A LIST,\s*ARRANGED ALPHABETICALLY.*?ETC\.', '', content, flags=re.DOTALL)
    return content


def extract_newspapers(content: str, year: int) -> List[Dict]:
    """Main extraction function."""
    content = preprocess_text(content)
    lines = content.split('\n')
    
    results = []
    current_state = ""
    i = 0
    
    while i < len(lines):
        line = lines[i].strip()
        
        if not line:
            i += 1
            continue
        
        if is_state_header(line):
            current_state = normalize_state(line)
            i += 1
            continue
        
        if current_state and is_town_line(line):
            town_name = extract_town_name(line)
            if not town_name:
                i += 1
                continue
            
            block_lines = [line]
            j = i + 1
            while j < len(lines):
                next_line = lines[j].strip()
                if not next_line:
                    block_lines.append('')
                    j += 1
                    continue
                
                if is_state_header(next_line):
                    normalized_next = normalize_state(next_line)
                    if normalized_next == current_state:
                        j += 1
                        continue
                    else:
                        break
                
                if is_town_line(next_line):
                    break
                    
                block_lines.append(next_line)
                j += 1
            
            block_text = ' '.join(block_lines)
            block_text = re.sub(r'-\s+', '', block_text)
            block_text = re.sub(r'\s+', ' ', block_text)
            
            newspapers = find_newspapers_in_block(block_text, town_name)
            
            for name, entry_text in newspapers:
                entry = parse_newspaper_entry(name, entry_text, current_state, town_name, year)
                results.append(entry)
            
            i = j
            continue
        
        i += 1
    
    return results


def print_state_counts(newspapers: List[Dict]):
    """Print the number of newspaper entries found for each state."""
    state_counts = {}
    for entry in newspapers:
        state = entry['state']
        state_counts[state] = state_counts.get(state, 0) + 1
    
    print("\n=== Newspaper Entries by State ===")
    for state in sorted(state_counts.keys()):
        print(f"  {state}: {state_counts[state]}")
    print(f"  {'─' * 30}")
    print(f"  TOTAL: {len(newspapers)}\n")


def main(input_file: str, output_file: str, year: int):
    """Process input file and write CSV output."""
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    
    newspapers = extract_newspapers(content, year)
    
    fieldnames = ['state', 'town', 'newspaper_name', 'frequency', 'political_affiliation',
                  'subscription_price', 'established', 'editor', 'publisher', 'circulation', 'raw_text']
    
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(newspapers)
    
    print(f"Extracted {len(newspapers)} newspaper entries to {output_file}")
    print_state_counts(newspapers)
    return newspapers


def output_comparison_file(input_file: str, output_file: str, comparison_file: str):
    """Create a comparison file with chars 15000-35000 of input and output."""
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        f.seek(40000)
        input_content = f.read(20000)
    
    with open(output_file, 'r', encoding='utf-8') as f:
        f.seek(37000)
        output_content = f.read(20000)
    
    with open(comparison_file, 'w', encoding='utf-8') as f:
        f.write("=== INPUT FILE (chars 15000-35000) ===\n\n")
        f.write(input_content)
        f.write("\n\n=== OUTPUT FILE (chars 15000-35000) ===\n\n")
        f.write(output_content)
    
    print(f"Comparison file written to {comparison_file}")


import glob
import os

if __name__ == "__main__":
    input_folder = 'data/Newspaper Directory Text'
    output_folder = 'data/Newspaper Directory Excel'
    
    # Find all files matching the pattern with v13 in the name
    pattern = os.path.join(input_folder, 'Rowell * - v13.txt')
    input_files = glob.glob(pattern)
    
    for input_file in input_files:
        # Extract the year from the filename
        filename = os.path.basename(input_file)
        match = re.search(r'Rowell (\d{4}) - v13\.txt', filename)
        
        if match:
            year = int(match.group(1))
            
            # Only process files from 1877 onwards
            if year < 1877:
                continue
            
            output_file = os.path.join(output_folder, f'Rowell {year}.csv')
            comparison_file = os.path.join(output_folder, f'Rowell {year}_comparison.txt')
            
            print(f"Processing: {input_file} (Year: {year})")
            print(f"  -> Output: {output_file}")
            print(f"  -> Comparison: {comparison_file}")
            
            main(input_file, output_file, year)
            output_comparison_file(input_file, output_file, comparison_file)
            
            print(f"  Done!\n")

    print(f"Processed {len(input_files)} files.")

Processing: data/Newspaper Directory Text\Rowell 1877 - v13.txt (Year: 1877)
  -> Output: data/Newspaper Directory Excel\Rowell 1877.csv
  -> Comparison: data/Newspaper Directory Excel\Rowell 1877_comparison.txt
Extracted 7274 newspaper entries to data/Newspaper Directory Excel\Rowell 1877.csv

=== Newspaper Entries by State ===
  ALABAMA: 81
  ARIZONA: 3
  ARKANSAS: 60
  CALIFORNIA: 218
  COLORADO: 41
  CONNECTICUT: 100
  DAKOTA: 15
  DELAWARE: 39
  DISTRICT OF COLUMBIA: 25
  FLORIDA: 26
  GEORGIA: 123
  IDAHO: 9
  ILLINOIS: 649
  INDIANA: 360
  IOWA: 330
  KANSAS: 159
  KENTUCKY: 119
  LOUISIANA: 82
  MAINE: 79
  MARYLAND: 112
  MASSACHUSETTS: 289
  MICHIGAN: 285
  MINNESOTA: 141
  MISSISSIPPI: 86
  MISSOURI: 335
  MONTANA: 10
  NEBRASKA: 98
  NEVADA: 19
  NEW BRUNSWICK: 18
  NEW HAMPSHIRE: 62
  NEW JERSEY: 162
  NEW MEXICO: 7
  NEW YORK: 950
  NORTH CAROLINA: 81
  OHIO: 538
  OREGON: 42
  PENNSYLVANIA: 633
  RHODE ISLAND: 27
  SOUTH CAROLINA: 63
  TENNESSEE: 134
  TEXAS: 147
  UTAH:

In [None]:
# new merger

import pandas as pd
import os
from pathlib import Path
from difflib import SequenceMatcher
import re
from collections import defaultdict

DAYS_OF_WEEK = ['sundays', 'mondays', 'tuesdays', 'wednesdays', 'thursdays', 'fridays', 'saturdays']

# Precompile regex patterns for day removal and normalization
_DAYS_PATTERN = re.compile('|'.join(re.escape(day) for day in DAYS_OF_WEEK), re.IGNORECASE)
_NORMALIZE_PATTERN = re.compile(r"[.,'\s]")

def normalize_text(s):
    """Normalize text for matching: lowercase, strip whitespace, remove punctuation."""
    if pd.isna(s):
        return ""
    return _NORMALIZE_PATTERN.sub("", str(s).lower().strip())

def normalize_text_no_days(s):
    """Normalize text and also remove days of the week."""
    if pd.isna(s):
        return ""
    text = _DAYS_PATTERN.sub("", str(s).lower().strip())
    return _NORMALIZE_PATTERN.sub("", text)

def similarity(a, b):
    """Calculate similarity ratio between two strings (0 to 1)."""
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a, b).ratio()

def is_fuzzy_match(town1, name1, town2, name2, threshold=0.90):
    """
    Check if two newspaper records match using fuzzy matching.
    More strict: requires high similarity on both fields.
    """
    town_sim = similarity(town1, town2)
    name_sim = similarity(name1, name2)
    
    if town_sim >= threshold and name_sim >= threshold:
        return True
    if town_sim == 1.0 and name_sim >= 0.85:
        return True
    if name_sim == 1.0 and town_sim >= 0.85:
        return True
    return False

def remove_days_from_name(name):
    """Remove days of the week from a newspaper name, preserving original formatting."""
    if pd.isna(name):
        return ""
    result = _DAYS_PATTERN.sub('', str(name))
    # Clean up extra spaces
    return ' '.join(result.split()).strip()

def find_best_match(town, name, position_pct, existing_records_flat, existing_records_by_letter,
                    existing_records_no_days, current_established=None, established_lookup=None,
                    threshold=0.90):
    """
    Find the best matching key from existing records.
    First tries exact match, then fuzzy match for towns with same first letter,
    then tries again with days of week removed from BOTH current and existing names.
    Uses lower threshold (80%) if established dates match.
    
    Returns tuple: (matched_key or None, matched_via_days_removal: bool)
    """
    town_norm = normalize_text(town)
    name_norm = normalize_text(name)
    name_norm_no_days = normalize_text_no_days(name)
    
    # First: try exact match (using flat dict)
    exact_key = (town_norm, name_norm)
    if exact_key in existing_records_flat:
        return exact_key, False
    
    # Get first letter of town for filtering
    town_first_letter = town_norm[0] if town_norm else ""
    if not town_first_letter:
        return None, False
    
    # Get candidates with same first letter (pre-indexed)
    candidates = existing_records_by_letter.get(town_first_letter, {})
    if not candidates:
        return None, False
    
    # Second: try fuzzy match, only considering towns with same first letter
    # Now also compares with days removed from BOTH names
    best_match = None
    best_score = 0
    matched_via_days = False
    
    for (ex_town, ex_name), ex_position in candidates.items():
        # Check if established dates match for lower threshold
        effective_threshold = threshold
        if current_established and established_lookup:
            ex_established = established_lookup.get((ex_town, ex_name))
            if ex_established and str(current_established).strip() == str(ex_established).strip():
                effective_threshold = 0.80
        
        # Try standard fuzzy match first
        if is_fuzzy_match(town_norm, name_norm, ex_town, ex_name, effective_threshold):
            score = similarity(town_norm, ex_town) + similarity(name_norm, ex_name)
            if score > best_score:
                best_score = score
                best_match = (ex_town, ex_name)
                matched_via_days = False
            continue  # Found a match, no need to try days-removed for this record
        
        # Try with days removed from BOTH current and existing names
        # Use pre-computed no-days version
        ex_name_no_days = existing_records_no_days.get((ex_town, ex_name), ex_name)
        if is_fuzzy_match(town_norm, name_norm_no_days, ex_town, ex_name_no_days, effective_threshold):
            score = similarity(town_norm, ex_town) + similarity(name_norm_no_days, ex_name_no_days)
            if score > best_score:
                best_score = score
                best_match = (ex_town, ex_name)
                matched_via_days = True
    
    return best_match, matched_via_days

def load_and_tag_csvs(directory):
    """Load all CSVs from directory, tag with year, and split into pre/post 1877."""
    csv1_frames = []
    csv2_frames = []
    
    for file in Path(directory).glob("*.csv"):
        filename = file.stem
        year = None
        
        match = re.search(r'(1[89]\d{2})', filename)
        if match:
            year = int(match.group(1))
        
        if year is None:
            print(f"Warning: Could not extract year from {file.name}, skipping...")
            continue
        
        df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
        df['_year'] = year
        
        print(f"Loaded {file.name} (year {year}): {len(df)} records")
        
        if year <= 1876:
            csv1_frames.append(df)
        else:
            csv2_frames.append(df)
    
    return csv1_frames, csv2_frames

def process_dataframe(df, year, has_state=False):
    if 'raw_text' in df.columns:
        df = df.drop(columns=['raw_text'])
    
    df.columns = [c.lower().strip() for c in df.columns]
    
    # Normalize variant column names to standard names
    column_aliases = {
        'newspaper': 'newspaper_name',
        'political': 'political_affiliation',
    }
    df = df.rename(columns={k: v for k, v in column_aliases.items() if k in df.columns})
    
    # ... rest of function
    """Process a single dataframe: standardize and prepare for merging."""
    if 'raw_text' in df.columns:
        df = df.drop(columns=['raw_text'])
    
    df.columns = [c.lower().strip() for c in df.columns]
    
    data_cols = ['frequency', 'political_affiliation', 'subscription_price', 
                 'established', 'editor', 'publisher', 'circulation']
    
    rename_map = {}
    for col in data_cols:
        if col in df.columns:
            rename_map[col] = f"{year} {col}"
    
    df = df.rename(columns=rename_map)
    return df

def merge_newspapers_core(all_frames):
    """Core merge logic used by both full and test functions."""
    merged_records = {}
    # Flat dict for exact lookups
    record_positions_flat = {}
    # Index by first letter of town for faster fuzzy lookups
    record_positions_by_letter = defaultdict(dict)
    # Pre-computed normalized names with days removed
    existing_records_no_days = {}
    original_names = {}
    established_lookup = {}  # Track established dates for each record
    
    print(f"Processing {len(all_frames)} files...")
    print("Strategy: exact match first, then fuzzy match (same first letter, 90% similarity),")
    print("          with days of week removed from BOTH current and existing names,")
    print("          80% threshold if established dates match\n")
    
    for df, year, has_state in all_frames:
        total_rows = len(df)
        print(f"  Processing year {year} ({total_rows} records)...")
        matches_found = 0
        new_records = 0
        
        # Collect new records for this year, add to main dict after processing
        year_new_keys = []
        
        # Find the established column for this year
        established_col = f"{year} established"
        
        for idx, row in df.iterrows():
            row_num = df.index.get_loc(idx)
            position_pct = row_num / max(total_rows - 1, 1)
            
            town = row.get('town', '')
            name = row.get('newspaper_name', '')
            state = row.get('state', '') if has_state else ''
            current_established = row.get(established_col, None)
            
            town_norm = normalize_text(town)
            name_norm = normalize_text(name)
            
            if not town_norm or not name_norm:
                continue
            
            # Only match against records from previous years
            existing_key, matched_via_days = find_best_match(
                town, name, position_pct,
                record_positions_flat,
                record_positions_by_letter,
                existing_records_no_days,
                current_established=current_established,
                established_lookup=established_lookup,
                threshold=0.90
            )
            
            if existing_key:
                key = existing_key
                matches_found += 1
                old_pos = record_positions_flat[key]
                new_pos = (old_pos + position_pct) / 2
                record_positions_flat[key] = new_pos
                # Update indexed version too
                first_letter = key[0][0] if key[0] else ""
                if first_letter:
                    record_positions_by_letter[first_letter][key] = new_pos
                
                # If matched via days removal, update the stored name to remove days
                if matched_via_days:
                    old_town, old_name, old_state = original_names[key]
                    cleaned_name = remove_days_from_name(old_name)
                    original_names[key] = (old_town, cleaned_name, old_state)
            else:
                key = (town_norm, name_norm)
                merged_records[key] = {}
                original_names[key] = (town, name, state)
                # Queue this to be added after processing this year
                year_new_keys.append((key, position_pct, current_established))
                new_records += 1
            
            if state and not original_names[key][2]:
                original_names[key] = (original_names[key][0], original_names[key][1], state)
            
            year_cols = [c for c in row.index if c.startswith(f"{year} ")]
            for col in year_cols:
                merged_records[key][col] = row[col]
        
        # Now add this year's new records to positions for next year's matching
        for key, pos, estab in year_new_keys:
            record_positions_flat[key] = pos
            # Index by first letter
            first_letter = key[0][0] if key[0] else ""
            if first_letter:
                record_positions_by_letter[first_letter][key] = pos
            # Pre-compute no-days version
            existing_records_no_days[key] = normalize_text_no_days(key[1])
            if estab:
                established_lookup[key] = estab
        
        print(f"    -> {matches_found} matched to existing, {new_records} new records")
    
    print(f"\nTotal unique newspapers found: {len(merged_records)}")
    
    rows = []
    for key, data in merged_records.items():
        town, name, state = original_names[key]
        row = {'state': state, 'town': town, 'newspaper_name': name}
        row.update(data)
        rows.append(row)
    
    result = pd.DataFrame(rows)
    
    id_cols = ['state', 'town', 'newspaper_name']
    year_cols = [c for c in result.columns if c not in id_cols]
    
    def sort_key(col):
        parts = col.split(' ', 1)
        if len(parts) == 2 and parts[0].isdigit():
            return (int(parts[0]), parts[1])
        return (9999, col)
    
    year_cols = sorted(year_cols, key=sort_key)
    final_cols = id_cols + year_cols
    result = result[final_cols]
    result = result.sort_values(['state', 'town', 'newspaper_name'])
    
    return result

def prepare_frames(directory, max_years=None):
    """Load and prepare frames, optionally limiting to first N years."""
    csv1_frames, csv2_frames = load_and_tag_csvs(directory)
    
    if not csv1_frames and not csv2_frames:
        print("No CSV files found!")
        return None
    
    all_frames_raw = []
    
    for df in csv1_frames:
        year = df['_year'].iloc[0]
        all_frames_raw.append((df, year, False))
    
    for df in csv2_frames:
        year = df['_year'].iloc[0]
        all_frames_raw.append((df, year, True))
    
    all_frames_raw.sort(key=lambda x: x[1])
    
    if max_years is not None:
        all_frames_raw = all_frames_raw[:max_years]
        years_processing = [f[1] for f in all_frames_raw]
        print(f"\nTEST MODE: Processing only first {max_years} years: {years_processing}\n")
    
    all_frames = []
    for df, year, has_state in all_frames_raw:
        df = df.drop(columns=['_year'])
        df = process_dataframe(df, year, has_state=has_state)
        all_frames.append((df, year, has_state))
    
    return all_frames

def merge_newspapers_fuzzy(directory):
    """Main function to merge all newspaper CSVs with fuzzy matching."""
    all_frames = prepare_frames(directory)
    if all_frames is None:
        return None
    return merge_newspapers_core(all_frames)


if __name__ == "__main__":
    
    directory = r"data\Newspaper Directory Excel"
    
    print(f"Processing CSVs from: {directory}")
    print("=" * 60)
    
    result = merge_newspapers_fuzzy(directory)
    
    if result is not None:
        output_path = "master.csv"
        result.to_csv(output_path, index=False)
        print(f"\nSuccess! Output saved to: {output_path}")
        print(f"Total newspapers: {len(result)}")
        print(f"\nColumns in output:")
        for col in result.columns:
            print(f"  - {col}")
    else:
        print("Failed to create merged CSV.")

Processing CSVs from: data\Newspaper Directory Excel
Loaded Rowell 1869.csv (year 1869): 3072 records
Loaded Rowell 1871.csv (year 1871): 5878 records
Loaded Rowell 1872.csv (year 1872): 6241 records
Loaded Rowell 1873.csv (year 1873): 6550 records
Loaded Rowell 1876.csv (year 1876): 7825 records
Loaded Rowell 1877.csv (year 1877): 7274 records
Loaded Rowell 1878.csv (year 1878): 7548 records
Loaded Rowell 1879.csv (year 1879): 7785 records
Loaded Rowell 1880.csv (year 1880): 8561 records
Loaded Rowell 1882.csv (year 1882): 10435 records
Loaded Rowell 1883.csv (year 1883): 10171 records
Loaded Rowell 1884.csv (year 1884): 11484 records
Loaded Rowell 1885.csv (year 1885): 12282 records
Loaded Rowell 1890.csv (year 1890): 15629 records
Processing 14 files...
Strategy: exact match first, then fuzzy match (same first letter, 90% similarity),
          with days of week removed from BOTH current and existing names,
          80% threshold if established dates match

  Processing year 1869 (

In [None]:
# descriptive stats

import pandas as pd

df = pd.read_csv("data/master.csv")

# Get year columns (those starting with a 4-digit year)
year_prefixes = set()
for col in df.columns:
    parts = col.split(' ', 1)
    if len(parts) == 2 and parts[0].isdigit() and len(parts[0]) == 4:
        year_prefixes.add(parts[0])

years = sorted(year_prefixes)
print(f"Years in dataset: {years}\n")

# For each row, count how many years have any data
def count_years_with_data(row):
    count = 0
    for year in years:
        year_cols = [c for c in df.columns if c.startswith(f"{year} ")]
        if any(pd.notna(row[c]) for c in year_cols):
            count += 1
    return count

df['years_of_data'] = df.apply(count_years_with_data, axis=1)

# Summary
print("Newspapers by number of years with data:")
print("-" * 40)
counts = df['years_of_data'].value_counts().sort_index()
for num_years, count in counts.items():
    print(f"  {num_years} year(s): {count} newspapers")

print(f"\nTotal newspapers: {len(df)}")

  df = pd.read_csv("data/master.csv")


Years in dataset: ['1869', '1871', '1872', '1873', '1876', '1877', '1878', '1879', '1880', '1882', '1883', '1884', '1885', '1890']

Newspapers by number of years with data:
----------------------------------------
  0 year(s): 1277 newspapers
  1 year(s): 28675 newspapers
  2 year(s): 6484 newspapers
  3 year(s): 3584 newspapers
  4 year(s): 2496 newspapers
  5 year(s): 1467 newspapers
  6 year(s): 949 newspapers
  7 year(s): 756 newspapers
  8 year(s): 747 newspapers
  9 year(s): 608 newspapers
  10 year(s): 521 newspapers
  11 year(s): 525 newspapers
  12 year(s): 565 newspapers
  13 year(s): 325 newspapers
  14 year(s): 65 newspapers

Total newspapers: 49044


In [1]:
# CLEANING UP master.csv 

import pandas as pd

# Load the data
df = pd.read_csv('data/master.csv')

# Define the years we're tracking
years = [1869, 1871, 1872, 1873, 1876, 1877, 1878, 1879, 1880, 1882, 1883, 1884, 1885, 1890]

def levenshtein_distance(s1, s2):
    """Calculate the Levenshtein distance between two strings."""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    prev_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        curr_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = prev_row[j + 1] + 1
            deletions = curr_row[j] + 1
            substitutions = prev_row[j] + (c1 != c2)
            curr_row.append(min(insertions, deletions, substitutions))
        prev_row = curr_row
    return prev_row[-1]

# Clean publisher columns - remove trailing ', editorand' or ', editor and' (with fuzzy matching)
def clean_publisher(val):
    if pd.isna(val):
        return val
    val = str(val).strip()
    lower_val = val.lower()
    
    # Check for ', editor and' first (exact match)
    if lower_val.endswith(', editor and'):
        return val[:-len(', editor and')]
    
    # Check for ', editorand' with fuzzy matching (1 char tolerance)
    # Look for ', ' followed by something close to 'editorand'
    if ', ' in lower_val:
        last_comma_idx = lower_val.rfind(', ')
        suffix = lower_val[last_comma_idx + 2:]  # text after ', '
        if levenshtein_distance(suffix, 'editorand') <= 1:
            return val[:last_comma_idx]
        if levenshtein_distance(suffix, 'editor and') <= 1:
            return val[:last_comma_idx]
    
    return val

# Apply cleaning to all publisher columns
publisher_cols = [f'{year} publisher' for year in years]
changes_made = 0

for col in publisher_cols:
    if col in df.columns:
        original = df[col].copy()
        df[col] = df[col].apply(clean_publisher)
        changes_made += (original != df[col]).sum()

print(f"Cleaned {changes_made} publisher entries")
print("Removed trailing ', editorand' and ', editor and'")

# Save back to master.csv
df.to_csv('data/master.csv', index=False)
print("Saved to data/master.csv")

  df = pd.read_csv('data/master.csv')


Cleaned 570145 publisher entries
Removed trailing ', editorand' and ', editor and'
Saved to data/master.csv
