In [1]:
import os

# Directory years to compare and how much of each file to copy into the report
years = [1869, 1871, 1872, 1873, 1876]
char_limit = 20000

# Paths are assembled with os.path.join so the cell also runs where the
# separator is not a backslash (the result on Windows is unchanged).
txt_dir = os.path.join("data", "Newspaper Directory Text")
csv_dir = os.path.join("data", "Newspaper Directory Excel")
output_file = "extraction_comparison.md"

with open(output_file, "w", encoding="utf-8") as out:
    out.write("# Data Extraction Comparison\n\n")
    # The character count is taken from char_limit so the intro cannot drift
    # from the per-section headings when the limit is changed.
    out.write(f"This file contains the first {char_limit:,} characters of each source text file ")
    out.write("alongside the corresponding CSV output for comparison.\n\n")
    out.write("---\n\n")

    for year in years:
        out.write(f"# Year: {year}\n\n")

        # (heading, opening fence, file) for the two excerpts of this year
        sections = [
            ("Source Text", "```\n", os.path.join(txt_dir, f"Rowell {year}.txt")),
            ("CSV Output", "```csv\n", os.path.join(csv_dir, f"Rowell {year}.csv")),
        ]
        for heading, fence, path in sections:
            out.write(f"## {heading} (first {char_limit:,} characters)\n\n")
            out.write(fence)
            try:
                with open(path, "r", encoding="utf-8") as f:
                    out.write(f.read(char_limit))
            except FileNotFoundError:
                out.write(f"[FILE NOT FOUND: {path}]")
            except Exception as e:
                # Best effort: record the problem in the report and carry on
                out.write(f"[ERROR READING FILE: {e}]")
            out.write("\n```\n\n")

        out.write("---\n\n")

print(f"Comparison file created: {output_file}")

Comparison file created: extraction_comparison.md


In [None]:
import pandas as pd
import json
import numpy as np

# Load the newspaper metadata
newspapers = pd.read_csv('final_list.csv')

# Load the topic counts JSON. The encoding is explicit because the paper names
# stored in it become merge keys and may contain non-ASCII characters; without
# it open() uses the platform's locale encoding.
with open('topic_counts.json', 'r', encoding='utf-8') as f:
    topic_data = json.load(f)

# Define the topic categories we're tracking
TOPICS = [
    'labor_workers', 'politics_elections', 'congress_government',
    'business_commerce', 'railroads_transportation', 'agriculture_farming',
    'courts_law', 'finance_money', 'immigration_foreign', 'crime_police'
]

# Build a long-form panel from the JSON: one record per (year, paper) that has
# a 'normalized_counts' entry; a topic absent from that entry counts as 0.0.
records = []
for year, papers in topic_data.items():
    for paper_name, data in papers.items():
        if 'normalized_counts' not in data:
            continue
        shares = data['normalized_counts']
        record = {
            'year': int(year),
            'newspapers_all_years_name': paper_name.lower().strip()
        }
        for topic in TOPICS:
            record[topic] = shares.get(topic, 0.0)
        records.append(record)

topic_panel = pd.DataFrame(records)

# Normalize newspaper names in metadata the same way (lower-case, trimmed)
# so both sides of the merge use identical keys
newspapers['newspapers_all_years_name'] = (
    newspapers['newspapers_all_years_name'].str.lower().str.strip()
)

# Merge topic panel with newspaper metadata. how='left' keeps papers that have
# no metadata match; those rows get NaN in the metadata columns.
panel = topic_panel.merge(
    newspapers[['newspapers_all_years_name', 'master_id', 'master_name', 'publisher_change_year']],
    on='newspapers_all_years_name',
    how='left'
)

# Sort for lag calculation
panel = panel.sort_values(['master_id', 'year']).reset_index(drop=True)

# Calculate Y_it: sqrt of sum of squared differences in topic shares from t-1 to t
def calc_volatility(group):
    """Add the year-over-year content volatility column ``Y_it`` to one paper's rows.

    Y_it is the square root of the summed squared differences of the TOPICS
    shares between a row and the row of the preceding year.

    Args:
        group: DataFrame holding one newspaper's observations, with a 'year'
            column and one column per entry of TOPICS.

    Returns:
        A copy of ``group`` sorted by 'year' with a new 'Y_it' column. The value
        is NaN for the earliest row and for any row whose predecessor is not
        exactly one year earlier.
    """
    ordered = group.sort_values('year').copy()
    years_seen = ordered['year'].to_numpy()
    shares = ordered[TOPICS].to_numpy(dtype=float)

    # Walk the rows by position rather than by index label, so the result does
    # not depend on the index being unique and no per-row label lookup is needed.
    volatility = [np.nan] * len(ordered)
    for pos in range(1, len(ordered)):
        # Only calculate if years are consecutive
        if years_seen[pos] == years_seen[pos - 1] + 1:
            step = shares[pos] - shares[pos - 1]
            volatility[pos] = np.sqrt(sum(delta ** 2 for delta in step))

    ordered['Y_it'] = volatility
    return ordered

# Rows whose name found no metadata match carry a NaN master_id and cannot be
# assigned to a newspaper. groupby skips NaN keys, so report the exclusion
# instead of letting those observations disappear silently.
n_unmatched = int(panel['master_id'].isna().sum())
if n_unmatched:
    print(f"Note: {n_unmatched} observations have no master_id and are excluded")

# Compute Y_it newspaper by newspaper. The groups are iterated explicitly
# because groupby.apply is deprecated in recent pandas for functions that need
# the grouping column; iterating always hands calc_volatility the full frame,
# so 'master_id' is guaranteed to survive into the result.
per_paper = [calc_volatility(group) for _, group in panel.groupby('master_id')]
if not per_paper:
    raise ValueError("No observation matched a master_id; check the name keys used in the merge")
panel = pd.concat(per_paper)

# Calculate Time_to_Treat (k): year - publisher_change_year.
# Plain subtraction already yields NaN for never-treated papers (NaN Treat_Year).
panel['Treat_Year'] = panel['publisher_change_year']
panel['Time_to_Treat'] = panel['year'] - panel['Treat_Year']

# Create the final output table (Y_it, Treat_Year and Time_to_Treat keep their names)
output = panel[[
    'master_id', 'master_name', 'year', 'Y_it', 'Treat_Year', 'Time_to_Treat'
]].rename(columns={
    'master_id': 'Newspaper_ID',
    'master_name': 'Newspaper_Name',
    'year': 'Year',
})

# Sort for readability
output = output.sort_values(['Newspaper_ID', 'Year']).reset_index(drop=True)

# Display sample
print("Sample of final panel data:")
print(output.head(20).to_string(index=False))
print(f"\nTotal observations: {len(output)}")
print(f"Unique newspapers: {output['Newspaper_ID'].nunique()}")
print(f"Year range: {output['Year'].min()} - {output['Year'].max()}")
print(f"Treated newspapers: {output[output['Treat_Year'].notna()]['Newspaper_ID'].nunique()}")

# Save to CSV
output.to_csv('panel_for_did.csv', index=False)
print("\nSaved to 'panel_for_did.csv'")

In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Read the panel written by the previous cell and keep only rows that have an
# outcome. Y_it is missing for a paper's earliest observation and for any
# observation that does not directly follow the previous year.
panel = pd.read_csv('panel_for_did.csv').dropna(subset=['Y_it'])

# =============================================================================
# Step 1: Create event-time dummies D^k_it
# Event window is k in [K_MIN, K_MAX]; values outside it are binned into the
# endpoints to avoid collinearity
# =============================================================================
K_MIN = -5
K_MAX = 5

def bin_time_to_treat(k):
    """Translate a relative event time into its categorical label.

    Missing values (papers without a treatment year) become 'never_treated'.
    Any other value is clamped into [K_MIN, K_MAX], truncated to an integer
    and rendered as 'k_<value>'.
    """
    if pd.isna(k):
        return 'never_treated'
    # Clamp first, then format once
    clamped = K_MIN if k < K_MIN else K_MAX if k > K_MAX else int(k)
    return f'k_{clamped}'

panel['event_time'] = panel['Time_to_Treat'].apply(bin_time_to_treat)

# -----------------------------------------------------------------------------
# Step 2: fix the category order so k = -1 (the year before treatment) can be
# named as the omitted reference level in the model formula
# -----------------------------------------------------------------------------
event_levels = ['never_treated'] + [f'k_{k}' for k in range(K_MIN, K_MAX + 1)]
panel['event_time'] = pd.Categorical(
    panel['event_time'], categories=event_levels, ordered=True
)

# -----------------------------------------------------------------------------
# Step 3: Two-Way Fixed Effects event study
#   Y_it = alpha_i + delta_t + sum_k beta_k * D^k_it + eps_it
# with newspaper and year fixed effects and standard errors clustered by paper
# -----------------------------------------------------------------------------
twfe_formula = 'Y_it ~ C(event_time, Treatment("k_-1")) + C(Newspaper_ID) + C(Year)'
model = smf.ols(twfe_formula, data=panel).fit(
    cov_type='cluster',
    cov_kwds={'groups': panel['Newspaper_ID']},
)

print("=== TWFE Event Study Results ===\n")
print(model.summary().tables[1])

# -----------------------------------------------------------------------------
# Step 4: collect beta_k with standard errors and confidence bounds
# -----------------------------------------------------------------------------
conf_bounds = model.conf_int()  # computed once and indexed per coefficient
coefs = []
for k in range(K_MIN, K_MAX + 1):
    if k == -1:
        # Reference period: zero by construction
        coefs.append({'k': k, 'beta': 0, 'se': 0, 'ci_lower': 0, 'ci_upper': 0})
        continue
    param_name = f'C(event_time, Treatment("k_-1"))[T.k_{k}]'
    if param_name not in model.params:
        # No coefficient under this name in the fit: leave k out of the plot
        continue
    coefs.append({
        'k': k,
        'beta': model.params[param_name],
        'se': model.bse[param_name],
        'ci_lower': conf_bounds.loc[param_name, 0],
        'ci_upper': conf_bounds.loc[param_name, 1],
    })

coef_df = pd.DataFrame(coefs)

# -----------------------------------------------------------------------------
# Step 5: event-study plot
# Coefficients for k < -1 speak to pre-trends; k >= 0 are post-change effects
# -----------------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))

# errorbar expects distances below and above each point, not absolute bounds
dist_below = coef_df['beta'] - coef_df['ci_lower']
dist_above = coef_df['ci_upper'] - coef_df['beta']
ax.errorbar(
    coef_df['k'],
    coef_df['beta'],
    yerr=[dist_below, dist_above],
    fmt='o',
    capsize=4,
    color='steelblue',
    markersize=8,
)

ax.axhline(y=0, color='black', linestyle='-', linewidth=0.8)
ax.axvline(x=-0.5, color='red', linestyle='--', linewidth=1, label='Treatment')

ax.set_xlabel('Years Relative to Publisher Change (k)', fontsize=12)
ax.set_ylabel('Effect on Content Volatility (βₖ)', fontsize=12)
ax.set_title('Event Study: Publisher Change Effect on Editorial Content', fontsize=14)
ax.set_xticks(range(K_MIN, K_MAX + 1))
ax.legend()
plt.tight_layout()
plt.savefig('event_study_plot.png', dpi=150)
plt.show()

for guide_line in (
    "\n=== Interpretation Guide ===",
    "• Pre-trend test: β coefficients for k < -1 should be ≈ 0 (not significant)",
    "• Causal effect: β coefficients for k ≥ 0 show treatment impact",
    "• Positive β means MORE content volatility after publisher change",
):
    print(guide_line)

In [None]:
# ARCHIVED OLD VERSION OF PUBLISHER CHANGE: 

import pandas as pd

# Load the data: the master table whose "<year> publisher" / "<year> editor"
# columns are read row by row in analyze_publisher_changes
df = pd.read_csv('data/master.csv')

# Define the years we're tracking
# NOTE(review): this rebinds the notebook-level name `years`, which the first
# cell also assigns (with a shorter list). analyze_publisher_changes reads the
# global at call time, so re-running the first cell afterwards changes its input.
years = [1869, 1871, 1872, 1873, 1876, 1877, 1878, 1879, 1880, 1882, 1883, 1884, 1885, 1890]

def levenshtein_distance(s1, s2):
    """Return the edit distance (insert / delete / substitute) between two strings.

    Uses the two-row dynamic-programming form: only the previous and the
    current row of the distance table are kept in memory.
    """
    # Put the longer string on the outer loop so the rows have the shorter length
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)
    if not shorter:
        return len(longer)

    previous = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        current = [row_no]
        for col, ch_short in enumerate(shorter):
            via_substitute = previous[col] + (ch_long != ch_short)
            via_insert = previous[col + 1] + 1
            via_delete = current[col] + 1
            current.append(min(via_insert, via_delete, via_substitute))
        previous = current
    return previous[-1]

def strings_match(s1, s2, max_distance=1):
    """Return True when the trimmed, lower-cased strings differ by at most max_distance edits."""
    left, right = (value.strip().lower() for value in (s1, s2))
    # Exact equality is tested first so identical strings skip the distance computation
    return left == right or levenshtein_distance(left, right) <= max_distance

import re

def tokenize_publisher(publisher_str):
    """
    Split a publisher string into its significant tokens.

    Semicolons and commas are treated as whitespace; periods are kept, so an
    initials token such as "J.D." stays in one piece. Only tokens of at least
    four characters are returned. An empty or missing string gives [].
    """
    if not publisher_str:
        return []

    significant = []
    for piece in re.sub(r'[;,]', ' ', publisher_str).split():
        piece = piece.strip()
        if len(piece) >= 4:
            significant.append(piece)
    return significant

def publishers_match(pub1, pub2):
    """
    Decide whether two publisher strings refer to the same publisher.

    Both strings are tokenized with tokenize_publisher; they match as soon as
    ANY token of pub1 equals ANY token of pub2 within one edit (tolerance for
    OCR errors). If either side yields no tokens the answer is False.
    """
    tokens1 = tokenize_publisher(pub1)
    tokens2 = tokenize_publisher(pub2)

    if not (tokens1 and tokens2):
        return False

    return any(
        strings_match(first, second, max_distance=1)
        for first in tokens1
        for second in tokens2
    )

def analyze_publisher_changes(row):
    """
    Classify one newspaper row by whether its publisher ever changes.

    Returns: (category, year_of_first_change)
    Categories: 'publisher_change', 'publisher_change_same_editor',
    'same_publisher', 'insufficient_data'. The year is None unless a change
    was found.
    """
    def _as_text(value):
        # Missing cells become '', everything else a trimmed string
        return str(value).strip() if pd.notna(value) else ''

    # Keep (year, publisher, editor) for every tracked year with a publisher entry
    data_points = []
    for year in years:
        publisher = _as_text(row.get(f'{year} publisher', ''))
        editor = _as_text(row.get(f'{year} editor', ''))
        if publisher and publisher.lower() != 'nan':
            data_points.append((year, publisher, editor))

    # Fewer than 4 usable years is too little to judge
    if len(data_points) < 4:
        return ('insufficient_data', None)

    # Compare each usable year with the one before it; only the FIRST mismatch
    # determines the outcome, so the scan stops there.
    for earlier, later in zip(data_points, data_points[1:]):
        _, prev_pub, prev_ed = earlier
        curr_year, curr_pub, curr_ed = later
        if publishers_match(prev_pub, curr_pub):
            continue
        # "Same editor" requires a non-empty editor on both sides that matches
        # within one edit
        if prev_ed and curr_ed and strings_match(prev_ed, curr_ed, max_distance=1):
            return ('publisher_change_same_editor', curr_year)
        return ('publisher_change', curr_year)

    return ('same_publisher', None)

# Classify every row; each result is a (category, first_change_year) tuple
results = df.apply(analyze_publisher_changes, axis=1)
df['category'] = results.apply(lambda pair: pair[0])
df['publisher_change_year'] = results.apply(lambda pair: pair[1])

# Rows with enough data to be classified, and how they split by category
valid_df = df[df['category'] != 'insufficient_data'].copy()
category_counts = valid_df['category'].value_counts()

heavy_rule = "=" * 60
light_rule = "-" * 40
# Display text per category, in reporting order
category_labels = {
    'publisher_change': 'Publisher changed (different editor)',
    'publisher_change_same_editor': 'Publisher changed (same editor)',
    'same_publisher': 'Same publisher throughout',
}

print(heavy_rule)
print("PUBLISHER CHANGE ANALYSIS RESULTS")
print(heavy_rule)
print(f"\nTotal newspapers analyzed: {len(df)}")
print(f"Newspapers with at least 4 years of data: {len(valid_df)}")
print(f"Newspapers with insufficient data: {len(df) - len(valid_df)}")
print("\n" + light_rule)
print("CATEGORY BREAKDOWN:")
print(light_rule)

for cat, label in category_labels.items():
    count = category_counts.get(cat, 0)
    if len(valid_df) > 0:
        pct = count / len(valid_df) * 100
    else:
        pct = 0
    print(f"{label}: {count} ({pct:.1f}%)")

# Write the two new columns back into the master file
df.to_csv('data/master.csv', index=False)
print("\n" + heavy_rule)
print("Updated master.csv with 'category' and 'publisher_change_year' columns")
print(heavy_rule)

# Show the first few newspapers that had a publisher change
print("\n" + light_rule)
print("SAMPLE: Newspapers with publisher changes")
print(light_rule)
change_categories = ['publisher_change', 'publisher_change_same_editor']
changes_df = valid_df[valid_df['category'].isin(change_categories)]
if len(changes_df) > 0:
    sample_cols = ['state', 'town', 'newspaper_name', 'category', 'publisher_change_year']
    print(changes_df[sample_cols].head(10).to_string(index=False))
else:
    print("No publisher changes found.")

# One frame per category for later inspection
identity_cols = ['state', 'town', 'newspaper_name']
publisher_change_list = valid_df[valid_df['category'] == 'publisher_change'][
    identity_cols + ['publisher_change_year']
]
publisher_change_same_editor_list = valid_df[valid_df['category'] == 'publisher_change_same_editor'][
    identity_cols + ['publisher_change_year']
]
same_publisher_list = valid_df[valid_df['category'] == 'same_publisher'][identity_cols]

print("\n" + heavy_rule)
print("DataFrames created:")
print("  - publisher_change_list")
print("  - publisher_change_same_editor_list")
print("  - same_publisher_list")
print(heavy_rule)

In [2]:
"""
Newspaper Location Lookup Script
Uses the Library of Congress loc.gov API to find state/town information
for newspapers missing location data, using LCCN as the lookup key.
"""

import csv
import time
import requests
from pathlib import Path

# Configuration
# INPUT_FILE is read by main(); its rows are accessed through the 'town',
# 'state' and 'lccn' columns (and 'name' for progress messages).
INPUT_FILE = "newspapers_all_years.csv"
# OUTPUT_FILE receives every row, updated or not; INPUT_FILE is not modified.
OUTPUT_FILE = "newspapers_all_years_updated.csv"
RATE_LIMIT_DELAY = 0.5  # seconds between API calls (be nice to LOC servers)

def get_newspaper_location(lccn):
    """
    Query the LOC API for newspaper location data using LCCN.

    Args:
        lccn: Library of Congress Control Number as a string. A blank or
            missing value returns immediately without a network call.

    Returns:
        (city, state) tuple. Each element is a non-empty value taken from the
        response, or None when the record does not supply it. HTTP errors,
        timeouts, other request failures and unparseable bodies are reported
        with print() and yield (None, None); nothing is raised for them.
    """
    if not lccn or lccn.strip() == "":
        return None, None

    lccn = lccn.strip()

    def _first(value):
        """Unwrap a list-valued field to its first entry; empty values become None."""
        if isinstance(value, list):
            value = value[0] if value else None
        return value or None

    # Try the loc.gov item endpoint with JSON response
    url = f"https://www.loc.gov/item/{lccn}/?fo=json"

    try:
        response = requests.get(url, timeout=30)

        if response.status_code == 404:
            print(f"  LCCN {lccn} not found in LOC database")
            return None, None
        if response.status_code != 200:
            print(f"  Error {response.status_code} for LCCN {lccn}")
            return None, None

        data = response.json()

        # A body that is not an object, or has no object under 'item', is
        # treated as "no location data" rather than crashing the whole batch.
        item = data.get('item') if isinstance(data, dict) else None
        if not isinstance(item, dict):
            item = {}

        # location_city and location_state can be lists or strings
        city = _first(item.get('location_city'))
        state = _first(item.get('location_state'))

        # Fall back to the combined 'location' field for whatever is still missing
        if not city or not state:
            location = item.get('location', [])
            if isinstance(location, list):
                for loc in location:
                    if not isinstance(loc, str):
                        continue
                    # Format is often "State--County--City".
                    # NOTE(review): an entry without "--" is taken as the state
                    # as a whole — confirm that suits the API's actual values.
                    parts = loc.split('--')
                    if not state:
                        state = parts[0]
                    if len(parts) >= 3 and not city:
                        city = parts[2]

        return city or None, state or None

    except requests.exceptions.Timeout:
        print(f"  Timeout for LCCN {lccn}")
        return None, None
    except requests.exceptions.RequestException as e:
        print(f"  Request error for LCCN {lccn}: {e}")
        return None, None
    except (KeyError, ValueError) as e:
        print(f"  Parse error for LCCN {lccn}: {e}")
        return None, None


def needs_location_lookup(row):
    """Return True when the row lacks a town or a state AND has an LCCN to look up.

    Args:
        row: mapping of column name to value, as produced by csv.DictReader.
            Absent keys and None values (DictReader's filler for short rows)
            are treated as empty strings.

    Returns:
        bool
    """
    town = (row.get('town') or '').strip()
    state = (row.get('state') or '').strip()
    lccn = (row.get('lccn') or '').strip()

    # Without an LCCN there is nothing to query, whatever else is missing
    return bool(lccn) and not (town and state)


def main():
    """Fill in missing town/state values from the LOC API and write a new CSV.

    Reads INPUT_FILE, queries get_newspaper_location for every row that
    needs_location_lookup flags, fills only the fields that are empty, and
    writes all rows to OUTPUT_FILE. Progress and a final summary are printed.
    Returns None; returns early if the input file is missing or no row needs
    a lookup (in which case no output file is written).
    """
    input_path = Path(INPUT_FILE)
    output_path = Path(OUTPUT_FILE)

    if not input_path.exists():
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        print("Make sure the file is in the same directory as this script.")
        return

    # Read all rows (newline='' lets the csv module handle line endings itself)
    with open(input_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        rows = list(reader)

    print(f"Loaded {len(rows)} newspapers from {INPUT_FILE}")

    # Count how many need lookups
    needs_lookup = [r for r in rows if needs_location_lookup(r)]
    print(f"Found {len(needs_lookup)} entries missing state or town with valid LCCN")

    if not needs_lookup:
        print("No entries need location lookups. Exiting.")
        return

    # Process rows needing lookups
    updated_count = 0
    failed_count = 0

    for i, row in enumerate(rows):
        if not needs_location_lookup(row):
            continue

        lccn = row['lccn'].strip()
        name = row.get('name', 'Unknown')

        print(f"[{i+1}/{len(rows)}] Looking up: {name} (LCCN: {lccn})")

        city, state = get_newspaper_location(lccn)

        # Only empty fields are filled; a row counts as updated only if at
        # least one field actually changed.
        filled = False
        if city and not (row.get('town') or '').strip():
            row['town'] = city
            print(f"  Found town: {city}")
            filled = True
        if state and not (row.get('state') or '').strip():
            row['state'] = state
            print(f"  Found state: {state}")
            filled = True

        if filled:
            updated_count += 1
        else:
            failed_count += 1
            if city or state:
                # The API answered, but not with the field this row lacks
                print("  Lookup returned nothing for the missing field")
            else:
                print("  No location data found")

        # Rate limiting - be respectful to LOC servers
        time.sleep(RATE_LIMIT_DELAY)

    # Write updated CSV
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print("\nComplete!")
    print(f"  Updated: {updated_count} entries")
    print(f"  Failed lookups: {failed_count} entries")
    print(f"  Output saved to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

Loaded 960 newspapers from newspapers_all_years.csv
Found 64 entries missing state or town with valid LCCN
[22/960] Looking up: savannah morning news (LCCN: sn82015137)
  Found town: savannah
  Found state: georgia
[29/960] Looking up: national republican (LCCN: sn86053573)
  Found town: washington
  Found state: district of columbia
[30/960] Looking up: daily republican (LCCN: sn84038114)
  Found town: wilmington
[55/960] Looking up: the cincinnati daily star (LCCN: sn85025759)
  Found town: cincinnati
[85/960] Looking up: the indiana state sentinel (LCCN: sn87056600)
  Found town: indianapolis
  Found state: indiana
[126/960] Looking up: the morning herald (LCCN: sn84038119)
  Found town: wilmington
  Found state: delaware
[154/960] Looking up: the red cloud chief (LCCN: sn84022835)
  Found town: red cloud
  Found state: nebraska
[192/960] Looking up: la crónica (LCCN: sn84025126)
  Found town: los angeles
  Found state: california
[215/960] Looking up: middletown transcript (LCCN: 

In [3]:
# possibly improved pre 1877 data extraction

import re
import csv
from pathlib import Path
import time

# Known multi-word town names that appear in the directories
# This list should be expanded as more are discovered
# (where these two constants are consumed is outside this part of the file)
MULTI_WORD_TOWNS = {
    # Mostly two-word towns; a few entries have three tokens ('FAYETTE C. H.',
    # 'BAY ST. LOUIS') and four are single words ('VICKSBURG', 'JACKSON',
    # 'MERIDIAN', 'CARROLLTON')
    'DES ARC', 'FORT SMITH', 'UNION SPRINGS', 'BLADEN SPRINGS', 
    'LITTLE ROCK', 'PINE BLUFF', 'HOT SPRINGS', 'VAN BUREN',
    'LAKE VILLAGE', 'DE WITT', 'ARKANSAS CITY', 'FAYETTE C. H.',
    'GROVE HILL', 'LA FAYETTE', 'NEW ORLEANS', 'BATON ROUGE',
    'PORT GIBSON', 'PASS CHRISTIAN', 'BAY ST. LOUIS', 'MOSS POINT',
    'WEST POINT', 'HOLLY SPRINGS', 'WATER VALLEY', 'YAZOO CITY',
    'VICKSBURG', 'JACKSON', 'MERIDIAN',
    'CARROLLTON',
    # Same town spelled with and without the apostrophe
    'DEVALL\'S BLUFF', 'DEVALLS BLUFF',
    # Common suffixes that indicate multi-word towns
    'COURT HOUSE', 'C. H.',
}

# Regex fragments for the final word of a multi-word town name
TOWN_SUFFIX_PATTERNS = [
    r'C\.\s*H\.?',      # Court House abbreviation ("C. H.", "C.H", ...)
    r'SPRINGS?',         # Spring / Springs
    r'CITY',            # City
    r'BLUFF',           # Bluff
    r'ROCK',            # Rock
    r'HILL',            # Hill
    r'POINT',           # Point
    r'VILLAGE',         # Village
]


def clean_text(text):
    """Clean OCR artifacts and normalize text.

    Applies a fixed table of substring replacements to ``text``, one after the
    other in the table's order, and returns the result.
    """
    # Keys are mis-decoded character sequences, values their plain replacements.
    # NOTE(review): as this file is currently rendered, several keys look
    # identical to one another or are prefixes of keys listed after them. In a
    # dict literal a repeated key keeps only its last value, and a shorter key
    # applied first prevents a longer one from ever matching. Non-printing
    # bytes may have been lost from these literals — verify against the raw
    # file and consider writing the keys as explicit escapes.
    replacements = {
        'Î': 'A', 'Î•': 'E', 'Îœ': 'M', 'Î': 'N', 'Ð¡': 'C', 'Ð¢': 'T',
        'Ã‰': 'E', 'Ñ': 'c', 'Ðµ': 'e', 'Ñ€': 'p', 'Ð': 'N',
        '`': "'", "'": "'", '"': '"', '"': '"',
        '\xad': '', '­': '',
    }
    # Sequential str.replace: each replacement sees the output of the previous one
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text


def normalize_town_name(town):
    """
    Return the town name with known OCR misreadings corrected.

    The lookup is done on the upper-cased, trimmed name. A known misreading is
    replaced by its (upper-case) correction; any other name is returned
    trimmed but otherwise unchanged. Falsy input is returned as is.
    """
    if not town:
        return town

    # Misread spelling -> correct spelling
    known_misreads = {
        'CAR LLTON': 'CARROLLTON',
        'CARLLTON': 'CARROLLTON', 
        'FRORENCE': 'FLORENCE',
        'TUSHALOOSA': 'TUSCALOOSA',
        'TUSHEGEE': 'TUSKEGEE',
        'SCOTSBORO': 'SCOTTSBORO',
        'MONROEVILLÃ‰': 'MONROEVILLE',
    }

    lookup_key = town.upper().strip()
    return known_misreads.get(lookup_key, town.strip())


def normalize_for_matching(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into a single space.

    Leading and trailing whitespace is reduced to one space, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)


def normalize_editor_publisher_text(text):
    """
    Prepare directory text for editor/publisher extraction.

    Three passes, in this order:
      1. drop every hyphen that is followed by whitespace, together with that
         whitespace (rejoins "edi- tors" into "editors");
      2. re-insert the spaces OCR lost around "and" in role phrases
         ("editorandpublisher" -> "editor and publisher");
      3. collapse whitespace runs to a single space.
    """
    normalized = re.sub(r'-\s+', '', text)

    # (pattern, replacement) pairs; later pairs see the output of earlier ones
    run_together_fixes = (
        (r'(editors?)(and)(pub)', r'\1 \2 \3'),
        (r'(editors?)(and)(prop)', r'\1 \2 \3'),
        (r'(publishers?)(and)(prop)', r'\1 \2 \3'),
        (r'(publishers?)(and)', r'\1 \2'),
        (r'(proprietors?)(and)', r'\1 \2'),
        (r'(and)(publishers?)', r'\1 \2'),
        (r'(and)(proprietors?)', r'\1 \2'),
    )
    for pattern, spaced in run_together_fixes:
        normalized = re.sub(pattern, spaced, normalized, flags=re.IGNORECASE)

    return re.sub(r'\s+', ' ', normalized)


def extract_circulation(text):
    """Return the circulation figure found in ``text`` as a digit string, or ''.

    The patterns are tried in order and the first that matches wins. Commas
    and periods inside the captured number are removed.
    """
    patterns = (
        r'circulation[:\s]+(?:about\s+)?(\d[\d,\.]+)',
        r'claims?\s+(?:about\s+)?(\d[\d,\.]+)\s+circulation',
        r'circ(?:ulation|\'?l?n)[:\s\.]+(?:about\s+)?(\d[\d,\.]+)',
        r'(\d[\d,\.]+)\s+circ(?:ulation|\'?l?n)',
    )
    # Lazy search: later patterns are only evaluated if the earlier ones fail
    attempts = (re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
    hit = next((found for found in attempts if found), None)
    if hit is None:
        return ''
    return hit.group(1).replace(',', '').replace('.', '')


def extract_political_affiliation(text):
    """Return the capitalized political affiliation named in ``text``, or ''.

    A word only counts when it directly follows a semicolon (optionally
    separated by whitespace). Candidates are checked in the fixed order below,
    not in the order they occur in the text.
    """
    lowered = text.lower()
    candidates = ('democratic', 'republican', 'independent', 'neutral',
                  'whig', 'conservative', 'liberal', 'radical')
    return next(
        (
            word.capitalize()
            for word in candidates
            if re.search(rf';\s*{word}\b', lowered)
        ),
        '',
    )


def extract_subscription_details(text):
    """Return the subscription price found in ``text`` as a '$'-prefixed string, or ''.

    Dollar amounts are looked for in priority order: the daily price, then the
    weekly price, then a plain "subscription $..." amount. A space inside the
    captured amount (as in "$2 00") is turned into a decimal point. If none of
    those is present, a price given in cents is converted to dollars.
    """
    dollar_patterns = (
        r'subscription[-\s]+daily\s+\$(\d+(?:\s+\d{2})?)',
        r'(?:subscription[-\s]+)?weekly\s+\$(\d+(?:\s+\d{2})?)',
        r'subscription[:\s]+\$(\d+(?:\s+\d{2})?)',
    )
    for pattern in dollar_patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found:
            amount = found.group(1).replace(' ', '.')
            return f"${amount}"

    in_cents = re.search(r'subscription\s+(\d+)\s+cents', text, re.IGNORECASE)
    if not in_cents:
        return ''
    return f"${int(in_cents.group(1))/100:.2f}"


def extract_frequency(text):
    """Return the publication frequency described in ``text``, or '' if none is recognized.

    Checks run from most to least frequent: daily ("every morning/evening/day"),
    tri-weekly, semi-weekly, semi-monthly, quarterly, monthly, and finally a
    plural weekday name, which means weekly. For the first three, a mention of
    an additional weekly edition yields the combined "... & Weekly" label.
    """
    lowered = text.lower()
    also_weekly = 'and weekly' in lowered

    if re.search(r'every\s+(?:morning|evening|day)', lowered):
        # The daily case also accepts "weekly," as a sign of a weekly edition
        if also_weekly or 'weekly,' in lowered:
            return 'Daily & Weekly'
        return 'Daily'

    if re.search(r'tri-?weekly', lowered):
        if also_weekly:
            return 'Tri-weekly & Weekly'
        return 'Tri-weekly'

    if re.search(r'semi-?weekly', lowered):
        if also_weekly:
            return 'Semi-weekly & Weekly'
        return 'Semi-weekly'

    if re.search(r'semi-?monthly', lowered):
        return 'Semi-monthly'
    if 'quarterly' in lowered:
        return 'Quarterly'
    if 'monthly' in lowered:
        return 'Monthly'

    weekday_names = ('sundays', 'mondays', 'tuesdays', 'wednesdays',
                     'thursdays', 'fridays', 'saturdays')
    if any(day in lowered for day in weekday_names):
        return 'Weekly'
    return ''


def extract_established(text):
    """Return the four-digit establishment year found in ``text`` as a string, or ''.

    Each spelling variant of "established" (including forms broken by a hyphen
    or space) is tried in turn; only the first occurrence per variant is
    examined, and a year is accepted only if it lies between 1700 and 1900.
    """
    spelling_variants = (
        r'establish[e]?d\s+(\d{4})',
        r'estab[-\s]*lished\s+(\d{4})',
        r'es[-\s]*tablished\s+(\d{4})',
    )
    plausible_years = range(1700, 1901)
    for variant in spelling_variants:
        found = re.search(variant, text, re.IGNORECASE)
        if not found:
            continue
        year = found.group(1)
        if int(year) in plausible_years:
            return year
    return ''


def clean_name(name):
    """Return the tidied name, or None when the text is not a plausible person/firm name.

    The candidate is trimmed of surrounding whitespace and then of leading and
    trailing ',', ';', ':' and '.'. It is rejected when it is shorter than
    three characters, is one of the known non-name words, consists only of
    digits (ignoring commas and periods), ends in a role word or "and", or
    starts with a lower-case letter.
    """
    if not name:
        return None

    candidate = name.strip().strip(',;:.')

    # Directory vocabulary that the name patterns pick up by mistake
    non_names = {
        'Four', 'Eight', 'The', 'And', 'Weekly', 'Daily', 'Semi',
        'Tri', 'Monthly', 'Sunday', 'Saturday', 'Friday', 'Thursday',
        'Wednesday', 'Tuesday', 'Monday', 'About', 'Claims', 'Size',
        'Subscription', 'Established', 'Circulation', 'Pages',
        'Democratic', 'Republican', 'Independent', 'Neutral',
        'Temperance', 'Association',
    }

    # The length test comes first so candidate[0] below is always safe
    rejected = (
        len(candidate) < 3
        or candidate in non_names
        or candidate.replace(',', '').replace('.', '').isdigit()
        or re.search(r'\b(editor|publisher|proprietor|and)\s*$', candidate, re.IGNORECASE)
        or candidate[0].islower()
    )
    return None if rejected else candidate


def add_name_if_unique(name, name_list):
    """Append the cleaned ``name`` to ``name_list`` unless it is already covered.

    Comparison is case-insensitive. The name is skipped when it equals, or is
    contained in, an entry already in the list. Otherwise every existing entry
    that is contained in the new name is removed first, so the list keeps the
    longest form. ``name_list`` is modified in place.

    Returns:
        True if the name was appended, False if it was invalid or a duplicate.
    """
    cleaned = clean_name(name)
    if not cleaned:
        return False

    needle = cleaned.lower()

    # Equality is a special case of containment, so one test covers both
    if any(needle in existing.lower() for existing in name_list):
        return False

    # Drop shorter entries that the new name supersedes, keeping the same list object
    name_list[:] = [kept for kept in name_list if kept.lower() not in needle]
    name_list.append(cleaned)
    return True


def extract_editor_publisher(text):
    """Extract editor and publisher names from an entry's text.

    The text is passed through ``normalize_editor_publisher_text`` and
    ``normalize_for_matching`` (both defined elsewhere in the file), spacing
    is restored around a run-together "and", and the result is split on
    semicolons.  In each segment a combined "editor(s) and publisher(s) /
    proprietor(s)" phrase takes priority and credits the name to both roles;
    otherwise the first matching editor phrase and the first matching
    publisher/proprietor phrase are used independently.

    NOTE(review): the first spacing rule, ``(\\w)(and)(\\w)``, fires on any
    "and" with a word character on each side, which includes the inside of
    ordinary words -- confirm this is intended.

    Args:
        text: Entry text.

    Returns:
        dict with keys 'editor' and 'publisher', each a '; '-joined string of
        the names accepted by ``add_name_if_unique`` ('' when none).
    """
    name_pattern = r'([A-Z][A-Za-z\.\s&,]+?)'

    def compile_roles(*role_suffixes):
        # All role regexes share the captured-name prefix and ignore case.
        return [re.compile(name_pattern + suffix, re.IGNORECASE) for suffix in role_suffixes]

    def first_hit(patterns, segment):
        # First match in pattern-priority order, or None.
        for candidate in patterns:
            hit = candidate.search(segment)
            if hit:
                return hit
        return None

    combined_patterns = compile_roles(
        r',?\s+editors?\s+and\s+publishers?',
        r',?\s+editors?\s+and\s+proprietors?',
        r',?\s+editors?andpublishers?',
        r',?\s+editors?andproprietors?',
    )
    editor_patterns = compile_roles(
        r',?\s+editors?\s*$',
        r',?\s+editors?\s*,',
        r',?\s+editor-in-chief',
    )
    publisher_patterns = compile_roles(
        r',?\s+publishers?\s*$',
        r',?\s+proprietors?\s*$',
        r',?\s+publishers?\s+and\s+proprietors?',
        r',?\s+publishersandproprietors?',
    )

    normalized = normalize_for_matching(normalize_editor_publisher_text(text))

    # Re-insert spaces around "and" where words have been run together.
    spacing_fixes = (
        (r'(\w)(and)(\w)', r'\1 \2 \3'),
        (r'(editor[s]?)(and)', r'\1 \2'),
        (r'(and)(pub)', r'\1 \2'),
        (r'(and)(prop)', r'\1 \2'),
    )
    for broken, repaired in spacing_fixes:
        normalized = re.sub(broken, repaired, normalized, flags=re.IGNORECASE)

    editors, publishers = [], []

    # Each semicolon-delimited segment is examined on its own.
    for raw_segment in normalized.split(';'):
        segment = raw_segment.strip()
        if not segment:
            continue

        both_roles = first_hit(combined_patterns, segment)
        if both_roles:
            add_name_if_unique(both_roles.group(1), editors)
            add_name_if_unique(both_roles.group(1), publishers)
            continue

        editor_hit = first_hit(editor_patterns, segment)
        if editor_hit:
            add_name_if_unique(editor_hit.group(1), editors)

        publisher_hit = first_hit(publisher_patterns, segment)
        if publisher_hit:
            add_name_if_unique(publisher_hit.group(1), publishers)

    return {'editor': '; '.join(editors), 'publisher': '; '.join(publishers)}


def is_valid_town_name(town):
    """Tell whether ``town`` looks like a real town rather than header/index text.

    The upper-cased, stripped name is rejected when it matches any of the
    header/index regexes below; the original string is also rejected when it
    is longer than 50 characters or has more than four words.

    Args:
        town: Candidate town name.

    Returns:
        True when the name passes every check, False otherwise.
    """
    header_markers = (
        r'^A\s+LIST', r'DOMINION', r'CANADA', r'BRITISH', r'COLONIES',
        r'UNITED\s+STATES', r'TERRITORIES', r'^INDEX', r'^PAGE\s*\d*',
        r'NEWSPAPERS?', r'PERIODICALS?', r'ALPHABETICALLY', r'ARRANGED',
        r'GIVING\s+NAME', r'DAYS\s+OF\s+ISSUE', r'SUBSCRIPTION\s+PRICE',
        r'EDITOR.?S?\s+AND\s+PUBLISHER', r'CIRCULATION', r'ADVERTISEMENTS?',
        r'PRINTING\s+MATERIAL', r'IN\s+WHICH', r'ARE\s+PUBLISHED',
        r'^NOTE', r'^\d+$', r'^THE\s+', r'ALIST\s+OF',
    )

    candidate = town.upper().strip()
    if any(re.search(marker, candidate) for marker in header_markers):
        return False

    # Real town names are short: at most 50 characters and four words.
    return len(town) <= 50 and len(town.split()) <= 4


def is_valid_entry_text(entry_text):
    """Tell whether ``entry_text`` looks like a newspaper entry.

    The upper-cased text is rejected when it contains any of the title-page /
    section-header phrases below.

    Args:
        entry_text: Full text of a candidate entry.

    Returns:
        False when a header phrase is present, True otherwise.
    """
    header_phrases = (
        r'ALPHABETICALLY\s+BY\s+TOWNS', r'DAYS\s+OF\s+ISSUE',
        r'POLITICS\s+OR\s+GENERAL\s+CHARACTER', r'DATE\s+OF\s+ESTABLISHMENT',
        r'EDITOR.?S?\s+AND\s+PUBLISHER.?S?\s+NAMES', r'GIV-?\s*ING\s+NAME',
        r'DOMINION\s+OF\s+CANADA', r'BRITISH\s+COLONIES',
        r'UNITED\s+STATES\s+AND\s+TERRITORIES',
        r'A\s*LIST\s+OF\s+THE\s+NEWSPAPERS',
    )

    shouted = entry_text.upper()
    return not any(re.search(phrase, shouted) for phrase in header_phrases)


def extract_town_and_newspaper(text):
    """
    Split the start of an entry into town, newspaper name and remaining text.

    Three strategies are tried in order:
      1. a prefix equal to one of MULTI_WORD_TOWNS (longest first), compared
         against the upper-cased text;
      2. a single word followed by one of TOWN_SUFFIX_PATTERNS
         (e.g. "FAYETTE C. H."), matched case-insensitively;
      3. a single ALL-CAPS word.
    In every case the newspaper name runs up to the first ';' or ':'.

    MULTI_WORD_TOWNS, TOWN_SUFFIX_PATTERNS and normalize_town_name are defined
    elsewhere in the file.

    Returns: (town, newspaper_name, remainder) or (None, None, text) if no match
    """
    text = text.strip()
    if not text:
        return None, None, text

    upper_text = text.upper()

    # 1. Known multi-word towns; longest first so the most specific name wins.
    for candidate in sorted(MULTI_WORD_TOWNS, key=len, reverse=True):
        if not upper_text.startswith(candidate):
            continue

        # Whatever follows the town (minus separators) must open with a
        # newspaper name terminated by ';' or ':'.
        tail = text[len(candidate):].lstrip(' ,')
        hit = re.match(r'^([A-Za-z][A-Za-z\s&\'\.\-]+?)\s*[;:](.*)$', tail, re.DOTALL)
        if hit:
            return normalize_town_name(candidate), hit.group(1).strip().rstrip(','), hit.group(2)

    # 2. Single word + recognised suffix.
    for suffix_pattern in TOWN_SUFFIX_PATTERNS:
        hit = re.match(
            rf'^([A-Z][A-Z]+)\s+({suffix_pattern})[,\s]+([A-Za-z][A-Za-z\s&\'\.\-]+?)\s*[;:](.*)$',
            text,
            re.IGNORECASE | re.DOTALL,
        )
        if hit:
            town = f"{hit.group(1)} {hit.group(2)}"
            return normalize_town_name(town), hit.group(3).strip().rstrip(','), hit.group(4)

    # 3. Plain single-word ALL-CAPS town, newspaper starting with a letter.
    hit = re.match(r'^([A-Z][A-Z]+)[,\s]+([A-Za-z][A-Za-z\s&\'\.\-]+?)\s*[;:](.*)$', text, re.DOTALL)
    if hit:
        return normalize_town_name(hit.group(1).strip()), hit.group(2).strip().rstrip(','), hit.group(3)

    return None, None, text


def find_entry_boundaries(text):
    """
    Locate every place in ``text`` where a new entry appears to begin.

    An entry start is a town (one of MULTI_WORD_TOWNS, or a run of two or more
    capital letters), a separator, a capitalised newspaper name and a
    semicolon, found at the start of a line or right after '.' or ';'.
    Candidates whose town fails ``is_valid_town_name`` or whose newspaper name
    is a single character are discarded.

    Note that each reported start position is where the regex match begins,
    i.e. it includes the preceding '.'/';' and whitespace when present.

    Returns list of (start_pos, town, newspaper_name) tuples
    """
    # Longest known towns first so the alternation prefers the fullest name.
    towns_longest_first = sorted(MULTI_WORD_TOWNS, key=len, reverse=True)
    multi_word_pattern = '|'.join(re.escape(t) for t in towns_longest_first)
    single_word_pattern = r'[A-Z]{2,}'

    entry_pattern = re.compile(
        rf'(?:^|[\.;])\s*'  # Start or after sentence end
        rf'((?:{multi_word_pattern})|{single_word_pattern})'  # Town name
        rf'[,\s]+'  # Separator
        rf'([A-Z][a-zA-Z][a-zA-Z\s&\'\.\-]*?)'  # Newspaper name  
        rf'\s*;',  # Semicolon
        re.MULTILINE
    )

    candidates = (
        (hit.start(), hit.group(1).strip(), hit.group(2).strip().rstrip(','))
        for hit in entry_pattern.finditer(text)
    )

    # Keep only candidates that look like real entries.
    return [
        (start, town, newspaper)
        for start, town, newspaper in candidates
        if is_valid_town_name(town) and len(newspaper) > 1
    ]


def parse_newspaper_entries_v2(text):
    """
    Boundary-based entry parser.

    The cleaned text is flattened to a single line, entry starts are located
    with ``find_entry_boundaries``, and the text between consecutive starts
    becomes one entry.  This copes with several entries on one line and with
    multi-word town names.

    Returns a list of (normalized_town, entry_text) tuples; the list is empty
    when no boundary is found, which lets the caller fall back to line-based
    parsing.
    """
    flat = clean_text(text)
    flat = re.sub(r'\n+', ' ', flat)  # newlines become spaces
    flat = re.sub(r'\s+', ' ', flat)  # collapse whitespace runs

    boundaries = find_entry_boundaries(flat)
    if not boundaries:
        return []

    # Each entry ends where the next one starts; the last runs to the end.
    starts = [start for start, _town, _paper in boundaries]
    stops = starts[1:] + [len(flat)]

    entries = []
    for (start, town, _paper), stop in zip(boundaries, stops):
        # Drop the '.'/';' that terminated the previous entry.
        chunk = re.sub(r'^[\.;]\s*', '', flat[start:stop].strip())
        if chunk and is_valid_entry_text(chunk):
            entries.append((normalize_town_name(town), chunk))

    return entries


def parse_newspaper_entries_fallback(text):
    """
    Line-based parser used when the v2 parser finds no entries.

    A line that opens with a capitalised town, a capitalised newspaper name
    and a ';' or ':' starts a new entry; every other non-blank line is
    appended to the entry in progress.  Lines seen before the first entry
    start are ignored.

    Returns a list of (normalized_town, entry_text) tuples, where entry_text
    is the entry's lines joined with single spaces.
    """
    entry_start = re.compile(
        r'^([A-Z][A-Z\s,\.]+?)(?:,\s*|\s+)([A-Z][a-zA-Z\s&\'\.\-]+?)\s*[;:]'
    )

    entries = []
    pending_lines = []
    pending_town = None

    for raw_line in clean_text(text).split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        header = entry_start.match(line)
        if header is None:
            # Continuation line: only meaningful once an entry has started.
            if pending_lines:
                pending_lines.append(line)
            continue

        # A new entry begins: flush the one in progress first.
        if pending_lines and pending_town:
            entries.append((pending_town, ' '.join(pending_lines)))
        pending_town = normalize_town_name(header.group(1).strip().rstrip(','))
        pending_lines = [line]

    if pending_lines and pending_town:
        entries.append((pending_town, ' '.join(pending_lines)))

    return entries


def parse_newspaper_entries(text):
    """
    Split directory text into individual newspaper entries.

    The boundary-based v2 parser (multi-line entries, several entries per
    line) is tried first; when it yields nothing, the line-based fallback
    parser is used instead.
    """
    return parse_newspaper_entries_v2(text) or parse_newspaper_entries_fallback(text)


def parse_entry_details(town, entry_text):
    """Build the structured record for one entry.

    Args:
        town: Town name for the entry (normalized again here).
        entry_text: Full text of the entry.

    Returns:
        dict with keys 'town', 'newspaper_name', 'frequency',
        'political_affiliation', 'subscription_price', 'established',
        'circulation', 'raw_text' (at most 300 characters plus '...'),
        'editor' and 'publisher'.  'newspaper_name' is '' when no name could
        be extracted.
    """
    town = normalize_town_name(town)

    if len(entry_text) > 300:
        preview = entry_text[:300] + '...'
    else:
        preview = entry_text

    result = {
        'town': town.strip().title(),
        'newspaper_name': '',
        'frequency': extract_frequency(entry_text),
        'political_affiliation': extract_political_affiliation(entry_text),
        'subscription_price': extract_subscription_details(entry_text),
        'established': extract_established(entry_text),
        'circulation': extract_circulation(entry_text),
        'raw_text': preview
    }

    # Preferred route: the dedicated town/newspaper splitter.
    _, newspaper, _ = extract_town_and_newspaper(entry_text)

    if not newspaper:
        # Fallback: whatever follows the (regex-escaped) town up to ';' or ':'.
        fallback = re.match(
            rf'^{re.escape(town)}[,\s]+([A-Za-z][A-Za-z\s&\'\.\-,]+?)\s*[;:]',
            entry_text, re.IGNORECASE
        )
        newspaper = fallback.group(1).strip().rstrip(',;:') if fallback else ''

    result['newspaper_name'] = newspaper

    people = extract_editor_publisher(entry_text)
    result['editor'] = people['editor']
    result['publisher'] = people['publisher']

    return result


def process_file(input_path, output_path=None):
    """Parse one directory text file and write the extracted entries to CSV.

    Args:
        input_path: Path of the UTF-8 text file (undecodable bytes are
            replaced rather than raising).
        output_path: Destination CSV; defaults to
            '<input file stem>_extracted.csv'.

    Returns:
        List of entry dicts as produced by ``parse_entry_details``, limited to
        entries that pass the town/text validity checks and have a newspaper
        name longer than one character.

    Progress and a per-field coverage summary are printed along the way.
    """
    started = time.time()

    print(f"Reading file: {input_path}")
    with open(input_path, 'r', encoding='utf-8', errors='replace') as source:
        text = source.read()

    print("Parsing entries...")
    raw_entries = parse_newspaper_entries(text)
    total_entries = len(raw_entries)
    print(f"Found {total_entries} raw entries")

    results = []

    for done, (town, entry_text) in enumerate(raw_entries, start=1):
        # Report every 100 entries and once more on the final entry.
        if done % 100 == 0 or done == total_entries:
            elapsed = time.time() - started
            pct = done / total_entries * 100
            print(f"Processing: {done}/{total_entries} ({pct:.1f}%) - {elapsed:.1f}s elapsed")

        if not (is_valid_town_name(town) and is_valid_entry_text(entry_text)):
            continue

        details = parse_entry_details(town, entry_text)
        if details['newspaper_name'] and len(details['newspaper_name']) > 1:
            results.append(details)

    if output_path is None:
        output_path = Path(input_path).stem + '_extracted.csv'

    fieldnames = ['town', 'newspaper_name', 'frequency', 'political_affiliation',
                  'subscription_price', 'established', 'editor', 'publisher',
                  'circulation', 'raw_text']

    with open(output_path, 'w', newline='', encoding='utf-8') as sink:
        writer = csv.DictWriter(sink, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    elapsed = time.time() - started
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Completed in {elapsed:.1f} seconds")
    print(f"Processed {len(results)} valid entries")
    print(f"Output written to: {output_path}")
    print(f"{banner}")

    # (label shown in the summary, key in each result dict)
    coverage_report = (
        ('frequency', 'frequency'),
        ('political affiliation', 'political_affiliation'),
        ('subscription price', 'subscription_price'),
        ('established date', 'established'),
        ('editor', 'editor'),
        ('publisher', 'publisher'),
        ('circulation', 'circulation'),
    )
    for label, field in coverage_report:
        filled = sum(1 for row in results if row[field])
        print(f"Entries with {label}: {filled}")

    return results


# =============================================================================
# USAGE
# =============================================================================

import os
# os.listdir() returns names in arbitrary order; sort before slicing so that
# "the first five files" is the same set on every run and every platform.
for file in sorted(os.listdir("data/Newspaper Directory Text/"))[:5]:
    input_file = "data/Newspaper Directory Text/" + file
    # Swap the three-letter extension ("txt") for "csv".
    output_file = "data/Newspaper Directory Excel/" + file[:-3] + 'csv'
    results = process_file(input_file, output_file)

Reading file: data/Newspaper Directory Text/Rowell 1869.txt
Parsing entries...
Found 2298 raw entries
Processing: 100/2298 (4.4%) - 0.6s elapsed
Processing: 200/2298 (8.7%) - 0.9s elapsed
Processing: 300/2298 (13.1%) - 1.2s elapsed
Processing: 400/2298 (17.4%) - 1.4s elapsed
Processing: 500/2298 (21.8%) - 1.6s elapsed
Processing: 600/2298 (26.1%) - 1.7s elapsed
Processing: 700/2298 (30.5%) - 1.9s elapsed
Processing: 800/2298 (34.8%) - 2.1s elapsed
Processing: 900/2298 (39.2%) - 2.2s elapsed
Processing: 1000/2298 (43.5%) - 2.3s elapsed
Processing: 1100/2298 (47.9%) - 2.5s elapsed
Processing: 1200/2298 (52.2%) - 2.7s elapsed
Processing: 1300/2298 (56.6%) - 3.2s elapsed
Processing: 1400/2298 (60.9%) - 3.3s elapsed
Processing: 1500/2298 (65.3%) - 3.5s elapsed
Processing: 1600/2298 (69.6%) - 3.7s elapsed
Processing: 1700/2298 (74.0%) - 3.8s elapsed
Processing: 1800/2298 (78.3%) - 4.0s elapsed
Processing: 1900/2298 (82.7%) - 4.1s elapsed
Processing: 2000/2298 (87.0%) - 4.2s elapsed
Processin

In [1]:
import csv
import re

# Known US states and territories from that era, plus the non-US names
# "DOMINION OF CANADA" and "BRITISH COLONIES".
# process_file() searches the text for each name followed by a period
# (case-insensitively) and labels every entry with the closest such
# occurrence that precedes it.
STATES = {
    "ALABAMA", "ARKANSAS", "ARIZONA", "CALIFORNIA", "COLORADO", "CONNECTICUT",
    "DELAWARE", "DISTRICT OF COLUMBIA", "FLORIDA", "GEORGIA", "IDAHO", "ILLINOIS",
    "INDIANA", "IOWA", "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND",
    "MASSACHUSETTS", "MICHIGAN", "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA",
    "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY", "NEW MEXICO", "NEW YORK",
    "NORTH CAROLINA", "OHIO", "OREGON", "PENNSYLVANIA", "RHODE ISLAND",
    "SOUTH CAROLINA", "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA",
    "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING",
    "INDIAN TERRITORY", "DAKOTA", "DOMINION OF CANADA", "BRITISH COLONIES"
}

def process_file(input_file, output_file):
    """Process a newspaper directory file and extract entries to CSV.

    Steps:
      1. read the UTF-8 text and blank out "--- Page N ---" markers;
      2. replace Greek/Cyrillic look-alike letters and a few accented
         capitals with their plain Latin equivalents, remove runs of
         O/C/D/0-like characters in front of capitalised words, and insert a
         missing space between an ALL-CAPS word and a following Capitalized
         word;
      3. index every occurrence of a STATES name followed by a period;
      4. find "TOWN, Newspaper name;" candidates and label each with the
         closest preceding state occurrence (candidates before the first
         one are dropped);
      5. de-duplicate on (state, town, newspaper), drop two known header
         false positives, and write the rows to ``output_file``.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the CSV file to write.

    Returns:
        List of dicts with keys 'state', 'town', 'newspaper' and 'raw_text'.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        content = source.read()

    # Remove page markers
    content = re.sub(r'---\s*Page\s+\d+\s*---', ' ', content)

    # Character fixes, applied in this order.  The Greek and Cyrillic letters
    # are written as escapes because they are visually indistinguishable from
    # the Latin letters that replace them.
    ocr_substitutions = (
        # Greek capitals
        ('\u0391', 'A'),  # Greek Alpha -> A
        ('\u0392', 'B'),  # Greek Beta -> B
        ('\u0395', 'E'),  # Greek Epsilon -> E
        ('\u0397', 'H'),  # Greek Eta -> H
        ('\u0399', 'I'),  # Greek Iota -> I
        ('\u039a', 'K'),  # Greek Kappa -> K
        ('\u039c', 'M'),  # Greek Mu -> M
        ('\u039d', 'N'),  # Greek Nu -> N
        ('\u039f', 'O'),  # Greek Omicron -> O
        ('\u03a1', 'P'),  # Greek Rho -> P
        ('\u03a4', 'T'),  # Greek Tau -> T
        ('\u03a7', 'X'),  # Greek Chi -> X
        ('\u0396', 'Z'),  # Greek Zeta -> Z
        ('\u0398', 'O'),  # Greek Theta -> O (visually similar)
        ('\u03a6', 'O'),  # Greek Phi -> O (visually similar)
        # Cyrillic capitals
        ('\u0421', 'C'),  # Cyrillic Es -> C
        ('\u041e', 'O'),  # Cyrillic O -> O
        ('\u0420', 'P'),  # Cyrillic Er -> P
        ('\u0424', 'O'),  # Cyrillic Ef -> O
        ('\u0410', 'A'),  # Cyrillic A -> A
        ('\u0415', 'E'),  # Cyrillic Ie -> E
        ('\u041d', 'H'),  # Cyrillic En -> H
        ('\u0412', 'B'),  # Cyrillic Ve -> B
        ('\u041a', 'K'),  # Cyrillic Ka -> K
        ('\u041c', 'M'),  # Cyrillic Em -> M
        ('\u0422', 'T'),  # Cyrillic Te -> T
        # Lowercase Greek/Cyrillic
        ('\u03bf', 'o'),  # Greek lowercase omicron -> o
        ('\u0430', 'a'),  # Cyrillic lowercase a -> a
        ('\u0435', 'e'),  # Cyrillic lowercase ie -> e
        ('\u043e', 'o'),  # Cyrillic lowercase o -> o
        ('\u0440', 'p'),  # Cyrillic lowercase er -> p
        ('\u0441', 'c'),  # Cyrillic lowercase es -> c
        # Accented capitals
        ('Ü', 'U'),
        ('Ö', 'O'),
        ('Ä', 'A'),
        ('É', 'E'),
        ('È', 'E'),
        ('Ñ', 'N'),
        ('Ç', 'C'),
    )
    for wrong, right in ocr_substitutions:
        content = content.replace(wrong, right)

    # Remove OCR artifacts: sequences of O-like characters before town names
    # Matches patterns like "OOO ", "OOD ", "COO ", "CODO ", etc.
    content = re.sub(r'\b[OoCcDd0ΘΦ]{2,}\s+([A-Z]{2,})', r'\1', content)

    # Fix missing space between ALL CAPS town and Capitalized newspaper name
    # e.g., "VAN BURENArgus" -> "VAN BUREN Argus"
    content = re.sub(r'([A-Z]{4})([A-Z][a-z])', r'\1 \2', content)

    # Normalize whitespace
    text = ' '.join(content.split())

    # Every "<STATE>." occurrence (optional space before the period), sorted
    # by position.
    state_positions = sorted(
        (hit.start(), state)
        for state in STATES
        for hit in re.finditer(r'\b' + re.escape(state) + r'\s*\.', text, re.IGNORECASE)
    )

    # Collapse occurrences within 10 characters of the previously kept one
    # (e.g. "VIRGINIA" matching inside "WEST VIRGINIA").
    kept_positions = []
    for pos, state in state_positions:
        if kept_positions and pos - kept_positions[-1][0] <= 10:
            continue
        kept_positions.append((pos, state))
    state_positions = kept_positions

    # Main pattern for newspaper entries
    pattern = re.compile(
        r'\b'
        r'([A-Z][A-Z\'\-]+(?:\s+[A-Z][A-Z\'\-]+)*)'  # Group 1: Town (ALL CAPS words)
        r'\s*[,.\s]\s*'                              # Separator
        r'([A-Z][a-z][^;:†]*?)'                      # Group 2: Newspaper name
        r'\s*[;:†]'                                  # Delimiter
    )

    # First pass: keep the candidates that look like real entries.
    valid_matches = []
    for match in pattern.finditer(text):
        pos = match.start()

        # Closest state occurrence strictly before this candidate, if any.
        match_state = next((st for sp, st in reversed(state_positions) if sp < pos), None)
        if not match_state:
            continue

        town = match.group(1).strip().rstrip(' ,.')
        newspaper = match.group(2).strip().rstrip(' ,.')

        # Skip index/header content
        lowered = newspaper.lower()
        if any(kw in lowered for kw in ('list of', 'index', 'page')):
            continue

        # Skip if newspaper contains what looks like a page header
        if re.search(r'\b\d+\s+[A-Z]{4,}\.', newspaper):
            continue

        if len(town) >= 2 and len(newspaper) >= 2:
            valid_matches.append((match, match_state, town, newspaper))

    # Second pass: an entry's raw text runs up to the start of the next valid
    # entry (or to the end of the text for the last one).
    starts = [match.start() for match, _state, _town, _paper in valid_matches]
    stops = starts[1:] + [None]
    results = [
        {
            'state': match_state,
            'town': town.title(),
            'newspaper': newspaper,
            'raw_text': text[begin:stop].strip()
        }
        for (_match, match_state, town, newspaper), begin, stop in zip(valid_matches, starts, stops)
    ]

    # Deduplicate, keeping the first record seen for each key.
    first_seen = {}
    for record in results:
        first_seen.setdefault((record['state'], record['town'], record['newspaper']), record)

    # Remove known false positive entries from document header
    false_positives = {
        ("NEW YORK", "York", "January 1, 1869. ee ~~ CONTENTS"),
        ("NEW YORK", "Xiv", "Newspaper Directory Advertiser. XV. A circular to Advertisers, containing the names of more than one thousand newspapers, among which will be found the best advertising mediums in America"),
    }
    unique = [record for key, record in first_seen.items() if key not in false_positives]

    print(f"{input_file}: {len(unique)} entries found")
    print(f"  States detected at positions: {state_positions[:10]}...")  # Debug

    with open(output_file, 'w', newline='', encoding='utf-8') as sink:
        writer = csv.DictWriter(sink, fieldnames=['state', 'town', 'newspaper', 'raw_text'])
        writer.writeheader()
        writer.writerows(unique)

    return unique

import os
# os.listdir() returns names in arbitrary order; sort before slicing so that
# "the first five files" is the same set on every run and every platform.
for file in sorted(os.listdir("data/Newspaper Directory Text/"))[:5]:
    input_file = "data/Newspaper Directory Text/" + file
    # Swap the three-letter extension ("txt") for "csv".
    output_file = "data/Newspaper Directory Excel/" + file[:-3] + 'csv'
    results = process_file(input_file, output_file)

data/Newspaper Directory Text/Rowell 1869.txt: 4859 entries found
  States detected at positions: [(178, 'NEW YORK'), (9997, 'ALABAMA'), (14964, 'ALABAMA'), (14975, 'ALABAMA'), (21921, 'ARKANSAS'), (22122, 'ARKANSAS'), (25267, 'ARKANSAS'), (25684, 'ARKANSAS'), (27603, 'CALIFORNIA'), (29205, 'CALIFORNIA')]...
data/Newspaper Directory Text/Rowell 1871.txt: 5878 entries found
  States detected at positions: [(1645, 'ALABAMA'), (6202, 'ALABAMA'), (6936, 'ALABAMA'), (10647, 'ALABAMA'), (14436, 'ALABAMA'), (16217, 'ARKANSAS'), (17901, 'ARKANSAS'), (21657, 'ARKANSAS'), (25257, 'CALIFORNIA'), (28971, 'CALIFORNIA')]...
data/Newspaper Directory Text/Rowell 1872.txt: 6241 entries found
  States detected at positions: [(3031, 'ALABAMA'), (5793, 'ALABAMA'), (8365, 'ALABAMA'), (9568, 'ALABAMA'), (9807, 'ALABAMA'), (15944, 'ALABAMA'), (17629, 'ARKANSAS'), (19444, 'ARKANSAS'), (23221, 'ARKANSAS'), (26938, 'ARKANSAS')]...
data/Newspaper Directory Text/Rowell 1873.txt: 6550 entries found
  States detect

In [9]:
def compare_files(txt_file, csv_file, output_file,
                  txt_range=(25000, 40000), csv_range=(20000, 35000)):
    """Write an excerpt of a source text file and of its CSV into one file.

    Args:
        txt_file: Path of the source text file (UTF-8).
        csv_file: Path of the extracted CSV file (UTF-8).
        output_file: Path of the comparison file to write (UTF-8).
        txt_range: (start, stop) character offsets of the text excerpt.
        csv_range: (start, stop) character offsets of the CSV excerpt.

    The section headings in the output state the offsets that were actually
    used, so the report always describes its own contents.
    """
    txt_start, txt_stop = txt_range
    csv_start, csv_stop = csv_range

    with open(txt_file, 'r', encoding='utf-8') as f:
        txt_content = f.read()[txt_start:txt_stop]

    with open(csv_file, 'r', encoding='utf-8') as f:
        csv_content = f.read()[csv_start:csv_stop]

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"=== INPUT (chars {txt_start}-{txt_stop}) ===\n\n")
        f.write(txt_content)
        f.write(f"\n\n=== OUTPUT (chars {csv_start}-{csv_stop}) ===\n\n")
        f.write(csv_content)

# Forward slashes work on Windows as well as POSIX systems and match the
# path style used by the other cells.
compare_files(
    'data/Newspaper Directory Text/Rowell 1880 - v13.txt',
    'data/Newspaper Directory Excel/Rowell 1880.csv',
    'comparison.txt',
)

In [2]:
import pandas as pd
from pathlib import Path

# For every extracted CSV, print its row count and, per column, the share of
# rows that hold a value.
dir_path = Path("data/Newspaper Directory Excel")

for csv_file in sorted(dir_path.glob("*.csv")):
    df = pd.read_csv(csv_file)
    print("\n" + "=" * 50)
    print(f"{csv_file.name}: {len(df)} rows")
    print("-" * 50)
    # notna().mean() gives the filled fraction of each column in one pass.
    for col, pct in (df.notna().mean() * 100).items():
        print(f"  {col}: {pct:.1f}%")


Rowell 1869.csv: 3072 rows
--------------------------------------------------
  state: 100.0%
  town: 100.0%
  newspaper_name: 100.0%
  frequency: 81.4%
  political_affiliation: 44.2%
  subscription_price: 37.1%
  established: 48.8%
  editor: 78.7%
  publisher: 76.0%
  circulation: 38.3%
  raw_text: 100.0%

Rowell 1871.csv: 5878 rows
--------------------------------------------------
  state: 100.0%
  town: 100.0%
  newspaper: 100.0%
  frequency: 84.9%
  political: 47.6%
  editor: 92.7%
  publisher: 95.8%
  circulation: 83.5%
  raw_text: 100.0%

Rowell 1872.csv: 6241 rows
--------------------------------------------------
  state: 100.0%
  town: 100.0%
  newspaper: 100.0%
  frequency: 84.2%
  political: 49.0%
  editor: 91.2%
  publisher: 94.8%
  circulation: 83.6%
  raw_text: 100.0%

Rowell 1873.csv: 6550 rows
--------------------------------------------------
  state: 100.0%
  town: 100.0%
  newspaper: 100.0%
  frequency: 84.8%
  political: 45.2%
  editor: 91.3%
  publisher: 95.1%
  

In [None]:
# original merger

import pandas as pd
import os
from pathlib import Path
from difflib import SequenceMatcher

# Plural weekday names, lowercase.  normalize_text_no_days() and
# remove_days_from_name() delete these substrings from newspaper names.
DAYS_OF_WEEK = ['sundays', 'mondays', 'tuesdays', 'wednesdays', 'thursdays', 'fridays', 'saturdays']

def normalize_text(s):
    """Build a matching key from ``s``.

    The value is converted to a lowercase string, surrounding whitespace is
    stripped, and periods, commas, apostrophes and spaces are removed.
    Missing values (as judged by ``pd.isna``) give "".
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower().strip()
    # Delete the four ignored characters in one pass.
    return lowered.translate(str.maketrans("", "", ".,' "))

def normalize_text_no_days(s):
    """Build a matching key from ``s`` with weekday names removed.

    Works like ``normalize_text`` (lowercase, stripped, no periods, commas,
    apostrophes or spaces) but first deletes every DAYS_OF_WEEK substring.
    Missing values (as judged by ``pd.isna``) give "".
    """
    if pd.isna(s):
        return ""

    key = str(s).lower().strip()
    for weekday in DAYS_OF_WEEK:
        key = key.replace(weekday, "")
    for ignored in (".", ",", "'", " "):
        key = key.replace(ignored, "")
    return key

def similarity(a, b):
    """Return the SequenceMatcher ratio of ``a`` and ``b`` (0 to 1).

    If either argument is empty or otherwise falsy the result is 0.0.
    """
    if a and b:
        return SequenceMatcher(None, a, b).ratio()
    return 0.0

def is_fuzzy_match(town1, name1, town2, name2, threshold=0.90):
    """
    Decide whether two (town, name) pairs refer to the same newspaper.

    A match requires either both similarities to reach ``threshold``, or one
    field to be identical (similarity 1.0) while the other reaches 0.85.
    """
    town_sim = similarity(town1, town2)
    name_sim = similarity(name1, name2)

    both_close = town_sim >= threshold and name_sim >= threshold
    same_town = town_sim == 1.0 and name_sim >= 0.85
    same_name = name_sim == 1.0 and town_sim >= 0.85
    return both_close or same_town or same_name

def remove_days_from_name(name):
    """Strip weekday names from a newspaper name, keeping its original casing.

    Every DAYS_OF_WEEK entry is removed case-insensitively, then runs of
    whitespace are collapsed to single spaces.  Missing values (as judged by
    ``pd.isna``) give "".
    """
    import re

    if pd.isna(name):
        return ""

    stripped = str(name)
    for weekday in DAYS_OF_WEEK:
        stripped = re.sub(re.escape(weekday), '', stripped, flags=re.IGNORECASE)

    # split()/join collapses whitespace and trims both ends.
    return ' '.join(stripped.split())

def find_best_match(town, name, position_pct, existing_records, current_established=None, established_lookup=None, threshold=0.90):
    """
    Find the existing record key that best matches (town, name).

    An exact match on the normalized (town, name) key wins immediately.
    Otherwise every existing record whose town starts with the same letter is
    compared fuzzily, first on the names as they are and, failing that, with
    weekday names removed from both.  The threshold drops to 0.80 for a
    record whose established date (from ``established_lookup``) equals
    ``current_established``.  Among the matching records the one with the
    highest town + name similarity is returned.

    ``position_pct`` is accepted but not used by this function.

    Returns tuple: (matched_key or None, matched_via_days_removal: bool)
    """
    town_key = normalize_text(town)
    name_key = normalize_text(name)
    name_key_no_days = normalize_text_no_days(name)

    # Exact hit: no fuzzy work needed.
    if (town_key, name_key) in existing_records:
        return (town_key, name_key), False

    # Candidates are restricted to towns sharing the same first letter.
    initial = town_key[0] if town_key else ""

    best_match = None
    best_score = 0
    matched_via_days = False

    for (ex_town, ex_name), _position in existing_records.items():
        if not ex_town or ex_town[0] != initial:
            continue

        # Matching established dates justify a more lenient threshold.
        cutoff = threshold
        if current_established and established_lookup and (ex_town, ex_name) in established_lookup:
            ex_established = established_lookup[(ex_town, ex_name)]
            if ex_established and str(current_established).strip() == str(ex_established).strip():
                cutoff = 0.80

        if is_fuzzy_match(town_key, name_key, ex_town, ex_name, cutoff):
            # Plain names match; the days-removed comparison is not needed.
            ours, theirs, via_days = name_key, ex_name, False
        else:
            ex_name_no_days = normalize_text_no_days(ex_name)
            if not is_fuzzy_match(town_key, name_key_no_days, ex_town, ex_name_no_days, cutoff):
                continue
            ours, theirs, via_days = name_key_no_days, ex_name_no_days, True

        score = similarity(town_key, ex_town) + similarity(ours, theirs)
        if score > best_score:
            best_score = score
            best_match = (ex_town, ex_name)
            matched_via_days = via_days

    return best_match, matched_via_days

def load_and_tag_csvs(directory):
    """Load every CSV in ``directory`` and split the frames at 1876/1877.

    The year is the first 18xx/19xx number found in the file name (without
    extension); files without one are skipped with a warning.  Each frame
    gets a '_year' column holding that year.  Malformed CSV lines are skipped
    by pandas.

    Returns:
        (frames for years up to and including 1876, frames for later years)
    """
    import re

    year_in_name = re.compile(r'(1[89]\d{2})')
    early_frames, late_frames = [], []

    for file in Path(directory).glob("*.csv"):
        found = year_in_name.search(file.stem)
        if found is None:
            print(f"Warning: Could not extract year from {file.name}, skipping...")
            continue
        year = int(found.group(1))

        df = pd.read_csv(file, encoding='utf-8', on_bad_lines='skip')
        df['_year'] = year

        print(f"Loaded {file.name} (year {year}): {len(df)} records")

        bucket = early_frames if year <= 1876 else late_frames
        bucket.append(df)

    return early_frames, late_frames

def process_dataframe(df, year, has_state=False):
    """Process a single dataframe: standardize and prepare for merging.

    Column names are lower-cased and stripped, any ``raw_text`` column is
    dropped, and the per-year data columns (frequency, editor, ...) are
    prefixed with the year, e.g. ``"1869 editor"``. All other columns keep
    their normalised name.

    The input frame is never modified; a new DataFrame is returned.

    Args:
        df: Frame loaded from one yearly CSV.
        year: Year used as the prefix for the data columns.
        has_state: Accepted for interface compatibility; not used here.

    Returns:
        A new DataFrame with normalised / year-prefixed column names.
    """
    data_cols = ('frequency', 'political_affiliation', 'subscription_price',
                 'established', 'editor', 'publisher', 'circulation')

    def _target_name(col):
        label = str(col).lower().strip()
        return f"{year} {label}" if label in data_cols else label

    # rename() returns a copy, so the caller's frame keeps its original
    # headers (assigning to df.columns would have changed it in place).
    processed = df.rename(columns=_target_name)

    # Checked after normalisation so "Raw_Text" / " raw_text " are caught too.
    if 'raw_text' in processed.columns:
        processed = processed.drop(columns=['raw_text'])

    return processed

def merge_newspapers_core(all_frames):
    """Core merge logic used by both full and test functions.

    Frames are processed in the order given. Every row is offered to
    ``find_best_match`` together with the records registered by *earlier*
    frames; a row that is matched is folded into that record, any other row
    starts a record keyed by ``(normalized town, normalized name)``. Records
    created by a frame only become match candidates once that whole frame
    has been processed. Each record collects the row's ``"<year> ..."``
    columns.

    Args:
        all_frames: List of ``(df, year, has_state)`` tuples. ``state`` is
            only read from the row when ``has_state`` is true.

    Returns:
        DataFrame with ``state``, ``town`` and ``newspaper_name`` followed by
        the year-prefixed columns ordered by (year, field name); rows are
        sorted by the three identifying columns. With nothing to merge, an
        empty frame that still has the three identifying columns is returned.
    """
    id_cols = ['state', 'town', 'newspaper_name']

    def _value_or(value, fallback):
        # Blank CSV cells arrive as NaN, which is truthy: without this a
        # missing state would block a later fill-in and a missing
        # established date would be stored as if it were a real value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return fallback
        return value

    merged_records = {}      # key -> {"<year> <field>": value}
    record_positions = {}    # key -> running average of relative row position
    original_names = {}      # key -> (town, name, state) as displayed in the output
    established_lookup = {}  # Track established dates for each record

    print(f"Processing {len(all_frames)} files...")
    print("Strategy: exact match first, then fuzzy match (same first letter, 90% similarity),")
    print("          with days of week removed from BOTH current and existing names,")
    print("          80% threshold if established dates match\n")

    for df, year, has_state in all_frames:
        total_rows = len(df)
        print(f"  Processing year {year} ({total_rows} records)...")
        matches_found = 0
        new_records = 0

        # New records are queued and only registered for matching after the
        # whole year is processed, so a year never matches against itself.
        year_new_keys = []

        established_col = f"{year} established"
        year_prefix = f"{year} "
        # Same for every row of this frame, so computed once.
        year_data_cols = [c for c in df.columns
                          if isinstance(c, str) and c.startswith(year_prefix)]

        # enumerate() gives the positional row number directly and, unlike
        # index.get_loc, stays a plain int when index labels repeat.
        for row_num, (_, row) in enumerate(df.iterrows()):
            position_pct = row_num / max(total_rows - 1, 1)

            town = row.get('town', '')
            name = row.get('newspaper_name', '')
            state = _value_or(row.get('state', ''), '') if has_state else ''
            current_established = _value_or(row.get(established_col, None), None)

            town_norm = normalize_text(town)
            name_norm = normalize_text(name)

            if not town_norm or not name_norm:
                continue

            # Only match against records from previous years
            existing_key, matched_via_days = find_best_match(
                town, name, position_pct, record_positions,
                current_established=current_established,
                established_lookup=established_lookup,
                threshold=0.90
            )

            if existing_key:
                key = existing_key
                matches_found += 1
                record_positions[key] = (record_positions[key] + position_pct) / 2

                # If matched via days removal, update the stored name to remove days
                if matched_via_days:
                    old_town, old_name, old_state = original_names[key]
                    original_names[key] = (old_town, remove_days_from_name(old_name), old_state)
            else:
                key = (town_norm, name_norm)
                # setdefault: if the key is already present (e.g. the same
                # paper listed twice in one year) keep the columns collected
                # so far instead of replacing them with an empty dict.
                merged_records.setdefault(key, {})
                original_names[key] = (town, name, state)
                year_new_keys.append((key, position_pct, current_established))
                new_records += 1

            # Fill in the state the first time a year provides one.
            if state and not original_names[key][2]:
                original_names[key] = (original_names[key][0], original_names[key][1], state)

            for col in year_data_cols:
                merged_records[key][col] = row[col]

        # Now add this year's new records to positions for next year's matching
        for key, pos, estab in year_new_keys:
            record_positions[key] = pos
            if estab:
                established_lookup[key] = estab

        print(f"    -> {matches_found} matched to existing, {new_records} new records")

    print(f"\nTotal unique newspapers found: {len(merged_records)}")

    rows = []
    for key, data in merged_records.items():
        town, name, state = original_names[key]
        row = {'state': state, 'town': town, 'newspaper_name': name}
        row.update(data)
        rows.append(row)

    result = pd.DataFrame(rows)

    def sort_key(col):
        # "<year> <field>" sorts by numeric year, then field; anything else goes last.
        parts = col.split(' ', 1)
        if len(parts) == 2 and parts[0].isdigit():
            return (int(parts[0]), parts[1])
        return (9999, col)

    data_cols = sorted((c for c in result.columns if c not in id_cols), key=sort_key)
    final_cols = id_cols + data_cols

    # reindex (rather than result[final_cols]) so that an empty result still
    # gets the identifying columns instead of raising KeyError.
    result = result.reindex(columns=final_cols)
    result = result.sort_values(id_cols)

    return result

def prepare_frames(directory, max_years=None):
    """Load and prepare frames, optionally limiting to first N years.

    Frames come from ``load_and_tag_csvs``; those from its first list are
    tagged ``has_state=False`` and those from its second ``has_state=True``.
    The frames are ordered by year, optionally truncated, stripped of the
    helper ``_year`` column and passed through ``process_dataframe``.

    Args:
        directory: Folder containing the yearly CSV files.
        max_years: If given, keep only the first ``max_years`` frames after
            sorting by year.

    Returns:
        List of ``(df, year, has_state)`` tuples sorted by year, with ``year``
        as a plain ``int``; ``None`` when there is nothing usable to process.
    """
    csv1_frames, csv2_frames = load_and_tag_csvs(directory)

    if not csv1_frames and not csv2_frames:
        print("No CSV files found!")
        return None

    all_frames_raw = []
    for frames, has_state in ((csv1_frames, False), (csv2_frames, True)):
        for df in frames:
            if len(df) == 0:
                # A header-only CSV has no '_year' value to read; .iloc[0]
                # on it would raise IndexError and abort the whole run.
                print("Warning: skipping a CSV with no data rows")
                continue
            # Plain int: the column yields a NumPy integer, which newer NumPy
            # versions print as "np.int64(1869)" in the TEST MODE log below.
            year = int(df['_year'].iloc[0])
            all_frames_raw.append((df, year, has_state))

    if not all_frames_raw:
        print("No CSV files with data rows found!")
        return None

    all_frames_raw.sort(key=lambda x: x[1])

    if max_years is not None:
        all_frames_raw = all_frames_raw[:max_years]
        years_processing = [f[1] for f in all_frames_raw]
        print(f"\nTEST MODE: Processing only first {max_years} years: {years_processing}\n")

    all_frames = []
    for df, year, has_state in all_frames_raw:
        df = df.drop(columns=['_year'])
        df = process_dataframe(df, year, has_state=has_state)
        all_frames.append((df, year, has_state))

    return all_frames

def merge_newspapers_fuzzy(directory):
    """Merge every newspaper CSV found in ``directory`` using fuzzy matching.

    Returns whatever ``merge_newspapers_core`` produces for the prepared
    frames, or ``None`` when ``prepare_frames`` reports nothing to process.
    """
    prepared = prepare_frames(directory)
    return None if prepared is None else merge_newspapers_core(prepared)

def test(directory=r"data\Newspaper Directory Excel"):
    """Run the merge on the first 3 years only and save it to ``master_test.csv``.

    Prints a banner, the saved path, the row count, the output columns and a
    preview of the first 10 rows. Returns the merged DataFrame, or ``None``
    if the CSVs could not be loaded or the merge produced nothing.
    """
    banner = "=" * 60
    print(banner)
    print("RUNNING TEST MODE (first 3 years only)")
    print(banner)

    frames = prepare_frames(directory, max_years=3)
    if frames is None:
        print("Failed to load CSV files.")
        return None

    result = merge_newspapers_core(frames)
    if result is None:
        print("Failed to create merged CSV.")
        return result

    output_path = "master_test.csv"
    result.to_csv(output_path, index=False)

    print(f"\nSuccess! Test output saved to: {output_path}")
    print(f"Total newspapers: {len(result)}")
    print("\nColumns in output:")
    for column in result.columns:
        print(f"  - {column}")
    print("\nFirst 10 rows preview:")
    print(result.head(10).to_string())

    return result

if __name__ == "__main__":
    import sys

    directory = r"data\Newspaper Directory Excel"

    # A first command-line argument of "--test" selects the reduced run;
    # anything else (including no argument at all) runs the full merge.
    if sys.argv[1:2] == ["--test"]:
        test(directory)
    else:
        print(f"Processing CSVs from: {directory}")
        print("=" * 60)

        result = merge_newspapers_fuzzy(directory)

        if result is None:
            print("Failed to create merged CSV.")
        else:
            # Persist the merged table, then summarise what was written.
            output_path = "master.csv"
            result.to_csv(output_path, index=False)
            print(f"\nSuccess! Output saved to: {output_path}")
            print(f"Total newspapers: {len(result)}")
            print("\nColumns in output:")
            for col in result.columns:
                print(f"  - {col}")