# Stage 1: Classify Articles as Labor, Railroad, or Both

Query the SQLite database (~44M articles) and classify articles into three categories using weighted keyword matching with exclusion patterns.

**Output**: `data/classified_articles/{labor_only,railroad_only,both}.json` (up to 100K articles per category)

In [1]:
import sqlite3
import json
import re
import hashlib
import logging
import os
import random
from pathlib import Path
from tqdm import tqdm

# Paths (relative to sentiment_analysis/)
DB_PATH = os.path.join("..", "data", "newspapers.db")
OUTPUT_DIR = Path("data/classified_articles")

# Create the output tree (and therefore "data/") BEFORE configuring logging:
# logging.FileHandler opens its file as soon as it is constructed, so a missing
# "data/" directory would otherwise raise FileNotFoundError on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Log to both the notebook output and a persistent file under data/.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("data/classification_log.log")
    ]
)
log = logging.getLogger(__name__)

print(f"Database path: {DB_PATH}")
print(f"Output directory: {OUTPUT_DIR}")

Database path: ..\data\newspapers.db
Output directory: data\classified_articles


## Keyword Definitions

Weighted scoring system:
- **High-weight (3 pts)**: Multi-word phrases that are unambiguously labor/railroad
- **Medium-weight (2 pts)**: Specific terms with low false-positive rates
- **Low-weight (1 pt)**: Ambiguous single terms ("strike", "union", "train")

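Articles must score **>=4** (`SCORE_THRESHOLD`) in a category to be classified as labor-only or railroad-only; an article that scores **>=3** in *both* categories is classified as "both". This prevents a single ambiguous word from triggering a match.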

In [10]:
# --- Weighted keyword dictionaries ---
# Each dictionary maps keyword -> weight and is assembled tier by tier
# (3 = high, 2 = medium, 1 = low); insertion order follows the tiers.

LABOR_KEYWORDS = {
    # High-weight (3): unambiguous multi-word phrases
    **dict.fromkeys([
        'labor union', 'trade union', 'labor strike', 'labor riot',
        'collective bargaining', 'labor movement', 'strikebreaker',
        'scab labor', 'working men', 'workingmen',
        'knights of labor', 'eight hour',
    ], 3),
    # Medium-weight (2): specific terms
    **dict.fromkeys([
        'striker', 'strikers', 'picket', 'lockout',
        'boycott', 'walkout', 'arbitration', 'picketing',
    ], 2),
    # Low-weight (1): ambiguous terms
    **dict.fromkeys([
        'strike', 'strikes', 'wage', 'wages',
        'workers', 'laborers',
    ], 1),
}

RAILROAD_KEYWORDS = {
    # High-weight (3): unambiguous multi-word phrases
    **dict.fromkeys([
        'railroad company', 'railroad strike', 'railroad workers',
        'railway company', 'union pacific', 'central pacific',
        'northern pacific', 'pennsylvania railroad',
        'baltimore and ohio', 'railroad line',
    ], 3),
    # Medium-weight (2): specific terms
    **dict.fromkeys([
        'locomotive', 'locomotives', 'brakeman',
        'freight car', 'passenger car', 'rail road',
    ], 2),
    # Low-weight (1): ambiguous terms
    **dict.fromkeys([
        'railroad', 'railway', 'train', 'trains',
    ], 1),
}


def _any_phrase(phrases):
    """Compile a case-insensitive, word-bounded alternation of literal phrases."""
    return re.compile(r'\b(' + '|'.join(phrases) + r')\b', re.IGNORECASE)


def _compile_keyword_patterns(keywords):
    """Map each keyword to (word-bounded case-insensitive regex, weight)."""
    return {
        keyword: (re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE), points)
        for keyword, points in keywords.items()
    }


# Exclusion patterns: if ANY match, disqualify the article
EXCLUSION_PATTERNS = [
    # Civil War "Union" (dominant in 1869-1880s newspapers)
    _any_phrase([
        'union army', 'union forces', 'union troops', 'union soldier', 'union soldiers',
        'union victory', 'union general', 'federal union', 'union cause', 'union side',
    ]),
    # Strike false positives
    _any_phrase([
        'struck gold', 'strike gold', 'strike oil', 'struck oil',
        'lightning strike', 'strike a match', 'struck a match', 'strike a blow', 'struck a blow',
        'struck the ball', 'strike the ball', 'clock struck',
    ]),
    # Railroad false positives
    re.compile(r'\bunderground railroad\b', re.IGNORECASE),
]

# Pre-compile keyword regexes for speed
LABOR_PATTERNS = _compile_keyword_patterns(LABOR_KEYWORDS)
RAILROAD_PATTERNS = _compile_keyword_patterns(RAILROAD_KEYWORDS)

# Minimum weighted score for a single-category classification
SCORE_THRESHOLD = 4

for _label, _keywords in (("Labor", LABOR_KEYWORDS), ("Railroad", RAILROAD_KEYWORDS)):
    print(f"{_label} keywords: {len(_keywords)} ({sum(_keywords.values())} max score)")
print(f"Classification threshold: >= {SCORE_THRESHOLD} points")

Labor keywords: 26 (58 max score)
Railroad keywords: 20 (46 max score)
Classification threshold: >= 4 points


In [11]:
def is_excluded(text: str) -> bool:
    """Return True if any pattern in EXCLUSION_PATTERNS is found in ``text``."""
    return any(pattern.search(text) for pattern in EXCLUSION_PATTERNS)


def score_text(text: str, patterns: dict) -> tuple[int, int]:
    """Score ``text`` against a keyword pattern table.

    ``patterns`` maps keyword -> (compiled regex, weight). Every keyword whose
    regex is found at least once contributes its weight exactly once, no
    matter how many times it occurs.

    Returns:
        (score, match_count): the summed weights of the matching keywords and
        how many distinct keywords matched.
    """
    matched_weights = [weight for regex, weight in patterns.values() if regex.search(text)]
    return sum(matched_weights), len(matched_weights)


def classify_article(text: str) -> dict | None:
    """
    Classify one article as 'both', 'labor' or 'railroad' by weighted keyword score.

    An article matching any exclusion pattern is rejected outright. Otherwise
    it is scored against the labor and railroad keyword tables:

    - 'both'     when each score is at least SCORE_THRESHOLD - 1
      (the joint category uses a cut-off one point lower),
    - 'labor'    when the labor score is at least SCORE_THRESHOLD,
    - 'railroad' when the railroad score is at least SCORE_THRESHOLD.

    Returns a dict with the category, both scores and both match counts, or
    None if the article is excluded or falls below every cut-off.
    """
    if is_excluded(text):
        return None

    labor_score, labor_matches = score_text(text, LABOR_PATTERNS)
    railroad_score, railroad_matches = score_text(text, RAILROAD_PATTERNS)

    # 'both' is tested first, so it takes precedence over the single categories.
    joint_cutoff = SCORE_THRESHOLD - 1
    if labor_score >= joint_cutoff and railroad_score >= joint_cutoff:
        category = 'both'
    elif labor_score >= SCORE_THRESHOLD:
        category = 'labor'
    elif railroad_score >= SCORE_THRESHOLD:
        category = 'railroad'
    else:
        return None

    return {
        'category': category,
        'labor_score': labor_score,
        'railroad_score': railroad_score,
        'labor_matches': labor_matches,
        'railroad_matches': railroad_matches,
    }


def make_article_id(lccn: str, issn: str, year: int, text: str) -> str:
    """Derive a deterministic 16-hex-character ID for an article.

    The ID is the first 16 hex digits of the SHA-256 of
    ``lccn|issn|year|<first 500 characters of text>``. Characters that cannot
    be encoded as UTF-8 are replaced rather than raising.
    """
    fingerprint = "|".join((f"{lccn}", f"{issn}", f"{year}", text[:500]))
    digest = hashlib.sha256()
    digest.update(fingerprint.encode('utf-8', errors='replace'))
    return digest.hexdigest()[:16]


# Quick sanity test: (article text, expected category or None)
test_cases = [
    ("The labor union called a strike against the railroad company", "both"),
    ("The strikers walked out and began picketing the factory", "labor"),
    ("The Union army marched south to strike a blow", None),  # Excluded
    ("The railroad company built a new locomotive depot", "railroad"),
    ("The weather was fine today", None),  # No match
]

for sample_text, expected in test_cases:
    outcome = classify_article(sample_text)
    if outcome:
        actual = outcome['category']
        labor_pts, railroad_pts = outcome['labor_score'], outcome['railroad_score']
    else:
        # Unclassified articles carry no scores to report
        actual = None
        labor_pts = railroad_pts = 'N/A'
    status = 'PASS' if actual == expected else 'FAIL'
    print(f"  [{status}] '{sample_text[:60]}...' -> {actual} (expected {expected})")
    print(labor_pts, railroad_pts)

  [PASS] 'The labor union called a strike against the railroad company...' -> both (expected both)
4 4
  [PASS] 'The strikers walked out and began picketing the factory...' -> labor (expected labor)
4 0
  [PASS] 'The Union army marched south to strike a blow...' -> None (expected None)
N/A N/A
  [PASS] 'The railroad company built a new locomotive depot...' -> railroad (expected railroad)
0 6
  [PASS] 'The weather was fine today...' -> None (expected None)
N/A N/A


## SQL Pre-filtering

Use SQL `LIKE` queries to pull candidate articles from the database. This is much faster than scanning all 44M articles with Python regex.

Query broadly (using high/medium weight terms), then apply the full weighted classifier in Python.

In [12]:
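# Build SQL LIKE clauses from high + medium weight keywords only.
# NOTE: with SCORE_THRESHOLD = 4, low-weight terms alone CAN reach the threshold
# (e.g. 'strike' + 'strikes' + 'wage' + 'wages' = 4), so articles that match
# only low-weight terms are never pulled by this pre-filter.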

def build_like_clause(keywords: dict, min_weight: int = 2) -> str:
    """Build a SQL ``OR`` chain of ``text LIKE '%term%'`` tests.

    Args:
        keywords: mapping of keyword -> weight.
        min_weight: only keywords whose weight is >= this value are included.

    Returns:
        The clauses joined with " OR ", in the dictionary's iteration order.
        An empty string is returned when no keyword qualifies; the caller must
        not splice that into a ``WHERE (...)``.

    Single quotes inside a keyword are doubled so the generated SQL string
    literal stays well-formed. LIKE wildcards (``%`` and ``_``) inside a
    keyword are NOT escaped and keep their wildcard meaning.
    """
    clauses = []
    for term, weight in keywords.items():
        if weight < min_weight:
            continue
        # SQL escapes a quote inside a string literal by doubling it.
        literal = term.replace("'", "''")
        clauses.append(f"text LIKE '%{literal}%'")
    return " OR ".join(clauses)

labor_like = build_like_clause(LABOR_KEYWORDS, min_weight=2)
railroad_like = build_like_clause(RAILROAD_KEYWORDS, min_weight=2)

# Two candidate queries, one per keyword family. They deliberately over-select:
# the weighted classifier applied afterwards discards the weak matches.
_CANDIDATE_SQL = """
        SELECT lccn, issn, year, text
        FROM articles
        WHERE ({like_clause})
    """

QUERIES = {
    'labor_candidates': _CANDIDATE_SQL.format(like_clause=labor_like),
    'railroad_candidates': _CANDIDATE_SQL.format(like_clause=railroad_like),
}

# Report how many keywords made it into each SQL pre-filter
print("Labor SQL terms:", sum(1 for weight in LABOR_KEYWORDS.values() if weight >= 2))
print("Railroad SQL terms:", sum(1 for weight in RAILROAD_KEYWORDS.values() if weight >= 2))

Labor SQL terms: 20
Railroad SQL terms: 16


In [13]:
# Open the newspaper database; sqlite3.Row lets later cells read columns by name
con = sqlite3.connect(DB_PATH)
con.row_factory = sqlite3.Row
cur = con.cursor()

# Check DB is accessible by counting the rows of the articles table
count_row = cur.execute("SELECT COUNT(*) FROM articles").fetchone()
total_articles = count_row[0]
print(f"Total articles in database: {total_articles:,}")

Total articles in database: 44,269,986


In [14]:
# Stream the candidate rows of every query and run the weighted classifier on
# each one. The queries can return the same article, so every row is
# fingerprinted with make_article_id and skipped if it was already seen.

classified = {'labor': [], 'railroad': [], 'both': []}
seen_ids = set()
excluded_count = 0
below_threshold_count = 0

TARGET_PER_CATEGORY = 100_000


def all_categories_full() -> bool:
    """Return True once every category holds TARGET_PER_CATEGORY articles."""
    return not any(len(bucket) < TARGET_PER_CATEGORY for bucket in classified.values())


for query_name, query in QUERIES.items():
    if all_categories_full():
        break

    log.info(f"Running query: {query_name}")
    cur.execute(query)

    batch_size = 10_000
    # iter(callable, sentinel) keeps fetching until fetchmany returns an empty list
    for rows in iter(lambda: cur.fetchmany(batch_size), []):
        for row in rows:
            lccn = row['lccn']
            issn = row['issn'] or ''  # a NULL ISSN is stored as an empty string
            year = row['year']
            text = row['text']

            article_id = make_article_id(lccn, issn, year, text)
            if article_id in seen_ids:
                continue
            seen_ids.add(article_id)

            result = classify_article(text)
            if result is None:
                # classify_article returns None for both excluded and
                # low-scoring articles; re-check to tell the two apart.
                if is_excluded(text):
                    excluded_count += 1
                else:
                    below_threshold_count += 1
                continue

            category = result['category']
            bucket = classified[category]
            if len(bucket) >= TARGET_PER_CATEGORY:
                # This category is already full; keep scanning for the others.
                continue

            record = {
                'article_id': article_id,
                'lccn': lccn,
                'issn': issn,
                'year': year,
                'text': text,
                'category': category,
            }
            for field in ('labor_score', 'railroad_score', 'labor_matches', 'railroad_matches'):
                record[field] = result[field]
            bucket.append(record)

        # Progress update (once per fetched batch)
        counts = {name: len(items) for name, items in classified.items()}
        log.info(f"  Progress: {counts} | Excluded: {excluded_count} | Below threshold: {below_threshold_count}")

        if all_categories_full():
            break

con.close()

print("\nFinal counts:")
for name, items in classified.items():
    print(f"  {name}: {len(items):,}")
print(f"  Excluded (Civil War/false positives): {excluded_count:,}")
print(f"  Below threshold: {below_threshold_count:,}")

2026-02-15 11:49:54,180 [INFO] Running query: labor_candidates
2026-02-15 11:53:07,171 [INFO]   Progress: {'labor': 1741, 'railroad': 197, 'both': 242} | Excluded: 63 | Below threshold: 7755
2026-02-15 11:56:30,364 [INFO]   Progress: {'labor': 3215, 'railroad': 359, 'both': 456} | Excluded: 104 | Below threshold: 15864
2026-02-15 11:59:48,278 [INFO]   Progress: {'labor': 4696, 'railroad': 557, 'both': 615} | Excluded: 156 | Below threshold: 23974
2026-02-15 12:03:11,420 [INFO]   Progress: {'labor': 6075, 'railroad': 803, 'both': 846} | Excluded: 201 | Below threshold: 32073
2026-02-15 12:07:02,871 [INFO]   Progress: {'labor': 7496, 'railroad': 1073, 'both': 1211} | Excluded: 237 | Below threshold: 39981
2026-02-15 12:11:29,337 [INFO]   Progress: {'labor': 8682, 'railroad': 1315, 'both': 1463} | Excluded: 277 | Below threshold: 48261
2026-02-15 12:16:29,278 [INFO]   Progress: {'labor': 9793, 'railroad': 1560, 'both': 1817} | Excluded: 336 | Below threshold: 56456
2026-02-15 12:19:26,050


Final counts:
  labor: 74,001
  railroad: 100,000
  both: 17,873
  Excluded (Civil War/false positives): 4,598
  Below threshold: 533,691


In [15]:
# Save classified articles to JSON: one file per category
# ("<category>_only.json", except the joint category which is "both.json")
for category, articles in classified.items():
    stem = "both" if category == 'both' else f"{category}_only"
    output_path = OUTPUT_DIR / f"{stem}.json"
    with output_path.open('w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False)
    size_mb = output_path.stat().st_size / 1e6
    print(f"Saved {len(articles):,} articles to {output_path} ({size_mb:.1f} MB)")

Saved 74,001 articles to data\classified_articles\labor_only.json (212.7 MB)
Saved 100,000 articles to data\classified_articles\railroad_only.json (272.8 MB)
Saved 17,873 articles to data\classified_articles\both.json (70.5 MB)


## Verification: Sample 100 Articles per Category

Manually review a random sample to check classification accuracy. Target: >85% precision.

In [19]:
import textwrap

# Print random samples for manual review
SAMPLE_SIZE = 10  # Increase to 100 for full verification

_HEAVY_RULE = '=' * 80
_LIGHT_RULE = '-' * 80

for category, articles in classified.items():
    print(f"\n{_HEAVY_RULE}")
    print(f"CATEGORY: {category.upper()} (sample of {SAMPLE_SIZE})")
    print(f"{_HEAVY_RULE}\n")

    # Never ask for more articles than the category actually holds
    picks = random.sample(articles, min(SAMPLE_SIZE, len(articles)))

    for number, entry in enumerate(picks, start=1):
        labor_pts = entry['labor_score']
        railroad_pts = entry['railroad_score']

        print(f"--- Article {number} ---")
        print(f"Year: {entry['year']:<6} | ISSN: {entry['issn']}")
        print(f"Labor score: {labor_pts:<6.2f} | Railroad score: {railroad_pts:.2f}")
        print("\nText preview:")

        # Show only the first 500 characters, wrapped to 80 columns
        print(textwrap.fill(entry['text'][:500], width=80))

        print(f"\n{_LIGHT_RULE}\n")


CATEGORY: LABOR (sample of 10)

--- Article 1 ---
Year: 1886   | ISSN: 2577-6754
Labor score: 11.00  | Railroad score: 0.00

Text preview:
Powderly May Refuse to Call a Special Session OF the General Assembly. NEW YORK,
December 27.-The Tribune to-day says: Organized labor seems to be in a bad
tangle which grows worse in stead of better. The dissatisfaction of the Knights
of Labor is growing more wide spread, and some members Of the order are afraid
that the present difficulty can only result in spite. There seems to be little
doubt that the legal requirements for call for a special session of the General
Assembly will be fulfilled

--------------------------------------------------------------------------------

--- Article 2 ---
Year: 1883   | ISSN: 2331-3285
Labor score: 4.00   | Railroad score: 1.00

Text preview:
The other day q number Of the railroad lands at work on the Texas Trunk line
truck for higher wages. They were re- sewing at the time the very best pay ior
,he services 

In [17]:
# Summary statistics
import pandas as pd

# Flatten the per-category lists into a single table, one row per article
all_articles = [article for bucket in classified.values() for article in bucket]

df = pd.DataFrame(all_articles)

print("\n=== Classification Summary ===")

print("\nCategory distribution:")
category_counts = df['category'].value_counts()
print(category_counts)

print("\nYear distribution (top 10):")
top_years = df['year'].value_counts().head(10)
print(top_years)

print("\nScore statistics by category:")
score_columns = ['labor_score', 'railroad_score']
score_summary = df.groupby('category')[score_columns].describe().round(1)
print(score_summary)


=== Classification Summary ===

Category distribution:
category
railroad    100000
labor        74001
both         17873
Name: count, dtype: int64

Year distribution (top 10):
year
1872    22697
1871    22044
1886    21115
1869    20088
1870    19981
1873    16785
1888    11163
1887    10991
1890     9580
1889     6549
Name: count, dtype: int64

Score statistics by category:
         labor_score                                     railroad_score       \
               count mean  std  min  25%  50%  75%   max          count mean   
category                                                                       
both         17873.0  4.4  1.9  3.0  3.0  4.0  5.0  22.0        17873.0  4.7   
labor        74001.0  5.3  1.7  4.0  4.0  5.0  6.0  24.0        74001.0  0.3   
railroad    100000.0  0.1  0.4  0.0  0.0  0.0  0.0   2.0       100000.0  4.9   

                                         
          std  min  25%  50%  75%   max  
category                                 
both      1.8 