# Stage 2: Create Verification Dataset (Two-Axis)

Build training/validation datasets for **two separate models**:
- **Labor stance model**: pro-labor / anti-labor / neutral (trained on labor + both articles)
- **Railroad outlook model**: optimistic / pessimistic / neutral (trained on railroad + both articles)

"Both" articles are labeled on **both axes**.

**Critical**: Validation sets are **100% hand-labeled** to avoid circularity with Gemini labels.

In [1]:
import json
import os
import random
import time
import logging
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv

# Load environment variables (e.g. API keys) from the .env file one directory up.
load_dotenv(os.path.join('..', '.env'))

# Create the output tree BEFORE configuring logging: the FileHandler below
# opens "data/verification_log.log" as soon as it is constructed and raises
# FileNotFoundError when "data/" does not exist yet. mkdir(parents=True)
# creates "data/" as a side effect.
OUTPUT_DIR = Path("data/verified_labels")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Input directory holding the classified article files read by this notebook.
CLASSIFIED_DIR = Path("data/classified_articles")

# Log to both the notebook output and a persistent log file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("data/verification_log.log")
    ]
)
log = logging.getLogger(__name__)


def load_json(filepath):
    """Read the UTF-8 encoded JSON file at *filepath* and return its decoded content."""
    with open(filepath, encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)

def save_json(filepath, data):
    """Write *data* as pretty-printed UTF-8 JSON to *filepath*, atomically.

    The JSON is first written to a sibling ``<name>.tmp`` file and then moved
    over the destination with ``os.replace``. Opening the destination directly
    in 'w' mode truncates it immediately, so a serialization error or an
    interrupted kernel would destroy every previously saved label; with the
    temp-file approach the old file stays intact until the new one is complete.

    Args:
        filepath: Destination path (str or Path).
        data: JSON-serializable object.

    Raises:
        TypeError / ValueError: If *data* cannot be serialized (the existing
            file is left untouched).
        OSError: If the file cannot be written or replaced.
    """
    target = Path(filepath)
    scratch = target.with_name(target.name + '.tmp')
    try:
        with open(scratch, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(scratch, target)
    except BaseException:
        # Do not leave a half-written temp file behind, then surface the error.
        scratch.unlink(missing_ok=True)
        raise

In [2]:
# Load the three classified article sets (labor-only, railroad-only, both)
labor_articles, railroad_articles, both_articles = (
    load_json(CLASSIFIED_DIR / filename)
    for filename in ('labor_only.json', 'railroad_only.json', 'both.json')
)

# Report how many articles each set contains
print(f"Labor: {len(labor_articles):,}")
print(f"Railroad: {len(railroad_articles):,}")
print(f"Both: {len(both_articles):,}")

Labor: 74,001
Railroad: 100,000
Both: 17,873


In [3]:
import re
import html as html_module

# Keyword definitions (from Stage 1) for centering + highlighting.
# Each table maps a keyword or phrase to an integer weight (3, 2 or 1);
# matches are later ranked by descending weight.
LABOR_KEYWORDS = {
    # weight 3
    'labor union': 3, 'trade union': 3, 'labor strike': 3, 'labor riot': 3,
    'collective bargaining': 3, 'labor movement': 3, 'strikebreaker': 3,
    'scab labor': 3, 'working men': 3, 'workingmen': 3,
    'knights of labor': 3, 'eight hour': 3,
    # weight 2
    'striker': 2, 'strikers': 2, 'picket': 2, 'lockout': 2,
    'boycott': 2, 'walkout': 2, 'arbitration': 2, 'picketing': 2,
    # weight 1
    'strike': 1, 'strikes': 1, 'wage': 1, 'wages': 1,
    'workers': 1, 'laborers': 1,
}
RAILROAD_KEYWORDS = {
    # weight 3
    'railroad company': 3, 'railroad strike': 3, 'railroad workers': 3,
    'railway company': 3, 'union pacific': 3, 'central pacific': 3,
    'northern pacific': 3, 'pennsylvania railroad': 3,
    'baltimore and ohio': 3, 'railroad line': 3,
    # weight 2
    'locomotive': 2, 'locomotives': 2, 'brakeman': 2,
    'freight car': 2, 'passenger car': 2, 'rail road': 2,
    # weight 1
    'railroad': 1, 'railway': 1, 'train': 1, 'trains': 1,
}


def _compile_keyword_table(weighted_keywords):
    """Map each keyword to (case-insensitive whole-word regex, weight)."""
    table = {}
    for phrase, weight in weighted_keywords.items():
        # \b on both sides so 'strike' does not match inside 'strikers'.
        pattern = re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE)
        table[phrase] = (pattern, weight)
    return table


LABOR_PATTERNS = _compile_keyword_table(LABOR_KEYWORDS)
RAILROAD_PATTERNS = _compile_keyword_table(RAILROAD_KEYWORDS)


def find_keyword_matches(text, category):
    """Collect every keyword hit in *text* for the axes implied by *category*.

    *category* 'labor' searches the labor patterns, 'railroad' the railroad
    patterns, and 'both' searches both tables; any other value yields no hits.

    Returns:
        List of ``(start, end, keyword, weight, axis)`` tuples ordered by
        descending weight, then ascending start offset.
    """
    axis_tables = (
        ('labor', LABOR_PATTERNS),
        ('railroad', RAILROAD_PATTERNS),
    )
    hits = []
    for axis, table in axis_tables:
        if category not in (axis, 'both'):
            continue
        for keyword, (pattern, weight) in table.items():
            hits.extend(
                (found.start(), found.end(), keyword, weight, axis)
                for found in pattern.finditer(text)
            )
    # Stable sort: ties keep the table/scan order in which they were found.
    return sorted(hits, key=lambda hit: (-hit[3], hit[0]))


def best_keyword_position(text, category):
    """Return the start offset of the top-ranked keyword hit, or 0 if there is none."""
    ranked = find_keyword_matches(text, category)
    if not ranked:
        return 0
    top_hit = ranked[0]
    return top_hit[0]


def keyword_centered_excerpt(text, category, window=2000):
    """Cut a *window*-character excerpt of *text* around the best keyword hit.

    Text no longer than *window* is returned whole. Otherwise the window is
    centred on the best keyword position and shifted left when it would run
    past the end of the text.

    Returns:
        ``(excerpt, offset)`` where *offset* is the excerpt's start index in *text*.
    """
    total = len(text)
    if total <= window:
        return text, 0

    anchor = best_keyword_position(text, category)
    begin = max(0, anchor - window // 2)
    # total > window here, so total - window is a valid (positive) start index.
    begin = min(begin, total - window)
    return text[begin:begin + window], begin


def highlight_keywords_html(text, category):
    """Render *text* as HTML with keyword hits wrapped in coloured spans.

    Hits are taken in ranked order (highest weight first); a hit that overlaps
    an already accepted one is dropped. Labor hits are orange, all other hits
    blue. Both the plain text and the matched text are HTML-escaped.
    """
    accepted = []
    for candidate in find_keyword_matches(text, category):
        c_start, c_end = candidate[0], candidate[1]
        # Keep the candidate only if it is disjoint from every accepted hit.
        if all(c_start >= kept[1] or c_end <= kept[0] for kept in accepted):
            accepted.append(candidate)
    accepted.sort(key=lambda hit: hit[0])

    axis_colors = {'labor': '#ff9800'}
    pieces = []
    cursor = 0
    for m_start, m_end, _keyword, weight, axis in accepted:
        pieces.append(html_module.escape(text[cursor:m_start]))
        color = axis_colors.get(axis, '#2196f3')
        matched = html_module.escape(text[m_start:m_end])
        pieces.append(
            f'<span style="background:{color}; color:white; '
            f'padding:1px 4px; border-radius:3px; font-weight:bold;" '
            f'title="{axis} (weight {weight})">{matched}</span>'
        )
        cursor = m_end
    pieces.append(html_module.escape(text[cursor:]))
    return ''.join(pieces)


# Sanity check: confirm the cell ran and show how many patterns were compiled per axis.
print("Keyword utils loaded.")
print(f"  Labor patterns: {len(LABOR_PATTERNS)}")
print(f"  Railroad patterns: {len(RAILROAD_PATTERNS)}")

Keyword utils loaded.
  Labor patterns: 26
  Railroad patterns: 20


In [4]:
# Build stratified sample: ~334 from each category
HAND_LABEL_TARGET = 1000
per_category = HAND_LABEL_TARGET // 3

# Draw from a dedicated, seeded generator. With the unseeded global `random`
# every kernel restart picks a different 1,000 articles, so a resumed labeling
# session would not continue the sample it started (already-labeled ids would
# no longer be filtered out of the queue).
HAND_LABEL_SEED = 42
_sample_rng = random.Random(HAND_LABEL_SEED)

sample = (
    _sample_rng.sample(labor_articles, min(per_category, len(labor_articles)))
    + _sample_rng.sample(railroad_articles, min(per_category, len(railroad_articles)))
    # The "both" stratum absorbs the remainder so the total hits the target.
    + _sample_rng.sample(both_articles, min(per_category + (HAND_LABEL_TARGET % 3), len(both_articles)))
)
_sample_rng.shuffle(sample)

print(f"Labeling sample: {len(sample)} articles")
for cat in ['labor', 'railroad', 'both']:
    print(f"  {cat}: {sum(1 for a in sample if a['category'] == cat)}")

Labeling sample: 1000 articles
  labor: 333
  railroad: 333
  both: 334


In [5]:
import ipywidgets as widgets
from IPython.display import display

HAND_LABELED_FILE = OUTPUT_DIR / 'hand_labeled.json'


class ArticleLabelingTool:
    """Jupyter widget UI for hand-labeling articles on two axes.

    For each article the labeler confirms or corrects the predicted category
    and records a labor stance and/or a railroad outlook. All labels are
    written to ``output_file`` on every "Save & Next" click. If that file
    already exists, its labels are loaded and the corresponding article ids
    are removed from the queue so a session can be resumed.
    """

    def __init__(self, articles: list, output_file: Path):
        """Set up the labeling queue, resuming from *output_file* if present.

        Args:
            articles: Article dicts. The tool reads the keys 'article_id',
                'lccn', 'issn', 'year', 'text' and 'category', and optionally
                'labor_score' / 'railroad_score'.
            output_file: JSON file that stores the list of saved labels.
        """
        self.articles = articles
        self.output_file = output_file
        self.labels = []
        self.current_idx = 0

        # Resume from existing labels
        if output_file.exists():
            self.labels = load_json(output_file)
            labeled_ids = {entry['article_id'] for entry in self.labels}
            self.articles = [a for a in self.articles if a['article_id'] not in labeled_ids]
            print(f"Resuming: {len(self.labels)} already labeled, {len(self.articles)} remaining")

    def create_ui(self):
        """Build the widgets, display them and load the first queued article."""
        self.text_widget = widgets.HTML(
            layout=widgets.Layout(width='100%', max_height='400px', overflow_y='auto',
                                  border='1px solid #ccc', padding='10px')
        )

        self.category_widget = widgets.RadioButtons(
            options=['Labor', 'Railroad', 'Both', 'Neither (misclassified)'],
            description='Category:',
            layout=widgets.Layout(width='400px')
        )

        # Labor stance (shown for labor + both)
        self.labor_sentiment_widget = widgets.RadioButtons(
            options=['Pro-Labor', 'Anti-Labor', 'Neutral'],
            description='Labor stance:',
            layout=widgets.Layout(width='300px')
        )
        self.labor_box = widgets.VBox([
            widgets.HTML('<b>Labor Stance</b> (editorial position on workers/unions)'),
            self.labor_sentiment_widget
        ])

        # Railroad outlook (shown for railroad + both)
        self.railroad_sentiment_widget = widgets.RadioButtons(
            options=['Optimistic', 'Pessimistic', 'Neutral'],
            description='RR outlook:',
            layout=widgets.Layout(width='300px')
        )
        self.railroad_box = widgets.VBox([
            widgets.HTML('<b>Railroad Outlook</b> (framing of railroad industry)'),
            self.railroad_sentiment_widget
        ])

        self.sentiment_row = widgets.HBox([self.labor_box, self.railroad_box])

        next_btn = widgets.Button(description='Save & Next', button_style='primary',
                                  layout=widgets.Layout(width='150px'))
        next_btn.on_click(self._save_and_next)

        skip_btn = widgets.Button(description='Skip', button_style='warning',
                                  layout=widgets.Layout(width='100px'))
        skip_btn.on_click(self._skip)

        # The bar counts saved labels against saved + still-queued articles.
        self.progress = widgets.IntProgress(
            value=len(self.labels),
            min=0,
            max=len(self.labels) + len(self.articles),
            description='Progress:',
            layout=widgets.Layout(width='100%')
        )

        self.status_label = widgets.Label(value='')
        button_row = widgets.HBox([next_btn, skip_btn, self.status_label])

        # Watch category changes to show/hide sentiment widgets
        self.category_widget.observe(self._on_category_change, names='value')

        display(self.progress, self.text_widget, self.category_widget,
                self.sentiment_row, button_row)

        if self.articles:
            self._load_article(0)
        else:
            self.text_widget.value = '<h2>All articles have been labeled!</h2>'

    def _on_category_change(self, change):
        """Show only the sentiment boxes relevant to the selected category.

        Args:
            change: Mapping whose 'new' entry is the selected category label.
        """
        cat = change['new']
        self.labor_box.layout.display = '' if cat in ('Labor', 'Both') else 'none'
        self.railroad_box.layout.display = '' if cat in ('Railroad', 'Both') else 'none'

    def _load_article(self, idx):
        """Render queued article *idx* and preselect its predicted category."""
        article = self.articles[idx]
        category = article['category']

        def esc(value):
            # Metadata comes straight from the dataset; escape it like the
            # article text so stray '<' or '&' cannot break the rendered HTML.
            return html_module.escape(str(value))

        # Center on the best keyword match
        excerpt, offset = keyword_centered_excerpt(article['text'], category, window=2000)

        # Highlight keywords in the excerpt
        highlighted = highlight_keywords_html(excerpt, category)
        highlighted = highlighted.replace('\n', '<br>')

        # Only mention the character range when the excerpt does not start at 0.
        offset_note = f" | <b>Showing chars {offset}\u2013{offset+len(excerpt)}</b> of {len(article['text'])}" if offset > 0 else ""

        self.text_widget.value = (
            f"<h3>Article {len(self.labels) + 1} / {self.progress.max}</h3>"
            f"<p><b>Year:</b> {esc(article['year'])} | "
            f"<b>ISSN:</b> {esc(article['issn'])} | "
            f"<b>Category:</b> {esc(category)}</p>"
            f"<p><b>Labor score:</b> {esc(article.get('labor_score', '?'))} | "
            f"<b>Railroad score:</b> {esc(article.get('railroad_score', '?'))}"
            f"{offset_note}</p>"
            f"<p style='font-size:11px;'>"
            f"<span style='background:#ff9800; color:white; padding:1px 4px; "
            f"border-radius:3px;'>Labor keywords</span> "
            f"<span style='background:#2196f3; color:white; padding:1px 4px; "
            f"border-radius:3px;'>Railroad keywords</span></p>"
            f"<hr><p style='font-family: serif; font-size: 14px; "
            f"line-height: 1.6;'>{highlighted}</p>"
        )

        cat_map = {'labor': 'Labor', 'railroad': 'Railroad', 'both': 'Both'}
        self.category_widget.value = cat_map.get(category, 'Labor')
        # Trigger visibility update explicitly: assigning an unchanged value
        # does not fire the observer.
        self._on_category_change({'new': self.category_widget.value})

    def _save_and_next(self, _btn):
        """Persist a label for the current article, then advance the queue.

        The label list is written to disk before the in-memory state changes,
        so a failed save leaves the tool on the same article with no phantom
        label (which would otherwise be duplicated on the next click).
        """
        if self.current_idx >= len(self.articles):
            return

        article = self.articles[self.current_idx]
        cat_map = {
            'Labor': 'labor', 'Railroad': 'railroad',
            'Both': 'both', 'Neither (misclassified)': 'neither',
        }
        labor_map = {'Pro-Labor': 'pro_labor', 'Anti-Labor': 'anti_labor', 'Neutral': 'neutral'}
        rr_map = {'Optimistic': 'optimistic', 'Pessimistic': 'pessimistic', 'Neutral': 'neutral'}

        verified_cat = cat_map[self.category_widget.value]

        # Only record sentiments for the relevant axes
        labor_sent = None
        railroad_sent = None
        if verified_cat in ('labor', 'both'):
            labor_sent = labor_map[self.labor_sentiment_widget.value]
        if verified_cat in ('railroad', 'both'):
            railroad_sent = rr_map[self.railroad_sentiment_widget.value]

        label = {
            'article_id': article['article_id'],
            'lccn': article['lccn'],
            'issn': article['issn'],
            'year': article['year'],
            'text': article['text'],
            'predicted_category': article['category'],
            'verified_category': verified_cat,
            'labor_sentiment': labor_sent,
            'railroad_sentiment': railroad_sent,
            'labeler': 'human',
        }

        try:
            save_json(self.output_file, self.labels + [label])
        except Exception as exc:
            self.status_label.value = f'Save failed: {exc}'
            raise
        self.labels.append(label)

        self.current_idx += 1
        self.progress.value = len(self.labels)
        self.status_label.value = f'Saved. Total labeled: {len(self.labels)}'

        if self.current_idx < len(self.articles):
            self._load_article(self.current_idx)
        else:
            self.text_widget.value = f'<h2>Labeling complete! {len(self.labels)} articles labeled.</h2>'

    def _skip(self, _btn):
        """Move to the next article without saving a label for the current one."""
        # Clamp so repeated clicks at the end do not grow the index without bound.
        self.current_idx = min(self.current_idx + 1, len(self.articles))
        if self.current_idx < len(self.articles):
            self._load_article(self.current_idx)
        else:
            self.text_widget.value = '<h2>No more articles to label.</h2>'

# Launch the labeling UI on the sampled articles; labels are persisted to
# HAND_LABELED_FILE, which is also what a later run resumes from.
tool = ArticleLabelingTool(sample, HAND_LABELED_FILE)
tool.create_ui()

Resuming: 56 already labeled, 1000 remaining


IntProgress(value=56, description='Progress:', layout=Layout(width='100%'), max=1056)

HTML(value='', layout=Layout(border_bottom='1px solid #ccc', border_left='1px solid #ccc', border_right='1px s…

RadioButtons(description='Category:', layout=Layout(width='400px'), options=('Labor', 'Railroad', 'Both', 'Nei…

HBox(children=(VBox(children=(HTML(value='<b>Labor Stance</b> (editorial position on workers/unions)'), RadioB…

HBox(children=(Button(button_style='primary', description='Save & Next', layout=Layout(width='150px'), style=B…

In [15]:
# Check labeling progress
if not HAND_LABELED_FILE.exists():
    print("No hand labels found yet. Use the widget above to start labeling.")
else:
    hand_labels = load_json(HAND_LABELED_FILE)
    print(f"Total hand-labeled: {len(hand_labels)}")

    # An existing but empty label file would otherwise divide by zero below.
    if hand_labels:
        # Category verification
        categories = [row['verified_category'] for row in hand_labels]
        print("\nCategory distribution:")
        for c in ['labor', 'railroad', 'both', 'neither']:
            print(f"  {c}: {categories.count(c)}")
        # "Accuracy" here = share of sampled articles NOT marked 'neither'.
        misclassified = categories.count('neither')
        print(f"Classification accuracy: {(len(categories) - misclassified) / len(categories) * 100:.1f}%")

        # Labor stance distribution (labor + both articles)
        labor_labels = [row['labor_sentiment'] for row in hand_labels if row.get('labor_sentiment')]
        if labor_labels:
            print(f"\nLabor stance distribution ({len(labor_labels)} articles):")
            for s in ['pro_labor', 'anti_labor', 'neutral']:
                count = labor_labels.count(s)
                print(f"  {s}: {count} ({count/len(labor_labels)*100:.1f}%)")

        # Railroad outlook distribution (railroad + both articles)
        rr_labels = [row['railroad_sentiment'] for row in hand_labels if row.get('railroad_sentiment')]
        if rr_labels:
            print(f"\nRailroad outlook distribution ({len(rr_labels)} articles):")
            for s in ['optimistic', 'pessimistic', 'neutral']:
                count = rr_labels.count(s)
                print(f"  {s}: {count} ({count/len(rr_labels)*100:.1f}%)")

Total hand-labeled: 100

Category distribution:
  labor: 34
  railroad: 23
  both: 20
  neither: 23
Classification accuracy: 77.0%

Labor stance distribution (54 articles):
  pro_labor: 24 (44.4%)
  anti_labor: 14 (25.9%)
  neutral: 16 (29.6%)

Railroad outlook distribution (43 articles):
  optimistic: 15 (34.9%)
  pessimistic: 8 (18.6%)
  neutral: 20 (46.5%)


## 2B. Gemini Verification (Training Data Only)

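Two separate Gemini prompts — one per axis. Each axis draws its own random sample (labor pool = labor + both articles; railroad pool = railroad + both articles), so a "Both" article is eligible for either prompt and goes through both only if it is drawn in both samples.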

Gemini labels are used **ONLY for training**, never validation.

In [16]:
from google import genai
from google.genai import types
import os

# Gemini client and runtime settings, all read from environment variables.
_env = os.environ
client = genai.Client(api_key=_env.get("GOOGLE_API_KEY"))
GEMINI_MODEL = _env.get("GEMINI_MODEL", "gemini-flash-latest")
GEMINI_API_DELAY = int(_env.get("GEMINI_API_DELAY", "1"))  # seconds slept between batches

# Generation parameters shared by every request in this notebook.
_generation_settings = {
    "temperature": 0.1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
}
GENERATION_CONFIG = types.GenerateContentConfig(**_generation_settings)


def _is_rate_limit_error(exc) -> bool:
    """Return True when *exc* looks like a quota / rate-limit failure.

    A bare ``"rate" in message`` test is not usable: "rate" is a substring of
    "generate" / "generateContent", so unrelated API errors that mention the
    endpoint would be retried as if they were rate limits.
    """
    # Assumes SDK errors expose the HTTP status as `.code`; getattr keeps
    # this safe for exceptions that do not.
    if getattr(exc, 'code', None) == 429:
        return True
    message = str(exc).lower()
    markers = (
        'quota', 'rate limit', 'rate-limit', 'ratelimit', 'rate exceeded',
        'resource_exhausted', 'resource exhausted', 'too many requests',
    )
    return any(marker in message for marker in markers)


def query_gemini_api(prompt: str, retry_count: int = 3) -> str:
    """Send *prompt* to Gemini and return the response text.

    Rate-limit / quota errors are retried with a linearly growing wait
    (15 s, 30 s, ...); any other error is logged and re-raised immediately.

    Args:
        prompt: Full prompt text.
        retry_count: Maximum number of attempts.

    Returns:
        The ``text`` attribute of the API response.

    Raises:
        Exception: When every attempt hit a rate limit; the last API error is
            attached as the cause. Non-rate-limit errors propagate unchanged.
    """
    last_error = None
    for attempt in range(retry_count):
        try:
            response = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=prompt,
                config=GENERATION_CONFIG,
            )
            return response.text
        except Exception as e:
            if not _is_rate_limit_error(e):
                log.error(f"Gemini API error: {e}")
                raise
            last_error = e
            if attempt + 1 < retry_count:
                wait_time = (attempt + 1) * 15
                log.warning(f"Rate limit hit, waiting {wait_time}s (attempt {attempt + 1}/{retry_count})")
                time.sleep(wait_time)
            else:
                # No point sleeping when there is no attempt left.
                log.warning(f"Rate limit hit on final attempt ({attempt + 1}/{retry_count})")
    raise Exception(f"Failed after {retry_count} retries") from last_error


def parse_gemini_json(response: str) -> list:
    """Decode the JSON payload from a Gemini text response.

    First strips an optional leading ```` ```json ```` / ```` ``` ```` fence and
    trailing ```` ``` ```` fence and parses the remainder. If that fails (for
    example an upper-case ```` ```JSON ```` tag, or commentary around the
    array), it falls back to parsing the span from the first '[' to the last
    ']' of the response.

    Args:
        response: Raw response text; ``None`` is treated as an empty string.

    Returns:
        The decoded JSON value (the prompts in this notebook request an array).

    Raises:
        json.JSONDecodeError: If neither pass yields valid JSON.
    """
    text = (response or "").strip()

    # Pass 1: exact fence stripping.
    stripped = text
    if stripped.startswith("```json"):
        stripped = stripped[7:]
    if stripped.startswith("```"):
        stripped = stripped[3:]
    if stripped.endswith("```"):
        stripped = stripped[:-3]

    try:
        return json.loads(stripped.strip())
    except json.JSONDecodeError as strict_error:
        # Pass 2: pull out the outermost [...] span and try again.
        start = text.find("[")
        end = text.rfind("]")
        if start != -1 and end > start:
            try:
                return json.loads(text[start:end + 1])
            except json.JSONDecodeError:
                pass  # report the original, more informative error below
        raise strict_error


# Echo the effective settings so the run records which model and delay were used.
print(f"Gemini model: {GEMINI_MODEL}")
print(f"API delay: {GEMINI_API_DELAY}s")

Gemini model: gemini-flash-latest
API delay: 1s


In [17]:
# Build Gemini samples — exclude hand-labeled articles
hand_labeled_ids = set()
if HAND_LABELED_FILE.exists():
    hand_labeled_ids = {row['article_id'] for row in load_json(HAND_LABELED_FILE)}

GEMINI_TARGET_PER_AXIS = 100
BATCH_SIZE = 10

# Seeded, dedicated generator: with the unseeded global `random`, every rerun
# of this cell draws a fresh 100 articles, so the resumable verification step
# keeps adding new API work instead of finishing the sample it started.
# To label more articles, raise GEMINI_TARGET_PER_AXIS.
GEMINI_SAMPLE_SEED = 42
_gemini_rng = random.Random(GEMINI_SAMPLE_SEED)

# Labor axis: sample from labor + both (excluding hand-labeled)
labor_pool = [a for a in labor_articles + both_articles if a['article_id'] not in hand_labeled_ids]
labor_gemini_sample = _gemini_rng.sample(labor_pool, min(GEMINI_TARGET_PER_AXIS, len(labor_pool)))

# Railroad axis: sample from railroad + both (excluding hand-labeled)
railroad_pool = [a for a in railroad_articles + both_articles if a['article_id'] not in hand_labeled_ids]
railroad_gemini_sample = _gemini_rng.sample(railroad_pool, min(GEMINI_TARGET_PER_AXIS, len(railroad_pool)))

print(f"Labor Gemini sample: {len(labor_gemini_sample)}")
print(f"Railroad Gemini sample: {len(railroad_gemini_sample)}")

Labor Gemini sample: 100
Railroad Gemini sample: 100


In [18]:
def run_gemini_verification(sample, prompt_builder, sentiment_key, output_file):
    """Label *sample* with Gemini on one axis, saving results incrementally.

    Args:
        sample: Article dicts providing 'article_id', 'lccn', 'issn', 'year',
            'text' and 'category'.
        prompt_builder: Callable that takes one batch (list of articles) and
            returns the prompt string.
        sentiment_key: Key of the label in each Gemini result object; the
            same key is used in the stored entries.
        output_file: Path of the JSON results file. If it exists, its entries
            are loaded and articles already present are skipped.

    Returns:
        All result entries (previously saved ones plus the new ones).
    """
    results = []
    if output_file.exists():
        results = load_json(output_file)
        processed_ids = {r['article_id'] for r in results}
        sample = [a for a in sample if a['article_id'] not in processed_ids]
        print(f"Resuming: {len(results)} done, {len(sample)} remaining")

    batches = [sample[i:i+BATCH_SIZE] for i in range(0, len(sample), BATCH_SIZE)]

    try:
        for batch_idx, batch in enumerate(tqdm(batches, desc=f"Gemini {sentiment_key}")):
            prompt = prompt_builder(batch)
            try:
                response = query_gemini_api(prompt)
                batch_results = parse_gemini_json(response)

                labeled_positions = set()
                for j, result in enumerate(batch_results):
                    if not isinstance(result, dict):
                        continue
                    # Prefer the 1-based article number Gemini echoes back, so a
                    # reordered or partially missing answer is not attached to
                    # the wrong article; fall back to list position otherwise.
                    article_num = result.get('article_num')
                    if (isinstance(article_num, int) and not isinstance(article_num, bool)
                            and 1 <= article_num <= len(batch)):
                        pos = article_num - 1
                    else:
                        pos = j
                    if pos >= len(batch) or pos in labeled_positions:
                        continue
                    labeled_positions.add(pos)

                    article = batch[pos]
                    entry = {
                        'article_id': article['article_id'],
                        'lccn': article['lccn'],
                        'issn': article['issn'],
                        'year': article['year'],
                        'text': article['text'],
                        'category': article['category'],
                        # str() so a non-string value cannot abort the whole batch.
                        sentiment_key: str(result.get(sentiment_key, 'neutral')).strip().lower(),
                        'confidence': str(result.get('confidence', 'medium')).strip().lower(),
                        'labeler': 'gemini',
                    }
                    results.append(entry)

                if len(labeled_positions) < len(batch):
                    log.warning(
                        f"Batch {batch_idx}: Gemini labeled {len(labeled_positions)} "
                        f"of {len(batch)} articles; the rest stay unlabeled"
                    )

                if (batch_idx + 1) % 10 == 0:
                    save_json(output_file, results)
                    log.info(f"Checkpoint: {len(results)} articles")

            except json.JSONDecodeError as e:
                log.error(f"Batch {batch_idx}: JSON parse error: {e}")
            except Exception as e:
                log.error(f"Batch {batch_idx}: Error: {e}")

            time.sleep(GEMINI_API_DELAY)
    finally:
        # Also runs on KeyboardInterrupt, so finished batches are never lost.
        save_json(output_file, results)

    print(f"Done: {len(results)} articles verified")
    return results


# Confirmation that the cell above was executed.
print("run_gemini_verification() defined.")

run_gemini_verification() defined.


In [19]:
def build_labor_prompt(articles_batch: list) -> str:
    """Assemble the labor-stance classification prompt for one batch.

    Each article contributes a numbered section with its year and a
    1000-character excerpt centred on labor keywords; the sections follow the
    task instructions and the JSON output specification.
    """
    sections = []
    for number, article in enumerate(articles_batch, start=1):
        # Center excerpt on labor keywords
        excerpt, _ = keyword_centered_excerpt(article['text'], 'labor', window=1000)
        sections.append(
            f"\n--- ARTICLE {number} ---\n"
            f"Year: {article['year']}\n"
            f"Text: {excerpt}\n"
        )
    articles_text = "".join(sections)

    return f"""You are analyzing historical American newspaper articles from 1869-1890 for editorial stance toward labor movements during the Gilded Age.

For each article, determine the labor stance:
- PRO_LABOR: Sympathizes with workers, justifies strikes, criticizes employers/owners
- ANTI_LABOR: Condemns strikes as riots, supports employers, portrays unions as dangerous
- NEUTRAL: Factual reporting without clear editorial bias

Also rate your confidence: high, medium, or low.

Return ONLY a valid JSON array (no markdown, no commentary):
[
  {{"article_num": 1, "labor_sentiment": "pro_labor", "confidence": "high"}},
  ...
]

{articles_text}"""


def build_railroad_prompt(articles_batch: list) -> str:
    """Assemble the railroad-outlook classification prompt for one batch.

    Each article contributes a numbered section with its year and a
    1000-character excerpt centred on railroad keywords; the sections follow
    the task instructions and the JSON output specification.
    """
    sections = []
    for number, article in enumerate(articles_batch, start=1):
        # Center excerpt on railroad keywords
        excerpt, _ = keyword_centered_excerpt(article['text'], 'railroad', window=1000)
        sections.append(
            f"\n--- ARTICLE {number} ---\n"
            f"Year: {article['year']}\n"
            f"Text: {excerpt}\n"
        )
    articles_text = "".join(sections)

    return f"""You are analyzing historical American newspaper articles from 1869-1890 for editorial framing of the railroad industry during the Gilded Age.

For each article, determine the railroad outlook:
- OPTIMISTIC: Celebrates expansion/progress, praises railroad companies, emphasizes economic benefits
- PESSIMISTIC: Emphasizes accidents, corruption, monopoly power, financial failures, public harm
- NEUTRAL: Factual reporting without clear editorial framing

Also rate your confidence: high, medium, or low.

Return ONLY a valid JSON array (no markdown, no commentary):
[
  {{"article_num": 1, "railroad_sentiment": "pessimistic", "confidence": "high"}},
  ...
]

{articles_text}"""

In [20]:
# Run labor stance verification
# Results accumulate in GEMINI_LABOR_FILE; articles already in that file are skipped on rerun.
GEMINI_LABOR_FILE = OUTPUT_DIR / 'gemini_labor_verified.json'
print("=== Labor Stance Verification ===")
gemini_labor = run_gemini_verification(
    labor_gemini_sample, build_labor_prompt, 'labor_sentiment', GEMINI_LABOR_FILE
)

=== Labor Stance Verification ===
Resuming: 200 done, 100 remaining


Gemini labor_sentiment:   0%|          | 0/10 [00:00<?, ?it/s]2026-02-15 22:59:23,730 [INFO] AFC is enabled with max remote calls: 10.
2026-02-15 23:00:07,281 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"
Gemini labor_sentiment:  10%|█         | 1/10 [00:44<06:41, 44.60s/it]2026-02-15 23:00:08,355 [INFO] AFC is enabled with max remote calls: 10.
2026-02-15 23:00:52,544 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"
Gemini labor_sentiment:  20%|██        | 2/10 [01:29<05:59, 44.99s/it]2026-02-15 23:00:53,634 [INFO] AFC is enabled with max remote calls: 10.
2026-02-15 23:01:43,948 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"
Gemini labor_sentiment:  30%|███       | 3/10 [02:21<05:35, 47.92s/it]2026-02-15 23:01:45,061 [INFO

Done: 300 articles verified





In [21]:
# Run railroad outlook verification
# Results accumulate in GEMINI_RAILROAD_FILE; articles already in that file are skipped on rerun.
GEMINI_RAILROAD_FILE = OUTPUT_DIR / 'gemini_railroad_verified.json'
print("=== Railroad Outlook Verification ===")
gemini_railroad = run_gemini_verification(
    railroad_gemini_sample, build_railroad_prompt, 'railroad_sentiment', GEMINI_RAILROAD_FILE
)

=== Railroad Outlook Verification ===
Resuming: 200 done, 99 remaining


Gemini railroad_sentiment:   0%|          | 0/10 [00:00<?, ?it/s]2026-02-15 23:05:42,841 [INFO] AFC is enabled with max remote calls: 10.
2026-02-15 23:05:52,956 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"
Gemini railroad_sentiment:  10%|█         | 1/10 [00:11<01:40, 11.18s/it]2026-02-15 23:05:53,998 [INFO] AFC is enabled with max remote calls: 10.
2026-02-15 23:06:04,018 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"
Gemini railroad_sentiment:  20%|██        | 2/10 [00:22<01:28, 11.10s/it]2026-02-15 23:06:05,077 [INFO] AFC is enabled with max remote calls: 10.
2026-02-15 23:06:12,806 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-latest:generateContent "HTTP/1.1 200 OK"
Gemini railroad_sentiment:  30%|███       | 3/10 [00:31<01:10, 10.05s/it]2026-02-15 23:06:

Done: 299 articles verified





In [22]:
# Gemini label statistics
for name, filepath, key in [
    ('Labor', GEMINI_LABOR_FILE, 'labor_sentiment'),
    ('Railroad', GEMINI_RAILROAD_FILE, 'railroad_sentiment'),
]:
    if not filepath.exists():
        continue
    gv = load_json(filepath)
    print(f"\n=== {name} Gemini Stats ===")
    print(f"Total: {len(gv)}")
    if not gv:
        # An empty results file would divide by zero in the percentages below.
        continue
    vals = [r.get(key, 'unknown') for r in gv]
    # Sorted so the report order is stable between runs (set order is not).
    for v in sorted(set(vals), key=str):
        count = vals.count(v)
        print(f"  {v}: {count} ({count/len(vals)*100:.1f}%)")
    high = sum(1 for r in gv if r.get('confidence') == 'high')
    print(f"  High-confidence: {high} ({high/len(gv)*100:.1f}%)")


=== Labor Gemini Stats ===
Total: 300
  anti_labor: 73 (24.3%)
  neutral: 144 (48.0%)
  pro_labor: 83 (27.7%)
  High-confidence: 226 (75.3%)

=== Railroad Gemini Stats ===
Total: 299
  neutral: 127 (42.5%)
  optimistic: 85 (28.4%)
  pessimistic: 87 (29.1%)
  High-confidence: 253 (84.6%)


## 2C. Split into Two Model-Specific Datasets

Each model gets its own train/val split. Validation is **100% hand-labeled**.

- Labor model: trained on (labor + both) articles with `labor_sentiment` labels
- Railroad model: trained on (railroad + both) articles with `railroad_sentiment` labels

In [None]:
hand = load_json(HAND_LABELED_FILE)

# Filter out misclassified
hand_valid = [row for row in hand if row['verified_category'] != 'neither']
print(f"Hand-labeled (excluding 'neither'): {len(hand_valid)}")

# Split hand-labeled by which axes they have labels for
hand_with_labor = [row for row in hand_valid if row.get('labor_sentiment')]
hand_with_railroad = [row for row in hand_valid if row.get('railroad_sentiment')]

print(f"  With labor stance: {len(hand_with_labor)} (labor + both articles)")
print(f"  With railroad outlook: {len(hand_with_railroad)} (railroad + both articles)")

# Seeded generator so the train/val assignment is identical on every run;
# with an unseeded shuffle, articles move between train and validation
# whenever the cell is re-executed.
SPLIT_SEED = 42
_split_rng = random.Random(SPLIT_SEED)

# Ids of every hand-labeled article (including 'neither'). Gemini rows for
# these ids are kept out of training: hand labeling can continue after the
# Gemini sample was drawn, so an article may exist in both files, and a
# validation article must never also appear in the training set.
hand_ids = {row['article_id'] for row in hand}

# --- Labor model split (60% train, 40% val) ---
_split_rng.shuffle(hand_with_labor)
split_labor = int(len(hand_with_labor) * 0.6)
hand_labor_train = hand_with_labor[:split_labor]
hand_labor_val = hand_with_labor[split_labor:]

# Add Gemini high-confidence labor labels (training only)
gemini_labor = load_json(GEMINI_LABOR_FILE)
gemini_labor_high = [g for g in gemini_labor
                     if g.get('confidence') == 'high'
                     and g.get('labor_sentiment') in ('pro_labor', 'anti_labor', 'neutral')
                     and g.get('article_id') not in hand_ids]

labor_train = hand_labor_train + gemini_labor_high
_split_rng.shuffle(labor_train)
labor_val = hand_labor_val

print("\n--- Labor Model Data ---")
print(f"Training: {len(labor_train)} ({len(hand_labor_train)} hand + {len(gemini_labor_high)} Gemini)")
print(f"Validation: {len(labor_val)} (100% hand-labeled)")

# --- Railroad model split (60% train, 40% val) ---
_split_rng.shuffle(hand_with_railroad)
split_rr = int(len(hand_with_railroad) * 0.6)
hand_rr_train = hand_with_railroad[:split_rr]
hand_rr_val = hand_with_railroad[split_rr:]

# Add Gemini high-confidence railroad labels (training only)
gemini_railroad = load_json(GEMINI_RAILROAD_FILE)
gemini_rr_high = [g for g in gemini_railroad
                  if g.get('confidence') == 'high'
                  and g.get('railroad_sentiment') in ('optimistic', 'pessimistic', 'neutral')
                  and g.get('article_id') not in hand_ids]

railroad_train = hand_rr_train + gemini_rr_high
_split_rng.shuffle(railroad_train)
railroad_val = hand_rr_val

print("\n--- Railroad Model Data ---")
print(f"Training: {len(railroad_train)} ({len(hand_rr_train)} hand + {len(gemini_rr_high)} Gemini)")
print(f"Validation: {len(railroad_val)} (100% hand-labeled)")

# Distribution checks
for name, data, key in [
    ('Labor Train', labor_train, 'labor_sentiment'),
    ('Labor Val', labor_val, 'labor_sentiment'),
    ('Railroad Train', railroad_train, 'railroad_sentiment'),
    ('Railroad Val', railroad_val, 'railroad_sentiment'),
]:
    vals = [d[key] for d in data if d.get(key)]
    print(f"\n{name} ({len(vals)} samples):")
    for v in sorted(set(vals)):
        count = vals.count(v)
        print(f"  {v}: {count} ({count/len(vals)*100:.1f}%)")

In [None]:
# Write the four train/validation datasets, then report each file's size
_datasets = {
    'labor_train.json': labor_train,
    'labor_val.json': labor_val,
    'railroad_train.json': railroad_train,
    'railroad_val.json': railroad_val,
}
for _filename, _rows in _datasets.items():
    save_json(OUTPUT_DIR / _filename, _rows)

print("Saved:")
for _filename in _datasets:
    path = OUTPUT_DIR / _filename
    size_mb = path.stat().st_size / 1e6
    print(f"  {path} ({size_mb:.1f} MB)")