# Update Headlines

In [None]:
from pathlib import Path
import csv
import json
import xml.etree.ElementTree as ET
from datetime import datetime
import shutil

# Root folder scanned for per-source feed directories; the relative path is
# resolved against the working directory at the moment this cell runs.
DATA_DIR = Path('data').resolve()
# Output folder that receives the hourly headline CSVs and latest.csv.
HEADLINES_DIR = Path('analysis/headlines').resolve()
# Create the output folder (and any missing parents) up front; this is a
# no-op when the folder already exists.
HEADLINES_DIR.mkdir(parents=True, exist_ok=True)

def _local_name(tag):
    """Return *tag* lower-cased with any ``{namespace}`` prefix removed."""
    if not isinstance(tag, str):
        # Comments and processing instructions carry a non-string tag.
        return ''
    return tag.rsplit('}', 1)[-1].lower()


def parse_feed(path: Path):
    """Extract ``(title, link)`` pairs from a JSON or XML (RSS/Atom) feed file.

    Files whose suffix is ``.json`` (case-insensitive) are read as a JSON
    object with an ``entries`` list of objects carrying ``title`` and
    ``link`` strings. Every other file is parsed as XML; each ``item`` or
    ``entry`` element contributes one pair built from its direct ``title``
    and ``link`` children.

    Args:
        path: Location of the feed file on disk.

    Returns:
        A list of ``(title, link)`` tuples with surrounding whitespace
        removed. Entries missing either value are skipped. A file that
        cannot be parsed yields an empty list.
    """
    entries = []

    if path.suffix.lower() == '.json':
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Mirror the XML branch: an unreadable feed contributes nothing
            # instead of aborting the whole collection run.
            return entries
        items = data.get('entries') if isinstance(data, dict) else None
        if not isinstance(items, list):
            return entries
        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get('title')
            link = item.get('link')
            if not (isinstance(title, str) and isinstance(link, str)):
                continue
            # Strip before testing so whitespace-only values are rejected.
            title, link = title.strip(), link.strip()
            if title and link:
                entries.append((title, link))
        return entries

    try:
        root = ET.parse(path).getroot()
    except ET.ParseError:
        return entries

    for item in root.iter():
        # Compare the exact local name so look-alike tags do not match.
        if _local_name(item.tag) not in ('item', 'entry'):
            continue
        title = None
        link = None
        fallback_link = None
        for child in item:
            name = _local_name(child.tag)
            if name == 'title':
                # Keep the first non-empty title; "subtitle" no longer matches.
                title = title or (child.text or '').strip()
            elif name == 'link':
                # RSS carries the URL as element text, Atom as an href attribute.
                url = (child.text or '').strip() or (child.attrib.get('href') or '').strip()
                if not url:
                    continue
                # An Atom link without rel is an "alternate" link, i.e. the
                # article itself; self/enclosure/related links are last resort.
                if child.attrib.get('rel', 'alternate') == 'alternate':
                    link = link or url
                else:
                    fallback_link = fallback_link or url
        link = link or fallback_link
        if title and link:
            entries.append((title, link))
    return entries

def collect_headlines():
    """Gather ``(title, link)`` pairs from the newest feed of every news source.

    Each sub-directory of ``DATA_DIR`` whose name starts with ``news`` is
    searched recursively for ``latest.*`` files with a ``.json``, ``.rss`` or
    ``.xml`` suffix. When none are found, any file with one of those suffixes
    is considered instead. The most recently modified candidate is parsed
    with ``parse_feed`` and its entries are appended to the result.

    Returns:
        A list with the entries of all sources, in directory iteration order.
    """
    feed_suffixes = {'.json', '.rss', '.xml'}
    gathered = []
    for folder in DATA_DIR.iterdir():
        if not (folder.is_dir() and folder.name.startswith('news')):
            continue
        feeds = [f for f in folder.rglob('latest.*') if f.suffix in feed_suffixes]
        if not feeds:
            # No explicit "latest" file: fall back to every feed-like file.
            feeds = [f for f in folder.rglob('*') if f.suffix in feed_suffixes]
        if feeds:
            newest = max(feeds, key=lambda f: f.stat().st_mtime)
            gathered += parse_feed(newest)
    return gathered

def update_headlines():
    """Write this hour's headlines to a CSV and refresh ``latest.csv``.

    The output file is named after the current UTC hour
    (``YYYY-MM-DD-HH-00.csv``) inside ``HEADLINES_DIR``. If that file already
    exists the function prints a notice and returns without collecting
    anything, so running it repeatedly within the same hour is harmless.
    Otherwise the entries returned by ``collect_headlines`` are written under
    a ``title,link`` header and the file is copied to ``latest.csv``.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Local import: the file's top-level import only brings in ``datetime``.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated since Python 3.12; the aware
    # equivalent formats to exactly the same string.
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-00')
    hourly_file = HEADLINES_DIR / f'{timestamp}.csv'
    if hourly_file.exists():
        print(f'{hourly_file.name} already exists. Skipping update.')
        return
    # Collect before opening the file so a failure leaves no empty CSV behind.
    entries = collect_headlines()
    with open(hourly_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'link'])
        writer.writerows(entries)
    latest_file = HEADLINES_DIR / 'latest.csv'
    shutil.copy(hourly_file, latest_file)
    print(f'Wrote {hourly_file} and updated latest.csv')


In [None]:
# Run the update: writes the CSV for the current UTC hour and refreshes
# latest.csv, or prints a notice and does nothing if that hour's file exists.
update_headlines()