# Update Headlines

In [None]:
from pathlib import Path
import csv
import json
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
import shutil

# Resolve the repository root. When the working directory itself holds a
# ``data`` folder it is used as the root; otherwise the root is taken to be
# two levels above the working directory.
BASE_DIR = Path.cwd()
REPO_DIR = BASE_DIR if (BASE_DIR / 'data').exists() else BASE_DIR.parent.parent

# Input feeds are read from DATA_DIR; headline CSVs are written to
# HEADLINES_DIR, which is created up front if it does not exist yet.
DATA_DIR = REPO_DIR / 'data'
HEADLINES_DIR = REPO_DIR / 'analysis/headlines'
HEADLINES_DIR.mkdir(parents=True, exist_ok=True)

def parse_pubdate(date_str):
    """Parse a feed timestamp into a timezone-aware UTC ``datetime``.

    RFC 2822 dates (the format of RSS ``pubDate``) are tried first. If that
    fails, the text is parsed as ISO 8601, the format used by Atom
    ``published`` / ``updated`` elements. A trailing ``Z`` is rewritten to
    ``+00:00`` so the ISO fallback also works on interpreters whose
    ``datetime.fromisoformat`` does not accept the ``Z`` suffix.

    Args:
        date_str: Raw timestamp text. ``None``, empty strings and
            non-string values are accepted and yield ``None``.

    Returns:
        The timestamp converted to UTC, or ``None`` when the input is
        missing or cannot be parsed. Timestamps without an offset are
        treated as already being in UTC.
    """
    if not date_str or not isinstance(date_str, str):
        return None
    text = date_str.strip()

    try:
        dt = parsedate_to_datetime(text)
    except (TypeError, ValueError, IndexError):
        # Not an RFC 2822 date; fall through to the ISO 8601 attempt.
        dt = None

    if dt is None:
        iso_text = text[:-1] + '+00:00' if text.endswith(('Z', 'z')) else text
        try:
            dt = datetime.fromisoformat(iso_text)
        except ValueError:
            return None

    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    try:
        return dt.astimezone(timezone.utc)
    except (OverflowError, ValueError):
        # Dates at the edge of the representable range cannot be shifted.
        return None

def format_pubdate(dt):
    """Render a datetime for the CSV ``pubdate`` column.

    Args:
        dt: A ``datetime`` or a falsy value such as ``None``.

    Returns:
        ``YYYY-MM-DD-HH-MM-SS +0000`` for a datetime, or an empty string
        when ``dt`` is falsy. The ``+0000`` suffix is a literal part of the
        format string; it is not derived from ``dt``'s own offset.
    """
    if not dt:
        return ''
    return dt.strftime('%Y-%m-%d-%H-%M-%S +0000')

def _local_name(tag):
    """Return a lower-cased element name with any ``{namespace}`` prefix removed."""
    if not isinstance(tag, str):
        return ''
    return tag.rsplit('}', 1)[-1].lower()


def parse_feed(path: Path):
    """Extract ``(pubdate, title, link)`` tuples from one saved feed file.

    Files with a ``.json`` suffix are read as a JSON object holding an
    ``entries`` list whose items carry ``title``, ``link`` and ``published``
    keys. Every other file is parsed as XML and scanned for ``item`` (RSS)
    and ``entry`` (Atom) elements at any depth.

    Args:
        path: Location of the feed file.

    Returns:
        A list of ``(pubdate, title, link)`` tuples. ``pubdate`` is whatever
        ``parse_pubdate`` returns for the entry's date text, or ``None`` when
        an XML entry has no date element. Entries without a non-empty title
        and link are skipped. A file that cannot be decoded or parsed yields
        an empty list.
    """
    entries = []

    if path.suffix == '.json':
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Same best-effort policy as the XML branch: skip a corrupt file
            # instead of aborting the whole run.
            return entries
        items = data.get('entries') if isinstance(data, dict) else None
        if not isinstance(items, list):
            return entries
        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get('title')
            link = item.get('link')
            if not (isinstance(title, str) and isinstance(link, str)):
                continue
            title = title.strip()
            link = link.strip()
            # Checked after stripping so whitespace-only values are rejected,
            # matching the XML branch.
            if title and link:
                entries.append((parse_pubdate(item.get('published')), title, link))
        return entries

    try:
        root = ET.parse(path).getroot()
    except ET.ParseError:
        return entries

    for item in root.iter():
        # Compare the namespace-free name exactly; suffix matching would also
        # accept unrelated elements such as ``menuitem``.
        if _local_name(item.tag) not in ('item', 'entry'):
            continue
        title = None
        link = None
        pub = None
        for child in item:
            name = _local_name(child.tag)
            if name == 'title':
                title = (child.text or '').strip()
            elif name == 'link':
                # RSS stores the URL as element text, Atom in ``href``.
                target = (child.text or '').strip() or (child.attrib.get('href') or '').strip()
                rel = child.attrib.get('rel', 'alternate')
                # Atom entries may carry several links (self, replies, ...).
                # Only an "alternate" link may replace one already found.
                if target and (link is None or rel == 'alternate'):
                    link = target
            elif name in ('pubdate', 'published', 'updated'):
                pub = parse_pubdate((child.text or '').strip())
        if title and link:
            entries.append((pub, title, link))
    return entries

def collect_headlines():
    """Gather headline rows from the newest feed file of each news directory.

    Every directory directly under ``DATA_DIR`` whose name starts with
    ``news`` is searched recursively for ``.json``, ``.rss`` and ``.xml``
    files. Files named ``latest.*`` are preferred; if there are none, any
    file with one of those suffixes is considered. The single most recently
    modified candidate is then parsed with ``parse_feed``.

    Returns:
        A list of ``(pubdate, title, link, source_name)`` tuples, where
        ``source_name`` is the second path component of the chosen file
        relative to ``DATA_DIR``.
    """
    feed_suffixes = {'.json', '.rss', '.xml'}
    collected = []
    for source in DATA_DIR.iterdir():
        if not (source.is_dir() and source.name.startswith('news')):
            continue

        found = [p for p in source.rglob('latest.*') if p.suffix in feed_suffixes]
        if not found:
            found = [p for p in source.rglob('*') if p.suffix in feed_suffixes]
        if not found:
            continue

        # NOTE(review): only ONE file per ``news*`` directory is parsed. If
        # the layout is data/news/<source>/..., every source except the most
        # recently modified one is ignored -- confirm this is intended.
        newest = max(found, key=lambda p: p.stat().st_mtime)
        source_name = newest.relative_to(DATA_DIR).parts[1]
        collected.extend(
            (pub, title, link, source_name)
            for pub, title, link in parse_feed(newest)
        )
    return collected

def _date_key(date_str):
    try:
        return parsedate_to_datetime(date_str) if date_str else datetime.min
    except Exception:
        return datetime.min

def update_headlines():
    """Write this hour's de-duplicated headline CSV and refresh ``latest.csv``.

    The output file is named after the current UTC hour
    (``YYYY-MM-DD-HH-00.csv``) inside ``HEADLINES_DIR``. If that file already
    exists the function prints a notice and returns without doing anything.
    Otherwise it collects all headlines, orders them newest first (entries
    without a date go last), drops any entry whose lower-cased title or link
    was already seen, writes the rows, and copies the result to
    ``latest.csv``.

    Returns:
        None. The results are the two CSV files and a status line on stdout.
    """
    # datetime.utcnow() is deprecated; an aware "now" in UTC formats to the
    # same string.
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-00')
    hourly_file = HEADLINES_DIR / f"{timestamp}.csv"
    if hourly_file.exists():
        print(f"{hourly_file.name} already exists. Skipping update.")
        return

    entries = collect_headlines()
    # Undated entries get the smallest possible aware datetime so they sort
    # last under reverse=True without mixing naive and aware values.
    oldest = datetime.min.replace(tzinfo=timezone.utc)
    entries.sort(key=lambda r: r[0] or oldest, reverse=True)

    # Because the list is sorted newest first, the newest copy of a repeated
    # title or link is the one that is kept.
    deduped = []
    seen_titles = set()
    seen_links = set()
    for pub, title, link, src in entries:
        t_key = title.lower()
        l_key = link.lower()
        if t_key in seen_titles or l_key in seen_links:
            continue
        deduped.append((pub, src, title, link))
        seen_titles.add(t_key)
        seen_links.add(l_key)

    with open(hourly_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(['pubdate', 'source', 'title', 'link'])
        writer.writerows(
            [format_pubdate(pub), src, title, link]
            for pub, src, title, link in deduped
        )

    latest_file = HEADLINES_DIR / "latest.csv"
    shutil.copy(hourly_file, latest_file)
    print(f"Wrote {hourly_file} and updated latest.csv")


# The call used to be indented into the function body, so running the cell
# never started an update and the function only re-entered itself once.
if __name__ == "__main__":
    update_headlines()
