# Add Data Source to Catalog
This notebook guides you through adding a new data source to `catalog.csv`.

In [ ]:
# Ensure required packages
import importlib, subprocess, sys

def _ensure(pkg_name, import_name=None):
    """Import a module, installing its distribution with pip first if needed.

    The imported module is bound in this module's globals under
    ``import_name or pkg_name`` so later code can refer to it by name.

    Args:
        pkg_name: Distribution name passed to ``pip install``.
        import_name: Module name to import when it differs from ``pkg_name``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        ModuleNotFoundError: If the module is still missing after installing.
    """
    module_name = import_name or pkg_name
    try:
        module = importlib.import_module(module_name)
    except ModuleNotFoundError:
        # Let a pip failure propagate as-is instead of masking it with a
        # second import error.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
        # Import finders cache directory listings; refresh them so the
        # freshly installed package is visible to this running interpreter.
        importlib.invalidate_caches()
        module = importlib.import_module(module_name)
    globals()[module_name] = module

# Make sure every third-party package used below is importable.
for pkg in ['pandas', 'requests', 'feedparser']:
    _ensure(pkg_name=pkg)
print('Dependencies ready.')

In [ ]:
from pathlib import Path
import csv, re
from urllib.parse import urlparse
import pandas as pd, requests, feedparser

# Prefer a catalog in the working directory; otherwise fall back to a
# ``data`` subdirectory of it.
if Path('catalog.csv').exists():
    BASE_DIR = Path.cwd()
else:
    BASE_DIR = Path.cwd() / 'data'
catalog_path = BASE_DIR.joinpath('catalog.csv')
cat = pd.read_csv(catalog_path)

# Phrases looked for in the combined title/summary text of feed entries.
KEY_PHRASES = [
    'business',
    'world news',
    'economy',
    'politics',
]

def detect_filetype(url, resp):
    """Classify a fetched resource as ``'rss'``, ``'json'`` or ``'csv'``.

    The URL's file extension is checked first; if it is inconclusive, the
    response's ``content-type`` header is consulted.

    Args:
        url: The URL that was requested.
        resp: Response object exposing a ``headers`` mapping.

    Returns:
        ``'rss'``, ``'json'``, ``'csv'``, or ``None`` when neither the URL nor
        the header identifies the type.
    """
    extension_types = (
        (('.rss', '.xml'), 'rss'),
        (('.json',), 'json'),
        (('.csv',), 'csv'),
    )
    try:
        path = urlparse(url).path.lower()
    except ValueError:
        # Unparseable URL: rely on the raw-string check below.
        path = ''
    # Test the path on its own so a query string or fragment
    # (``feed.xml?utm=1``) does not hide the extension, then fall back to the
    # full URL so links such as ``download?file=data.csv`` still match.
    for candidate in (path, url.lower()):
        for suffixes, filetype in extension_types:
            if candidate.endswith(suffixes):
                return filetype

    content_type = (resp.headers.get('content-type') or '').lower()
    if 'json' in content_type:
        return 'json'
    if 'csv' in content_type or 'text/plain' in content_type:
        return 'csv'
    if 'xml' in content_type or 'rss' in content_type:
        return 'rss'
    return None

def root_domain(url):
    """Return the last two labels of the URL's host name, lower-cased.

    Credentials and port are excluded. Hosts with one or two labels, and
    all-numeric dotted hosts (IPv4 addresses), are returned whole. Multi-label
    public suffixes such as ``co.uk`` are not recognised: only the final two
    labels are kept.

    Args:
        url: Absolute URL to inspect.

    Returns:
        The shortened host, or ``''`` when the URL has no host component.
    """
    # ``hostname`` already strips ``user:pass@`` and ``:port`` and lower-cases.
    host = (urlparse(url).hostname or '').rstrip('.')
    labels = host.split('.')
    # Truncating an IPv4 address to its last two octets would be meaningless.
    if all(label.isdigit() for label in labels):
        return host
    if len(labels) > 2:
        return '.'.join(labels[-2:])
    return host

def guess_source(url):
    """Suggest a ``source`` value for a URL.

    If any catalog row's ``link`` or ``url`` contains the URL's root domain,
    that row's ``source`` is reused (first match wins). Otherwise the first
    label of the root domain is returned.

    Args:
        url: URL of the data source being added.

    Returns:
        The suggested source, or ``''`` when the URL has no usable domain.
    """
    rd = root_domain(url)
    if not rd:
        # An empty needle is contained in every string and would wrongly
        # select the first catalog row.
        return ''
    # Literal matching: as a regex the dots in the domain would match any
    # character (``example.com`` would also hit ``exampleXcom``).
    in_link = cat['link'].astype(str).str.contains(rd, case=False, regex=False)
    in_url = cat['url'].astype(str).str.contains(rd, case=False, regex=False)
    mask = in_link | in_url
    if mask.any():
        return cat.loc[mask, 'source'].iloc[0]
    return rd.split('.')[0]

def guess_category(title, url):
    """Suggest a catalog category from a feed title and URL.

    Alphabetic tokens are taken from the title and the URL's root domain.
    Each catalog row whose ``description`` or ``category`` contains a token
    (case-insensitively) adds one point to that row's category; the category
    with the most points is returned.

    Args:
        title: Feed title; may be empty or ``None``.
        url: URL of the data source being added.

    Returns:
        The highest-scoring category, or ``''`` when nothing matched.
    """
    text = f"{title or ''} {root_domain(url)}"
    # Sorted so ties are broken identically on every run; iterating the bare
    # set would follow per-process string-hash order.
    tokens = sorted(set(re.findall(r'[A-Za-z]+', text)))
    scores = {}
    for token in tokens:
        hits = (
            cat['description'].str.contains(token, case=False, na=False)
            | cat['category'].str.contains(token, case=False, na=False)
        )
        # A row matched via its description may have no category; a missing
        # value is not a usable suggestion, so skip it.
        for category in cat.loc[hits, 'category'].dropna():
            scores[category] = scores.get(category, 0) + 1
    if scores:
        return max(scores, key=scores.get)
    return ''

def guess_link(url):
    """Build a site link from the URL's scheme and its root domain."""
    scheme = urlparse(url).scheme
    domain = root_domain(url)
    return '{}://{}'.format(scheme, domain)

def guess_folder(category, source):
    """Suggest a folder name of the form ``<category>-<source>``.

    Returns ``''`` when either argument is empty or otherwise falsy.
    """
    if not (category and source):
        return ''
    return '{}-{}'.format(category, source)

# Interactive loop: fetch a URL, infer catalog fields from it, let the user
# confirm or override each one, then append the accepted row to the catalog.
while True:
    url = input('URL (blank to exit): ').strip()
    if not url:
        break
    try:
        resp = requests.get(url, timeout=15, headers={'User-Agent': 'Mozilla/5.0'})
        resp.raise_for_status()
    except Exception as e:
        # Deliberately broad: any fetch failure should just re-prompt rather
        # than end the interactive session.
        print('Error fetching:', e)
        continue
    ftype = detect_filetype(url, resp)
    if not ftype:
        print('Could not determine file type.')
        continue
    title = ''
    if ftype == 'rss':
        feed = feedparser.parse(resp.content)
        title = feed.feed.get('title', '')
        # Combine every entry's title and summary into one lower-cased string
        # for the key-phrase check.
        text = ' '.join(
            ' '.join(filter(None, [e.get('title', ''), e.get('summary', '')]))
            for e in feed.entries
        ).lower()
        if any(p in text for p in KEY_PHRASES):
            print('Key phrase found in feed.')
    print('Detected type:', ftype)
    if title:
        print('Feed title:', title)
    source_guess = guess_source(url)
    category_guess = guess_category(title, url)
    link_guess = guess_link(url)
    # Prompts are asked in the same order as the row's fields. Category and
    # source come first so the folder default reflects what the user entered
    # rather than only the automatic guesses.
    category = input(f'Category [{category_guess}]: ').strip() or category_guess
    source = input(f'Source [{source_guess}]: ').strip() or source_guess
    folder_guess = guess_folder(category or 'unknown', source or 'unknown')
    row = {
        'category': category,
        'source': source,
        'filetype': ftype,
        'folder': input(f'Folder name [{folder_guess}]: ').strip() or folder_guess,
        'url': url,
        'api_key': '',
        'cadence': input('Cadence (e.g., hourly) [hourly]: ').strip() or 'hourly',
        'last_fetched': '',
        'description': input(f'Description [{title}]: ').strip() or title,
        'link': input(f'Link [{link_guess}]: ').strip() or link_guess,
    }
    print('Proposed row:', row)
    if input('Add to catalog? [y/N] ').strip().lower().startswith('y'):
        # Appending to a file whose last line has no terminator would glue
        # the new record onto the previous one.
        existing = catalog_path.read_bytes()
        needs_newline = bool(existing) and not existing.endswith((b'\n', b'\r'))
        # pandas reads the catalog as UTF-8 by default, so write it the same
        # way instead of relying on the platform's locale encoding.
        with open(catalog_path, 'a', newline='', encoding='utf-8') as f:
            if needs_newline:
                f.write('\n')
            writer = csv.DictWriter(f, fieldnames=list(cat.columns))
            writer.writerow(row)
        # Reload so later guesses in this session can use the new row.
        cat = pd.read_csv(catalog_path)
        print('Row added.')
    else:
        print('Skipped.')
