# Ingestion Experiments 

Right now, we ingest information using RSS feeds. Can we do this in a more exhaustive manner?

One conclusion from this process is that agents may not be the right choice here, at least for PBS content. The pages are well and consistently structured, so our hardcoded rules allow our pipelines to run to completion.

As we explore other sites with less reliable structures, agents may be worth reconsidering, but for now the added complexity, in my mind, isn't worth it.

In [21]:
# Root of the PBS NewsHour site; section listing URLs are built on top of it.
PBS_NEWSHOUR_BASE_URL = "https://www.pbs.org/newshour"

# Request headers sent with every fetch: a desktop-Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

# Section slugs to crawl, in crawl order.
sections = [
    "latest",
    "politics",
    "arts",
    "nation",
    "world",
    "economy",
    "science",
    "health",
    "education",
]

In [None]:
import requests 
from bs4 import BeautifulSoup
from tqdm import tqdm

def get_section_urls(section: str, pages_to_search: int, timeout: float = 30.0) -> list:
    """Collect article links from the first listing pages of one section.

    Args:
        section: Section slug as it appears in the URL path
            (e.g. "latest", "politics").
        pages_to_search: Number of listing pages to fetch, starting at page 1.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        The non-empty ``href`` values of the matching title links, in the
        order they were encountered. A page whose expected wrapper ``div``
        is absent contributes no links.

    Raises:
        requests.HTTPError: If a listing page responds with an error status.
        requests.Timeout: If a listing page does not respond within ``timeout``.
    """
    # The markup to look for depends only on the section, so choose the
    # wrapper and link classes once instead of on every page.
    if section == "latest":
        wrapper_class_name = "latest__wrapper"
        link_class_name = "card-timeline__title"
    else:
        wrapper_class_name = "archive__wrapper"
        if section in ("politics", "arts"):
            link_class_name = "card-horiz__title"
        else:
            link_class_name = "card-lg__title"

    section_urls = []

    for i in tqdm(range(1, pages_to_search + 1), desc=f"Fetching {section.upper()} URLs"):
        # construct the URL for each page in the section
        url = f"{PBS_NEWSHOUR_BASE_URL}/{section}/page/{i}"

        # Without a timeout a stalled connection would block the crawl forever.
        page = requests.get(url, headers=HEADERS, timeout=timeout)
        # Fail with a clear HTTP error rather than an AttributeError while
        # parsing an error page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        wrapper = soup.find("div", class_=wrapper_class_name)
        if wrapper is None:
            # No listing container on this page: nothing to collect from it.
            continue

        hrefs = (link.get("href") for link in wrapper.find_all("a", class_=link_class_name))
        # Drop anchors without an href so callers never receive None/"" URLs.
        section_urls.extend(href for href in hrefs if href)

    return section_urls

In [84]:
import requests 
from bs4 import BeautifulSoup

def get_page_data(url: str, section: str, timeout: float = 30.0) -> dict:
    """Fetch one page and extract its text and basic metadata.

    Args:
        url: Address of the page to fetch.
        section: Section label to record for this page; stored as given.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``url``, ``slug``, ``publication_date``
        (text of the first ``<time>`` element, or ``None`` if there is none),
        ``authors`` (list of byline names), ``content`` (text of every ``<p>``
        on the page joined by spaces) and a nested dict holding
        ``content_type`` ("video" or "article"), ``section``,
        ``short_description`` and ``tags`` (the last two are always ``None``).

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the page does not respond within ``timeout``.
    """
    # Without a timeout a stalled connection would block the whole run.
    page = requests.get(url, headers=HEADERS, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # Strip a trailing slash first so ".../some-story/" still yields "some-story".
    slug = url.rstrip("/").split("/")[-1]

    time_tag = soup.find("time")
    publication_date = time_tag.get_text(strip=True) if time_tag is not None else None

    authors = []
    for author in soup.find_all("a", class_="post__byline-name-unhyphenated"):
        name_tag = author.find("span")
        # Fall back to the anchor's own text when it has no inner <span>.
        source_tag = name_tag if name_tag is not None else author
        authors.append(source_tag.get_text(strip=True))

    paragraphs = soup.find_all("p")
    content = " ".join(paragraph.get_text(strip=True) for paragraph in paragraphs)

    # Only the first class on <main> is inspected. A missing <main> or class
    # attribute is treated as "not a video".
    main_tag = soup.find("main")
    main_classes = main_tag.get("class") if main_tag is not None else None
    main_classname = main_classes[0] if main_classes else ""
    content_type = "video" if "video" in main_classname else "article"

    return {
        "url": url,
        "slug": slug,
        "publication_date": publication_date,
        "authors": authors,
        "content": content,
        # NOTE(review): this key is misspelled ("meatadata"). It is kept as-is
        # because consumers outside this function may already read it; rename
        # it to "metadata" together with those consumers.
        "meatadata": {
            "content_type": content_type,
            "section": section,
            "short_description": None,
            "tags": None,
        },
    }

### Get List of Page URLs

In [120]:
# Number of listing pages to crawl per section (the same for every section).
pages_to_search = 20

# Pool links in a set so a URL listed under several sections is kept once.
all_urls = set()

for section in sections:
    section_urls = get_section_urls(section, pages_to_search)
    all_urls |= set(section_urls)

# Hand later cells a list; its order is whatever the set iteration yields.
all_urls = list(all_urls)

Fetching LATEST URLs: 100%|██████████| 20/20 [00:20<00:00,  1.04s/it]
Fetching POLITICS URLs: 100%|██████████| 20/20 [00:25<00:00,  1.27s/it]
Fetching ARTS URLs: 100%|██████████| 20/20 [00:20<00:00,  1.03s/it]
Fetching NATION URLs: 100%|██████████| 20/20 [00:24<00:00,  1.22s/it]
Fetching WORLD URLs: 100%|██████████| 20/20 [00:22<00:00,  1.15s/it]
Fetching ECONOMY URLs: 100%|██████████| 20/20 [00:21<00:00,  1.07s/it]
Fetching SCIENCE URLs: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s]
Fetching HEALTH URLs: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Fetching EDUCATION URLs: 100%|██████████| 20/20 [00:18<00:00,  1.11it/s]


### Get Data for Each Page

In [132]:
from time import sleep 

from tqdm import tqdm
from time import sleep

# One record per collected URL, in the order of `all_urls`.
all_data = []

for i, url in enumerate(tqdm(all_urls, desc="Fetching page data")):
    # Throttle: pause 15 s before every 10th request, but not before the first.
    if i and not i % 10:
        sleep(15)

    # Index 4 of the "/"-split URL is the path segment after "newshour".
    # NOTE(review): assumes absolute URLs under PBS_NEWSHOUR_BASE_URL — confirm.
    data = get_page_data(url=url, section=url.split("/")[4])
    all_data.append(data)


Fetching page data: 100%|██████████| 500/500 [16:48<00:00,  2.02s/it]
