# RSS feed server for websites that no longer publish an RSS feed

- This is a prototyping notebook to test the `newspaper` and `rfeed` libraries.
- Tried to get RSS feeds from Associated Press.
- RSS.app worked...while their free trial lasted. Then they wanted me to pay.
- And I was like, "I *know* I can build something like this in Python."

In [None]:
import os
import newspaper
import rfeed
import tqdm
from markupsafe import escape
import pandas as pd

In [None]:
# Utility function to easily pretty-print entire dataframe in Jupyter
from IPython.display import display


def show_entire_df(df):
    """Display `df` with pandas' row and column display limits lifted."""
    # The limits are only lifted inside the `with` block; pandas restores the
    # previous option values on exit.
    no_limits = ("display.max_rows", None, "display.max_columns", None)
    with pd.option_context(*no_limits):
        display(df)

In [None]:
# Sites we would like to scrape, keyed by feed id.
# Every entry carries a short name, a full name and the URL to crawl.
websites = dict(
    ap_main=dict(
        shortname="AP News Main Page",
        fullname="Associated Press News - Main Page",
        url="https://apnews.com/",
    ),
    # Currently disabled entries:
    #
    # ap_politics=dict(
    #     shortname="AP News - Politics",
    #     fullname="Associated Press News - Politics",
    #     url="https://apnews.com/hub/politics",
    # ),
    # ap_coronavirus=dict(
    #     shortname="AP News - Coronavirus",
    #     fullname="Associated Press News - Coronavirus",
    #     url="https://apnews.com/hub/coronavirus-pandemic",
    # ),
)

In [None]:
def get_site_articles(url):
    """
    Scrape a news site and convert each of its articles into an RSS feed item.

    Parameters
    ----------
    url : str
        Address of the page handed to ``newspaper.build``.

    Returns
    -------
    list of rfeed.Item
        One item per article that could be downloaded, parsed and summarized,
        in the order ``site.article_urls()`` listed them. Articles that
        newspaper could not process are skipped and reported.
    """
    # memoize_articles=False asks newspaper not to filter out article URLs it
    # has already seen on previous runs.
    site = newspaper.build(url, memoize_articles=False)

    feed_items = []
    for art_url in tqdm.tqdm(site.article_urls()):
        article = newspaper.Article(art_url)
        try:
            article.download()
            article.parse()
            # nlp() populates article.summary, which becomes the description.
            article.nlp()
        except newspaper.ArticleException as err:
            # A single unreachable or unparsable article must not abort the
            # whole scrape; tqdm.write keeps the progress bar intact.
            tqdm.tqdm.write(f"Skipping {art_url}: {err}")
            continue

        feed_items.append(
            rfeed.Item(
                title=article.title,
                link=article.url,
                description=article.summary,
                creator=", ".join(article.authors),
                guid=rfeed.Guid(guid=article.url),
                pubDate=article.publish_date,
            )
        )

    return feed_items

In [None]:
# Try the scraper out on the AP News site
articles = get_site_articles("https://apnews.com")

In [None]:
# Visualize feed items
#
# Each row is built from a *copy* of the item's attributes, with the Guid
# object replaced by its string value. Assigning into `article.__dict__`
# directly would overwrite the Guid on the feed items held in `articles`,
# and re-running this cell would then fail on `str.guid`.
article_data = {
    label: {**vars(article), "guid": article.guid.guid}
    for label, article in enumerate(articles)
}

df = pd.DataFrame.from_dict(
    data=article_data,
    orient="index",
)

# Attributes that are not interesting in the overview table.
# errors="ignore" keeps this cell working when a column is absent, e.g. when
# no articles were scraped and the frame has no columns at all.
show_entire_df(
    df.drop(
        columns=[
            "handler",
            "extensions",
            "author",
            "comments",
            "enclosure",
            "source",
            "categories",
        ],
        errors="ignore",
    )
)

In [None]:
def get_site_feed(feed_id):
    """
    Build the RSS output for one of the sites configured in ``websites``.

    Parameters
    ----------
    feed_id : str
        Key of the site in the ``websites`` dict.

    Returns
    -------
    The result of ``rfeed.Feed.rss()`` for a feed whose title, link and
    description come from the site's ``shortname``, ``url`` and ``fullname``,
    and whose items are the articles scraped from that URL.

    Raises
    ------
    KeyError
        If ``feed_id`` is not a key of ``websites``.
    """
    # Look the id up as given. HTML-escaping belongs on output, not on dict
    # keys: it would turn e.g. "a&b" into "a&amp;b" and miss a valid entry.
    site_info = websites[feed_id]

    feed_items = get_site_articles(site_info["url"])

    feed = rfeed.Feed(
        title=site_info["shortname"],
        link=site_info["url"],
        description=site_info["fullname"],
        items=feed_items,
    )

    return feed.rss()