# RSS feed server for websites that no longer publish RSS feeds

- This is a prototyping notebook to test the `newspaper` and `rfeed` libraries.
- I tried to get RSS feeds from the Associated Press (AP).
- RSS.app worked... while its free trial lasted. Then they wanted me to pay.
- And I was like, "I *know* I can build something like this in Python."

In [1]:
import os
import sys
import newspaper
import rfeed
import tqdm
from markupsafe import escape

In [2]:
# Remove the HTTP_PROXY setting from this process's environment (presumably so
# the scraping requests below bypass the proxy -- confirm).
# The `None` default keeps the cell re-runnable: without it, pop() raises
# KeyError whenever HTTP_PROXY is not set, including the second time this
# cell is executed in the same kernel.
os.environ.pop("HTTP_PROXY", None)

'http://proxyout.lanl.gov:8080'

In [3]:
# Registry of the sites to scrape, keyed by feed id. Each entry carries a
# short display name, a long display name, and the page URL to scan.
websites = dict(
    ap_main=dict(
        shortname="AP News Main Page",
        fullname="Associated Press News - Main Page",
        url="https://apnews.com/",
    ),
    ap_politics=dict(
        shortname="AP News - Politics",
        fullname="Associated Press News - Politics",
        url="https://apnews.com/hub/politics",
    ),
    ap_coronavirus=dict(
        shortname="AP News - Coronavirus",
        fullname="Associated Press News - Coronavirus",
        url="https://apnews.com/hub/coronavirus-pandemic",
    ),
)

In [8]:
def get_site_articles(url, max_articles=2):
    """
    Download articles from a news site and wrap them as RSS feed items.

    The page at ``url`` is scanned with ``newspaper.build`` to discover
    article URLs. Up to ``max_articles`` of them are downloaded, parsed and
    run through ``Article.nlp()``, and each one becomes an ``rfeed.Item``.

    Parameters
    ----------
    url : str
        Address of the news site (or section page) to scan.
    max_articles : int or None, optional
        Maximum number of discovered articles to process. Defaults to 2,
        the limit this function has always applied. Pass ``None`` to
        process every discovered article.

    Returns
    -------
    list of rfeed.Item
        One item per processed article, in discovery order.

    Raises
    ------
    ValueError
        If ``max_articles`` is negative.
    """
    if max_articles is not None and max_articles < 0:
        raise ValueError("max_articles must be non-negative or None")

    # memoize_articles=False turns off newspaper's article memoization for
    # this build.
    site = newspaper.build(url, memoize_articles=False)

    # Apply the limit to the URL list so no Article object is created for a
    # page that will never be downloaded. A slice bound of None means "all".
    selected_urls = site.article_urls()[:max_articles]
    articles = [newspaper.Article(art_url) for art_url in selected_urls]

    feed_items = []
    for article in tqdm.tqdm(articles):
        # nlp() is called only after download() and parse() have run.
        article.download()
        article.parse()
        article.nlp()

        # TODO: consider guid=rfeed.Guid(article.url). An earlier draft used
        # the site `url` here, which would give every item the same guid.
        feed_items.append(
            rfeed.Item(
                title=article.title,
                link=article.url,
                description=article.summary,
                creator=", ".join(article.authors),
                pubDate=article.publish_date,
            )
        )

    return feed_items

In [6]:
# Scratch: build the newspaper source for the AP main page (article
# memoization turned off) for the exploratory cells that follow.
site = newspaper.build(websites["ap_main"]["url"], memoize_articles=False)

In [8]:
# Wrap every discovered article URL in a newspaper.Article object.
articles = list(map(newspaper.Article, site.article_urls()))

In [13]:
# Article objects built from a bare URL have no content yet, so fetch and
# parse this one before asking for NLP output; calling nlp() alone only
# works if some earlier (no longer present) cell already downloaded it.
# NOTE(review): index 51 is an arbitrary sample and assumes the site
# currently lists at least 52 article URLs.
articles[51].download()
articles[51].parse()
articles[51].nlp()

In [14]:
# Display the `summary` attribute of the article that nlp() was just run on.
articles[51].summary

'(AP Photo/Vahid Salemi)QOM, Iran (AP) — In Iran’s holy city of Qom, where Shiite scholars study and pilgrims travel to a shrine believed to be a gate to heaven, the Islamic Republic’s coronavirus outbreak began and still rages to this day.\nWhile Iran works to vaccinate its 80 million people, many in Qom have not sought out the shots, authorities say.\nOverall across Iran — the Middle Eastern country hardest hit by the pandemic — there have been 5.5 million confirmed virus infections.\nIt was in Qom, some 125 kilometers (80 miles) southwest of Tehran, that the coronavirus first took hold in Iran.\nBut whatever started the pandemic here, the virus still rages.'

In [7]:
# List the article URLs newspaper discovered on the site.
# NOTE(review): this displays the whole list; slice it (e.g. `[:10]`) to
# keep the saved output short.
site.article_urls()

['https://apnews.com/hub/ap-top-25-college-football-poll?utm_source=apnewsnav&utm_medium=featured',
 'https://apnews.com/article/business-elections-germany-economy-national-elections-45048b76c22aee85ba7888ae03034283',
 'https://apnews.com/article/united-nations-general-assembly-afghanistan-united-nations-taliban-90837664228c6cd964432655135e33ea',
 'https://apnews.com/article/courts-ronald-reagan-jodie-foster-john-hinckley-c97c9e5f09d8f10e1b42290164965a0d',
 'https://apnews.com/article/business-montana-amtrak-d33d78d5aa0da2abf8124162ed6b678f',
 'https://apnews.com/article/anita-hill-waits-for-change-30-years-after-testimony-c60059b82560e0fdadaf0ef1d1510e91',
 'https://apnews.com/article/beyond-impossible-join-crowded-plant-based-chicken-market-41d42018cc99efb6caeac4d151b02e34',
 'https://apnews.com/article/climate-change-united-nations-general-assembly-lifestyle-business-environment-and-nature-b46559e9a86f2f6873a5c45f2a83b811',
 'https://apnews.com/article/business-technology-aefc9f4574

In [11]:
# Fetch feed items for the AP main page.
# NOTE(review): this rebinds `articles`. An earlier cell bound it to a list
# of newspaper.Article objects; from here on it holds the list of rfeed.Item
# objects returned by get_site_articles.
articles = get_site_articles(websites["ap_main"]["url"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.59it/s]


In [None]:
def get_site_feed(feed_id):
    """
    Build the RSS document for one of the configured sites.

    Parameters
    ----------
    feed_id : str
        Key into the module-level ``websites`` mapping (e.g. ``"ap_main"``).

    Returns
    -------
    str
        The feed as serialised by ``rfeed.Feed.rss()``.

    Raises
    ------
    KeyError
        If ``feed_id`` is not a key of ``websites``.
    """
    # Look the id up verbatim. HTML-escaping is for values rendered into
    # markup, not for dictionary keys: escaping here would rewrite an id
    # containing &, <, > or quotes into a different string that cannot match
    # the key it was meant to find.
    try:
        site_info = websites[feed_id]
    except KeyError:
        raise KeyError(
            "Unknown feed id {!r}; expected one of {}".format(
                feed_id, sorted(websites)
            )
        ) from None

    feed_items = get_site_articles(site_info["url"])

    feed = rfeed.Feed(
        title=site_info["shortname"],
        link=site_info["url"],
        description=site_info["fullname"],
        items=feed_items,
    )

    return feed.rss()

In [21]:
# Metadata entry (shortname / fullname / url) for the AP main page, used to
# assemble a feed by hand in the next cell.
site_info = websites["ap_main"]

In [22]:
# Assemble the feed by hand from the AP main-page metadata and the items
# currently bound to `articles`.
feed = rfeed.Feed(
    items=articles,
    title=site_info["shortname"],
    link=site_info["url"],
    description=site_info["fullname"],
)

In [23]:
# Serialise the feed to its RSS XML string and display it.
feed.rss()

'<?xml version="1.0" encoding="UTF-8"?>\n<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/"><channel><title>AP News Main Page</title><link>https://apnews.com/</link><description>Associated Press News - Main Page</description><generator>rfeed v1.1.1</generator><docs>https://github.com/svpino/rfeed/blob/master/README.md</docs><item><title>AP Top 25 Poll</title><link>https://apnews.com/hub/ap-top-25-college-football-poll?utm_source=apnewsnav&amp;utm_medium=featured</link><description>AP Top 25 PollThe Associated Press began its college football poll on Oct. 19, 1936, and it is now the longest-running poll of those that award national titles at the end of the season.\nThe preseason poll was started in 1950.\nA panel of 62 sports writers and broadcasters from around the country votes on the poll weekly.</description><dc:creator></dc:creator></item><item><title>Uncertain start to post-Merkel era after close German vote</title><link>https://apnews.com/article/business-elections-ge

In [None]:
# End-to-end check: build the AP main-page feed through get_site_feed.
# NOTE(review): get_site_feed returns the result of feed.rss(), so this
# rebinds `feed` from an rfeed.Feed object to the serialised string.
feed = get_site_feed("ap_main")