In [1]:
class RssParser(object):
    """Reduces a parsed RSS feed to its title and the links of its entries."""

    def run(self, feed: dict) -> dict:
        """Summarise `feed` as ``{'title': ..., 'links': [...]}``.

        Although annotated as ``dict``, `feed` is read through attribute
        access (``feed.feed.title``, ``feed.entries``, ``entry.link``), so it
        must be an object that exposes those attributes.
        """
        # Read the title first so a feed without one fails before any entry
        # is inspected.
        title = feed.feed.title

        links = []
        for entry in feed.entries:
            links.append(entry.link)

        return {'title': title, 'links': links}

In [2]:
from abc import ABC, abstractmethod

class AbstractRequestor(ABC):
    """Interface for objects that turn a URL into a dict."""

    @abstractmethod
    def run(self, url: str) -> dict:
        """Process `url` and return the result as a dict.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """

In [3]:
import feedparser

class RssRequestor(AbstractRequestor):
    """Requestor that parses an RSS feed URL and hands the result to a parser."""

    # Parser applied to whatever `feedparser.parse` returns.
    parser: RssParser

    def __init__(self, parser: RssParser):
        """Store the `parser` that `run` will delegate to."""
        self.parser = parser

    def run(self, url:str) -> dict:
        """Parse the feed at `url` with feedparser and return the parser's output."""
        return self.parser.run(feedparser.parse(url))

In [4]:
# Try the RSS pipeline end to end against the ESPN NBA news feed.
parser = RssParser()
requestor = RssRequestor(parser)
response = requestor.run('https://www.espn.com/espn/rss/nba/news')

# Last expression of the cell: display the feed title and article links.
response

{'title': 'www.espn.com - NBA',
 'links': ['https://www.espn.com/nba/story/_/id/29039958/magic-johnson-sees-overlap-coronavirus-aids-crises',
  'https://www.espn.com/nba/story/_/id/29039317/gregg-popovich-gives-pep-talk-staff-san-antonio-food-bank',
  'https://www.espn.com/nba/story/_/id/29034717/chinese-basketball-association-pushes-back-restart-least-july',
  'https://www.espn.com/nba/story/_/id/29035779/after-bulls-shake-staff-court-zach-lavine-set-lead-chicago-charge-court',
  'https://www.espn.com/espn/story/_/id/29037099/donald-trump-includes-sports-execs-200-person-advisory-group',
  'https://www.espn.com/nba/story/_/id/29030877/mother-timberwolves-star-karl-anthony-towns-dies-due-complications-coronavirus',
  'https://www.espn.com/mma/story/_/id/29041258/shaquille-oneal-credits-nba-success-mma',
  'https://www.espn.com/nba/story/_/id/29026754/the-last-dance-know-michael-jordan-bulls-watching-doc',
  'https://www.espn.com/nba/allstar2014/story/_/page/dunk-2000/oral-history-2000-

In [5]:
from bs4 import BeautifulSoup

class AbstractWebsiteParser(ABC):
    """Interface for objects that turn an HTML document into a dict."""

    @abstractmethod
    def run(self, html: str) -> dict:
        """Parse `html` and return the result as a dict.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """
    
class EspnWebsiteParser(AbstractWebsiteParser):
    """Extracts the paragraph text of an article page, dropping page chrome."""

    # (tag name, CSS class) pairs removed before the text is collected.
    # A class of None means every tag with that name is removed.
    _BOILERPLATE_SELECTORS = (
        ('ul', 'article-social'),
        ('div', 'article-meta'),
        ('aside', None),
        ('div', 'teads-inread'),
        ('figure', None),
        ('div', 'cookie-overlay'),
    )

    def run(self, html: str) -> dict:
        """Return ``{'text': ...}`` with the page's `<p>` contents joined by newlines.

        Args:
            html: Raw HTML of the article page.

        Returns:
            A dict with a single ``'text'`` key holding the stripped,
            newline-joined text of every remaining `<p>` element.
        """
        bs = BeautifulSoup(html, 'html.parser')

        # Each selector is searched right before its matches are removed, so
        # tags that already went away with an earlier container are not
        # revisited.
        for tag_name, css_class in self._BOILERPLATE_SELECTORS:
            if css_class is None:
                matches = bs.find_all(tag_name)
            else:
                matches = bs.find_all(tag_name, css_class)
            for tag in matches:
                tag.decompose()

        # Flatten anchors to their plain text. `replace_with` is the current
        # Beautiful Soup 4 name for the deprecated `replaceWith` alias.
        for anchor in bs.find_all('a'):
            anchor.replace_with(anchor.text)

        paragraphs = [paragraph.text for paragraph in bs.find_all('p')]

        return {
            'text': '\n'.join(paragraphs).strip()
        }

In [6]:
import requests

class WebsiteRequestor(AbstractRequestor):
    """Requestor that downloads a web page and hands its HTML to a parser."""

    # Parser applied to the body of every successful response.
    parser: AbstractWebsiteParser
    # Passed to `requests.get` as its `timeout` argument.
    timeout_in_seconds: float

    def __init__(self, parser: AbstractWebsiteParser,
                 timeout_in_seconds: float = 30):
        """Store the parser and the per-request timeout.

        Args:
            parser: Parser that receives the response text.
            timeout_in_seconds: Timeout handed to `requests.get`; without one
                a stalled server would block the caller indefinitely.
        """
        self.parser = parser
        self.timeout_in_seconds = timeout_in_seconds

    def run(self, url: str) -> dict:
        """Fetch `url` and return the parser's output for the response body.

        Raises:
            AssertionError: If the response status code is not 200.
        """
        response = requests.get(url, timeout=self.timeout_in_seconds)

        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if response.status_code != 200:
            raise AssertionError(f'status code: {response.status_code}')

        return self.parser.run(response.text)

In [7]:
# Try the article pipeline end to end against a single ESPN story.
parser = EspnWebsiteParser()
requestor = WebsiteRequestor(parser)
response = requestor.run('https://www.espn.com/nba/story/_/id/29039958/magic-johnson-sees-overlap-coronavirus-aids-crises')

# Display only the first ten characters of the extracted text as a sample.
response['text'][:10]

'For five-t'

In [8]:
from typing import Iterator

import time

class Runner(object):
    """Walks the links of an RSS feed and yields each linked page's text.

    A delay is inserted between consecutive page requests.
    """

    # Produces a dict with a 'links' list for a feed URL.
    rss_requestor: 'AbstractRequestor'
    # Produces a dict with a 'text' entry for an article URL.
    website_requestor: 'AbstractRequestor'

    # Pause between two consecutive article requests.
    sleep_time_in_seconds: float

    def __init__(
        self,
        rss_requestor: 'AbstractRequestor',
        website_requestor: 'AbstractRequestor',
        sleep_time_in_seconds: float = 60,
    ):
        """Store the two requestors and the delay between article requests."""
        self.rss_requestor = rss_requestor
        self.website_requestor = website_requestor
        self.sleep_time_in_seconds = sleep_time_in_seconds

    def run(self, url: str) -> Iterator[tuple]:
        """Yield ``(link, text)`` for every link in the feed at `url`.

        Args:
            url: Feed URL passed to `rss_requestor.run`.

        Yields:
            Tuples of the article link and the ``'text'`` entry returned by
            `website_requestor.run` for that link.
        """
        feed = self.rss_requestor.run(url)
        for position, link in enumerate(feed['links']):
            # Pause only between requests: sleeping after the final article
            # would delay the end of iteration for no benefit.
            if position > 0:
                time.sleep(self.sleep_time_in_seconds)

            response = self.website_requestor.run(link)
            yield (link, response['text'])

In [None]:
import re

In [None]:
import os

output_directory = '../data/espn/nba/documents'
# Create the target directory up front so the first write does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(output_directory, exist_ok=True)

url = 'https://www.espn.com/espn/rss/nba/news'
rss_requestor = RssRequestor(RssParser())
website_requestor = WebsiteRequestor(EspnWebsiteParser())
runner = Runner(rss_requestor, website_requestor)

# Captures the path segment that follows "/id/" or "/page/" in an article
# link; compiled once instead of on every iteration.
story_id_pattern = re.compile(r'\/(?:id|page)\/([^/]+)\/')

for link, text in runner.run(url):

    story_id_search = story_id_pattern.search(link)
    assert story_id_search is not None, f'error: {link}'

    story_id = story_id_search.group(1)
    article_path = f'{output_directory}/{story_id}.txt'

    # Explicit UTF-8 so non-ASCII article text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(article_path, 'w', encoding='utf-8') as output:
        output.write(text)

    print(f'finished: {article_path}')

finished: ../data/espn/nba/documents/29039958.txt
finished: ../data/espn/nba/documents/29039317.txt
finished: ../data/espn/nba/documents/29034717.txt
