In [None]:
from time import time
from time import sleep
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from bs4 import BeautifulSoup
import requests

In [None]:
def process_page(soup, news):  
  
  raw_news = soup.select('article')

  # same as above, extract the info we need
  for story in raw_news:
    
    author = story.select_one('span[style="white-space:nowrap"]').get_text() # extract the author

    try:
      # I'm sure there is a better way of filtering out sponsored posts
      if author != ('Space.com Staff' or 'SPACE.com Staff'):
        headline = story.select_one('h3[class="article-name"]').get_text() # extract the headline
        synopsis = story.select_one('p[class="synopsis"]').get_text().strip() # extract the synopsis
        date_time = story.p.time['data-published-date'] # extract the date and time
      
        story = {
          'headline': headline, 
          'author': author, 
          'synopsis': synopsis,
          'date and time': date_time} # construct a dictionary

        news.append(story) # add dictionary to list
    
    except TypeError:
      print('Sponsored Story excluded')
    
    

In [None]:
# prepare for the monitoring logic
start_time = time() # note the system time when the program starts
request_count = 0 # track the number of requests made

# create a list to store the data in
news = []

# variables to handle the request loop
has_next_page = True
MAX_REQUESTS = 9 # do not request more than 100 pages
page_number = 1
url = 'https://www.space.com/news/' + str(page_number)
headers = {'user-agent': 'news scraper - school project (riley.stange@gmail.com)'}

In [13]:
while has_next_page and request_count <= MAX_REQUESTS:
  # keep the output clear
  clear_output(wait = True)
  
  # make an initial request
  response = requests.get(url, headers=headers)

  # make sure we got a valid response
  if(response.ok):
    # get the full data from the response
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    process_page(soup, news)

    # check for the next page
    # look for the presence of element with class *test-pagination-next*
    next_button = soup.select('span[class="listings-pagination-button listings-next "]')
    has_next_page = len(next_button) > 0
    
  else:
    # display a warning if there are any problems
    warn('Request #: {}, Failed with status code: {}'.format(request_count, response.status_code))
  
  request_count += 1
  
  # go to sleep for a bit
  # we use a random number between 1 and 5 so
  # We can wait as long as 5 seconds to make a second request
  
  sleep(randint(1,3))
  
  # output some logs for monitoring
  elapsed_time = time() - start_time
  print('Requests: {}, Frequency: {} requests/s, {} news stories processed.'.format(request_count, request_count/elapsed_time, len(news)))
  
  # prepare for next iteration
  page_number += 1
      
print('Scraping complete')
print('Requests: {}, Frequency: {} requests/s, {} news stories processed.'.format(request_count, request_count/elapsed_time, len(news)))

Requests: 10, Frequency: 0.3326724545098831 requests/s, 180 news stories processed.
Scraping complete
Requests: 10, Frequency: 0.3326724545098831 requests/s, 180 news stories processed.


In [14]:
news

[{'author': 'Doris Elin Urrutia',
  'date and time': '2019-09-30T19:38:21Z',
  'headline': 'Crowded Space Station: There Are 9 People from 4 Different Space Agencies in Orbit Right Now',
  'synopsis': "It's a bit crowded at the International Space Station right now."},
 {'author': 'Tariq Malik',
  'date and time': '2019-09-30T18:21:38Z',
  'headline': "Today's the Last Chance to Send Your Name to Mars on NASA's 2020 Rover",
  'synopsis': "Today's the last day to add your name to the more than 9 million already signed up."},
 {'author': 'Mike Wall',
  'date and time': '2019-09-30T17:31:12Z',
  'headline': 'Centaurs Rising: NASA Eyes Missions to Weird Asteroid-Comet Hybrids',
  'synopsis': 'The Centaurs may get their first-ever close-up soon.'},
 {'author': 'Meghan Bartels',
  'date and time': '2019-09-30T17:29:35Z',
  'headline': "NASA Hands Out $43 Million for 'Tipping Point' Tech for the Moon and Mars",
  'synopsis': 'NASA is funding more than a dozen technology projects that could ai