In [None]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [None]:
# Resolve the chromedriver path via webdriver_manager, then start a Chrome
# browser in splinter using that path
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', **executable_path)

Also, because the intention is to reuse this code often, we need to update our scraping.py script to use functions. Each major scrape, such as the news title and paragraph or featured image, will be divided into a self-contained, reusable function. Let's take a look at our code.

Instead of having our title and paragraph printed within the function, we want to return them from the function so they can be used outside of it. We'll adjust our code to do so by deleting the standalone news_title and news_p lines and including both variables in the return statement instead, as shown below.

When we add "browser" as a parameter of our function, we're telling Python that the function will receive the browser object we created outside the function; the caller passes it in as an argument. All of our scraping code utilizes an automated browser, and without this parameter, our function wouldn't work.

In [None]:
def mars_news(browser):
    """Scrape the latest headline and teaser from the NASA Mars news page.

    Args:
        browser: The splinter browser used to load the page.

    Returns:
        A ``(news_title, news_p)`` tuple of strings, or ``(None, None)`` when
        the expected markup cannot be found in the page.
    """
    # Load the Mars news listing
    browser.visit('https://mars.nasa.gov/news/')

    # Optional delay: wait briefly for the article list to show up. The result
    # is ignored on purpose; a missing list is handled by the except below.
    item_selector = "ul.item_list li.slide"
    browser.is_element_present_by_css(item_selector, wait_time=1)

    # Parse the page the browser currently holds (the browser stays open)
    page = soup(browser.html, 'html.parser')

    # When the slide or either div is missing, the lookup yields None and the
    # following attribute access raises AttributeError
    try:
        first_slide = page.select_one(item_selector)

        # Headline text lives in the 'content_title' div of the first slide
        title_div = first_slide.find("div", class_="content_title")
        news_title = title_div.get_text()

        # Summary text lives in the 'article_teaser_body' div
        teaser_div = first_slide.find("div", class_="article_teaser_body")
        news_p = teaser_div.get_text()
    except AttributeError:
        return None, None

    return news_title, news_p

# Featured Image

The code to scrape the featured image will be updated in almost the exact same way we just updated the mars_news section.

In [None]:
def featured_image(browser):
    """Return the absolute URL of the featured image on the JPL Space page.

    Args:
        browser: The splinter browser used to load and interact with the page.

    Returns:
        The absolute image URL as a string, or ``None`` when the full-size
        image tag is missing or carries no usable ``src`` attribute.

    Note:
        Errors raised by the browser calls themselves (visiting the page,
        looking up or clicking the button) are not caught here.
    """
    # Visit URL
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)

    # Find and click the full image button (the second <button> on the page)
    full_image_elem = browser.find_by_tag('button')[1]
    full_image_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = soup(html, 'html.parser')

    # find() returns None when the tag is absent, so .get() raises
    # AttributeError in that case
    try:
        # Find the relative image url
        img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')

    except AttributeError:
        return None

    # Tag.get() returns None (it does not raise) when 'src' is absent; without
    # this guard the function would return a bogus URL ending in "None", or
    # the bare base URL for an empty src
    if not img_url_rel:
        return None

    # Use the base url to create an absolute url
    img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'

    return img_url

# Mars Facts

Code for the facts table will be updated in a similar manner to the other two. This time, though, we'll be adding BaseException to our except block for error handling.

BaseException is the root of Python's exception hierarchy, so an `except BaseException` block is a catchall: it handles every exception, built-in or user-defined, including KeyboardInterrupt and SystemExit. We're using a broad handler here because we're using Pandas' read_html() function to pull data, instead of scraping with BeautifulSoup and Splinter. The data is returned a little differently and can result in errors other than AttributeErrors, which is what we've been addressing so far. (Outside of a tutorial, `except Exception` is usually the safer choice, because it still lets Ctrl+C and interpreter shutdown through.)

In [None]:
def mars_facts():
    """Scrape the Mars facts table and return it as an HTML table string.

    Returns:
        The first table found on the space-facts Mars page, rendered with
        ``DataFrame.to_html()``, or ``None`` when the table cannot be read.
    """
    # read_html can fail in several ways (network errors, no tables found,
    # missing parser), so the handler is broad. It catches Exception rather
    # than BaseException so that KeyboardInterrupt and SystemExit still
    # propagate instead of being silently turned into a None result.
    try:
        # Use 'read_html' to scrape the facts table into a dataframe
        df = pd.read_html('http://space-facts.com/mars/')[0]

    except Exception:
        return None

    # Assign columns and set index of dataframe
    df.columns = ['Description', 'Mars']
    df.set_index('Description', inplace=True)

    # Convert dataframe into HTML format
    return df.to_html()

Now you're ready to integrate Mongo.