In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
# NOTE: time.sleep() is used so that each page the browser visits
# is given time to load before moving on to the next page or
# pulling the soup.
# Feel free to remove the sleep timers to make the notebook run faster,
# but beware of the errors that may cause.

In [2]:
# initialize browser
# Relative path to the chromedriver binary that splinter hands to Chrome.
executable_path = 'drivers/chromedriver'
# Shared browser session used by every scraping cell below;
# headless=False is passed so the run is not headless.
browser = Browser('chrome', executable_path=executable_path, headless=False)

In [3]:
# urls to scrape -- one per section of the notebook
# NASA Mars News section
news_url = 'https://mars.nasa.gov/news/'
# JPL Mars Space Images section
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
# Mars Weather section
tweet_url = 'https://twitter.com/marswxreport?lang=en'
# Mars Facts section
facts_url = 'https://space-facts.com/mars/'
# Mars Hemispheres section
hemis_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

## NASA Mars News

In [4]:
# go to news page
browser.visit(news_url)
# pause so the page has time to load before its HTML is read
time.sleep(5)

In [5]:
# parse the HTML currently held by the browser with the lxml parser
news_soup = bs(browser.html, 'lxml')

In [None]:
# Grab the first matching element for the article title, teaser and date.
# string=True restricts each match to an element that has a string value.
news_title_query = news_soup.find(string=True, class_='content_title')
news_snippet_query = news_soup.find(string=True, class_='article_teaser_body')
news_date_query = news_soup.find(string=True, class_='list_date')

# Collect the text of each match into a dictionary, one entry at a time.
news_dict = {}
news_dict['news_title'] = news_title_query.text
news_dict['news_p'] = news_snippet_query.text
news_dict['date'] = news_date_query.text

In [None]:
# Show the collected article data. The date is left as a plain string,
# which is fine because it is only going to be posted again as-is.
news_dict

## JPL Mars Space Images - Featured Image

In [None]:
# go to the JPL image gallery and give it time to load
browser.visit(image_url)
time.sleep(5)

# Base url for getting HiRes images. The image path collected from the page
# is partial and gets appended to this value, so the scheme must be included:
# without "https://" the concatenated result is not an absolute URL and is
# treated as a relative path when used as a link or image source.
jpl_url = 'https://www.jpl.nasa.gov'

In [None]:
# Clicking the link whose text contains 'FULL IMAGE' stays on the same page
# but opens a collapsed overlay.
browser.click_link_by_partial_text('FULL IMAGE')
# pause so the overlay has time to appear before the next click
time.sleep(5)

In [None]:
# The link whose text contains 'more info' leads to another page
# containing the full-size image.
browser.click_link_by_partial_text('more info')
# pause so that page has time to load before its HTML is read
time.sleep(5)

In [None]:
# parse the HTML of the page the browser is on now (reached via the clicks above)
image_soup = bs(browser.html, 'lxml')

In [None]:
# The first element with class 'main_image' carries a partial image path in
# its src attribute; join it onto the base url to form the featured image link.
image_query = image_soup.find(class_='main_image')
featured_image_url = ''.join((jpl_url, image_query['src']))
featured_image_url

## Mars Weather

In [None]:
# go to the twitter page given by tweet_url
browser.visit(tweet_url)
# pause so the page has time to load before its HTML is read
time.sleep(5)

In [None]:
# parse the HTML currently held by the browser with the lxml parser
tweet_soup = bs(browser.html, 'lxml')

In [None]:
# find top tweet: the first element in the document with class 'tweet-text'
tweet_query = tweet_soup.find(class_='tweet-text')

In [None]:
# Keep only the text before the first newline; anything below the
# weather report (an image link or similar) is discarded.
mars_weather = tweet_query.text.partition('\n')[0]
mars_weather

## Mars Facts

In [6]:
# Use the built-in pandas function to scrape the html tables;
# the result is a list of dataframes.
facts_raw_list = pd.read_html(facts_url)

# Take the first dataframe as its own variable, then rename its columns.
# facts_df is the same object as facts_raw_list[0], so both see the new names.
facts_df = facts_raw_list[0]
facts_df.columns = ['feature', 'value']

In [12]:
# Render the dataframe back to an html table in one chain:
#   1. no index column and no column headers,
#   2. strip the newline characters that to_html() inserts,
#   3. swap the 'dataframe' text for 'table table-dark'.
facts_html = (
    facts_df
    .to_html(index=False, header=False)
    .replace('\n', '')
    .replace('dataframe', 'table table-dark')
)
facts_html

'<table border="1" class="table">  <tbody>    <tr>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <td>Mass:</td>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>Surface Temperature:</td>      <td>-153 to 20 °C</td>    </tr>    <tr>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

## Mars Hemispheres

In [None]:
# Collect a list of dictionaries, one per hemisphere, each shaped
# {'title': <name>, 'img_url': <link>}.
hemisphere_image_urls = []

# go to the results page first to find the links to click
browser.visit(hemis_url)
time.sleep(5)

# make soup
hemis_soup = bs(browser.html, 'lxml')

# every element with class 'item' is one result; its <h3> holds the link title
hemis_query = hemis_soup.find_all(class_='item')

# now loop through the website once per result to collect images
for result in hemis_query:

    # the link title is used both for clicking and for the output name
    link_text = result.h3.text

    # start from the results page each time
    browser.visit(hemis_url)
    time.sleep(5)

    # click through to the page holding the image,
    # using the initial query result as a guide
    browser.click_link_by_partial_text(link_text)
    # Give the clicked-through page time to load before reading browser.html,
    # as is done after every other navigation in this notebook; without the
    # pause the soup may be made from a page that has not finished loading.
    time.sleep(5)

    # make soup of page with image
    hemis_temp_soup = bs(browser.html, 'lxml')

    # make outputs for resulting dictionary:
    # the title drops the ' Enhanced' suffix, and the image link is the href
    # of the first element on the page that has target='_blank'
    hemi_name = link_text.replace(' Enhanced', '')
    hemi_img_url = hemis_temp_soup.find(target='_blank')['href']

    # and append to results list
    hemisphere_image_urls.append({'title':hemi_name, 'img_url':hemi_img_url})


display(hemisphere_image_urls)

# remember to close the browser!
browser.quit()

In [17]:
# NOTE(review): in top-to-bottom order this cell runs after browser.quit()
# in the hemispheres cell, and its execution count is out of sequence;
# looks like leftover scratch work -- confirm before keeping.
browser.visit(hemis_url)

In [5]:
# NOTE(review): scratch cell with an out-of-sequence execution count;
# parses whatever page the browser currently holds.
soup = bs(browser.html, 'lxml')

In [18]:
# NOTE(review): scratch cell. Takes every element with class 'description'
# and reads the <h3> text of the first one; note that `result` is also the
# loop variable name used in the hemispheres cell.
result = soup.find_all(class_='description')
name = result[0].h3.text

In [19]:
# NOTE(review): scratch cell; clicks the link whose text contains `name`.
# In top-to-bottom order the browser has already been quit by this point.
browser.click_link_by_partial_text(name)