In [1]:
#%%writefile scrape_mars.py

# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import requests
import pymongo
import time

# Start a visible (non-headless) Chrome session through splinter, driven by
# the chromedriver binary at the path below
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser = Browser("chrome", headless=False, **executable_path)


# Load the NASA Mars news page in the browser rather than with requests:
# a plain HTTP fetch would capture the page before its JavaScript has
# finished, so the rendered HTML is taken from splinter instead
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')


# find() returns the first matching element, so these are the title and
# description blocks of the first news entry in the rendered page
title_div = soup.find('div', class_='content_title')
teaser_div = soup.find('div', class_='rollover_description_inner')
news_title = title_div.text
news_p = teaser_div.text




# Open the JPL space images listing (Mars category); base_url is the site
# root that the scraped image path is appended to further down
base_url = "https://www.jpl.nasa.gov"
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)


# XPath selector for the anchor whose id is "full_image"
xpath = '//a[@id="full_image"]'


# Click the first element matched by the selector to bring up the full image
results = browser.find_by_xpath(xpath)
results[0].click()


# Pause so the browser can finish loading, then parse the current page and
# read the `src` of the first <img> carrying the "fancybox-image" class.
# The raw path is kept in `img_url`; the absolute URL in `featured_image_url`.
time.sleep(5)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
fancybox_img = soup.find("img", class_="fancybox-image")
img_url = fancybox_img["src"]
featured_image_url = base_url + img_url




# Fetch the Mars Weather twitter account page and scrape the latest Mars
# weather tweet from it.
url = 'https://twitter.com/marswxreport?lang=en'

# Retrieve page with the requests module
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')


# Get the tweets: every <p> element carrying the "tweet-text" class
tweets = soup.find_all('p', class_='tweet-text')

# Get the first tweet that is weather data, i.e. whose text starts with "Sol".
# next() with a default makes the "nothing matched" case explicit:
# mars_weather is None when the page has no tweets or none of them is a
# weather report, instead of silently ending up as the last (unrelated)
# tweet or failing with a NameError on an empty result.
mars_weather = next(
    (tweet.text for tweet in tweets if tweet.text.startswith("Sol")),
    None,
)




# Let pandas parse every HTML table found at the space-facts Mars page
url = 'http://space-facts.com/mars/'
tables = pd.read_html(url)

# Work on the first table: stringify the index labels, blank out the header
# of column 0 and label column 1 "value"
tables_df = pd.DataFrame(tables[0]).rename(
    index=str,
    columns={0: "", 1: "value"},
)

# Render the frame as an HTML table string, leaving out the index column
mars_facts_table = tables_df.to_html(index=False)




# Visit the USGS Astrogeology search results for the enhanced Mars hemisphere
# images; base_url is the site root that scraped image paths are appended to
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
base_url = "https://astrogeology.usgs.gov"
browser.visit(url)
hemisphere_image_urls = []

# XPATH selectors: the thumbnail image inside each result item on the search
# page, and the "wide-image-toggle" anchor on a result's detail page
xpath_img = '//div[@class="collapsible results"]//div[@class="item"]//a[@class="itemLink product-item"]/img'
xpath_open = '//a[@id="wide-image-toggle"]'

# Scrape the browser into soup and use soup to get the titles
# (the text of every <h3> on the results page)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
img_titles = [description.text for description in soup.find_all("h3")]

# Walk as many results as the page actually offers instead of assuming four:
# only indices that have both a thumbnail and a title are visited, so a
# shorter result list no longer raises IndexError and a longer one is not
# truncated
image_count = min(len(browser.find_by_xpath(xpath_img)), len(img_titles))

for x in range(image_count):
    # Look the thumbnails up again on every pass (the previous lookup was
    # made before navigating away and back), then click the x-th one to
    # get to the larger image
    results_img = browser.find_by_xpath(xpath_img)
    results_img[x].click()

    # Click the open button
    browser.find_by_xpath(xpath_open)[0].click()

    # Scrape the browser into soup and use soup to find the full resolution image
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    img_url = soup.find("img", class_="wide-image")["src"]
    full_img_path = base_url + img_url

    # One single-entry dict per hemisphere: {title: full image url}
    hemisphere_image_urls.append({img_titles[x]: full_img_path})

    # Return to the search results for the next thumbnail
    browser.back()
    
# Bundle everything scraped above into a single dictionary, print it, and
# shut the browser session down
mars_dictionary = dict(
    mars_news=[news_title, news_p],
    featured_image=featured_image_url,
    mars_weather=mars_weather,
    mars_facts=mars_facts_table,
    mars_hemispheres=hemisphere_image_urls,
)
print(mars_dictionary)
browser.quit()

{'mars_news': ['NASA Seeking Partner in Contest to Name Next Mars Rover', 'NASA has a class assignment for corporations, nonprofits and educational organizations involved in science and space exploration: partner with the agency to inspire future engineers and scientists by sponsoring a contest to name the next rover to venture to the Red Planet.'], 'featured_image': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA19968_ip.jpg', 'mars_weather': 'Sol 2171 (2018-09-14), high -12C/10F, low -65C/-84F, pressure at 8.79 hPa, daylight 05:43-17:59', 'mars_facts': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <