In [337]:
# Import dependencies

from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
from splinter import Browser
from selenium import webdriver

In [338]:
# Identify chromedriver path

!which chromedriver    

/usr/local/bin/chromedriver


In [339]:
# Identify and activate the browser
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

## Scrape the Mars Nasa Website for News articles

In [340]:
# Set the url and use chromedriver to visit the url
url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
browser.visit(url)

In [341]:
html = browser.html
# Use beautiful soup and set the parser to html
soup = bs(html, 'html.parser')
# find all list items with a class 'slide' and store in a variable for results
results = soup.find_all('li', class_='slide')

# set up a loop to iterate for the number of items stored in 'results'
# for x in range(len(results)):
    #     for each item in results, assign title to be = div where class = 'content-title' 
title = soup.find('div', class_='content_title')
    #     for each item in results, assign p to be = div where class = 'article-teaser-body' 
p = soup.find('div', class_='article_teaser_body')
print('-------------')
#     print the results in text format
print(title.text)
print(p.text)


-------------


NASA's Opportunity Rover Mission on Mars Comes to End




AttributeError: 'NoneType' object has no attribute 'text'

## Scrape the JPL Images Site

In [322]:
# Set the url and use chrome webdriver to visit the webpage
jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(jpl_url)

In [323]:
# Parse through the html, find the item where the id='full_image' and click that item with chrome webdriver
full_image = browser.find_by_id('full_image')
full_image.click()

In [324]:
# Find a link by 'more info' tag and click that item
more_info = browser.find_link_by_partial_text('more info')
more_info.click()

In [325]:
# activate beautiful soup and select html parser 
html = browser.html
soup = bs(html, 'html.parser')

In [326]:
# Parse through and find 'figure' tag where class = 'lede'
results = soup.find('figure', class_='lede')
results

<figure class="lede">
<a href="/spaceimages/images/largesize/PIA17470_hires.jpg"><img alt="NASA's Cassini spacecraft reveals the differences in the composition of surface materials around hydrocarbon lakes at Titan." class="main_image" src="/spaceimages/images/largesize/PIA17470_hires.jpg" title="NASA's Cassini spacecraft reveals the differences in the composition of surface materials around hydrocarbon lakes at Titan."/></a>
</figure>

In [327]:
# Find the image link and store it in a variable
featured_image_url = results.a['href']
# Include the beginning of the full webpage
featured_image_url = f'https://www.jpl.nasa.gov{featured_image_url}'
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17470_hires.jpg'

## Scrape the Mars Twitter Page

In [328]:
# Set the url and use chrome webdriver to visit the webpage
tw_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(tw_url)

In [329]:
# Activate beautifulsoup and set html parser
html = browser.html
soup = bs(html, 'html.parser')

In [330]:
# Parse the html and find the first div tag where class = 'js-tweet-text-container'
results = soup.find('div', class_='js-tweet-text-container')

In [331]:
# Get the text in the results and store in a variable
mars_weather = results.p.text
mars_weather

'Watch the #Mars2020 rover being built live from @NASAJPLhttps://youtu.be/PaNiYPglK58\xa0'

## Scrape the Space Facts website for table on Mars

In [332]:
# Set the url
space_facts_url = 'https://space-facts.com/mars/'

In [333]:
# Use pandas to read the html in the url, scrape the table and assign it to a variable
table = pd.read_html(space_facts_url)
# Select the first table
table_df = table[0]
# Change column names
table_df.columns=["Title","Fact"]
# Print table df
table_df

Unnamed: 0,Title,Fact
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [334]:
# Convert table data to html and save as html file
table_df.to_html('mars_table.html')

## Scrape the USGS Astrogeology site

In [335]:
driver = webdriver.Chrome()
usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
driver.get(usgs_url)

In [336]:
# Create empty list to hold dictionaries
hemisphere_image_urls=[]

loops = 4 # We know there are 4 images
# Loop through the following while loops is greater than 0
while loops > 0:
    # Create empty dictionary to save Title and Url
    d = {}
    # Use selenium to find the section where class name = 'description'
    parentElement = driver.find_elements_by_class_name('description')
    #  List comprehension; loop through the sections with class name 'description
    #  and obtain the element where class name = 'itemLink'
    urls = [x.find_element_by_class_name('itemLink') for x in parentElement]
    # Since python is zero based for indexing, decrement loops by one to select from urls list
    url = urls[loops - 1]
    # Click the selected url
    url.click()
    # Find Element With text Original. This contains the url we need
    image_element = driver.find_element_by_link_text('Original')
    # Get the href and store image url in a variable
    image_url = image_element.get_attribute('href')
    # Find element where class name = title. This contains the image title
    title_element = driver.find_element_by_class_name('title')
    # Get the text and store it in a variable
    title = title_element.text
    # Add key-value pairs to dictionary for title and image url
    d["title"] = title
    d["image_url"] = image_url
    # Append dictionary to list
    hemisphere_image_urls.append(d)
    # Go back to previous page
    driver.execute_script("window.history.go(-1)")
    # Decrement loops by 1, to 3, and eventually 0 where the while loop will end
    loops -= 1
# Print the list to check results
hemisphere_image_urls 

[{'title': 'Valles Marineris Hemisphere Enhanced',
  'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'},
 {'title': 'Cerberus Hemisphere Enhanced',
  'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'}]