In [1]:
# Dependencies
import os
import re

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser

In [2]:
# Begin scraping
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable; when it is unset the original location is used.
executable_path = {
    'executable_path': os.environ.get('CHROMEDRIVER_PATH', '/usr/local/bin/chromedriver')
}
browser = Browser('chrome', **executable_path, headless=True)

## NASA Mars News
* Scrape the NASA Mars News Site and collect the latest News Title & Paragraph Text
* Assign to variables that can be referenced later

In [3]:
mars_news_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
browser.visit(mars_news_url)

# The DOM is snapshotted right after `visit`; wait (up to 10 s) for the first
# article item to exist so a slow-rendering page fails with a clear message
# instead of an AttributeError on None further down.
if not browser.is_element_present_by_css('li.slide', wait_time=10):
    raise RuntimeError("NASA Mars News: no 'li.slide' article found on the page")

html = browser.html
soup = bs(html, 'html.parser')

# The URL orders results by publish date (descending), so the first item is the latest
latest_news = soup.find('li', class_='slide')
news_title = latest_news.find('div', class_='content_title').text
news_p = latest_news.find('div', class_='article_teaser_body').text

print("TITLE: ", news_title)
print("PARAGRAPH: ", news_p)

TITLE:  Curiosity Tastes First Sample in 'Clay-Bearing Unit'
PARAGRAPH:  This new region on Mars might reveal more about the role of water on Mount Sharp.


## JPL Mars Space Images - Featured Image

In [4]:
# Open the JPL Space Images gallery (Mars category) and parse the browser's current HTML
jpl_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(jpl_images_url)

html = browser.html
soup = bs(html, features='html.parser')

The full-resolution image URL is embedded in the `style` attribute of the `article` tag, so a regular expression is needed to extract it. For example:

```<article alt="Blazing Black Holes Spotted in Spiral Beauty" class="carousel_item" style="background-image: url('/spaceimages/images/wallpaper/PIA16605-1920x1200.jpg');">```

In [5]:
# Locate the carousel item whose inline style carries the background image.
carousel_item = soup.find('article', class_="carousel_item")
if carousel_item is None:
    raise ValueError("JPL featured image: no <article class='carousel_item'> found")

# Capture the path between url(' and ').
# Raw string: in a plain literal `\(` is an invalid escape sequence and warns on modern Python.
style_match = re.search(r"\('([^)]*)'\)", carousel_item.get('style', ''))
if style_match is None:
    raise ValueError("JPL featured image: no url('...') in the carousel item's style")

# The captured path is site-relative, so prefix the JPL host
img_url_tail = style_match[1]
featured_image_url = "https://www.jpl.nasa.gov" + img_url_tail

print("URL: ", featured_image_url)

URL:  https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA11777-1920x1200.jpg


## Mars Weather

In [6]:
# Open the Mars weather report Twitter timeline and parse the browser's current HTML
mars_twitter_url = "https://twitter.com/marswxreport?lang=en"
browser.visit(mars_twitter_url)

html = browser.html
soup = bs(html, features='html.parser')

In [7]:
# Use the first tweet paragraph found on the page.
first_tweet = soup.find('p', class_="tweet-text")
if first_tweet is None:
    raise ValueError("Mars weather: no <p class='tweet-text'> found on the page")

# The report is the text node immediately before the tweet's first link.
# `previous_sibling` is the bs4 spelling; `previousSibling` is a deprecated BS3-era alias.
# If the tweet contains no <a> at all, fall back to its full text.
tweet_link = first_tweet.a
mars_weather = tweet_link.previous_sibling if tweet_link is not None else first_tweet.text

print(mars_weather)

InSight sol 137 (2019-04-16) low -97.0ºC (-142.7ºF) high -15.9ºC (3.4ºF)
winds from the SW at 4.3 m/s (9.7 mph) gusting to 12.4 m/s (27.7 mph)
pressure at 7.30 hPa


## Mars Facts
* Use pandas to scrape the table containing facts about the planet
* Convert the data to an HTML table

In [8]:
# Page that pd.read_html parses below to obtain the Mars facts table
mars_facts_url = "https://space-facts.com/mars/"

In [9]:
# read_html returns a list of DataFrames, one per HTML table it finds at the URL
tables = pd.read_html(mars_facts_url)

In [10]:
# Take the first table returned for the page; per the output below it holds the Mars facts
df = tables[0]
df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [11]:
# Convert pandas table to HTML, applying bootstrap classes.
# header/index are dropped so only the label/value rows are emitted.
# The markup is kept in a variable so it can be reused later rather than only echoed.
mars_facts_html = df.to_html(classes="table table-sm", header=False, index=False)
mars_facts_html

'<table border="1" class="dataframe table table-sm">\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

## Mars Hemispheres
* Obtain high resolution images for each of Mars' hemispheres from the USGS Astrogeology site
* 'Click' through each of the links to get the full resolution image
* Save the URLs and image titles in a dictionary with the `img_url` and `title` keys
* Form a list of all such dictionaries

In [12]:
# Open the USGS Astrogeology search results for enhanced Mars hemispheres and parse the page
usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(usgs_url)

html = browser.html
soup = bs(html, features='html.parser')

In [13]:
# Build absolute landing-page URLs from the href on each <h3>'s parent element
# (presumably the link wrapping each result title).
# Headings whose parent has no href are skipped so the concatenation never sees None.
full_res_lps = [
    "https://astrogeology.usgs.gov" + heading.parent['href']
    for heading in soup.find_all('h3')
    if heading.parent.get('href')
]

In [14]:
# Collects one {'title': ..., 'img_url': ...} dict per hemisphere; filled by the loop below
hemisphere_image_urls = []

In [15]:
# Start from an empty list inside this cell so re-running it does not append duplicates.
hemisphere_image_urls = []

for lp in full_res_lps:
    browser.visit(lp)
    # Parse into a separate name so the search-results `soup`, which the
    # landing-page list is built from, is not overwritten by this loop.
    hemisphere_soup = bs(browser.html, 'html.parser')

    # Get title of hemisphere: the page heading up to (not including) " Enhanced".
    # If the heading lacks that suffix, keep the whole heading instead of failing on None.
    heading = hemisphere_soup.find('h2', class_='title').text
    title_match = re.match('(.+?) Enhanced', heading)
    title = title_match[1] if title_match else heading.strip()

    # Get URL of high-res JPEG sample image (first entry of the downloads list).
    # Useful since most browsers do not render TIFF; the full-resolution TIFF
    # link is the next <li> in the same list if it is ever needed.
    img_url = hemisphere_soup.find('div', class_='downloads').li.a['href']

    # Add to list
    hemisphere_image_urls.append({'title': title, 'img_url': img_url})

In [16]:
# Show the collected title / image-URL records
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]