In [2]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [3]:
# Resolve a ChromeDriver binary through webdriver_manager and hand its path to
# Splinter. headless=False keeps the Chrome window visible while the cells run.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/Users/bobc/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache






### Visit the NASA Mars News Site

In [4]:
# Load the Mars news site in the Splinter-controlled browser
url = 'https://redplanetscience.com'
browser.visit(url)
# Check (waiting up to `wait_time=1`) that a `div.list_text` element is present
# before the page is parsed; the boolean result is shown as the cell output
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [5]:
# Snapshot the HTML currently loaded in the browser and parse it
html = browser.html
news_soup = BeautifulSoup(html, 'html.parser')
# Keep the `div.list_text` element; the title and teaser are looked up inside it
slide_elem = news_soup.select_one('div.list_text')

In [6]:
# Inside the `div.list_text` element, find the `div` with class `content_title`
# and save its text as `news_title`
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

'Independent Review Indicates NASA Prepared for Mars Sample Return Campaign'

In [7]:
# Inside the same `div.list_text` element, find the `div` with class
# `article_teaser_body` and save its text as `news_p`
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

'NASA released an independent review report Tuesday indicating the agency is well positioned for its Mars Sample Return campaign to bring pristine samples from Mars to Earth for scientific study.'

### Featured Images

In [8]:
# Load the space images site in the Splinter-controlled browser
# (this reassigns the notebook-level `url` used for the news site earlier)
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [9]:
# Find and click the full image button using Splinter.
# NOTE(review): the button is picked by position (the second `button` tag on the
# page), so this presumably breaks if the page layout changes - confirm against the site.
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [10]:
# Re-read the browser's HTML after the click and parse it with Beautiful Soup
# (this reassigns the notebook-level `html` captured for the news page earlier)
html = browser.html
img_soup = BeautifulSoup(html, 'html.parser')

In [11]:
# Read the `src` attribute of the `img` tag with class `fancybox-image`;
# as the output shows, the value is a path relative to the site root
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars3.jpg'

### Mars Facts

In [12]:
# Prefix the relative image path with the site's base URL to get an absolute URL
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url

'https://spaceimages-mars.com/image/featured/mars3.jpg'

In [13]:
# read_html returns a list of the tables found on the page; keep the first one.
df = pd.read_html('https://galaxyfacts-mars.com')[0]
# Label the three columns, then use the description column as the index.
df.columns = ['Description', 'Mars', 'Earth']
df = df.set_index('Description')
# Render the table as an HTML string carrying the `table table-striped` CSS
# classes (presumably for Bootstrap styling); the string is the cell's output.
# Note: the rendered output shows the page's own heading row
# ('Mars - Earth Comparison') kept as the first data row.
df.to_html(classes="table table-striped")

'<table border="1" class="dataframe table table-striped">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n 

## D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

### Hemispheres

In [14]:
# 1. Use browser to visit the hemispheres index page
hemispheres_url = 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/index.html'
browser.visit(hemispheres_url)

In [15]:
# Raw HTML of the hemispheres index page currently loaded in the browser
html_hemispheres = browser.html

# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_hemispheres, 'html.parser')

# Retrieve every `div` with class `item`; the scraping loop further down reads
# each one's `h3` title and its `a.itemLink.product-item` link
items = soup.find_all('div', class_='item')

In [16]:
# items

In [17]:
# List that will collect one dictionary per hemisphere, shaped like
# {"title": title, "img_url": img_url}
hemisphere_image_urls = []

# Base URL that the relative links scraped from the hemisphere pages are appended to
hemispheres_main_url = 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/'

In [18]:
# Build one {'title', 'img_url'} dictionary per hemisphere found on the index page.
#
# The result list is reset here so that re-running this cell does not append
# duplicate entries. Loop-local names (`detail_url`, `detail_soup`,
# `full_img_url`) are used on purpose: the loop must not overwrite the
# notebook-level `img_url` (featured image URL) or `soup` (parsed index page)
# created by earlier cells.
hemisphere_image_urls = []

for item in items:

    hemispheres = {}

    # Title text from the item's `h3` heading
    title = item.find('h3').text
    hemispheres['title'] = title

    # The item's link is relative, so prepend the base URL, then open the
    # hemisphere's detail page in the browser
    detail_url = hemispheres_main_url + item.find('a', class_='itemLink product-item')['href']
    browser.visit(detail_url)

    # Parse the detail page that is now loaded
    detail_soup = BeautifulSoup(browser.html, 'html.parser')

    # The full image is taken from the first link inside the first `ul` on the
    # detail page; its href is relative as well
    full_img_url = hemispheres_main_url + detail_soup.find('ul').li.a['href']
    hemispheres['img_url'] = full_img_url

    # Collect this hemisphere's record
    hemisphere_image_urls.append(hemispheres)
    
    

In [21]:
# Display the collected list of {'title', 'img_url'} dictionaries
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/schiaparelli_enhanced-full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/syrtis_major_enhanced-full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/valles_marineris_enhanced-full.jpg'}]

In [21]:
# 5. Quit the browser now that all scraping is finished
browser.quit()