In [35]:
# Import Splinter and Browser
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [36]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path)

In [8]:

# Visit the mars nasa news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)

# With the line above we are accomplishing two things.
# 1. We're searching for elements with a specific combination of tag (div) and attribute (list_text).
# 2. We're telling our browser to wait one second before searching for components. The optional delay is 
# useful to allow dynamic pages time to load while to laod, esspecially if they are image-heavy.

True

In [9]:
# Add the HTML parser
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text')

### Title of Articles

In [19]:
# Start Scraping
# Use the parent element to find the first 'a' tag and save it as news_title
news_title = slide_elem.find('div', class_='content_title').text
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_='article_teaser_body').text
print(news_title)
print('-')
print(news_p)

NASA's Perseverance Mars Rover Gets Balanced
-
The mission team performed a crucial weight-balancing test on the rover in preparation for this summer's history-making launch to the Red Planet.


### Featured Images 

In [23]:
# Visit URL
url = 'https://spaceimages-mars.com/'
browser.visit(url)

In [24]:
# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [26]:
# Parese the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

In [27]:
# Find the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars2.jpg'

In [28]:
# Use the base URL to create an absolut URL
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

#### Scraping into a DataFrame
- In the first line we are creating a new DataFrame from the HTML table. The Pandas function read_html() specifically searchs for and returns a list of tables found in the HTML. By specifying an index 0, we are telling Pandas to pull only the first table it encounters, or the first item in the list. Then, it turns the table into a DataFrame.
- In the second line we are assigning clumns to the new DataFrame
- Then by using the .set_index() function, we're turning the description column into the DataFrame's index. implace=True means that the updated index will remain in place, without having to reassign the DataFrame to a new variable

In [44]:
# We want to create a dataframe to store our scraped table into.
df = pd.read_html('https://galaxyfacts-mars.com/')[0]
df.columns=['description', 'Mars', 'Earth']
df.set_index('description', inplace=True)
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


### Scraping High Resolution Mars' Hemisphere images and titles

In [37]:
# Link to the site that we want to scrape from
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)

In [38]:
# create an empty list to store hemisphere_img link to
hemisphere_img_url = []

In [39]:
# We want to request the html
html = browser.html
h_soup = soup(html, 'html.parser')

# Get the links for each of the 4 hemispheres
h_links = h_soup.find_all('h3')


# Loop throup the the four links to click on the page 
# and retrieve the information
# for i in h_links:
for h in h_links:
    h_page = browser.find_by_text(h.text)
    h_page.click()
    html = browser.html
    sp = soup(html, 'html.parser')
    # Scrape the image link and add the initail part to the link
    img_url = 'https://astrogeology.usgs.gov/' + str(sp.find('img', class_='thumb')['src'])
    # Scrape the title
    title = sp.find('h2', class_='title').text
    body = sp.find('p').text
    h_dict = {'image_url' : img_url, 'Title' : title, 'Info' : body}
    hemisphere_img_url.append(h_dict)
    browser.back()

hemisphere_img_url

[{'image_url': 'https://astrogeology.usgs.gov//cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png',
  'Title': 'Cerberus Hemisphere Enhanced',
  'Info': 'Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired on February 11, 1980. At that time, it was early northern summer on Mars. The center of the image is at latitude 3 degrees, longitude 185 degrees.'},
 {'image_url': 'https://astrogeology.usgs.gov//cache/images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png',
  'Title': 'Schiaparelli Hemisphere Enhanced',
  'Info': 'Mosaic of the Schiaparelli hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. The images were acquired in 1980 during early northern summer on Mars. The center of this image is near the impact crater Schiaparelli (la

In [None]:
browser.quit()