In [14]:
# Import the standard-library URL helper, then pandas, BeautifulSoup, Splinter and the driver manager
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Set up the Splinter browser used for scraping (no URL is visited in this cell).
# The value returned by ChromeDriverManager().install() is handed to Browser as
# `executable_path`; headless=False requests a visible (non-headless) Chrome window.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\shama\.wdm\drivers\chromedriver\win32\98.0.4758.102]


In [3]:
# Visit the Mars news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page: wait up to `wait_time` (1) for a
# `div.list_text` element to be present, since dynamic pages take longer to load.
# The boolean result is the last expression, so it is shown as the cell output.
browser.is_element_present_by_css('div.list_text', wait_time=1)


True

I've assigned `slide_elem` as the variable that holds the `<div />` tag and its descendants (the other tags within the `<div />` element). This is the parent element: it holds all of the other elements within it, and I'll reference it when I want to filter search results even further. The `.` is used for selecting classes, such as `list_text`, so the selector `'div.list_text'` pinpoints the `<div />` tag with the class of `list_text`. CSS selectors are read from right to left (here the class `list_text` first, then the `div` tag), but that does not change which result comes back: when using `select_one`, only the first matching element in the document is returned, so `slide_elem` will be the first `<div />` with a class of `list_text`, together with all of the elements nested within it.

In [4]:
# Set up the HTML parser: take the page source currently held by the browser
# and parse it with BeautifulSoup's built-in 'html.parser'.
html = browser.html
news_soup = soup(html, 'html.parser')
# select_one returns the first element matching the CSS selector,
# i.e. the first <div /> whose class is `list_text`.
slide_elem = news_soup.select_one('div.list_text')

In [5]:
# slide_elem holds a ton of information, so look inside it for the specific data:
# the <div /> with a class of 'content_title'. The matching tag is shown as the cell output.
slide_elem.find('div', class_='content_title')

<div class="content_title">NASA's Perseverance Rover Mission Getting in Shape for Launch</div>

In [6]:
# Use the parent element to find the <div /> with class 'content_title',
# then save only its text as `news_title`.
title_elem = slide_elem.find('div', class_='content_title')
news_title = title_elem.get_text()  # get_text() returns only the text of the element
news_title

"NASA's Perseverance Rover Mission Getting in Shape for Launch"

In [7]:
# Use the parent element to find the <div /> with class 'article_teaser_body',
# then save only its text (the summary paragraph) as `news_p`.
teaser_elem = slide_elem.find('div', class_='article_teaser_body')
news_p = teaser_elem.get_text()
news_p

'Stacking spacecraft components on top of each other is one of the final assembly steps before a mission launches to the Red Planet. '

### Featured Images

In [13]:
# Visit the featured-image site using the browser session created earlier.
# NOTE(review): the saved output of this cell is a "chrome not reachable" error,
# presumably from a run after the Chrome window had been closed - re-run the
# notebook top to bottom on a fresh kernel so the session is still alive here.
url = 'https://spaceimages-mars.com'
browser.visit(url)

WebDriverException: Message: chrome not reachable
  (Session info: chrome=98.0.4758.102)
Stacktrace:
Backtrace:
	Ordinal0 [0x00A769A3+2582947]
	Ordinal0 [0x00A0A6D1+2139857]
	Ordinal0 [0x00903960+1063264]
	Ordinal0 [0x008F8092+1015954]
	Ordinal0 [0x008F88C8+1018056]
	Ordinal0 [0x008FA102+1024258]
	Ordinal0 [0x008F3DE9+998889]
	Ordinal0 [0x00904DD0+1068496]
	Ordinal0 [0x00957B12+1407762]
	Ordinal0 [0x00948366+1344358]
	Ordinal0 [0x00925176+1200502]
	Ordinal0 [0x00926066+1204326]
	GetHandleVerifier [0x00C1BE02+1675858]
	GetHandleVerifier [0x00CD036C+2414524]
	GetHandleVerifier [0x00B0BB01+560977]
	GetHandleVerifier [0x00B0A8D3+556323]
	Ordinal0 [0x00A1020E+2163214]
	Ordinal0 [0x00A15078+2183288]
	Ordinal0 [0x00A151C0+2183616]
	Ordinal0 [0x00A1EE1C+2223644]
	BaseThreadInitThunk [0x76B76739+25]
	RtlGetFullPathName_UEx [0x77558E7F+1215]
	RtlGetFullPathName_UEx [0x77558E4D+1165]


In [9]:
# Find and click the full image button.
# Index 1 picks the second <button> element found on the page.
# NOTE(review): a positional index depends on the page layout - confirm the
# second button is still the full-image button if the site changes.
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [10]:
# Parse the resulting html with soup: re-read the browser's page source
# (this rebinds `html`) and parse it with the built-in 'html.parser'.
html = browser.html
img_soup = soup(html, 'html.parser')

In [11]:
# Find the relative image url.
# A CSS selector matches on the individual classes `headerimage` and `fade-in`,
# whereas class_='headerimage fade-in' only matches that exact attribute string;
# the lookup therefore keeps working if the classes are reordered or another
# class is added to the <img /> tag.
img_url_rel = img_soup.select_one('img.headerimage.fade-in').get('src')
img_url_rel

'image/featured/mars3.jpg'

In [12]:
# Use the base URL to create an absolute URL.
# urljoin resolves the scraped path against the base, so a src that starts
# with "/" (or is already absolute) does not produce a malformed URL the way
# plain string formatting would.
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars3.jpg'

In [16]:
# Scrape every table on the page with pandas' read_html(), which returns a list
# of DataFrames, and keep the first one (index 0).
mars_tables = pd.read_html('https://galaxyfacts-mars.com')
df = mars_tables[0]
# Name the three columns and use the description column as the row index.
df.columns = ['description', 'Mars', 'Earth']
df = df.set_index('description')
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [17]:
# Convert our DataFrame back into HTML-ready code. The result is shown as the cell
# output: a string holding a <table /> element with a lot of nested elements.
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [18]:
# Turn off the browser: end the automated session opened at the top of the notebook.
# Cells that call `browser` cannot be re-run after this without creating a new Browser.
browser.quit()