In [14]:
# Import Splinter and BeautifulSoup
from urllib.parse import urljoin

from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

In [2]:
# Point splinter at the local chromedriver binary and start a Chrome session.
# NOTE(review): the driver path is an absolute, machine-specific location.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', **executable_path)

In [3]:
# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional delay for loading the page: check for a news slide element,
# passing wait_time=1 so the page has a chance to load before the check.
# The boolean result is displayed as this cell's output.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [4]:
# Parse the HTML currently held by the browser with BeautifulSoup
html = browser.html
news_soup = soup(html, 'html.parser')
# Keep the first element matching the slide selector; it is the parent
# element used for the title and teaser lookups.
# NOTE(review): select_one gives None when nothing matches — confirm the
# page loaded before relying on `slide_elem`.
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [5]:
# Scraping: preview the `content_title` div found inside the slide element
slide_elem.find("div", class_='content_title')


<div class="content_title"><a href="/news/8749/nasa-readies-perseverance-mars-rovers-earthly-twin/" target="_self">NASA Readies Perseverance Mars Rover's Earthly Twin </a></div>

In [6]:
# Use the parent element to find the `content_title` div and save its text as `news_title`
# (the text is kept exactly as returned, including any surrounding whitespace)
news_title = slide_elem.find("div", class_='content_title').get_text()
news_title

"NASA Readies Perseverance Mars Rover's Earthly Twin "

In [7]:
# Use the parent element to find the `article_teaser_body` div and save its
# paragraph text as `news_p`
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

"Did you know NASA's next Mars rover has a nearly identical sibling on Earth for testing? Even better, it's about to roll for the first time through a replica Martian landscape."

Scrape Mars Data: Images

In [8]:
# Visit the JPL space images page (query string selects the Mars category)
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [9]:
# Find and click the full image button (the element with id `full_image`)
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [10]:
# Find the more info button and click that
# Check for the 'more info' text first, with wait_time=1 to allow for loading;
# the returned value is not used.
browser.is_element_present_by_text('more info', wait_time=1)
# Look the link up by its partial text, then click it
more_info_elem = browser.links.find_by_partial_text('more info')
more_info_elem.click()

#### is_element_present_by_text() method 
* is used to search for an element that has the provided text
* "wait_time=1" is the additional argument that allows the browser to fully load before we search for the element. 

#### more_info_elem
* a new variable that stores the link returned by `browser.links.find_by_partial_text()`

#### browser.links.find_by_partial_text() method
* this method takes the string 'more info' and finds the link associated with the "more info" text

#### final step is to tell Splinter to click that link by chaining the '.click()' function onto our 'more_info_elem' variable

In [11]:
# Parse the resulting html with soup (the page the browser is on after the click)
html = browser.html
img_soup = soup(html, 'html.parser')

In [12]:
# Find the relative image url: the `src` attribute of the <img> nested
# under `figure.lede a`
img_url_rel = img_soup.select_one('figure.lede a img').get("src")
img_url_rel

'/spaceimages/images/largesize/PIA16192_hires.jpg'

* figure.lede references the "<figure />" tag and its class, lede.

* a is the next tag nested inside the "<figure />" tag.

* An img tag is also nested within this HTML, so we’ve included that as well.

* .get("src") pulls the link to the image.

In [13]:
# Use the base URL to create an absolute URL.
# urljoin resolves the site-relative path against the JPL host and, unlike
# plain string concatenation, does not mangle `img_url_rel` if the page ever
# serves an already-absolute `src`.
img_url = urljoin('https://www.jpl.nasa.gov', img_url_rel)
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16192_hires.jpg'

In [16]:
# Read the first table found on the Mars facts page into a DataFrame
df = pd.read_html('http://space-facts.com/mars/')[0]
# Name the two columns, then use `description` as the index
df.columns=['description', 'value']
df.set_index('description', inplace=True)
# Render the DataFrame as an HTML string (shown as the cell output)
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

* df = pd.read_html('http://space-facts.com/mars/')[0] 
-- With this line, we’re creating a new DataFrame from the HTML table. The Pandas function read_html() specifically searches for and returns a list of tables found in the HTML. By specifying an index of 0, we’re telling Pandas to pull only the first table it encounters, or the first item in the list. Then, it turns the table into a DataFrame.

* df.columns=['description', 'value'] 
-- Here, we assign columns to the new DataFrame for additional clarity.

* df.set_index('description', inplace=True) 
-- By using the .set_index() function, we’re turning the `description` column into the DataFrame’s index. inplace=True means that the updated index will remain in place, without having to reassign the DataFrame to a new variable.

* df.to_html()
-- a function to convert DataFrame back into HTML-ready code

In [17]:
# End the automated browser session now that scraping is finished
browser.quit()