In [1]:
# Dependencies
from bs4 import BeautifulSoup
from time import sleep

from splinter import Browser 
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Install (or reuse the cached) ChromeDriver and launch a visible Chrome window
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147


 


[WDM] - Driver [C:\Users\pooja\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache


## 1. Scraping NASA Mars News

In [3]:
# Page URL that we are going to scrape
news_url = "https://mars.nasa.gov/news/"
browser.visit(news_url)

# A fixed half-second pause can be too short when the page is slow to load.
# Poll (up to 10 seconds) for the teaser element that the parsing cells read,
# and fail with a clear message if it never shows up.
if not browser.is_element_present_by_css("div.article_teaser_body", wait_time=10):
    raise RuntimeError("News articles did not load within 10 seconds: " + news_url)

In [4]:
# Snapshot the browser's current page source and parse it with the lxml parser
html = browser.html
soup = BeautifulSoup(markup=html, features='lxml')

In [28]:
# Collect every "content_title" div and keep the text of the latest headline.
# NOTE(review): index 1 (not 0) is used - presumably the first match on this
# page is not an article headline; confirm against the live markup.
latest_news_titles = soup.find_all("div", class_="content_title")
news_title = latest_news_titles[1].get_text()

"NASA Engineers Checking InSight's Weather Sensors"

In [33]:
# Collect every teaser paragraph and keep the first one, which is the
# description that accompanies the headline extracted above
latest_news_paragraphs = soup.find_all("div", class_="article_teaser_body")
news_paragraph = latest_news_paragraphs[0].get_text()

'An electronics issue is suspected to be preventing the sensors from sharing their data about Mars weather with the spacecraft.'

## 2. Scraping JPL Mars Space Images

In [None]:
# Page URL that we are going to scrape
images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(images_url)

# A fixed half-second pause can be too short when the page is slow to load.
# Poll (up to 10 seconds) for the 'full_image' button that is clicked next,
# and fail with a clear message if it never shows up.
if not browser.is_element_present_by_id("full_image", wait_time=10):
    raise RuntimeError("'full_image' button did not load within 10 seconds: " + images_url)

In [None]:
# Click the 'FULL IMAGE' button in the browser to get to the featured image.
# find_by_id(...).click() is used instead of the deprecated click_link_by_id helper.
browser.find_by_id('full_image').click()

In [None]:
# Click the 'more info' link in the browser to access the larger image size.
# browser.links.find_by_partial_text replaces the deprecated
# click_link_by_partial_text helper.
browser.links.find_by_partial_text('more info').click()

In [None]:
# Snapshot the page the browser is now showing and parse it with the lxml parser
html = browser.html
soup = BeautifulSoup(markup=html, features='lxml')

In [None]:
# Read the "src" attribute of the first element carrying the "main_image" class;
# this is the path to the larger image
lg_image_url = soup.find(attrs={"class": "main_image"})["src"]

In [None]:
# Prefix the site root to the scraped image path to form the full image URL
base_url = "https://www.jpl.nasa.gov"
featured_image_url = f"{base_url}{lg_image_url}"
featured_image_url

## 3. Scraping Mars Facts

In [None]:
# Import dependencies 
import pandas as pd 

In [None]:
# Source page for the Mars facts tables
facts_url = "https://space-facts.com/mars/"

# pd.read_html returns a list of DataFrames, one per table it parses from the page
tables = pd.read_html(io=facts_url)
print(tables)

In [None]:
# Take the first table parsed from the facts page; per the original note it is
# the one holding the Diameter, Mass, etc. rows. The following cells work on `df`.
df = tables[0]
df

In [None]:
# Rename the columns to 'Description' and 'Mars'.
# NOTE: assigning to .columns relabels the frame in place, and `df` is the same
# object as tables[0], so tables[0] is relabelled too. The assignment requires
# the table to have exactly two columns.
df.columns = ['Description', 'Mars']
df

In [None]:
# Set the Description column as the index.
# set_index returns a new DataFrame, so the result must be bound back to `df`;
# otherwise the HTML export below still carries the default integer index.
# The guard keeps the cell safe to re-run once the index has been set.
if df.index.name != 'Description':
    df = df.set_index('Description')
df

In [None]:
# Render the dataframe as an HTML <table> string and print the raw markup
html_table = df.to_html()
print(html_table)

In [None]:
# Write the dataframe's HTML table to 'table.html' (path relative to the working directory)
df.to_html('table.html')

## 4. Scraping Mars Hemispheres

In [None]:
# Page URL that we are going to scrape
hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(hemisphere_url)

# A fixed half-second pause can be too short when the page is slow to load.
# Poll (up to 10 seconds) for the <h3> titles that the following cells read,
# and fail with a clear message if none show up.
if not browser.is_element_present_by_tag("h3", wait_time=10):
    raise RuntimeError("Hemisphere results did not load within 10 seconds: " + hemisphere_url)

In [None]:
# Snapshot the results page source and parse it with the lxml parser
html = browser.html
soup = BeautifulSoup(markup=html, features='lxml')

In [None]:
# Accumulator that will hold one {"title", "img_url"} dictionary per hemisphere
hemisphere_image_urls = []

# Every <h3> on the results page is treated as a hemisphere link title
link_titles = soup.find_all(name='h3')
for title in link_titles:
    print(title.get_text())

In [None]:
# Start from an empty list so re-running this cell does not append duplicates
hemisphere_image_urls.clear()

# Loop through the link titles
for title in link_titles:

    # Open the detail page by clicking the link whose text contains this title.
    # browser.links.find_by_partial_text replaces the deprecated
    # click_link_by_partial_text helper.
    browser.links.find_by_partial_text(title.text).click()
    sleep(0.5)

    # Parse the detail page under its own name so the results-page `soup`
    # is not overwritten by the loop
    detail_soup = BeautifulSoup(browser.html, 'html.parser')

    # Hemisphere name = the title text before 'Enhanced'; strip() removes the
    # space that the split leaves at the end
    hem_name = title.text.split('Enhanced')[0].strip()

    # Full resolution image URL: href of the anchor inside the first <li> on the page
    # NOTE(review): relies on the first <li> being the download entry - confirm
    lg_img_url = detail_soup.find("li").a['href']

    # Append each dictionary to the list
    hemisphere_image_urls.append({
        "title": hem_name,
        "img_url": lg_img_url
    })

    # Go back to the results page before the loop repeats
    browser.visit(hemisphere_url)
    sleep(0.5)

In [None]:
# Display the collected list of {"title", "img_url"} dictionaries as a final check
hemisphere_image_urls

In [None]:
# Quit the browser (the original misspelled the name as `broswer`, which raised
# NameError and left the Chrome window open)
browser.quit()