In [12]:
# Import the required dependencies
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

#### For Windows, the following was used to be able to utilise Splinter on Chrome. This may be different if you use another OS

In [None]:
# Start the headless Chrome session that every later cell drives through `browser`.
# This cell must run before any `browser.visit(...)` call, otherwise `browser` is undefined.
# ChromeDriverManager().install() supplies the chromedriver path that Splinter is given below.
from webdriver_manager.chrome import ChromeDriverManager
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=True)

### 1. Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest) and collect the latest News Title and Paragraph Text 

In [None]:
# Visit the NASA Mars News Site using the "browser" variable created with ChromeDriverManager
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# NOTE(review): the article list appears to be rendered after the initial page load,
# so wait (up to 5 seconds) for the first "list_text" item before the next cell reads
# browser.html. Fail here with a clear message rather than later with an AttributeError.
if not browser.is_element_present_by_css('div.list_text', wait_time=5):
    raise RuntimeError("Timed out waiting for the news list ('div.list_text') to load at " + url)

In [15]:
# Create an HTML object from the page currently loaded in the browser
nasa_news_html = browser.html

# Parse HTML with Beautiful Soup
parsed_nasa_news = BeautifulSoup(nasa_news_html, 'html.parser')

# The first "list_text" container is treated as the latest article.
latest_article = parsed_nasa_news.find('div', class_='list_text')
if latest_article is None:
    raise ValueError("No 'list_text' article container found; the page may not have finished loading")

# Read the title and the teaser from the SAME container so they belong to one article.
title_tag = latest_article.find('div', class_='content_title')
teaser_tag = latest_article.find('div', class_='article_teaser_body')
if teaser_tag is None:
    # Fall back to the page-wide lookup in case the teaser sits outside the container
    teaser_tag = parsed_nasa_news.find('div', class_='article_teaser_body')
if title_tag is None or teaser_tag is None:
    raise ValueError("Could not locate 'content_title' / 'article_teaser_body' for the latest article")

# Strip surrounding whitespace/newlines left over from the HTML layout
news_title = title_tag.text.strip()
news_para = teaser_tag.text.strip()

# Display the obtained text
print(news_title)
print(news_para)

NASA's Ingenuity Mars Helicopter Succeeds in Historic First Flight
The small rotorcraft made history, hovering above Jezero Crater, demonstrating that powered, controlled flight on another planet is possible.


### 2. Extract the image URL for the featured image on [JPL Featured Images](https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html)

In [16]:
# Visit the JPL Featured Images index page with the shared Splinter "browser" session.
# The next cell reads browser.html, so this cell must run first.
url2 = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url2)

In [17]:
# Create an HTML object from the page currently loaded in the browser
jpl_image_html = browser.html

# Parse the HTML with Beautiful Soup
parsed_jpl = BeautifulSoup(jpl_image_html, 'html.parser')

# Walk header -> floating_text_area -> first <a>, checking each step so that a layout
# change produces a clear error instead of "'NoneType' object has no attribute 'find'".
header_div = parsed_jpl.find('div', class_="header")
text_area_div = header_div.find('div', class_="floating_text_area") if header_div is not None else None
featured_link = text_area_div.find('a') if text_area_div is not None else None
if featured_link is None or not featured_link.get('href'):
    raise ValueError("Featured image link not found under div.header > div.floating_text_area")

# The href of the featured image (relative to the site folder below)
featured_img = featured_link.get('href')

# Identify the main URL that the href is otherwise attached to
main_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'

# Resolve the href against the main URL; urljoin also copes with a leading "/" or an
# already-absolute href, which plain string concatenation would get wrong.
featured_image_url = urljoin(main_url, featured_img)

# Display the image url
featured_image_url

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars3.jpg'

### 3. Use pandas to scrape the table containing facts about Mars on [Mars Facts](https://space-facts.com/mars/)

In [18]:
# Obtain the url required
url3 = 'https://space-facts.com/mars/'

# Use "pd.read_html" to parse every <table> on the page into a list of DataFrames
mars_facts_html = pd.read_html(url3)

# Extract the first table on the page ("[0]"). Work on a copy so the parsed list
# stays untouched and this cell gives the same result however often it is re-run.
planet_profile_df = mars_facts_html[0].copy()

# The column labels below assume a two-column (label, value) table
if planet_profile_df.shape[1] != 2:
    raise ValueError("Expected a 2-column facts table, got %d columns" % planet_profile_df.shape[1])

# Assign column names
planet_profile_df.columns = ['Description', 'Value']

# Set the index to be the 'Description' column (returns a new frame; no in-place mutation)
planet_profile_df = planet_profile_df.set_index('Description')

# Display the dataframe
planet_profile_df

Unnamed: 0_level_0,Value
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [19]:
# Render the Mars facts DataFrame (indexed by 'Description') as an HTML table string
profile_table_html = planet_profile_df.to_html()

### 4. Visit the [USGS Astrogeology](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) site and extract the images of each hemisphere

In [26]:
# Visit the USGS Astrogeology search results for the enhanced Mars hemisphere products,
# using the shared Splinter "browser" session. The next cell reads browser.html.
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

In [27]:
def _strip_enhanced_suffix(raw_title):
    """Return the hemisphere name without a trailing "Enhanced" and without padding.

    str.strip("Enhanced") treats its argument as a SET of characters and removes any of
    E/n/h/a/c/e/d from both ends, which can eat real letters of a title and leaves the
    separating space behind. Here only the literal suffix is removed.
    """
    suffix = 'Enhanced'
    cleaned = raw_title.strip()
    if cleaned.endswith(suffix):
        cleaned = cleaned[:-len(suffix)]
    return cleaned.strip()


# Create an HTML object from the search-results page currently loaded in the browser
html_hemispheres = browser.html

# Parse the HTML with Beautiful Soup
parsed_hemipheres_img = BeautifulSoup(html_hemispheres, 'html.parser')

# Create an empty list to store one {"title", "img_url"} dictionary per hemisphere
hemisphere_image_urls = []

# Site root that the relative hrefs/srcs are attached to (constant, so set once)
hemispheres_main_url = 'https://astrogeology.usgs.gov'

# Each hemisphere is listed in a "div" element with the class "item"
items = parsed_hemipheres_img.find_all('div', class_='item')

for item in items:
    # The title is in the "h3" tag, e.g. "[HEMISPHERE NAME] Enhanced"; drop the suffix
    title = _strip_enhanced_suffix(item.find('h3').text)

    # Extract the 'href' to the individual hemisphere page
    partial_img_url = item.find('a', class_='itemLink product-item')['href']

    # Visit the hemisphere's own page; "items" was parsed from the saved HTML above,
    # so navigating the browser away does not disturb this loop.
    browser.visit(hemispheres_main_url + partial_img_url)

    # Create an HTML object for the hemisphere page and parse it
    individual_site = browser.html
    parsed_individual_site = BeautifulSoup(individual_site, 'html.parser')

    # The full-size image is the <img> with class "wide-image"; its src is site-relative
    img_url = hemispheres_main_url + parsed_individual_site.find('img', class_='wide-image')['src']

    # Record the result for this hemisphere
    hemisphere_image_urls.append({"title" : title, "img_url" : img_url})

# Display the findings
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere ',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere ',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere ',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere ',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]