# Scraping with BeautifulSoup, Splinter and Pandas

In [1]:
# Import required libraries
from bs4 import BeautifulSoup
from splinter import Browser
import pandas as pd
import time
import requests as req
import pprint as pp

# Invoking Chromedriver for Windows users

In [2]:
# Invoke the chromedriver/browser to read the html pages for scraping/parsing the data.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# NASA Mars News Site

In [3]:
# Setup the Chrome browser
nasa_url = 'https://mars.nasa.gov/news/'
browser.visit(nasa_url)

# wait for 2 secs for the browser to load
time.sleep(2)

## Invoking the browser via chromedriver and scraping

In [4]:
# HTML object
html = browser.html

# Parse HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text.
# Assign the text to variables that you can reference later.
# Use Beautiful Soup's find() method to navigate and retrieve news
news = soup.find('div', class_='image_and_description_container')
news_title = news.find('div', class_='content_title').find('a').text
news_p = news.find('div', class_='article_teaser_body').text
news_href = nasa_url + news.find('div', class_='content_title').find('a')['href']

#news_title = newsies.find('div', class='bottom_gradient')
print(f'-----------')
print(f' href: {news_href}')
print(f'Title: {news_title}')
print(f' Text: {news_p}')


-----------
 href: https://mars.nasa.gov/news//news/8677/air-deliveries-bring-nasas-perseverance-mars-rover-closer-to-launch/
Title: Air Deliveries Bring NASA's Perseverance Mars Rover Closer to Launch
 Text: A NASA Wallops Flight Facility cargo plane transported more than two tons of equipment — including the rover's sample collection tubes — to Florida for this summer's liftoff.


## JPL Mars Space Images - Featured Image

### Navigate to the specifified url

In [5]:
# Visit JPL Mars Space Images through splinter module
# This is a 3 step process

jpl_url = 'https://www.jpl.nasa.gov'
featured_page_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    
# Invoke the chromebrowser and navigate to the 'featured_page_url' and get the html object to it
browser.visit(featured_page_url)

# HTML Object 
html_image = browser.html

# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_image, 'html.parser')

### Get the url to the details page

In [6]:
# Get the url for the detailed image
featured_url = soup.find('a', class_='button fancybox')['data-link']
#print(featured_url)
image_detail_url = jpl_url + featured_url

# Print the URL for the image to be used for getting the fullsize image URL
print(image_detail_url)

https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA17841


### Extract the full image url from the details html page.

In [7]:
# Use the URL from above to invoke the html page and extract the url for the Largeimage
browser.visit(image_detail_url)

# wait for 1 secs for the browser to load
time.sleep(1)

# HTML object
html = browser.html

# Parse HTML with Beautiful Soup for the html page returned from above
soup = BeautifulSoup(html, 'html.parser')
fullres_url = soup.find('article').find('figure', class_='lede').find('a')['href']
featured_image_url = jpl_url + fullres_url

# Print the URL for the largesize image
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17841_hires.jpg


## Mars weather

### Visit the Mars Weather Twitter account
twitter account - https://twitter.com/marswxreport?lang=en here and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called mars_weather.

In [8]:
# Invoke the chromedriver/browser to read the html pages for scraping/parsing the data.
# executable_path = {'executable_path': 'chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=False)

twitter_mars_url = 'https://twitter.com/marswxreport?lang=en'

# Invoke the chromebrowser and navigate to the 'twitter_mars_url' and get the html object to it
browser.visit(twitter_mars_url)

# wait for 10 secs for the browser to load
time.sleep(10)

# HTML object
html = browser.html

# Parse HTML with Beautiful Soup for the html page returned from above.
soup = BeautifulSoup(html, 'html.parser')

# Parse HTML with Beautiful Soup for the html page returned from above.
# Find all the tweets by 'div' and  class', then get the first tweet for the latest weather info.

tweets = soup.find_all('div', class_='css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0')

# Get the latest tweet and replace the new line ('\n') character with a space (' ') character.
latest_tweet = tweets[0].find('span').text.replace('\n', ' ')

# Print out the weather, use the new line ('\n') and line break ('\r') characters to make the printout readable.
print(latest_tweet.replace(')', ')\n\r'))

InSight sol 528 (2020-05-22)
 low -93.5ºC (-136.3ºF)
 high -5.3ºC (22.4ºF)
 winds from the SW at 6.3 m/s (14.0 mph)
 gusting to 23.6 m/s (52.8 mph)
 pressure at 7.10 hPa


## Mars Facts

### Visit the Mars Facts Webpage and get the facts using Pandas
Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

In [9]:
#Mars space facts url
mars_facts_url = 'https://space-facts.com/mars/'

#use Pandas to scrape the planet profile
mars_facts = pd.read_html(mars_facts_url)
mars_facts_df = mars_facts[0]

In [10]:
mars_facts_list = []
for index, row in mars_facts_df.iterrows():
    # print(index, row[0], row[1])
    mars_facts_list.append({'#' : index,'Fact' : row[0], 'Detail' : row[1]})        
mars_facts_list    

[{'#': 0, 'Fact': 'Equatorial Diameter:', 'Detail': '6,792 km'},
 {'#': 1, 'Fact': 'Polar Diameter:', 'Detail': '6,752 km'},
 {'#': 2, 'Fact': 'Mass:', 'Detail': '6.39 × 10^23 kg (0.11 Earths)'},
 {'#': 3, 'Fact': 'Moons:', 'Detail': '2 (Phobos & Deimos)'},
 {'#': 4, 'Fact': 'Orbit Distance:', 'Detail': '227,943,824 km (1.38 AU)'},
 {'#': 5, 'Fact': 'Orbit Period:', 'Detail': '687 days (1.9 years)'},
 {'#': 6, 'Fact': 'Surface Temperature:', 'Detail': '-87 to -5 °C'},
 {'#': 7, 'Fact': 'First Record:', 'Detail': '2nd millennium BC'},
 {'#': 8, 'Fact': 'Recorded By:', 'Detail': 'Egyptian astronomers'}]

In [11]:
mars_facts_list = mars_facts_df.values.tolist()
print(mars_facts_list)

[['Equatorial Diameter:', '6,792 km'], ['Polar Diameter:', '6,752 km'], ['Mass:', '6.39 × 10^23 kg (0.11 Earths)'], ['Moons:', '2 (Phobos & Deimos)'], ['Orbit Distance:', '227,943,824 km (1.38 AU)'], ['Orbit Period:', '687 days (1.9 years)'], ['Surface Temperature:', '-87 to -5 °C'], ['First Record:', '2nd millennium BC'], ['Recorded By:', 'Egyptian astronomers']]


In [12]:
# # Name the columns of the DataFrame
# mars_facts_df.columns = ['Fact', 'Detail']
# mars_facts_df.set_index('Fact', inplace=True)

# # Display the Mars Facts DataFrame
# mars_facts_df

###  Use Pandas to convert the data to a HTML table string.

In [None]:
# Pandas Dataframe converted to html table
mars_facts_table_html = mars_facts_df.to_html()
print(mars_facts_table_html)

In [None]:
# html version with new line charcaters removed.
mars_facts_table_html = mars_facts_table_html.replace("\n", "")
mars_facts_table_html

## Mars Hemispheres
* Visit the USGS Astrogeology site (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high resolution images for each of Mar's hemispheres.
* You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
* Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.
* Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

Example:
```
hemisphere_image_urls = [
{"title": "Valles Marineris Hemisphere", "img_url": "..."},
{"title": "Cerberus Hemisphere", "img_url": "..."},
{"title": "Schiaparelli Hemisphere", "img_url": "..."},
{"title": "Syrtis Major Hemisphere", "img_url": "..."},
]
```

In [None]:
# Visit hemispheres website through splinter module 
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

# wait for 2 secs for the browser to load
time.sleep(2)

# HTML Object
html_hemispheres = browser.html

# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_hemispheres, 'html.parser')

# Retreive all items that contain mars hemispheres information
items_list = soup.find_all('div', class_='item')

# Display the Item list with the hemispheres information
items_list

In [None]:
# Store the base url 
hemispheres_base_url = 'https://astrogeology.usgs.gov'

# Create empty list for hemisphere urls 
hemispheres_image_urls = []

# Loop through the items_list retrived in teh previous step
for i in items_list: 
    # Store title
    title = i.find('h3').text
    
    # Store link that leads to full image website
    partial_img_url = i.find('a', class_='itemLink product-item')['href']
    
    # Visit the link that contains the full image website 
    browser.visit(hemispheres_base_url + partial_img_url)
    
    # HTML Object of individual hemisphere information website 
    html = browser.html
    
    # Parse HTML with Beautiful Soup for every individual hemisphere information website 
    soup = BeautifulSoup(html, 'html.parser')
    
    # Retrieve full image source 
    img_url = hemispheres_base_url + soup.find('img', class_='wide-image')['src']
    
    # Append the retreived information into a list of dictionaries 
    hemispheres_image_urls.append({'title' : title, 'img_url' : img_url})

#Quit the browser before exiting the application
browser.quit()
    
# Display the hemispheres image URL list    
hemispheres_image_urls

In [None]:
?hemispheres_image_urls

In [None]:
# 2020-05-19 17:56:50 