# Mission to Mars: Homework on Scraping and Web Analysis: Alternative Approach

##Scrape https://mars.nasa.gov/news/ and collect News Titles and Paragraphs

In [1]:
#Import dependencies. Will use pandas for processing, BeautifulSoup to grab data, requests to pull html text, and MongoDB
#and Splinter for repository data storage.
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import pymongo
from splinter import Browser

In [2]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# 1. Mars News Articles

### URL for collecting html data on Mars

In [30]:
url = "https://mars.nasa.gov/news"

In [31]:
###Tried a request approach, but didn't give all of the data. Chris suggested splinter approach.
###Setup response variables to collect url html data. Then convert to BeautifulSoup object and
###extract the titles and teaser by 'div' and 'content_title' and 'rollover_description_inner'.

### Browser.visit to view URL 

In [32]:
browser.visit(url)

In [33]:
#for x in range(1,11):

#Following code from https://stackoverflow.com/questions/29773368/splinter-how-to-click-a-link-button-implemented-as-a-div-or-span-element
    #button = browser.find_by_name('More')
    #button.click()
    #browser.click_link_by_partial_text('More')
    
#When using this code below, it only reloads the same page! Actual website loads more articles.

### Alternative approach: Grab data from 'li' tags as a list, then iterate to extract title, paragraph text, and reference for making a list of dictionaries suitable for MongoDB.

In [34]:
#Alternative try to grab <li> tags, which have all of the articles.
#Then use iteration for loop to extract the title, teaser paragraph, and reference.
#Empty list for appending dictionaries of data for articles.
top_data = []

#Iteration knowing 40 articles per page and target of 400 articles
#for x in range(1,11):
    
html = browser.html
soup = bs(html, 'html.parser')

init_data = soup.find_all('li', class_='slide')

#Construct a list of dictionaries with these data for incorporation into MongoDB. Include
#'mars.nasa.gov' to enable direct use of the reference as a URL.
for item in init_data:
    title = item.find('div', class_='content_title').text
    teaser = item.find('div', class_='rollover_description_inner').text
    ref = 'mars.nasa.gov' + item.a['href']
    
    dictionary = {
        'title': title,
        'teaser': teaser,
        'reference': ref  
    }
    
    top_data.append(dictionary)

#Following code from https://stackoverflow.com/questions/29773368/splinter-how-to-click-a-link-button-implemented-as-a-div-or-span-element
#button = browser.find_by_name('More')
#button.click()
#browser.click_link_by_text('More')
#browser.click_link_by_partial_text('More')

#Link address for "More" button   
#https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest#    

#This range/browser.click_link for 'More' didn't seem to work. I was able to manually
#click the More link and obtain a lot more articles. There should be a way to adapt
#the click_link_by_partial_text('More') to obtain a given number of articles.

### Check top_data list of dictionaries to confirm formatting and data

In [35]:
len(top_data)

40

In [36]:
top_data[0]

{'title': "Media Get a Close-Up of NASA's Mars 2020 Rover",
 'teaser': "The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch.",
 'reference': 'mars.nasa.gov/news/8578/media-get-a-close-up-of-nasas-mars-2020-rover/'}

In [37]:
top_data[39]

{'title': 'MEDLI2 Installation on Mars 2020 Aeroshell Begins',
 'teaser': "Hardware installed onto NASA's Mars 2020 entry vehicle this week will help to increase the safety of future Mars landings.",
 'reference': 'mars.nasa.gov/news/8497/medli2-installation-on-mars-2020-aeroshell-begins/'}

# 2. JPL Mars Space Images

### Url for Mars Image

In [38]:
url_2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

### Visit url with browser, then create BeautifulSoup object to grab image information.

In [39]:
browser.visit(url_2)

In [40]:
html_2 = browser.html
soup_2 = bs(html_2, 'html.parser')

In [41]:
raw_image_data = soup_2.find_all('article', class_="carousel_item")
raw_image_data

[<article alt="Red Arcs on Tethys" class="carousel_item" style="background-image: url('/spaceimages/images/wallpaper/PIA19637-1920x1200.jpg');">
 <div class="default floating_text_area ms-layer">
 <h2 class="category_title">
 </h2>
 <h2 class="brand_title">
 				  FEATURED IMAGE
 				</h2>
 <h1 class="media_feature_title">
 				  Red Arcs on Tethys				</h1>
 <div class="description">
 </div>
 <footer>
 <a class="button fancybox" data-description="Unusual arc-shaped, reddish streaks cut across the surface of Saturn's ice-rich moon Tethys in this enhanced-color mosaic from NASA's Cassini spacecraft." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA19637_ip.jpg" data-link="/spaceimages/details.php?id=PIA19637" data-title="Red Arcs on Tethys" id="full_image">
 					FULL IMAGE
 				  </a>
 </footer>
 </div>
 <div class="gradient_container_top"></div>
 <div class="gradient_container_bottom"></div>
 </article>]

### raw_image_data is a list, so take element [0] to extract string data on the 'style' tag. Extract the reference url for the full-size image and add the full information for generating the complete url. 

In [42]:
# Little tricky because only one element in list. j element ([0]) and isolate
# the 'style' tag. The url is not a tag, so can't use it to isolate the url information.
for j in raw_image_data:
    raw_url = j['style']

In [43]:
featured_image_url = 'jpl.nasa.gov' + raw_url.split("'")[1]

In [44]:
featured_image_url

'jpl.nasa.gov/spaceimages/images/wallpaper/PIA19637-1920x1200.jpg'

# 3. Mars Weather

### The url for Mars weather via twitter.com.

In [45]:
url_3 = 'https://twitter.com/marswxreport?lang=en'

In [46]:
browser.visit(url_3)

### Setup browser.html and create BeautifulSoup object for parsing.

In [47]:
html_3 = browser.html
soup_3 = bs(html_3, 'html.parser')

### In other cases, used find_all to get all instances. I think this time we want the most recent weather data, so used .find. 

In [48]:
mars_weather_raw = soup_3.find('div', class_="js-tweet-text-container")
mars_weather_raw

<div class="js-tweet-text-container">
<p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" data-aria-label-part="0" lang="en">InSight sol 397 (2020-01-08) low -100.1ºC (-148.1ºF) high -15.7ºC (3.7ºF)
winds from the SSE at 6.7 m/s (15.0 mph) gusting to 21.2 m/s (47.5 mph)
pressure at 6.40 hPa<a class="twitter-timeline-link u-hidden" data-pre-embedded="true" dir="ltr" href="https://t.co/e4wIfniCoX">pic.twitter.com/e4wIfniCoX</a></p>
</div>

### Extract current Mars weather from the twitter url current data.

In [50]:
current_mars_weather = mars_weather_raw.p.text.replace('\n', ', ').split('pic')[0]
current_mars_weather

'InSight sol 397 (2020-01-08) low -100.1ºC (-148.1ºF) high -15.7ºC (3.7ºF), winds from the SSE at 6.7 m/s (15.0 mph) gusting to 21.2 m/s (47.5 mph), pressure at 6.40 hPa'

# 4. Mars Facts 

### New url for Mars Facts Website 

In [55]:
url_4 = 'https://space-facts.com/mars/'

### Use Pandas to extract table information and reference for inclusion in website.

In [60]:
tables = pd.read_html(url_4)

In [61]:
len(tables)

3

In [None]:
# Tables[0] and [1] have relevant data for use.

In [72]:
mars_data = tables[0]

In [73]:
mars_comparison_data = tables[1]

In [None]:
# Rename columns and set index to Parameters

In [81]:
renamed_mars_data = mars_data.rename(columns = {0: 'Parameters', 1: 'Measurements'})
renamed_mars_data

Unnamed: 0,Parameters,Measurements
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [82]:
mars_comparison_data

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-153 to 20 °C,-88 to 58°C


### Export dataframes as html formatted tables.

In [85]:
renamed_mars_data.to_html('Mars_Data_Table.html')

In [86]:
mars_comparison_data.to_html('Mars_Earth_Comparison_Data.html')

# 5. Mars Hemispheres

### URLs for high resolution images of Martian hemispheres. Attepmted to find a single webpage with all of these links, but did not find one. Will have to scrape each individual webpage, one for each Martian hemisphere. Main URL for website below.

In [135]:
#url_5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

### Cerberus Hemisphere Enhanced

In [129]:
url_5a = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'

In [130]:
browser.visit(url_5a)

In [131]:
html_5a = browser.html
soup_5a = bs(html_5a, 'html.parser')

In [132]:
cerberus_raw = soup_5a.find_all('img', class_="wide-image")

In [133]:
for data in cerberus_raw:
    cerberus_url = 'astrogeology.usgs.gov' + data['src']

In [134]:
cerberus_url

'astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'

### Schiaparelli Hemisphere Enhanced

In [137]:
url_5b = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'

In [138]:
browser.visit(url_5b)

In [139]:
html_5b = browser.html
soup_5b = bs(html_5b, 'html.parser')

In [143]:
schiaparelli_raw = soup_5b.find_all('img', class_="wide-image")

In [144]:
for data in schiaparelli_raw:
    schiaparelli_url = 'astrogeology.usgs.gov' + data['src']

In [145]:
schiaparelli_url

'astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'

### Syrtis Major Hemisphere Enhanced 

In [146]:
url_5c = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'

In [147]:
browser.visit(url_5c)

In [149]:
html_5c = browser.html
soup_5c = bs(html_5c, 'html.parser')

In [150]:
syrtis_major_raw = soup_5c.find_all('img', class_="wide-image")

In [151]:
for data in syrtis_major_raw:
    syrtis_major_url = 'astrogeology.usgs.gov' + data['src']

In [152]:
syrtis_major_url

'astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'

### Valles Marineris Hemisphere Enhanced

In [153]:
url_5d = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'

In [154]:
browser.visit(url_5d)

In [155]:
html_5d = browser.html
soup_5d = bs(html_5d, 'html.parser')

In [156]:
valles_marineris_raw = soup_5d.find_all('img', class_="wide-image")

In [157]:
for data in valles_marineris_raw:
    valles_marineris_url = 'astrogeology.usgs.gov' + data['src']

In [158]:
valles_marineris_url

'astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'

## Construct image titles and url's list of dictionaries

In [174]:
titles = ['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']
urls = [cerberus_url, schiaparelli_url, syrtis_major_url, valles_marineris_url]

image_urls= []

for x in range(len(titles)):
    dictionary = {
    'title': titles[x], 'img-url': urls[x]    
    }
    
    image_urls.append(dictionary)


In [175]:
image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img-url': 'astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img-url': 'astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img-url': 'astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img-url': 'astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]