# Mission to Mars: Homework on Scraping and Web Analysis: Alternative Approach

##Scrape https://mars.nasa.gov/news/ and collect News Titles and Paragraphs

In [83]:
#Import dependencies. Will use pandas for processing, BeautifulSoup to grab data, requests to pull html text, and MongoDB
#and Splinter for repository data storage.
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import pymongo
from splinter import Browser

In [84]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# 1. Mars News Articles

### URL for collecting html data on Mars

In [3]:
url = "https://mars.nasa.gov/news"

In [4]:
###Tried a request approach, but didn't give all of the data. Chris suggested splinter approach.
###Setup response variables to collect url html data. Then convert to BeautifulSoup object and
###extract the titles and teaser by 'div' and 'content_title' and 'rollover_description_inner'.

### Browser.visit to view URL 

In [85]:
browser.visit(url)

In [63]:
#for x in range(1,11):

#Following code from https://stackoverflow.com/questions/29773368/splinter-how-to-click-a-link-button-implemented-as-a-div-or-span-element
    #button = browser.find_by_name('More')
    #button.click()
    #browser.click_link_by_partial_text('More')
    
#When using this code below, it only reloads the same page! Actual website loads more articles.

### Alternative approach: Grab data from 'li' tags as a list, then iterate to extract title, paragraph text, and reference for making a list of dictionaries suitable for MongoDB.

In [74]:
#Alternative try to grab <li> tags, which have all of the articles.
#Then use iteration for loop to extract the title, teaser paragraph, and reference.
#Empty list for appending dictionaries of data for articles.
top_data = []

#Iteration knowing 40 articles per page and target of 400 articles
#for x in range(1,11):
    
html = browser.html
soup = bs(html, 'html.parser')

init_data = soup.find_all('li', class_='slide')

#Construct a list of dictionaries with these data for incorporation into MongoDB. Include
#'mars.nasa.gov' to enable direct use of the reference as a URL.
for item in init_data:
    title = item.find('div', class_='content_title').text
    teaser = item.find('div', class_='rollover_description_inner').text
    ref = 'mars.nasa.gov' + item.a['href']
    
    dictionary = {
        'title': title,
        'teaser': teaser,
        'reference': ref  
    }
    
    top_data.append(dictionary)

#Following code from https://stackoverflow.com/questions/29773368/splinter-how-to-click-a-link-button-implemented-as-a-div-or-span-element
#button = browser.find_by_name('More')
#button.click()
#browser.click_link_by_text('More')
#browser.click_link_by_partial_text('More')

#Link address for "More" button   
#https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest#    

#This range/browser.click_link for 'More' didn't seem to work. I was able to manually
#click the More link and obtain a lot more articles. There should be a way to adapt
#the click_link_by_partial_text('More') to obtain a given number of articles.

### Check top_data list of dictionaries to confirm formatting and data

In [75]:
len(top_data)

200

In [76]:
top_data[0]

{'title': "Media Get a Close-Up of NASA's Mars 2020 Rover",
 'teaser': "The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch.",
 'reference': 'mars.nasa.gov/news/8578/media-get-a-close-up-of-nasas-mars-2020-rover/'}

In [77]:
top_data[40]

{'title': "NASA's Mars 2020 Rover Does Biceps Curls ",
 'teaser': "In this time-lapse video, the robotic arm on NASA's Mars 2020 rover maneuvers its 88-pound (40-kilogram) sensor-laden turret as it moves from a deployed to stowed configuration.",
 'reference': 'mars.nasa.gov/news/8496/nasas-mars-2020-rover-does-biceps-curls/'}

# 2. JPL Mars Space Images

### Url for Mars Image

In [86]:
url_2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

### Visit url with browser, then create BeautifulSoup object to grab image information.

In [87]:
browser.visit(url_2)

In [88]:
html_2 = browser.html
soup_2 = bs(html_2, 'html.parser')

In [161]:
raw_image_data = soup_2.find_all('article', class_="carousel_item")
raw_image_data

[<article alt="A Tale of Two Worlds: Silicate Versus Carbon Planets (Artist Concept)" class="carousel_item" style="background-image: url('/spaceimages/images/wallpaper/PIA17550-1920x1200.jpg');">
 <div class="default floating_text_area ms-layer">
 <h2 class="category_title">
 </h2>
 <h2 class="brand_title">
 				  FEATURED IMAGE
 				</h2>
 <h1 class="media_feature_title">
 				  A Tale of Two Worlds: Silicate Versus Carbon Planets (Artist Concept)				</h1>
 <div class="description">
 </div>
 <footer>
 <a class="button fancybox" data-description="This artist's concept illustrates the fate of two different planets: the one on the left is similar to Earth, made up largely of silicate-based rocks with oceans coating its surface." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA17550_ip.jpg" data-link="/spaceimages/details.php?id=PIA17550" data-title="A Tale of Two Worlds: Silicate Versus Carbon Planets (Artist Concept)" id="full_image">
 					FULL IMAGE


### raw_image_data is a list, so take element [0] to extract string data on the 'style' tag. Extract the reference url for the full-size image and add the full information for generating the complete url. 

In [176]:
# Little tricky because only one element in list. j element ([0]) and isolate
# the 'style' tag. The url is not a tag, so can't use it to isolate the url information.
for j in raw_image_data:
    raw_url = j['style']

In [177]:
featured_image_url = 'jpl.nasa.gov' + raw_url.split("'")[1]

In [178]:
featured_image_url

'jpl.nasa.gov/spaceimages/images/wallpaper/PIA17550-1920x1200.jpg'