# Web Scraping Homework - Mission to Mars

In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
#import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import pymongo
from flask import Flask, render_template, redirect
from flask_pymongo import PyMongo

In [3]:
# Setup config variables to enable Splinter interaction with browser
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Driver [C:\Users\jchan\.wdm\drivers\chromedriver\win32\93.0.4577.63\chromedriver.exe] found in cache


<strong> Hint:</strong> Use Splinter to navigate the sites when needed and BeautifulSoup to help find and parse out the necessary data.

In [4]:
# Create dictionary to store news
scraped_data = {}

## NASA Mars News

Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [5]:
# Visit Nasa news url through splinter module
nasa_url = "https://redplanetscience.com" 
browser.visit(nasa_url)

In [6]:
html = browser.html
# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(html, 'html.parser')

In [7]:
# Get news title and news text by searching for appropriate div class 
news_title = soup.find('div', class_='content_title').text
news_p = soup.find('div', class_='article_teaser_body').text
print(news_title)
print(news_p)

NASA Invites Public to Share Excitement of Mars 2020 Perseverance Rover Launch
There are lots of ways to participate in the historic event, which is targeted for July 30.


In [8]:
# Create dictionary to store data and save entries
scrape_nasa_news={"Title":news_title, "Paragraph":news_p}
scrape_nasa_news

{'Title': 'NASA Invites Public to Share Excitement of Mars 2020 Perseverance Rover Launch',
 'Paragraph': 'There are lots of ways to participate in the historic event, which is targeted for July 30.'}

In [9]:
# Save scraped data as a new entry in the dictionary
scraped_data ["Title"] = news_title
scraped_data["Paragraph"] = news_p

## JPL Mars Space Images - Featured Image

- Visit the url for JPL Featured Space Image [here](https://spaceimages-mars.com).
- Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.
- Find the image url to the full size .jpg image. Make sure to save a complete url string for this image.

In [10]:
mars_url = "https://spaceimages-mars.com" 
browser.visit( mars_url)
image_html = browser.html

# Create BeautifulSoup object; parse with 'html.parser'
soup = bs( image_html, "html.parser")

In [11]:
featured_image = soup.find_all("img", class_ = "headerimage fade-in")[0]["src"]
featured_image_url = mars_url + "/" + featured_image
print(featured_image_url)

https://spaceimages-mars.com/image/featured/mars2.jpg


In [12]:
# Create dictionary to store data and save entries
jpl = {"img_url":featured_image_url}
jpl

{'img_url': 'https://spaceimages-mars.com/image/featured/mars2.jpg'}

In [13]:
# Save scraped data as a new entry in the dictionary
scraped_data["img_url"] = featured_image_url

In [14]:
browser.quit()

## Mars Facts

Visit the Mars Facts webpage [here](https://galaxyfacts-mars.com/) and use Pandas to scrape the table containing facts about the planet.
Use Pandas to convert the data to a HTML table string.

In [15]:
facts_url = "https://galaxyfacts-mars.com/"
facts_data = pd.read_html(facts_url)
facts_data

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [16]:
facts_df = facts_data[0]
facts_table = facts_df.to_html(index=False)
facts_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Mars - Earth Comparison</td>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <td>Diameter:</td>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Distance from Sun:</td>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <td>Length of Year:</td>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <td>Temperature:</td>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>\n</table>'

In [17]:
# Check out table
facts_table.replace("\n", "")
print(facts_table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>0</th>
      <th>1</th>
      <th>2</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Mars - Earth Comparison</td>
      <td>Mars</td>
      <td>Earth</td>
    </tr>
    <tr>
      <td>Diameter:</td>
      <td>6,779 km</td>
      <td>12,742 km</td>
    </tr>
    <tr>
      <td>Mass:</td>
      <td>6.39 × 10^23 kg</td>
      <td>5.97 × 10^24 kg</td>
    </tr>
    <tr>
      <td>Moons:</td>
      <td>2</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Distance from Sun:</td>
      <td>227,943,824 km</td>
      <td>149,598,262 km</td>
    </tr>
    <tr>
      <td>Length of Year:</td>
      <td>687 Earth days</td>
      <td>365.24 days</td>
    </tr>
    <tr>
      <td>Temperature:</td>
      <td>-87 to -5 °C</td>
      <td>-88 to 58°C</td>
    </tr>
  </tbody>
</table>


In [18]:
# Create dictionary to store data and save entries
mars_facts = {"htmlTable":facts_data}

# Save scraped data as a new entry in the dictionary
scraped_data["htmlTable"] = facts_df.to_html

## Mars Hemispheres:

Visit the USGS Astrogeology site [here](https://marshemispheres.com/) to obtain high resolution images for each of Mar's hemispheres.

You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.

Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.

Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

<strong>Example:</strong> hemisphere_image_urls = [ {"title": "Valles Marineris Hemisphere", "img_url": "..."}, {"title": "Cerberus Hemisphere", "img_url": "..."}, {"title": "Schiaparelli Hemisphere", "img_url": "..."}, {"title": "Syrtis Major Hemisphere", "img_url": "..."}, ]

In [27]:
# Visit hemisphere url through splinter module
#Mars Hemispheres
# executable_path = {'executable_path': ChromeDriverManager().install()}
# browser = Browser('chrome', **executable_path, headless=False)

hem_url = 'https://marshemispheres.com/'
browser.visit(hem_url)

In [28]:
html = browser.html
# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(html, 'html.parser')

In [29]:
hemisphere_links = soup.find_all('div', class_='item')

In [31]:
hemisphere_img_urls = []

# Create list of dictionaries for each hemisphere and append the dict 
# with an image URL string and title.
for i in range(4):
    try:
        browser.find_by_tag('h3')[i].click()
        soup = bs(browser.html, 'html.parser')
        img_title = soup.find("h2", class_="title").text
        img_url = soup.find("a", text ="Sample")["href"]
        hem_img_url = hem_url + img_url
        
        if (img_title and hem_img_url):
            # Print results
            print('-'*50)
            print(img_title)
            print(hem_img_url)
        # Create dictionary for title and url
        hem_dic = {"title": img_title, "img_url": hem_img_url}
        hemisphere_img_urls.append(hem_dic)
    except Exception as e:
        print(e)
#         browser.back()

# hemisphere_img_urls

--------------------------------------------------
Syrtis Major Hemisphere Enhanced
https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg
'NoneType' object has no attribute 'text'
--------------------------------------------------
Syrtis Major Hemisphere Enhanced
https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg
no elements could be found with tag_name "h3"


In [None]:
# Create dictionary to store data and save entries
usgs = {"hemiImages":hemisphere_img_urls}
usgs

In [None]:
# Save scraped data as a new entry in the dictionary
scraped_data["hemiImages"] = hemisphere_img_urls

In [None]:
browser.quit()

In [None]:
# Define mars dictionary
mars_dict ={"news_title": news_title, "news_p": news_p, "featured_image_url": featured_image_url, 
            "htmlTable": facts_table, "hem_url":hem_url}
mars_dict

In [None]:
# Create list of all dictionaries with scraped data
scraped_data = [scrape_nasa_news, jpl, mars_facts, usgs]
scraped_data

## MongoDB and Flask Application

- Use MongoDB with Flask templating to create a new HTML page that displays all of the information that was scraped from the URLs above.
- Start by converting your Jupyter notebook into a Python script called scrape_mars.py with a function called scrape that will execute all of your scraping code from above and return one Python dictionary containing all of the scraped data.
- Next, create a route called /scrape that will import your scrape_mars.py script and call your scrape function.
- Store the return value in Mongo as a Python dictionary.
- Create a root route / that will query your Mongo database and pass the mars data into an HTML template to display the data.
- Create a template HTML file called index.html that will take the mars data dictionary and display all of the data in the appropriate HTML elements. Use the following as a guide for what the final product should look like, but feel free to create your own design.

In [None]:
# Use flask_pymongo to set up mongo connection
# conn =  "mongodb://localhost:27017/mars_mission_scraping"
# client =  pymongo.MongoClient(conn)

In [None]:
# Get collection and drop existing data for this application
# db = client.mars_mission_scraping
# db.mars_data.drop()

In [None]:
# db.mars_data.insert_many([scraped_data])