# Scraper Builder Tester 


In [59]:
#################################
# DEPENDENCIES:
#################################
from splinter import Browser
from bs4 import BeautifulSoup
import time
import pandas as pd

In [60]:
#################################
# FUNCTION TO INITIALIZE BROWSER USING CHROME DRIVER 
# - LOCATED IN THE SAME FOLDER
#################################
def init_browser():
    executable_path = {"executable_path": "chromedriver"}
    return Browser("chrome", **executable_path, headless=True)


In [61]:
#################################
# FUNCTION TO SCRAPE NASA
# NASA
# https://mars.nasa.gov/news/
#################################
def scrape_nasa():
    browser = init_browser()

    nasa_dict = {
        "news_title": "",
        "news_p": ""
    }


    url = "https://mars.nasa.gov/news/"
    browser.visit(url)

    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    nasa_dict["news_title"] = soup.find("div", class_="content_title").get_text()
    nasa_dict["news_p"] = soup.find("div", class_="article_teaser_body").get_text()


    browser.quit()

    return nasa_dict

#################################
# VALIDATE:
validate_nasa = scrape_nasa()
print(validate_nasa["news_title"])
print(validate_nasa["news_p"])


Robotic Toolkit Added to NASA's Mars 2020 Rover
The bit carousel, which lies at the heart of the rover's Sample Caching System, is now aboard NASA's newest rover. 


In [63]:
#################################
# FUNCTION TO SCRAPE JPL
# JPL
# https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
#################################
def scrape_jpl():
    browser = init_browser()

    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")


    raw_ending_url = soup.find("div", class_="carousel_items").find("article")["style"] 
    # check the result:
    # raw_ending_url


    ending_url_list = raw_ending_url.split("'")
    # check the result:
    # ending_url_list[1]


    base_url = "https://www.jpl.nasa.gov"
    featured_image_url = base_url + ending_url_list[1]


    browser.quit()
    
    return featured_image_url


#################################
# VALIDATE:
validate_jpl = scrape_jpl()
print(validate_jpl)


https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA17838-1920x1200.jpg


In [12]:
##################################################################
# SKIP: CANNOT BE DONE DUE TO TWITTER ENCRYPTION 
##################################################################
# FUNCTION TO SCRAPE MARS WEATHER
# MWEATHER
# https://twitter.com/marswxreport?lang=en

# Visit the Mars Weather twitter account here and scrape the latest Mars 
# weather tweet from the page. Save the tweet text for the weather report 
# as a variable called mars_weather.
#################################


In [64]:
#################################
# FUNCTION TO SCRAPE MARS FACTS
# MFACTS
# https://space-facts.com/mars/

# Mars Facts

# Visit the Mars Facts webpage here and use Pandas to scrape 
# the table containing facts about the planet including Diameter, Mass, etc.

# Use Pandas to convert the data to a HTML table string.
#################################
def scrape_mfacts():
    url = "https://space-facts.com/mars/"

    # Use PANDAS to scrape:
    tableslist = pd.read_html(url)

    table_df = tableslist[1]


    # if necessary: named the columns:
    final_table_df = table_df.rename(columns={
        0: 'fact',
        1: 'value'
    })
    
    # Use PANDAS to convert data to HTML table string: 
    html_table = final_table_df.to_html(index=False, justify="left")
    html_table.replace('\n', '')

    return final_table_df


#################################
# VALIDATE:
validate_mfacts = scrape_mfacts()
print(validate_mfacts)


                   fact                          value
0  Equatorial Diameter:                       6,792 km
1       Polar Diameter:                       6,752 km
2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
3                Moons:            2 (Phobos & Deimos)
4       Orbit Distance:       227,943,824 km (1.38 AU)
5         Orbit Period:           687 days (1.9 years)
6  Surface Temperature:                   -87 to -5 °C
7         First Record:              2nd millennium BC
8          Recorded By:           Egyptian astronomers


In [67]:
#################################
# FUNCTION TO SCRAPE MARS HEMISPHERE
# MHEMI
# https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars
#################################
def scrape_mhemi():
    browser = init_browser()

    # This list will contain one dictionary for each hemisphere:
    hemisphere_image_urls = []


    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")


    linkresults = soup.find_all('a', class_='itemLink')

    #################################
    # Pull out the image partial links and their titles:
    #################################
    for x in linkresults:

        raw_title = x.find('img')
        link = x['href']

        if raw_title and link:
            title = raw_title['alt']
            final_title = title.replace(" Enhanced thumbnail", "")

            # click on the link to retrieve the the full res image url:
            browser.click_link_by_partial_text(final_title)
            time.sleep(1)
            html2 = browser.html
            soup2 = BeautifulSoup(html2, "html.parser")


            newresults = soup2.find_all('a')
            # loop through the a tags to find the right link:
            for hemi in newresults:
                if hemi.text == 'Sample':
                    newlink = hemi['href']


            #Use a Python dictionary to store the data using the keys img_url and title:
            temp_dict = {
                "title": final_title,
                "img_url": newlink
            }

            # Append the dictionary with the image url string and the hemisphere title to a list
            hemisphere_image_urls.append(temp_dict)


    # As of 08/18/2019 the original link stopped working (error 404). Here is my workaround:       
    if not hemisphere_image_urls:
        temp_dict = {
                "title": "Cerberus Hemisphere",
                "img_url": "https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)

        temp_dict = {
                "title": "Valles Marineris Hemisphere",
                "img_url": "https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)

        temp_dict = {
                "title": "Syrtis Major Hemisphere",
                "img_url": "https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)

        temp_dict = {
                "title": "Schiaparelli Hemisphere",
                "img_url": "https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)


    #################################
    # Clean up:
    #################################
    browser.quit()

    return hemisphere_image_urls



#################################
# VALIDATE:
validate_mhemi = scrape_mhemi()
for x in validate_mhemi:
    print(x["title"])
    print(x["img_url"])
    

Cerberus Hemisphere
https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg
Valles Marineris Hemisphere
https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg
Syrtis Major Hemisphere
https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg
Schiaparelli Hemisphere
https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg


In [77]:

#################################
# TO TEST THE NESTED FOR LOOP ABOVE
#################################
browser = init_browser()
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)
time.sleep(1)

browser.click_link_by_partial_text('Hemisphere')

time.sleep(1)
html2 = browser.html
soup2 = BeautifulSoup(html2, "html.parser")

newresults = soup2.find_all('a')

for x in newresults:
    if x.text == "Sample":
        print(x['href'])

browser.quit()

newresults

http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg


[<a href="https://www.usgs.gov/centers/astrogeo-sc" style="float:right;margin-top:10px;">
 <img alt="USGS: Science for a Changing World" class="logo" height="60" src="/images/usgs_logo_main_2x.png"/>
 </a>,
 <a href="https://nasa.gov" style="float:right;margin-top:5px;margin-right:20px;">
 <img alt="NASA" class="logo" height="65" src="/images/logos/nasa-logo-web-med.png"/>
 </a>,
 <a href="https://pds-imaging.jpl.nasa.gov/" style="float:right;margin-top:5px;margin-right: 10px;">
 <img alt="PDS Cartography and Imaging Science Node" class="logo" height="65" src="/images/pds_logo-invisible-web.png"/>
 </a>,
 <a href="/search" style="float:right;text-decoration:none;">
 <img alt="Astropedia" src="/images/astropedia/astropedia-logo-main.png" style="width:200px;border:none;float:right;"/>
 <div style="clear:both;font-size:.8em;float:right;color:#888;">Lunar and Planetary Cartographic Catalog</div>
 </a>,
 <a href="http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced

In [69]:
##################################################################################
# Combined the above functions into one scrape function: 
# this was used here to test what I coded in the scrape_mars.py
##################################################################################


#########################################
# DEPENDENCIES:
#########################################
from splinter import Browser
from bs4 import BeautifulSoup
import time
import pandas as pd


#########################################
# FUNCTION TO INITIALIZE BROWSER USING CHROME DRIVER 
# - PLEASE MAKE SURE YOUR CHROMEDRIVER IS LOCATED IN THE SAME FOLDER
#########################################
def init_browser():
    executable_path = {"executable_path": "chromedriver"}
    return Browser("chrome", **executable_path, headless=True)


#########################################
# FUNCTION TO SCRAPE THE PREDEFINED WEBSITES 
#########################################
def scrape():

    # Define Variables:
    #####################################################################################
    # Nasa Data:
    nasa_url = "https://mars.nasa.gov/news/"
    nasa_dict = {
        "news_title": "",
        "news_p": ""
    }

    # JPL Data:
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    featured_image_url = ""

    # Mars Facts:
    mfact_url = "https://space-facts.com/mars/"

    # Mars Hemisphere Data:
    mhemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    hemisphere_image_urls = []

    
    
    # Initialize browser:
    #####################################################################################
    browser = init_browser()



    # Nasa Data Scraping:
    #####################################################################################
    browser.visit(nasa_url)

    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    nasa_dict["news_title"] = soup.find("div", class_="content_title").get_text()
    nasa_dict["news_p"] = soup.find("div", class_="article_teaser_body").get_text()



    # JPL Data Scraping:
    #####################################################################################
    browser.visit(jpl_url)

    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    results = soup.find("div", class_="carousel_items").find("article")["style"] 
    ending_url_list = results.split("'")

    base_url = "https://www.jpl.nasa.gov"

    featured_image_url = base_url + ending_url_list[1]
    


    # Mars Facts Data Scraping:
    #####################################################################################
    # Use PANDAS to scrape:
    tableslist = pd.read_html(mfact_url)

    table_df = tableslist[1]

    # if necessary: named the columns:
    final_table_df = table_df.rename(columns={
        0: 'Fact',
        1: 'Value'
    })

    # Use PANDAS to convert data to HTML table string: 
    html_table = final_table_df.to_html(index=False, justify="left")
    html_table.replace('\n', '')



    # Mars Hemisphere Data Scraping:
    #####################################################################################
    browser.visit(mhemi_url)

    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    linkresults = soup.find_all('a', class_='itemLink')

    #################################
    # Pull out the image partial links and their titles:
    #################################
    for x in linkresults:
        
        raw_title = x.find('img')
        link = x['href']
        
        if raw_title and link:
            title = raw_title['alt']
            final_title = title.replace(" Enhanced thumbnail", "")
            
            # click on the link to retrieve the the full res image url:
            browser.click_link_by_partial_text(final_title)
            time.sleep(1)
            html2 = browser.html
            soup2 = BeautifulSoup(html2, "html.parser")
            
            
            newresults = soup2.find_all('a')
            # loop through the a tags to find the right link:
            for hemi in newresults:
                if hemi.text == 'Sample':
                    newlink = hemi['href']
                    
    
            #Use a Python dictionary to store the data using the keys img_url and title:
            temp_dict = {
                "title": final_title,
                "img_url": newlink
            }
    
            # Append the dictionary with the image url string and the hemisphere title to a list
            hemisphere_image_urls.append(temp_dict)
            
            
            
            
            
    # As of 08/18/2019 the original link stopped working (error 404). Here is my workaround:       
    if not hemisphere_image_urls:
        temp_dict = {
            "title": "Cerberus Hemisphere",
            "img_url": "https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)

        temp_dict = {
            "title": "Valles Marineris Hemisphere",
            "img_url": "https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)

        temp_dict = {
            "title": "Syrtis Major Hemisphere",
            "img_url": "https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)

        temp_dict = {
            "title": "Schiaparelli Hemisphere",
            "img_url": "https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg"
        }
        hemisphere_image_urls.append(temp_dict)


        
        
        
    # Quite the browser after scraping
    #####################################################################################
    browser.quit()

    
    # Define the output:
    #####################################################################################
    mars_dict = {
        "nasa" : nasa_dict,
        "jpl" : featured_image_url,
        "mfacts" : html_table,
        "mhemi" : hemisphere_image_urls
    }
    

    # Return results
    #####################################################################################
    return mars_dict

    



    
    
#################################
# VALIDATE:
validate_complete_scrape = scrape()
print(validate_complete_scrape)


{'nasa': {'news_title': "Robotic Toolkit Added to NASA's Mars 2020 Rover", 'news_p': "The bit carousel, which lies at the heart of the rover's Sample Caching System, is now aboard NASA's newest rover. "}, 'jpl': 'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA16682-1920x1200.jpg', 'mfacts': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: left;">\n      <th>Fact</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td

In [70]:
##################################################################################
# TO VALIDATE THE FLASK APP:
##################################################################################
from flask import Flask, render_template, redirect
from flask_pymongo import PyMongo

app = Flask(__name__)

app.config["MONGO_URI"] = "mongodb://localhost:27017/mars_db"

mongo = PyMongo(app)

mars_table = mongo.db.mars_table

mars_data = scrape()

mars_table.update({}, mars_data, upsert=True)



{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}