# Beautiful Soup + MongoDB

### Import dependencies

In [14]:
from bs4 import BeautifulSoup
import requests
import pymongo
from splinter import Browser

In [15]:
#pip install pymongo#3

### Initialize PyMongo to work with MongoDB

In [16]:
# Connection string for a MongoDB server on this machine (port 27017)
conn = 'mongodb://localhost:27017'
# Client handle that the later cells use to reach the database
client = pymongo.MongoClient(conn)

### Define database and collection

In [17]:
# Select the `mars_db` database and its `news` collection; the news-scraping
# cell writes its documents through `collection`.
db = client.mars_db
collection = db.news

In [18]:
# Start a visible (non-headless) Chrome session through Splinter.
# NOTE(review): the driver path is relative, so chromedriver.exe is presumably
# expected in the working directory — confirm.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

### Scrape a webpage and create a BeautifulSoup object from the results


In [19]:
# Load the NASA Mars news listing in the shared browser session
url = 'https://mars.nasa.gov/news'
browser.visit(url)

### Examine the results, then input results containing desired info into MongoDB collection

In [20]:
# Scrape the news listing currently open in `browser` and store each article
# (date, title, teaser) as one document in `collection`.

# Start from an empty collection so re-running this cell does not duplicate articles
collection.drop()

for x in range(1, 2):
    # Splinter exposes the rendered page's HTML, which BeautifulSoup then parses
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Every news item on the listing lives in one of these containers
    results = soup.find_all('div', class_='image_and_description_container')

    for result in results:
        date_tag = result.find('div', class_='list_date')
        title_tag = result.find('div', class_='content_title')
        article_tag = result.find('div', class_='article_teaser_body')

        # find() returns None for a missing element; skip incomplete items
        # explicitly instead of relying on a blanket exception handler
        if date_tag is None or title_tag is None or article_tag is None:
            print('Skipping news item with a missing date, title or teaser')
            continue

        date = date_tag.text
        title = title_tag.text
        article = article_tag.text

        # Store only items that have both a title and teaser text
        if not (title and article):
            continue

        print('------------- Page -', x)
        print('news_date = ', date)
        print('news_title =', title)
        print('news_p =',article)

        # Dictionary to be inserted as a MongoDB document
        post = {
            'news_date': date,
            'news_title': title,
            'news_p': article,
        }
        try:
            collection.insert_one(post)
        except pymongo.errors.PyMongoError as e:
            # Report the failed write and carry on with the remaining items
            print(e)

    # Try to move on once the page has been scraped.
    # NOTE(review): assumes the link with href '#' is the pager control —
    # confirm before widening the range above.
    try:
        browser.click_link_by_href('#')
    except Exception as e:
        # Assigning to the loop variable cannot end a for loop; break does
        print(e)
        break


------------- Page - 1
news_date =  January 13, 2020
news_title = NASA's Mars 2020 Rover Closer to Getting Its Name
news_p = 155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July.
------------- Page - 1
news_date =  December 27, 2019
news_title = Media Get a Close-Up of NASA's Mars 2020 Rover
news_p = The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch.
------------- Page - 1
news_date =  December 23, 2019
news_title = Space History Is Made in This NASA Robot Factory
news_p = From rockets to rovers, JPL's Spacecraft Assembly Facility has been at the center of robotic spaceflight. Here's a closer look at what makes it so special.
------------- Page - 1
news_date =  December 18, 2019
news_title = NASA's Mars 2020 Rover Completes Its First Drive
news_p = I

------------- Page - 1
news_date =  August 22, 2019
news_title = NASA-JPL Names 'Rolling Stones Rock' on Mars
news_p = NASA's Mars InSight mission honored one of the biggest bands of all time at Pasadena concert.
------------- Page - 1
news_date =  August 15, 2019
news_title = Robotic Toolkit Added to NASA's Mars 2020 Rover
news_p = The bit carousel, which lies at the heart of the rover's Sample Caching System, is now aboard NASA's newest rover. 
------------- Page - 1
news_date =  August 13, 2019
news_title = Space Samples Link NASA's Apollo 11 and Mars 2020
news_p = While separated by half a century, NASA's Apollo 11 and Mars 2020 missions share the same historic goal: returning samples to Earth.
------------- Page - 1
news_date =  August  9, 2019
news_title = Small Satellite Mission of the Year
news_p = The first interplanetary CubeSats were recognized by the engineering community with the 2019 Small Satellite Mission of the Year award.
------------- Page - 1
news_date =  August  5,



## JPL Mars Space Images - Featured Image

In [21]:
# Re-point the shared `collection` handle at `figure`, which receives the
# featured-image URL.
db = client.mars_db
collection = db.figure

In [22]:
# Open the JPL Mars image gallery, then follow the 'FULL IMAGE' and
# 'more info' links in turn; the page left open is what gets parsed next.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
browser.click_link_by_partial_text('FULL IMAGE')
browser.click_link_by_partial_text('more info')
     




In [23]:
# Take the HTML of the page currently open in the browser and parse it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')


In [24]:
# Collect every <figure class="lede"> element and print the matches for inspection
results = soup.find_all('figure', class_='lede')
print(results)

[<figure class="lede">
<a href="/spaceimages/images/largesize/PIA16694_hires.jpg"><img alt="NASA's Kepler mission compares artist's concepts of the planets in the Kepler-37 system to the moon and planets in the solar system. The smallest planet, Kepler-37b, is slightly larger than our moon." class="main_image" src="/spaceimages/images/largesize/PIA16694_hires.jpg" title="NASA's Kepler mission compares artist's concepts of the planets in the Kepler-37 system to the moon and planets in the solar system. The smallest planet, Kepler-37b, is slightly larger than our moon."/></a>
</figure>]


In [25]:
# Build the absolute URL of each featured image found in `results` and store
# it as a document in `collection`.
from urllib.parse import urljoin

# Base against which the image paths found in the page are resolved
jpl_base_url = 'https://www.jpl.nasa.gov'

for result in results:
    link = result.find('a')
    href = link.get('href') if link is not None else None
    if not href:
        # No <a href=...> inside this figure: nothing to store for it
        print('Skipping figure without an image link')
        continue

    # urljoin handles site-relative paths and leaves absolute URLs intact
    featured_image_url = urljoin(jpl_base_url, href)
    print('featured_image_url = ', featured_image_url)
    post = {
        'featured_image_url': featured_image_url
    }
    try:
        collection.insert_one(post)
    except pymongo.errors.PyMongoError as e:
        # Report the failed write and carry on with the remaining figures
        print(e)

featured_image_url =  https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16694_hires.jpg


## Mars Weather

In [26]:
# Re-point the shared `collection` handle at `weather`, which receives the
# Mars weather report.
db = client.mars_db
collection = db.weather

In [27]:
# Load the Mars weather report Twitter page in the shared browser session
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)


In [28]:
# Take the HTML of the page currently open in the browser and parse it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [29]:
# Collect the tweet text containers and print the first one for inspection.
# Indexing [0] raises IndexError when the page yields no matching container.
results = soup.find_all('div', class_='js-tweet-text-container')
print(results[0])

<div class="js-tweet-text-container">
<p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" data-aria-label-part="0" lang="en">InSight sol 404 (2020-01-15) low -99.5ºC (-147.1ºF) high -16.8ºC (1.7ºF)
winds from the SSW at 5.2 m/s (11.7 mph) gusting to 19.8 m/s (44.2 mph)
pressure at 6.40 hPa<a class="twitter-timeline-link u-hidden" data-pre-embedded="true" dir="ltr" href="https://t.co/fX6vAdxuSZ">pic.twitter.com/fX6vAdxuSZ</a></p>
</div>


In [30]:
# Extract the weather report from the first tweet container on the page and
# store it in `collection`. Only the first container in `results` is used.
tweet = results[0] if results else None
text_tag = tweet.find('p', class_='TweetTextSize') if tweet is not None else None

if text_tag is None:
    print('No tweet text found on the page; nothing stored')
else:
    fulltext = text_tag.text

    # The tweet may carry a link (e.g. pic.twitter.com/...) whose text is
    # not part of the report. Tweets without a link are stored unchanged
    # rather than failing on a missing <a> element.
    link_tag = tweet.find('a')
    atext = link_tag.text if link_tag is not None else ''
    weather = fulltext.replace(atext, '') if atext else fulltext

    print('mars_weather = ', weather)
    post = {
        'mars_weather': weather
    }
    try:
        collection.insert_one(post)
    except pymongo.errors.PyMongoError as e:
        # Report the failed write instead of aborting the notebook
        print(e)

mars_weather =  InSight sol 404 (2020-01-15) low -99.5ºC (-147.1ºF) high -16.8ºC (1.7ºF)
winds from the SSW at 5.2 m/s (11.7 mph) gusting to 19.8 m/s (44.2 mph)
pressure at 6.40 hPa


## Mars Facts

In [31]:
import pandas as pd

In [32]:
# Page whose HTML tables are read for the Mars facts
url = 'https://space-facts.com/mars/'

In [33]:
# read_html() fetches the URL and returns a list of DataFrames, one per table it finds
tables = pd.read_html(url)
len(tables)

3

In [34]:
# Sanity check: show the container type and the type of its first element
print(type(tables))
print(type(tables[0]))

<class 'list'>
<class 'pandas.core.frame.DataFrame'>


In [35]:
# Work with the first table on the page and preview its first rows
df = tables[0]
df.head()

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"


In [36]:
# Render the DataFrame as an HTML <table> string without the index column
html_table = df.to_html(index=False)
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

In [37]:
# Display the table markup with newlines removed. str.replace() returns a new
# string and the result is not assigned, so `html_table` itself is unchanged.
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th>0</th>      <th>1</th>    </tr>  </thead>  <tbody>    <tr>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>Surface Temperature:</td>      <td>-87 to -5 °C</td>    </tr>    <tr>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [38]:
# Label both columns in a single rename, then write the table to table.html
# (index omitted) and preview up to ten rows.
df.rename(columns={0: 'Mars', 1: 'Value'}, inplace=True)

df.to_html('table.html', index=False)
df.head(10)

Unnamed: 0,Mars,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [39]:
# Store the Mars facts table in the `facts` collection, one document per row
db = client.mars_db
collection = db.facts

# reset_index() without inplace returns a copy that carries the row number as
# an `index` column. `df` itself stays untouched, so re-running this cell no
# longer piles extra index columns onto it (which eventually raised ValueError).
records = df.reset_index().to_dict('records')
collection.insert_many(records)

<pymongo.results.InsertManyResult at 0x1cc269c3308>

## Mars Hemispheres

In [40]:
# Re-point the shared `collection` handle at `hemisphere`, which receives
# one document per hemisphere image.
db = client.mars_db
collection = db.hemisphere

In [41]:
# Load the USGS Astrogeology search results for enhanced Mars hemisphere images
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
  

In [42]:
# Take the HTML of the page currently open in the browser and parse it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')


In [43]:
# Populate the list and get title
hemi_title = []

results = soup.find_all('h3')
#print(results)

for hemi in results:
    hemi_title.append(hemi.text)
print(results[0])
hemi_title

<h3>Cerberus Hemisphere Enhanced</h3>


['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

In [44]:
# Initialize hemisphere_image_urls list
hemi_image_urls = []
browser.visit(url)

# Loop through the hemisphere links to obtain the images
for hemi in hemi_title:
    # Initialize a dictionary for the hemisphere
    try:
        hemi_dict = {}

        # Click on the link with the corresponding text
        browser.click_link_by_partial_text(hemi)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        results = soup.find_all('div', class_='content')
        #print(results)    
        browser.visit(url)
        for result in results:
            # Identify and return URL
            dts = result.find_all('dt')[1]
            hemi_image = result.find_all('dd')[1].find('a')['href']
            #print(hemi_image)
    
            # Run only if title and URL content 
            if (title and url):
                # Print results
                print(f'-----------')
                print(f'title: ', hemi)
                print(f'img_url: ', hemi_image)
                post = {
                        'title': hemi,
                        'img_url': hemi_image
                }
                # Insert result into collection
                collection.insert_one(post)
    except Exception as e:
      print(e)        




-----------
title:  Cerberus Hemisphere Enhanced
img_url:  http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif
-----------
title:  Schiaparelli Hemisphere Enhanced
img_url:  http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif
-----------
title:  Syrtis Major Hemisphere Enhanced
img_url:  http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif
-----------
title:  Valles Marineris Hemisphere Enhanced
img_url:  http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif
