In [1]:
# import dependencies
from io import StringIO

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [2]:
# path to the chromedriver binary, handed to splinter as a keyword argument
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}

# open a chrome browser through splinter with headless mode switched off
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/'

# retrieve page with the requests module; the timeout keeps a stalled
# connection from blocking the kernel indefinitely
response = requests.get(url, timeout=30)

# stop here on an HTTP error status instead of parsing an error page
response.raise_for_status()

# create bs object; parse with html
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# gather every div whose class list includes "slide" into a result set
results = soup.find_all(name='div', attrs={'class': 'slide'})

In [5]:
# list of dictionaries, one per article, with keys 'news_title' and 'news_p'
mars_news = []

# loop through all slides, store relevant content
for result in results:

    # locate the title and description containers inside this slide
    title_tag = result.find('div', class_='content_title')
    desc_tag = result.find('div', class_='rollover_description_inner')

    # find() returns None when nothing matches; skip an incomplete slide
    # rather than letting .text raise AttributeError and abort the loop
    if title_tag is None or desc_tag is None:
        continue

    # pull the text and remove leading/trailing spaces
    title = title_tag.text.strip()
    desc = desc_tag.text.strip()

    # print article data
    print('-----------------')
    print(title)
    print(desc)

    # store data in dictionary and append it to the list
    mars_news.append({
        'news_title': title,
        'news_p': desc
    })

-----------------
NASA Prepares for Moon and Mars With New Addition to Its Deep Space Network
Robotic spacecraft will be able to communicate with the dish using radio waves and lasers.
-----------------
NASA Administrator Statement on Moon to Mars Initiative, FY 2021 Budget
Jim Bridenstine addresses NASA's ambitious plans for the coming years, including Mars Sample Return.
-----------------
NASA's Mars 2020 Rover Closer to Getting Its Name
155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July.
-----------------
NASA Invites Students to Name Mars 2020 Rover
Through Nov. 1, K-12 students in the U.S. are encouraged to enter an essay contest to name NASA's next Mars rover.
-----------------
NASA's Curiosity Mars Rover Finds a Clay Cache
The rover recently drilled two samples, and both showed the highest levels of clay ever found during the mission.
-----------------
Why This M

In [6]:
# site root, kept separate so it can be prefixed onto relative links
base_url = 'https://www.jpl.nasa.gov'

# search page listing images in the Mars category
img_search_url = base_url + '/spaceimages/?search=&category=Mars'

# point the splinter browser at the search page
browser.visit(img_search_url)

In [7]:
# grab the html of the page currently loaded in the browser
html = browser.html

# parse it with the built-in html parser
soup = BeautifulSoup(markup=html, features='html.parser')

In [8]:
# find anchor with image link
results = soup.find_all('a', class_='button fancybox')

# an empty result set would make results[0] raise a bare IndexError;
# fail with a message that names what was missing instead
if not results:
    raise ValueError("no anchor with class 'button fancybox' found in the parsed page")

# find href from result (first matching anchor)
img_href = results[0]['data-fancybox-href']

# build url from href and base url
featured_img_url = f'{base_url}{img_href}'

In [9]:
# url of twitter page to scrape
twitter = 'https://twitter.com/marswxreport?lang=en'

# retrieve page with the requests module; the timeout keeps a stalled
# connection from blocking the kernel indefinitely
response = requests.get(twitter, timeout=30)

# stop here on an HTTP error status instead of parsing an error page
response.raise_for_status()

# create bs object; parse with html
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# find all div with class js-tweet-text-container, store them in result set
results = soup.find_all('div', class_ = 'js-tweet-text-container')

# an empty result set would make results[0] raise a bare IndexError;
# fail with a message that names what was missing instead
if not results:
    raise ValueError("no div with class 'js-tweet-text-container' found in the parsed page")

# get the top result and look up its paragraph element
tweet_paragraph = results[0].find('p')

# find() returns None when the container holds no paragraph
if tweet_paragraph is None:
    raise ValueError("first 'js-tweet-text-container' div contains no <p> element")

# pull the text inside the paragraph element
mars_weather = tweet_paragraph.text

In [11]:
# url of facts page to scrape
space_facts = 'https://space-facts.com/mars/'

# retrieve page with the requests module; the timeout keeps a stalled
# connection from blocking the kernel indefinitely
response = requests.get(space_facts, timeout=30)

# stop here on an HTTP error status instead of parsing an error page
response.raise_for_status()

# create bs object; parse with html
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
# find all tables, store them in result set
results = soup.find_all('table')

# take the first table element on the page
facts_table = results[0]

# read the table markup into pandas; the html is wrapped in StringIO because
# passing a literal html string to read_html is deprecated in recent pandas
table = pd.read_html(StringIO(str(facts_table)))

# read_html returns a list, use index to store the table as a df
mars_facts = table[0]

In [13]:
# render the facts dataframe as an html table with header=False, then print it
facts_html = mars_facts.to_html(header=False)
print(facts_html)

<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>0</th>
      <td>Equatorial Diameter:</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Polar Diameter:</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Mass:</td>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Moons:</td>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Orbit Distance:</td>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>5</th>
      <td>Orbit Period:</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Surface Temperature:</td>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>7</th>
      <td>First Record:</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>8</th>
      <td>Recorded By:</td>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>
