In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
!which chromedriver

/usr/local/bin/chromedriver


In [3]:
#set path to chromedriver
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)

## Scrape latest news title and paragraph

In [4]:
url = "https://redplanetscience.com/"
browser.visit(url)

In [5]:
html = browser.html
soup = BeautifulSoup(html,'html.parser')

In [6]:
title = soup.find_all('div', class_='content_title')[0].get_text()

In [7]:
title

'NASA Perseverance Mars Rover Scientists Train in the Nevada Desert'

In [8]:
paragraph = soup.find_all('div', class_='article_teaser_body')[0].get_text()

In [9]:
paragraph

"Team members searched for signs of ancient microscopic life there, just as NASA's latest rover will on the Red Planet next year."

## Scrape Mars image

In [10]:
img_url = "https://spaceimages-mars.com/"
browser.visit(img_url)

In [11]:
browser.links.find_by_partial_text('FULL IMAGE')

<splinter.element_list.ElementList at 0x7fb0a87cd070>

In [12]:
html = browser.html
soup = BeautifulSoup(html,'html.parser')

img_search = soup.find_all('img', class_='headerimage fade-in')

In [13]:
img_search[0]

<img class="headerimage fade-in" src="image/featured/mars1.jpg"/>

In [14]:
for i in img_search:
    
    img_path = i['src']
    print(img_path)
    

image/featured/mars1.jpg


In [15]:
img_path

'image/featured/mars1.jpg'

In [16]:
featured_image_url = f'https://www.spaceimages-mars.com/{img_path}'

In [17]:
featured_image_url

'https://www.spaceimages-mars.com/image/featured/mars1.jpg'

## Scrape Mars facts table

In [18]:
table_url = "https://galaxyfacts-mars.com/"

In [19]:
browser.visit(table_url)

In [20]:
table = pd.read_html(table_url)

In [21]:
table

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [22]:
table_df = table[1]

In [23]:
table_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 ( Phobos & Deimos )
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [24]:
table_final = table_df.to_html(header=False,index=False)
print(table_final)

<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Equatorial Diameter:</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <td>Polar Diameter:</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <td>Mass:</td>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <td>Moons:</td>
      <td>2 ( Phobos &amp; Deimos )</td>
    </tr>
    <tr>
      <td>Orbit Distance:</td>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <td>Orbit Period:</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <td>Surface Temperature:</td>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <td>First Record:</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <td>Recorded By:</td>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


## Scrape Mars hemispheres

In [25]:
hemisphere_url = "https://marshemispheres.com/"

In [26]:
browser.visit(hemisphere_url)

In [27]:
html = browser.html
soup = BeautifulSoup(html,'html.parser')

ht = soup.find_all('div',class_='description')
ht[3]

<div class="description">
<a class="itemLink product-item" href="valles.html">
<h3>Valles Marineris Hemisphere Enhanced</h3>
</a>
<span class="subtitle" style="float:left">image/tiff 27 MB</span><span class="pubDate" style="float:right"></span><br/>
<p>Mosaic of the Valles Marineris hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. The distance is 2500 kilometers from the surface of…</p>
</div>

In [28]:
hemisphere_titles = []

for i in ht:
    headers = i.find('h3')
    hemisphere_titles.append(headers.text)

In [29]:
hemisphere_titles

['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

In [32]:
html = browser.html
soup = BeautifulSoup(html,'html.parser')

hi = soup.find_all('div',class_='item')

In [33]:
hi

[<div class="item">
 <a class="itemLink product-item" href="cerberus.html"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/></a>
 <div class="description">
 <a class="itemLink product-item" href="cerberus.html">
 <h3>Cerberus Hemisphere Enhanced</h3>
 </a>
 <span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/>
 <p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p>
 </div>
 <!-- end description -->
 </div>,
 <div class="item">
 <a class="itemLink product-item" href="schiaparelli.html"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png"/></a>
 <div class="description">
 <a class="ite

In [34]:
hemisphere_img_urls = []

for i in hi:
    links = i.a['href']
    hemisphere_img_urls.append(f'https://marshemispheres.com/{links}')

In [35]:
hemisphere_img_urls

['https://marshemispheres.com/cerberus.html',
 'https://marshemispheres.com/schiaparelli.html',
 'https://marshemispheres.com/syrtis.html',
 'https://marshemispheres.com/valles.html']

In [43]:
hemisphere_enhanced_img_urls = []

for j in hemisphere_img_urls:
    
    browser.visit(j)
    
    html = browser.html
    soup = BeautifulSoup(html,'html.parser')
    
    enhanced_img = soup.find_all('img',class_='wide-image')
    enhanced_img_url = enhanced_img[0]['src']
    
    hemisphere_enhanced_img_urls.append(f'https://marshemispheres.com/{enhanced_img_url}')
    

In [44]:
hemisphere_enhanced_img_urls

['https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg']

In [45]:
hemisphere_zip = zip(hemisphere_titles,hemisphere_enhanced_img_urls)

In [46]:
hemisphere_final = []

In [47]:
for a, b in hemisphere_zip:
    hemisphere_final_dict = {}
    
    hemisphere_final_dict['title'] = a
    
    hemisphere_final_dict['img_url'] = b
    
    hemisphere_final.append(hemisphere_final_dict)

In [48]:
hemisphere_final

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

## Store all scraped data into dictionary

In [39]:
full_data = {
        'latest_news_title': title,
        'latest_news_paragraph': paragraph,
        'mars_image': featured_image_url,
        'mars_facts': table_final,
        'hemispheres': hemisphere_final
    }

In [40]:
full_data

{'latest_news_title': 'Valles Marineris Hemisphere Enhanced',
 'latest_news_paragraph': 'The team has learned to meet new challenges as they work remotely on the Mars mission.',
 'mars_image': 'https://www.spaceimages-mars.com/image/featured/mars3.jpg',
 'mars_facts': '<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 ( Phobos &amp; Deimos )</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n  