In [1]:
# Setup 
#-----------------------------
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Set up Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [C:\Users\Asus\.wdm\drivers\chromedriver\win32\99.0.4844.51\chromedriver.exe] found in cache


In [3]:
# Create a empty dictionary to store the data
#----------------------------------------------
scraped_data = {}

In [7]:
# Visit the Mars News Site
#---------------------------------------------
Mars_News_Site = ('https://redplanetscience.com/')
browser.visit(Mars_News_Site)


# Create HTML object 
#---------------------
html = browser.html

# Parse HTML with BeautifulSoup 
#---------------------------------------------
bsoup = BeautifulSoup(html, 'html.parser')

# Get the first list of headlines containing the latest news title and paragraph text 
#------------------------------------------------------------------------------------------
first_div = bsoup.select_one('div.list_text')
first_div


<div class="list_text">
<div class="list_date">March 19, 2022</div>
<div class="content_title">Join NASA for the Launch of the Mars 2020 Perseverance Rover</div>
<div class="article_teaser_body">No matter where you live, choose from a menu of activities to join NASA as we "Countdown to Mars" and launch the Perseverance rover to the Red Planet.</div>
</div>

In [8]:
# Save the news title under the <div> tag with a class of 'content_title' 
#--------------------------------------------------------------------------
news_title = first_div.find('div', class_='content_title').get_text()
print(news_title)

# Save the paragraph text under the <div> tag with a class of 'article_teaser_body' 
#-----------------------------------------------------------------------------------
news_p = first_div.find('div', class_='article_teaser_body').get_text()
print(news_p)

Join NASA for the Launch of the Mars 2020 Perseverance Rover
No matter where you live, choose from a menu of activities to join NASA as we "Countdown to Mars" and launch the Perseverance rover to the Red Planet.


In [10]:
# Create a dictionary with the scraped data
#---------------------------------------------------------
Nasa_News = {"Title":news_title, "Paragraph": news_p}
Nasa_News

{'Title': 'Join NASA for the Launch of the Mars 2020 Perseverance Rover',
 'Paragraph': 'No matter where you live, choose from a menu of activities to join NASA as we "Countdown to Mars" and launch the Perseverance rover to the Red Planet.'}

In [12]:
# Save the scraped data to an entry of the dictionary
#----------------------------------------------------
scraped_data["Title"] = news_title
scraped_data["Paragraph"] = news_p

In [19]:
# Visit the JPL Featured Space Image website 
#-----------------------------------------------
JPL_image = 'https://spaceimages-mars.com'
browser.visit(JPL_image)


In [20]:
# Featured image is in the div class="carousel_container"
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()


In [21]:
html = browser.html

# Parse HTML with BeautifulSoup 
#-------------------------------------------
soup = BeautifulSoup(html, 'html.parser')

In [22]:
# find the relative image url
img_url_rel = soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars3.jpg'

In [29]:
full_address = "https://spaceimages-mars.com/"+ img_url_rel
full_address

'https://spaceimages-mars.com/image/featured/mars3.jpg'

In [32]:
# URL
#--------------------------------------------------------
url_mars_facts = "https://galaxyfacts-mars.com/"

In [33]:
# Pandas to scrape any table data from a page
#-----------------------------------------------
tables = pd.read_html(url_mars_facts)


# Tables are available
#-------------------------
len(tables)

2

In [34]:
tables[0]

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [37]:
# Select the intended table
#-------------------------------
table_facts = tables[0]
table_facts

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [38]:
# Convert the data to a HTML table string
#---------------------------------------------------
scraped_data["TableHTML"] = table_facts.to_html()

In [39]:
# URL
#--------------------------------------------------------------------------------------------------------------
url_mars_hemispheres = "https://marshemispheres.com/"

In [40]:
# Use the browser to visit the url
#--------------------------------------
browser.visit(url_mars_hemispheres)

In [41]:
# Splinter capture a page's underlying html and use pass it to BeautifulSoup to scrape the content
#-------------------------------------------------------------------------------------------------------------------
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [42]:
# By analyzing the page we can find that the images are in a div class='description'
#--------------------------------------------------------------------------------------
results= soup.find_all('div',class_='description')

In [43]:
# list with the name of the hemispheres
#------------------------------------------------
list_hemispheres = []
for i in range(len(results)):
    list_hemispheres.append(results[i].a.h3.text)

list_hemispheres

['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

In [44]:
hemisphere_image_urls = []

# Create a list of dictionaries for each hemisphere
for i in range(len(list_hemispheres)):

    # Use the browser to visit the url
    browser.click_link_by_partial_text(list_hemispheres[i])
    
    # Splinter can capture a page's underlying html and use pass it to BeautifulSoup to allow us to scrape the content
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # By analyzing the page we can find that the images link are in a li
    results_new = soup.find_all('li')


    # Append the dictionary with the image url string and the hemisphere title to a list.
    for n in range(len(results_new)):
        if results_new[n].a.text == 'Sample':
            hemisphere_image_urls.append({"title": list_hemispheres[i].replace("Hemisphere Enhanced", 'Hemisphere'), "img_url": results_new[0].a['href']})
            
    # Use the browser to visit the url
    browser.visit(url_mars_hemispheres)



In [45]:
# Create a dictionary with the scraped data
DSD = {"ListImages": hemisphere_image_urls}
DSD

{'ListImages': [{'title': 'Cerberus Hemisphere', 'img_url': 'images/full.jpg'},
  {'title': 'Schiaparelli Hemisphere',
   'img_url': 'images/schiaparelli_enhanced-full.jpg'},
  {'title': 'Syrtis Major Hemisphere',
   'img_url': 'images/syrtis_major_enhanced-full.jpg'},
  {'title': 'Valles Marineris Hemisphere',
   'img_url': 'images/valles_marineris_enhanced-full.jpg'}]}

In [46]:
# Save the scraped data to an entry of the dictionary
scraped_data["ListImages"] = hemisphere_image_urls

In [48]:
# When you’ve finished testing, close your browser using browser.quit:
browser.quit()

In [49]:
# The scraped data is available on the dictionary form
scraped_data

{'Title': 'Join NASA for the Launch of the Mars 2020 Perseverance Rover',
 'Paragraph': 'No matter where you live, choose from a menu of activities to join NASA as we "Countdown to Mars" and launch the Perseverance rover to the Red Planet.',
 'TableHTML': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Mars - Earth Comparison</td>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Diameter:</td>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Distance from Sun:</td>\n      <td>227,943,824 km</td

In [50]:
import pymongo

# Flask_pymongo to set up mongo connection
#------------------------------------------------------------
conn =  "mongodb://localhost:27017/scrape_mars"
client =  pymongo.MongoClient(conn)

# Collection and drop existing data
#-----------------------------------
db = client.scrape_mars
db.mars_data.drop()

db.mars_data.insert_many([scraped_data])


# query_result = list(db.mars_data.find())
# query_result = (db.mars_data.find())
#----------------------------------------------
query_result = (db.mars_data.find_one())
query_result

{'_id': ObjectId('623601380b519a1136a93252'),
 'Title': 'Join NASA for the Launch of the Mars 2020 Perseverance Rover',
 'Paragraph': 'No matter where you live, choose from a menu of activities to join NASA as we "Countdown to Mars" and launch the Perseverance rover to the Red Planet.',
 'TableHTML': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Mars - Earth Comparison</td>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Diameter:</td>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Distanc

In [51]:
type(query_result)

dict

In [52]:
len(query_result)

5

In [53]:
query_result['Title']

'Join NASA for the Launch of the Mars 2020 Perseverance Rover'