In [None]:
# Dependencies
import os
import time
from urllib.parse import urljoin

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

# Scrape Source 1 - NASA Mars News

In [None]:
# The news articles are generated dynamically, so a real Chrome session is
# needed; webdriver_manager downloads (or reuses) a matching chromedriver
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Define the url for NASA Mars News and use the browser to visit it
url = 'https://redplanetscience.com/'
browser.visit(url)

# The news items are generated dynamically, so give the page a few seconds to
# render the first one before taking a snapshot of the page source
browser.is_element_present_by_css('div.list_text', wait_time=5)

# Get the html object
html = browser.html

# Parse the html object
soup = bs(html, 'html.parser')

# Get all the relevant elements for news
results = soup.find_all('div', class_='list_text')

In [None]:
# Inspect the scraped elements; an empty list here means no 'div.list_text'
# nodes were present in the parsed page source
results

In [None]:
# Loop through the returned results and pull the title and paragraph of each
# news item. Every complete item is kept in news_posts so nothing is lost when
# the loop moves on to the next result.
news_posts = []
for result in results:
    title_tag = result.find('div', class_='content_title')
    paragraph_tag = result.find('div', class_='article_teaser_body')

    # find() returns None when the element is absent; skip such items
    # explicitly instead of hiding every possible error behind a broad except
    if title_tag is None or paragraph_tag is None:
        print('Skipping a news item without a title or teaser paragraph')
        continue

    # Identify and return title and paragraph of news post
    title = title_tag.text
    paragraph = paragraph_tag.text

    # Run only if title and paragraph are available
    if title and paragraph:
        # Print results
        print('-------------')
        print(title)
        print(paragraph)

        # Dictionary to be inserted as a MongoDB document
        post = {
            'title': title,
            'paragraph': paragraph,
        }
        news_posts.append(post)

# Scrape Source 2 - JPL Mars Space Images - Featured Image

In [None]:
# Visit the JPL Mars Space Images site with the shared browser session
url = 'https://spaceimages-mars.com/'
browser.visit(url)

# Snapshot the page source as served to the browser
html = browser.html

# Build the BeautifulSoup tree from that snapshot
soup = bs(html, 'html.parser')

# Find the first div whose class is floating_text_area
results = soup.find('div', attrs={'class': 'floating_text_area'})

In [None]:
# Grab the first anchor inside the block that carries an href attribute
# (relimageurl holds the tag itself; its 'href' value is the relative link)
relimageurl = results.find('a', attrs={'href': True})
relimageurl['href']

In [None]:
# Resolve the href against the site url. urljoin gives the same result as plain
# concatenation for a relative href, and also stays correct if the href is
# root-relative ('/image/...') or already absolute.
featured_image_url = urljoin(url, relimageurl['href'])
featured_image_url

# Scrape Source 3 - Mars Facts

In [None]:
# Define the source to scrape
url = 'https://galaxyfacts-mars.com/'

In [None]:
# Use pandas to read the tables found at the url. read_html returns a list of
# DataFrames, so take the first entry to get the table used below.
facts_list = pd.read_html(url)
facts_df = facts_list[0]
facts_df

In [None]:
# Set the first row as column headers.
# Always start from the table stored in facts_list so that re-running this cell
# produces the same frame instead of dropping one more data row on every run.
facts_raw = facts_list[0]
facts_df = facts_raw.iloc[1:].copy()
facts_df.columns = facts_raw.iloc[0]
facts_df

In [None]:
# Use the 'Mars - Earth Comparison' column as the index. set_index returns a
# new frame, so the name is rebound; running this cell a second time on its own
# raises KeyError because the column has already become the index.
facts_df = facts_df.set_index('Mars - Earth Comparison')
facts_df

In [None]:
# Render the facts table as an HTML string and show it
facts_html = facts_df.to_html()
facts_html

# Scrape Source 4 - Mars Hemispheres

In [None]:
# Container for the scraped hemisphere records (filled in further down)
image_list = []

# Hemisphere detail pages to scrape, one entry per hemisphere
urllist = [
    'https://marshemispheres.com/cerberus.html',
    'https://marshemispheres.com/schiaparelli.html',
    'https://marshemispheres.com/syrtis.html',
    'https://marshemispheres.com/valles.html',
]

In [None]:
def getImageInfo(url, timeout=10):
    """Scrape the title and image link from one hemisphere page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict
        ``{"title": ..., "img_url": ...}`` where ``title`` is the text of the
        ``h2.title`` element and ``img_url`` is the link resolved to an
        absolute URL.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page lacks the ``div.cover`` block, its title, or a link.
    """
    # Retrieve page with the requests module; fail loudly on HTTP errors rather
    # than parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Create bs object; parse with 'html.parser'
    soup = bs(response.text, 'html.parser')

    # Find the title and relative url of the image
    imageinfo = soup.find('div', class_='cover')
    if imageinfo is None:
        raise ValueError(f"No 'div.cover' block found at {url}")

    title_tag = imageinfo.find('h2', class_='title')
    # NOTE(review): this takes the first link inside the cover block - confirm
    # it is the image link and not some other anchor on the page
    relimageurl = imageinfo.find('a', href=True)
    if title_tag is None or relimageurl is None:
        raise ValueError(f"Title or link missing in 'div.cover' at {url}")

    # Resolve the href against the page url. Plain concatenation would glue the
    # href onto the page name ('.../cerberus.html' + 'images/x.jpg').
    imageurl = urljoin(url, relimageurl['href'])

    # Populate the dictionary with the retrieved info
    return {"title": title_tag.text, "img_url": imageurl}

# Try the new function on a single hemisphere page before looping over all of them
getImageInfo('https://marshemispheres.com/cerberus.html')
    

In [None]:
# Rebuild the list from scratch on every run so that re-executing this cell
# never appends duplicate entries to a list created in an earlier cell
image_list = [getImageInfo(url) for url in urllist]
image_list

In [None]:
# Be polite and close the browser session opened at the top of the notebook
# (the Mars Facts and Hemispheres sections use pandas and requests instead)
browser.quit()