In [1]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
import requests
import re
import pandas as pd
import time

In [2]:
# Helper that turns raw markup into a BeautifulSoup tree
def create_soup(html):
    """Parse ``html`` with the built-in ``html.parser`` and return the soup.

    Args:
        html: Markup to parse (a string, e.g. ``browser.html`` or
            ``requests.get(...).text``).

    Returns:
        The ``BeautifulSoup`` object built from ``html``.
    """
    parsed = bs(html, features='html.parser')
    return parsed

In [3]:
# Create a function that returns a Splinter Browser object
def init_browser(executable_path='/usr/local/bin/chromedriver', headless=False):
    """Start a Chrome browser controlled by Splinter.

    Args:
        executable_path: Location of the chromedriver binary. Defaults to
            the path this notebook originally hard-coded; override it on
            machines where chromedriver lives elsewhere.
        headless: When True, run Chrome without a visible window.

    Returns:
        A Splinter ``Browser`` instance. The caller is responsible for
        calling ``quit()`` on it when done.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)

In [4]:
# Scraping NASA Mars News
url = 'https://mars.nasa.gov/news/'
# Create browser object
browser = init_browser()
try:
    browser.visit(url)
    # Wait (up to 10 s) for the first article title to show up instead of
    # sleeping a fixed 2 s. Presumably the list is filled in after the
    # initial page load, which would explain the intermittent failures.
    browser.is_element_present_by_css('ul.item_list div.content_title', wait_time=10)
    # Create Soup object
    html = browser.html
    soup = create_soup(html)
finally:
    # Always close the browser, even if loading or parsing the page fails,
    # so no stray Chrome/chromedriver process is left behind.
    browser.quit()

# Scrape to get news title and content (both come from the same list)
news = soup.find('ul', class_='item_list')
news_title = news.find('div', class_='content_title').text
news_p = news.find('div', class_='article_teaser_body').text
# Splinter is very unstable for this website. Sometimes the above code works and sometimes not.
# Therefore displaying the title and paragraph text below

In [5]:
news_title

'Perseverance Scientists Train for Mars in Nevada Desert'

In [6]:
news_p

"Team members searched for signs of ancient microscopic life there, just as NASA's latest rover will on the Red Planet next year."

In [7]:
# # Scraping NASA Mars News with requests instead of Splinter (disabled alternative)
# url = 'https://mars.nasa.gov/news/'

# # Create soup object
# html = requests.get(url).text
# soup = create_soup(html)

# # Scrape the webpage
# news_title = soup.find('div',class_='content_title').a.text.strip()
# news_p = soup.find('div', class_='image_and_description_container').text.strip()

In [8]:
# Scraping JPL Mars Space Images
base_url = "https://www.jpl.nasa.gov"
scrape_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

# Create Browser object
browser = init_browser()
try:
    browser.visit(scrape_url)

    # Create Soup object
    html = browser.html
    soup = create_soup(html)
finally:
    # Close the browser even if the visit or parsing raises
    browser.quit()

# Scrape the webpage: the featured image path is embedded in the inline
# style of the first <article>, e.g. "background-image: url('/path.jpg');"
article_style = soup.find('article')['style']

# Pull out whatever sits inside url(...), tolerating either quote style
# (or none) and optional whitespace, instead of stripping fixed substrings.
url_match = re.search(r"url\(\s*['\"]?(.*?)['\"]?\s*\)", article_style)
if url_match is None:
    raise ValueError(
        'No url(...) found in the featured article style: ' + repr(article_style)
    )
featured_image_url = url_match.group(1)

# Build complete URL
featured_image_url = base_url + featured_image_url

In [9]:
# # Scraping Mars Weather Twitter account with Splinter - did not work
# scrape_url = 'https://twitter.com/marswxreport?lang=en'

# # Create Browser object
# browser = init_browser()
# browser.visit(scrape_url)

# # Create Soup object
# html = browser.html
# soup = create_soup(html)

# # Scrape the website - Assuming that if the text contains low,high and pressure it is related to weather
# mars_weather = soup.find_all(string=re.compile('low.*high.*\n.*\npressure.*'))[0]

# browser.quit()

In [10]:
# Scraping Mars Weather Twitter account
url = 'https://twitter.com/marswxreport?lang=en'

# Create soup object
# A timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = create_soup(html)

# Scrape the website - assuming that a text node containing "low", "high"
# and (two lines later) "pressure" is a weather report
weather_pattern = re.compile('low.*high.*\n.*\npressure.*')
weather_matches = soup.find_all(string=weather_pattern)
if not weather_matches:
    # Same exception type as indexing an empty list, but with a useful message
    raise IndexError('No text matching the Mars weather pattern was found at ' + url)
mars_weather = weather_matches[0]


In [11]:
# Scraping Mars Facts Webpage
url = 'https://space-facts.com/mars/'

# Read the first HTML table on the page into a DataFrame
mars_df = pd.read_html(url)[0]
mars_df.columns = ['description','value']
# regex=False: ':' is a literal character here, so do not depend on the
# pandas-version-specific default for regex handling in str.replace
mars_df['description'] = mars_df['description'].str.replace(':', '', regex=False)

# NOTE: a previous `mars_df.set_index('description', inplace=False)` call was
# removed. Its result was discarded, so it never changed mars_df. The
# 'description' values must stay a regular column, because the export below
# uses index=False and would otherwise drop them from the table.

# Convert the DataFrame to an HTML table (single line, without the index)
html_data = mars_df.to_html(index=False).replace('\n','')

In [12]:
# Scraping USGS Astrogeology site
base_url = 'https://astrogeology.usgs.gov'
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# Create soup object for the search-results page
# (timeout + raise_for_status so a stalled or failed request is reported
# instead of hanging or silently parsing an error page)
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = create_soup(html)

# Retrieve the items (one per hemisphere)
items = soup.find_all('div',class_='item')

# Dictionary and list to retrieve the results of scraping
title_url_dict = {}
hemisphere_image_urls = []

for item in items:
    # Look the heading up anywhere inside the item rather than only inside
    # the first <a>; NOTE(review): assumes each item has exactly one <h3>.
    title = item.find('h3').text
    img_url = base_url + item.a['href']

    # Fetch the hemisphere detail page; a separate name keeps the
    # search-results `html` above from being overwritten inside the loop
    img_response = requests.get(img_url, timeout=30)
    img_response.raise_for_status()
    img_html = img_response.text
    img_soup = create_soup(img_html)

    # Get the image url string from the src attribute of the image
    full_img_url = base_url + img_soup.find('img',class_='wide-image')['src']

    # Build a fresh dictionary for this hemisphere
    title_url_dict = {'title':title,'img_url':full_img_url}

    # Update list
    hemisphere_image_urls.append(title_url_dict)