In [27]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# ChromeDriverManager().install() returns the path of a chromedriver binary
# (downloading one, or reusing the cached copy); Splinter takes that path as
# its `executable_path` keyword argument.
executable_path = dict(executable_path=ChromeDriverManager().install())
# headless=False keeps the automated Chrome window visible while cells run.
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [C:\Users\PC\.wdm\drivers\chromedriver\win32\101.0.4951.41\chromedriver.exe] found in cache


In [3]:
# Assign the URL of the Mars NASA news site and instruct the browser to visit it
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page.
# This call does two things:
#   - searches for an element matching the CSS selector 'div.list_text'
#     (a <div> tag whose class is list_text)
#   - allows up to one second (wait_time=1) for that element to show up,
#     which helps because dynamic pages can take a moment to load,
#     especially ones with lots of images
# It is the last expression of the cell, so its boolean result is the cell output.
browser.is_element_present_by_css('div.list_text', wait_time=1)


True

In [5]:
# Grab the page source from the automated browser and parse it
html = browser.html
news_soup = soup(html, features='html.parser')

# 'div.list_text' is CSS-selector shorthand for <div class="list_text">.
# select_one() hands back only the FIRST element that matches the selector,
# together with everything nested inside it. That <div> is the parent element
# used below to drill down to the title and the teaser paragraph.
slide_elem = news_soup.select_one('div.list_text')

In [13]:
# Look inside slide_elem (the parent <div> selected earlier) for the <div>
# whose class is content_title. The tag itself is displayed as the cell output,
# markup included.
slide_elem.find('div', attrs={'class': 'content_title'})

<div class="content_title">Common Questions about InSight's 'Mole'</div>

In [14]:
# Locate the title <div> inside the parent element, then keep only its text.
title_tag = slide_elem.find('div', class_='content_title')
# get_text() returns the text content only, without any HTML tags.
news_title = title_tag.get_text()
news_title

"Common Questions about InSight's 'Mole'"

In [15]:
# Pull the article summary (teaser) that belongs to the same parent element.
# find() returns only the first element with the requested tag and class,
# whereas find_all() would return every match. The search is deliberately kept
# general so it picks up whatever article is currently at the top of the page
# rather than one specific article.
teaser_tag = slide_elem.find('div', class_='article_teaser_body')
# Keep just the text of the teaser, without the surrounding markup.
news_p = teaser_tag.get_text()
news_p

'The following Q&As with members of the team answer some of the most common questions about the burrowing device, part of a science instrument called the Heat Flow and Physical Properties Package (HP3).'

### Featured Images

In [16]:
#pull the image from website for use in our app
#want full size image so need splinter to click the full image button
#That directs us to a slide show


In [17]:
# Set up the URL of the space-images site and send the automated browser there.
# Note: this reassigns `url`, which previously held the news-site address.
url = 'https://spaceimages-mars.com/'
browser.visit(url)

In [20]:
# Find and click the "full image" button so the full-size picture is loaded.
# Per the page's HTML at the time of writing there are three <button> tags;
# the one we want is the second, i.e. index 1.
buttons = browser.find_by_tag('button')
full_image_elem = buttons[1]
full_image_elem.click()

In [21]:
# The click changed what the testing browser shows, so take the HTML again
# and parse it with BeautifulSoup before scraping the full-size image URL.
html = browser.html
img_soup = soup(html, features='html.parser')

In [23]:
# Find the relative image URL.
# In the browser's DevTools the full-size picture appears as
#   <img class="fancybox-image" src="...">
# Hard-coding the src value would always fetch that one picture; looking the
# tag up by its class and reading src returns whichever image is posted now.
featured_img = img_soup.find('img', class_='fancybox-image')
# get('src') reads the tag's src attribute from the HTML parsed earlier.
img_url_rel = featured_img.get('src')
# Only a partial (relative) link: the site's base URL is not included, so it
# will not open in a new tab until the base URL is put in front of it.
img_url_rel

'image/featured/mars1.jpg'

In [26]:
# Put the site's base URL in front of the relative path to get a whole,
# absolute URL. The value is filled in when the cell runs, so it reflects
# whatever img_url_rel holds at that moment.
img_url = 'https://spaceimages-mars.com/{}'.format(img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars1.jpg'

### Scrape the mars data table

In [None]:
#want to keep the table format so the HTML table format is important
#all of the data we want is in the <table /> tag 
#inside the table is the <tbody /> tag which is the body of the table
    #headers, columns, and rows
#<tr /> is the tag for each table row
#within that tag the data is stored in <td /> 
    #where columns are established
    
#instead of scraping each row or the data in each <td />, we're going to scrape
    #the whole table with pandas' read_html() function
    

In [28]:
# read_html() looks specifically for <table> elements on the page and returns
# one DataFrame per table; index 0 picks the first table.
tables = pd.read_html('https://galaxyfacts-mars.com/')
df = tables[0]
# Give the columns clear names.
df.columns = ['description', 'Mars', 'Earth']
# Turn the description column into the index. The result is assigned back to
# the same name, so `df` still refers to the updated frame in later cells.
df = df.set_index('description')
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [30]:
# The table is going into a new web page, so it is needed as HTML.
# pandas' DataFrame.to_html() converts the DataFrame back into HTML-ready markup.
df.to_html()
# The resulting block of text can be added directly to the new app,
# where it renders as the same table in an easy-to-read format.

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [31]:
#got all the info we need --- can end the automated browsing session
#important line to add to the web app also 
    #without it the automated browser won't know to shut down 
    #it will keep running and put a strain on resources if left on
    #only want the browser to be on when we're scraping data
    #like turning off a light switch when you're ready to leave a room or home

In [32]:
# Shut down the automated browsing session now that all the data is collected
browser.quit()

In [33]:
#jupyter notebook is great for scraping one thing at a time, or building
#your code in chunks - 1 for image, 1 for article, 1 for facts
#allows testing of each chunk independently from the others
#to fully automate the scraping, move the code into a .py script
    #a notebook is run interactively, cell by cell, so it isn't suited to unattended runs
    
#copy the current code into a .py file
    #wont transition perfectly but better than individ copying everything
    
#How to transfer the notebook to a .py file
# click on File, then Download as, and select .py from the next menu to download
#ignore the browser warning and click Keep
#go to the downloads folder and open the new file
#clean up the code by removing unnecessary blank lines and comments

#can then test the script by running through the terminal