In [1]:
# Import dependencies
from time import sleep
import pandas as pd
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from selenium.webdriver import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import pymongo
import requests

In [2]:
# Ask webdriver_manager for a ChromeDriver binary path, then start a visible
# (non-headless) Chrome session driven by splinter.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183
[WDM] - Driver [C:\Users\Boss\.wdm\drivers\chromedriver\win32\85.0.4183.87\chromedriver.exe] found in cache


 


## Extract: scrape animal data from zoo.org.au

In [3]:
# Zoos Victoria page listing the local threatened species
zoo_url = "https://www.zoo.org.au/fighting-extinction/local-threatened-species/"

# Point the splinter-driven browser at that page
browser.visit(zoo_url)

In [4]:
# Snapshot the page currently loaded in the browser and parse it with lxml
html = browser.html
soup = bs(markup=html, features='lxml')

In [5]:
# Every <div> carrying the "feature-tile__title" class holds one tile title
animal_html_list = soup.find_all("div", {"class": "feature-tile__title"})

In [6]:
# For each title, keep only the text before the first "(" and trim whitespace
animal_list = [
    tile.get_text().split("(")[0].strip()
    for tile in animal_html_list
]

In [7]:
# Display the cleaned list of animal names
animal_list

['Alpine She-oak Skink',
 'Baw Baw Frog',
 'Brush-tailed Rock-wallaby',
 'Eastern Barred Bandicoot',
 'Giant Burrowing Frog',
 'Golden-rayed Blue Butterfly',
 'Grassland Earless Dragon',
 'Guthega Skink',
 'Helmeted Honeyeater',
 'Key’s Matchstick Grasshopper',
 'Large Brown Tree Frog',
 "Leadbeater's Possum",
 'Lord Howe Island Stick Insect',
 'Mallee Emu-wren',
 'Mountain Pygmy-possum',
 'New Holland Mouse',
 'Northern Corroboree Frog',
 'Orange-bellied Parrot',
 'Plains-wanderer',
 'Regent Honeyeater',
 'Smoky Mouse',
 'Southern Bent-wing Bat',
 'Southern Corroboree Frog',
 'Spotted Tree Frog',
 'Stuttering Barred Frog',
 'Swift Parrot',
 'Tasmanian Devil']

In [8]:
# Find links whose text contains "Learn More" and click (splinter clicks the first match)
learn_more_links = browser.links.find_by_partial_text("Learn More")
learn_more_links.click()

In [9]:
# Walk through the animal detail pages one at a time. The browser is expected
# to already be on the first animal's page; after each page is scraped, the
# navigation link at the XPath below is clicked to move to the next page.
# One record (dict) per animal is collected in `animal_info_image_urls`.
animal_info_image_urls = []

for item in animal_list:

    # Give the page a moment to load before reading its HTML
    sleep(1)
    html = browser.html
    soup = bs(html, 'lxml')

    # The first <img> inside a <picture> supplies both the image path and its
    # alt text, so look it up once and read both attributes from it.
    first_image = soup.select_one("picture img")
    image_url = first_image["src"]
    full_image_url = f"https://www.zoo.org.au{image_url}"
    image_alternative = first_image["alt"]

    # Retrieve the introduction paragraph
    intro = soup.find("p", class_="intro").text.strip()

    # Retrieve the threat text. Prefer the dedicated right-weighted column;
    # when the page has no such <div>, fall back to the 2nd-4th <p> elements.
    # The fallback is joined with newlines so `threat_paragraph` is always a
    # string (previously it was a list in this branch only).
    threat_section = soup.find("div", class_="row-wrapper--right-weighted-column")
    if threat_section is not None:
        threat_paragraph = threat_section.text.strip()
    else:
        fallback_paragraphs = soup.find_all("p")[1:4]
        threat_paragraph = "\n".join(p.text.strip() for p in fallback_paragraphs)

    # Append the dictionary with the above info to the list
    animal_info_image_urls.append({
        "animal_name": item,
        "image_url": full_image_url,
        "image_alternative": image_alternative,
        "introduction": intro,
        "threat_paragraph": threat_paragraph
    })

    # Move on to the next animal's page via the page navigation link
    browser.find_by_xpath('//*[@id="main-content"]/nav/div/div/div/div[2]/nav/a[2]').click()

In [10]:
# Preview the first ten scraped records
animal_info_image_urls[:10]

[{'animal_name': 'Alpine She-oak Skink',
  'image_url': 'https://www.zoo.org.au/media/2050/1023_alpine_she-oak_skink_healesville_sanctuary1.jpg?anchor=center&mode=crop&quality=75&width=2000&height=570&rnd=132131638000000000',
  'image_alternative': 'Alpine She-oak Skink sunning it self on a rock with its tongue out.',
  'introduction': 'Found in only a few locations in Victoria and NSW, the Alpine She-oak Skink is listed as endangered.',
  'threat_paragraph': "Major threats\nFire is a huge danger to the Alpine She-Oak Skink. It kills individual skinks and destroys the understory vegetation. This makes any survivors vulnerable to predators.\nClimate change is obviously a threat to all alpine areas. So is grazing and trampling by cattle and feral horses. The skink's habitat is also disturbed by the construction and maintenance of ski resorts and roads."},
 {'animal_name': 'Baw Baw Frog',
  'image_url': 'https://www.zoo.org.au/media/2052/21295_baw_baw_frog_melbourne_zoo1.jpg?anchor=center

## Extract: scrape animal data from environment.vic.gov.au

In [None]:
# Threatened-species page on environment.vic.gov.au that will be scraped
url = "https://www.environment.vic.gov.au/conserving-threatened-species/threatened-species"

# Open it in the splinter-controlled browser
browser.visit(url)

In [None]:
# Parse the page currently loaded in the browser with the 'lxml' parser
html = browser.html
soup = bs(markup=html, features='lxml')

In [None]:
# Collect every element carrying the "internal-link" class
text_with_links = soup.find_all(attrs={"class": "internal-link"})

# Print the visible text of each one to see what was captured
for link in text_with_links:
    link_text = link.get_text()
    print(link_text)

In [None]:
# Only fauna is of interest: skip the first link and the last four,
# then keep the visible text of the remaining ones.
text_to_search = [link.get_text() for link in text_with_links[1:-4]]
text_to_search

In [None]:
# Scroll the window to y=1000 so that the link is in view
scroll_script = 'window.scrollTo(0, 1000);'
browser.execute_script(scroll_script)

In [None]:
# XPath of the anchor to click, kept as a string so this cell is valid Python
# (the bare expression raised a SyntaxError on Run All). It is the same
# expression that the following cell passes to browser.find_by_xpath.
link_xpath = '//*[@id="content_container_70299"]/p[5]/a'

In [None]:
# Locate the anchor by its XPath, then click it
target_link = browser.find_by_xpath('//*[@id="content_container_70299"]/p[5]/a')
target_link.click()

In [None]:
# Click the link whose text contains "Hooded Plover".
# `links.find_` is not a splinter method; use find_by_partial_text, the same
# finder the other cells in this notebook use.
browser.links.find_by_partial_text("Hooded Plover").click()

In [None]:
# Re-parse whatever page the browser is showing now
html = browser.html
soup = bs(markup=html, features='lxml')

In [None]:
# Visit each species page from the list page, record what is scraped there,
# and come back. One dict per species is collected in `all_species_scraped`.
all_species_scraped = []

for text in text_to_search:

    # Open the species page by clicking the link whose text contains its name
    browser.links.find_by_partial_text(text).click()
    sleep(1)
    html = browser.html
    soup = bs(html, 'lxml')

    # NOTE(review): this takes the href of the first <a> nested in an <li>;
    # confirm that this is really the species image link on these pages.
    img_url = soup.select_one("li a")["href"]

    each_specie = {
        "name": text.lower(),
        # TODO: the overview extraction was left unfinished in the original
        # cell (`each_specie["overview"] =`); the selector is still to be chosen.
        "overview": None,
        "img_url": img_url,
    }

    # Store the record (the original appended to an undefined
    # `hemisphere_image_urls` list and referenced an undefined `title`).
    all_species_scraped.append(each_specie)

    # Go back to the list page so the next species link can be found
    browser.back()
    sleep(1)