<a href="https://colab.research.google.com/github/poudelmohit/project_IUCN/blob/main/iucn_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the current working directory of the runtime.
! pwd

/content


# Mounting Directory:

In [2]:
# Mount Google Drive, then make the "Colab Notebooks" folder inside it the
# working directory for the rest of the notebook.
import os

from google.colab import drive

MOUNTPOINT = '/content/drive'
drive.mount(MOUNTPOINT)

directory = os.path.join(MOUNTPOINT, 'MyDrive', 'Colab Notebooks')
os.chdir(directory)

Mounted at /content/drive


In [4]:
# Clone the project repository into the current working directory.
! git clone https://github.com/poudelmohit/project_IUCN

Cloning into 'project_IUCN'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 9 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (9/9), 9.49 KiB | 1.19 MiB/s, done.


In [17]:
os.chdir('project_IUCN')
! ls

all_download_links.csv	data_extraction.py  iucn_data_extraction.ipynb	mammals_list.txt


In [19]:
! pip install selenium

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m22.

## creating a function:

In [25]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time

def search_iucn_species(species_name, search_wait=3, page_wait=3, download_wait=4):
    """
    Look up a species on the IUCN Red List website and collect its common
    name and the link to its assessment-report PDF.

    A headless Chrome session is started for every call and is always closed
    before the function exits, whether or not the lookup succeeds.

    Args:
        species_name (str): Text typed into the site's search box, preferably
            the scientific name.
        search_wait (float): Seconds to pause after submitting the search so
            the results can load. Defaults to 3.
        page_wait (float): Seconds to pause after opening the first result so
            the species page can load. Defaults to 3.
        download_wait (float): Seconds to pause after clicking the download
            button so the download options can appear. Defaults to 4.

    Returns:
        dict: A dictionary with the keys
            - "scientific_name": ``species_name`` exactly as passed in.
            - "common_name": The headline text of the species page.
            - "download_link": The href of the first download link, or None
              if no download links are found.

    Raises:
        ValueError: If ``species_name`` is not a non-blank string (for
            example an empty string or a NaN coming from a pandas column).
        selenium.common.exceptions.NoSuchElementException: Raised by
            Selenium's ``find_element`` when an expected page element (search
            box, first result, headline, download button) is not present.
    """
    # Validate before launching a browser: a blank or non-string name would
    # otherwise cost a full Chrome session just to search for nothing.
    if not isinstance(species_name, str) or not species_name.strip():
        raise ValueError(
            f"species_name must be a non-empty string, got {species_name!r}"
        )

    # Configure Chrome to run without a visible window.
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Open the landing page and submit the query through the site search box.
        driver.get("https://www.iucnredlist.org/")
        search_box = driver.find_element("css selector", "input.search.search--site")
        search_box.send_keys(species_name)
        search_box.send_keys(Keys.RETURN)
        time.sleep(search_wait)

        # Open the first search result.
        driver.find_element("css selector", "a.link--faux").click()
        time.sleep(page_wait)

        # The species page headline is reported as the common name.
        headline_text = driver.find_element("css selector", "h1.headline__title").text

        # Reveal the download options (the button is located by its name attribute).
        driver.find_element("name", "download_search_results").click()
        time.sleep(download_wait)

        # Keep only the first download link, if any were rendered.
        download_buttons = driver.find_elements("css selector", "a.link--download")
        if download_buttons:
            first_href = download_buttons[0].get_attribute("href")
        else:
            first_href = None
            print("No download buttons found.")

        return {
            "scientific_name": species_name,
            "common_name": headline_text,
            "download_link": first_href,
        }

    finally:
        # Always release the browser, even when a lookup step raised.
        driver.quit()



In [24]:
# Try the lookup on a single species and print the returned dictionary.
print(search_iucn_species("Didelphis virginiana"))

{'scientific name': 'Didelphis virginiana', 'common name': 'Virginia Opossum', 'download_link': 'https://www.iucnredlist.org/species/pdf/22176259'}


In [None]:
## The function works as intended; the next step is to build a list of species to run it on.
## For now, I am iterating over all mammal species of North America.


## Obtaining North American Mammals List:

In [26]:
import pandas as pd


In [36]:
# Parse the HTML tables on the Mammal Diversity Database "explore" page and
# keep the first one as the species table.
url = 'https://www.mammaldiversity.org/explore.html'
mammal_tables = pd.read_html(url)
mammal_database = mammal_tables[0]

In [39]:
# Inspect the column names of the downloaded table.
mammal_database.columns

Index(['Species ID', 'Genus', 'Species', 'Family', 'Order'], dtype='object')

In [40]:
# Join genus and species into a "Genus species" name for every row, store it
# as a new column, and write the names one per line (no header, no index).
scientific_names = mammal_database['Genus'] + " " + mammal_database['Species']
mammal_database['scientific_name'] = scientific_names
mammal_database['scientific_name'].to_csv(
    "mammals_list.txt",
    index=False,
    header=None,
)

In [43]:
# Run the lookup for the first three species of the list (a trial run) and
# save the collected links to a CSV file.
from selenium.common.exceptions import WebDriverException

results = []

df_dict = {}  # not used by any cell visible here; kept in case a later cell reads it

for species in mammal_database['scientific_name'][0:3]:
    print("working on", species)
    try:
        result = search_iucn_species(species)
    except (WebDriverException, ValueError) as error:
        # One species that cannot be found must not abort the run and throw
        # away every result gathered so far: record a placeholder row instead.
        print("lookup failed for", species, "-", error)
        result = {
            "scientific_name": species,
            "common_name": None,
            "download_link": None,
        }
    results.append(result)

df_link = pd.DataFrame(results)
df_link.to_csv("all_download_links.csv")

working on Ornithorhynchus anatinus
working on Tachyglossus aculeatus
working on Zaglossus attenboroughi


In [45]:
# Display the table of collected download links.
df_link

Unnamed: 0,scientific_name,common_name,download_link
0,Ornithorhynchus anatinus,Platypus,https://www.iucnredlist.org/species/pdf/21964009
1,Tachyglossus aculeatus,Short-beaked Echidna,https://www.iucnredlist.org/species/pdf/21964662
2,Zaglossus attenboroughi,Sir David's Long-beaked Echidna,https://www.iucnredlist.org/species/pdf/21964353


In [51]:
# Set the global git identity (email and user name) for this runtime.
! git config --global user.email "poudelmohit59@gmail.com"
! git config --global user.name "poudelmohit"

In [49]:
# Commit the regenerated species list.
! git add mammals_list.txt
! git commit -m "obtained new list of mammals using ASM mammal diversity database"

# Commit the notebook itself as a separate change.
! git add iucn_data_extraction.ipynb
! git commit -m "updated function to download iucn pdf files"

[main 1b73e5a] obtained new list of mammals using ASM mammal diversity database
 2 files changed, 6754 insertions(+), 212 deletions(-)
 create mode 100644 iucn_data_extraction.ipynb
 rewrite mammals_list.txt (99%)
[main ebc78cf] updated function to download iucn pdf files
 1 file changed, 1 insertion(+), 1 deletion(-)


In [53]:
# Check the state of the working tree and how far the branch is ahead of the remote.
! git status

On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   iucn_data_extraction.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [52]:
# Publish the local commits.
# NOTE: in the recorded run this failed because git could not read a
# username for https://github.com, i.e. no credentials were available.
! git push

fatal: could not read Username for 'https://github.com': No such device or address
