<a href="https://colab.research.google.com/github/poudelmohit/project_IUCN/blob/main/iucn_pdf_link_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting Directory:

In [None]:
import os
from google.colab import drive

# Attach Google Drive to the Colab runtime.
MOUNTPOINT = '/content/drive'
drive.mount(MOUNTPOINT)

# Switch the working directory to the project folder on Drive, so the
# relative file names used by later cells are resolved inside it.
directory = os.path.join(MOUNTPOINT, *('MyDrive', 'Colab Notebooks', 'LAB', 'project_IUCN'))
os.chdir(directory)

Mounted at /content/drive


In [None]:
! pip install selenium

Collecting selenium
  Downloading selenium-4.24.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.24.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m23.

## Creating a function to search the IUCN Red List:

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

def search_iucn_species(species_name, wait_seconds=1):
    """
    Search the IUCN Red List website for a species and collect the link to its
    assessment report PDF.

    A headless Chrome session opens the Red List home page, submits
    ``species_name`` through the site search box, follows the first result,
    reads the page headline, opens the download menu and takes the ``href`` of
    the first download link.

    Args:
        species_name (str): The name to search for, preferably the scientific name.
        wait_seconds (float): Pause, in seconds, after each of the three page
            interactions (search submit, opening the result, opening the
            download menu). Defaults to 1; raise it on slow connections.

    Returns:
        dict: A dictionary with the keys
            - "scientific_name": The input species name, unchanged.
            - "common_name": The headline text of the species page, or None
              if the lookup failed.
            - "download_link": The URL of the first download link, or None if
              no download links were found or the lookup failed.

    Any exception raised while driving the page is caught and reported on
    stdout; the function then returns the dictionary with None values, so a
    batch run can continue with the next species.
    """
    not_found = {
        "scientific_name": species_name,
        "common_name": None,
        "download_link": None
    }

    # A blank name would submit an empty search and follow whatever result
    # comes first, so skip it without starting a browser at all.
    if not species_name or not species_name.strip():
        print("Skipping empty species name")
        return not_found

    # Headless Chrome configured for a container environment such as Colab
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    # Initialize the WebDriver with Chrome options
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Open the IUCN Red List website
        driver.get("https://www.iucnredlist.org/")

        # Type the species name into the site search box and submit with Enter
        search_box = driver.find_element("css selector", "input.search.search--site")
        search_box.send_keys(species_name)
        search_box.send_keys(Keys.RETURN)

        # Wait for the search results to load
        time.sleep(wait_seconds)

        # Open the first result (first link with the class "link--faux")
        driver.find_element("css selector", "a.link--faux").click()

        # Wait for the species page to load after clicking the link
        time.sleep(wait_seconds)

        # The page headline is reported as the common name
        headline_text = driver.find_element("css selector", "h1.headline__title").text

        # Open the download menu
        driver.find_element("name", "download_search_results").click()

        # Wait for the download options to appear
        time.sleep(wait_seconds)

        # find_elements returns an empty list instead of raising when nothing matches
        download_buttons = driver.find_elements("css selector", "a.link--download")
        if download_buttons:
            first_href = download_buttons[0].get_attribute("href")
        else:
            first_href = None
            print(f"No download buttons found for species: {species_name}")

        return {
            "scientific_name": species_name,
            "common_name": headline_text,
            "download_link": first_href
        }

    except Exception as e:
        # Report what went wrong (first line only, browser errors can carry
        # long stack traces) and fall through to the "not found" record.
        detail = (str(e).strip().splitlines() or [""])[0]
        print(f"Error searching for species: {species_name} ({type(e).__name__}: {detail})")
        return not_found

    finally:
        # Always close the browser, on success and on failure
        driver.quit()



In [None]:
# Manual check on one known species: prints the dict returned by search_iucn_species.
print(search_iucn_species("Didelphis virginiana"))

{'scientific_name': 'Didelphis virginiana', 'common_name': 'Virginia Opossum', 'download_link': 'https://www.iucnredlist.org/species/pdf/22176259'}


In [None]:
## This function works well; next, I need a list of species to run it on.
## For now, I am iterating over all mammal species of North America.


## Obtaining Mammals List:

In [None]:
import pandas as pd


## obtaining the list of mammals from Mammal Diversity Database:

In [None]:

# Page of the Mammal Diversity Database that holds the species table.
url = 'https://www.mammaldiversity.org/explore.html'

# read_html returns one DataFrame per table found on the page; the first is used.
mammal_database = pd.read_html(url)[0]

# Build the two-part name "Genus Species" as an extra column.
mammal_database['scientific_name'] = (
    mammal_database['Genus'].add(" ").add(mammal_database["Species"])
)

# Plain Python list of names, with the database spelling
# 'Caluromysiops irruptus' swapped for 'Caluromysiops irrupta'
# (presumably the spelling the IUCN site expects; verify).
mammals_list = [
    name.replace('Caluromysiops irruptus', 'Caluromysiops irrupta')
    for name in mammal_database['scientific_name'].to_list()
]

# Persist the names as a single comma-separated line.
with open('mammals_list.txt', 'w') as out_file:
    out_file.write(','.join(mammals_list))

In [None]:
# Same single-species check as earlier, repeated here before the loop over the full list.
print(search_iucn_species("Didelphis virginiana"))

{'scientific_name': 'Didelphis virginiana', 'common_name': 'Virginia Opossum', 'download_link': 'https://www.iucnredlist.org/species/pdf/22176259'}


## Iterating the function over each species in mammals_list:

In [None]:
import os

import pandas as pd

# Results file; one row is appended per species as soon as it is processed,
# so an interrupted run keeps everything collected so far.
OUTPUT_CSV = "all_download_links.csv"

# Read the single comma-separated line of species names from mammals_list.txt
with open('mammals_list.txt', 'r') as file:
    mammals_list = file.readline().split(',')

# Resume support: if the CSV already has content, remember which species are
# recorded so re-running this cell neither repeats the lookups nor appends
# duplicate rows. Species recorded with empty results are skipped as well.
csv_has_rows = os.path.exists(OUTPUT_CSV) and os.path.getsize(OUTPUT_CSV) > 0
if csv_has_rows:
    already_done = set(pd.read_csv(OUTPUT_CSV, usecols=["scientific_name"])["scientific_name"])
else:
    already_done = set()

# The header row is written once, only when the file is new or empty
write_header = not csv_has_rows

# Loop over each species and its index in the mammals_list
for index, species in enumerate(mammals_list):
    species = species.strip()  # Remove any leading/trailing whitespace

    # Skip blank entries (e.g. from a trailing comma) and species already saved
    if not species or species in already_done:
        continue

    print(f"Working on species: {species} (Position: {index + 1})")

    # Look the species up; returns a dict with scientific_name, common_name, download_link
    result = search_iucn_species(species)

    # Append the single-row result to the CSV file
    pd.DataFrame([result]).to_csv(OUTPUT_CSV, mode='a', index=False, header=write_header)
    write_header = False


Working on species: Ornithorhynchus anatinus (Position: 1)
Working on species: Tachyglossus aculeatus (Position: 2)
Working on species: Zaglossus attenboroughi (Position: 3)
Working on species: Zaglossus bartoni (Position: 4)
Working on species: Zaglossus bruijnii (Position: 5)
Working on species: Caenolestes caniventer (Position: 6)
Working on species: Caenolestes condorensis (Position: 7)
Working on species: Caenolestes convelatus (Position: 8)
Working on species: Caenolestes fuliginosus (Position: 9)
Working on species: Caenolestes sangay (Position: 10)
Working on species: Lestoros inca (Position: 11)
Working on species: Rhyncholestes raphanurus (Position: 12)
Working on species: Caluromys derbianus (Position: 13)
Working on species: Caluromys lanatus (Position: 14)
Working on species: Caluromys philander (Position: 15)
Working on species: Caluromysiops irrupta (Position: 16)
Working on species: Chironectes minimus (Position: 17)
Working on species: Didelphis albiventris (Position: 