<a href="https://colab.research.google.com/github/poudelmohit/project_IUCN/blob/main/iucn_pdf_link_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steps (Workflow):


1.   Mouting directory and installing required libraries
2.   Obtain a list of mammals
3.   obtain IUCN data download link for each of those species
4.   Download IUCN-report of each species using the download link obtained
5.   extract IUCN information into a dataframe by reading pdfs
6.   genomic approaches

# 1.1 Mounting Directory:




In [None]:
from google.colab import drive
MOUNTPOINT = '/content/drive'
drive.mount(MOUNTPOINT)

import os
directory = os.path.join(MOUNTPOINT,'MyDrive','Colab Notebooks','LAB','project_IUCN')
os.chdir(directory)

Mounted at /content/drive


In [None]:
! ls

all_download_links.csv	data_extraction.py  iucn_pdf_link_extraction.ipynb  mammals_list.txt


# 2. Obtaining the list of mammals from Mammal Diversity Database:

In [None]:

url =  'https://www.mammaldiversity.org/explore.html'
mammal_database = pd.read_html(url)[0]

mammal_database['scientific_name'] = mammal_database['Genus'] + " " + mammal_database["Species"]

# Convert DataFrame column to a list
mammals_list = mammal_database['scientific_name'].to_list()

# Correct the spelling error in the list
mammals_list = [species.replace('Caluromysiops irruptus', 'Caluromysiops irrupta') for species in mammals_list]


# Save to a text file, comma-separated
with open('mammals_list.txt', 'w') as file:
    file.write(','.join(mammals_list))

#### some issues here:
##### a. this list have some incorrect species name (while comparing with IUCN site)
###### b. this list has all names in a single line, which needs to be fixed while reading the file

In [None]:
# let's check the number of species we have:

with open('mammals_list.txt', 'r') as file:
    total_entries = sum(len(line.split(',')) for line in file)

print(f"Total number of entries: {total_entries}")


Total number of entries: 6753


#### 6753 entries are present currently.

# 3.1 Creating a function to obtain IUCN data download link:

In [None]:
# ! pip install selenium
# might require installation.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

def search_iucn_species(species_name):
    """
    Searches for a species on the IUCN Red List website and retrieves the common name and a download link of the IUCN species assessment report pdf.

    Args:
        species_name (str): The name of the species to search for, preferentially scientific name.

    Returns:
        dict(A dictionary containing):
            - "scientific_name": The input species name.
            - "common_name": The headline text of the species page.
            - "download_link": The URL of the first available download button, or None if no download buttons are found.
    """

    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Initialize the WebDriver with Chrome options
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Open the IUCN Red List website
        driver.get("https://www.iucnredlist.org/")

        # Find the search box element using the class attribute
        search_box = driver.find_element("css selector", "input.search.search--site")

        # Type the search query (species name) and hit Enter
        search_box.send_keys(species_name)
        search_box.send_keys(Keys.RETURN)

        # Wait for the search results to load
        time.sleep(1)

        # Find and click on the first 'View' link with the class "link--faux"
        view_link = driver.find_element("css selector", "a.link--faux")
        view_link.click()

        # Wait for the species page to load after clicking the link
        time.sleep(1)

        # Find the h1 element with the class "headline__title"
        headline = driver.find_element("css selector", "h1.headline__title")
        headline_text = headline.text

        # Find and click the download button with the specified class
        download_button = driver.find_element("name", "download_search_results")
        download_button.click()

        # Wait for the download options to appear
        time.sleep(1)

        # Find all 'link--download' buttons
        download_buttons = driver.find_elements("css selector", "a.link--download")
        if download_buttons:
            # Get the href attribute of the first download button
            first_href = download_buttons[0].get_attribute("href")
        else:
            first_href = None
            print(f"No download buttons found for species: {species_name}")

        # Create a dictionary with the headline and first href
        result = {
            "scientific_name": species_name,
            "common_name": headline_text,
            "download_link": first_href
        }
        return result

    except Exception as e:
        # Print the error message and skip to the next species
        print(f"Error searching for species: {species_name}")
        return {
            "scientific_name": species_name,
            "common_name": None,
            "download_link": None
        }

    finally:
        # Close the browser
        driver.quit()



In [None]:
# just a test:
print(search_iucn_species("Didelphis virginiana"))

{'scientific_name': 'Didelphis virginiana', 'common_name': 'Virginia Opossum', 'download_link': 'https://www.iucnredlist.org/species/pdf/22176259'}


#### This function works well. Now, I need to iterate it over the list of mammals (or any species) I have, to get the download link of the IUCN data.

#### Although, currently I have >6k entries, I will work with only ~1000 first, just to check the codes/pipelines.


# 3.2 Using the function to obtain the download links:

In [None]:
import pandas as pd

# Read the single line from the file mammals_list
with open('mammals_list.txt', 'r') as file:
    # Read the single line and split into a list of species names using comma as the separator
    mammals_list = file.readline().split(',')

# Loop over each species and its index in the mammals_list
for index, species in enumerate(mammals_list):
    species = species.strip()  # Remove any leading/trailing whitespace
    print(f"Working on species: {species} (Position: {index + 1})")

    # Assuming search_iucn_species is a function that takes a species name and returns some result
    result = search_iucn_species(species)

    # Create a DataFrame for the current result
    df_link = pd.DataFrame([result])

    # Append the result to the CSV file (without header after the first write)
    df_link.to_csv("all_download_links.csv", mode='a', index=False, header=not index)


#### Currently, >1000 species are iterated in the 'search_iucn_species()' function, and those are saved into: 'all_download_links.csv'

# 4. Download IUCN-reports from the dataframe:

In [None]:
df_report_download_link = pd.read_csv('all_download_links.csv')
df_report_download_link.columns = ['scientific_name','common_name','download_link']

# deleting rows without download link:
df_report_download_link = df_report_download_link[~df_report_download_link['download_link'].isnull()] # 146 rows have no download_links

df_report_download_link = df_report_download_link.reset_index(drop=True)

In [None]:
! mkdir iucn_reports

In [None]:
for link in df_report_download_link['download_link']:
     os.system(f"wget -P iucn_reports {link}")

In [27]:
! ls iucn_reports | wc -l

906


##### At this point, 906 pdfs have been downloaded

# 5. Obtaining IUCN-values from the pdfs:

In [50]:
# ! pip install pdfplumber
# import pdfplumber

In [49]:
# trying with a single pdf first:

# Open the PDF file
with pdfplumber.open('iucn_reports/229492631') as pdf:
    text = ''

    # Loop through all the pages
    for page in pdf.pages:
        text += page.extract_text()

# processing the text:
# Extract the fields using regular expressions
scientific_name = re.search(r'Scientific Name:\s*(.*)', text).group(1).strip()
taxonomy = re.search(r'^Animalia.*', text, re.MULTILINE).group(0).strip()
red_list_category = re.search(r'Red List Category & Criteria:\s*(.*)', text).group(1).strip()
date_assessed = re.search(r'Date Assessed:\s*(.*)', text).group(1).strip()
year_published = re.search(r'Year Published:\s*(.*)', text).group(1).strip()
current_population_trend = re.search(r'Current Population Trend:\s*(.*)', text).group(1).strip()
systems = re.search(r'Systems:\s*(.*)', text).group(1).strip()



range_description = re.search(r'Range Description:\s*(.*?)(?=\n[A-Z])', text, re.DOTALL).group(1).strip()
habitat_and_ecology = re.search(r'Habitat and Ecology\s*(.*?)(?=Threats)', text, re.DOTALL).group(1).strip()
threats = re.search(r'Threats\s*(.*?)(?=\nConservation Actions)', text, re.DOTALL).group(1).strip()



# Print the extracted fields
print(f"Scientific Name: {scientific_name}")
print(f"Taxonomy: {taxonomy}")
print(f"Red List Category & Criteria: {red_list_category}")
print(f"Date Assessed: {date_assessed}")
print(f"Year Published: {year_published}")
print(f"Current Population Trend: {current_population_trend}")
print(f"Systems: {systems}")
print(f"Range Description: {range_description}")

print(f"Habitat and Ecology: {habitat_and_ecology}")
print(f"Threats: {threats}")




Scientific Name: Tupaia tana Raffles, 1821
Taxonomy: Animalia Chordata Mammalia Scandentia Tupaiidae
Red List Category & Criteria: Least Concern ver 3.1
Date Assessed: November 16, 2022
Year Published: 2023
Current Population Trend: Decreasing
Systems: Terrestrial
Range Description: This species is found in Malaysia (Sabah, Sarawak, Banggi), Brunei and Indonesia (Kalimantan, Sumatra,
the Batu Islands, Lingga Islands, Bangka, Tambelan, Serasan) (Helgen 2005). Probably occurs to 1,500 m,
but more common up to 1,200 m (K.H. Han pers. comm.).
Habitat and Ecology: (see Appendix for additional information)
This species is found mainly in lowland forest, but is also fairly common in fruit orchards and secondary
forest. Wells (2005) found that in selectively logged forest with a dense understorey, this species
increases significantly.
Systems: Terrestrial
Threats: (see Appendix for additional information)
Although this species is found in fruit orchards, it is not present in unforested agricul

In [55]:
print(text)

The IUCN Red List of Threatened Species™
ISSN 2307-8235 (online)
IUCN 2023: T41501A229492631
Scope(s): Global
Language: English
Tupaia tana,
Large Treeshrew
Assessment by: Juman, M.M. & Sargis, E.J.
View on www.iucnredlist.org
Citation: Juman, M.M. & Sargis, E.J. 2023. Tupaia tana. The IUCN Red List of Threatened Species
2023: e.T41501A229492631. https://dx.doi.org/10.2305/IUCN.UK.2023-
1.RLTS.T41501A229492631.en
Copyright: © 2023 International Union for Conservation of Nature and Natural Resources
Reproduction of this publication for educational or other non-commercial purposes is authorized without prior written
permission from the copyright holder provided the source is fully acknowledged.
Reproduction of this publication for resale, reposting or other commercial purposes is prohibited without prior written
permission from the copyright holder. For further details see Terms of Use.
The IUCN Red List of Threatened Species™ is produced and managed by the IUCN Global Species Programme,

In [93]:
# import os
# import re
# import pdfplumber
# import pandas as pd

def extract_data_from_pdf(pdf_path):
    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        text = ''

        # Loop through all the pages and extract text
        for page in pdf.pages:
            text += page.extract_text()

    # Extract the fields using regular expressions
    try:
        scientific_name = re.search(r'^(Scientific Name:|Taxon Name:)\s*(.*)', text, re.MULTILINE).group(2).strip()
    except:
        scientific_name = ''
    try:
        taxonomy = re.search(r'^Animalia.*', text, re.MULTILINE).group(0).strip()
    except:
        taxonomy = ''
    try:
        red_list_category = re.search(r'Red List Category & Criteria:\s*(.*)', text).group(1).strip()
    except:
        red_list_category = ''
    try:
        date_assessed = re.search(r'Date Assessed:\s*(.*)', text).group(1).strip()
    except:
        date_assessed = ''
    try:
        year_published = re.search(r'Year Published:\s*(.*)', text).group(1).strip()
    except:
        year_published = ''
    try:
        current_population_trend = re.search(r'Current Population Trend:\s*(.*)', text).group(1).strip()
    except:
        current_population_trend = ''
    try:
        systems = re.search(r'Systems:\s*(.*)', text).group(1).strip()
    except:
        systems = ''
    try:
        range_description = re.search(r'Range Description:\s*(.*?)[.]\s', text, re.DOTALL).group(1).strip()
    except:
        range_description = ''
    try:
       habitat_and_ecology = re.search(r'Habitat and Ecology\s*(.*?)[.]\s', text, re.DOTALL).group(1).strip()
    except:
        habitat_and_ecology = ''
    try:
        threats = re.search(r'Threats\s*(.*?)[.]\s', text, re.DOTALL).group(1).strip()
    except:
        threats = ''

    # Return the extracted data as a dictionary
    return {
        "Scientific Name": scientific_name,
        "Taxonomy": taxonomy,
        "Red List Category & Criteria": red_list_category,
        "Date Assessed": date_assessed,
        "Year Published": year_published,
        "Current Population Trend": current_population_trend,
        "Systems": systems,
        "Range Description": range_description,
        "Habitat and Ecology": habitat_and_ecology,
        "Threats": threats
    }



In [94]:
extract_data_from_pdf('iucn_reports/215090780')

{'Scientific Name': 'Presbytis femoralis (Martin, 1838)',
 'Taxonomy': 'Animalia Chordata Mammalia Primates Cercopithecidae',
 'Red List Category & Criteria': 'Critically Endangered C2a(i) ver 3.1',
 'Date Assessed': 'February 18, 2022',
 'Year Published': '2022',
 'Current Population Trend': 'Decreasing',
 'Systems': 'Terrestrial',
 'Range Description': 'This species is found in southern Peninsular Malaysia (from extreme south of Pahang State into Johor\nState) and the Republic of Singapore',
 'Habitat and Ecology': '(see Appendix for additional information)\nThis species is found in taller trees of swampy peat forests in Peninsular Malaysia, while in Singapore it is\nfound in primary, secondary, and swamp forests (Lucas et al',
 'Threats': '(see Appendix for additional information)\nDeforestation and habitat conversion continue to be the major threats to this species'}

In [88]:
# ! ls iucn_reports/

In [82]:
# print(text)

In [97]:
extract_data_from_pdf('iucn_reports/210442893')
# extract_data_from_pdf('iucn_reports/21286959 ')
# extract_data_from_pdf('iucn_reports/17971958')

{'Scientific Name': 'Rhynchocyon petersi Bocage, 1880',
 'Taxonomy': 'Animalia Chordata Mammalia Macroscelidea Macroscelididae',
 'Red List Category & Criteria': 'Least Concern ver 3.1',
 'Date Assessed': 'January 31, 2016',
 'Year Published': '2016',
 'Current Population Trend': 'Decreasing',
 'Systems': 'Terrestrial',
 'Range Description': 'An East African endemic ranging in coastal forests from about 3°60’S in south-eastern Kenya to about\n7°40’S in Tanzania, just north of the Rufiji River, and in the Eastern Arc Mountains, where it is reliably\nrecorded from North and South Pare (Stanley et al',
 'Habitat and Ecology': '(see Appendix for additional information)\nRelatively little is known about the biology of the Black and Rufous Sengi because no detailed field\nstudies have been completed',
 'Threats': '(see Appendix for additional information)\nFragmentation and degradation of forested habitats due to urban and agricultural expansion is the\n© The IUCN Red List of Threatened Spec

# Needs some cleaning in the dictionary values before saving as csv :)

# Work From Here..

In [None]:
def process_all_pdfs_in_directory(directory_path, output_csv):
    # List to hold all the extracted data
    all_data = []

    # Iterate through all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory_path, filename)
            print(f"Processing {pdf_path}...")
            data = extract_data_from_pdf(pdf_path)
            all_data.append(data)

    # Create a DataFrame from the extracted data
    df = pd.DataFrame(all_data)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

# Example usage:
# Set the path to the directory containing the PDFs and the output CSV file path
directory_path = 'iucn_reports/'  # Replace with your PDF directory
output_csv = 'iucn_extracted_data.csv'

# Process all PDFs and save the results in a CSV file
process_all_pdfs_in_directory(directory_path, output_csv)
