# PhyloPic Image Download

## Variables:

In [None]:
# Folder that steps 4-6 (crop, background removal, kingdom sorting) read
# their images from; replace the placeholder before running those cells.
MYFOLDERPATH = "/your/download/path/here"

## 1. Download all images.

In [None]:
import requests
import os

# Root of the PhyloPic API; endpoint paths are appended to it below.
base_url = "https://api.phylopic.org"

# Sent with the API requests to ask for the v2 PhyloPic JSON media type.
headers = {"Accept": "application/vnd.phylopic.v2+json"}

# Local folder that receives the downloads; reused if it already exists.
os.makedirs("phylo_images", exist_ok=True)

def get_current_build():
    """Return the ``build`` value reported by the PhyloPic API root.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        KeyError: if the decoded response has no ``build`` entry.
    """
    root = requests.get(base_url, headers=headers)
    root.raise_for_status()
    return root.json()['build']

def get_all_images(build):
    """Collect the metadata of every image listed by the PhyloPic API.

    Requests ``/images`` page by page (starting at page 0) with
    ``embed_items=true`` and gathers the entries found under
    ``_embedded.items``. Paging stops when a page carries no embedded items
    or no ``next`` link.

    Args:
        build: Build number sent as the ``build`` query parameter.

    Returns:
        A list with the embedded item objects of all pages, in page order.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    images = []
    page = 0
    while True:
        response = requests.get(
            f"{base_url}/images",
            headers=headers,
            params={"page": page, "build": build, "embed_items": "true"},
        )
        response.raise_for_status()
        data = response.json()

        # `or {}` also covers keys that are present but null, which would
        # otherwise break the chained lookup.
        items = (data.get('_embedded') or {}).get('items')
        if items is None:
            break
        images.extend(items)

        # Short progress line instead of dumping the whole JSON payload of
        # every page into the notebook output.
        print(f"Page {page}: {len(items)} items ({len(images)} total)")

        if not (data.get('_links') or {}).get('next'):
            break
        page += 1
    return images

def download_image(url, filename):
    """Fetch *url* and store the response body as the binary file *filename*.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    reply = requests.get(url)
    reply.raise_for_status()
    with open(filename, mode='wb') as target:
        target.write(reply.content)

# Download every image listed for the current build.
#
# Each image is handled in its own try/except so that one bad entry (HTTP
# error, unwritable file, missing field) is reported and skipped instead of
# aborting the rest of the batch. The outer handler still catches failures
# of the two set-up calls.
try:
    current_build = get_current_build()
    print(f"Current build number: {current_build}")

    # Get all image metadata
    images = get_all_images(current_build)
    print(f"Retrieved {len(images)} images.")

    failed = 0
    for image in images:
        try:
            image_id = image['uuid']
            image_link = image['_links'].get('http://ogp.me/ns#image')
            if not image_link:
                print(f"No image link found for UUID: {image_id}")
                continue

            image_url = image_link['href']
            filename = f"phylo_images/{image_id}.png"
            download_image(image_url, filename)
            print(f"Downloaded {filename}")
        except (requests.RequestException, OSError, KeyError) as e:
            failed += 1
            print(f"Skipping image {image.get('uuid', '<unknown uuid>')}: {e}")

    if failed:
        print(f"Finished with {failed} failed download(s).")
    else:
        print("All images have been downloaded.")
except Exception as e:
    print(f"An error occurred: {e}")


## 2. Create CSV of metadata.

In [None]:
# NOTE: This cell is partially broken, although it still retrieves metadata for a large share of the images. Fixes are welcome.

import os
import requests
import csv

def log_error(message):
    """Append *message* as one line to ``errors.txt`` in the working directory."""
    with open("errors.txt", mode="a") as sink:
        sink.write("".join((message, "\n")))

def get_taxonomic_info(canonical_name):
    """Look up the taxonomic ranks of *canonical_name* via the GBIF match API.

    Args:
        canonical_name: Scientific name to match. An empty value is answered
            locally without contacting GBIF.

    Returns:
        A dict with the keys ``Kingdom``, ``Phylum``, ``Class``, ``Order``,
        ``Family``, ``Genus``, ``Species`` and ``Common Name``. The values
        come from the GBIF response when its ``matchType`` is ``EXACT``;
        otherwise (no exact match, empty name, request or decoding failure)
        every value is an empty string. Failures are recorded through
        ``log_error``.
    """
    # Output column -> field name read from the GBIF response.
    # NOTE(review): confirm that the match endpoint actually returns
    # "vernacularName"; if it does not, "Common Name" is always empty.
    gbif_fields = {
        "Kingdom": "kingdom",
        "Phylum": "phylum",
        "Class": "class",
        "Order": "order",
        "Family": "family",
        "Genus": "genus",
        "Species": "species",
        "Common Name": "vernacularName",
    }
    empty = dict.fromkeys(gbif_fields, "")

    # Nothing to match: skip the round trip.
    if not canonical_name:
        return empty

    try:
        # Let requests build the query string so that spaces, '&', '#' and
        # similar characters in the name are percent-encoded.
        response = requests.get(
            "https://api.gbif.org/v1/species/match",
            params={"name": canonical_name},
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an undecodable body on requests versions whose
        # JSON error is not a RequestException.
        log_error(f"Error retrieving taxonomic info for {canonical_name}: {e}")
        return empty

    if data.get("matchType") != "EXACT":
        return empty
    return {column: data.get(field, "") for column, field in gbif_fields.items()}

def fetch_phylopic_data(uuid, build=None):
    """Fetch the PhyloPic API record of the image identified by *uuid*.

    Args:
        uuid: Image UUID, used as the last path segment of
            ``/images/{uuid}``.
        build: Optional build number to send as the ``build`` query
            parameter. When omitted (the default) no build is sent, which
            is also how ``fetch_image_title`` in this notebook requests the
            same endpoint. A fixed number baked into the URL has to be
            edited by hand whenever the API's current build changes.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body cannot be decoded; the failure is recorded through
        ``log_error``.
    """
    url = f"https://api.phylopic.org/images/{uuid}"
    params = {} if build is None else {"build": build}
    try:
        response = requests.get(
            url,
            headers={"Accept": "application/vnd.phylopic.v2+json"},
            params=params,
        )
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        log_error(f"HTTP error for UUID {uuid}: {e}")
        return None

def create_csv_from_phylopic(root_directory, output_csv):
    """Write one CSV row of PhyloPic and GBIF metadata per PNG under a folder.

    Walks *root_directory* recursively. Every file whose name ends in
    ``.png`` (case-insensitive) is assumed to be named after its PhyloPic
    UUID: the UUID is looked up with ``fetch_phylopic_data`` and the title
    of its specific node is handed to ``get_taxonomic_info``. Files whose
    lookup returns nothing are reported on stdout and left out of the CSV.

    Args:
        root_directory: Folder to scan.
        output_csv: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    # Columns filled from the taxonomy lookup, in header order.
    taxon_columns = [
        'Common Name', 'Kingdom', 'Phylum', 'Class',
        'Order', 'Family', 'Genus', 'Species',
    ]
    columns = [
        'Filename', 'UUID', 'FolderName', 'Path', 'Name', 'Canonical Name',
        *taxon_columns,
        'Rights Holder', 'License', 'Creator',
    ]

    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()

        for dirpath, _subdirs, filenames in os.walk(root_directory):
            folder_name = os.path.basename(dirpath)

            for filename in filenames:
                if not filename.lower().endswith(".png"):
                    continue

                relative_path = os.path.relpath(
                    os.path.join(dirpath, filename), root_directory
                )
                # The file stem doubles as the PhyloPic UUID.
                uuid = os.path.splitext(filename)[0]

                record = fetch_phylopic_data(uuid)
                if not record:
                    print(f"Failed to process UUID: {uuid}")
                    continue

                links = record.get('_links', {})
                canonical_name = links.get('specificNode', {}).get('title', "")
                attribution = record.get('attribution', "")
                rights_holder = links.get('contributor', {}).get('title', "")
                license_info = links.get('license', {}).get('href', "")

                taxonomy = get_taxonomic_info(canonical_name)

                # 'Name' and 'Creator' both carry the attribution string.
                row = {
                    'Filename': filename,
                    'UUID': uuid,
                    'FolderName': folder_name,
                    'Path': relative_path,
                    'Name': attribution,
                    'Canonical Name': canonical_name,
                    'Rights Holder': rights_holder,
                    'License': license_info,
                    'Creator': attribution,
                }
                row.update({column: taxonomy[column] for column in taxon_columns})
                writer.writerow(row)

                print(f"Processed {relative_path}, UUID: {uuid}")

    print("CSV creation completed.")

# Example run: scan phylo_images and write the result to phylopic_data.csv
create_csv_from_phylopic(root_directory="phylo_images", output_csv="phylopic_data.csv")


## 3. Rename Images

In [None]:
import os
import re
import requests

# PhyloPic API root used to build the image lookup URLs
base_url = "https://api.phylopic.org"

# Ask for the v2 PhyloPic JSON media type
headers = {"Accept": "application/vnd.phylopic.v2+json"}

# Folder whose PNG files get renamed
image_directory = 'phylo_images'

def sanitize_filename(name):
    """Return *name* made filename-friendly.

    Every character other than an ASCII letter, a digit, whitespace or an
    underscore becomes ``_``; leading and trailing whitespace is then
    removed.
    """
    replaced = re.sub(r'[^a-zA-Z0-9\s_]', '_', name)
    return replaced.strip()

def fetch_image_title(uuid):
    """Return a title for the PhyloPic image *uuid*, or ``None``.

    The image record is requested from ``/images/{uuid}`` and the first
    non-empty title found in this order is returned:

    1. ``specificNode.title`` at the top level of the response,
    2. ``self.title`` at the top level of the response,
    3. ``_links.specificNode.title``,
    4. the title of the first entry of ``_links.nodes``.

    Steps 1 and 2 are kept for backward compatibility. Step 3 reads the
    specific node from ``_links``, the same place the CSV cell of this
    notebook reads it from, so the preferred title is found there too.

    Args:
        uuid: Image UUID used as the last path segment of the request URL.

    Returns:
        The title string, or ``None`` when the request fails or no title is
        present. Both cases are reported on stdout.
    """
    try:
        response = requests.get(f"{base_url}/images/{uuid}", headers=headers)
        response.raise_for_status()
        data = response.json()

        # `or` fallbacks keep a missing or null `_links` / `nodes` from
        # raising; the placeholder entry simply carries no title.
        links = data.get('_links') or {}
        nodes = links.get('nodes') or [{}]
        candidates = (
            data.get('specificNode'),
            data.get('self'),
            links.get('specificNode'),
            nodes[0],
        )
        for candidate in candidates:
            if isinstance(candidate, dict) and candidate.get('title'):
                return candidate['title']

        print(f"Warning: No title found for UUID {uuid}. Full response: {data}")
        return None
    except Exception as e:
        print(f"Failed to fetch title for UUID {uuid}: {e}")
        return None

def generate_unique_filename(directory, filename_base, extension=".png"):
    """Return a file name based on *filename_base* that is unused in *directory*.

    ``<filename_base><extension>`` is tried first; while the candidate
    already exists, ``_1``, ``_2``, ... is inserted before the extension.
    Only the bare file name is returned, not the joined path.
    """
    candidate = f"{filename_base}{extension}"
    suffix = 0
    while os.path.exists(os.path.join(directory, candidate)):
        suffix += 1
        candidate = f"{filename_base}_{suffix}{extension}"
    return candidate

def rename_images():
    """Rename each ``.png`` in ``image_directory`` after its PhyloPic title.

    The file stem is treated as the image UUID and resolved with
    ``fetch_image_title``; the title is cleaned by ``sanitize_filename`` and
    made collision-free by ``generate_unique_filename``. Files without a
    title are left untouched and reported on stdout.

    A file that cannot be renamed is reported and skipped, so the remaining
    files are still processed. Any other unexpected error is caught and
    printed; the function itself does not raise.
    """
    try:
        for filename in os.listdir(image_directory):
            if not filename.endswith('.png'):
                continue

            # splitext drops only the trailing extension; str.replace would
            # also remove a '.png' occurring elsewhere in the name.
            uuid = os.path.splitext(filename)[0]
            title = fetch_image_title(uuid)
            if not title:
                print(f"No title found for UUID: {uuid}, image not renamed.")
                continue

            sanitized_title = sanitize_filename(title)
            unique_filename = generate_unique_filename(image_directory, sanitized_title)
            old_filename = os.path.join(image_directory, filename)
            new_filename = os.path.join(image_directory, unique_filename)

            try:
                os.rename(old_filename, new_filename)
            except OSError as e:
                print(f"Could not rename {old_filename} to {new_filename}: {e}")
                continue
            print(f"Renamed {old_filename} to {new_filename}")

        print("All files have been processed.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the renaming process over the folder named by image_directory
rename_images()


## 4. Cleanup: Image Crop

In [None]:
import os
from PIL import Image

# Number of pixels to cut off the left edge of each image
crop_width = 515

# Folder whose images get cropped
image_directory = MYFOLDERPATH

def crop_images(directory, crop_width):
    """Cut *crop_width* pixels off the left side of every image in *directory*.

    Only files whose names end in one of the listed image extensions are
    opened. An image wider than *crop_width* is cropped and saved over the
    original file; narrower or equal-width images are skipped with a
    message. Any exception stops the loop and is printed, not raised.

    Args:
        directory: Folder to scan (not recursive).
        crop_width: Width of the strip removed from the left edge.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    try:
        for filename in os.listdir(directory):
            if not filename.endswith(image_suffixes):
                continue

            file_path = os.path.join(directory, filename)
            with Image.open(file_path) as img:
                width, height = img.size

                if width <= crop_width:
                    print(f"Skipping {filename}, width is less than or equal to {crop_width} pixels.")
                    continue

                # Crop box is (left, upper, right, lower); the result
                # replaces the original file.
                img.crop((crop_width, 0, width, height)).save(file_path)
                print(f"Cropped and saved: {filename}")

        print("All images have been processed.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Crop every image in the configured folder
crop_images(directory=image_directory, crop_width=crop_width)


## 5. Cleanup: Remove background & crop.

In [None]:
import os
from PIL import Image, ImageChops

# RGBA colour treated as background (a light green); pixels whose RGB part
# matches it are made transparent
background_color = (247, 255, 251, 255)

# Folder whose PNG files are cleaned up
image_directory = MYFOLDERPATH

def remove_background_and_trim(directory):
    """Make the background of every PNG in *directory* transparent, then trim it.

    Each ``.png`` file is converted to RGBA. Pixels whose RGB value equals
    the RGB part of the module-level ``background_color`` are replaced by
    fully transparent white, the result is passed through ``trim_image``
    and saved over the original file. Any exception stops the loop and is
    printed, not raised.

    Args:
        directory: Folder to scan (not recursive).
    """
    transparent = (255, 255, 255, 0)
    try:
        for filename in os.listdir(directory):
            if not filename.endswith('.png'):
                continue

            file_path = os.path.join(directory, filename)
            with Image.open(file_path) as img:
                img = img.convert("RGBA")

                # Compare on RGB only, so the alpha of a pixel is ignored.
                backdrop_rgb = background_color[:3]
                img.putdata([
                    transparent if pixel[:3] == backdrop_rgb else pixel
                    for pixel in img.getdata()
                ])

                # Drop the now-transparent border and overwrite the file.
                img = trim_image(img)
                img.save(file_path)
                print(f"Processed and trimmed: {filename}")

        print("All images have been processed.")
    except Exception as e:
        print(f"An error occurred: {e}")

def trim_image(img):
    """Crop *img* to the bounding box of what differs from transparent white.

    The image is compared against a same-sized canvas filled with
    ``(255, 255, 255, 0)``. When the difference has a bounding box the image
    is cropped to it; otherwise *img* is returned unchanged.
    """
    reference = Image.new(img.mode, img.size, (255, 255, 255, 0))
    bbox = ImageChops.difference(img, reference).getbbox()
    return img.crop(bbox) if bbox else img

# Clean up every PNG in the configured folder
remove_background_and_trim(directory=image_directory)


## 6. Move into kingdom folders.

In [None]:
import os
import re
import requests

def get_classification(species_name):
    """Return the kingdom GBIF lists for *species_name*, or ``None``.

    Queries the GBIF ``/v1/species`` endpoint with the name and reads
    ``kingdom`` from the first entry of ``results``.

    Args:
        species_name: Name to look up. A missing or blank name is answered
            with ``None`` without contacting GBIF, because an empty ``name``
            filter would not identify any particular species.

    Returns:
        The kingdom string of the first result, or ``None`` when there is no
        result, the result has no kingdom, or the request or decoding fails
        (the failure is printed).
    """
    if not species_name or not species_name.strip():
        return None

    try:
        # Let requests build the query string so that spaces and reserved
        # characters in the name are percent-encoded.
        response = requests.get(
            "https://api.gbif.org/v1/species",
            params={"name": species_name},
        )
        response.raise_for_status()  # Raises an HTTPError for bad responses
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error retrieving data for {species_name}: {e}")
        return None

    # .get() instead of indexing: a payload without 'results' means
    # "no classification", not a crash.
    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        return None
    return results[0].get('kingdom')

def categorize_images(image_folder):
    """Move each PNG in *image_folder* into a subfolder named after its kingdom.

    The species name is derived from the file name: the text before
    ``.png``, minus a trailing ``_<digits>`` suffix, with underscores turned
    into spaces. The kingdom comes from ``get_classification``; files
    without one are skipped with a message. A failing folder creation or
    move is printed and the loop continues.

    Args:
        image_folder: Folder to scan (not recursive).
    """
    for image_file in os.listdir(image_folder):
        if not image_file.endswith(".png"):
            continue

        stem = image_file.split(".png")[0]
        species_name = re.sub(r'_\d+$', '', stem).replace("_", " ")

        kingdom = get_classification(species_name)
        if not kingdom:
            print(f"Skipping {image_file}: No classification found.")
            continue

        try:
            category_folder = os.path.join(image_folder, kingdom)
            if not os.path.exists(category_folder):
                os.makedirs(category_folder)

            source = os.path.join(image_folder, image_file)
            destination = os.path.join(category_folder, image_file)
            os.rename(source, destination)
            print(f"Moved {image_file} to {kingdom} folder.")
        except OSError as e:
            print(f"Error moving file {image_file}: {e}")

# Sort the images of the configured folder into per-kingdom subfolders
image_folder = MYFOLDERPATH
categorize_images(image_folder=image_folder)
