In [1]:
# !pip install torch torchvision
# !pip install matplotlib pandas scikit-learn
# !pip install pillow
# !pip install opencv-python

# Scrape https://www.ifoundbutterflies.org to build the butterfly dataset

In [1]:
# Root folder of the dataset; scraped images are stored under "<dataset_dir>/data".
dataset_dir = "insect-dataset/butterfly"

# Site to scrape and the path of its paginated species listing.
website_url = "https://www.ifoundbutterflies.org"
initial_path = "/lepidoptera"
# First and last listing page to crawl (both inclusive).
first_page = 0
last_page = 53

# Number of consecutive listing pages handled by one submitted task.
batch_size = 3
# Size of the thread pool that runs the batches.
max_workers = 50
# Timeouts (passed to requests.get) for HTML pages and for image downloads.
page_timeout = 120
image_timeout = 30

# Images whose file name matches this pattern are skipped by the crawler
# (presumably site logos/banners rather than specimen photos — confirm against the site).
ignore_image_regex = r"^(imgs10|.*(boimobileapp|butterfliesofurbangreeneries|webheader|headerlogo|WPA-[IVX]+).*)\.(png|jpg|jpeg)$"
# When True, a species whose folder already exists and is non-empty is not scraped again.
skip_downloaded_species = False

# Suffix appended to the species folder name for images found under an element with id "early".
early_stage_suffix = '-early'

In [2]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor

def log_header():
    """Build the log-line prefix: the current thread name padded to 24 columns inside brackets."""
    thread_name = threading.current_thread().name
    # ljust pads with spaces on the right and never truncates longer names.
    return "[ " + thread_name.ljust(24) + " ]  "

def download_image(img_url, output_dir):
    """Download one image and store it in ``output_dir``.

    The file name is the last ``/``-separated segment of ``img_url``, kept
    verbatim (no URL-decoding) so it stays identical to the name the caller
    derives from the same URL.

    Args:
        img_url: Absolute URL of the image.
        output_dir: Existing directory the image is written into.

    Returns:
        True if the image was fetched with a successful HTTP status and written
        to disk, False otherwise. Errors are logged and never propagate.
    """
    try:
        response = requests.get(img_url, timeout=image_timeout)
        # Without this check the body of a 404/500 error page would be saved
        # under an image file name and only be caught later as "corrupted".
        response.raise_for_status()
        img_name = img_url.split("/")[-1]
        img_path = os.path.join(output_dir, img_name)
        with open(img_path, 'wb') as file:
            file.write(response.content)
        return True
    except Exception as e:
        # Deliberately broad: a single bad image must not abort the page scrape,
        # but the reason is reported instead of being silently dropped.
        print(f"{log_header()}Failed to download {img_url}: {e}")
        return False

def has_parent_with_prop(tag, prop, value, max_parents):
    """Return True if one of the nearest ancestors of ``tag`` has ``prop == value``.

    Args:
        tag: Node to start from. It and its ancestors must expose ``.parent``;
            each ancestor must expose ``.get(prop)``.
        prop: Attribute name looked up on each ancestor (e.g. ``'id'``).
        value: Attribute value that counts as a match.
        max_parents: Maximum number of ancestor levels to inspect. The tag
            itself is never inspected.

    Returns:
        True as soon as a matching ancestor is found; False if none of the
        inspected ancestors match or the top of the tree is reached first.
    """
    parent = tag
    for _ in range(max_parents):
        parent = parent.parent
        if parent is None:
            # Ran out of ancestors before max_parents levels; calling .get on
            # None here used to raise AttributeError.
            return False
        if parent.get(prop) == value:
            return True
    return False
    
def scrape_images(url, output_dir):
    """Download the images of one species page.

    Every ``<img>`` with a ``src`` is considered. Images nested (within 5
    levels) under an element with id ``'laraval'`` are skipped, as are images
    whose file name matches ``ignore_image_regex``. Images nested (within 6
    levels) under an element with id ``'early'`` are stored in
    ``output_dir + early_stage_suffix``; all others go to ``output_dir``.
    Files that already exist on disk are not downloaded again.

    Args:
        url: URL of the species page.
        output_dir: Target directory for this species (created on demand).

    Returns:
        True if the page was fetched and processed (individual image failures
        are only counted), False if fetching or parsing the page failed.
    """
    try:
        print(f"{log_header()}    Scraping URL: {url}")
        response = requests.get(url, timeout=page_timeout)
        # Do not parse an HTTP error page as if it were a species page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        success_cnt = 0
        failure_cnt = 0
        already_downloaded_cnt = 0
        for img in soup.find_all('img'):
            img_url = img.get('src')
            if not img_url:
                # Nothing to download; skip before doing any ancestor walks.
                continue
            # NOTE(review): the id is spelled 'laraval' — presumably this mirrors
            # the site's own markup for larval host plant photos; confirm.
            if has_parent_with_prop(img, 'id', 'laraval', 5):
                continue
            # Early-stage photos are kept in a sibling folder.
            class_suffix = early_stage_suffix if has_parent_with_prop(img, 'id', 'early', 6) else ''
            img_url = urljoin(url, img_url)
            img_name = img_url.split("/")[-1]
            if re.search(ignore_image_regex, img_name):
                continue
            target_dir = output_dir + class_suffix
            if os.path.exists(os.path.join(target_dir, img_name)):
                already_downloaded_cnt += 1
                continue
            # exist_ok avoids a check-then-create race between worker threads.
            os.makedirs(target_dir, exist_ok=True)
            if download_image(img_url, target_dir):
                success_cnt += 1
            else:
                failure_cnt += 1
        # Also report pages where every attempted download failed.
        if success_cnt > 0 or failure_cnt > 0:
            print(f"{log_header()}      Downloaded {success_cnt}(+{already_downloaded_cnt}) / {success_cnt+already_downloaded_cnt+failure_cnt} image(s) in {output_dir}")
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

def crawl(base_url, root, output_dir):
    """Crawl one listing page and scrape every species it links to.

    For each ``<img>`` on the page (except those matching
    ``ignore_image_regex``) the ``href`` of the image's grandparent element is
    taken as the species path, and ``scrape_images`` is called for it with
    ``output_dir + species_path.lower()`` as the target folder.

    Args:
        base_url: Site root, e.g. ``https://host``.
        root: Path (with query string) of the listing page, joined onto ``base_url``.
        output_dir: Directory under which one folder per species is created.

    Returns:
        True if the listing page was fetched and processed, False if an
        exception interrupted it.
    """
    try:
        url = urljoin(base_url, root)
        print(f"{log_header()}Crawling URL: {url}")
        # All batches start at once and share output_dir; exist_ok avoids the
        # check-then-create race that would fail the first page of a batch.
        os.makedirs(output_dir, exist_ok=True)
        response = requests.get(url, timeout=page_timeout)
        # Do not parse an HTTP error page as if it were a listing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        seen_species_dirs = set()
        for img in soup.find_all('img'):
            img_url = img.get('src')
            if not img_url:
                continue
            img_url = urljoin(url, img_url)
            img_name = img_url.split("/")[-1]
            if re.search(ignore_image_regex, img_name):
                continue
            # The species link is expected two levels above the thumbnail.
            link = img.parent.parent if img.parent is not None else None
            species_path = link.get('href') if link is not None else None
            if not species_path:
                # Image is not wrapped in a link: skip it instead of letting an
                # AttributeError abort the rest of the page.
                continue
            # assumes href is a site-relative path such as "/species-name" — TODO confirm
            species_dir = output_dir+species_path.lower()
            if species_dir in seen_species_dirs:
                # Several thumbnails can point at the same species page.
                continue
            seen_species_dirs.add(species_dir)
            if skip_downloaded_species and os.path.isdir(species_dir) and os.listdir(species_dir):
                continue
            scrape_images(urljoin(base_url, species_path), species_dir)
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

def crawl_in_batch(batch_start):
    """Crawl ``batch_size`` consecutive listing pages starting at ``batch_start``.

    Pages beyond ``last_page`` are not requested.

    Args:
        batch_start: Index of the first listing page of this batch.

    Returns:
        True if ``crawl`` reported success for every page of the batch
        (also True for an empty batch), False otherwise.
    """
    print(f"Starting batch from page {batch_start} on thread {threading.current_thread().name}")
    batch_end = min(batch_start+batch_size, last_page+1)
    # Collect all results first so a failed page never prevents later pages
    # of the batch from being crawled.
    results = [
        crawl(website_url, f"{initial_path}?page={page}", f"{dataset_dir}/data")
        for page in range(batch_start, batch_end)
    ]
    return all(results)

In [7]:
# Submit one task per batch of listing pages, then report each task's result
# in submission order; leaving the `with` block waits for all workers.
with ThreadPoolExecutor(max_workers=max_workers) as pool:
    pending = []
    for start_page in range(first_page, last_page+1, batch_size):
        pending.append(pool.submit(crawl_in_batch, start_page))
    for task in pending:
        outcome = task.result()
        print(f"Thread completed with result {outcome}")

print("Scraping completed!")

Starting batch from page 0 on thread ThreadPoolExecutor-1_0
[ ThreadPoolExecutor-1_0   ]  Crawling URL: https://www.ifoundbutterflies.org/lepidoptera?page=0
Starting batch from page 3 on thread ThreadPoolExecutor-1_1
[ ThreadPoolExecutor-1_1   ]  Crawling URL: https://www.ifoundbutterflies.org/lepidoptera?page=3
Starting batch from page 6 on thread ThreadPoolExecutor-1_2
[ ThreadPoolExecutor-1_2   ]  Crawling URL: https://www.ifoundbutterflies.org/lepidoptera?page=6
Starting batch from page 9 on thread ThreadPoolExecutor-1_3
[ ThreadPoolExecutor-1_3   ]  Crawling URL: https://www.ifoundbutterflies.org/lepidoptera?page=9
Starting batch from page 12 on thread ThreadPoolExecutor-1_4
[ ThreadPoolExecutor-1_4   ]  Crawling URL: https://www.ifoundbutterflies.org/lepidoptera?page=12
Starting batch from page 15 on thread ThreadPoolExecutor-1_5
[ ThreadPoolExecutor-1_5   ]  Crawling URL: https://www.ifoundbutterflies.org/lepidoptera?page=15
Starting batch from page 18 on thread ThreadPoolExecut

# Remove corrupted images & empty folders

In [8]:
import os
from PIL import Image
from pathlib import Path

def check_image(file_path):
    """Tell whether Pillow can open and verify the file at ``file_path``.

    Returns True when ``Image.open`` and ``verify()`` both succeed, False when
    either raises OSError (IOError) or SyntaxError.
    """
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except (OSError, SyntaxError):  # IOError is an alias of OSError
        return False
    else:
        return True

# Walk every species folder, delete files that fail the image check,
# then remove the folder itself if nothing is left in it.
for species_dir in Path(f"{dataset_dir}/data").iterdir():
    if not species_dir.is_dir():
        continue
    for file in species_dir.iterdir():
        if not file.is_file() or check_image(file):
            continue
        os.remove(file)
        print(f"Corrupted file {file} removed")
    if len(os.listdir(species_dir)) == 0:
        os.rmdir(species_dir)
        print(f"Empty folder {species_dir} removed")

Corrupted file insect-dataset\butterfly\data\acraea-terpsicore\th_a-28805.jpg removed
Corrupted file insect-dataset\butterfly\data\acraea-terpsicore\th_a-28806.jpg removed
Corrupted file insect-dataset\butterfly\data\acraea-terpsicore\th_a-28807.jpg removed
Corrupted file insect-dataset\butterfly\data\acraea-terpsicore-early\media_images_23 removed
Corrupted file insect-dataset\butterfly\data\araotes-lapithis\AraotesLapithis_SanjaySondhi_ag777%202.jpg removed
Corrupted file insect-dataset\butterfly\data\argynnis-childreni\Argynnis%20childreni%20sakontala_1719655759_272600.jpg removed
Corrupted file insect-dataset\butterfly\data\argynnis-childreni\Argynnis%20childreni%20sakontala_1719655759_272601.jpg removed
Corrupted file insect-dataset\butterfly\data\argynnis-childreni\Argynnis%20childreni%20sakontala_1719655760_272602.jpg removed
Corrupted file insect-dataset\butterfly\data\arhopala-abseus\Arhopala%20abseus_MilindBhakare_am018.jpg removed
Corrupted file insect-dataset\butterfly\data

# ZIP the data

In [9]:
import shutil
import time
import datetime

# Zip "<dataset_dir>/data" into a date-stamped archive next to it.
# The date is formatted outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
archive_date = datetime.datetime.now().strftime("%Y.%m.%d")
shutil.make_archive(f"{dataset_dir}/ifoundbutterflies.org.{archive_date}", 'zip', f"{dataset_dir}/data")

'D:\\Projects\\my-jupyter-notebook\\insect-species-identification\\insect-dataset\\butterfly\\ifoundbutterflies.org.2025.02.06.zip'

# Fetch species names

In [3]:
# Map each class folder under "<dataset_dir>/data" to the number of entries it contains.
# Uses dataset_dir from the configuration cell instead of repeating the path,
# and skips stray non-directory entries, which os.listdir would fail on.
data_root = os.path.join(dataset_dir, "data")
classes = {
    class_dir: len(os.listdir(os.path.join(data_root, class_dir)))
    for class_dir in sorted(os.listdir(data_root))
    if os.path.isdir(os.path.join(data_root, class_dir))
}

In [69]:
# Maps a class (folder) name to a dict of details; the loop in the next cell
# fills in {"name": <species common name>}. Re-running this cell discards
# everything collected so far.
class_details = {}

In [70]:
# Look up the common name of every non-early-stage class on its species page.
# The page title is expected to look like "<scientific name> - <common name>";
# classes that already have a name are skipped, so the cell can be re-run to
# retry only the failures.
for class_name in classes:
    if class_name.endswith(early_stage_suffix):
        # Early-stage folders share the species page of their base class.
        continue
    if "name" in class_details.get(class_name, {}):
        continue
    try:
        print()
        print(f"class_name: {class_name}")
        url = f"{website_url}/{class_name}"
        response = requests.get(url, timeout=page_timeout)
        # Do not try to parse an HTTP error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find(lambda tag: tag.find("span", class_="spices-title") is not None, class_="field field--name-title field--type-string field--label-hidden")
        if title_tag is None:
            print("ERROR: title element not found")
            continue
        title = title_tag.get_text(strip=True)
        print(f"title: {title}")
        title_parts = re.split(r"\s+-\s+", title)
        if len(title_parts) < 2:
            # No " - " separator, hence no common name in the title.
            continue
        species_name = title_parts[-1]
        print(f"species_name: {species_name}")
        class_details[class_name] = {"name": species_name}
    except Exception as e:
        # Best effort: keep going with the next class, but say what went wrong.
        print(f"ERROR: {type(e).__name__}: {e}")


class_name: abisara-attenuata
title: Abisara attenuataTytler, 1915  - Attenuated Judy
species_name: Attenuated Judy

class_name: abisara-bifasciata
title: Abisara bifasciataMoore, 1877  - Double-banded Judy
species_name: Double-banded Judy

class_name: abisara-burnii
title: Abisara burnii(de Nicéville, 1895)  - White-spotted Judy
species_name: White-spotted Judy

class_name: abisara-chela
title: Abisara chelade Nicéville, 1886  - Spot Judy
species_name: Spot Judy

class_name: abisara-echerius
title: Abisara echerius(Stoll, [1790])  - Plum Judy
species_name: Plum Judy

class_name: abisara-fylla
title: Abisara fylla(Westwood, [1851])  - Dark Judy
species_name: Dark Judy

class_name: abisara-neophron
title: Abisara neophron(Hewitson, 1861)  - Tailed Judy
species_name: Tailed Judy

class_name: abrota-ganga
title: Abrota gangaMoore, [1858]  - Sergeant-major
species_name: Sergeant-major

class_name: acraea-issoria
title: Acraea issoria(Hübner, [1819])  - Yellow Coster
species_name: Yellow C

In [72]:
import json

# Persist the collected class details as pretty-printed JSON.
class_details_json = json.dumps(class_details, indent=4)
with open("insect-dataset/class_details.ifoundbutterflies.org.json", "w", encoding="utf-8") as out:
    out.write(class_details_json)