In [1]:
# !pip install torch torchvision
# !pip install matplotlib pandas scikit-learn
# !pip install pillow
# !pip install opencv-python

In [1]:
import torch
# Report whether PyTorch can see a CUDA device (the recorded run printed True).
torch.cuda.is_available()

True

In [2]:
# Root folder of the moth dataset; scraped images go under f"{dataset_dir}/data" and the
# ZIP archive is written next to that folder.
dataset_dir = "insect-dataset/moth"

# Scrape https://www.mothsofindia.org to build the moth dataset

In [3]:
# --- Site and page range ---------------------------------------------------
website_url = "https://www.mothsofindia.org"
# Listing path; crawl_in_batch appends "?page=<n>" to it.
initial_path = "/lepidoptera"
# Listing pages first_page..last_page are crawled, both ends inclusive.
first_page = 0
last_page = 145

# --- Concurrency and timeouts ----------------------------------------------
# Number of consecutive listing pages handled by one crawl_in_batch call.
batch_size = 3
# Size of the ThreadPoolExecutor that runs the batches.
max_workers = 50
# `timeout` passed to requests.get for HTML pages and for image downloads, respectively.
page_timeout = 120
image_timeout = 30

# --- Filtering --------------------------------------------------------------
# Image file names matching this pattern are skipped by crawl() and scrape_images().
ignore_image_regex = r"^(imgs10|.*(boimobileapp|butterfliesofurbangreeneries|webheader|headerlogo|WPA-[IVX]+).*)\.(png|jpg|jpeg)$"
# When True, crawl() does not re-scrape a species whose folder already exists and is non-empty.
skip_downloaded_species = False

# Appended to the species folder name for images found under an element with id "early".
early_stage_suffix = '-early'

In [5]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor

def log_header():
    """Return the log-line prefix: the current thread's name padded to 24 columns inside brackets."""
    thread_name = threading.current_thread().name
    return "[ {:24} ]  ".format(thread_name)

def download_image(img_url, output_dir):
    """Download one image into ``output_dir``.

    The file is named after the last ``/``-separated segment of ``img_url``
    (kept verbatim, so URL escapes such as ``%20`` stay in the file name).

    Args:
        img_url: Absolute URL of the image.
        output_dir: Existing directory to write the file into.

    Returns:
        True if the image was fetched with a successful HTTP status and written
        to disk, False on any failure (network error, HTTP error status,
        file-system error).
    """
    try:
        response = requests.get(img_url, timeout=image_timeout)
        # Reject 4xx/5xx responses: otherwise the body of an error page would be
        # saved under an image file name and only be caught later as "corrupted".
        response.raise_for_status()
        img_name = img_url.split("/")[-1]
        img_path = os.path.join(output_dir, img_name)
        with open(img_path, 'wb') as file:
            file.write(response.content)
        return True
    except Exception:
        # Deliberately best-effort: the caller counts failures and moves on.
        return False

def has_parent_with_prop(tag, prop, value, max_parents):
    """Tell whether one of the nearest ancestors of ``tag`` has ``prop == value``.

    Only ancestors are inspected, never ``tag`` itself.

    Args:
        tag: Node exposing ``.parent`` and ``.get(name)``.
        prop: Attribute name to look up on each ancestor (e.g. ``'id'``).
        value: Attribute value that counts as a match.
        max_parents: Maximum number of ancestor levels to climb.

    Returns:
        True if a matching ancestor is found within ``max_parents`` levels,
        False otherwise, including when the tree is shallower than ``max_parents``.
    """
    parent = tag
    for _ in range(max_parents):
        parent = parent.parent
        if parent is None:
            # Ran off the top of the tree; calling .get on None would raise AttributeError.
            return False
        if parent.get(prop) == value:
            return True
    return False
    
def scrape_images(url, output_dir):
    """Download every relevant ``<img>`` of one species page.

    Images nested (within 5 levels) under an element with id ``'laraval'`` are
    skipped as larval-host-plant photos. Images nested (within 6 levels) under an
    element with id ``'early'`` are stored in ``output_dir + early_stage_suffix``;
    all others go to ``output_dir``. File names matching ``ignore_image_regex`` and
    files that already exist on disk are not downloaded again.

    Args:
        url: URL of the species page.
        output_dir: Target directory for the species (created on demand).

    Returns:
        True if the page was fetched and processed, False if an exception
        occurred (the exception is printed with the thread log header).
    """
    try:
        print(f"{log_header()}    Scraping URL: {url}")
        response = requests.get(url, timeout=page_timeout)
        # Do not parse an HTTP error page as if it were the species page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        success_cnt = 0
        failure_cnt = 0
        already_downloaded_cnt = 0
        for img in soup.find_all('img'):
            img_url = img.get('src')
            if not img_url:
                continue
            # NOTE(review): 'laraval' is the id spelling this code has always matched;
            # presumably it mirrors the site's markup -- confirm before "fixing" it.
            if has_parent_with_prop(img, 'id', 'laraval', 5):
                # larval host plants photo
                continue
            class_suffix = ''
            if has_parent_with_prop(img, 'id', 'early', 6):
                # early stages
                class_suffix = early_stage_suffix
            img_url = urljoin(url, img_url)
            img_name = img_url.split("/")[-1]
            if re.search(ignore_image_regex, img_name):
                continue
            target_dir = output_dir + class_suffix
            if os.path.exists(os.path.join(target_dir, img_name)):
                already_downloaded_cnt += 1
                continue
            # exist_ok avoids the check-then-create race between pool threads, which
            # used to raise FileExistsError and abort the rest of the page.
            os.makedirs(target_dir, exist_ok=True)
            if download_image(img_url, target_dir):
                success_cnt += 1
            else:
                failure_cnt += 1
        if success_cnt > 0:
            print(f"{log_header()}      Downloaded {success_cnt}(+{already_downloaded_cnt}) / {success_cnt+already_downloaded_cnt+failure_cnt} image(s) in {output_dir}")
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

def crawl(base_url, root, output_dir):
    """Crawl one listing page and scrape every species it links to.

    For each ``<img>`` on the listing page whose file name does not match
    ``ignore_image_regex``, the ``href`` of the image's grandparent element is
    taken as the species path. The species page is then scraped into
    ``output_dir + species_path.lower()``, unless ``skip_downloaded_species`` is
    set and that directory already exists with content.

    Args:
        base_url: Site root used to resolve ``root`` and the species paths.
        root: Path (with query string) of the listing page.
        output_dir: Directory under which the species folders are created.

    Returns:
        True if the listing page was fetched and processed, False if an
        exception occurred (the exception is printed with the thread log header).
    """
    try:
        url = urljoin(base_url, root)
        print(f"{log_header()}Crawling URL: {url}")
        # All pool threads share this directory; exist_ok avoids the
        # check-then-create race that raised FileExistsError on a fresh run.
        os.makedirs(output_dir, exist_ok=True)
        response = requests.get(url, timeout=page_timeout)
        # Do not parse an HTTP error page as if it were the listing page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for img in soup.find_all('img'):
            img_url = img.get('src')
            if not img_url:
                continue
            img_url = urljoin(url, img_url)
            img_name = img_url.split("/")[-1]
            if re.search(ignore_image_regex, img_name):
                continue
            link = img.parent.parent if img.parent is not None else None
            species_path = link.get('href') if link is not None else None
            if not species_path:
                # The image is not wrapped in a species link; skip it rather than
                # letting an AttributeError abort the remaining species on the page.
                continue
            species_dir = output_dir + species_path.lower()
            already_downloaded = (
                skip_downloaded_species
                and os.path.isdir(species_dir)
                and os.listdir(species_dir)
            )
            if not already_downloaded:
                scrape_images(urljoin(base_url, species_path), species_dir)
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

def crawl_in_batch(batch_start):
    """Crawl up to ``batch_size`` consecutive listing pages starting at ``batch_start``.

    The batch never goes past ``last_page``. Nothing is returned.
    """
    worker_name = threading.current_thread().name
    print(f"Starting batch from page {batch_start} on thread {worker_name}")
    batch_end = min(batch_start + batch_size, last_page + 1)
    for page_no in range(batch_start, batch_end):
        listing_path = f"{initial_path}?page={page_no}"
        crawl(website_url, listing_path, f"{dataset_dir}/data")

In [9]:
# Split the inclusive page range [first_page, last_page] into batches of `batch_size`
# pages and submit one crawl_in_batch task per batch to the thread pool.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(crawl_in_batch, batch_start) for batch_start in range(first_page, last_page+1, batch_size)]
    for future in futures:
        # result() waits for that batch; batches are reported in submission order.
        # crawl_in_batch has no return statement, so the printed result is None.
        print(f"Thread completed with result {future.result()}")

print("Scraping completed!")

Starting batch from page 0 on thread ThreadPoolExecutor-1_0
[ ThreadPoolExecutor-1_0   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=0
Starting batch from page 3 on thread ThreadPoolExecutor-1_1
[ ThreadPoolExecutor-1_1   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=3
Starting batch from page 6 on thread ThreadPoolExecutor-1_2
[ ThreadPoolExecutor-1_2   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=6
Starting batch from page 9 on thread ThreadPoolExecutor-1_3
[ ThreadPoolExecutor-1_3   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=9
Starting batch from page 12 on thread ThreadPoolExecutor-1_4
[ ThreadPoolExecutor-1_4   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=12
Starting batch from page 15 on thread ThreadPoolExecutor-1_5
[ ThreadPoolExecutor-1_5   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=15
Starting batch from page 18 on thread ThreadPoolExecutor-1_6
[ ThreadPoolExecutor-1_

# Remove corrupted images & empty folders

In [10]:
import os
from PIL import Image
from pathlib import Path

def check_image(file_path):
    """Return True if Pillow can open and verify ``file_path``, False if that raises IOError or SyntaxError."""
    is_valid = True
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except (IOError, SyntaxError):
        is_valid = False
    return is_valid

# Delete every file that fails check_image, then delete species folders left empty.
for species_dir in Path(f"{dataset_dir}/data").iterdir():
    if not species_dir.is_dir():
        # A stray file in the data folder used to reach os.listdir() below and
        # raise NotADirectoryError; leave such entries untouched.
        continue
    for file in species_dir.iterdir():
        if file.is_file() and not check_image(file):
            os.remove(file)
            print(f"Corrupted file {file} removed")
    if not os.listdir(species_dir):
        os.rmdir(species_dir)
        print(f"Empty folder {species_dir} removed")

Corrupted file insect-dataset\moth\data\asota-caricae\Asota%20caricae_1693314811_268043.jpg removed
Corrupted file insect-dataset\moth\data\asota-caricae\Asota%20caricae_1693314812_268044.jpg removed
Corrupted file insect-dataset\moth\data\calonola-argyria\Calonola%20argyria_1700264414_276888.jpg removed
Corrupted file insect-dataset\moth\data\chrysodeixis-spp\Chrysodeixis%20spp._1712961436_290366.jpg removed
Corrupted file insect-dataset\moth\data\endotricha-spp\Endotricha%20spp._1714768313_291891.jpg removed
Corrupted file insect-dataset\moth\data\endotricha-spp\Endotricha%20spp._1714768314_291892.jpg removed
Corrupted file insect-dataset\moth\data\endotricha-spp\Endotricha%20spp._1714768329_291927.jpg removed
Corrupted file insect-dataset\moth\data\endotricha-spp\Endotricha%20spp._1714768330_291928.jpg removed
Corrupted file insect-dataset\moth\data\epipaschiinae-genera-spp\Epipaschiinae-genera%20spp._1700264330_276895.jpg removed
Corrupted file insect-dataset\moth\data\epipleminae-

In [13]:
# Normalise species folder names: lower-case them, drop everything from the first
# "%20" onwards, and strip a trailing "-species", "-species-group" or "-group".
for species_dir in list(Path(f"{dataset_dir}/data").iterdir()):
    if not species_dir.is_dir():
        continue
    new_name = species_dir.name.lower()
    new_name_2 = re.sub(r"(%20.*)|(-species$)|(-species-group$)|(-group$)", "", new_name)
    if not new_name_2 or new_name_2 == species_dir.name:
        # Already normalised (or the pattern would leave an empty name): nothing to rename.
        continue
    target = species_dir.with_name(new_name_2)
    # On case-insensitive file systems a case-only rename "exists" already but is
    # the same folder; only a different existing folder is a real collision.
    if target.exists() and not target.samefile(species_dir):
        print(f"Skipping {species_dir}: {target} already exists")
        continue
    if new_name != new_name_2:
        print(f"Renaming {species_dir} to {new_name_2}")
    species_dir.rename(target)

Renaming insect-dataset\moth\data\rinaca-grotei-species to rinaca-grotei


# ZIP the data

In [14]:
import shutil
import time
import datetime

# Zip the data folder into "<dataset_dir>/mothsodindia.org.<YYYY.MM.DD>.zip".
# The strftime format uses single quotes: reusing double quotes inside a double-quoted
# f-string is a SyntaxError before Python 3.12.
# NOTE(review): "mothsodindia" looks like a typo for "mothsofindia"; the name is kept
# unchanged here because other tooling may already rely on it.
shutil.make_archive(f"{dataset_dir}/mothsodindia.org.{datetime.datetime.now().strftime('%Y.%m.%d')}", 'zip', f"{dataset_dir}/data")

'D:\\Projects\\my-jupyter-notebook\\insect-species-identification\\insect-dataset\\moth\\mothsodindia.org.2025.02.06.zip'

# Fetch species names

In [9]:
# Map each class (species folder name) to the number of entries in that folder.
# Uses dataset_dir instead of repeating the literal path, and ignores stray files
# in the data folder (os.listdir on a file would raise NotADirectoryError).
classes = {
    class_dir: len(os.listdir(f"{dataset_dir}/data/{class_dir}"))
    for class_dir in os.listdir(f"{dataset_dir}/data")
    if os.path.isdir(f"{dataset_dir}/data/{class_dir}")
}

In [10]:
# Maps class name -> {"name": <common name>}. The lookup loop skips classes that already
# have a "name" here, so re-running this cell discards everything fetched so far.
class_details = {}

In [11]:
# Look up the common name of every adult-stage class on its species page.
# The page title is expected to read "<scientific name + author>  - <common name>";
# classes whose title has no common-name part are left out of class_details.
for class_name in [c for c in classes if not c.endswith("-early")]:
    try:
        if class_name in class_details and "name" in class_details[class_name]:
            # Already resolved in a previous (possibly interrupted) run of this cell.
            continue
        print()
        print(f"class_name: {class_name}")
        url = f"{website_url}/{class_name}"
        response = requests.get(url, timeout=page_timeout)
        # Report HTTP failures as such instead of as a confusing parse error.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find(lambda tag: tag.find("span", class_="spices-title") is not None, class_="field field--name-title field--type-string field--label-hidden")
        if title_tag is None:
            print(f"ERROR: title element not found at {url}")
            continue
        title = title_tag.get_text(strip=True)
        print(f"title: {title}")
        title_parts = re.split(r"\s+-\s+", title)
        if len(title_parts) < 2:
            # No " - <common name>" part in the title.
            continue
        # NOTE(review): U+2019 is replaced by TWO apostrophes (''), which reads like
        # SQL-style quoting; kept as is -- confirm a single "'" was not intended.
        species_name = re.sub(r"\u2019", "''", title_parts[-1])
        print(f"species_name: {species_name}")
        class_details[class_name] = {"name": species_name}
    except Exception as e:
        # Keep going with the next class, but say which one failed and why.
        print(f"ERROR: {class_name}: {e}")


class_name: abaciscus-figlina
title: Abaciscus figlina(Swinhoe, 1890)  -

class_name: abaciscus-spp
title: Abaciscus spp.Butler, 1889  -

class_name: abaciscus-tristis
title: Abaciscus tristisButler, 1889   -

class_name: aberrasine-strigivenata
title: Aberrasine strigivenata(Hampson, 1894)  -

class_name: abraxaphantes-perampla
title: Abraxaphantes perampla(Swinhoe, 1890)  -

class_name: abraxas-etridoides
title: Abraxas etridoidesHampson, 1895  -

class_name: abraxas-fasciaria
title: Abraxas fasciariaGuérin-Méneville, 1843  -

class_name: abraxas-irrorata
title: Abraxas irrorataMoore, [1868]  -

class_name: abraxas-latizonata
title: Abraxas latizonataHampson, 1907  -

class_name: abraxas-poliostrota
title: Abraxas poliostrotaHampson, 1907  -

class_name: abraxas-spp
title: Abraxas spp.Leach, 1815  - Magpie Moth
species_name: Magpie Moth

class_name: abrostola-anophioides
title: Abrostola anophioidesMoore, 1882  -

class_name: absala-dorcada
title: Absala dorcadaSwinhoe, 1893  -

cla

In [13]:
import json

# Persist the collected class details as pretty-printed JSON.
class_details_path = "insect-dataset/class_details.mothsofindia.org.json"
with open(class_details_path, "w", encoding="utf-8") as file:
    json.dump(class_details, file, indent=4)