In [1]:
# !pip install torch torchvision
# !pip install matplotlib pandas scikit-learn
# !pip install pillow
# !pip install opencv-python
# !pip install requests beautifulsoup4

In [2]:
import torch
# Sanity check: displays whether PyTorch reports CUDA as available.
torch.cuda.is_available()

True

In [1]:
# Root folder of the dataset: scraped images are written to <dataset_dir>/data,
# and the zip archive is created directly inside <dataset_dir>.
dataset_dir = "insect-dataset/moth"

# Scrape https://www.mothsofindia.org for the moth dataset

In [4]:
# Site to scrape and the path of its paginated listing; pages are requested
# as <website_url><initial_path>?page=<n>.
website_url = "https://www.mothsofindia.org"
initial_path = "/lepidoptera"
# First and last listing page to crawl (both inclusive).
first_page = 0
last_page = 145

# Number of consecutive listing pages handled by one worker task.
batch_size = 3
# Size of the thread pool that runs the batches.
max_workers = 50
# Timeouts passed to requests.get() for HTML pages and for image files.
page_timeout = 120
image_timeout = 30

# Images whose file name matches this pattern are skipped.
ignore_image_regex = r"^(imgs10|.*webheader.*)\.png$"
# When True, a species whose folder already exists and is non-empty is not scraped again.
skip_downloaded_species = False

In [2]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor

In [6]:
def log_header():
    """Build the prefix used for log lines printed from worker threads.

    Returns:
        The current thread's name, left-aligned and padded to 24 characters,
        wrapped in square brackets and followed by two spaces.
    """
    thread_name = threading.current_thread().name
    return "[ " + format(thread_name, "24") + " ]  "

In [7]:
def download_image(img_url, output_dir):
    """Download a single image and store it in ``output_dir``.

    The file is named after the last ``/``-separated segment of ``img_url``.

    Args:
        img_url: Absolute URL of the image.
        output_dir: Existing folder the image is written to.

    Returns:
        True when the image was fetched and written, False on any failure
        (network error, HTTP error status, or file-system error).
    """
    try:
        response = requests.get(img_url, timeout=image_timeout)
        # Without this check the body of a 404/500 error page would be saved
        # to disk and reported as a successfully downloaded image.
        response.raise_for_status()
        img_name = img_url.split("/")[-1]
        img_path = os.path.join(output_dir, img_name)
        with open(img_path, 'wb') as file:
            file.write(response.content)
        return True
    except Exception:
        # Deliberately best-effort: the failure is reported to the caller
        # through the return value instead of being raised or logged.
        return False

In [8]:
def scrape_images(url, output_dir):
    """Download every image referenced by the page at ``url`` into ``output_dir``.

    Each ``<img>`` ``src`` is resolved against ``url``. Images whose file name
    matches ``ignore_image_regex`` are skipped, and images whose file already
    exists in ``output_dir`` are counted as already downloaded instead of being
    fetched again. A one-line summary is printed at the end.

    Args:
        url: Address of the page to scan.
        output_dir: Folder that receives the images; created when missing.

    Returns:
        True when the page was fetched and processed (individual image
        failures are only counted), False when an exception was raised; the
        exception is printed.
    """
    try:
        print(f"{log_header()}    Scraping URL: {url}")
        # exist_ok avoids the check-then-create race between worker threads:
        # a FileExistsError here would be swallowed below and skip the page.
        os.makedirs(output_dir, exist_ok=True)
        response = requests.get(url, timeout=page_timeout)
        # Treat an HTTP error status as a failure instead of parsing the
        # error page as if it were the requested page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        success_cnt = 0
        failure_cnt = 0
        already_downloaded_cnt = 0
        for img in soup.find_all('img'):
            img_url = img.get('src')
            if not img_url:
                continue
            img_url = urljoin(url, img_url)
            img_name = img_url.split("/")[-1]
            if re.search(ignore_image_regex, img_name):
                continue
            if os.path.exists(os.path.join(output_dir, img_name)):
                already_downloaded_cnt += 1
                continue
            if download_image(img_url, output_dir):
                success_cnt += 1
            else:
                failure_cnt += 1
        print(f"{log_header()}      Downloaded {success_cnt}(+{already_downloaded_cnt}) / {success_cnt+already_downloaded_cnt+failure_cnt} image(s) in {output_dir}")
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

In [9]:
def crawl(base_url, root, output_dir):
    """Crawl one listing page and scrape the species pages it links to.

    For every ``<img>`` on the page whose file name does not match
    ``ignore_image_regex``, the ``href`` of the image's grandparent element is
    taken as the species page path. That page is scraped into
    ``output_dir + species_path`` unless ``skip_downloaded_species`` is set and
    the folder already exists and is non-empty.

    Args:
        base_url: Site root used to resolve ``root`` and the species paths.
        root: Path (including query string) of the listing page.
        output_dir: Folder under which the per-species folders are created.

    Returns:
        True when the listing page was fetched and processed, False when an
        exception was raised; the exception is printed.
    """
    try:
        url = urljoin(base_url, root)
        print(f"{log_header()}Crawling URL: {url}")
        # Every worker shares this folder, so a check-then-create sequence
        # races on the first run; exist_ok makes the creation idempotent.
        os.makedirs(output_dir, exist_ok=True)
        response = requests.get(url, timeout=page_timeout)
        # Report an HTTP error status instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # A species link seen once on this page is not scraped again.
        seen_species = set()
        for img in soup.find_all('img'):
            img_url = img.get('src')
            if not img_url:
                continue
            img_name = urljoin(url, img_url).split("/")[-1]
            if re.search(ignore_image_regex, img_name):
                continue
            # Images that are not wrapped in a link two levels up have no
            # species page; skip them instead of failing on a None href and
            # abandoning the rest of the page.
            link = img.parent.parent if img.parent is not None else None
            species_path = link.get('href') if link is not None else None
            if not species_path or species_path in seen_species:
                continue
            seen_species.add(species_path)
            species_dir = output_dir+species_path
            already_downloaded = (
                skip_downloaded_species
                and os.path.isdir(species_dir)
                and os.listdir(species_dir)
            )
            if not already_downloaded:
                scrape_images(urljoin(base_url, species_path), species_dir)
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

def crawl_in_batch(batch_start):
    """Crawl up to ``batch_size`` consecutive listing pages.

    Pages ``batch_start`` to ``batch_start + batch_size - 1`` are crawled in
    order, never going past ``last_page``.

    Args:
        batch_start: Number of the first listing page of the batch.

    Returns:
        True when ``crawl`` succeeded for every page of the batch, False when
        at least one page failed. Previously the per-page status was discarded
        and None was returned.
    """
    print(f"Starting batch from page {batch_start} on thread {threading.current_thread().name}")
    batch_end = min(batch_start+batch_size, last_page+1)
    # A list, not a generator, so all() cannot short-circuit: every page is
    # crawled even when an earlier one failed.
    page_results = [
        crawl(website_url, f"{initial_path}?page={page}", f"{dataset_dir}/data")
        for page in range(batch_start, batch_end)
    ]
    return all(page_results)

In [10]:
# Submit one task per batch of listing pages, then wait for them all.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [
        executor.submit(crawl_in_batch, start_page)
        for start_page in range(first_page, last_page+1, batch_size)
    ]
    # result() blocks until the task is done, so results are reported in
    # submission order.
    for future in futures:
        outcome = future.result()
        print(f"Thread completed with result {outcome}")

print("Scraping completed!")

Starting batch from page 0 on thread ThreadPoolExecutor-0_0
[ ThreadPoolExecutor-0_0   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=0
Starting batch from page 3 on thread ThreadPoolExecutor-0_1
[ ThreadPoolExecutor-0_1   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=3
Starting batch from page 6 on thread ThreadPoolExecutor-0_2
[ ThreadPoolExecutor-0_2   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=6
Starting batch from page 9 on thread ThreadPoolExecutor-0_3
[ ThreadPoolExecutor-0_3   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=9
Starting batch from page 12 on thread ThreadPoolExecutor-0_4
[ ThreadPoolExecutor-0_4   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=12
Starting batch from page 15 on thread ThreadPoolExecutor-0_5
[ ThreadPoolExecutor-0_5   ]  Crawling URL: https://www.mothsofindia.org/lepidoptera?page=15
Starting batch from page 18 on thread ThreadPoolExecutor-0_6
[ ThreadPoolExecutor-0_

KeyboardInterrupt: 

# Remove corrupted images & empty folders

In [4]:
import os
from PIL import Image
from pathlib import Path

def check_image(file_path):
    """Tell whether the file at ``file_path`` passes Pillow's verification.

    Returns:
        True when ``Image.open`` and ``verify()`` both complete, False when
        either raises ``IOError`` or ``SyntaxError``.
    """
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True

# Delete every file that fails check_image(), then delete species folders
# that are (or have become) empty.
for species_dir in Path(f"{dataset_dir}/data").iterdir():
    # Leave stray files in the data folder alone: calling os.listdir() on a
    # non-directory raises NotADirectoryError and would stop the clean-up.
    if not species_dir.is_dir():
        continue
    for file in species_dir.iterdir():
        if file.is_file() and not check_image(file):
            os.remove(file)
            print(f"Corrupted file {file} removed")
    if not os.listdir(species_dir):
        os.rmdir(species_dir)
        print(f"Empty folder {species_dir} removed")

# ZIP the data

In [12]:
import shutil
import time
import datetime

# Zip the data folder as <dataset_dir>/data_<unix timestamp>.zip; the value
# returned by make_archive is displayed as the cell output.
shutil.make_archive(
    base_name=f"{dataset_dir}/data_{int(time.time())}",
    format='zip',
    root_dir=f"{dataset_dir}/data",
)

'D:\\Projects\\my-jupyter-notebook\\insect-dataset\\moth\\data_1737815141.zip'