In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor
import shutil
from PIL import Image
from pathlib import Path
import time
import datetime

In [2]:
# Root folder of the scraped dataset; `scrape` stores each class in its own sub-folder here.
dataset_dir = "insect-dataset/insecta.pro"

In [3]:
# Timeouts handed to `requests.get`: one for gallery pages, one for single images.
page_timeout, image_timeout = 120, 30

# Request headers sent with every HTTP call (a desktop-browser User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}

def log_header():
    """Return the log-line prefix that names the calling thread.

    The thread name is left-aligned and padded to at least 24 characters so
    that messages from different worker threads line up in the output.
    """
    thread_name = threading.current_thread().name
    return "[ " + format(thread_name, "24") + " ]  "

def check_image(file_path):
    """Tell whether the file at ``file_path`` passes Pillow's integrity check.

    Args:
        file_path: Path of the image file to inspect.

    Returns:
        True when the file opens and ``verify()`` completes, False when
        either step raises ``IOError`` or ``SyntaxError``. Any other
        exception propagates to the caller.
    """
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True
    
def download_image(img_url, output_dir):
    """Download one image into ``output_dir`` and verify that it is readable.

    The file name is the last path segment of ``img_url`` with any query
    string removed.

    Args:
        img_url: Absolute URL of the image.
        output_dir: Folder to store the file in; created when missing.

    Returns:
        'EXISTS' when a valid copy is already on disk, 'SUCCESS' when the
        image was downloaded and passed ``check_image``, and 'FAILURE'
        otherwise. Exceptions are logged and reported as 'FAILURE'; they are
        never raised to the caller.
    """
    try:
        # exist_ok avoids a FileExistsError when several worker threads
        # create the same folder at the same moment.
        os.makedirs(output_dir, exist_ok=True)
        img_name = img_url.split("/")[-1].split("?")[0]
        if not img_name:
            # Nothing to name the file after (URL ends with "/"); fail before
            # spending a network round trip on it.
            raise ValueError(f"No file name in image URL: {img_url}")
        img_path = os.path.join(output_dir, img_name)
        if Path(img_path).is_file() and check_image(img_path):
            # skipping, already downloaded
            return 'EXISTS'
        response = requests.get(img_url, timeout=image_timeout, headers=headers)
        # Treat HTTP errors as failures instead of saving the error body.
        response.raise_for_status()
        with open(img_path, 'wb') as file:
            file.write(response.content)
        if not check_image(img_path):
            print(f"{log_header()}Removing corrupted image {img_path}")
            os.remove(img_path)
            return 'FAILURE'
        return 'SUCCESS'
    except Exception as e:
        print(f"{log_header()}{e}")
        return 'FAILURE'

def scrape(start, end):
    """Scrape gallery pages ``start`` (inclusive) to ``end`` (exclusive).

    For each page, every ``<img>`` inside the ``div.gal`` element is passed
    to ``download_image`` and stored under ``{dataset_dir}/{class_name}``.
    The class name is the ``title`` attribute of the image's parent element,
    lower-cased and with spaces replaced by hyphens. Errors on a page or on a
    single image are logged and do not stop the remaining work.

    Args:
        start: First page number to request.
        end: Page number to stop before.

    Returns:
        None. Running totals are printed after every page.
    """
    success_cnt = 0
    failure_cnt = 0
    exists_cnt = 0
    for page in range(start, end, 1):
        try:
            url = f"https://insecta.pro/gallery?page={page}&prec=1&fem=1&live=1&size=m&tx=[1]%20Lepidoptera%20%20[%D0%9E%D1%82%D1%80%D1%8F%D0%B4]&taxlev=10"
            print(f"{log_header()}Scraping {url} ...")
            response = requests.get(url, timeout=page_timeout, headers=headers)
            # Log HTTP errors instead of silently parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            div = soup.find("div", class_="gal")
            img_tags = div.find_all("img") if div else []
            for img in img_tags:
                try:
                    src = img.get('src')
                    title = img.parent.get("title")
                    if not src or not title:
                        print(f"{log_header()}Skipping image without src or title on page {page}")
                        continue
                    class_name = re.sub(r" ", "-", title.lower())
                    class_dir = f"{dataset_dir}/{class_name}"
                    # urljoin resolves site-relative paths and leaves
                    # already-absolute URLs untouched.
                    img_url = urljoin("https://insecta.pro", src)
                    # exist_ok: other worker threads may create the same
                    # class folder concurrently.
                    os.makedirs(class_dir, exist_ok=True)
                    status = download_image(img_url, class_dir)
                    if status == 'SUCCESS':
                        success_cnt += 1
                    elif status == 'FAILURE':
                        failure_cnt += 1
                        # Drop the class folder again if nothing was saved in
                        # it. rmdir refuses non-empty folders, so images
                        # stored by other threads are left alone.
                        try:
                            os.rmdir(class_dir)
                        except OSError:
                            pass
                    elif status == 'EXISTS':
                        exists_cnt += 1
                except Exception as e:
                    print(f"{log_header()}{e}")
        except Exception as e:
            print(f"{log_header()}{e}")
        # The dataset folder does not exist until the first class folder is
        # created; report zero classes instead of crashing the worker.
        try:
            class_total = len(os.listdir(dataset_dir))
        except FileNotFoundError:
            class_total = 0
        print(f"{log_header()}SUCCESS: {success_cnt:5} | FAILURE: {failure_cnt:5} | EXISTS: {exists_cnt:5} | CLASSES: {class_total:5}")

# Gallery pages `start` .. `end` (inclusive) are split into chunks of `step`
# pages; each chunk is handled by one `scrape` call on the thread pool.
start = 0
end = 254
step = 5
max_workers = 100
print(f"{log_header()}Starting scraping...")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Clamp the final chunk so it never runs past `end` when the page count
    # is not a multiple of `step`.
    futures = [
        executor.submit(scrape, offset, min(offset + step, end + 1))
        for offset in range(start, end + 1, step)
    ]
    # result() blocks until the chunk is finished and re-raises anything the
    # worker raised.
    for future in futures:
        print(f"Thread completed with result {future.result()}")
print(f"{log_header()}Scraping completed")

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-0_0   ]  Scraping https://insecta.pro/gallery?page=0&prec=1&fem=1&live=1&size=m&tx=[1]%20Lepidoptera%20%20[%D0%9E%D1%82%D1%80%D1%8F%D0%B4]&taxlev=10 ...
[ ThreadPoolExecutor-0_1   ]  Scraping https://insecta.pro/gallery?page=5&prec=1&fem=1&live=1&size=m&tx=[1]%20Lepidoptera%20%20[%D0%9E%D1%82%D1%80%D1%8F%D0%B4]&taxlev=10 ...
[ ThreadPoolExecutor-0_2   ]  Scraping https://insecta.pro/gallery?page=10&prec=1&fem=1&live=1&size=m&tx=[1]%20Lepidoptera%20%20[%D0%9E%D1%82%D1%80%D1%8F%D0%B4]&taxlev=10 ...
[ ThreadPoolExecutor-0_3   ]  Scraping https://insecta.pro/gallery?page=15&prec=1&fem=1&live=1&size=m&tx=[1]%20Lepidoptera%20%20[%D0%9E%D1%82%D1%80%D1%8F%D0%B4]&taxlev=10 ...
[ ThreadPoolExecutor-0_4   ]  Scraping https://insecta.pro/gallery?page=20&prec=1&fem=1&live=1&size=m&tx=[1]%20Lepidoptera%20%20[%D0%9E%D1%82%D1%80%D1%8F%D0%B4]&taxlev=10 ...
[ ThreadPoolExecutor-0_5   ]  Scraping https://insecta.pro/gallery?page=25&p

In [6]:
# Count the class folders and the files they contain. Entries that are not
# directories (stray files such as .DS_Store) are skipped, because listing
# them would raise NotADirectoryError.
class_dirs = [
    os.path.join(dataset_dir, entry)
    for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
]
class_cnt = len(class_dirs)
data_cnt = sum(len(os.listdir(class_dir)) for class_dir in class_dirs)
print(f"Class count: {class_cnt}")
print(f"Data count: {data_cnt}")

Class count: 5068
Data count: 25221
