In [62]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor
import shutil
from PIL import Image
from pathlib import Path
import time
import datetime

In [22]:
# Root folder for the scraped images; the scraping loop below writes each
# class into its own sub-folder "<dataset_dir>/<class_name>".
dataset_dir = "insect-dataset/wikipedia.org"

In [23]:
# Collect the class names from the sub-folders of the moth and butterfly
# datasets, leaving out entries whose name ends in "-spp", "-genera" or "-early".
excluded_suffix = re.compile(r"^.+-(spp|genera|early)$")

classes = []
for data_dir in ("insect-dataset/moth", "insect-dataset/butterfly"):
    for class_dir in os.listdir(f"{data_dir}/data"):
        if excluded_suffix.match(class_dir):
            continue
        classes.append(class_dir)

In [24]:
# Number of class names collected above (shown as the cell output).
len(classes)

3739

In [93]:
# Timeout passed to requests.get() when fetching a Wikipedia article page.
page_timeout = 120
# Timeout passed to requests.get() when fetching a single image file.
image_timeout = 30

# Image file names matching this pattern (via re.match, case-insensitive) are
# skipped by the scraping loop. As written it matches names that:
#   - contain ".svg" anywhere,
#   - start with "wikipedia", then any one character, then "png",
#   - start with "start",
#   - contain the word "icon" followed later by the word "UI".
# NOTE(review): the "." in "wikipedia.png" is unescaped, and the last
# alternative requires BOTH "icon" and "UI" -- possibly a "|" is missing
# between them. Confirm the intent before changing the pattern.
img_blacklist_regex = r"(?i)^(.*\.svg|wikipedia.png|start|.+\bicon\b.+.+\bUI\b.+).*$"

# HTTP headers sent with the image requests in download_image().
# NOTE(review): this impersonates a desktop browser; Wikimedia's User-Agent
# policy asks for a descriptive agent with contact information -- consider
# replacing it.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

def log_header():
    """Return the log-line prefix for the calling thread.

    The prefix is the current thread's name, left-aligned and padded to at
    least 24 characters, wrapped as "[ <name> ]" and followed by two spaces.
    """
    thread_name = threading.current_thread().name
    return "[ {:24} ]  ".format(thread_name)

def check_image(file_path):
    """Tell whether ``file_path`` can be opened and verified by Pillow.

    Args:
        file_path: Path of the image file to check.

    Returns:
        True when ``Image.open`` and ``verify()`` both succeed, False when
        either raises ``IOError`` or ``SyntaxError``.
    """
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True
    
def download_image(img_url, output_dir):
    """Download one image into ``output_dir`` and validate it.

    The local file name is the last path segment of ``img_url`` with any
    query string stripped.

    Args:
        img_url: Absolute URL of the image to fetch.
        output_dir: Folder the image is stored in; created when missing.

    Returns:
        'EXISTS'  -- a valid copy is already on disk, nothing was downloaded.
        'SUCCESS' -- the image was downloaded and passed ``check_image``.
        'FAILURE' -- the request failed, the server answered with an HTTP
                     error status, or the downloaded file is not a valid
                     image. The reason is printed; no exception propagates.
    """
    try:
        # exist_ok avoids the check-then-create race when several threads
        # target the same folder.
        os.makedirs(output_dir, exist_ok=True)
        img_name = img_url.split("/")[-1].split("?")[0]
        img_path = os.path.join(output_dir, img_name)
        if Path(img_path).is_file():
            if check_image(img_path):
                # skipping, already downloaded
                return 'EXISTS'
            # A stale corrupt/partial file must not survive a failed retry.
            os.remove(img_path)
        response = requests.get(img_url, timeout=image_timeout, headers=headers)
        # Do not store an HTML error page (403/404/429, ...) as an image.
        response.raise_for_status()
        with open(img_path, 'wb') as file:
            file.write(response.content)
        if not check_image(img_path):
            print(f"{log_header()}Removing corrupted image {img_path}")
            os.remove(img_path)
            return 'FAILURE'
        return 'SUCCESS'
    except Exception as e:
        # Best effort: one broken image must not stop the whole scrape.
        print(f"{log_header()}{e}")
        return 'FAILURE'

# Scrape the taxobox ("infobox biota") images of every class from its
# English Wikipedia article into "<dataset_dir>/<class_name>".
for class_name in classes:
    # Article titles use underscores and a capitalised first letter.
    page_name = class_name.replace("-", "_")
    page_name = page_name[:1].upper() + page_name[1:]
    url = f"https://en.wikipedia.org/wiki/{page_name}"
    class_dir = f"{dataset_dir}/{class_name}"
    try:
        # Send the same headers as the image requests for consistency.
        response = requests.get(url, timeout=page_timeout, headers=headers)
    except requests.RequestException as e:
        # A single timeout / connection error must not abort the whole run.
        print(f"{log_header()}Skipping {url}: {e}")
        continue
    if not response.ok:
        # Many classes have no article (404); only report unexpected statuses.
        if response.status_code != 404:
            print(f"{log_header()}Skipping {url}: HTTP {response.status_code}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    infobox = soup.find("table", class_="infobox biota")
    if not infobox:
        continue
    for img in infobox.find_all('img'):
        img_src = img.get('src')
        if not img_src:
            # <img> without a src attribute (e.g. lazy-loaded placeholders)
            continue
        img_name = img_src.split("/")[-1].split("?")[0]
        if re.match(img_blacklist_regex, img_name):
            continue
        # Resolves protocol-relative ("//host/...") and site-relative
        # ("/static/...") sources against the article URL.
        img_url = urljoin(url, img_src)
        print(f"{log_header()}\tDownloading {img_url} in {class_dir}")
        os.makedirs(class_dir, exist_ok=True)
        download_image(img_url, class_dir)
    # Drop the class folder again when no image could be stored in it.
    if os.path.isdir(class_dir) and not os.listdir(class_dir):
        os.rmdir(class_dir)

[ MainThread               ]  	Downloading https://upload.wikimedia.org/wikipedia/commons/thumb/4/46/%E7%A2%8E%E4%B8%89%E7%B7%9A%E5%B0%BA%E8%9B%BE-%E7%A2%8E%E7%99%BD%E9%8B%B8%E7%B7%9A%E5%B0%BA%E8%9B%BE_Abaciscus_tristis_Butler%2C_1889_%289183566992%29.jpg/220px-%E7%A2%8E%E4%B8%89%E7%B7%9A%E5%B0%BA%E8%9B%BE-%E7%A2%8E%E7%99%BD%E9%8B%B8%E7%B7%9A%E5%B0%BA%E8%9B%BE_Abaciscus_tristis_Butler%2C_1889_%289183566992%29.jpg in insect-dataset/wikipedia.org/abaciscus-tristis
[ MainThread               ]  	Downloading https://upload.wikimedia.org/wikipedia/commons/thumb/6/66/Aberrasine_strigivenata_-_inat_331660.jpg/220px-Aberrasine_strigivenata_-_inat_331660.jpg in insect-dataset/wikipedia.org/aberrasine-strigivenata
[ MainThread               ]  	Downloading https://upload.wikimedia.org/wikipedia/commons/thumb/9/93/Geometrid_Moths_%28Abraxaphantes_perampla%29_mating_%287788487322%29.jpg/220px-Geometrid_Moths_%28Abraxaphantes_perampla%29_mating_%287788487322%29.jpg in insect-dataset/wikipedia.org/a

In [94]:
# Remove empty class folders left under dataset_dir.
# sorted() snapshots the listing before anything is deleted and makes the
# log order deterministic; non-directory entries (stray files) are skipped
# because os.listdir()/os.rmdir() would raise on them.
for species_dir in sorted(Path(dataset_dir).iterdir()):
    if species_dir.is_dir() and not os.listdir(species_dir):
        os.rmdir(species_dir)
        print(f"Empty folder {species_dir} removed")

Empty folder insect-dataset\wikipedia.org\actinor-radians removed
Empty folder insect-dataset\wikipedia.org\allotinus-unicolor removed
Empty folder insect-dataset\wikipedia.org\amathuxidia-amythaon removed
Empty folder insect-dataset\wikipedia.org\apona-caschmirensis removed
Empty folder insect-dataset\wikipedia.org\aporia-harrietae removed
Empty folder insect-dataset\wikipedia.org\aporia-nabellica removed
Empty folder insect-dataset\wikipedia.org\apostictopterus-fuliginosus removed
Empty folder insect-dataset\wikipedia.org\apporasa-atkinsoni removed
Empty folder insect-dataset\wikipedia.org\araotes-lapithis removed
Empty folder insect-dataset\wikipedia.org\arhopala-athada removed
Empty folder insect-dataset\wikipedia.org\arhopala-khamti removed
Empty folder insect-dataset\wikipedia.org\arhopala-nicevillei removed
Empty folder insect-dataset\wikipedia.org\arhopala-paraganesa removed
Empty folder insect-dataset\wikipedia.org\arhopala-rama removed
Empty folder insect-dataset\wikipedia.or

In [95]:
# Summarise the scraped dataset: number of class folders and total number of
# files stored across them.
files_per_class = []
for class_name in os.listdir(dataset_dir):
    files_per_class.append(len(os.listdir(f"{dataset_dir}/{class_name}")))
class_cnt = len(files_per_class)
data_cnt = sum(files_per_class)
print(f"Class count: {class_cnt}")
print(f"Data count: {data_cnt}")

Class count: 1825
Data count: 2270
