In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor
import shutil
from PIL import Image
from pathlib import Path
import time
import datetime

In [2]:
# Root directory for scraped images; scrape() downloads into one sub-directory per class beneath it.
dataset_dir = "insect-dataset/src/inaturalist.org"

In [25]:
# Timeout passed to requests.get for the taxon search page (requests interprets it in seconds).
page_timeout = 120
# Timeout passed to requests.get for each individual image download (seconds).
image_timeout = 30
# Size of the ThreadPoolExecutor created by scrape_multithread.
max_workers = 50

def log_header():
    """Return the log-line prefix: the current thread's name, left-aligned in a 24-character column."""
    thread_name = threading.current_thread().name
    return "[ " + thread_name.ljust(24) + " ]  "

def check_image(file_path):
    """Report whether the file at `file_path` passes Pillow's integrity check.

    Returns True when the file opens and `verify()` completes, and False when
    either step raises IOError or SyntaxError. Any other exception propagates.
    """
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True
    
def _remove_dir_if_empty(dir_path):
    """Best-effort removal of `dir_path`; a missing or non-empty directory is left alone."""
    try:
        # os.rmdir only succeeds on an empty directory, so no separate (racy)
        # listdir check is needed, and a missing directory cannot escape as an error.
        os.rmdir(dir_path)
    except OSError:
        pass


def download_image(img_url, output_dir, uuid):
    """Download one image into `output_dir` and validate it with check_image.

    The file is stored as "<uuid>-<basename of img_url without its query string>".

    Args:
        img_url: URL of the image to fetch.
        output_dir: Directory to store the image in; created when missing.
        uuid: Identifier prefixed to the file name.

    Returns:
        'EXISTS'  - a valid image is already present at the target path,
        'SUCCESS' - the image was downloaded and passed validation,
        'FAILURE' - the download failed, the server answered with an HTTP error,
                    or the downloaded file did not pass validation.
    """
    try:
        # exist_ok avoids the check-then-create race when several threads run at once.
        os.makedirs(output_dir, exist_ok=True)
        # Drop the query string so it does not end up in the file name.
        img_name = img_url.split("/")[-1].split("?")[0]
        img_path = os.path.join(output_dir, f"{uuid}-{img_name}")
        if Path(img_path).is_file() and check_image(img_path):
            # skipping, already downloaded
            return 'EXISTS'
        response = requests.get(img_url, timeout=image_timeout)
        # Reject 4xx/5xx answers up front instead of writing an error page to disk.
        response.raise_for_status()
        with open(img_path, 'wb') as file:
            file.write(response.content)
        if not check_image(img_path):
            print(f"{log_header()}Removing corrupted image {img_path}")
            os.remove(img_path)
            _remove_dir_if_empty(output_dir)
            return 'FAILURE'
        return 'SUCCESS'
    except Exception as e:
        print(f"{log_header()}{e}")
        _remove_dir_if_empty(output_dir)
        return 'FAILURE'

def get_observations(taxon_id, page):
    """Request one page of photo observations for a taxon from the iNaturalist v1 API.

    The query asks for research-grade observations that have photos, ordered by
    votes, 100 per page.

    Args:
        taxon_id: Taxon identifier sent as the `taxon_id` query parameter.
        page: Page number sent as the `page` query parameter.

    Returns:
        The `requests.Response` object; the caller is responsible for checking
        the status and decoding the JSON body.
    """
    url = "https://api.inaturalist.org/v1/observations"
    params = {
        "taxon_id": taxon_id,
        "order_by": "votes",
        "quality_grade": "research",
        "photos": "true",
        "page": page,
        "per_page": 100
    }
    headers = {
        "Accept": "application/json",
    }
    # requests has no default timeout; without one a stalled connection would
    # block the worker thread forever. Use the same limit as the search page.
    return requests.get(url, params=params, headers=headers, timeout=page_timeout)

def find_taxon_id(soup):
    """Extract the numeric taxon id from a parsed taxa search page.

    The link is looked up in this order: an `<a class="name sciname">`, then the
    first `<a>` inside `<div class="taxonimage">`, then the first `<a>` inside
    `<div class="first">`.

    Args:
        soup: Parsed page exposing BeautifulSoup's `find(name, class_=...)` API.

    Returns:
        The taxon id as a string of digits (e.g. "870185" for an href of
        "/taxa/870185-Abaciscus-figlina"), or None when no link is found, the
        link has no href, or the href carries no numeric id.
    """
    tag_a = soup.find("a", class_="name sciname")
    if tag_a is None:
        parent_div = soup.find("div", class_="taxonimage")
        if parent_div is None:
            parent_div = soup.find("div", class_="first")
        if parent_div is not None:
            tag_a = parent_div.find("a")
    if tag_a is None:
        return None
    href = tag_a.get("href")
    if not href:
        # An anchor without href would otherwise make the regex call raise TypeError.
        return None
    # Accept "/taxa/<id>-<name>", an absolute URL containing it, or a bare "<id>-<name>".
    found = re.search(r"(?:^|/taxa/)(\d+)", href)
    return found.group(1) if found else None

def scrape(class_names, skip_existing_dir=False):
    """Download iNaturalist photos for each class name, one class at a time.

    For every class the taxon is resolved through the iNaturalist taxa search
    page, the first page of observations is fetched with get_observations, and
    every photo is downloaded (in "medium" size) with download_image into
    `<dataset_dir>/<class_name>`. Observations carrying a tag that mentions
    egg/larva/pupa go to `<dataset_dir>/<class_name>-early` instead.

    Args:
        class_names: Iterable of class names used as search queries and
            directory names.
        skip_existing_dir: When True, classes whose `<dataset_dir>/<class_name>`
            directory already exists are skipped without any request.

    Returns:
        None. Progress and running totals (cumulative over all classes handled
        by this call) are printed; an error while processing one class is
        printed and the loop continues with the next class.
    """
    counts = {'SUCCESS': 0, 'FAILURE': 0, 'EXISTS': 0}
    # Compiled once; the pattern itself is unchanged.
    early_stage_tag = re.compile(r"^.*(egg|eggs|larva|larvae|pupa|pupae).*$")
    for class_name in class_names:
        if skip_existing_dir and os.path.exists(f"{dataset_dir}/{class_name}"):
            continue
        try:
            # Let requests encode the query instead of interpolating it into the URL.
            response = requests.get(
                "https://www.inaturalist.org/taxa/search",
                params={"q": class_name},
                timeout=page_timeout,
            )
            # Surface HTTP errors (rate limiting, outages) instead of parsing an error page.
            response.raise_for_status()
            taxon_id = find_taxon_id(BeautifulSoup(response.text, 'html.parser'))
            if not taxon_id:
                print(f"{log_header()}No taxon found for {class_name}, skipping")
                continue
            print(f"{log_header()}Processing {class_name} | taxon_id:{taxon_id}")
            response = get_observations(taxon_id, 1)
            response.raise_for_status()
            for result in response.json()["results"]:
                tags = result.get("tags") or []
                if any(early_stage_tag.match(tag) for tag in tags):
                    target_dir = f"{dataset_dir}/{class_name}-early"
                else:
                    target_dir = f"{dataset_dir}/{class_name}"
                for observation_photo in result["observation_photos"]:
                    if "photo" not in observation_photo:
                        continue
                    # The URL is rewritten from the "square" to the "medium" variant.
                    img_url = re.sub(r"\bsquare\b", "medium", observation_photo["photo"]["url"])
                    status = download_image(img_url, target_dir, observation_photo["uuid"])
                    if status in counts:
                        counts[status] += 1
            print(f"{log_header()}SUCCESS: {counts['SUCCESS']:5} | FAILURE: {counts['FAILURE']:5} | EXISTS: {counts['EXISTS']:5}")
        except Exception as ex:
            print(f"{log_header()}{ex}")

In [4]:
def scrape_multithread(class_names, batch_size, skip_existing_dir=False):
    """Run scrape() over `class_names` in a thread pool, one task per batch.

    The list is cut into consecutive slices of `batch_size` names; each slice is
    submitted to a ThreadPoolExecutor of `max_workers` threads. Results are then
    awaited and reported in submission order.
    """
    print(f"{log_header()}Starting scraping...")
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for start in range(0, len(class_names), batch_size):
            batch = class_names[start:(start + batch_size)]
            pending.append(pool.submit(scrape, batch, skip_existing_dir))
        for task in pending:
            print(f"{log_header()}Thread completed with result {task.result()}")
    print(f"{log_header()}Scraping completed")

In [5]:
moth_data_dir = "insect-dataset/moth/data"
butterfly_data_dir = "insect-dataset/butterfly/data"


def _list_species_classes(data_dir):
    """Return the sorted class directories in `data_dir`, excluding -early/-spp/-genera ones.

    Only directories are returned: later cells call os.listdir on every class,
    which would fail on a stray file (e.g. an OS metadata file). Sorting makes
    the batch composition reproducible, since os.listdir order is arbitrary.
    """
    excluded_suffix = re.compile(r"^.*-(early|spp|genera)$")
    return sorted(
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry)) and not excluded_suffix.match(entry)
    )


moth_classes = _list_species_classes(moth_data_dir)
butterfly_classes = _list_species_classes(butterfly_data_dir)

# Moth

In [36]:
class_names = ['abaciscus-figlina',  'abraxas-fasciaria',  'abraxas-irrorata',  'abraxas-latizonata',  'abraxas-poliostrota',  'absala-dorcada',  'acolutha-flavipictaria',  'acontia-catenula',  'acosmeryx-sinjaevi',  'adisura-atkinsoni',  'adrapsa-abnormalis',  'adrapsa-geometroides',  'aemene-maculifascia',  'aeolanthes-cyclantha',  'agassiziella-fuscifusale',  'agathia-gemma',  'agrisius-excellens',  'agrisius-guttivitta',  'agrotera-effertalis',  'aiteta-apriformis',  'alcis-arisema',  'alcis-nilgirica',  'alcis-sublimis',  'allata-violacaeus',  'amana-angulifera',  'ambulyx-substrigilis',  'anabelcia-kala',  'anabelcia-nepalensis',  'anoba-lunifera',  'anoba-pectinata',  'anonychia-diversilinea',  'anonychia-lativitta',  'anoratha-costalis',  'apithecia-viridata',  'apophyga-sericea',  'araeopteron-xanthopis',  'arasada-ornata',  'arbudas-leno',  'arcanusa-sinuosa',  'archernis-humilis',  'arcyophora-icterica',  'arichanna-marginata',  'arichanna-transectata',  'ariolica-lineolata',  'ariolica-superba',  'aristebulea-nobilis',  'artena-lacteicincta',  'astygisa-vexillaria',  'ataboruza-stragulata',  'atosia-himalayana',  'auzata-ocellata',  'avatha-garthei',  'bagada-poliomera',  'balataea-zebraica',  'baodera-khasiana',  'baradesa-lithosioides',  'barsine-insolita',  'bastilla-analis',  'berta-acte',  'berta-apopemta',  'besaia-rubiginea',  'blasticorhinus-varius',  'bocchoris-ciliata',  'bombyx-incomposita',  'brunia-cucullata',  'brunia-sarawaca',  'bryophilopsis-griseata',  'calamotropha-indica',  'calamotropha-melanosticta',  'calamotropha-punctivenellus',  'calliteara-angulata',  'callopistria-callopistrioides',  'calonola-argyria',  'calyptra-ophideroides',  'camadena-vespertilionis',  'camptochilus-reticulatum',  'canucha-miranda',  'carriola-fenestrata',  'catocala-inconstans',  'catocala-prolifica',  'catopta-cashmirensis',  'centronaxa-orthostigialis',  'chabulina-tenera',  'chamaesphecia-xanthosticta',  'chamaita-neuropteroides',  
'chlorodontopera-chalybeata',  'chloromianta-ferruginata',  'choreutis-aegyptiaca',  'choreutis-taprobanes',  'chorodna-testaceata',  'chrysartona-stipata',  'chrysocraspeda-dysmothauma',  'chrysodeixis-permissa',  'churinga-beema',  'churinga-metaxantha',  'cidaria-basharica',  'cirrhochrista-annulifera',  'cirrhochrista-semibrunnea',  'cispia-charma',  'cleapa-latifascia',  'clelea-discriminis',  'clethrorasa-pilcheri',  'collinsa-subcostalis',  'comibaena-apicipicta',  'comocritis-cyanobactra',  'comocritis-olympia',  'comostola-caerulea',  'comostola-leucosticta',  'conogethes-sahyadriensis',  'costicoma-exangulata',  'cyana-flavicincta',  'cyana-moelleri',  'cyana-puer',  'cypa-pallens',  'cypoides-parachinensis',  'dahira-rubiginosa',  'dalima-vulpinaria',  'darisa-lampasaria',  'dasysphecia-bombyliformis',  'daulia-aurantialis',  'delineatia-mesortha',  'deltote-obliqua',  'dendrolimus-himalayanus',  'diacrotricha-fasciola',  'dichocrocis-festivalis',  'dindicodes-harutai',  'diomea-orbifera',  'diphtherocome-fasciata',  'diphtherocome-pallida',  'disepholcia-caerulea',  'ditrigona-idaeoides',  'docirava-affinis',  'docirava-fulgurata',  'dolgoma-reticulata',  'doloessa-constellata',  'doloessa-ochrociliella',  'doratoptera-nicevillei',  'drapetodes-fratercula',  'drosophantis-caeruleata',  'dysmilichia-calamistrata',  'dysodia-miniata',  'dysodia-rajah',  'dysodia-viridatrix',  'dysstroma-albiangulata',  'dysstroma-dentifera',  'ecliptopera-relata',  'ectoblemma-rosella',  'eilicrinia-flava',  'elophila-difflualis',  'endoclita-makundae',  'enispa-minuta',  'enispa-regulata',  'enispa-vinacea',  'eois-plicata',  'eospilarctia-erythrophleps',  'epaena-candidatalis',  'epicopeia-philenora',  'epiplema-latifasciata',  'epipristis-minimaria',  'episteme-vetula',  'erebus-glaucopis',  'erebus-jaintiana',  'eschata-ochreipes',  'etanna-albisecta',  'eucyclodes-aureofulva',  'eugnathia-apiciplaga',  'eugnathia-pictipennis',  'eugraptoblemma-pictalis',  
'euparyphasma-albibasis',  'eupithecia-albigutta',  'eurytaphria-viridulata',  'eusabena-miltochristalis',  'eustroma-elista',  'eustroma-mixtilineata',  'euthrix-improvisa',  'euthrix-inobtrusa',  'exeliopsis-hibernaria',  'extremoplusia-megaloba',  'fujimacia-bicoloralis',  'gandaritis-flavata',  'ganisa-similis',  'garella-ruficirra',  'gariga-mirabilis',  'garudinia-pseudosimulana',  'gaurena-aurofasciata',  'ghatarbela-bifidunca',  'ghoria-albocinerea',  'ghoria-postfusca',  'giaura-sceptica',  'glyphodes-badialis',  'glyphodes-lacustralis',  'glyphodes-prothymalis',  'goenycta-niveiguttata',  'gonerda-perornata',  'gonodontis-pallida',  'hallicarnia-albipectus',  'helicopage-hirundinalis',  'hemithea-ochrolauta',  'heracula-discivitta',  'hermonassa-incisa',  'herochroma-subspoliata',  'heterolocha-mariailgeae',  'heterostegania-lunulosa',  'himantopterus-dohertyi',  'hirasa-cuprearia',  'homodes-bracteigutta',  'hyperaeschrella-nigribasis',  'hypocometa-decussata',  'hypolamprus-ypsilon',  'hypomecis-infixaria',  'hypomecis-lioptilaria',  'hypomecis-reparata',  'hypomecis-tamilensis',  'idaea-chotaria',  'imaus-munda',  'imma-auxobathra',  'iraga-rugosa',  'jodis-argentilineata',  'jodis-argutaria',  'jodis-delicatula',  'jodis-pallescens',  'kosala-sanguinea',  'krananda-nepalensis',  'kunugia-placida',  'labanda-saturalis',  'laelia-umbrina',  'laothoe-witti',  'lassaba-interruptaria',  'lemaireia-luteopeplus',  'lemyra-pseudoburmanica',  'lepidopoda-heterogyna',  'leptomiza-parableta',  'leucinodella-leucostola',  'leucoblepsis-fenestraria',  'lithosiopsis-rectigramma',  'lobogonodes-porphyriata',  'loepa-diffunoccidentalis',  'lophobates-flavicosta',  'lophophelma-costistrigaria',  'luma-sericea',  'lymantria-nussi',  'lymantria-obfuscata',  'lyncestis-amphix',  'mabra-nigriscripta',  'maliattha-picata',  'maliattha-plumbata',  'maliattha-tegulata',  'maliattha-vialis-complex',  'mataeomera-obliquisigna',  'maxates-coelataria',  'mecodina-cyanodonta',  
'melittia-hampsoni',  'menophra-costistrigata',  'metallaxis-miniata',  'metallolophia-ocellata',  'metaterpna-differens',  'micrapatetis-flavipars',  'microcalicha-melanosticta',  'microselene-mesostipa',  'microselene-mopsa',  'miltochrista-chromatica',  'miltochrista-ila',  'miltochrista-uncalis',  'moca-purpurascens',  'monobolodes-simulans',  'mudaria-cornifrons',  'mudaria-leprosa',  'myrioblephara-marmorata',  'myrioblephara-pingasoides',  'namangana-cashmirensis',  'nannoarctia-himalayana',  'nebula-homophana',  'negritothripa-orbifera',  'neocalyptis-affinisana',  'neocerura-thomasi',  'neoreta-olga',  'neoris-huttoni',  'niaccaba-sumptualis',  'nistra-coelatalis',  'nola-infralba',  'nothomastix-obliquifascialis',  'nothomiza-costinotata',  'numenes-flagrans',  'numenes-grisa',  'nycteola-mesoplaga',  'nymphicula-trimacula',  'odontocraspis-hasora',  'odontopera-bivittaria',  'odontopera-heydena',  'oenospila-strix',  'omiodes-maculicostalis',  'ophiorrhabda-mormopa',  'oreta-ancora',  'oreta-pavaca',  'ornithospila-lineata',  'orthobrachia-hirowatarii',  'orthobrachia-tenebrosa',  'orthocabera-ocernaria',  'ovipennis-dudgeoni',  'oxymacaria-brunneata',  'ozarba-mallarba',  'pagyda-lustralis',  'pagyda-straminealis',  'palirisa-lineosa',  'pangrapta-albistigma',  'pangrapta-shivula',  'pantana-substrigosa',  'paradiopa-postfusca',  'paramaxates-posterecta',  'paramaxates-taiwana',  'parasynegia-lidderdalii',  'pardasena-verna',  'penicillaria-plusioides',  'perizoma-albofasciata',  'perizoma-plumbeata',  'perizoma-schistacea',  'peucela-subresectalis',  'phaiogramma-discessa',  'phalacra-excisa',  'phazaca-multistrigaria',  'phorica-phasipennis',  'photoscotosia-amplicata',  'photoscotosia-metachryseis',  'phragmacossia-brahmana',  'physetobasis-annulata',  'picrostomastis-subrosealis',  'pidorus-albifascia',  'pingasa-pseudoterpinaria',  'pingasa-subviridis',  'pingasa-venusta',  'planovalvata-confusa',  'platycerota-homoema',  
'platycerota-vitticostata',  'plusiodonta-auripicta',  'plutodes-nilgirica',  'plutodes-philornis',  'plutodes-quadratus',  'plutodes-subcaudata',  'plutodes-warreni',  'polymixis-albosignata',  'polyscia-viridispurca',  'polythlipta-distorta',  'polythlipta-euroalis',  'polythlipta-inconspicua',  'polythlipta-peragrata',  'pomasia-sparsata',  'pristostegania-trilineata',  'problepsis-crassinotata',  'problepsis-delphiaria',  'prolophota-trigonifera',  'prometopidia-conisaria',  'prometopidia-joshimathensis',  'prospalta-contigua',  'psaphis-euschemoides',  'pseudargyria-marginepunctalis',  'pseudeuchlora-kafebera',  'pseudeustrotia-semialba',  'psilalcis-subtochracea',  'pterogonia-cardinalis',  'ptyomaxia-syntaractis',  'pyrinioides-oquiho',  'racotis-keralaria',  'ramila-angustifimbrialis',  'ramila-marginella',  'raparna-transversa',  'rhagastis-acuta',  'rhodoneura-acaciusalis',  'rinaca-cachara',  'rinaca-grotei',  'rondotia-diaphana',  'rosama-auritracta',  'rubrindiania-cardinalis',  'rusicada-lineosa',  'salma-nubilalis',  'sarbanissa-albifascia',  'sarcinodes-carnearia',  'sarcinodes-debitaria',  'scardamia-seminigra',  'scopula-mecysma',  'scopula-vicina',  'scrobigera-amatrix',  'semidonta-basalis',  'serratophyga-subangulata',  'siccia-amnaea',  'siglophora-haematica',  'sinobirma-bouyeri',  'smerinthulus-mirabilis',  'solus-drepanoides',  'sonagara-strigipennis',  'speiredonia-itynx',  'spilarctia-gopara',  'spilosoma-erythrozona',  'stenoloba-glaucescens',  'stenopsestis-alternata',  'stericta-divitalis',  'stictane-fractilinea',  'stictane-rectilinea',  'stigmatophora-strigivenata',  'stigmatophora-zolotuhini',  'streblote-igniflua',  'striatella-multistriata',  'strotihypera-semiochrea',  'suerkenola-longiventris',  'symmoracma-minoralis',  'synclera-danalis',  'syntypistis-nigribasalis',  'tabidia-aculealis',  'tamba-multiplaga',  'tamba-nigrilinea',  'tamba-venusta',  'tarika-varana',  'tarsolepis-rufobrunnea',  'tasta-argozana',  
'taviodes-fulvescens',  'teuloma-oblitterans',  'theorica-malnadense',  'theretra-griseomarginata',  'thyrassia-virescens',  'timandromorpha-discolor',  'tiruvaca-hollowayi',  'tortriciforma-viridipuncta',  'tyana-magniplaga',  'tyana-pustulifera',  'tycracona-obliqua',  'tyspanodes-hypsalis',  'uthinia-albisignalis',  'varmina-indica',  'venusia-crassisigna',  'viridifentonia-plagiviridis',  'xenochroa-costiplaga',  'xenographia-semifusca',  'xestia-pseudoaccipiter',  'zeheba-aureatoides',  'zurobata-fissifascia',  'zurobata-rorata']
# Scrape the hard-coded moth classes listed above sequentially in the calling thread (no thread pool).
scrape(class_names)

[ MainThread               ]  Processing abaciscus-figlina | taxon_id:870185
[ MainThread               ]  SUCCESS:     0 | FAILURE:     0 | EXISTS:     6
[ MainThread               ]  Processing abraxas-fasciaria | taxon_id:1078289
[ MainThread               ]  SUCCESS:     1 | FAILURE:     0 | EXISTS:     6
[ MainThread               ]  Processing abraxas-irrorata | taxon_id:1078320
[ MainThread               ]  SUCCESS:     1 | FAILURE:     0 | EXISTS:     6
[ MainThread               ]  Processing abraxas-latizonata | taxon_id:1078329
[ MainThread               ]  SUCCESS:     1 | FAILURE:     0 | EXISTS:     6
[ MainThread               ]  Processing abraxas-poliostrota | taxon_id:1078364
[ MainThread               ]  SUCCESS:     1 | FAILURE:     0 | EXISTS:     6
[ MainThread               ]  Processing absala-dorcada | taxon_id:1355179
[ MainThread               ]  SUCCESS:     2 | FAILURE:     0 | EXISTS:     6
[ MainThread               ]  Processing acolutha-flavipictaria | 

In [58]:
# Moth classes with 3 to 5 entries in their local data directory, scraped in
# batches of 5; classes already present under dataset_dir are skipped.
scrape_multithread(
    [
        class_name
        for class_name in moth_classes
        if 2 < len(os.listdir(f"{moth_data_dir}/{class_name}")) <= 5
    ],
    5,
    skip_existing_dir=True,
)

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-4_5   ]  Processing agnibesa-pictaria | taxon_id:637668
[ ThreadPoolExecutor-4_36  ]  Processing cirrhochrista-fuscusa | taxon_id:714730
[ ThreadPoolExecutor-4_28  ]  Processing calymera-endophaea | taxon_id:124638
[ ThreadPoolExecutor-4_0   ]  Processing abraxaphantes-perampla | taxon_id:805844
[ ThreadPoolExecutor-4_35  ]  Processing chorodna-vulpinaria | taxon_id:1151011
[ ThreadPoolExecutor-4_22  ]  Processing bivincula-kalikotei | taxon_id:1042272
[ ThreadPoolExecutor-4_9   ]  Processing amesia-sanguiflua | taxon_id:1371198
[ ThreadPoolExecutor-4_33  ]  Processing charitoprepes-lubricosa | taxon_id:1512084
[ ThreadPoolExecutor-4_17  ]  Processing asthena-albosignata | taxon_id:1023948
[ ThreadPoolExecutor-4_19  ]  Processing bagada-malayica | taxon_id:362158
[ ThreadPoolExecutor-4_30  ]  Processing cechetra-minor | taxon_id:548363
[ ThreadPoolExecutor-4_3   ]  Processing acria-ceramitis | taxon_id:205604
[ Thr

KeyboardInterrupt: 

In [59]:
# Moth classes with at most 10 entries in their local data directory, scraped in
# batches of 5; classes already present under dataset_dir are skipped.
scrape_multithread(
    [
        class_name
        for class_name in moth_classes
        if len(os.listdir(f"{moth_data_dir}/{class_name}")) <= 10
    ],
    5,
    skip_existing_dir=True,
)

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-5_36  ]  Processing athetis-transversa | taxon_id:1236357[ ThreadPoolExecutor-5_10  ]  Processing agathia-hemithearia | taxon_id:423606
[ ThreadPoolExecutor-5_29  ]  Processing arichanna-flavinigra | taxon_id:623596
[ ThreadPoolExecutor-5_33  ]  Processing artona-quadrimaculata | taxon_id:124699
[ ThreadPoolExecutor-5_25  ]  Processing aplochlora-dentisignata | taxon_id:1379490
[ ThreadPoolExecutor-5_8   ]  Processing aemene-maculifascia | taxon_id:1566903
[ ThreadPoolExecutor-5_32  ]  Processing artaxa-vitellina | taxon_id:1312740
[ ThreadPoolExecutor-5_3   ]  Processing achrosis-costimaculata | taxon_id:208077
[ ThreadPoolExecutor-5_9   ]  Processing agalope-hyalina | taxon_id:1371172
[ ThreadPoolExecutor-5_31  ]  Processing aroa-clara | taxon_id:805872
[ ThreadPoolExecutor-5_19  ]  Processing ammatho-umbrosa | taxon_id:1364478
[ ThreadPoolExecutor-5_17  ]  Processing amblychia-pardicelata | taxon_id:902377
[ Thr

In [6]:
# Hand-picked classes, one class per batch so each gets its own pool task.
classes = [
    "hippotion-rosetta",
    "hippotion-celerio",
    "theretra-alecto",
    "theretra-clotho",
    "eupterote-undata",
    "apona-caschmirensis",
    "apona-shevaroyensis",
    "theretra-nessus",
]
scrape_multithread(classes, 1)

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-0_6   ]  Processing apona-shevaroyensis | taxon_id:470642
[ MainThread               ]  Thread completed with result None
[ ThreadPoolExecutor-0_5   ]  Processing apona-caschmirensis | taxon_id:946274
[ ThreadPoolExecutor-0_3   ]  Processing theretra-clotho | taxon_id:362231
[ ThreadPoolExecutor-0_7   ]  Processing theretra-nessus | taxon_id:1125145
[ ThreadPoolExecutor-0_2   ]  Processing theretra-alecto | taxon_id:505254
[ MainThread               ]  Thread completed with result None
[ ThreadPoolExecutor-0_6   ]  SUCCESS:     0 | FAILURE:     0 | EXISTS:     2
[ ThreadPoolExecutor-0_7   ]  SUCCESS:     0 | FAILURE:     0 | EXISTS:     5
[ ThreadPoolExecutor-0_5   ]  SUCCESS:     0 | FAILURE:     0 | EXISTS:     8
[ ThreadPoolExecutor-0_3   ]  SUCCESS:     6 | FAILURE:     0 | EXISTS:     5
[ ThreadPoolExecutor-0_2   ]  SUCCESS:    41 | FAILURE:     0 | EXISTS:    33
[ MainThread               ]  Thread completed 

In [20]:
# Scrape a single class sequentially in the calling thread.
scrape(["hippotion-rosetta"])

[ MainThread               ]  Processing hippotion-rosetta | taxon_id:51999
[ MainThread               ]  SUCCESS:     0 | FAILURE:     0 | EXISTS:   216


In [18]:
# Same hand-picked classes, one per batch, this time skipping every class that
# already has a directory under dataset_dir.
classes = [
    "hippotion-rosetta",
    "hippotion-celerio",
    "theretra-alecto",
    "theretra-clotho",
    "eupterote-undata",
    "apona-caschmirensis",
    "apona-shevaroyensis",
    "theretra-nessus",
]
scrape_multithread(classes, 1, skip_existing_dir=True)

[ MainThread               ]  Starting scraping...
[ MainThread               ]  Thread completed with result None
[ ThreadPoolExecutor-1_2   ]  Processing eupterote-undata | taxon_id:563430
[ ThreadPoolExecutor-1_0   ]  Processing hippotion-celerio | taxon_id:199458
[ ThreadPoolExecutor-1_2   ]  SUCCESS:   139 | FAILURE:     0 | EXISTS:     0
[ ThreadPoolExecutor-1_0   ]  SUCCESS:   186 | FAILURE:     0 | EXISTS:     0
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Scraping completed


In [26]:
# Moth classes with at most 10 entries in their local data directory, one class
# per batch; classes already present under dataset_dir are skipped.
scrape_multithread(
    [
        class_name
        for class_name in moth_classes
        if len(os.listdir(f"{moth_data_dir}/{class_name}")) <= 10
    ],
    1,
    skip_existing_dir=True,
)

[ MainThread               ]  Starting scraping...
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ ThreadPoolExecutor-4_2   ]  Processing abraxas-poliostrota | taxon_id:1078364
[ ThreadPoolExecutor-4_0   ]  Processing abraxas-irrorata | taxon_id:1078320
[ ThreadPoolExecutor-4_3   ]  Processing abrostola-anophioides | taxon_id:712553
[ ThreadPoolExecutor-4_18  ]  Processing alophogaster-rubribasis | taxon_id:872286
[ ThreadPoolExecutor-4_34  ]  Processing bradina-translinealis | taxon_id:1285646
[ ThreadPoolExecutor-4_24  ]  Processing andraca-trilochoides | taxon_id:1343009
[ ThreadPoolExecutor-4_4   ]  Processing agassiziella-fuscifusale | taxon_id:1418068
[ ThreadPoolExecutor-4_25  ]  Processing arichanna-schnitzleri | taxon_id:1379481
[ ThreadPoolExecutor-4_5   ]  Processing a

# Butterfly

In [52]:
# Butterfly classes with at most 5 entries in their local data directory,
# scraped in batches of 5 (existing output directories are not skipped).
scrape_multithread(
    [
        class_name
        for class_name in butterfly_classes
        if len(os.listdir(f"{butterfly_data_dir}/{class_name}")) <= 5
    ],
    5,
)

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-3_28  ]  Processing tajuria-luculentus | taxon_id:1340089
[ ThreadPoolExecutor-3_5   ]  Processing catopsilia-scylla | taxon_id:977887
[ ThreadPoolExecutor-3_21  ]  Processing pieris-extensa | taxon_id:922386
[ ThreadPoolExecutor-3_30  ]  Processing ypthima-methora | taxon_id:1072512
[ ThreadPoolExecutor-3_17  ]  Processing lotongus-sarala | taxon_id:1581412
[ ThreadPoolExecutor-3_27  ]  Processing stimula-swinhoei | taxon_id:1502833
[ ThreadPoolExecutor-3_0   ]  Processing abisara-burnii | taxon_id:712538
[ ThreadPoolExecutor-3_23  ]  Processing potanthus-trachala | taxon_id:1052348
[ ThreadPoolExecutor-3_29  ]  Processing tongeia-kala | taxon_id:1112196
[ ThreadPoolExecutor-3_14  ]  Processing hyponephele-davendra | taxon_id:1503448
[ ThreadPoolExecutor-3_11  ]  Processing euploea-scherzeri | taxon_id:100769
[ ThreadPoolExecutor-3_24  ]  Processing pyrgus-cashmirensis | taxon_id:1480287
[ ThreadPoolExecutor-3_6  

In [60]:
# Butterfly classes with at most 10 entries in their local data directory,
# scraped in batches of 5; classes already present under dataset_dir are skipped.
scrape_multithread(
    [
        class_name
        for class_name in butterfly_classes
        if len(os.listdir(f"{butterfly_data_dir}/{class_name}")) <= 10
    ],
    5,
    skip_existing_dir=True,
)

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-6_28  ]  Processing lethe-jalaurida | taxon_id:1557842
[ ThreadPoolExecutor-6_24  ]  Processing hipparchia-parisatis | taxon_id:1504800
[ ThreadPoolExecutor-6_30  ]  Processing limenitis-ligyes | taxon_id:996460
[ ThreadPoolExecutor-6_8   ]  Processing boloria-jerdoni | taxon_id:1542627
[ ThreadPoolExecutor-6_36  ]  Processing neptis-palnica | taxon_id:1540880
[ ThreadPoolExecutor-6_5   ]  Processing arhopala-comica | taxon_id:1417997
[ ThreadPoolExecutor-6_39  ]  Processing pithecops-fulgens | taxon_id:358769
[ ThreadPoolExecutor-6_46  ]  Processing stichophthalma-sparta | taxon_id:788025
[ ThreadPoolExecutor-6_16  ]  Processing cigaritis-evansii | taxon_id:1504248
[ ThreadPoolExecutor-6_34  ]  Processing nacaduba-pavana | taxon_id:498173
[ ThreadPoolExecutor-6_18  ]  Processing delias-lativitta | taxon_id:1510935
[ ThreadPoolExecutor-6_31  ]  Processing melanocyma-faunula | taxon_id:205358
[ ThreadPoolExecutor-6_

In [63]:
# Four Euploea classes, one class per batch.
scrape_multithread(
    [
        "euploea-sylvester",
        "euploea-mulciber",
        "euploea-godartii",
        "euploea-midamus",
    ],
    1,
)

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-7_3   ]  Processing euploea-midamus | taxon_id:358983
[ ThreadPoolExecutor-7_0   ]  Processing euploea-sylvester | taxon_id:781801
[ ThreadPoolExecutor-7_1   ]  Processing euploea-mulciber | taxon_id:358889
[ ThreadPoolExecutor-7_3   ]  SUCCESS:    13 | FAILURE:     0 | EXISTS:     0
[ ThreadPoolExecutor-7_0   ]  SUCCESS:    42 | FAILURE:     0 | EXISTS:   110
[ MainThread               ]  Thread completed with result None
[ ThreadPoolExecutor-7_1   ]  SUCCESS:   136 | FAILURE:     0 | EXISTS:     0
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Thread completed with result None
[ MainThread               ]  Scraping completed


In [27]:
# Butterfly classes with at most 10 entries in their local data directory, one
# class per batch; classes already present under dataset_dir are skipped.
scrape_multithread(
    [
        class_name
        for class_name in butterfly_classes
        if len(os.listdir(f"{butterfly_data_dir}/{class_name}")) <= 10
    ],
    1,
    skip_existing_dir=True,
)

[ MainThread               ]  Starting scraping...
[ ThreadPoolExecutor-5_0   ]  Processing abisara-attenuata | taxon_id:1050620
[ ThreadPoolExecutor-5_2   ]  Processing acupicta-delicatum | taxon_id:560565
[ ThreadPoolExecutor-5_22  ]  Processing arhopala-belphoebe | taxon_id:1584321
[ ThreadPoolExecutor-5_9   ]  Processing albulina-galathea | taxon_id:1347878
[ ThreadPoolExecutor-5_18  ]  Processing argynnis-westphali | taxon_id:1426242
[ ThreadPoolExecutor-5_4   ]  Processing aglais-rizana | taxon_id:357612
[ ThreadPoolExecutor-5_3   ]  Processing aeromachus-kali | taxon_id:337530
[ ThreadPoolExecutor-5_17  ]  Processing arhopala-ammonides | taxon_id:470744
[ ThreadPoolExecutor-5_41  ]  Processing callenya-melaena | taxon_id:783321
[ ThreadPoolExecutor-5_15  ]  Processing aporia-nabellica | taxon_id:1203656
[ ThreadPoolExecutor-5_28  ]  Processing arhopala-oenea | taxon_id:785025
[ ThreadPoolExecutor-5_5   ]  Processing agriades-jaloka | taxon_id:1073717
[ ThreadPoolExecutor-5_31  ]

# Count

In [3]:
# Directory-name suffixes that mark early-stage and unidentified classes.
early_regex = r"^.*-(early)$"
unidentified_regex = r"^.*-(spp|genera|genera-spp)$"

# Number of entries in every class directory under dataset_dir.
classes = {
    class_dir: len(os.listdir(f"{dataset_dir}/{class_dir}"))
    for class_dir in os.listdir(f"{dataset_dir}")
}
early_classes = {name: size for name, size in classes.items() if re.match(early_regex, name)}
unidentified_classes = {name: size for name, size in classes.items() if re.match(unidentified_regex, name)}

# Identified-adult figures are whatever remains after removing the two groups above.
print(f"Total Class count : {len(classes):6} ( Unidentified: {len(unidentified_classes):6} / Early-stage: {len(early_classes):6} / Identified-adult: {len(classes) - len(unidentified_classes) - len(early_classes):6} )")
print(f"Total  Data count : {sum(classes.values()):6} ( Unidentified: {sum(unidentified_classes.values()):6} / Early-stage: {sum(early_classes.values()):6} / Identified-adult: {sum(classes.values()) - sum(unidentified_classes.values()) - sum(early_classes.values()):6} )")

Total Class count :   1197 ( Unidentified:      0 / Early-stage:     88 / Identified-adult:   1109 )
Total  Data count :  35734 ( Unidentified:      0 / Early-stage:   2261 / Identified-adult:  33473 )
