In [6]:
# !pip install python-Levenshtein

In [2]:
import shutil
import os
import time
import datetime
import re
from pathlib import Path
from PIL import Image
import Levenshtein

In [6]:
# copy flat test directory to hierarchical test directory

def flat_to_hierarchical_test_dir(src_dir, target_dir):
    for file in Path(src_dir).iterdir():
        if file.is_file():
            species_name = re.sub(r".(jpg|png|jpeg)$", "", file.name)
            species_name = re.sub(r"__|(-[0-9]+$)", "", species_name)
            # print(species_name)
            species_dir = f"{target_dir}/{species_name.lower()}"
            if not os.path.exists(species_dir):
                os.makedirs(species_dir)
            shutil.copy(file, species_dir)

for family in ["moth", "butterfly"]:
    target_dir = f"insect-dataset/{family}/val"
    if os.path.exists(target_dir):
            shutil.rmtree(target_dir)
    for src in ["random-test", "my-test"]:
        flat_to_hierarchical_test_dir(f"insect-dataset/{family}/{src}", target_dir)

In [30]:
def rename_and_merge_folder(src, dst):
    if os.path.exists(dst):
        for item in os.listdir(src):
            src_path = os.path.join(src, item)
            dst_path = os.path.join(dst, item)
            if os.path.isdir(src_path):
                rename_and_merge_folder(src_path, dst_path)
            else:
                shutil.move(src_path, dst_path)
        os.rmdir(src)
    else:
        shutil.move(src, dst)

In [45]:
# find classes with -spp2 or -genera-spp

for dataset_dir in [f"insect-dataset/lepidoptera/data"]:
    for file in Path(dataset_dir).iterdir():
        if file.is_dir() and (file.name.endswith("-spp2") or file.name.endswith("-genera-spp")):
            renamed = re.sub(r"-(spp2|genera-spp)$", "-spp", file.name)
            print(f"Renaming {file.name} to {renamed}")
            rename_and_merge_folder(file, f"{dataset_dir}/{renamed}")

In [42]:
# find classes with possible typo

def num_classes_match_genus(class1, files):
    genus = class1.split('-')[0]
    return len([ file.name for file in files if file.name.startswith(genus) ])

for dataset_dir in [f"insect-dataset/lepidoptera/data"]:
    genus_classes = [ file for file in Path(dataset_dir).iterdir() if file.is_dir() and file.name.endswith("-spp") ]
    species_classes = [ file for file in Path(dataset_dir).iterdir() if file.is_dir() and not file.name.endswith("-spp") ]
    for i1 in range(0, len(genus_classes)):
        for i2 in range(i1 + 1, len(genus_classes)):
            dist = Levenshtein.distance(genus_classes[i1].name, genus_classes[i2].name)
            if dist <= 1 and not (genus_classes[i1].name.endswith("dae-spp") and genus_classes[i2].name.endswith("nae-spp")):
                match1 = num_classes_match_genus(genus_classes[i1].name, species_classes)
                match2 = num_classes_match_genus(genus_classes[i2].name, species_classes)
                print(f"{genus_classes[i1].name} ({match1}) <------> {genus_classes[i2].name} ({match2})")

achaea-spp (4) <------> acraea-spp (3)
arna-spp (1) <------> aroa-spp (3)
barasa-spp (0) <------> parasa-spp (7)
burara-spp (7) <------> buzara-spp (1)
cyana-spp (21) <------> tyana-spp (5)
idaea-spp (10) <------> idea-spp (2)
idea-spp (2) <------> udea-spp (2)


In [62]:
for dataset_dir in [f"insect-dataset/lepidoptera/data"]:
    genus_classes = [ file for file in Path(dataset_dir).iterdir() if file.is_dir() and file.name.endswith("-spp") ]
    species_classes = [ file for file in Path(dataset_dir).iterdir() if file.is_dir() and not file.name.endswith("-spp") ]
    for i1 in range(0, len(species_classes)):
        for i2 in range(i1 + 1, len(species_classes)):
            dist = Levenshtein.distance(species_classes[i1].name, species_classes[i2].name)
            if dist <= 2:
                if species_classes[i1].name.split('-')[0] == species_classes[i2].name.split('-')[0]:
                    print(f"{species_classes[i1].name} <------> {species_classes[i2].name}")
                else:
                    match1 = num_classes_match_genus(species_classes[i1].name, species_classes)
                    match2 = num_classes_match_genus(species_classes[i2].name, species_classes)
                    print(f"{species_classes[i1].name} ({match1}) <------> {species_classes[i2].name} ({match2})")

arhopala-paramuta <------> arhopala-perimuta
athyma-kanwa <------> athyma-ranga
bassarona-recta <------> bassarona-teuta
bastilla-acuta <------> bastilla-arcuata
charaxes-dolon <------> charaxes-solon
episteme-adulatrix <------> episteme-maculatrix
eudocima-falonia <------> eudocima-phalonia
euproctis-madana <------> euproctis-magna
euthalia-duda <------> euthalia-durga
euthalia-nais <------> euthalia-nara
lethe-baladeva <------> lethe-ramadeva
lethe-dura <------> lethe-sura
miltochrista-undulata <------> miltochrista-undulosa
mimathyma-bhavana <------> mimathyma-chevana
neope-pulaha <------> neope-pulahina
neptis-ananta <------> neptis-nana
neptis-ananta <------> neptis-nata
neptis-namba <------> neptis-nana
neptis-namba <------> neptis-nata
neptis-nana <------> neptis-nata
papilio-agenor <------> papilio-agestor
papilio-chaon <------> papilio-machaon
parotis-marginata <------> parotis-marinata
pontia-callidice <------> pontia-daplidice
problepsis-deliaria <------> problepsis-delphiar

In [66]:
to_rename = {
    "gangarides-rosea": "gangarides-roseus",
    "lycaena-panava": "lycaena-pavana",
    "istrugia-disputaria": "isturgia-disputaria",
    "jamides-caeruleus": "jamides-caerulea",
    "hippoton-roseta": "hippotion-rosetta",
    "margina-argus": "mangina-argus",
    "sphenarches-anisodactyla": "sphenarches-anisodactylus",
    "sticopthalma-camadeva": "stichophthalma-camadeva",
    "tarucus-balkanicus": "tarucus-balkanica",
    "udara-dilectus": "udara-dilecta",
    "argina-argus": "margina-argus",
    "brevipecten-captatus": "brevipecten-captata",
    "amata-cysseus": "amata-cyssea",
    "pedesta-pandita": "pedesta-panda",
    "abisara-echeria": "abisara-echerius",
    "lymantrinae-spp": "lymantriinae-spp",
    "acraea-terpsichore": "acraea-terpsicore",
    "appias-lyncinda": "appias-lyncida",
    "callopistria-pulchrillinea": "callopistria-pulchrilinea",
    "catapoecilma-major": "catapaecilma-major",
    "chasmina-cadida": "chasmina-candida",
    "conogethes-puntiferalis": "conogethes-punctiferalis",
    "earias-vittella": "earias-vitella",
    "elymnias-peali": "elymnias-pealii",
    "eurema-andersonii": "eurema-andersoni",
    "gesonia-obeiditalis": "gesonia-obeditalis",
    "homodes-propitia": "hamodes-propitia",
    "hyalobathra-opheltisalis": "hyalobathra-opheltesalis",
    "hypomecis-reparata": "hypomecis-separata",
    "leucophlebia-lineate": "leucophlebia-lineata",
    "orvasca-subnotota": "orvasca-subnotata",
    "polydesma-boarmioides": "polydesma-boarmoides",
    "spalgis-epeus": "spalgis-epius",
    "ypthima-chenu": "ypthima-chenui",
    "ypthima-hübneri": "ypthima-huebneri"
}

for dataset_dir in [f"insect-dataset/lepidoptera/data"]:
    for src, dst in to_rename.items():
        if os.path.exists(f"{dataset_dir}/{src}"):
            print(f"Renaming {dataset_dir}/{src} to {dataset_dir}/{dst}")
            rename_and_merge_folder(f"{dataset_dir}/{src}", f"{dataset_dir}/{dst}")

In [63]:
mothsofindia = { class_dir: len([ img for img in os.listdir(f"insect-dataset/moth/data/{class_dir}") ]) for class_dir in os.listdir(f"insect-dataset/moth/data") }
print(f"Class count: {len(mothsofindia)}")
print(f"Data count : {sum(mothsofindia.values())}")

Class count: 3364
Data count : 44652


In [64]:
ifoundbutterflies = { class_dir: len([ img for img in os.listdir(f"insect-dataset/butterfly/data/{class_dir}") ]) for class_dir in os.listdir(f"insect-dataset/butterfly/data") }
print(f"Class count: {len(ifoundbutterflies)}")
print(f"Data count : {sum(ifoundbutterflies.values())}")

Class count: 1554
Data count : 66362


In [65]:
all_data = { class_dir: len([ img for img in os.listdir(f"insect-dataset/lepidoptera/data/{class_dir}") ]) for class_dir in os.listdir(f"insect-dataset/lepidoptera/data") }
print(f"Class count: {len(all_data)}")
print(f"Data count : {sum(all_data.values())}")

diff = [ x for x in (set(all_data.keys()) - set(mothsofindia.keys()) - set(ifoundbutterflies.keys())) ]
print(len(diff))
diff2 = [ x for x in diff if not x.endswith('-spp') ]
diff2.sort()
print(len(diff2))
diff2

Class count: 4742
Data count : 111499
622
278


['abraxas-sylvata',
 'acraea-violae',
 'adites-frigida',
 'aedia-acronyctoides',
 'aedia-leucomelas',
 'aemene-taprobanis',
 'aglais-urticae',
 'agrotis-segetum',
 'aiteta-diurna',
 'alsophila-nilgirensis',
 'amata-phegea',
 'amyna-selenampha',
 'ancylolomia-saundersiella',
 'ancylolomia-taprobanensis',
 'anomis-privata',
 'anomis-sabulifera',
 'antheraea-mylitta',
 'antheua-exanthemata',
 'antitrygodes-cuneilinea',
 'antitrygodes-divisaria',
 'anuga-lunulata',
 'apatura-ambica',
 'arhopala-pseudocentaurus',
 'asota-egens',
 'astygisa-albopunctata',
 'asura-conferta',
 'athetis-thoracica',
 'athyma-larymna',
 'athyma-nefte',
 'atrophaneura-pandiyana',
 'avatha-bubo',
 'banisia-myrsusalis',
 'barsine-cardinalis',
 'barsine-striata',
 'bastilla-torrida',
 'batracharta-irrorata',
 'blenina-donans',
 'bombyx-mori',
 'botyodes-flavibasalis',
 'botyodes-patulalis',
 'bradina-melanoperas',
 'caleta-caleta',
 'caligula-simla',
 'caltoris-cahira',
 'calyptra-minuticornis',
 'cania-bilinea',
 'c