In [6]:
# !pip install python-Levenshtein

In [2]:
import shutil
import os
import time
import datetime
import re
from pathlib import Path
from PIL import Image
import Levenshtein

In [6]:
# copy flat test directory to hierarchical test directory

def flat_to_hierarchical_test_dir(src_dir, target_dir):
    """Copy a flat directory of files into one sub-directory per class.

    The class name is derived from each file name: a trailing ``.jpg``,
    ``.jpeg`` or ``.png`` extension (any letter case) is dropped, every
    ``__`` is removed, a trailing ``-<digits>`` counter is stripped, and the
    result is lower-cased. The file is then copied, under its original name,
    into ``<target_dir>/<class name>/``.

    Args:
        src_dir: Directory whose regular files are copied; sub-directories
            are ignored.
        target_dir: Directory under which the per-class folders are created
            (missing folders are created on demand).
    """
    for file in Path(src_dir).iterdir():
        if not file.is_file():
            continue
        # The dot is escaped so that only a real extension is stripped (an
        # unescaped "." would also eat e.g. the "t" of "notjpg"), and the
        # match ignores case so "X.JPG" does not end up in a folder "x.jpg".
        species_name = re.sub(r"\.(jpg|png|jpeg)$", "", file.name, flags=re.IGNORECASE)
        species_name = re.sub(r"__|(-[0-9]+$)", "", species_name)
        species_dir = f"{target_dir}/{species_name.lower()}"
        os.makedirs(species_dir, exist_ok=True)
        shutil.copy(file, species_dir)

# Rebuild each family's hierarchical "val" folder from its flat test folders.
for family in ("moth", "butterfly"):
    target_dir = f"insect-dataset/{family}/val"
    # Remove the output of any earlier run before copying again.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    for src in ("random-test", "my-test"):
        source_dir = f"insect-dataset/{family}/{src}"
        flat_to_hierarchical_test_dir(source_dir, target_dir)

In [2]:
def _available_path(path):
    """Return ``path`` if nothing exists there, else a numbered variant of it."""
    if not os.path.exists(path):
        return path
    stem, ext = os.path.splitext(path)
    counter = 1
    while os.path.exists(f"{stem}_{counter}{ext}"):
        counter += 1
    return f"{stem}_{counter}{ext}"


def rename_and_merge_folder(src, dst):
    """Move folder ``src`` to ``dst``, merging into ``dst`` if it already exists.

    When ``dst`` does not exist this is a plain move. Otherwise the contents
    of ``src`` are moved into ``dst`` one by one (sub-folders are merged
    recursively) and the emptied ``src`` is removed.

    A file whose name is already taken in ``dst`` is moved under a numbered
    name (``name_1.ext``, ``name_2.ext``, ...) instead of overwriting the
    existing entry, so merging two folders never discards a file.

    Args:
        src: Path of the folder to move (``str`` or path-like).
        dst: Path the folder should end up at (``str`` or path-like).

    Raises:
        FileNotFoundError: If ``src`` does not exist.
    """
    if not os.path.exists(dst):
        # Nothing to merge with: a plain move/rename is enough.
        shutil.move(src, dst)
        return
    if os.path.samefile(src, dst):
        # Both paths name the same folder; merging it into itself is a no-op.
        return
    for item in os.listdir(src):
        src_path = os.path.join(src, item)
        dst_path = os.path.join(dst, item)
        if os.path.isdir(src_path):
            rename_and_merge_folder(src_path, dst_path)
        else:
            # shutil.move would silently replace an existing destination file.
            shutil.move(src_path, _available_path(dst_path))
    os.rmdir(src)

In [45]:
# Rename class folders ending in "-spp2" or "-genera-spp" to end in "-spp",
# merging them into the "-spp" folder when one already exists.

for dataset_dir in ["insect-dataset/lepidoptera/data"]:
    for file in Path(dataset_dir).iterdir():
        if not file.is_dir():
            continue
        if not file.name.endswith(("-spp2", "-genera-spp")):
            continue
        renamed = re.sub(r"-(spp2|genera-spp)$", "-spp", file.name)
        print(f"Renaming {file.name} to {renamed}")
        rename_and_merge_folder(file, f"{dataset_dir}/{renamed}")

In [19]:
# find classes with possible typo

def num_classes_match_genus(class1, files):
    """Count the entries of ``files`` that belong to the same genus as ``class1``.

    The genus of a class name is the text before its first ``-``. The
    comparison is on the whole genus token, not on a prefix, so ``parasa``
    does not also count ``parasarpa-...`` classes.

    Args:
        class1: Class name such as ``"parasa-spp"`` or ``"lethe-sura"``.
        files: Iterable of objects with a ``name`` attribute holding a class
            name (e.g. ``pathlib.Path`` entries).

    Returns:
        The number of entries whose genus equals the genus of ``class1``.
    """
    genus = class1.split('-')[0]
    return sum(1 for file in files if file.name.split('-')[0] == genus)

threshold = 1  # report pairs of class names at most this many edits apart
for dataset_dir in ["insect-dataset/lepidoptera/data"]:
    # Scan the directory once and sort by name: iterdir() yields entries in
    # arbitrary order, which would make the report differ between runs.
    class_dirs = sorted(
        (entry for entry in Path(dataset_dir).iterdir() if entry.is_dir()),
        key=lambda entry: entry.name,
    )
    genus_classes = [entry for entry in class_dirs if entry.name.endswith("-spp")]
    species_classes = [entry for entry in class_dirs if not entry.name.endswith("-spp")]

    # Pairs of "-spp" classes with near-identical names. The number printed
    # after each name is how many non-"-spp" classes share its genus.
    for i1, first in enumerate(genus_classes):
        for second in genus_classes[i1 + 1:]:
            if Levenshtein.distance(first.name, second.name) > threshold:
                continue
            # A "...dae-spp" / "...nae-spp" pair is not reported (presumably a
            # family next to its subfamily - confirm). The check is symmetric
            # so it does not depend on which of the two is listed first.
            is_dae_nae_pair = (
                first.name.endswith("dae-spp") and second.name.endswith("nae-spp")
            ) or (
                first.name.endswith("nae-spp") and second.name.endswith("dae-spp")
            )
            if is_dae_nae_pair:
                continue
            match1 = num_classes_match_genus(first.name, species_classes)
            match2 = num_classes_match_genus(second.name, species_classes)
            print(f"{first.name} ({match1}) <------> {second.name} ({match2})")
    print()

    # Pairs of non-"-spp" classes with near-identical names. When the two
    # names differ in the genus part, the per-genus class counts are printed
    # as well.
    for i1, first in enumerate(species_classes):
        for second in species_classes[i1 + 1:]:
            if Levenshtein.distance(first.name, second.name) > threshold:
                continue
            if first.name.split('-')[0] == second.name.split('-')[0]:
                print(f"{first.name} <------> {second.name}")
            else:
                match1 = num_classes_match_genus(first.name, species_classes)
                match2 = num_classes_match_genus(second.name, species_classes)
                print(f"{first.name} ({match1}) <------> {second.name} ({match2})")

achaea-spp (4) <------> acraea-spp (3)
arna-spp (1) <------> aroa-spp (3)
barasa-spp (0) <------> parasa-spp (7)
burara-spp (7) <------> buzara-spp (1)
cyana-spp (21) <------> tyana-spp (5)
idaea-spp (10) <------> idea-spp (2)
idea-spp (2) <------> udea-spp (2)

charaxes-dolon <------> charaxes-solon
lethe-dura <------> lethe-sura
neptis-nana <------> neptis-nata
parotis-marginata <------> parotis-marinata


In [18]:
# Class folders to rename: existing folder name -> name it should have.
# If the target folder already exists, the two folders are merged.
# NOTE(review): the right-hand names are treated as the dataset's canonical
# spelling; a few (e.g. "earias-vitella", "eurema-andersoni") may differ from
# the published binomials - confirm the direction of those entries.
to_rename = {
    # "lymantrinae-spp": "lymantriinae-spp",
    "acraea-terpsichore": "acraea-terpsicore",
    "appias-lyncinda": "appias-lyncida",
    "callopistria-pulchrillinea": "callopistria-pulchrilinea",
    "catapoecilma-major": "catapaecilma-major",
    "chasmina-cadida": "chasmina-candida",
    "conogethes-puntiferalis": "conogethes-punctiferalis",
    "earias-vittella": "earias-vitella",
    "elymnias-peali": "elymnias-pealii",
    "eurema-andersonii": "eurema-andersoni",
    "gesonia-obeiditalis": "gesonia-obeditalis",
    "homodes-propitia": "hamodes-propitia",
    "hyalobathra-opheltisalis": "hyalobathra-opheltesalis",
    "hypomecis-reparata": "hypomecis-separata",
    "leucophlebia-lineate": "leucophlebia-lineata",
    "orvasca-subnotota": "orvasca-subnotata",
    "polydesma-boarmioides": "polydesma-boarmoides",
    "spalgis-epeus": "spalgis-epius",
    "ypthima-chenu": "ypthima-chenui",
}

for dataset_dir in ["insect-dataset/lepidoptera/data"]:
    for src, dst in to_rename.items():
        src_dir = f"{dataset_dir}/{src}"
        if not os.path.exists(src_dir):
            # Already renamed by an earlier run, or never present: skip it so
            # the cell can be re-run without raising FileNotFoundError.
            print(f"Skipping {src}: {src_dir} does not exist")
            continue
        rename_and_merge_folder(src_dir, f"{dataset_dir}/{dst}")