In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import mynnlib
from mynnlib import *

In [5]:
# !pip install imagehash
# import imagehash
from itertools import chain

# Upper bound on the number of images create_dataset() saves per class folder.
max_imx_cnt = 6
# Side length in pixels that resize() caps images to; create_dataset() skips
# any processed image whose width ends up smaller than this value.
max_img_size = 300

def is_black(pixel):
    """Tell whether a pixel counts as black.

    Only the first three channels of `pixel` are inspected; the pixel is
    black when each of them is at most 10.
    """
    threshold = 10
    for channel in range(3):
        # Channels are checked in order and the scan stops at the first one
        # that is brighter than the threshold.
        if not pixel[channel] <= threshold:
            return False
    return True

def crop_header_footer(img):
    """Crop `img` vertically between two rows found on its centre column.

    The upper row is searched upwards from 15% of the height and the lower
    row downwards from 85% of the height. Each search stops at the first
    pixel that is_black() accepts, or at the first / last row of the image.
    The full width is kept.

    Returns the result of `img.crop` for the box
    (0, upper row, width, lower row).
    """
    width, height = img.size
    mid_x = int(width / 2)

    # Walk upwards until a black pixel or the top edge is reached.
    top = int(height * 0.15)
    while True:
        if is_black(img.getpixel((mid_x, top))) or top <= 0:
            break
        top -= 1

    # Walk downwards until a black pixel or the bottom edge is reached.
    bottom = int(height * 0.85)
    last_row = height - 1
    while True:
        if is_black(img.getpixel((mid_x, bottom))) or bottom >= last_row:
            break
        bottom += 1

    # NOTE(review): the crop box's lower bound is exclusive, so row `bottom`
    # itself is dropped while row `top` is kept — confirm this is intended.
    box = (0, top, width, bottom)
    return img.crop(box)

def center_crop(img):
    """Crop the largest centred square out of `img`.

    The square's side is min(width, height). The box is computed with
    integer arithmetic so that the result is always exactly
    `side x side`. Float coordinates are rounded independently by the
    imaging library, which can produce a crop that is one pixel wider than
    it is tall (e.g. for a 302x301 input).

    Parameters:
        img: object exposing `size` as (width, height) and a `crop(box)`
            method taking (left, top, right, bottom).

    Returns:
        Whatever `img.crop` returns for the centred square box.
    """
    width, height = img.size
    square_size = min(width, height)
    left = (width - square_size) // 2
    top = (height - square_size) // 2
    # Derive right/bottom from left/top so the box is square by construction.
    return img.crop((left, top, left + square_size, top + square_size))

def resize(img):
    """Resize `img` to a square using the LANCZOS filter.

    The square's side is the image width, capped at `max_img_size`. The
    image height is not taken into account.
    """
    width, _ = img.size
    side = width if width < max_img_size else max_img_size
    target = (side, side)
    return img.resize(target, Image.LANCZOS)

def _add_species_images(src_dirs, class_name, class_dst):
    """Write up to `max_imx_cnt` processed images of one class into `class_dst`.

    Sources are visited in the order given and files in sorted name order, so
    repeated runs pick the same images. Returns the number of images written.
    """
    img_cnt = 0
    for src in src_dirs:
        class_src = f"{src}/{class_name}"
        # A class need not be present in every source, whatever the mode.
        if not os.path.isdir(class_src):
            continue
        for file in sorted(os.listdir(class_src)):
            file_path = f"{class_src}/{file}"
            if not os.path.isfile(file_path):
                continue
            # Close the file handle as soon as the pixels are converted.
            with Image.open(file_path) as raw:
                img = raw.convert("RGB")
            img = resize(center_crop(crop_header_footer(img)))
            # Skip images that are narrower than the target size.
            if img.size[0] < max_img_size:
                continue
            # TODO: near-duplicate filtering (perceptual hash) is not performed.
            os.makedirs(class_dst, exist_ok=True)
            img.save(f"{class_dst}/{img_cnt + 1}.jpg", format="JPEG", quality=50)
            img_cnt += 1
            if img_cnt >= max_imx_cnt:
                return img_cnt
    return img_cnt


def create_dataset(src_dirs, dst, add_species_only=False):
    """Build a folder of small per-class sample images from several sources.

    Every sub-directory found directly under any existing entry of
    `src_dirs` is treated as a class. For each class that has no folder under
    `dst` yet, images are read from the sources, passed through
    crop_header_footer(), center_crop() and resize(), and saved as
    `{dst}/{class_name}/{n}.jpg` (JPEG, quality 50), at most `max_imx_cnt`
    per class.

    Parameters:
        src_dirs: iterable of source directory paths; missing ones are ignored.
        dst: output directory.
        add_species_only: when False, an existing `dst` is deleted first so
            the dataset is rebuilt from scratch; when True, existing class
            folders are left untouched and only new classes are added.

    Prints the number of classes for which at least one image was written.
    """
    if not add_species_only and os.path.exists(dst):
        shutil.rmtree(dst)

    # Only directories are classes; stray files in a source are ignored.
    class_names = sorted({
        entry
        for src in src_dirs if os.path.isdir(src)
        for entry in os.listdir(src)
        if os.path.isdir(f"{src}/{entry}")
    })

    species_added_cnt = 0
    for class_name in class_names:
        class_dst = f"{dst}/{class_name}"
        if os.path.exists(class_dst):
            continue
        # Count a species only when something was actually written for it.
        if _add_species_images(src_dirs, class_name, class_dst) > 0:
            species_added_cnt += 1
    print(f"{species_added_cnt} species added")

In [6]:
import zipfile

def zip_folder(folder_path, zip_filename):
    """Pack every file under `folder_path` into a deflate-compressed zip, then delete the folder.

    Archive member names are the file paths relative to `folder_path`.
    The folder is removed only after the archive has been closed.
    """
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _subdirs, filenames in os.walk(folder_path):
            for filename in filenames:
                source = os.path.join(dirpath, filename)
                member_name = os.path.relpath(source, folder_path)
                archive.write(source, member_name)
    shutil.rmtree(folder_path)

def unzip_file(zip_path, extract_to):
    """Extract all members of the zip archive at `zip_path` into `extract_to`."""
    archive = zipfile.ZipFile(zip_path, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()

In [7]:
# Refresh the Lepidoptera image archive: unpack the existing zip (if any),
# add folders for species that are not in it yet, then re-zip the folder
# (zip_folder also deletes the working folder).
lepidoptera_dir = "models/images.lepidoptera"
lepidoptera_zip = "models/images.lepidoptera.zip"
lepidoptera_sources = [
    "insect-dataset/src/ifoundbutterflies.org",
    "insect-dataset/src/mothsofindia.org",
    "insect-dataset/src/butterfly.inaturalist.org",
    "insect-dataset/src/moth.inaturalist.org",
    "insect-dataset/lepidoptera/data",
]

if os.path.exists(lepidoptera_zip):
    unzip_file(lepidoptera_zip, lepidoptera_dir)
create_dataset(lepidoptera_sources, lepidoptera_dir, add_species_only=True)
if os.path.exists(lepidoptera_dir):
    zip_folder(lepidoptera_dir, lepidoptera_zip)

384 species added


In [8]:
# Refresh the Odonata image archive: unpack the existing zip (if any),
# add folders for species that are not in it yet, then re-zip the folder
# (zip_folder also deletes the working folder).
odonata_dir = "models/images.odonata"
odonata_zip = "models/images.odonata.zip"
odonata_sources = [
    "insect-dataset/src/indianodonata.org",
    "insect-dataset/src/odonata.inaturalist.org",
    "insect-dataset/odonata/data",
]

if os.path.exists(odonata_zip):
    unzip_file(odonata_zip, odonata_dir)
create_dataset(odonata_sources, odonata_dir, add_species_only=True)
if os.path.exists(odonata_dir):
    zip_folder(odonata_dir, odonata_zip)

1 species added


In [8]:
# Refresh the cicada image archive: unpack the existing zip (if any),
# add folders for species that are not in it yet, then re-zip the folder
# (zip_folder also deletes the working folder).
cicada_dir = "models/images.cicada"
cicada_zip = "models/images.cicada.zip"
cicada_sources = [
    "insect-dataset/src/indiancicadas.org",
    "insect-dataset/src/cicada.inaturalist.org",
    "insect-dataset/cicada/data",
]

if os.path.exists(cicada_zip):
    unzip_file(cicada_zip, cicada_dir)
create_dataset(cicada_sources, cicada_dir, add_species_only=True)
if os.path.exists(cicada_dir):
    zip_folder(cicada_dir, cicada_zip)

0 species added


In [9]:
# Refresh the butterfly image archive: unpack the existing zip (if any),
# add folders for species that are not in it yet, then re-zip the folder
# (zip_folder also deletes the working folder).
butterfly_dir = "models/images.butterfly"
butterfly_zip = "models/images.butterfly.zip"
butterfly_sources = [
    "insect-dataset/src/ifoundbutterflies.org",
    "insect-dataset/src/butterfly.inaturalist.org",
    "insect-dataset/butterfly/data",
]

if os.path.exists(butterfly_zip):
    unzip_file(butterfly_zip, butterfly_dir)
create_dataset(butterfly_sources, butterfly_dir, add_species_only=True)
if os.path.exists(butterfly_dir):
    zip_folder(butterfly_dir, butterfly_zip)

203 species added


In [10]:
# Refresh the moth image archive: unpack the existing zip (if any),
# add folders for species that are not in it yet, then re-zip the folder
# (zip_folder also deletes the working folder).
moth_dir = "models/images.moth"
moth_zip = "models/images.moth.zip"
moth_sources = [
    "insect-dataset/src/mothsofindia.org",
    "insect-dataset/src/moth.inaturalist.org",
    "insect-dataset/moth/data",
]

if os.path.exists(moth_zip):
    unzip_file(moth_zip, moth_dir)
create_dataset(moth_sources, moth_dir, add_species_only=True)
if os.path.exists(moth_dir):
    zip_folder(moth_dir, moth_zip)

181 species added
