In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [1]:
import mynnlib
from mynnlib import *

In [2]:
# !pip install imagehash
# import imagehash
from itertools import chain

# Upper bound on the number of images create_dataset() writes per class.
max_imx_cnt = 6
# Edge length in pixels: resize() caps its output at this size and
# create_dataset() skips images that end up narrower than it.
max_img_size = 300

def is_black(pixel, threshold=10):
    """Return True when a pixel is black or close to black.

    Args:
        pixel: Indexable whose first three items are the channel values to
            test (e.g. the tuple ``getpixel`` returns for an RGB image).
            Items after the third are ignored.
        threshold: Highest channel value still counted as black (inclusive).
            Defaults to 10.

    Returns:
        bool: True if each of the first three channels is <= ``threshold``.
    """
    first, second, third = pixel[0], pixel[1], pixel[2]
    return first <= threshold and second <= threshold and third <= threshold

def crop_header_footer(img):
    """Crop away a header and footer band found by scanning the centre column.

    The centre column is scanned upwards from 15% of the height and downwards
    from 85% of the height. Each scan stops at the first pixel ``is_black``
    accepts, or at the image edge when there is none.

    Args:
        img: Image-like object exposing ``size``, ``getpixel`` and ``crop``.
            ``getpixel`` must return something ``is_black`` can index
            (an RGB tuple in this notebook).

    Returns:
        The result of ``img.crop`` over the full width and the retained rows.
        The row where the upward scan stopped is kept; the row where the
        downward scan stopped is excluded.
    """
    width, height = img.size
    center_x = width // 2

    top = int(height * 0.15)
    while top > 0 and not is_black(img.getpixel((center_x, top))):
        top -= 1

    # The lower crop bound is exclusive, so the scan is allowed to run one past
    # the last row. Stopping at height - 1 would drop the final row of an image
    # that has no black footer pixel at all.
    bottom = int(height * 0.85)
    while bottom < height and not is_black(img.getpixel((center_x, bottom))):
        bottom += 1

    return img.crop((0, top, width, bottom))

def center_crop(img):
    """Crop the largest centred square out of an image.

    Args:
        img: Image-like object exposing ``size`` as ``(width, height)`` and a
            ``crop((left, top, right, bottom))`` method.

    Returns:
        The result of ``img.crop`` for a box whose sides both equal
        ``min(width, height)``.
    """
    width, height = img.size
    square_size = min(width, height)
    # Use integer offsets and derive the far edges from the near ones.
    # Fractional coordinates get rounded by Pillow's crop, and the two edges of
    # an odd margin (x.5) can round in opposite directions, which yields a box
    # one pixel off square (e.g. 4x3 -> 4x3 instead of 3x3).
    left = (width - square_size) // 2
    top = (height - square_size) // 2
    return img.crop((left, top, left + square_size, top + square_size))

def resize(img):
    """Scale an image to a square of at most ``max_img_size`` pixels per side.

    The side length is the image's width capped at the module-level
    ``max_img_size``. The height is not consulted, so a non-square input is
    stretched to a square. Resampling uses ``Image.LANCZOS``.
    """
    side = min(img.size[0], max_img_size)
    target = (side, side)
    return img.resize(target, Image.LANCZOS)

def create_dataset(src_dirs, dst):
    """Build a small per-class thumbnail dataset from several source trees.

    Every source is expected to be laid out as ``{src}/{class_name}/{file}``.
    For each class, images are read from the sources in the order given,
    passed through ``crop_header_footer``, ``center_crop`` and ``resize``, and
    written to ``{dst}/{class_name}/{n}.jpg`` (JPEG, quality 50) with ``n``
    counting from 1. At most ``max_imx_cnt`` images are written per class
    across all sources. Duplicate images are not filtered out.

    Args:
        src_dirs: Iterable of source root directories. Each must exist.
        dst: Output root. It is deleted first if it already exists.

    Raises:
        FileNotFoundError: If one of ``src_dirs`` does not exist.
        Whatever ``Image.open`` raises for a file it cannot read.
    """
    if os.path.exists(dst):
        shutil.rmtree(dst)

    # Only sub-directories are classes. A stray file in a source root (README,
    # .DS_Store, ...) would otherwise be listed as a class and make the
    # os.listdir call below fail with NotADirectoryError.
    classes = sorted({
        entry
        for src in src_dirs
        for entry in os.listdir(src)
        if os.path.isdir(os.path.join(src, entry))
    })

    for class_name in classes:
        img_cnt = 0
        class_dst = f"{dst}/{class_name}"
        for src in src_dirs:
            class_src = f"{src}/{class_name}"
            if not os.path.isdir(class_src):
                continue
            # os.listdir returns entries in arbitrary order; sorting makes the
            # choice of the first max_imx_cnt images reproducible.
            for file in sorted(os.listdir(class_src)):
                file_path = f"{class_src}/{file}"
                if not os.path.isfile(file_path):
                    continue
                # convert() returns an independent in-memory image, so the
                # file handle can be released right away.
                with Image.open(file_path) as raw:
                    img = raw.convert("RGB")
                img = resize(center_crop(crop_header_footer(img)))
                # resize() never enlarges beyond the cropped width, so a
                # smaller result means the source image was too small.
                if img.size[0] < max_img_size:
                    continue
                os.makedirs(class_dst, exist_ok=True)
                img.save(f"{class_dst}/{img_cnt + 1}.jpg", format="JPEG", quality=50)
                img_cnt += 1
                if img_cnt >= max_imx_cnt:
                    break
            if img_cnt >= max_imx_cnt:
                break

In [5]:
import zipfile

def zip_folder(folder_path, zip_filename):
    """Archive a folder into a deflate-compressed zip, then delete the folder.

    Every file below ``folder_path`` is stored under its path relative to
    ``folder_path``. Directories that contain no files are not stored.

    Args:
        folder_path: Directory to archive. It is removed once the archive has
            been written.
        zip_filename: Path of the zip file to create. An existing file at that
            path is overwritten.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        NotADirectoryError: If ``folder_path`` exists but is not a directory.
    """
    # Validate before opening the archive: mode 'w' truncates an existing zip,
    # so re-running this after the folder was already removed would otherwise
    # replace a good archive with an empty one and only then fail.
    if not os.path.isdir(folder_path):
        if os.path.exists(folder_path):
            raise NotADirectoryError(f"Not a directory: {folder_path!r}")
        raise FileNotFoundError(f"Folder to zip does not exist: {folder_path!r}")

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

    # Reached only when the archive was written and closed without error.
    shutil.rmtree(folder_path)

In [7]:
# Lepidoptera: butterfly and moth sources combined into one dataset.
create_dataset(
    [
        "insect-dataset/src/ifoundbutterflies.org",
        "insect-dataset/src/mothsofindia.org",
        "insect-dataset/src/butterfly.inaturalist.org",
        "insect-dataset/src/moth.inaturalist.org",
    ],
    "models/images.lepidoptera",
)

In [9]:
# Pack the generated images into a zip; zip_folder also deletes the source folder.
zip_folder("models/images.lepidoptera", "models/images.lepidoptera.zip")

In [11]:
# Odonata dataset built from both odonata sources.
create_dataset(
    [
        "insect-dataset/src/indianodonata.org",
        "insect-dataset/src/odonata.inaturalist.org",
    ],
    "models/images.odonata",
)

In [13]:
# Pack the generated images into a zip; zip_folder also deletes the source folder.
zip_folder("models/images.odonata", "models/images.odonata.zip")

In [12]:
# Cicada dataset built from both cicada sources.
create_dataset(
    [
        "insect-dataset/src/indiancicadas.org",
        "insect-dataset/src/cicada.inaturalist.org",
    ],
    "models/images.cicada",
)

In [14]:
# Pack the generated images into a zip; zip_folder also deletes the source folder.
zip_folder("models/images.cicada", "models/images.cicada.zip")

In [3]:
# Butterfly-only dataset built from the two butterfly sources.
create_dataset(
    [
        "insect-dataset/src/ifoundbutterflies.org",
        "insect-dataset/src/butterfly.inaturalist.org",
    ],
    "models/images.butterfly",
)

In [6]:
# Pack the generated images into a zip; zip_folder also deletes the source folder.
zip_folder("models/images.butterfly", "models/images.butterfly.zip")

In [8]:
# Moth-only dataset built from the two moth sources.
create_dataset(
    [
        "insect-dataset/src/mothsofindia.org",
        "insect-dataset/src/moth.inaturalist.org",
    ],
    "models/images.moth",
)

In [10]:
# Pack the generated images into a zip; zip_folder also deletes the source folder.
zip_folder("models/images.moth", "models/images.moth.zip")