In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import mynnlib
from mynnlib import *

# Root of the combined (moth + butterfly) dataset; the cells below create
# `data/` and `val/` under it and write training checkpoints next to them.
dataset_dir = "insect-dataset/lepidoptera"

# Patterns applied with `re.match` to class directory names.
# Names ending in "-early" (presumably early life stages — confirm against the dataset naming rules).
early_regex = r"^.*-(early)$"
# Names ending in "-spp", "-genera" or "-genera-spp" (reported as "Unidentified" in the counts below).
unidentified_regex = r"^.*-(spp|genera|genera-spp)$"
# Union of the two patterns above; used to keep only the remaining classes.
early_or_unidentified_regex = r"^.*-(early|spp|genera|genera-spp)$"

# Create datasets

In [12]:
# Rebuild the combined training set from scratch: drop any previous copy, then
# copy every class folder of the moth and butterfly datasets into one tree.
# copytree raises if a class folder already exists in the target, i.e. if the
# same class name appears in both sources.
combined_data_root = f"{dataset_dir}/data"
if os.path.exists(combined_data_root):
    shutil.rmtree(combined_data_root)
os.makedirs(combined_data_root)

data_sources = [f"{dataset_dir}/../moth/data", f"{dataset_dir}/../butterfly/data"]
for source_root in data_sources:
    for class_name in os.listdir(source_root):
        shutil.copytree(f"{source_root}/{class_name}", f"{combined_data_root}/{class_name}")

In [11]:
# Rebuild the combined validation set from scratch: drop any previous copy, then
# copy every class folder of the moth and butterfly validation sets into one tree.
# copytree raises if a class folder already exists in the target, i.e. if the
# same class name appears in both sources.
combined_val_root = f"{dataset_dir}/val"
if os.path.exists(combined_val_root):
    shutil.rmtree(combined_val_root)
os.makedirs(combined_val_root)

val_sources = [f"{dataset_dir}/../moth/val", f"{dataset_dir}/../butterfly/val"]
for source_root in val_sources:
    for class_name in os.listdir(source_root):
        shutil.copytree(f"{source_root}/{class_name}", f"{combined_val_root}/{class_name}")

# Count

In [13]:
# Map every class folder to the number of entries it contains.
data_root = f"{dataset_dir}/data"
classes = {}
for class_dir in os.listdir(data_root):
    classes[class_dir] = len(os.listdir(f"{data_root}/{class_dir}"))

# Split out the early-stage and unidentified classes by their name suffix.
early_classes = {}
unidentified_classes = {}
for class_name, count in classes.items():
    if re.match(early_regex, class_name):
        early_classes[class_name] = count
    if re.match(unidentified_regex, class_name):
        unidentified_classes[class_name] = count

# "Identified-adult" is whatever remains after removing the two groups above.
print(f"Total Class count : {len(classes):6} ( Unidentified: {len(unidentified_classes):6} / Early-stage: {len(early_classes):6} / Identified-adult: {len(classes) - len(unidentified_classes) - len(early_classes):6} )")
print(f"Total  Data count : {sum(classes.values()):6} ( Unidentified: {sum(unidentified_classes.values()):6} / Early-stage: {sum(early_classes.values()):6} / Identified-adult: {sum(classes.values()) - sum(unidentified_classes.values()) - sum(early_classes.values()):6} )")

Total Class count :   5042 ( Unidentified:    446 / Early-stage:    857 / Identified-adult:   3739 )
Total  Data count : 312154 ( Unidentified:  13020 / Early-stage:  17549 / Identified-adult: 281585 )


In [14]:
# Collect the classes (excluding early-stage and unidentified ones) that have
# very few training images: at most 2, and at most 5.
img2_class = []
img5_class = []
for class_dir in os.listdir(f"{dataset_dir}/data"):
    if re.match(early_or_unidentified_regex, class_dir):
        continue
    img_cnt = len(os.listdir(f"{dataset_dir}/data/{class_dir}"))
    if img_cnt <= 2:
        img2_class.append(class_dir)
    if img_cnt <= 5:
        img5_class.append(class_dir)
print(f"{len(img2_class):6} classes with <=2 images")
print(f"{len(img5_class):6} classes with <=5 images")

   153 classes with <=2 images
   434 classes with <=5 images


In [15]:
# The text before the first '-' in a class name is taken as its genus;
# count the distinct values across all classes.
generas = {name.split('-', 1)[0] for name in classes}
print(f"Genera count: {len(generas)}")

Genera count: 1813


# Combine class details files

In [26]:
import json

# Source class-details files, each mapped to the group noun (singular / plural)
# that every display name from that file should end with once the files are merged.
input_data = {
    "models/class_details.butterfly.json": {'singular': "Butterfly", 'plural': "Butterflies"},
    "models/class_details.moth.json": {'singular': "Moth", 'plural': "Moths"},
}
combined_data = {}

for src, details in input_data.items():
    with open(src, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Case-insensitive check that a name already ENDS with the group noun.
    # The alternation must be grouped so `^.*` and `$` apply to both forms.
    # Ungrouped, `|` splits the pattern into "^.*Butterfly" (noun anywhere in the
    # name) and "Butterflies$" (whole name only). That appended a redundant
    # suffix to "... Butterflies" and skipped names that merely contain the noun
    # (e.g. "Mother ..." contains "Moth").
    has_group_suffix = re.compile(
        r"(?i)^.*(?:" + re.escape(details['singular']) + "|" + re.escape(details['plural']) + r")$"
    )

    for key, value in data.items():
        if not has_group_suffix.match(value['name']):
            value['name'] += " " + details['singular']
        # NOTE(review): a key present in both files is silently overwritten by the later file.
        combined_data[key] = value

with open("models/class_details.lepidoptera.json", "w", encoding="utf-8") as file:
    json.dump(combined_data, file, indent=4)

combined_data

{'abisara-attenuata': {'name': 'Attenuated Judy Butterfly'},
 'abisara-bifasciata': {'name': 'Double-banded Judy Butterfly'},
 'abisara-burnii': {'name': 'White-spotted Judy Butterfly'},
 'abisara-chela': {'name': 'Spot Judy Butterfly'},
 'abisara-echerius': {'name': 'Plum Judy Butterfly'},
 'abisara-fylla': {'name': 'Dark Judy Butterfly'},
 'abisara-neophron': {'name': 'Tailed Judy Butterfly'},
 'abrota-ganga': {'name': 'Sergeant-major Butterfly'},
 'acraea-issoria': {'name': 'Yellow Coster Butterfly'},
 'acraea-terpsicore': {'name': 'Tawny Coster Butterfly'},
 'actinor-radians': {'name': 'Veined Dart Butterfly'},
 'acupicta-delicatum': {'name': 'Dark Tinsel Butterfly'},
 'acytolepis-lilacea': {'name': 'Lilac Hedge Blue Butterfly'},
 'acytolepis-puspa': {'name': 'Common Hedge Blue Butterfly'},
 'aemona-amathusia': {'name': 'Yellow Dryad Butterfly'},
 'aeromachus-dubius': {'name': 'Dingy Scrub Hopper Butterfly'},
 'aeromachus-jhora': {'name': 'Grey Scrub Hopper Butterfly'},
 'aeromachu

# Train
### Model A (resnet-152)

In [16]:
# One entry per training phase. "robustness" and "break_at_val_acc_diff" are passed
# straight through to the mynnlib helpers below.
# NOTE(review): presumably robustness controls augmentation strength and
# break_at_val_acc_diff is an early-stop threshold on validation-accuracy change —
# confirm against mynnlib.
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 5, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
for param in training_params:
    # Hoisted into a local so the f-strings below do not reuse double quotes inside a
    # double-quoted f-string, which is a SyntaxError before Python 3.12.
    phase = param["idx"]
    print(f"Phase {phase}:")
    if phase == 1:
        # The first phase builds a fresh ResNet-152 model.
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val',
                                             batch_size=32, arch="resnet152", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        # Later phases continue from the previous phase's model_data with a new robustness value.
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val',
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    # Up to 5 epochs per phase. The "###" in the checkpoint path is presumably a
    # placeholder that train() fills in per epoch — confirm in mynnlib.
    train(model_data, 5, f"{dataset_dir}/checkpoint.lepidoptera.ta.ep{phase:02}###.pth",
          break_at_val_acc_diff=param["break_at_val_acc_diff"])

Phase 1:
Epoch    1 /    5  | Train Loss: 2.1822 Acc: 0.5864  | Val Loss: 1.4265 Acc: 0.6265  | Elapsed time: 1:21:41.822510
Epoch    2 /    5  | Train Loss: 0.8145 Acc: 0.8010  | Val Loss: 1.2405 Acc: 0.7531  | Elapsed time: 2:40:29.224297
Epoch    3 /    5  | Train Loss: 0.6167 Acc: 0.8454  | Val Loss: 1.1244 Acc: 0.7346  | Elapsed time: 3:58:54.970532
Phase 2:
Epoch    1 /    5  | Train Loss: 1.5067 Acc: 0.6823  | Val Loss: 1.0881 Acc: 0.7562  | Elapsed time: 1:21:08.680264
Epoch    2 /    5  | Train Loss: 1.3111 Acc: 0.7191  | Val Loss: 1.0673 Acc: 0.7716  | Elapsed time: 2:41:32.846402
Phase 3:
Epoch    1 /    5  | Train Loss: 1.2929 Acc: 0.7225  | Val Loss: 1.1342 Acc: 0.7500  | Elapsed time: 1:20:42.444327
Epoch    2 /    5  | Train Loss: 1.2441 Acc: 0.7325  | Val Loss: 1.0588 Acc: 0.7747  | Elapsed time: 2:41:30.319693
Epoch    3 /    5  | Train Loss: 0.9636 Acc: 0.7913  | Val Loss: 0.8902 Acc: 0.8148  | Elapsed time: 4:02:22.746229
Epoch    4 /    5  | Train Loss: 0.8768 Acc: 

In [22]:
# Reload a saved Model A checkpoint; the evaluation cells below pass it to test_top_k.
# NOTE(review): the file name looks like phase 05 / epoch 0004 of the training loop's
# "ep{idx:02}###" pattern — confirm this is the intended checkpoint.
# SECURITY: weights_only=False makes torch.load unpickle arbitrary Python objects,
# so only load checkpoint files that come from a trusted source.
model_data = torch.load(f"{dataset_dir}/checkpoint.lepidoptera.ta.ep050004.pth", weights_only=False)

In [23]:
# Top-k accuracy on the butterfly random-test set for k = 3, 5 and 10.
# Only the first call also prints the top-1 line, so it appears once in the output.
butterfly_random_test_dir = f"{dataset_dir}/../butterfly/random-test"
test_top_k(model_data, butterfly_random_test_dir, 3,
           print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, butterfly_random_test_dir, 5,
           print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, butterfly_random_test_dir, 10,
           print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 119/153 -> 77.78%, genus matched: 138/153 -> 90.20%
Top   3 accuracy: 135/153 -> 88.24%, genus matched: 150/153 -> 98.04%
Top   5 accuracy: 144/153 -> 94.12%, genus matched: 152/153 -> 99.35%
Top  10 accuracy: 145/153 -> 94.77%, genus matched: 153/153 -> 100.00%


In [24]:
# Top-k accuracy on the moth random-test set for k = 3, 5 and 10.
# Only the first call also prints the top-1 line, so it appears once in the output.
moth_random_test_dir = f"{dataset_dir}/../moth/random-test"
test_top_k(model_data, moth_random_test_dir, 3,
           print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, moth_random_test_dir, 5,
           print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, moth_random_test_dir, 10,
           print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 130/152 -> 85.53%, genus matched: 146/152 -> 96.05%
Top   3 accuracy: 147/152 -> 96.71%, genus matched: 147/152 -> 96.71%
Top   5 accuracy: 147/152 -> 96.71%, genus matched: 147/152 -> 96.71%
Top  10 accuracy: 147/152 -> 96.71%, genus matched: 147/152 -> 96.71%


In [25]:
# Per-image top-3 predictions, plus top-1 accuracy, on the butterfly "my-test" set.
test_top_k(
    model_data,
    f"{dataset_dir}/../butterfly/my-test",
    3,
    print_preds=True,
    print_top1_accuracy=True,
    print_no_match=False,
)

acraea-terpsicore             : [32macraea-terpsicore[0m(0.971)  lycaena-panava(0.016)  argynnis-hybrida(0.008)  
athyma-pravara                : neptis-clinia(0.648)  neptis-nata(0.158)  neptis-soma(0.044)  
colias-fieldii                : [32mcolias-fieldii[0m(0.998)  colias-eogene(0.001)  colias-erate(0.001)  
danaus-melanippus             : [32mdanaus-melanippus[0m(0.926)  danaus-genutia(0.070)  danaus-chrysippus(0.000)  
delias-descombesi             : [32mdelias-descombesi[0m(0.944)  delias-agostina(0.008)  delias-pasithoe(0.005)  
euploea-core                  : [32meuploea-core[0m(0.431)  euploea-sylvester(0.267)  euploea-algea(0.144)  
graphium-doson                : [32mgraphium-doson[0m(0.982)  graphium-teredon(0.008)  graphium-sarpedon(0.005)  
hypolimnas-bolina             : [32mhypolimnas-bolina[0m(0.999)  hypolimnas-misippus(0.001)  mimathyma-ambica(0.000)  
kallima-inachus               : [32mkallima-inachus[0m(0.997)  doleschallia-bisaltide(0.001)  kall

In [20]:
# Per-image top-3 predictions, plus top-1 accuracy, on the moth "my-test" set.
test_top_k(
    model_data,
    f"{dataset_dir}/../moth/my-test",
    3,
    print_preds=True,
    print_top1_accuracy=True,
    print_no_match=False,
)

apona-spp                     : apona-caschmirensis(0.382)  polyptychus-trilineatus(0.232)  marumba-dyras(0.101)  
dysphania-percota             : [32mdysphania-percota[0m(0.997)  dysphania-percota-early(0.000)  teliphasa-albifusa(0.000)  
eupterote-undata              : [32meupterote-undata[0m(0.990)  eupterote-spp(0.005)  eupterote-mollifera(0.002)  
hippotion-rosetta-2           : hippotion-boerhaviae(0.536)  [32mhippotion-rosetta[0m(0.226)  cechetra-minor(0.154)  
hippotion-rosetta             : [32mhippotion-rosetta[0m(0.396)  hippotion-boerhaviae(0.364)  hippotion-spp(0.176)  
----------
Top   1 accuracy: 3/5 -> 60.00%, genus matched: 5/5 -> 100.00%
Top   3 accuracy: 4/5 -> 80.00%, genus matched: 5/5 -> 100.00%


### Model B (resnet-101)

In [None]:
# One entry per training phase. "robustness" and "break_at_val_acc_diff" are passed
# straight through to the mynnlib helpers below.
# NOTE(review): presumably robustness controls augmentation strength and
# break_at_val_acc_diff is an early-stop threshold on validation-accuracy change —
# confirm against mynnlib.
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 5, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
for param in training_params:
    # Hoisted into a local so the f-strings below do not reuse double quotes inside a
    # double-quoted f-string, which is a SyntaxError before Python 3.12.
    phase = param["idx"]
    print(f"Phase {phase}:")
    if phase == 1:
        # The first phase builds a fresh ResNet-101 model.
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val',
                                             batch_size=32, arch="resnet101", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        # Later phases continue from the previous phase's model_data with a new robustness value.
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val',
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    # Up to 5 epochs per phase. The "###" in the checkpoint path is presumably a
    # placeholder that train() fills in per epoch — confirm in mynnlib.
    train(model_data, 5, f"{dataset_dir}/checkpoint.lepidoptera.tb.ep{phase:02}###.pth",
          break_at_val_acc_diff=param["break_at_val_acc_diff"])