In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import mynnlib
from mynnlib import *

dataset_dir = "insect-dataset/moth"

early_regex = r"^.*-(early)$"
unidentified_regex = r"^.*-(spp|genera|genera-spp)$"
early_or_unidentified_regex = r"^.*-(early|spp|genera|genera-spp)$"

# Dataset stats

In [80]:
classes = { class_dir: len([ img for img in os.listdir(f"{dataset_dir}/data/{class_dir}") ]) for class_dir in os.listdir(f"{dataset_dir}/data") }
early_classes = { class_name: count for class_name, count in classes.items() if re.match(early_regex, class_name) }
unidentified_classes = { class_name: count for class_name, count in classes.items() if re.match(unidentified_regex, class_name) }
print(f"Total Class count : {len(classes):6} ( Unidentified: {len(unidentified_classes):6} / Early-stage: {len(early_classes):6} / Identified-adult: {len(classes) - len(unidentified_classes) - len(early_classes):6} )")
print(f"Total  Data count : {sum(classes.values()):6} ( Unidentified: {sum(unidentified_classes.values()):6} / Early-stage: {sum(early_classes.values()):6} / Identified-adult: {sum(classes.values()) - sum(unidentified_classes.values()) - sum(early_classes.values()):6} )")

Total Class count :   3364 ( Unidentified:    411 / Early-stage:    304 / Identified-adult:   2649 )
Total  Data count :  44652 ( Unidentified:  11156 / Early-stage:   3569 / Identified-adult:  29927 )


In [81]:
img2_class = []
img5_class = []
for class_dir in os.listdir(f"{dataset_dir}/data"):
    if not re.match(early_or_unidentified_regex, class_dir):
        img_cnt = sum([1 for file in os.listdir(f"{dataset_dir}/data/{class_dir}")])
        img2_class += [class_dir] if img_cnt <= 2 else []
        img5_class += [class_dir] if img_cnt <= 5 else []
print(f"{len(img2_class):6} classes with <=2 images")
print(f"{len(img5_class):6} classes with <=5 images")

   548 classes with <=2 images
  1276 classes with <=5 images


-----------
# Train

### Model A (resnet-101 + only imago data)

In [5]:
training_params = [
    { "idx": "01", "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": "02", "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": "03", "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": "04", "robustness": 2.0, "break_at_val_acc_diff": 0.005}
]
for param in training_params:
    if param["idx"] == "01":
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet101", image_size=224, robustness=0.2,
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    train(model_data, 5, f"{dataset_dir}/checkpoint.moth.ta.ep{param["idx"]}###.pth", 
          break_at_val_acc_diff=param["break_at_val_acc_diff"])

Epoch    1 /    5  | Train Loss: 3.5788 Acc: 0.4362  | Val Loss: 1.6324 Acc: 0.5844  | Elapsed time: 0:13:59.920457
Epoch    2 /    5  | Train Loss: 0.7470 Acc: 0.8189  | Val Loss: 1.2188 Acc: 0.7143  | Elapsed time: 0:28:33.046725
Epoch    3 /    5  | Train Loss: 0.3673 Acc: 0.9033  | Val Loss: 1.3251 Acc: 0.7013  | Elapsed time: 0:43:07.716677
Epoch    1 /    5  | Train Loss: 1.3027 Acc: 0.7161  | Val Loss: 1.1361 Acc: 0.7143  | Elapsed time: 0:15:17.443942
Epoch    2 /    5  | Train Loss: 1.0163 Acc: 0.7729  | Val Loss: 1.0305 Acc: 0.7273  | Elapsed time: 0:30:44.161012
Epoch    1 /    5  | Train Loss: 0.9890 Acc: 0.7799  | Val Loss: 1.1097 Acc: 0.7403  | Elapsed time: 0:15:18.878553
Epoch    2 /    5  | Train Loss: 0.9098 Acc: 0.7968  | Val Loss: 1.0986 Acc: 0.7338  | Elapsed time: 0:30:49.874986
Epoch    1 /    5  | Train Loss: 0.8090 Acc: 0.8220  | Val Loss: 0.9465 Acc: 0.7597  | Elapsed time: 0:15:31.187134
Epoch    2 /    5  | Train Loss: 0.7119 Acc: 0.8429  | Val Loss: 0.9149 

In [6]:
train(model_data, 5, f"{dataset_dir}/checkpoint.moth.ta.ep05###.pth", break_at_val_acc_diff=0.005)

Epoch    1 /    5  | Train Loss: 0.6779 Acc: 0.8503  | Val Loss: 0.9450 Acc: 0.7597  | Elapsed time: 0:15:18.200399
Epoch    2 /    5  | Train Loss: 0.6501 Acc: 0.8551  | Val Loss: 0.9046 Acc: 0.7792  | Elapsed time: 0:30:41.719849
Epoch    3 /    5  | Train Loss: 0.6312 Acc: 0.8615  | Val Loss: 0.9144 Acc: 0.7662  | Elapsed time: 0:45:59.829237


In [8]:
model_data = torch.load(f"{dataset_dir}/checkpoint.moth.ta.ep050001.pth", weights_only=False)

In [9]:
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 3, print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 10, print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 120/152 -> 78.95%, genus matched: 138/152 -> 90.79%
Top   3 accuracy: 142/152 -> 93.42%, genus matched: 146/152 -> 96.05%
Top   5 accuracy: 143/152 -> 94.08%, genus matched: 147/152 -> 96.71%
Top  10 accuracy: 146/152 -> 96.05%, genus matched: 148/152 -> 97.37%


In [10]:
test_top_k(model_data, f"{dataset_dir}/../moth/my-test", 3, print_preds=True, print_top1_accuracy=True, print_no_match=False)

apona-spp                     : polyptychus-dentatus(0.889)  apona-caschmirensis(0.017)  polyptychus-trilineatus(0.010)  
dysphania-percota             : [32mdysphania-percota[0m(0.956)  nyctemera-cenis(0.033)  nyctemera-adversata(0.002)  
eupterote-undata              : [32meupterote-undata[0m(0.619)  eupterote-spp(0.306)  speiredonia-obscura(0.017)  
hippotion-rosetta-2           : cechetra-minor(0.517)  hippotion-boerhaviae(0.429)  hippotion-spp(0.023)  
hippotion-rosetta             : theretra-clotho(0.566)  hippotion-boerhaviae(0.168)  theretra-alecto(0.092)  
----------
Top   1 accuracy: 2/5 -> 40.00%, genus matched: 2/5 -> 40.00%
Top   3 accuracy: 2/5 -> 40.00%, genus matched: 5/5 -> 100.00%


### Model B (resnet-152 + only imago data)

In [7]:
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
for param in training_params:
    print(f"Phase {param["idx"]}:")
    if param["idx"] == 1:
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet152", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    train(model_data, 5, f"{dataset_dir}/checkpoint.moth.tb.ep{param["idx"]:02}###.pth", 
          break_at_val_acc_diff=param["break_at_val_acc_diff"])

Phase 1:
Epoch    1 /    5  | Train Loss: 3.4490 Acc: 0.4536  | Val Loss: 1.2743 Acc: 0.7143  | Elapsed time: 0:23:41.109599
Epoch    2 /    5  | Train Loss: 0.7013 Acc: 0.8308  | Val Loss: 1.0904 Acc: 0.7338  | Elapsed time: 0:46:33.587867
Phase 2:
Epoch    1 /    5  | Train Loss: 1.3652 Acc: 0.7025  | Val Loss: 1.1198 Acc: 0.7273  | Elapsed time: 0:29:04.944446
Epoch    2 /    5  | Train Loss: 1.0813 Acc: 0.7609  | Val Loss: 1.1452 Acc: 0.6948  | Elapsed time: 0:58:09.405522
Phase 3:
Epoch    1 /    5  | Train Loss: 1.0206 Acc: 0.7728  | Val Loss: 1.3211 Acc: 0.6753  | Elapsed time: 0:29:01.511832
Epoch    2 /    5  | Train Loss: 0.9465 Acc: 0.7882  | Val Loss: 1.0752 Acc: 0.7143  | Elapsed time: 0:58:04.772749
Epoch    3 /    5  | Train Loss: 0.8844 Acc: 0.8031  | Val Loss: 1.1184 Acc: 0.7468  | Elapsed time: 1:27:08.319044
Epoch    4 /    5  | Train Loss: 0.6824 Acc: 0.8479  | Val Loss: 1.0367 Acc: 0.7338  | Elapsed time: 1:56:10.911668
Phase 4:
Epoch    1 /    5  | Train Loss: 0.6

### Model C (resnet-101 + only imago data + varying data-aug on each epoch)

In [11]:
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
last_val_acc = None
response = None
idx = 1
epoch = 1
while True:
    param = training_params[idx - 1]
    print(f"Phase {param["idx"]} |", end=' ')
    if idx == 1:
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet101", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    response = train(model_data, 1, f"{dataset_dir}/checkpoint.moth.tc.ep{epoch:04}.pth")
    if last_val_acc and response["val_acc"] - last_val_acc < param["break_at_val_acc_diff"]:
        idx += 1
        if idx == len(training_params) + 1:
            break
    last_val_acc = response["val_acc"]
    epoch += 1

Phase 1 | Epoch    1 /    1  | Train Loss: 3.6296 Acc: 0.4300  | Val Loss: 1.6104 Acc: 0.6169  | Elapsed time: 0:14:21.780614
Phase 1 | Epoch    1 /    1  | Train Loss: 3.7336 Acc: 0.4152  | Val Loss: 1.5916 Acc: 0.6169  | Elapsed time: 0:14:20.053484
Phase 2 | Epoch    1 /    1  | Train Loss: 1.8255 Acc: 0.6229  | Val Loss: 1.4139 Acc: 0.6623  | Elapsed time: 0:14:57.136417
Phase 2 | Epoch    1 /    1  | Train Loss: 1.4776 Acc: 0.6870  | Val Loss: 1.3455 Acc: 0.6948  | Elapsed time: 0:14:57.246749
Phase 2 | Epoch    1 /    1  | Train Loss: 1.3237 Acc: 0.7177  | Val Loss: 1.3458 Acc: 0.6753  | Elapsed time: 0:14:51.528282
Phase 3 | Epoch    1 /    1  | Train Loss: 1.2543 Acc: 0.7326  | Val Loss: 1.0736 Acc: 0.7468  | Elapsed time: 0:15:13.321988
Phase 3 | Epoch    1 /    1  | Train Loss: 1.1608 Acc: 0.7492  | Val Loss: 1.3039 Acc: 0.6948  | Elapsed time: 0:15:11.214581
Phase 4 | Epoch    1 /    1  | Train Loss: 1.1896 Acc: 0.7452  | Val Loss: 1.2562 Acc: 0.6883  | Elapsed time: 0:18:12

---------------
### Add more data to Dataset

In [17]:
def copy_data_from(sources, add_early=False):
    class_cnt = 0
    img_cnt = 0
    for more_data_dir in sources:
        for class_dir in os.listdir(f"{dataset_dir}/data"):
            if os.path.exists(f"{more_data_dir}/{class_dir}"):
                # print(f"Copying data for {class_dir}...")
                class_cnt += 1
                for file in os.listdir(f"{more_data_dir}/{class_dir}"):
                    shutil.copy2(f"{more_data_dir}/{class_dir}/{file}", f"{dataset_dir}/data/{class_dir}/{file}")
                    img_cnt += 1
            if add_early and os.path.exists(f"{more_data_dir}/{class_dir}-early"):
                # print(f"Copying data for {class_dir}-early...")
                class_cnt += 1
                os.makedirs(f"{dataset_dir}/data/{class_dir}-early/{file}")
                for file in os.listdir(f"{more_data_dir}/{class_dir}-early"):
                    shutil.copy2(f"{more_data_dir}/{class_dir}-early/{file}", f"{dataset_dir}/data/{class_dir}-early/{file}")
                    img_cnt += 1
    print(f"{img_cnt} images added into {class_cnt} classes")

In [18]:
# copy all from mothsofindia
if os.path.exists(f"{dataset_dir}/data"):
    shutil.rmtree(f"{dataset_dir}/data")
shutil.copytree("insect-dataset/src/mothsodindia.org", f"{dataset_dir}/data")

'insect-dataset/moth/data'

In [19]:
copy_data_from(["insect-dataset/src/indiabiodiversity.org"], add_early=True)

1958 images added into 587 classes


In [20]:
copy_data_from(["insect-dataset/src/wikipedia.org"], add_early=True)

1260 images added into 1064 classes


In [21]:
copy_data_from(["insect-dataset/src/insecta.pro"], add_early=True)

1201 images added into 395 classes


In [22]:
copy_data_from(["insect-dataset/src/inaturalist.org"], add_early=True)

142620 images added into 2834 classes


In [23]:
# # remove early classes
# for class_dir in os.listdir(f"{dataset_dir}/data"):
#     if class_dir.endswith("-early"):
#         shutil.rmtree(f"{dataset_dir}/data/{class_dir}")

### Updated Dataset stats

In [24]:
classes = { class_dir: len([ img for img in os.listdir(f"{dataset_dir}/data/{class_dir}") ]) for class_dir in os.listdir(f"{dataset_dir}/data") }
early_classes = { class_name: count for class_name, count in classes.items() if re.match(early_regex, class_name) }
unidentified_classes = { class_name: count for class_name, count in classes.items() if re.match(unidentified_regex, class_name) }
print(f"Total Class count : {len(classes):6} ( Unidentified: {len(unidentified_classes):6} / Early-stage: {len(early_classes):6} / Identified-adult: {len(classes) - len(unidentified_classes) - len(early_classes):6} )")
print(f"Total  Data count : {sum(classes.values()):6} ( Unidentified: {sum(unidentified_classes.values()):6} / Early-stage: {sum(early_classes.values()):6} / Identified-adult: {sum(classes.values()) - sum(unidentified_classes.values()) - sum(early_classes.values()):6} )")

Total Class count :   3600 ( Unidentified:    411 / Early-stage:    540 / Identified-adult:   2649 )
Total  Data count : 187511 ( Unidentified:  11156 / Early-stage:  13323 / Identified-adult: 163032 )


In [25]:
img2_class = []
img5_class = []
for class_dir in os.listdir(f"{dataset_dir}/data"):
    if not re.match(early_or_unidentified_regex, class_dir):
        img_cnt = sum([1 for file in os.listdir(f"{dataset_dir}/data/{class_dir}")])
        img2_class += [class_dir] if img_cnt <= 2 else []
        img5_class += [class_dir] if img_cnt <= 5 else []
print(f"{len(img2_class):6} classes with <=2 images")
print(f"{len(img5_class):6} classes with <=5 images")

   129 classes with <=2 images
   350 classes with <=5 images


In [91]:
generas = set()
for class_name in classes:
    generas.add(class_name.split('-')[0])
print(f"Genera count: {len(generas)}")

Genera count: 1462


### Model D (resnet-101 + only imago data + more data)

In [4]:
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
for param in training_params:
    print(f"Phase {param["idx"]}:")
    if param["idx"] == 1:
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet101", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    train(model_data, 5, f"{dataset_dir}/checkpoint.moth.td.ep{param["idx"]:02}###.pth", 
          break_at_val_acc_diff=param["break_at_val_acc_diff"])

Phase 1:
Epoch    1 /    5  | Train Loss: 3.2987 Acc: 0.4659  | Val Loss: 1.1944 Acc: 0.6883  | Elapsed time: 0:16:40.142756
Epoch    2 /    5  | Train Loss: 0.7143 Acc: 0.8254  | Val Loss: 1.3508 Acc: 0.5909  | Elapsed time: 0:32:51.244716
Phase 2:
Epoch    1 /    5  | Train Loss: 1.3547 Acc: 0.7079  | Val Loss: 0.9934 Acc: 0.7468  | Elapsed time: 0:17:05.113645
Epoch    2 /    5  | Train Loss: 1.0779 Acc: 0.7609  | Val Loss: 0.9176 Acc: 0.7922  | Elapsed time: 0:34:15.341175
Epoch    3 /    5  | Train Loss: 0.9853 Acc: 0.7806  | Val Loss: 0.9466 Acc: 0.7792  | Elapsed time: 0:51:27.096290
Phase 3:
Epoch    1 /    5  | Train Loss: 0.9681 Acc: 0.7854  | Val Loss: 0.9841 Acc: 0.7273  | Elapsed time: 0:17:12.350080
Epoch    2 /    5  | Train Loss: 0.9086 Acc: 0.7976  | Val Loss: 1.0493 Acc: 0.7468  | Elapsed time: 0:34:25.654337
Epoch    3 /    5  | Train Loss: 0.7181 Acc: 0.8425  | Val Loss: 0.8768 Acc: 0.7987  | Elapsed time: 0:51:43.238765
Epoch    4 /    5  | Train Loss: 0.6420 Acc: 

In [11]:
model_data = torch.load(f"{dataset_dir}/checkpoint.moth.td.ep040001.pth", weights_only=False)

In [12]:
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 3, print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 10, print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 119/152 -> 78.29%, genus matched: 138/152 -> 90.79%
Top   3 accuracy: 139/152 -> 91.45%, genus matched: 145/152 -> 95.39%
Top   5 accuracy: 143/152 -> 94.08%, genus matched: 147/152 -> 96.71%
Top  10 accuracy: 146/152 -> 96.05%, genus matched: 147/152 -> 96.71%


In [13]:
test_top_k(model_data, f"{dataset_dir}/../moth/my-test", 3, print_preds=True, print_top1_accuracy=True, print_no_match=False)

apona-spp                     : polyptychus-dentatus(0.392)  thyas-juno(0.060)  polyptychus-trilineatus(0.052)  
dysphania-percota             : [32mdysphania-percota[0m(0.871)  nyctemera-cenis(0.099)  nyctemera-carissima(0.013)  
eupterote-undata              : [32meupterote-undata[0m(0.996)  eupterote-spp(0.003)  hypopyra-vespertilio(0.000)  
hippotion-rosetta-2           : [32mhippotion-rosetta[0m(0.621)  hippotion-boerhaviae(0.226)  cechetra-minor(0.127)  
hippotion-rosetta             : hippotion-boerhaviae(0.282)  [32mhippotion-rosetta[0m(0.206)  cechetra-minor(0.196)  
----------
Top   1 accuracy: 3/5 -> 60.00%, genus matched: 4/5 -> 80.00%
Top   3 accuracy: 4/5 -> 80.00%, genus matched: 4/5 -> 80.00%


### Model E (resnet-101 + early & imago data ++more data)
Few imago classes have early stage data mixed now

In [39]:
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 5, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
for param in training_params:
    print(f"Phase {param["idx"]}:")
    if param["idx"] == 1:
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet101", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    train(model_data, 5, f"{dataset_dir}/checkpoint.moth.te.ep{param["idx"]:02}###.pth", 
          break_at_val_acc_diff=param["break_at_val_acc_diff"])

Phase 1:
Epoch    1 /    5  | Train Loss: 2.3807 Acc: 0.5738  | Val Loss: 0.8659 Acc: 0.7727  | Elapsed time: 0:40:55.912502
Epoch    2 /    5  | Train Loss: 0.7553 Acc: 0.8157  | Val Loss: 0.8122 Acc: 0.8442  | Elapsed time: 1:21:18.042499
Epoch    3 /    5  | Train Loss: 0.5209 Acc: 0.8662  | Val Loss: 0.7873 Acc: 0.8831  | Elapsed time: 1:59:31.558426
Phase 2:
Epoch    1 /    5  | Train Loss: 1.2878 Acc: 0.7286  | Val Loss: 0.7719 Acc: 0.8377  | Elapsed time: 0:40:21.693424
Epoch    2 /    5  | Train Loss: 1.0948 Acc: 0.7660  | Val Loss: 0.7964 Acc: 0.8182  | Elapsed time: 1:22:50.093711
Phase 3:
Epoch    1 /    5  | Train Loss: 1.0765 Acc: 0.7684  | Val Loss: 0.6974 Acc: 0.8506  | Elapsed time: 0:40:58.693259
Epoch    2 /    5  | Train Loss: 1.0323 Acc: 0.7781  | Val Loss: 0.7410 Acc: 0.8442  | Elapsed time: 1:21:44.937611
Phase 4:
Epoch    1 /    5  | Train Loss: 0.8803 Acc: 0.8103  | Val Loss: 0.6835 Acc: 0.8571  | Elapsed time: 0:40:39.551101
Epoch    2 /    5  | Train Loss: 0.7

In [23]:
model_data = torch.load(f"{dataset_dir}/checkpoint.moth.te.ep050000.pth", weights_only=False)

In [24]:
test_top_k(model_data, f"{dataset_dir}/../moth/my-test", 3, print_preds=True, print_top1_accuracy=True, print_no_match=False)

amyna-axis-2             : [32mamyna-axis[0m(0.875)  imma-mylias(0.053)  rivula-spp(0.033)  
amyna-axis               : [32mamyna-axis[0m(0.956)  amyna-punctum(0.028)  amyna-spp(0.011)  
apona-caschmirensis      : [32mapona-caschmirensis[0m(0.336)  marumba-cristata(0.286)  polyptychus-dentatus(0.107)  
dysphania-percota        : [32mdysphania-percota[0m(0.993)  dysphania-palmyra(0.001)  nyctemera-cenis(0.001)  
eupterote-undata-2       : [32meupterote-undata[0m(0.491)  hamodes-propitia(0.423)  eupterote-gardneri(0.050)  
eupterote-undata         : [32meupterote-undata[0m(0.998)  eupterote-spp(0.001)  antheraea-frithi(0.000)  
hippotion-rosetta-2      : [32mhippotion-rosetta[0m(0.422)  hippotion-boerhaviae(0.398)  cechetra-minor(0.130)  
hippotion-rosetta        : [32mhippotion-rosetta[0m(0.419)  hippotion-boerhaviae(0.214)  cechetra-minor(0.134)  
orgyia-postica-early     : [32morgyia-postica[0m(0.962)  palirisa-cervina-early(0.036)  calliteara-grotei(0.001)  
scirpop

In [57]:
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 3, print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/../moth/random-test", 10, print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 130/152 -> 85.53%, genus matched: 145/152 -> 95.39%
Top   3 accuracy: 147/152 -> 96.71%, genus matched: 148/152 -> 97.37%
Top   5 accuracy: 147/152 -> 96.71%, genus matched: 148/152 -> 97.37%
Top  10 accuracy: 148/152 -> 97.37%, genus matched: 148/152 -> 97.37%


### Model F (resnet-152 + early & imago data ++more data)
Few imago classes have early stage data mixed

In [46]:
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 5, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
for param in training_params:
    print(f"Phase {param["idx"]}:")
    if param["idx"] == 1:
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet152", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    train(model_data, 5, f"{dataset_dir}/checkpoint.moth.tf.ep{param["idx"]:02}###.pth", 
          break_at_val_acc_diff=param["break_at_val_acc_diff"])

Phase 1:
Epoch    1 /    5  | Train Loss: 2.2407 Acc: 0.5923  | Val Loss: 0.7868 Acc: 0.8182  | Elapsed time: 0:45:06.168741
Epoch    2 /    5  | Train Loss: 0.7329 Acc: 0.8204  | Val Loss: 0.8211 Acc: 0.7857  | Elapsed time: 1:29:59.231360
Phase 2:
Epoch    1 /    5  | Train Loss: 1.3194 Acc: 0.7225  | Val Loss: 0.8192 Acc: 0.8117  | Elapsed time: 0:46:24.276674
Epoch    2 /    5  | Train Loss: 1.1254 Acc: 0.7590  | Val Loss: 0.7811 Acc: 0.7987  | Elapsed time: 1:32:44.225132
Phase 3:
Epoch    1 /    5  | Train Loss: 1.0869 Acc: 0.7655  | Val Loss: 0.7336 Acc: 0.8377  | Elapsed time: 0:46:25.491660
Epoch    2 /    5  | Train Loss: 1.0394 Acc: 0.7773  | Val Loss: 0.7838 Acc: 0.7987  | Elapsed time: 1:32:46.558563
Phase 4:
Epoch    1 /    5  | Train Loss: 1.0764 Acc: 0.7684  | Val Loss: 0.7295 Acc: 0.8117  | Elapsed time: 0:46:08.965056
Epoch    2 /    5  | Train Loss: 0.8263 Acc: 0.8230  | Val Loss: 0.6484 Acc: 0.8571  | Elapsed time: 1:35:55.934073
Epoch    3 /    5  | Train Loss: 0.7

In [9]:
model_data = torch.load(f"{dataset_dir}/checkpoint.moth.tf.ep050000.pth", weights_only=False)

In [10]:
test_top_k(model_data, f"{dataset_dir}/test", 3, print_preds=True, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/test", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/test", 10, print_preds=False, print_top1_accuracy=False)

amyna-axis-2             : [32mamyna-axis[0m(0.869)  amyna-spp(0.069)  amyna-punctum(0.040)  
amyna-axis               : [32mamyna-axis[0m(0.727)  athetis-lineosa(0.138)  amyna-punctum(0.083)  
apona-caschmirensis      : [32mapona-caschmirensis[0m(0.393)  polyptychus-trilineatus(0.283)  polyptychus-dentatus(0.243)  
dysphania-percota        : [32mdysphania-percota[0m(0.999)  dysphania-percota-early(0.000)  speiredonia-mutabilis(0.000)  
eupterote-undata-2       : [32meupterote-undata[0m(0.501)  hamodes-propitia(0.200)  eupterote-gardneri(0.187)  
eupterote-undata         : [32meupterote-undata[0m(0.987)  eupterote-spp(0.010)  eupterote-mollifera(0.001)  
hippotion-rosetta-2      : hippotion-boerhaviae(0.451)  [32mhippotion-rosetta[0m(0.414)  cechetra-minor(0.047)  
hippotion-rosetta        : [32mhippotion-rosetta[0m(0.579)  hippotion-boerhaviae(0.335)  hippotion-spp(0.052)  
orgyia-postica-early     : [32morgyia-postica[0m(0.997)  palirisa-cervina-early(0.001)  nygmia

In [11]:
test_top_k(model_data, f"{dataset_dir}/test2", 3, print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/test2", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/test2", 10, print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 130/151 -> 86.09%, genus matched: 146/151 -> 96.69%
Top   3 accuracy: 147/151 -> 97.35%, genus matched: 147/151 -> 97.35%
Top   5 accuracy: 147/151 -> 97.35%, genus matched: 147/151 -> 97.35%
Top  10 accuracy: 148/151 -> 98.01%, genus matched: 148/151 -> 98.01%


In [12]:
pred = validate_prediction_in_dir_top_k(f"{dataset_dir}/val", model_data, 3)
print (f"Top 3 accuracy: {pred['success']}/{pred['total']} -> {100*pred['success']/pred['total']:.2f}%")

Top 3 accuracy: 150/154 -> 97.40%


------------
More data added. refer dataset-analysis2.ipynb
### Updated Dataset Stats

In [7]:
classes = { class_dir: len([ img for img in os.listdir(f"{dataset_dir}/data/{class_dir}") ]) for class_dir in os.listdir(f"{dataset_dir}/data") }
early_classes = { class_name: count for class_name, count in classes.items() if re.match(early_regex, class_name) }
unidentified_classes = { class_name: count for class_name, count in classes.items() if re.match(unidentified_regex, class_name) }
print(f"Total Class count : {len(classes):6} ( Unidentified: {len(unidentified_classes):6} / Early-stage: {len(early_classes):6} / Identified-adult: {len(classes) - len(unidentified_classes) - len(early_classes):6} )")
print(f"Total  Data count : {sum(classes.values()):6} ( Unidentified: {sum(unidentified_classes.values()):6} / Early-stage: {sum(early_classes.values()):6} / Identified-adult: {sum(classes.values()) - sum(unidentified_classes.values()) - sum(early_classes.values()):6} )")

Total Class count :   3308 ( Unidentified:      0 / Early-stage:    514 / Identified-adult:   2794 )
Total  Data count : 183194 ( Unidentified:      0 / Early-stage:  12730 / Identified-adult: 170464 )


In [8]:
img2_class = []
img5_class = []
for class_dir in os.listdir(f"{dataset_dir}/data"):
    if not re.match(early_or_unidentified_regex, class_dir):
        img_cnt = sum([1 for file in os.listdir(f"{dataset_dir}/data/{class_dir}")])
        img2_class += [class_dir] if img_cnt <= 2 else []
        img5_class += [class_dir] if img_cnt <= 5 else []
print(f"{len(img2_class):6} classes with <=2 images")
print(f"{len(img5_class):6} classes with <=5 images")

   140 classes with <=2 images
   375 classes with <=5 images


### Model G (resnet-152 + image transform pipeline fixed) ***

In [4]:
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 5, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 6, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
start_time = time.time()
print("Started at:", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
for param in training_params:
    print(f"Phase {param["idx"]}:")
    if param["idx"] == 1:
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet152", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    train(model_data, 5, f"{dataset_dir}/checkpoint.moth.tg.ep{param["idx"]:02}###.pth", 
          break_at_val_acc_diff=param["break_at_val_acc_diff"])
    print(f"Total elapsed time: {datetime.timedelta(seconds=(time.time() - start_time))}")

Started at: 2025-03-26 09:02:53
Phase 1:
Epoch    0 /    4  | Train Loss: 2.1013 Acc: 0.6106  | Val Loss: 0.7047 Acc: 0.8312  | Elapsed time: 1:04:05.722466
Epoch    1 /    4  | Train Loss: 0.7105 Acc: 0.8198  | Val Loss: 0.7612 Acc: 0.8312  | Elapsed time: 1:54:01.203207
Total elapsed time: 1:54:08.495434
Phase 2:
Epoch    0 /    4  | Train Loss: 1.3300 Acc: 0.7220  | Val Loss: 0.6646 Acc: 0.8571  | Elapsed time: 0:55:59.747873
Epoch    1 /    4  | Train Loss: 1.1772 Acc: 0.7506  | Val Loss: 0.7088 Acc: 0.8377  | Elapsed time: 2:00:14.155372
Total elapsed time: 3:54:24.698940
Phase 3:
Epoch    0 /    4  | Train Loss: 1.1926 Acc: 0.7489  | Val Loss: 0.6448 Acc: 0.8506  | Elapsed time: 1:01:10.019170
Epoch    1 /    4  | Train Loss: 1.1433 Acc: 0.7594  | Val Loss: 0.5997 Acc: 0.8506  | Elapsed time: 1:57:44.231334
Total elapsed time: 5:52:10.947485
Phase 4:
Epoch    0 /    4  | Train Loss: 1.1739 Acc: 0.7526  | Val Loss: 0.6294 Acc: 0.8377  | Elapsed time: 0:58:19.141459
Epoch    1 /   

In [4]:
model_data = torch.load(f"{dataset_dir}/checkpoint.moth.tg.ep060001.pth", weights_only=False)

In [5]:
test_top_k(model_data, f"{dataset_dir}/test", 3, print_preds=True, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/test", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/test", 10, print_preds=False, print_top1_accuracy=False)

amyna-axis-2             : [32mamyna-axis[0m(0.849)  amyna-punctum(0.115)  athetis-lineosa(0.017)  
amyna-axis               : [32mamyna-axis[0m(0.783)  amyna-punctum(0.188)  athetis-lineosa(0.027)  
apona-caschmirensis-2    : dalima-truncataria(0.419)  [32mapona-caschmirensis[0m(0.159)  thyas-coronata(0.143)  
apona-caschmirensis      : [32mapona-caschmirensis[0m(0.653)  apona-shevaroyensis(0.090)  dalima-truncataria(0.044)  
dysphania-percota        : [32mdysphania-percota[0m(0.998)  dysphania-percota-early(0.001)  dysphania-palmyra(0.001)  
eupterote-undata-2       : hamodes-propitia(0.702)  [32meupterote-undata[0m(0.271)  eupterote-gardneri(0.007)  
eupterote-undata         : [32meupterote-undata[0m(0.997)  eupterote-patula(0.002)  cricula-trifenestrata(0.000)  
graphium-agamemnon       : erebus-ephesperis(0.392)  nausinoe-pueritia(0.062)  nyctemera-adversata(0.055)  
hippotion-rosetta-2      : [32mhippotion-rosetta[0m(0.406)  hippotion-boerhaviae(0.354)  cechetra-m

In [15]:
test_top_k(model_data, f"{dataset_dir}/test2", 3, print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/test2", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/test2", 10, print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 132/151 -> 87.42%, genus matched: 147/151 -> 97.35%
Top   3 accuracy: 149/151 -> 98.68%, genus matched: 149/151 -> 98.68%
Top   5 accuracy: 149/151 -> 98.68%, genus matched: 149/151 -> 98.68%
Top  10 accuracy: 149/151 -> 98.68%, genus matched: 149/151 -> 98.68%


In [16]:
pred = validate_prediction_in_dir_top_k(f"{dataset_dir}/val", model_data, 3)
print (f"Top 3 accuracy: {pred['success']}/{pred['total']} -> {100*pred['success']/pred['total']:.2f}%")

Top 3 accuracy: 152/154 -> 98.70%


### Model H (resnet-101 + image transform pipeline fixed)

In [17]:
training_params = [
    { "idx": 1, "robustness": 0.2, "break_at_val_acc_diff": 0.05},
    { "idx": 2, "robustness": 0.5, "break_at_val_acc_diff": 0.02},
    { "idx": 3, "robustness": 1.0, "break_at_val_acc_diff": 0.01},
    { "idx": 4, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 5, "robustness": 2.0, "break_at_val_acc_diff": -0.000001},
    { "idx": 6, "robustness": 2.0, "break_at_val_acc_diff": -0.000001}
]
start_time = time.time()
print("Started at:", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
for param in training_params:
    print(f"Phase {param["idx"]}:")
    if param["idx"] == 1:
        model_data = init_model_for_training(f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                             batch_size=32, arch="resnet101", image_size=224, robustness=param["robustness"],
                                             lr=1e-4, weight_decay=1e-4, silent=True)
    else:
        model_data = prepare_for_retraining(model_data, f'{dataset_dir}/data', f'{dataset_dir}/val', 
                                            batch_size=32, image_size=224, robustness=param["robustness"], silent=True)
    train(model_data, 5, f"{dataset_dir}/checkpoint.moth.th.ep{param["idx"]:02}###.pth", 
          break_at_val_acc_diff=param["break_at_val_acc_diff"])
    print(f"Total elapsed time: {datetime.timedelta(seconds=(time.time() - start_time))}")

Started at: 2025-03-27 04:56:36
Phase 1:


Downloading: "https://download.pytorch.org/models/resnet101-cd907fc2.pth" to C:\Users\rakes/.cache\torch\hub\checkpoints\resnet101-cd907fc2.pth
100.0%


Epoch    0 /    4  | Train Loss: 2.1664 Acc: 0.6003  | Val Loss: 0.8757 Acc: 0.7987  | Elapsed time: 0:59:52.722753
Epoch    1 /    4  | Train Loss: 0.7369 Acc: 0.8139  | Val Loss: 0.7640 Acc: 0.8052  | Elapsed time: 1:45:44.064755
Total elapsed time: 1:46:13.595109
Phase 2:
Epoch    0 /    4  | Train Loss: 1.3533 Acc: 0.7170  | Val Loss: 0.7684 Acc: 0.8312  | Elapsed time: 0:53:03.180269
Epoch    1 /    4  | Train Loss: 1.2025 Acc: 0.7451  | Val Loss: 0.8157 Acc: 0.8312  | Elapsed time: 1:41:28.953513
Total elapsed time: 3:27:44.398430
Phase 3:
Epoch    0 /    4  | Train Loss: 1.2168 Acc: 0.7450  | Val Loss: 0.8137 Acc: 0.8182  | Elapsed time: 0:52:58.933432
Epoch    1 /    4  | Train Loss: 1.1582 Acc: 0.7549  | Val Loss: 0.8676 Acc: 0.8117  | Elapsed time: 1:43:49.781488
Total elapsed time: 5:11:36.163043
Phase 4:
Epoch    0 /    4  | Train Loss: 1.1993 Acc: 0.7482  | Val Loss: 0.7673 Acc: 0.8247  | Elapsed time: 0:53:00.436832
Epoch    1 /    4  | Train Loss: 0.9765 Acc: 0.7943  | V

In [18]:
model_data = torch.load(f"{dataset_dir}/checkpoint.moth.th.ep060001.pth", weights_only=False)

In [19]:
test_top_k(model_data, f"{dataset_dir}/test", 3, print_preds=False, print_top1_accuracy=True, print_no_match=True)
test_top_k(model_data, f"{dataset_dir}/test", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/test", 10, print_preds=False, print_top1_accuracy=False)

unidentified-moth-2: 
	 [31morgyia-postica-early[0m(0.448)  [31mcifuna-locuples-early[0m(0.226)  [31molene-mendosa[0m(0.155)  
unidentified-moth-3a: 
	 [31mnoctuides-melanophia[0m(0.119)  [31mcleora-injectaria[0m(0.087)  [31mzitha-tactilis[0m(0.050)  
unidentified-moth-3b: 
	 [31mpolydesma-boarmoides[0m(0.272)  [31mlocastra-muscosalis[0m(0.204)  [31maporodes-floralis[0m(0.155)  
unidentified-moth-4a: 
	 [31mhypomecis-transcissa[0m(0.301)  [31mcleora-injectaria[0m(0.219)  [31mbiston-betularia[0m(0.124)  
unidentified-moth-4b: 
	 [31mcleora-injectaria[0m(0.354)  [31mcusiala-boarmoides[0m(0.105)  [31mhypomecis-infixaria[0m(0.092)  
unidentified-moth-5: 
	 [31mlaelia-exclamationis[0m(0.432)  [31mmythimna-separata[0m(0.205)  [31mspodoptera-mauritia[0m(0.084)  
Top   1 accuracy: 8/11 -> 72.73%, genus matched: 10/11 -> 90.91%
Top   3 accuracy: 11/11 -> 100.00%, genus matched: 11/11 -> 100.00%
Top   5 accuracy: 11/11 -> 100.00%, genus matched: 11/11 -> 100.0

In [20]:
test_top_k(model_data, f"{dataset_dir}/test2", 3, print_preds=False, print_top1_accuracy=True, print_no_match=False)
test_top_k(model_data, f"{dataset_dir}/test2", 5, print_preds=False, print_top1_accuracy=False)
test_top_k(model_data, f"{dataset_dir}/test2", 10, print_preds=False, print_top1_accuracy=False)

Top   1 accuracy: 133/151 -> 88.08%, genus matched: 145/151 -> 96.03%
Top   3 accuracy: 145/151 -> 96.03%, genus matched: 146/151 -> 96.69%
Top   5 accuracy: 146/151 -> 96.69%, genus matched: 147/151 -> 97.35%
Top  10 accuracy: 148/151 -> 98.01%, genus matched: 149/151 -> 98.68%


In [21]:
pred = validate_prediction_in_dir_top_k(f"{dataset_dir}/val", model_data, 3)
print (f"Top 3 accuracy: {pred['success']}/{pred['total']} -> {100*pred['success']/pred['total']:.2f}%")

Top 3 accuracy: 148/154 -> 96.10%
