# Target
1. Try with cured data

# Prerequisites

In [2]:
# NOTE(review): no other imports appear in this notebook, so the star import is
# presumably what brings Path, re, shutil, datetime, torch and the helpers used
# below (image_count, class_count, split_data_for_train_and_val, train, test,
# test_top_k, ...) into scope — confirm against mynnlib.
import mynnlib
from mynnlib import *

# Root folder of the moth dataset; every path used in later cells is built from it.
dataset_dir = "insect-dataset/moth"

# Cure dataset manually
1. Larva, pupa and egg images moved to separate classes
2. Mosaic images or images containing multiple subjects discarded or split
3. Unclear images discarded
4. Images with too small a subject cropped
5. <strong>Pending:</strong> Decide what to do with `*-spp` classes (they seem to contain mixed species under the same genus)

In [28]:
# Normalise the class-folder names in every dataset variant:
#   * lower-case the name,
#   * drop everything from a URL-encoded space ("%20") onwards,
#   * drop a trailing "-group".
for dataset in ["data", "cured-data", "discarded"]:
    dataset_path = Path(f"{dataset_dir}/{dataset}")
    # Snapshot the listing before renaming anything: whether entries that are
    # added/removed while a directory iterator is open get visited is unspecified.
    for species_dir in list(dataset_path.iterdir()):
        if not species_dir.is_dir():
            continue
        lowered = species_dir.name.lower()
        new_name = re.sub(r"(%20.*)|(-group$)", "", lowered)
        # Nothing to do when the name is already normalised; an empty result
        # (name starting with "%20") is not a usable folder name either.
        if not new_name or new_name == species_dir.name:
            continue
        target = species_dir.with_name(new_name)
        # Two folders can normalise to the same name. Renaming onto an existing
        # folder either fails midway or (POSIX, empty target) silently replaces it,
        # so report the collision instead. samefile() lets case-only renames
        # through on case-insensitive file systems.
        if target.exists() and not target.samefile(species_dir):
            print(f"Skipping {species_dir}: {target} already exists, merge manually")
            continue
        # As before, only suffix removals are reported; pure lower-casing is silent.
        if new_name != lowered:
            print(f"Renaming {species_dir} to {new_name}")
        species_dir.rename(target)

Renaming insect-dataset\moth\data\rinaca-grotei-species-group to rinaca-grotei-species
Renaming insect-dataset\moth\data\synegiodes-diffusifascia-obliquifascia-group to synegiodes-diffusifascia-obliquifascia
Renaming insect-dataset\moth\cured-data\rinaca-grotei-species-group to rinaca-grotei-species
Renaming insect-dataset\moth\cured-data\synegiodes-diffusifascia-obliquifascia-group to synegiodes-diffusifascia-obliquifascia


In [2]:
# Report image / class totals for the original, cured and discarded folders.
original_dir = f"{dataset_dir}/data"
cured_dir = f"{dataset_dir}/cured-data"
discarded_dir = f"{dataset_dir}/discarded"

print(f"Original     image count: {image_count(original_dir):6},    class count: {class_count(original_dir):6}")
print(f"Cured        image count: {image_count(cured_dir):6},    class count: {class_count(cured_dir):6}")
print(f"Discarded    image count: {image_count(discarded_dir):6}")

Original     image count:  44325,    class count:   3051
Cured        image count:  43400,    class count:   3418
Discarded    image count:    914


In [3]:
# Break the cured class count down by life stage, using the class-name suffix.
cured_dir = f"{dataset_dir}/cured-data"
life_stage_patterns = [
    # Adult moths: any class name that does NOT end in -larva / -pupa / -egg.
    ("Moth class count  ", r"^(?!.+(?:-(larva|pupa|egg))$).+$"),
    ("Larva class count ", r"^.+-larva$"),
    ("Pupa class count  ", r"^.+-pupa$"),
    ("Egg class count   ", r"^.+-egg$"),
]
for label, name_regex in life_stage_patterns:
    print(f"{label}: {class_count(cured_dir, name_regex):6}")

Moth class count  :   3051
Larva class count :    197
Pupa class count  :    150
Egg class count   :     20


-------------------

# A) Train 25 epochs with a subset
To see whether curing the data is effective.

In [2]:
# Split the cured data into test / val / train folders, keeping only classes
# whose name matches ^[a-c].*$ (i.e. starts with a, b or c) for a quick experiment.
split_data_for_train_and_val(
    f"{dataset_dir}/cured-data",
    f"{dataset_dir}/splits/test",
    f"{dataset_dir}/splits/val",
    f"{dataset_dir}/splits/train",
    test_data_weight=0.05,
    val_data_weight=0.1,
    min_file_cnt_for_val=4,
    class_name_filter_regex=r"^[a-c].*$",
)

Class count: 995
Training data count: 10859
Validation data count: 1210
Test data count: 568


In [3]:
# Set up the training state (resnet152, image_size=224, batches of 32)
# from the subset train / val splits.
model_data = init_model_for_training(
    f"{dataset_dir}/splits/train",
    f"{dataset_dir}/splits/val",
    batch_size=32,
    arch="resnet152",
    image_size=224,
)

train class count: 995
val class count: 459
feature count: 2048
device: cuda:0


In [6]:
# Train for 25 epochs, then keep a dated copy of the resulting checkpoint.
latest_checkpoint = f"{dataset_dir}/checkpoint_latest.pth"
train(model_data, 25, latest_checkpoint)

# The date stamp is taken after training finishes, so it reflects the end of the run.
date_stamp = datetime.datetime.now().strftime("%Y.%m.%d")
shutil.copy(latest_checkpoint, f"{dataset_dir}/checkpoint.resnet152.{date_stamp}.subset.pth")

Epoch    1 /   25  | Train Loss: 5.2367 Acc: 0.1386  | Val Loss: 4.0382 Acc: 0.2281  | Elapsed time: 0:03:50.423420
Epoch    2 /   25  | Train Loss: 3.0725 Acc: 0.3604  | Val Loss: 2.7407 Acc: 0.3934  | Elapsed time: 0:06:37.804391
Epoch    3 /   25  | Train Loss: 1.8328 Acc: 0.5474  | Val Loss: 2.3702 Acc: 0.4653  | Elapsed time: 0:09:25.967207
Epoch    4 /   25  | Train Loss: 1.1282 Acc: 0.6995  | Val Loss: 1.8240 Acc: 0.5628  | Elapsed time: 0:12:14.168783
Epoch    5 /   25  | Train Loss: 0.7427 Acc: 0.7899  | Val Loss: 2.3665 Acc: 0.5430  | Elapsed time: 0:15:01.693821
Epoch    6 /   25  | Train Loss: 0.5075 Acc: 0.8507  | Val Loss: 1.7807 Acc: 0.6157  | Elapsed time: 0:17:49.623621
Epoch    7 /   25  | Train Loss: 0.3627 Acc: 0.8930  | Val Loss: 1.6757 Acc: 0.6603  | Elapsed time: 0:20:37.777056
Epoch    8 /   25  | Train Loss: 0.1400 Acc: 0.9632  | Val Loss: 1.3546 Acc: 0.7281  | Elapsed time: 0:23:28.722778
Epoch    9 /   25  | Train Loss: 0.0763 Acc: 0.9841  | Val Loss: 1.3383 

'insect-dataset/moth/checkpoint.resnet152.2025.01.29.subset.pth'

In [7]:
# Measure accuracy on the test split created from the a-c subset.
test_split_dir = f"{dataset_dir}/splits/test"
test(model_data, test_split_dir, False)

Accuracy: 448 / 568 -> 78.87%
Elapsed time: 0:00:21.461196


In [8]:
# Top-4 predictions on the hand-picked test images.
# NOTE(review): this cell reads splits/my-test while later sections read my-test — confirm which folder is intended.
my_test_dir = f"{dataset_dir}/splits/my-test"
test_top_k(model_data, my_test_dir, 4)

acidon-nigrobasis        : bradina-spp(0.750)  achaea-janata(0.104)  aphendala-spp(0.029)  aroa-spp(0.026)  
Adoxophyes-privatana     : agathodes-ostentalis(0.888)  adoxophyes-spp(0.030)  archips-spp(0.027)  chalcoscelides-castaneipars(0.014)  
alcanola-speideli        : alcanola-tympanistis(1.000)  casminola-spp(0.000)  achaea-janata(0.000)  araeopteron-proleuca(0.000)  
artena-dotata-2          : artena-dotata(0.980)  calyptra-spp(0.006)  artena-submira(0.002)  botyodes-asialis(0.002)  
artena-dotata            : artena-dotata(1.000)  chilkasa-falcata(0.000)  calyptra-spp(0.000)  artena-submira(0.000)  
artena-submira-2         : achaea-janata(0.936)  artena-dotata(0.052)  crithote-spp(0.008)  artena-submira(0.001)  
artena-submira           : artena-submira(0.797)  carea-angulata(0.123)  artena-dotata(0.029)  aiteta-musculina(0.019)  
----------
Top 4 accuracy: 4 / 7 -> 0.571
Top 1 accuracy: 3 / 7 -> 0.429


In [9]:
# Top-10 accuracy on the hand-picked test images, without the per-image listing.
# NOTE(review): this cell reads splits/my-test while later sections read my-test — confirm which folder is intended.
my_test_dir = f"{dataset_dir}/splits/my-test"
test_top_k(model_data, my_test_dir, 10, print_preds=False)

Top 10 accuracy: 5 / 7 -> 0.714
Top 1 accuracy: 3 / 7 -> 0.429


---------------------

# B.1) Train 25 epochs with 85% data

In [6]:
# Split the whole cured dataset (no class-name filter) into test / val / train folders.
split_data_for_train_and_val(
    f"{dataset_dir}/cured-data",
    f"{dataset_dir}/splits/test",
    f"{dataset_dir}/splits/val",
    f"{dataset_dir}/splits/train",
    test_data_weight=0.05,
    val_data_weight=0.1,
    min_file_cnt_for_val=4,
)

Class count: 3418
Total data count: 43400
Training data count: 37121
Validation data count: 4156
Test data count: 2123


In [7]:
# Set up the training state (resnet152, image_size=224, batches of 32)
# from the full-dataset train / val splits.
model_data = init_model_for_training(
    f"{dataset_dir}/splits/train",
    f"{dataset_dir}/splits/val",
    batch_size=32,
    arch="resnet152",
    image_size=224,
)

train class count: 3418
val class count: 1497
feature count: 2048
device: cuda:0


In [8]:
# Train for 25 epochs on the train split, then keep a dated copy of the checkpoint.
latest_checkpoint = f"{dataset_dir}/checkpoint_latest.pth"
train(model_data, 25, latest_checkpoint)

# The date stamp is taken after training finishes, so it reflects the end of the run.
date_stamp = datetime.datetime.now().strftime("%Y.%m.%d")
shutil.copy(latest_checkpoint, f"{dataset_dir}/checkpoint.resnet152.{date_stamp}.pth")

Epoch    1 /   25  | Train Loss: 6.7675 Acc: 0.0489  | Val Loss: 5.6538 Acc: 0.0972  | Elapsed time: 0:10:10.929575
Epoch    2 /   25  | Train Loss: 4.7947 Acc: 0.1798  | Val Loss: 4.0667 Acc: 0.2457  | Elapsed time: 0:20:32.012749
Epoch    3 /   25  | Train Loss: 3.1786 Acc: 0.3552  | Val Loss: 3.0163 Acc: 0.3953  | Elapsed time: 0:30:32.945827
Epoch    4 /   25  | Train Loss: 2.0249 Acc: 0.5286  | Val Loss: 2.4765 Acc: 0.4824  | Elapsed time: 0:40:38.863328
Epoch    5 /   25  | Train Loss: 1.3507 Acc: 0.6614  | Val Loss: 2.2484 Acc: 0.5371  | Elapsed time: 0:50:44.304387
Epoch    6 /   25  | Train Loss: 0.9185 Acc: 0.7590  | Val Loss: 1.9674 Acc: 0.5970  | Elapsed time: 1:00:40.249432
Epoch    7 /   25  | Train Loss: 0.6144 Acc: 0.8305  | Val Loss: 1.9842 Acc: 0.6071  | Elapsed time: 1:10:35.976399
Epoch    8 /   25  | Train Loss: 0.2617 Acc: 0.9320  | Val Loss: 1.5273 Acc: 0.7012  | Elapsed time: 1:20:36.299188
Epoch    9 /   25  | Train Loss: 0.1662 Acc: 0.9582  | Val Loss: 1.5249 

'insect-dataset/moth/checkpoint.resnet152.2025.01.30.pth'

In [9]:
# Measure accuracy on the test split; this model was trained on splits/train only.
test_split_dir = f"{dataset_dir}/splits/test"
test(model_data, test_split_dir, False)

Accuracy: 1524 / 2123 -> 71.79%
Elapsed time: 0:01:07.207081


In [10]:
# Top-4 predictions on the hand-picked test images.
my_test_dir = f"{dataset_dir}/my-test"
test_top_k(model_data, my_test_dir, 4)

acidon-nigrobasis        : metanastria-spp(0.599)  dichromia-sagitta(0.376)  westermannia-argentea(0.011)  euthrix-laeta(0.005)  
Adoxophyes-privatana     : eoophyla-spp(0.410)  parapoynx-bilinealis(0.136)  talanga-sexpunctalis(0.130)  limacodinae-genera-spp(0.043)  
alcanola-speideli        : alcanola-tympanistis(0.887)  nolinae-genera-spp(0.065)  alcanola-speideli(0.024)  nola-internella-analis-complex(0.007)  
artena-dotata-2          : artena-dotata(0.992)  simplicia-spp(0.006)  simplicia-schaldusalis(0.001)  simplicia-bimarginata(0.000)  
artena-dotata            : chilkasa-falcata(0.546)  artena-dotata(0.449)  simplicia-bimarginata(0.004)  bastilla-praetermissa(0.000)  
artena-submira-2         : achaea-janata(0.523)  buzara-onelia(0.307)  artena-submira(0.162)  artena-dotata(0.002)  
artena-submira           : artena-dotata(0.498)  artena-submira(0.162)  hulodes-caranea(0.147)  episparis-tortuosalis(0.096)  
clanis-phalaris-2        : clanis-phalaris(0.738)  clanis-undulosa(0.10

In [11]:
# Top-10 accuracy on the hand-picked test images, without the per-image listing.
my_test_dir = f"{dataset_dir}/my-test"
test_top_k(model_data, my_test_dir, 10, print_preds=False)

Top 10 accuracy: 17 / 25 -> 0.680
Top 1 accuracy: 9 / 25 -> 0.360


# B.2) Train 10 more epochs with 100% data 

In [22]:
# Reload the previously trained model as the starting point for further training.
# NOTE(review): no visible cell writes a file with this name (the training cells
# save checkpoint.resnet152.<date>[.suffix].pth) — presumably the checkpoint was
# renamed by hand; confirm the file exists before a fresh run.
# weights_only=False unpickles arbitrary Python objects: only load checkpoints you trust.
checkpoint_path = f"{dataset_dir}/checkpoint.resnet152.2025.01.30.subset.85%.pth"
model_data = torch.load(checkpoint_path, weights_only=False)

In [23]:
# Re-point the loaded model at the complete cured dataset for further training,
# keeping splits/val as the validation folder.
# NOTE(review): splits/val and splits/test were produced from cured-data by the
# split cell, so presumably their images are also in this training folder and
# subsequent val/test figures are not held-out — confirm.
model_data = prepare_for_retraining(
    model_data,
    f"{dataset_dir}/cured-data",
    f"{dataset_dir}/splits/val",
)

train class count: 3418
val class count: 1497
0 new classes added: []
feature count: 2048
device: cuda:0


In [25]:
# Train for 10 more epochs, then keep a dated copy of the checkpoint.
latest_checkpoint = f"{dataset_dir}/checkpoint_latest.pth"
train(model_data, 10, latest_checkpoint)

# The date stamp is taken after training finishes, so it reflects the end of the run.
date_stamp = datetime.datetime.now().strftime("%Y.%m.%d")
shutil.copy(latest_checkpoint, f"{dataset_dir}/checkpoint.resnet152.{date_stamp}.final.pth")

Epoch    1 /   10  | Train Loss: 0.2908 Acc: 0.9493  | Val Loss: 1.5148 Acc: 0.7310  | Elapsed time: 0:11:16.128621
Epoch    2 /   10  | Train Loss: 0.2859 Acc: 0.9500  | Val Loss: 1.4736 Acc: 0.7363  | Elapsed time: 0:22:29.273807
Epoch    3 /   10  | Train Loss: 0.2851 Acc: 0.9504  | Val Loss: 1.4532 Acc: 0.7418  | Elapsed time: 0:33:24.596322
Epoch    4 /   10  | Train Loss: 0.2815 Acc: 0.9505  | Val Loss: 1.4392 Acc: 0.7411  | Elapsed time: 0:44:20.138880
Epoch    5 /   10  | Train Loss: 0.2796 Acc: 0.9520  | Val Loss: 1.4510 Acc: 0.7416  | Elapsed time: 0:55:14.220325
Epoch    6 /   10  | Train Loss: 0.2776 Acc: 0.9509  | Val Loss: 1.4569 Acc: 0.7428  | Elapsed time: 1:06:08.409398
Epoch    7 /   10  | Train Loss: 0.2835 Acc: 0.9503  | Val Loss: 1.4415 Acc: 0.7404  | Elapsed time: 1:17:13.398654
Epoch    8 /   10  | Train Loss: 0.2787 Acc: 0.9515  | Val Loss: 1.4494 Acc: 0.7385  | Elapsed time: 1:28:23.434888
Epoch    9 /   10  | Train Loss: 0.2779 Acc: 0.9512  | Val Loss: 1.4318 

'insect-dataset/moth/checkpoint.resnet152.2025.01.30.final.pth'

In [26]:
# Measure accuracy on the test split.
# NOTE(review): the model was just retrained on all of cured-data, from which
# splits/test was drawn, so presumably this is no longer a held-out score — confirm.
test_split_dir = f"{dataset_dir}/splits/test"
test(model_data, test_split_dir, False)

Accuracy: 1549 / 2123 -> 72.96%
Elapsed time: 0:01:04.509015


In [27]:
# Top-4 predictions on the hand-picked test images.
my_test_dir = f"{dataset_dir}/my-test"
test_top_k(model_data, my_test_dir, 4)

acidon-nigrobasis        : metanastria-spp(0.652)  dichromia-sagitta(0.317)  euthrix-laeta(0.016)  westermannia-argentea(0.005)  
Adoxophyes-privatana     : parapoynx-bilinealis(0.214)  eoophyla-spp(0.183)  talanga-sexpunctalis(0.140)  westermannia-superba(0.054)  
alcanola-speideli        : alcanola-tympanistis(0.975)  nolinae-genera-spp(0.012)  alcanola-speideli(0.006)  casminola-spp(0.003)  
artena-dotata-2          : artena-dotata(0.998)  simplicia-spp(0.001)  simplicia-schaldusalis(0.001)  mocis-undata(0.000)  
artena-dotata            : artena-dotata(0.765)  chilkasa-falcata(0.231)  bastilla-praetermissa(0.002)  simplicia-bimarginata(0.001)  
artena-submira-2         : achaea-janata(0.489)  artena-submira(0.402)  buzara-onelia(0.092)  artena-dotata(0.010)  
artena-submira           : artena-dotata(0.394)  artena-submira(0.293)  episparis-tortuosalis(0.127)  hulodes-caranea(0.077)  
clanis-phalaris-2        : clanis-phalaris(0.834)  clanis-undulosa(0.082)  clanidopsis-exusta(0.037

In [28]:
# Top-10 accuracy on the hand-picked test images, without the per-image listing.
my_test_dir = f"{dataset_dir}/my-test"
test_top_k(model_data, my_test_dir, 10, print_preds=False)

Top 10 accuracy: 18 / 25 -> 0.720
Top 1 accuracy: 10 / 25 -> 0.400


------------------

# C) Train a new model for 25 epochs with 100% data
because 10 more epochs with 100% of the data were not very effective on the existing model

In [30]:
# Set up a new training state (resnet152, image_size=224, batches of 32) that
# trains on the complete cured dataset and validates on splits/val.
# NOTE(review): splits/val and splits/test were produced from cured-data by the
# split cell, so presumably their images are also in this training folder and
# the val/test figures below are not held-out — confirm.
model_data = init_model_for_training(
    f"{dataset_dir}/cured-data",
    f"{dataset_dir}/splits/val",
    batch_size=32,
    arch="resnet152",
    image_size=224,
)

train class count: 3418
val class count: 1497
feature count: 2048
device: cuda:0


In [31]:
# Train for 25 epochs on the full dataset, then keep a dated copy of the checkpoint.
latest_checkpoint = f"{dataset_dir}/checkpoint_latest.pth"
train(model_data, 25, latest_checkpoint)

# The date stamp is taken after training finishes, so it reflects the end of the run.
date_stamp = datetime.datetime.now().strftime("%Y.%m.%d")
shutil.copy(latest_checkpoint, f"{dataset_dir}/checkpoint.resnet152.{date_stamp}.final2.pth")

Epoch    1 /   25  | Train Loss: 6.8505 Acc: 0.0372  | Val Loss: 5.4477 Acc: 0.1032  | Elapsed time: 0:10:59.910631
Epoch    2 /   25  | Train Loss: 5.0939 Acc: 0.1475  | Val Loss: 3.8136 Acc: 0.2538  | Elapsed time: 0:22:07.756667
Epoch    3 /   25  | Train Loss: 3.5369 Acc: 0.3106  | Val Loss: 2.3520 Acc: 0.4622  | Elapsed time: 0:33:06.047675
Epoch    4 /   25  | Train Loss: 2.3449 Acc: 0.4740  | Val Loss: 1.5299 Acc: 0.6299  | Elapsed time: 0:44:16.652515
Epoch    5 /   25  | Train Loss: 1.5730 Acc: 0.6142  | Val Loss: 0.9443 Acc: 0.7606  | Elapsed time: 0:55:22.436386
Epoch    6 /   25  | Train Loss: 1.0853 Acc: 0.7201  | Val Loss: 0.6718 Acc: 0.8219  | Elapsed time: 1:06:31.106933
Epoch    7 /   25  | Train Loss: 0.7697 Acc: 0.7933  | Val Loss: 0.6467 Acc: 0.8198  | Elapsed time: 1:17:35.935136
Epoch    8 /   25  | Train Loss: 0.3484 Acc: 0.9092  | Val Loss: 0.1566 Acc: 0.9622  | Elapsed time: 1:28:44.654650
Epoch    9 /   25  | Train Loss: 0.2454 Acc: 0.9377  | Val Loss: 0.1160 

'insect-dataset/moth/checkpoint.resnet152.2025.01.30.final2.pth'

In [32]:
# Measure accuracy on the test split.
# NOTE(review): this model was trained on all of cured-data, from which
# splits/test was drawn, so presumably the test images were seen during training
# and the score overstates real-world accuracy — confirm.
test_split_dir = f"{dataset_dir}/splits/test"
test(model_data, test_split_dir, False)

Accuracy: 2095 / 2123 -> 98.68%
Elapsed time: 0:01:00.806047


# Retest with new data

In [3]:
# Reload the saved model for a retest on new hand-picked images.
# NOTE(review): no visible cell writes a file with this name — presumably a
# checkpoint from above was renamed by hand; confirm the file exists before a fresh run.
# weights_only=False unpickles arbitrary Python objects: only load checkpoints you trust.
checkpoint_path = f"{dataset_dir}/checkpoint.2025.01.30.resnet152.mothsofindia.cured.25x100%.pth"
model_data = torch.load(checkpoint_path, weights_only=False)

In [7]:
# Top-3 with the per-image listing, then top-5 and top-10 as summary lines only.
my_test_dir = f"{dataset_dir}/my-test"
summary_only = dict(print_preds=False, print_top1_accuracy=False)

test_top_k(model_data, my_test_dir, 3)
test_top_k(model_data, my_test_dir, 5, **summary_only)
test_top_k(model_data, my_test_dir, 10, **summary_only)

acidon-nigrobasis             : chrysopera-combinans(0.251)  lebeda-nobilis(0.237)  miaromima-cornucopia(0.237)  
acosmeryx-anceus              : macroglossum-gyrans(0.803)  eudocima-phalonia(0.120)  cretonia-spp(0.020)  
Adoxophyes-privatana          : adoxophyes-spp(0.940)  westermannia-superba(0.032)  tortricidae-genera-spp(0.018)  
agnidra-vinacea               : agnidra-vinacea(0.907)  fascellina-inornata(0.040)  eupterote-spp(0.036)  
alcanola-speideli             : alcanola-tympanistis(0.924)  nola-spp(0.033)  nola-internella-analis-complex(0.015)  
alcanola-spp-2                : alcanola-tympanistis(0.924)  nola-spp(0.033)  nola-internella-analis-complex(0.015)  
alcanola-spp                  : ptisciana-seminivea(0.595)  alcanola-speideli(0.367)  spilosomina-genera-spp(0.008)  
alcanola-tympanistis-2        : ptisciana-seminivea(0.649)  calliteara-spp(0.165)  alcanola-speideli(0.135)  
alcanola-tympanistis          : alcanola-speideli(0.849)  nolinae-genera-spp(0.053)  neophe

### Overall, results seem worse than with raw/uncured data