# Target
1. Try with cured data

# Prerequisites

In [1]:
# Standard-library names used by later cells (Path, re, shutil, datetime).
# They are imported explicitly so the notebook does not depend on
# `from mynnlib import *` happening to re-export them. They come first so the
# star import below still binds exactly what it bound before.
import datetime
import re
import shutil
from pathlib import Path

import mynnlib
from mynnlib import *  # project helpers called below: split_data_for_train_and_val, init_model_for_training, train, test, test_top_k

# Root directory of the moth dataset; every path in this notebook is built from it.
dataset_dir = "insect-dataset/moth"

# Cure dataset manually
1. larva, pupa & egg images moved to separate classes
2. mosaic images or images containing multiple subjects discarded or split
3. unclear images discarded
4. images with too small a subject cropped
5. all directory/class names lowercased
6. directory/class names fixed (e.g. herpetogramma-rudis%20species-group renamed to herpetogramma-rudis)
7. <strong>Pending:</strong> decide what to do with `*-spp` classes (they seem to contain mixed species of the same genus)

In [17]:
# Normalise class-directory names in every dataset folder: lowercase them and
# drop everything from the first URL-encoded space ("%20") onwards, e.g.
# "herpetogramma-rudis%20species-group" -> "herpetogramma-rudis".
for dataset in ["data", "cured-data", "discarded"]:
    dataset_root = Path(f"{dataset_dir}/{dataset}")
    # Snapshot the listing before renaming anything: pathlib leaves it
    # unspecified whether entries added/removed during iterdir() are visited.
    for species_dir in sorted(dataset_root.iterdir()):
        if not species_dir.is_dir():
            continue
        new_name = species_dir.name.lower()
        has_encoded_suffix = "%20" in species_dir.name
        if has_encoded_suffix:
            new_name = re.sub(r"%20.*", "", new_name)
        if new_name == species_dir.name:
            continue  # already normalised; nothing to rename
        target_dir = dataset_root / new_name
        # Another directory may already own the normalised name (e.g. both "x"
        # and "x%20species-group" exist). Path.rename would then raise, or on
        # POSIX silently replace an empty directory, so report it and leave the
        # merge to be done by hand. samefile() keeps case-only renames working
        # on case-insensitive filesystems, where the target "exists" because it
        # is the source directory itself.
        if target_dir.exists() and not target_dir.samefile(species_dir):
            print(f"Skipping {species_dir}: {target_dir} already exists")
            continue
        if has_encoded_suffix:
            print(f"Renaming {species_dir} to {new_name}")
        species_dir.rename(target_dir)

Renaming insect-dataset\moth\data\herpetogramma-rudis%20species-group to herpetogramma-rudis


# Train and test with a cured data subset

In [2]:
# Split the cured images into test / val / train folders under splits/.
# Only classes whose directory name starts with a, b or c are included,
# which keeps this first experiment small.
split_data_for_train_and_val(
    f"{dataset_dir}/cured-data",    # source images
    f"{dataset_dir}/splits/test",   # test output folder
    f"{dataset_dir}/splits/val",    # validation output folder
    f"{dataset_dir}/splits/train",  # training output folder
    test_data_weight=0.05,
    val_data_weight=0.1,
    min_file_cnt_for_val=4,
    class_name_filter_regex=r"^[a-c].*$",
)

Class count: 995
Training data count: 10859
Validation data count: 1210
Test data count: 568


In [3]:
# Set up a ResNet-152 training run from the train and val split folders
# (batches of 32, image_size=224).
model_data = init_model_for_training(
    f"{dataset_dir}/splits/train",
    f"{dataset_dir}/splits/val",
    batch_size=32,
    arch="resnet152",
    image_size=224,
)

train class count: 995
val class count: 459
feature count: 2048
device: cuda:0


In [6]:
# Train for 25 epochs, passing checkpoint_latest.pth as the checkpoint path.
train(model_data, 25, f"{dataset_dir}/checkpoint_latest.pth")
# Keep a dated copy of the checkpoint for this subset run. The date uses a
# format spec rather than strftime("...") so that no double quotes are nested
# inside the double-quoted f-string, which is a SyntaxError before Python 3.12.
# The resulting file name is unchanged (e.g. checkpoint.resnet152.2025.01.29.subset.pth).
shutil.copy(
    f"{dataset_dir}/checkpoint_latest.pth",
    f"{dataset_dir}/checkpoint.resnet152.{datetime.datetime.now():%Y.%m.%d}.subset.pth",
)

Epoch    1 /   25  | Train Loss: 5.2367 Acc: 0.1386  | Val Loss: 4.0382 Acc: 0.2281  | Elapsed time: 0:03:50.423420
Epoch    2 /   25  | Train Loss: 3.0725 Acc: 0.3604  | Val Loss: 2.7407 Acc: 0.3934  | Elapsed time: 0:06:37.804391
Epoch    3 /   25  | Train Loss: 1.8328 Acc: 0.5474  | Val Loss: 2.3702 Acc: 0.4653  | Elapsed time: 0:09:25.967207
Epoch    4 /   25  | Train Loss: 1.1282 Acc: 0.6995  | Val Loss: 1.8240 Acc: 0.5628  | Elapsed time: 0:12:14.168783
Epoch    5 /   25  | Train Loss: 0.7427 Acc: 0.7899  | Val Loss: 2.3665 Acc: 0.5430  | Elapsed time: 0:15:01.693821
Epoch    6 /   25  | Train Loss: 0.5075 Acc: 0.8507  | Val Loss: 1.7807 Acc: 0.6157  | Elapsed time: 0:17:49.623621
Epoch    7 /   25  | Train Loss: 0.3627 Acc: 0.8930  | Val Loss: 1.6757 Acc: 0.6603  | Elapsed time: 0:20:37.777056
Epoch    8 /   25  | Train Loss: 0.1400 Acc: 0.9632  | Val Loss: 1.3546 Acc: 0.7281  | Elapsed time: 0:23:28.722778
Epoch    9 /   25  | Train Loss: 0.0763 Acc: 0.9841  | Val Loss: 1.3383 

'insect-dataset/moth/checkpoint.resnet152.2025.01.29.subset.pth'

In [7]:
# Evaluate the trained model on the test split; prints overall accuracy and elapsed time.
# NOTE(review): the meaning of the positional `False` flag is defined in mynnlib — confirm before changing it.
test(model_data, f"{dataset_dir}/splits/test", False)

Accuracy: 448 / 568 -> 78.87%
Elapsed time: 0:00:21.461196


In [8]:
# Print the 4 highest-scoring predicted classes for each image in the my-test folder,
# followed by top-4 and top-1 accuracy.
test_top_k(model_data, f"{dataset_dir}/splits/my-test", 4)

acidon-nigrobasis        : bradina-spp(0.750)  achaea-janata(0.104)  aphendala-spp(0.029)  aroa-spp(0.026)  
Adoxophyes-privatana     : agathodes-ostentalis(0.888)  adoxophyes-spp(0.030)  archips-spp(0.027)  chalcoscelides-castaneipars(0.014)  
alcanola-speideli        : alcanola-tympanistis(1.000)  casminola-spp(0.000)  achaea-janata(0.000)  araeopteron-proleuca(0.000)  
artena-dotata-2          : artena-dotata(0.980)  calyptra-spp(0.006)  artena-submira(0.002)  botyodes-asialis(0.002)  
artena-dotata            : artena-dotata(1.000)  chilkasa-falcata(0.000)  calyptra-spp(0.000)  artena-submira(0.000)  
artena-submira-2         : achaea-janata(0.936)  artena-dotata(0.052)  crithote-spp(0.008)  artena-submira(0.001)  
artena-submira           : artena-submira(0.797)  carea-angulata(0.123)  artena-dotata(0.029)  aiteta-musculina(0.019)  
----------
Top 4 accuracy: 4 / 7 -> 0.571
Top 1 accuracy: 3 / 7 -> 0.429


In [9]:
# Same my-test images with k=10; print_preds=False suppresses the per-image lines
# so only the accuracy summary is shown.
test_top_k(model_data, f"{dataset_dir}/splits/my-test", 10, print_preds=False)

Top 10 accuracy: 5 / 7 -> 0.714
Top 1 accuracy: 3 / 7 -> 0.429


# Train and test with all cured data

In [12]:
# Split the cured images into test / val / train folders under splits/ again,
# this time for classes whose directory name starts with a through k.
# NOTE(review): this section is titled "all cured data", but the filter only
# admits names starting with a-k — confirm that cured-data currently holds no
# classes beyond "k", or widen the regex.
split_data_for_train_and_val(
    f"{dataset_dir}/cured-data",    # source images
    f"{dataset_dir}/splits/test",   # test output folder
    f"{dataset_dir}/splits/val",    # validation output folder
    f"{dataset_dir}/splits/train",  # training output folder
    test_data_weight=0.05,
    val_data_weight=0.1,
    min_file_cnt_for_val=4,
    class_name_filter_regex=r"^[a-k].*$",
)

Class count: 1810
Training data count: 19966
Validation data count: 2128
Test data count: 1135


In [None]:
# Set up a fresh ResNet-152 training run from the newly written train and val
# split folders (batches of 32, image_size=224).
model_data = init_model_for_training(
    f"{dataset_dir}/splits/train",
    f"{dataset_dir}/splits/val",
    batch_size=32,
    arch="resnet152",
    image_size=224,
)

In [None]:
# Train for 25 epochs, passing checkpoint_latest.pth as the checkpoint path.
train(model_data, 25, f"{dataset_dir}/checkpoint_latest.pth")
# Keep a dated copy of the checkpoint. The date uses a format spec rather than
# strftime("...") so that no double quotes are nested inside the double-quoted
# f-string, which is a SyntaxError before Python 3.12. The file name is unchanged.
shutil.copy(
    f"{dataset_dir}/checkpoint_latest.pth",
    f"{dataset_dir}/checkpoint.resnet152.{datetime.datetime.now():%Y.%m.%d}.pth",
)

In [None]:
# Evaluate the trained model on the test split.
# NOTE(review): the meaning of the positional `False` flag is defined in mynnlib — confirm before changing it.
test(model_data, f"{dataset_dir}/splits/test", False)

In [None]:
# Top-4 predictions for each image in the my-test folder.
# NOTE(review): the executed cells earlier in this notebook read the same images
# from f"{dataset_dir}/splits/my-test" — confirm which location is current.
test_top_k(model_data, f"{dataset_dir}/my-test", 4)

In [None]:
# Top-10 accuracy on the my-test folder; print_preds=False is passed to skip the per-image lines.
# NOTE(review): the executed cells earlier in this notebook read the same images
# from f"{dataset_dir}/splits/my-test" — confirm which location is current.
test_top_k(model_data, f"{dataset_dir}/my-test", 10, print_preds=False)

# Train on all cured data (used for the train phase only), then test

In [None]:
# Set up a ResNet-152 run that trains on the whole cured-data folder while
# still validating against splits/val (batches of 32, image_size=224).
# NOTE(review): splits/val was produced from cured-data; if the split helper
# copies rather than moves files, the validation images are also in the
# training set and validation accuracy will be optimistic — confirm.
model_data = init_model_for_training(
    f"{dataset_dir}/cured-data",
    f"{dataset_dir}/splits/val",
    batch_size=32,
    arch="resnet152",
    image_size=224,
)

In [None]:
# Train for 25 epochs, passing checkpoint_latest.pth as the checkpoint path.
train(model_data, 25, f"{dataset_dir}/checkpoint_latest.pth")
# Keep a dated copy of the checkpoint.
# - The copy gets its own ".train-only" suffix: the previous section saves to
#   checkpoint.resnet152.<date>.pth, so reusing that name on the same day
#   would overwrite that section's checkpoint.
# - The date uses a format spec rather than strftime("...") so that no double
#   quotes are nested inside the double-quoted f-string, which is a
#   SyntaxError before Python 3.12.
shutil.copy(
    f"{dataset_dir}/checkpoint_latest.pth",
    f"{dataset_dir}/checkpoint.resnet152.{datetime.datetime.now():%Y.%m.%d}.train-only.pth",
)

In [None]:
# Evaluate the trained model on the test split.
# NOTE(review): the model in this section is trained on the whole cured-data folder;
# if splits/test was copied from it, this accuracy is measured on images seen during
# training — confirm before comparing it with the earlier runs.
test(model_data, f"{dataset_dir}/splits/test", False)

In [None]:
# Top-4 predictions for each image in the my-test folder.
# NOTE(review): the executed cells earlier in this notebook read the same images
# from f"{dataset_dir}/splits/my-test" — confirm which location is current.
test_top_k(model_data, f"{dataset_dir}/my-test", 4)

In [None]:
# Top-10 accuracy on the my-test folder; print_preds=False is passed to skip the per-image lines.
# NOTE(review): the executed cells earlier in this notebook read the same images
# from f"{dataset_dir}/splits/my-test" — confirm which location is current.
test_top_k(model_data, f"{dataset_dir}/my-test", 10, print_preds=False)