In [1]:
from openml import tasks

from classes import EAGGA

from tqdm import tqdm

import os

In [2]:
# OpenML task ids to fetch; the order of this list is relied upon by later cells.
oml_task_ids = [
    37, 43, 3903, 3904, 3913,
    3918, 10093, 9946, 146819, 359955,
    189922, 359962, 190392, 167120, 190137,
    190410, 168350, 359975, 359972, 146820,
]
# Fetch all task objects in one call and display them as the cell result.
oml_tasks = tasks.get_tasks(oml_task_ids)
oml_tasks



[OpenML Classification Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
 Task ID..............: 37
 Task URL.............: https://www.openml.org/t/37
 Estimation Procedure.: crossvalidation
 Target Feature.......: class
 # of Classes.........: 2
 Cost Matrix..........: Available,
 OpenML Classification Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
 Task ID..............: 43
 Task URL.............: https://www.openml.org/t/43
 Estimation Procedure.: crossvalidation
 Target Feature.......: class
 # of Classes.........: 2
 Cost Matrix..........: Available,
 OpenML Classification Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
 Task ID..............: 3903
 Task URL.............: https://www.openml.org/t/3903
 Estimation Procedure.: crossvalidation
 Target Feature.......: c
 # of Classes.........: 2
 Cost Matrix..........: Available,
 OpenML Classification Task
 Ta

In [3]:
# Resolve every task to its underlying dataset.
oml_datasets = [task.get_dataset() for task in oml_tasks]

# Per-dataset bookkeeping, filled in the same order as `oml_datasets`.
class_column_in_X, empty_y = [], []
classes, names = [], []

for oml_dataset in tqdm(oml_datasets):
    class_column = oml_dataset.default_target_attribute
    X, y, _, _ = oml_dataset.get_data()

    # Record whether the target is part of X and whether a separate y was returned.
    target_is_in_X = class_column in X.columns
    y_is_missing = y is None
    class_column_in_X.append(target_is_in_X)
    empty_y.append(y_is_missing)

    names.append(oml_dataset.name)
    # Distinct target values of this dataset, read from the target column of X.
    target_values = X.loc[:, class_column].unique()
    classes.append(target_values.tolist())

# Summary: overall flag first, then the per-dataset flags.
print(f'{all(class_column_in_X)}: {class_column_in_X}')  # X always already includes y
print(f'{all(empty_y)}: {empty_y}')  # y is always None

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:00<00:00, 23.37it/s]

True: [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
True: [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]





We see: for all of our datasets, `X` already includes the target column and `y` is always `None`.

In [4]:
# define positive classes
# One entry per dataset, matched by position with `names` / `classes`.
positive_classes = ['tested_positive', '1', True, True, 'yes', True, '2', '2', '1', '2', '1', True, '1', '1', '2', '1', '2', 'Anomaly', '1', '2']

# zip() silently stops at the shortest input, so a missing or surplus entry in the
# hand-written list above would otherwise go unnoticed.
assert len(positive_classes) == len(names) == len(classes), (
    f'expected {len(names)} positive classes, got {len(positive_classes)}'
)

for name, cs, class_pos in zip(names, classes, positive_classes):
    # Catch typos / misaligned entries: the positive label must be one of the
    # values that actually occur in the dataset's target column.
    assert class_pos in cs, f'{name}: positive class {class_pos!r} not in {cs}'
    print(f'{name}: {cs}, {class_pos}')

diabetes: ['tested_positive', 'tested_negative'], tested_positive
spambase: ['1', '0'], 1
pc3: [False, True], True
jm1: [False, True], True
kc2: ['no', 'yes'], yes
pc1: [False, True], True
banknote-authentication: ['1', '2'], 2
wdbc: ['2', '1'], 2
climate-model-simulation-crashes: ['0', '1'], 1
blood-transfusion-service-center: ['2', '1'], 2
gina: ['0', '1'], 1
kc1: [False, True], True
madeline: ['1', '0'], 1
numerai28.6: ['0', '1'], 1
ozone-level-8hr: ['1', '2'], 2
philippine: ['0', '1'], 1
phoneme: ['1', '2'], 2
Satellite: ['Anomaly', 'Normal'], Anomaly
sylvine: ['1', '0'], 1
wilt: ['2', '1'], 2


In [5]:
# The first task in `oml_tasks` is used as the diabetes example.
oml_task_diabetes = oml_tasks[0]
diabetes_dataset = oml_task_diabetes.get_dataset()
X, y, categorical_indicator, attribute_names = diabetes_dataset.get_data()

# Work on a copy so the frame returned by get_data() stays untouched.
Xy = X.copy()
Xy

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,tested_positive
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,tested_negative
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,tested_positive
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,tested_negative
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,tested_positive
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,tested_negative
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,tested_negative
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,tested_negative
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,tested_positive


In [6]:
# True when the 'class' column holds exactly one distinct (non-null) value.
Xy['class'].nunique() == 1

False

In [7]:
# Load the data of task 3904 and check whether it contains any missing value.
task_3904_dataset = tasks.get_task(3904).get_dataset()
tmp, _, _, _ = task_3904_dataset.get_data()
tmp.isna().to_numpy().any()

np.True_

In [8]:
# Hyperparameter settings handed to EAGGA as `hps`.
# NOTE(review): the two-element tuples look like (lower, upper) bounds - confirm in `classes.EAGGA`.
hps = {
    'total_layers': (3, 10),
    'nodes_per_hidden_layer': (3, 20),
    'mu': 3,  # TODO: 100
    'lambda': 2,  # TODO: 10
    'holdout_train_size': 2/3,
    'cv_k': 5
}

# Training settings.
batch_size, patience = 16, 2

# Time budgets (seconds, going by the names).
secs_per_fold, secs_total = 5, 60

# Keyword arguments that are identical for every dataset.
shared_eagga_kwargs = dict(
    hps=hps,
    batch_size=batch_size,
    patience=patience,
    secs_per_fold=secs_per_fold,
    secs_total=secs_total,
)

dataset_runs = zip(oml_datasets[:1], positive_classes[:1])  # TODO: remove [:1]
for oml_dataset, class_positive in dataset_runs:
    name = oml_dataset.name
    print(f'Dataset {name}')

    # Path passed to EAGGA, derived from the dataset name.
    file_path = os.path.join('export', name)

    eagga = EAGGA(
        oml_dataset=oml_dataset,
        class_positive=class_positive,
        file_path=file_path,
        **shared_eagga_kwargs
    )
    eagga.start_eagga()

Dataset diabetes
Starting init population
Finished init population
Start EAGGA at 2025-02-27T00:25:38.474836
Generation 1, evaluate 3 individuals
Running 5-fold CV for individual: 3 total layers, 5 nodes per hidden layer, gs: ([1, 3, 4, 5, 6, 7], [[[0, 2], 0]])
Stop early: 0.6164966945846875 < 0.616630290945371, epoch stop: 1
Fold 1/5 | trained for 1 epochs | metrics: {'loss': 0.6436928680964878, 'auc': np.float64(0.4838308457711443), 'nf': 0.25, 'ni': 0.03571428571428571, 'nnm': 0.25}
Stop early: 0.6068309843540192 < 0.6070238451162974, epoch stop: 3
Fold 2/5 | trained for 3 epochs | metrics: {'loss': 0.6518552218164716, 'auc': np.float64(0.4703565505804312), 'nf': 0.25, 'ni': 0.03571428571428571, 'nnm': 0.25}
Stop early: 0.6102500110864639 < 0.6104021767775217, epoch stop: 11
Fold 3/5 | trained for 11 epochs | metrics: {'loss': 0.6783761041504996, 'auc': np.float64(0.3956228956228956), 'nf': 0.25, 'ni': 0.03571428571428571, 'nnm': 0.25}
Stop early: 0.662921592593193 < 0.6654654244581

KeyboardInterrupt: 