In [1]:
from openml import tasks

from sklearn.model_selection import train_test_split

from classes import WeightClipper, EAGGA
from functions import run_eagga_cv

from tqdm import tqdm

In [2]:
oml_task_ids = [37, 43, 3903, 3904, 3913, 3918, 10093, 9946, 146819, 359955, 189922, 359962, 190392, 167120, 190137, 190410, 168350, 359975, 359972, 146820]
oml_tasks = tasks.get_tasks(oml_task_ids)
oml_tasks



[OpenML Classification Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
 Task ID..............: 37
 Task URL.............: https://www.openml.org/t/37
 Estimation Procedure.: crossvalidation
 Target Feature.......: class
 # of Classes.........: 2
 Cost Matrix..........: Available,
 OpenML Classification Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
 Task ID..............: 43
 Task URL.............: https://www.openml.org/t/43
 Estimation Procedure.: crossvalidation
 Target Feature.......: class
 # of Classes.........: 2
 Cost Matrix..........: Available,
 OpenML Classification Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
 Task ID..............: 3903
 Task URL.............: https://www.openml.org/t/3903
 Estimation Procedure.: crossvalidation
 Target Feature.......: c
 # of Classes.........: 2
 Cost Matrix..........: Available,
 OpenML Classification Task
 Ta

In [3]:
oml_datasets = [oml_task.get_dataset() for oml_task in oml_tasks]

class_column_in_X = list()
empty_y = list()
classes = list()
names = list()
for oml_dataset in tqdm(oml_datasets):
    class_column = oml_dataset.default_target_attribute
    X, y, _, _ = oml_dataset.get_data()

    class_column_in_X.append(class_column in X.columns)
    empty_y.append(y is None)

    names.append(oml_dataset.name)
    classes.append(X.loc[:, class_column].unique().tolist())
print(f'{all(class_column_in_X)}: {class_column_in_X}')  # X always already includes y
print(f'{all(empty_y)}: {empty_y}')  # y is always None

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:00<00:00, 23.76it/s]

True: [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
True: [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]





we see: X always includes the target and y is always None for our datasets

In [4]:
# define positive classes
positive_classes = ['tested_positive', '1', True, True, 'yes', True, '2', '2', '1', '2', '1', True, '1', '1', '2', '1', '2', 'Anomaly', '1', '2']
for name, cs, class_pos in zip(names, classes, positive_classes):
    print(f'{name}: {cs}, {class_pos}')

diabetes: ['tested_positive', 'tested_negative'], tested_positive
spambase: ['1', '0'], 1
pc3: [False, True], True
jm1: [False, True], True
kc2: ['no', 'yes'], yes
pc1: [False, True], True
banknote-authentication: ['1', '2'], 2
wdbc: ['2', '1'], 2
climate-model-simulation-crashes: ['0', '1'], 1
blood-transfusion-service-center: ['2', '1'], 2
gina: ['0', '1'], 1
kc1: [False, True], True
madeline: ['1', '0'], 1
numerai28.6: ['0', '1'], 1
ozone-level-8hr: ['1', '2'], 2
philippine: ['0', '1'], 1
phoneme: ['1', '2'], 2
Satellite: ['Anomaly', 'Normal'], Anomaly
sylvine: ['1', '0'], 1
wilt: ['2', '1'], 2


In [5]:
oml_task_diabetes = oml_tasks[0]
X, y, categorical_indicator, attribute_names = oml_task_diabetes.get_dataset().get_data()
Xy = X.copy()
Xy

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,tested_positive
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,tested_negative
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,tested_positive
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,tested_negative
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,tested_positive
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,tested_negative
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,tested_negative
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,tested_negative
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,tested_positive


In [6]:
Xy.loc[:, 'class'].nunique() == 1

False

In [7]:
tmp, _, _, _ = tasks.get_task(3904).get_dataset().get_data()
tmp.isnull().values.any()

np.True_

In [10]:
hps = {
    'total_layers': (3, 10),
    'nodes_per_hidden_layer': (3, 20),
    'mu': 3,  # TODO: 100
    'lambda': 2,  # TODO: 10
    'holdout_train_size': 2/3,
    'cv_k': 5
}

batch_size = 16
patience = 2

secs_per_fold = 1
secs_total = 30

for (oml_dataset, class_positive) in zip(oml_datasets[0], positive_classes[0]):
    print(f'Dataset {oml_dataset.name}')
    
    eagga = EAGGA(
        oml_dataset=oml_dataset,
        class_positive=class_positive,
        hps=hps,
        batch_size=batch_size,
        patience=patience,
        secs_per_fold=secs_per_fold,
        secs_total=secs_total
    )
    eagga.start_eagga()

Dataset phoneme
Starting init population
Finished init population
Start EAGGA at 2025-02-26T23:04:49.877366
Generation 1, evaluate 3 individuals
Running 5-fold CV for individual: 3 total layers, 5 nodes per hidden layer, gs: ([0, 1, 3, 4], [[[2], 0]])
Fold 1/5 | trained for 3 epochs | metrics: (0.5639311483372813, 0.7374893427734737, 0.2, 0.0, 0.2)
Fold 2/5 | trained for 3 epochs | metrics: (0.5667206126710643, 0.6642602958075398, 0.2, 0.0, 0.2)
Fold 3/5 | trained for 3 epochs | metrics: (0.539094105694029, 0.7389640499445991, 0.2, 0.0, 0.2)
Fold 4/5 | trained for 3 epochs | metrics: (0.5620309558179644, 0.7031676272590991, 0.2, 0.0, 0.2)
Fold 5/5 | trained for 3 epochs | metrics: (0.5500125004185571, 0.709773834020801, 0.2, 0.0, 0.2)
Running 5-fold CV for individual: 5 total layers, 3 nodes per hidden layer, gs: ([0, 1, 2, 4], [[[3], 1]])
Fold 1/5 | trained for 3 epochs | metrics: (0.6220289637213168, 0.7326240871853801, 0.2, 0.0, 0.0)
Fold 2/5 | trained for 3 epochs | metrics: (0.705

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished init population
Start EAGGA at 2025-02-26T23:07:01.297067
Generation 1, evaluate 3 individuals
Running 5-fold CV for individual: 3 total layers, 3 nodes per hidden layer, gs: ([1, 2, 3, 4], [[[0], 0]])
Fold 1/5 | trained for 4 epochs | metrics: (0.6902062166027907, 0.5, 0.2, 0.0, 0.2)
Fold 2/5 | trained for 4 epochs | metrics: (0.4136612051870765, 0.5, 0.2, 0.0, 0.2)
Stop early: 0.21206259981475095 < 0.2165117214123408, optimal epoch 2
Fold 3/5 | trained for 2 epochs | metrics: (0.21780012638830556, 0.4985948477751756, 0.2, 0.0, 0.2)
Stop early: 0.22632517688202136 < 0.22775796100948797, optimal epoch 2
Fold 4/5 | trained for 2 epochs | metrics: (0.2337985767460451, 0.49119437939110067, 0.2, 0.0, 0.2)
Fold 5/5 | trained for 4 epochs | metrics: (0.3806948567309031, 0.5, 0.2, 0.0, 0.2)
Running 5-fold CV for individual: 3 total layers, 5 nodes per hidden layer, gs: ([0, 1, 2], [[[3, 4], 0]])
Stop early: 0.20441228851224436 < 0.21380325790607568, optimal epoch 2
Fold 1/5 | trained