In [1]:
from openml import tasks

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif

import numpy as np

from nds import ndomsort

from classes import Dataset, GroupStructure, WeightClipper, Prob
from functions import run_eagga_cv, generate_offspring

In [2]:
# Fetch OpenML task 37; going by the variable name and the columns displayed further down,
# this is the diabetes classification task (target column 'class').
oml_task_diabetes = tasks.get_task(37)

In [3]:
# Load the task's dataset. No target is passed to get_data(), and the frame displayed in the
# next cell shows that X still contains the 'class' column.
# NOTE(review): y is presumably None in this call form — confirm against the openml docs;
# neither y nor attribute_names is used in the cells visible here.
X, y, categorical_indicator, attribute_names = oml_task_diabetes.get_dataset().get_data()

In [4]:
# Inspect the raw frame: eight numeric feature columns followed by the 'class' target column.
X

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,tested_positive
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,tested_negative
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,tested_positive
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,tested_negative
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,tested_positive
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,tested_negative
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,tested_negative
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,tested_negative
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,tested_positive


In [5]:
# Work on a copy so the raw frame X is left untouched; the name Xy reflects that the frame
# carries both the features and the 'class' target column.
Xy = X.copy()

In [6]:
# One boolean per column (nine entries in the output below); only the last one is True,
# which lines up with 'class' being the ninth column of the frame.
categorical_indicator

[False, False, False, False, False, False, False, False, True]

In [7]:
# Run the automatic feature detector on the full frame (features plus 'class').
feats_selected = GroupStructure.detector_features(Xy, categorical_indicator)

# NOTE(review): the detector's result is discarded right away — the assignment below replaces
# it with the positions of all eight non-class columns (0..7). If this is a temporary override
# for debugging, remove it (or the detector call) once the detector is trusted.
feats_selected = set(range(8))
print(feats_selected)

{0, 1, 2, 3, 4, 5, 6, 7}


In [8]:
# Build every pairwise interaction term for the selected feature columns:
# degree=2 with interaction_only=True yields the original columns plus all products of two
# distinct columns (no squares); include_bias=False drops the constant column.

# sorted() pins the column order explicitly; iterating a set directly comes with no ordering
# guarantee, and the order determines both the output columns and their names.
feats_selected_cols = Xy.iloc[:, sorted(feats_selected)]

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Fitting on a DataFrame lets scikit-learn record the column names itself, so
# feature_names_in_ does not need to be assigned by hand before the fit.
# PolynomialFeatures ignores y, so the target column is not passed.
interaction_terms = poly.fit_transform(feats_selected_cols)

print(poly.get_feature_names_out())
print(interaction_terms)

['preg' 'plas' 'pres' 'skin' 'insu' 'mass' 'pedi' 'age' 'preg plas'
 'preg pres' 'preg skin' 'preg insu' 'preg mass' 'preg pedi' 'preg age'
 'plas pres' 'plas skin' 'plas insu' 'plas mass' 'plas pedi' 'plas age'
 'pres skin' 'pres insu' 'pres mass' 'pres pedi' 'pres age' 'skin insu'
 'skin mass' 'skin pedi' 'skin age' 'insu mass' 'insu pedi' 'insu age'
 'mass pedi' 'mass age' 'pedi age']
[[6.00000e+00 1.48000e+02 7.20000e+01 ... 2.10672e+01 1.68000e+03
  3.13500e+01]
 [1.00000e+00 8.50000e+01 6.60000e+01 ... 9.33660e+00 8.24600e+02
  1.08810e+01]
 [8.00000e+00 1.83000e+02 6.40000e+01 ... 1.56576e+01 7.45600e+02
  2.15040e+01]
 ...
 [5.00000e+00 1.21000e+02 7.20000e+01 ... 6.41900e+00 7.86000e+02
  7.35000e+00]
 [1.00000e+00 1.26000e+02 6.00000e+01 ... 1.05049e+01 1.41470e+03
  1.64030e+01]
 [1.00000e+00 9.30000e+01 7.00000e+01 ... 9.57600e+00 6.99200e+02
  7.24500e+00]]


In [9]:
# Run the interaction detector on the selected features; the result displayed below is a
# list of feature-index lists (here one pair and six singletons).
# NOTE(review): the recorded run emitted repeated "TOTAL NO. OF ITERATIONS REACHED LIMIT"
# warnings that point at scikit-learn's logistic regression, so a solver inside the detector
# presumably did not converge — consider scaling the inputs or raising max_iter there.
population_interactions = GroupStructure.detector_interactions(Xy, feats_selected)
population_interactions

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[[3, 6], [0], [1], [2], [4], [5], [7]]

In [10]:
# Run the monotonicity detector on the interaction groups found in the previous step.
# In the output below every entry has the form [feature_indices, value] with value 0 or 1;
# presumably the value is the group's monotonicity flag — TODO confirm in classes.GroupStructure.
population = GroupStructure.detector_monotonicity(Xy, population_interactions)
population

[[[3, 6], 0], [[0], 0], [[1], 0], [[2], 1], [[4], 1], [[5], 0], [[7], 0]]

In [None]:
# Scratch cell: hand-built group structures and a toy population used to exercise
# Dataset, non-dominated sorting and offspring generation.

# NOTE(review): the meaning of the GroupStructure arguments is inferred from how they are used
# here, not from its definition: the first set lists all eight feature indices, the second set
# holds exactly the indices that appear in none of the groups, and each following
# [feature_indices, value] pair is one group. The trailing value is presumably the group's
# monotonicity attribute — confirm in classes.GroupStructure.
gs_1 = GroupStructure(
    {0, 1, 2, 3, 4, 5, 6, 7},
    {0, 1},
    [[2, 5], 1],
    [[4], 0],
    [[7, 3, 6], 1]
)

gs_2 = GroupStructure(
    {0, 1, 2, 3, 4, 5, 6, 7},
    {0},
    [[1, 2, 3, 4, 5], 1],
    [[6, 7], -1]
)

# Wrap the frame in a Dataset bound to gs_1: every column except 'class' is a feature,
# 'class' is the target and 'tested_positive' is the positive label.
dataset_gs_1 = Dataset(
    X=Xy.loc[:, Xy.columns != 'class'],
    y=Xy.loc[:, 'class'],
    class_pos='tested_positive',
    group_structure=gs_1
)

# Only the last expression of a cell is displayed, so these sanity checks are printed
# explicitly; as bare mid-cell expressions they produced no visible output.
print(len(dataset_gs_1))
print(dataset_gs_1[2])
print(gs_1.get_unconstrained_features())

# Toy population with hard-coded metrics. This rebinds `population`, which an earlier cell
# used for the monotonicity detector's output.
population = [
    {'total_layers': 3, 'nodes_per_hidden_layer': 14, 'group_structure': gs_1, 'metrics': {'mean': (0.75, 0.15, 0.5, 0.4)}},
    {'total_layers': 5, 'nodes_per_hidden_layer': 5, 'group_structure': gs_2, 'metrics': {'mean': (0.8, 0.15, 0.5, 0.4)}},
    {'total_layers': 7, 'nodes_per_hidden_layer': 3, 'group_structure': gs_2, 'metrics': {'mean': (0.6, 0.27, 0.4, 0.6)}}
]

# Non-dominated sorting over the metric tuples. The first metric is passed as 1 - value and
# the remaining ones unchanged; presumably the sort minimises every objective, so this turns
# a "higher is better" first metric into a cost — confirm against the nds documentation.
ranks_nds = ndomsort.non_domin_sort(
    [individual['metrics']['mean'] for individual in population],
    get_objectives=lambda elem: (1 - elem[0], *elem[1:]),
    only_front_indices=True
)
print(ranks_nds)

# Inclusive-looking (low, high) bounds per hyperparameter, handed to generate_offspring.
hp_bounds = {
    'total_layers': (3, 10),
    'nodes_per_hidden_layer': (3, 20)
}

# Generate two offspring from the ranked population. They get their own names so that
# gs_1 / gs_2 keep referring to the GroupStructure objects defined above instead of being
# rebound to offspring dicts.
# (Earlier alternative left by the author: GroupStructure.gga_crossover(gs_1, gs_2).)
offspring_1, offspring_2 = generate_offspring(2, population, ranks_nds, hp_bounds)
print(offspring_1, offspring_1['group_structure'])
print(offspring_2, offspring_2['group_structure'])

In [None]:
# Seed for the outer split so that Restart & Run All yields the same train/holdout partition.
SPLIT_SEED = 42

# outer: stratified split on 'class'; 2/3 of the rows go to the inner cross-validation,
# the remaining third is held out.
# NOTE(review): data_val is not used further in this cell.
data_train_test, data_val = train_test_split(
    Xy,
    train_size=2/3,
    shuffle=True,
    stratify=Xy.loc[:, 'class'],
    random_state=SPLIT_SEED
)

# Re-number the rows of both parts from 0 (author's note: StratifiedKFold assumes a
# consecutive index).
data_train_test = data_train_test.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)

# inner: 5-fold stratified cross-validation on the training part
cv_inner = StratifiedKFold(
    n_splits=5,
    shuffle=False  # TODO: set to True
)

# Planned procedure:
# - for each individual in the configuration, run k folds + average its performance
#     -> in each fold, additionally split the training data, train on larger split,
#        use smaller split to determine early stopping
#     -> average early stopping epoch over all folds, report back with average performance
# - find pareto front
# - evaluate pareto front's performance on holdout test set, each model of the front is
#   trained for the average of the epochs determined by early stopping in CV

# mu and la are handed to run_eagga_cv; presumably population size and number of offspring
# per generation — confirm in functions.run_eagga_cv. Small values for development runs.
mu = 3  # TODO: set to 100
la = 2  # TODO: set to 10

# enforce monotonicity by clipping weights to [0, infty) after each epoch (in def train)
monotonicity_clipper = WeightClipper(0, None)

run_eagga_cv(mu, la, cv_inner, data_train_test, categorical_indicator, epochs=10, batch_size=8, weight_clipper=monotonicity_clipper)