In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import log_loss
from hybparsimony import HYBparsimony
from autogluon.tabular import TabularDataset, TabularPredictor
from hybparsimony import util
import openml

#### Step 0: Download Dataset

In [11]:
# Get COIL2000 dataset
# The dataset is looked up on OpenML by its name.
dataset = openml.datasets.get_dataset('COIL2000')
# Name of the dataset's default target attribute; later cells use it as the label column.
label = dataset.default_target_attribute
# Load features and target as pandas objects; the last two return values are discarded.
X_orig, y_orig, _, _ = dataset.get_data(dataset_format="dataframe", target=label)
# Feature column names, kept for the feature-selection step.
input_names = X_orig.columns
print(X_orig.shape)



(9822, 85)


In [12]:
# Hold out half of the rows for the final test; the other half is used for
# training and validation. The split is shuffled with a fixed seed.
train_data, test_data, y_train, y_test = train_test_split(
    X_orig, y_orig, test_size=0.50, shuffle=True, random_state=0
)
# Renumber both feature frames from 0 (y_train / y_test are left as returned).
train_data, test_data = (part.reset_index(drop=True) for part in (train_data, test_data))

#### Step 1: Use AutoGluon with all the features

In [13]:
# Time budget, in seconds, passed as time_limit to every AutoGluon fit
time_autogluon = 150

# Attach the target as a column of the training frame. The values are assigned
# positionally (.values) because train_data has a freshly reset index.
train_data[label] = y_train.values

# Fit AutoGluon on all the features, optimising log-loss.
predictor = TabularPredictor(label=label, eval_metric='log_loss')
predictor = predictor.fit(train_data, time_limit=time_autogluon)

No path specified. Models will be saved in: "AutogluonModels/ag-20230921_111451/"
Beginning AutoGluon training ... Time limit = 150s
AutoGluon will save models to "AutogluonModels/ag-20230921_111451/"
AutoGluon Version:  0.8.2
Python Version:     3.10.9
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #33~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Sep  7 10:33:52 UTC 2
Disk Space Avail:   1167.83 GB / 1574.05 GB (74.2%)
Train Data Rows:    4911
Train Data Columns: 85
Label Column: CARAVAN
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLP

In [14]:
# Score the all-features model on the held-out test half
y_pred = predictor.predict_proba(test_data)
Log_loss_all = log_loss(y_pred=y_pred, y_true=y_test.values)

print(f'Log_loss with test using all the features={Log_loss_all}')
print('##################################################')

Log_loss with test using all the features=0.19725106895041525
##################################################


#### Step 2: Search the best features with HYB-PARSIMONY

In [15]:
def fitness_custom(cromosoma, **kwargs):
    """Score one candidate feature subset with AutoGluon for HYB-PARSIMONY.

    A 20% hold-out split is carved out of the data received in ``kwargs``, an
    AutoGluon ``TabularPredictor`` is fitted on the remaining 80% using only
    the selected features, and the negated hold-out log-loss is returned as the
    fitness (negated so that larger is better).

    Parameters
    ----------
    cromosoma : object
        Candidate solution. Only ``cromosoma.columns`` is read: it is used as a
        column selector on ``X`` and summed to obtain the complexity
        (presumably a boolean mask over the input features; confirm against
        the HYBparsimony documentation).
    **kwargs
        Must contain ``X`` (a pandas DataFrame of features) and ``y`` (the
        target values, aligned row by row with ``X``).

    Returns
    -------
    tuple
        ``(np.array([fitness_val, complexity]), predictor)`` where
        ``fitness_val`` is minus the hold-out log-loss, ``complexity`` is
        ``np.sum(cromosoma.columns)`` and ``predictor`` is the fitted
        ``TabularPredictor``.

    Notes
    -----
    Reads the notebook-level names ``label`` (target column name) and
    ``time_autogluon`` (time limit in seconds for each fit).
    NOTE(review): every call fits a new predictor without specifying a path,
    so AutoGluon presumably writes a new model directory per evaluation;
    check disk usage on long searches.
    """
    X_train = kwargs["X"]
    # Normalise the target to an ndarray so a Series or Categorical works too.
    y_train = np.asarray(kwargs["y"])

    # Keep only the features selected by this candidate.
    X_fs_selec = X_train.loc[:, cromosoma.columns]

    # Get 20% for validation (fixed seed: every candidate sees the same split).
    x_train_custom, x_test_custom, y_train_custom, y_test_custom = train_test_split(
        X_fs_selec,
        y_train,
        test_size=0.20,
        shuffle=True,
        random_state=0,
    )

    # Append the target as a new column instead of stacking with np.hstack:
    # hstack collapses every column to a single common dtype, so the predictor
    # would be trained on dtypes that differ from the validation frame and from
    # the frames used when AutoGluon is fitted outside this function.
    X_train_df = x_train_custom.reset_index(drop=True).assign(
        **{label: y_train_custom.astype(int)}
    )
    X_test_df = x_test_custom

    predictor = TabularPredictor(label=label, eval_metric='log_loss', verbosity=0).fit(
        X_train_df, time_limit=time_autogluon
    )
    y_pred = predictor.predict_proba(X_test_df)

    # HYB-PARSIMONY is given the negated loss as the score.
    fitness_val = -log_loss(y_true=y_test_custom, y_pred=y_pred)
    return np.array([fitness_val, np.sum(cromosoma.columns)]), predictor

In [16]:
# Configure the HYB-PARSIMONY search; each candidate feature subset is scored
# by fitness_custom.
HYBparsimony_model = HYBparsimony(
    fitness=fitness_custom,
    features=input_names,
    npart=15,
    maxiter=100,
    early_stop=20,
    rerank_error=0.001,
    gamma_crossover=0.50,
    seed_ini=0,
    n_jobs=1,
    verbose=1,
)
HYBparsimony_model.fit(train_data[input_names], train_data[label].values)

# The trailing len(input_names) entries of best_model_conf are taken as one
# value per feature; a feature is kept when its value is at least 0.50.
n_inputs = len(input_names)
best_model_probsfeats = HYBparsimony_model.best_model_conf[-n_inputs:]
selec_feats = np.array(input_names)[best_model_probsfeats >= 0.50]

print(f'Selected feats with HYB-PARSIMONY num={len(selec_feats)}:{selec_feats}')
print('######################################################')

Running iteration 0
Best model -> Score = -0.211364 Complexity = 76.0 
Iter = 0 -> MeanVal = -0.21634  ValBest = -0.211364   ComplexBest = 76.0 Time(min) = 7.386587

Running iteration 1
Best model -> Score = -0.211364 Complexity = 76.0 
Iter = 1 -> MeanVal = -0.218023  ValBest = -0.212938   ComplexBest = 73.0 Time(min) = 7.295187

Running iteration 2
Best model -> Score = -0.209308 Complexity = 52.0 
Iter = 2 -> MeanVal = -0.218252  ValBest = -0.209308   ComplexBest = 52.0 Time(min) = 7.921135

Running iteration 3
Best model -> Score = -0.209308 Complexity = 52.0 
Iter = 3 -> MeanVal = -0.217136  ValBest = -0.212988   ComplexBest = 41.0 Time(min) = 7.63907

Running iteration 4
Best model -> Score = -0.209308 Complexity = 52.0 
Iter = 4 -> MeanVal = -0.215803  ValBest = -0.211512   ComplexBest = 49.0 Time(min) = 8.471128

Running iteration 5
Best model -> Score = -0.209308 Complexity = 52.0 
Iter = 5 -> MeanVal = -0.217571  ValBest = -0.213319   ComplexBest = 48.0 Time(min) = 7.746143



 #### Step 3: Use AutoGluon with the Selected Features

In [17]:
# Retrain AutoGluon with the same time budget, using only the selected
# features plus the target column.
selected_columns = list(selec_feats) + [label]
predictor = TabularPredictor(label=label, eval_metric='log_loss')
predictor = predictor.fit(train_data[selected_columns], time_limit=time_autogluon)

No path specified. Models will be saved in: "AutogluonModels/ag-20230921_153153/"
Beginning AutoGluon training ... Time limit = 150s
AutoGluon will save models to "AutogluonModels/ag-20230921_153153/"
AutoGluon Version:  0.8.2
Python Version:     3.10.9
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #33~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Sep  7 10:33:52 UTC 2
Disk Space Avail:   1124.67 GB / 1574.05 GB (71.5%)
Train Data Rows:    4911
Train Data Columns: 44
Label Column: CARAVAN
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLP

In [18]:
# Score the selected-features model on the same held-out test half
y_pred = predictor.predict_proba(test_data[selec_feats])
Log_loss_selected = log_loss(y_pred=y_pred, y_true=y_test)

# Report the test loss, then the reduction in feature count and the change in
# log-loss relative to the all-features model.
print(f'Log_loss with test using the selected features={Log_loss_selected}')
print(f'Difference between all features={len(input_names)} and the selected feats={len(selec_feats)} diff={len(input_names)-len(selec_feats)}')
print(f'Difference between log_loss with all features={Log_loss_all} and with the selected feats={Log_loss_selected} diff={Log_loss_all-Log_loss_selected}')
print('#########################################################################')
    

Log_loss with test using the selected features=0.1975637808404383
Difference between all features=85 and the selected feats=44 diff=41
Difference between log_loss with all features=0.19725106895041525 and with the selected feats=0.1975637808404383 diff=-0.00031271189002304856
#########################################################################


However, HYB-PARSIMONY is such an intensive search method that, when working with small high-dimensional datasets (SHDD), it may find parsimonious solutions that are too specific to that set of instances. The selected features may be the most appropriate for that sample, yet not sufficient to create a model that will generalize correctly in the future. To reduce this over-fitting and to find a feature selection that can be used to create a robust model that generalizes correctly, we propose the following methodology (if time and resources are available):

1. Repeat the search for the best model with HYB-PARSIMONY and hold-out validation $n$ times, each time with a different random seed. In each repetition, extract the feature probability vector of the best individual (*best_model_probsfeats*).
2. Average the probabilities for each feature and select those that have a value greater than a given threshold, $thr_{fs}$.
3. Train AutoGluon with the selected features.
4. Repeat points 2 and 3 with different $thr_{fs}$.
5. Select the model that obtains the best validation error $J$, or the best error on a separate test dataset.

For more information, see:

Divason, J., Pernia-Espinoza, A., Romero, A., Martinez-de-Pison, F.J. (2023). [Hybrid Intelligent Parsimony Search in Small High-Dimensional Datasets.](https://link.springer.com/content/pdf/10.1007/978-3-031-40725-3_33.pdf?pdf=inline%20link) In: Garcia Bringas, P., et al. Hybrid Artificial Intelligent Systems. HAIS 2023. Lecture Notes in Computer Science, vol 14001. Springer, Cham. https://doi.org/10.1007/978-3-031-40725-3_33.



Bibtex ref:

@InProceedings{10.1007/978-3-031-40725-3_33,
author="Divas{\'o}n, Jose
and Pernia-Espinoza, Alpha
and Romero, Ana
and Martinez-de-Pison, Francisco Javier",
editor="Garc{\'i}a Bringas, Pablo
and P{\'e}rez Garc{\'i}a, Hilde
and Mart{\'i}nez de Pis{\'o}n, Francisco Javier
and Mart{\'i}nez {\'A}lvarez, Francisco
and Troncoso Lora, Alicia
and Herrero, {\'A}lvaro
and Calvo Rolle, Jos{\'e} Luis
and Quinti{\'a}n, H{\'e}ctor
and Corchado, Emilio",
title="Hybrid Intelligent Parsimony Search in Small High-Dimensional Datasets",
booktitle="Hybrid Artificial Intelligent Systems",
year="2023",
publisher="Springer Nature Switzerland",
address="Cham",
pages="384-396",
}