## Libraries

In [1]:
from copy import copy, deepcopy
from irace2 import irace, dummy_stats_test
import itertools
import numpy as np
from sampling_functions import norm_sample, truncated_poisson, truncated_skellam
import scipy.stats as stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split,StratifiedShuffleSplit,cross_val_score
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import scipy.stats as ss
from scipy.stats import norm, poisson, skellam
from tqdm import tqdm
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Datasets

In [2]:
# (csv file, label column) for each benchmark data set. The position in this
# list is the data-set id used to index Xs / ys later in the notebook.
dataset_specs = [
    ('spect_train.csv', 'OVERALL_DIAGNOSIS'),
    ('spambase.csv', 'spam'),
    ('ionosphere_data.csv', 'column_ai'),
]

Xs = []  # feature matrices, one per data set
ys = []  # label vectors, aligned with Xs

for csv_path, target_col in dataset_specs:
    df = pd.read_csv(csv_path)
    features = df.drop(columns=[target_col]).to_numpy()
    # NOTE(review): with its default arguments preprocessing.normalize rescales
    # each row (sample) to unit norm; it is not a per-feature standardisation.
    # Confirm that this is the intended preprocessing.
    Xs.append(preprocessing.normalize(features))
    ys.append(df[target_col].to_numpy())

In [3]:
# Sanity check: print the shape of every feature matrix first, then the shape
# of every label vector, in loading order.
for arr in itertools.chain(Xs, ys):
    print(arr.shape)

(80, 44)
(4601, 57)
(351, 34)
(80,)
(4601,)
(351,)


## Models setup

In [4]:
# Candidate estimators handed to irace. Every numeric hyper-parameter that is
# tuned through parameters_dict must already have an explicit value here.
models = [
    LogisticRegression(C=1),
    RandomForestClassifier(n_estimators=100, max_depth=5, ccp_alpha=0.0),
    SVC(C=1, coef0=0.0),
    XGBClassifier(n_estimators=100, max_depth=6, subsample=1),
]


# Samplers for the numeric hyper-parameters. Each one takes a single `loc`
# argument and forwards it to a helper from sampling_functions.
# NOTE(review): presumably irace calls these with the parameter's current
# value as `loc` -- confirm against irace2.

def _sample_unit_scale(loc):
    """norm_sample around `loc` with scale=1 and min=1e-2 (used for C and coef0)."""
    return norm_sample(loc=loc, scale=1, min=1e-2)


def _sample_ccp_alpha(loc):
    """norm_sample around `loc` with scale=0.1 and min=1e-3."""
    return norm_sample(loc=loc, scale=0.1, min=1e-3)


def _sample_subsample(loc):
    """norm_sample around `loc` with scale=0.3, min=1e-2 and max=1."""
    return norm_sample(loc=loc, scale=0.3, min=1e-2, max=1)


def _sample_n_estimators(loc):
    """truncated_skellam from `loc` with mu1=mu2=10 and min=1."""
    return truncated_skellam(loc, mu1=10, mu2=10, min=1)


def _sample_max_depth(loc):
    """truncated_skellam from `loc` with mu1=mu2=1 and min=1."""
    return truncated_skellam(loc, mu1=1, mu2=1, min=1)


# Search space, keyed by estimator class name. Lists enumerate categorical
# choices; callables are the numeric samplers defined above.
parameters_dict = {
    'LogisticRegression': {
        'C': _sample_unit_scale,
        'penalty': ['l2'],
        'solver': ['lbfgs', 'newton-cg', 'sag'],
    },
    'SVC': {
        'C': _sample_unit_scale,
        'coef0': _sample_unit_scale,
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'decision_function_shape': ['ovo', 'ovr'],
    },
    'RandomForestClassifier': {
        'n_estimators': _sample_n_estimators,
        'max_depth': _sample_max_depth,
        'max_features': ['sqrt', 'log2', None],
        'ccp_alpha': _sample_ccp_alpha,
    },
    'XGBClassifier': {
        'sample_type': ['uniform', 'weighted'],
        'max_depth': _sample_max_depth,
        'booster': ['gbtree', 'dart'],
        'subsample': _sample_subsample,
    },
}

## Hypothesis tests

In [5]:
# Statistical tests compared as one factor of the experiment; each is passed
# to irace unchanged.
stat_tests = [
    ss.ttest_rel,       # t-test for two related (paired) samples
    ss.ttest_ind,       # t-test for two independent samples
    ss.mannwhitneyu,    # Mann-Whitney U test, independent samples
    ss.wilcoxon,        # Wilcoxon signed-rank test, paired samples
    dummy_stats_test,   # helper imported from irace2 (behaviour not visible here)
]

## Irace parameters

In [6]:
# Levels of each experimental factor.

# One id per loaded data set; used as an index into Xs / ys.
data_set_id = list(range(3))

# NOTE(review): this list is not referenced anywhere else in the visible
# notebook; the train/test arm of the experiment reuses cv_splits instead.
train_test_resampling = [10, 30, 100]

# Split counts handed to irace (as cv=... or r=...).
cv_splits = [10, 30, 100]

# Population sizes handed to irace.
pop_size = [10, 25, 50]

# Thresholds for the stop predicate `x > threshold` handed to irace.
n_gen = [10, 50, 250]

In [7]:
# Full factorial design: every combination of
# (data set, statistical test, split count, population size, stop threshold).
# train_test_resampling is not one of the crossed factors.
factors = list(
    itertools.product(data_set_id, stat_tests, cv_splits, pop_size, n_gen)
)

In [8]:
n = 10      # number of repetitions of the whole factorial design
res = []    # one row per (repetition, factor combination, resampling mode)

# (label stored in the result row, irace keyword that receives the split count)
# NOTE(review): both modes use the same `split` value, which comes from
# cv_splits; train_test_resampling is never consulted.
resampling_modes = [('cv', 'cv'), ('train_test', 'r')]

for n_exp in tqdm(range(n)):
    for factor in tqdm(factors):

        # Field order matches the itertools.product call that built `factors`.
        data_id, stat_test, split, p_size, stop = factor
        X = Xs[data_id]
        y = ys[data_id]

        # A new, unseeded hold-out split per factor combination; both
        # resampling modes below are tuned and scored on this same split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

        for mode_label, split_kwarg in resampling_modes:
            # The stop predicate reads `stop` directly. It no longer depends
            # on the loop variable, which the original rebound to the
            # checkpoint file handle at the end of each iteration.
            best_model, best_scores, population, pop_scores = irace(
                models,
                X_train,
                y_train,
                lambda x: x > stop,
                stat_test,
                parameters_dict,
                p_size, 'f1', **{split_kwarg: split})

            # Refit the selected model on the full training split, then score
            # it on the held-out test split.
            best_model.fit(X_train, y_train)
            y_pred = best_model.predict(X_test)

            # Row layout: data set id, test function, split count, population
            # size, stop threshold, resampling mode, model class name, test F1.
            row = [data_id, stat_test, split, p_size, stop, mode_label,
                   type(best_model).__name__, f1_score(y_test, y_pred)]
            res.append(row)

        # Checkpoint all results so far after every factor combination, so an
        # interrupted run keeps what has finished. The handle gets its own
        # name so it cannot shadow any loop variable.
        with open('my_data.pkl', 'wb') as checkpoint:
            pickle.dump(res, checkpoint)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Read back the checkpoint written by the experiment loop.
# NOTE: pickle.load executes code embedded in the file; only use it on a
# my_data.pkl that this notebook produced.
with open('my_data.pkl', 'rb') as checkpoint:
    my_data = pickle.load(checkpoint)

print(my_data)