In [20]:
def read_split(path:str,size:float):
    data = pd.read_csv(path)
    X = data.drop(columns=['y'])
    y = data.y
    return train_test_split(X,y, test_size=size, random_state=123)

In [21]:
import pandas as pd
import os
import copy
import glob
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import pickle as pick

from joblib import parallel_backend
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from  xgboost import XGBClassifier as xgb
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import TomekLinks 
from sklearn.metrics import balanced_accuracy_score 
import sklearn.metrics as met

import scipy.stats as stats

In [22]:
thetas = np.linspace(0,1,101)

In [23]:
paths = glob.glob('./../data/*csv')
paths

['./../data/data11.csv',
 './../data/data12.csv',
 './../data/data14.csv',
 './../data/data15.csv',
 './../data/data16.csv',
 './../data/data19.csv',
 './../data/data2.csv',
 './../data/data20.csv',
 './../data/data21.csv',
 './../data/data3.csv',
 './../data/data4.csv',
 './../data/data5.csv',
 './../data/data6.csv',
 './../data/data7.csv',
 './../data/data8.csv']

In [24]:
models = []
models.append(('Tree',DecisionTreeClassifier()))
models.append(('RF',RandomForestClassifier()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('XGB',xgb()))

In [25]:
parameters = []
parameters.append(('Tree',{'min_samples_leaf':[20,30,40,50],
                           'min_impurity_decrease':[0,0.01,0.03,0.05],
                           'ccp_alpha':[0,0.01,0.03,0.05],
                           'random_state':[123]}))


parameters.append(('RF',{'n_estimators':[500],
                         'min_samples_leaf':[20,30,40],
                         'min_impurity_decrease':[0,0.03,0.05],
                         'ccp_alpha':[0,0.03,0.05],
                         'oob_score':[False, True],
                         'random_state':[123],
                         'n_jobs':[-1]}))
                         

parameters.append(('KNN',{'n_neighbors':[3,5,7,11],
                          'algorithm':['ball_tree','kd_tree'],
                          'p':[1,2],
                          'leaf_size':[20,30,40],
                          'n_jobs':[-1]}))

parameters.append(('XGB',{'learning_rate': [0.3, 0.5],
                          'min_child_weight': [1,3,5],
                          'subsample': [0.6,0.8,1],
                          'verbosity': [0],
                          'colsample_bytree': [0.8,1],
                          'n_estimators': [100,300],
                          'random_state':[123],
                          'n_jobs':[-1]}))
parameters = dict(parameters)

In [26]:
dict_models = {'Tree':{'t':[],'nt':[], 'size':[]},
              'RF':{'t':[],'nt':[],'size':[]},
              'KNN':{'t':[],'nt':[],'size':[]},
              'XGB':{'t':[],'nt':[],'size':[]}
              }

In [27]:
dict_models1 = copy.deepcopy(dict_models);
dict_models2 = copy.deepcopy(dict_models);
dict_models3 = copy.deepcopy(dict_models);
dict_models4 = copy.deepcopy(dict_models);

In [9]:
for path in paths:
    X_train, X_test, y_train, y_test = read_split(path,0.2)
    ros = SMOTE(random_state=123)
    x_ros, y_ros = ros.fit_resample(X_train,y_train)

    results = []
    names = []
    print('***************************************************************************')
    for name, model in models:
        print(f'{name}**************')
        model_grid = HalvingGridSearchCV(model,
                                  parameters[name],
                                  cv = 5,
                                  verbose=True,
                                  scoring='balanced_accuracy')
        model_grid.fit(x_ros, y_ros)
        print(model_grid.best_params_)
        # print(model_grid.best_score_)
        model.parameters = model_grid.best_params_
        #cv_results = cross_val_score(model,x_ros, y_ros, cv=5,scoring='balanced_accuracy')
        #results.append(cv_results)
        #names.append(name)
        #msg= '%s: %f (%f)' %(name, cv_results.mean(), cv_results.std())
        #print(msg)
        model.fit(x_ros,y_ros)
        b_a_score = balanced_accuracy_score(y_test, model.predict(X_test))
        print('ros: ',b_a_score)
        dict_models1[name]['t'].append(b_a_score)
        dict_models1[name]['size'].append(len(y_test))
        model.fit(X_train,y_train)
        b_a_score = balanced_accuracy_score(y_test, model.predict(X_test))
        print('noRos:',b_a_score)
        dict_models1[name]['nt'].append(b_a_score)
        print('**************')


***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 23
max_resources_: 646
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 23
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 69
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 2
n_candidates: 8
n_resources: 207
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 621
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.05, 'min_samples_leaf': 20, 'random_state': 123}
ros:  0.9939759036144578
noRos: 0.9939759036144578
**************
RF**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 23
max_resources_: 646
aggressive_elimination: Fal





----------
iter: 1
n_candidates: 24
n_resources: 69
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 8
n_resources: 207
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 621
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.5, 'min_child_weight': 1, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 1, 'verbosity': 0}
ros:  0.9939759036144578
noRos: 0.9939759036144578
**************
***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 291
max_resources_: 7864
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 291
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 873
Fitting 5 folds for each of

----------
iter: 1
n_candidates: 18
n_resources: 1002
Fitting 5 folds for each of 18 candidates, totalling 90 fits
----------
iter: 2
n_candidates: 6
n_resources: 3006
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 2
n_resources: 9018
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.05, 'min_samples_leaf': 20, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': True, 'random_state': 123}
ros:  0.9894952474598493
noRos: 0.9908226810881677
**************
KNN**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 334
max_resources_: 9032
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 48
n_resources: 334
Fitting 5 folds for each of 48 candidates, totalling 240 fits
----------
iter: 1
n_candidates: 16
n_resources: 1002
Fitting 5 folds for each of 16 candidates, totalling 80 fits
----------
iter: 2
n_candidates: 6
n_resources: 300

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 6

In [11]:
path

'./../data/data17.csv'

In [30]:
dict_models1

{'Tree': {'t': [], 'nt': [], 'size': []},
 'RF': {'t': [], 'nt': [], 'size': []},
 'KNN': {'t': [], 'nt': [], 'size': []},
 'XGB': {'t': [], 'nt': [], 'size': []}}

In [18]:
pd.DataFrame.from_dict(dict_models1, orient='index')

Unnamed: 0,t,nt,size
Tree,"[0.7185906106736932, 0.9939759036144578, 0.940...","[0.7049004987678296, 0.9939759036144578, 0.914...","[9043, 89, 1095, 1799, 462, 1265, 366]"
RF,"[0.725436199726058, 0.9939759036144578, 0.9781...","[0.6862161929071239, 0.9939759036144578, 0.945...","[9043, 89, 1095, 1799, 462, 1265, 366]"
KNN,"[0.7115975900114699, 0.9759036144578312, 0.933...","[0.6132987339954665, 0.9939759036144578, 0.857...","[9043, 89, 1095, 1799, 462, 1265, 366]"
XGB,"[0.7518218376956267, 0.9939759036144578, 0.970...","[0.7219303009215986, 0.9939759036144578, 0.946...","[9043, 89, 1095, 1799, 462, 1265, 366]"


In [13]:
path

'./../data/data17.csv'

In [28]:
for path in paths:
    X_train, X_test, y_train, y_test = read_split(path,0.2)
    rus = TomekLinks()
    x_rus, y_rus = rus.fit_resample(X_train,y_train)

    results = []
    names = []
    print('***************************************************************************')
    for name, model in models:
        print(f'{name}**************')
        model_grid = HalvingGridSearchCV(model,
                                         parameters[name],
                                         cv = 5,
                                         verbose=True,
                                         scoring='balanced_accuracy')
        model_grid.fit(x_rus, y_rus)
        print(model_grid.best_params_)
        # print(model_grid.best_score_)
        model.parameters = model_grid.best_params_
#         cv_results = cross_val_score(model,x_rus, y_rus, cv=5,scoring='balanced_accuracy')
#         results.append(cv_results)
#         names.append(name)
#         msg= '%s: %f (%f)' %(name, cv_results.mean(), cv_results.std())
#         print(msg)
        model.fit(x_rus,y_rus)
        b_a_score = balanced_accuracy_score(y_test, model.predict(X_test))
        print(b_a_score)
        dict_models2[name]['t'].append(b_a_score)
        model.fit(X_train,y_train)
        b_a_score = balanced_accuracy_score(y_test, model.predict(X_test))
        print('noRus:',b_a_score)
        dict_models2[name]['nt'].append(b_a_score)
        print('******************')

***************************************************************************
Tree**************
n_iterations: 3
n_required_iterations: 4
n_possible_iterations: 3
min_resources_: 20
max_resources_: 353
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 20
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 60
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 2
n_candidates: 8
n_resources: 180
Fitting 5 folds for each of 8 candidates, totalling 40 fits
{'ccp_alpha': 0.01, 'min_impurity_decrease': 0, 'min_samples_leaf': 20, 'random_state': 123}
0.9939759036144578
noRus: 0.9939759036144578
******************
RF**************
n_iterations: 3
n_required_iterations: 4
n_possible_iterations: 3
min_resources_: 20
max_resources_: 353
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 54
n_resources: 20
Fitting 5 folds for each of 54 candidates, totall



72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/core.py", line 532, in inner_f
    return f(**kwargs)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/sklearn.py", line 1357, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]

 nan nan nan n

----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 0.8 0.8 0.8 0.7 0.7 0.7 0.8 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.8
 0.8 0.8 0.8 0.8 0.7 0.7]
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan 

----------
iter: 2
n_candidates: 8
n_resources: 180
Fitting 5 folds for each of 8 candidates, totalling 40 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.8        0.8        0.8        0.7        0.7        0.7
 0.8        0.7        0.7        0.7        0.7        0.7
 0.7        0.7        0.7        0.7        0.7        0.8
 0.8        0.8        0.8        0.8        0.7        0.7
 0.86286154 0.86286154 

{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'min_child_weight': 1, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.8, 'verbosity': 0}
0.9939759036144578
noRus: 0.9939759036144578
******************
***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 161
max_resources_: 4351
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 161
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 483
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 2
n_candidates: 8
n_resources: 1449
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 4347
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'ccp_alpha': 0.01, 'min_impurity_decrease': 0.01, 'min_samples_leaf': 40, 'r



----------
iter: 1
n_candidates: 24
n_resources: 483
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 8
n_resources: 1449
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 4347
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'min_child_weight': 1, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.6, 'verbosity': 0}
0.9544101078384033
noRus: 0.9461478620097645
******************
***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 68
max_resources_: 1845
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 68
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 204
Fitting 5 folds for each o



----------
iter: 1
n_candidates: 24
n_resources: 204
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 8
n_resources: 612
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 1836
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.5, 'min_child_weight': 1, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.8, 'verbosity': 0}
0.9864864864864865
noRus: 0.9864864864864865
******************
***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 187
max_resources_: 5055
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 187
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 561
Fitting 5 folds for each 



----------
iter: 1
n_candidates: 24
n_resources: 561
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 8
n_resources: 1683
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 5049
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'colsample_bytree': 1, 'learning_rate': 0.5, 'min_child_weight': 1, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.8, 'verbosity': 0}
0.9949688626679777
noRus: 0.9954113405440839
******************
***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 54
max_resources_: 1463
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 54
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 162
Fitting 5 folds for each of 



----------
iter: 2
n_candidates: 8
n_resources: 486
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 1458
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.05, 'min_samples_leaf': 20, 'random_state': 123}
1.0
noRus: 1.0
******************
RF**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 54
max_resources_: 1463
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 54
n_resources: 54
Fitting 5 folds for each of 54 candidates, totalling 270 fits
----------
iter: 1
n_candidates: 18
n_resources: 162
Fitting 5 folds for each of 18 candidates, totalling 90 fits
----------
iter: 2
n_candidates: 6
n_resources: 486
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 2
n_resources: 1458
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'ccp_alpha': 0, 'min_impurity_d



----------
iter: 1
n_candidates: 16
n_resources: 60
Fitting 5 folds for each of 16 candidates, totalling 80 fits
----------
iter: 2
n_candidates: 6
n_resources: 180
Fitting 5 folds for each of 6 candidates, totalling 30 fits
{'algorithm': 'ball_tree', 'leaf_size': 20, 'n_jobs': -1, 'n_neighbors': 11, 'p': 2}
0.4759036144578313
noRus: 0.4819277108433735
******************
XGB**************
n_iterations: 3
n_required_iterations: 4
n_possible_iterations: 3
min_resources_: 20
max_resources_: 357
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 72
n_resources: 20
Fitting 5 folds for each of 72 candidates, totalling 360 fits
----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits




----------
iter: 2
n_candidates: 8
n_resources: 180
Fitting 5 folds for each of 8 candidates, totalling 40 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.5, 'min_child_weight': 3, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 1, 'verbosity': 0}
0.6002190580503833
noRus: 0.5334063526834611
******************
***************************************************************************
Tree**************
n_iterations: 3
n_required_iterations: 4
n_possible_iterations: 3
min_resources_: 20
max_resources_: 224
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 20
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 60
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 2
n_candidates: 8
n_resources: 180
Fitting 5 folds for each of 8 candidates, totalling 40 fits
{'ccp_alpha': 0.01, 'min_impurity_decrease': 0, 'min_samples_leaf': 20, 'random_state





72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/core.py", line 532, in inner_f
    return f(**kwargs)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/sklearn.py", line 1357, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]

 nan nan nan n

----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits


  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan 0.6  0.65 0.6  0.6  0.6  0.6  0.65 0.6  0.6  0.6  0.6  0.6
 0.6  0.6  0.6  0.6  0.6  0.7  0.7  0.65 0.7  0.7  0.6  0.6 ]
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan    

----------
iter: 2
n_candidates: 8
n_resources: 180
Fitting 5 folds for each of 8 candidates, totalling 40 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.6        0.65       0.6        0.6        0.6        0.6
 0.65       0.6        0.6        0.6        0.6        0.6
 0.6        0.6        0.6        0.6        0.6        0.7
 0.7        0.65       0.7        0.7        0.6        0.6
 0.58333333 0.88030303 

{'colsample_bytree': 0.8, 'learning_rate': 0.5, 'min_child_weight': 1, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.8, 'verbosity': 0}
0.8653846153846154
noRus: 0.8653846153846154
******************
***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 29
max_resources_: 790
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 29
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 87
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 2
n_candidates: 8
n_resources: 261
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 783
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.05, 'min_samples_leaf': 20, 'random_sta

72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/core.py", line 532, in inner_f
    return f(**kwargs)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/sklearn.py", line 1357, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]

 nan nan nan n

----------
iter: 1
n_candidates: 24
n_resources: 87
Fitting 5 folds for each of 24 candidates, totalling 120 fits


     nan     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan     nan     nan     nan
 0.89375 0.89375 0.89375 0.7     0.6     0.6     0.89375 0.6     0.6
 0.6     0.6     0.79375 0.8     0.69375 0.79375 0.8     0.6     0.89375
 0.94375 0.89375 0.89375 0.94375 0.6     0.6    ]
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan       

----------
iter: 2
n_candidates: 8
n_resources: 261
Fitting 5 folds for each of 8 candidates, totalling 40 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.89375    0.89375    0.89375    0.7        0.6        0.6
 0.89375    0.6        0.6        0.6        0.6        0.79375
 0.8        0.69375    0.79375    0.8        0.6        0.89375
 0.94375    0.89375    0.89375    0.94375    0.6        0.6
 0.78333333 0.7

----------
iter: 3
n_candidates: 3
n_resources: 783
Fitting 5 folds for each of 3 candidates, totalling 15 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.89375    0.89375    0.89375    0.7        0.6        0.6
 0.89375    0.6        0.6        0.6        0.6        0.79375
 0.8        0.69375    0.79375    0.8        0.6        0.89375
 0.94375    0.89375    0.89375    0.94375    0.6        0.6
 0.78333333 0.7

{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'min_child_weight': 1, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.6, 'verbosity': 0}
0.9972972972972973
noRus: 0.9972972972972973
******************
***************************************************************************
Tree**************
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 43
max_resources_: 1183
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 43
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 129
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 2
n_candidates: 8
n_resources: 387
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 1161
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.03, 'min_samples_leaf': 50, 'random_





----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 8
n_resources: 180
Fitting 5 folds for each of 8 candidates, totalling 40 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.5, 'min_child_weight': 1, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 1, 'verbosity': 0}
0.8571428571428572
noRus: 0.8488095238095238
******************
***************************************************************************
Tree**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 161
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 20
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 60
Fitting 5 folds for each of 22 candidates, totalling 110 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.03, 'min_samples_leaf': 40, 'random_sta



72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/core.py", line 532, in inner_f
    return f(**kwargs)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/sklearn.py", line 1357, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]

 nan nan nan n

----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits


  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan 0.55 0.65 0.55 0.5  0.5  0.5  0.55 0.5  0.5  0.5  0.5  0.5
 0.5  0.5  0.5  0.5  0.5  0.75 0.65 0.55 0.75 0.75 0.5  0.5 ]
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan    

{'colsample_bytree': 0.8, 'learning_rate': 0.5, 'min_child_weight': 1, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 1, 'verbosity': 0}
0.7861111111111111
noRus: 0.7861111111111111
******************
***************************************************************************
Tree**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 160
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 20
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 60
Fitting 5 folds for each of 22 candidates, totalling 110 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.03, 'min_samples_leaf': 40, 'random_state': 123}
0.9019607843137255
noRus: 0.9019607843137255
******************
RF**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 160
aggressive_elimination: False




{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_jobs': -1, 'n_neighbors': 3, 'p': 1}
0.9166666666666667
noRus: 0.9166666666666667
******************
XGB**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 160
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 72
n_resources: 20
Fitting 5 folds for each of 72 candidates, totalling 360 fits




72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/core.py", line 532, in inner_f
    return f(**kwargs)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/sklearn.py", line 1357, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]

 nan nan nan n

----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.9        0.85       0.95       0.6        0.6        0.6
 0.89090909 0.6        0.6        0.6        0.6        0.6
 0.6        0.6        0.6        0.6        0.6        0.85
 0.85       0.89090909 0.85       0.85       0.6        0.6       ]
        nan   

{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'min_child_weight': 1, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.8, 'verbosity': 0}
0.8872549019607843
noRus: 0.8872549019607843
******************
***************************************************************************
Tree**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 176
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 20
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 60
Fitting 5 folds for each of 22 candidates, totalling 110 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.03, 'min_samples_leaf': 40, 'random_state': 123}
0.8439849624060151
noRus: 0.8439849624060151
******************
RF**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 176
aggressive_elimination: Fals

144 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
144 fits failed with the following error:
Traceback (most recent call last):
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/core.py", line 532, in inner_f
    return f(**kwargs)
  File "/home/syseng/sdariza/.conda/envs/dataMining/lib/python3.9/site-packages/xgboost/sklearn.py", line 1357, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]

 nan nan nan

----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9
 0.9 0.9 0.9 0.9 0.9 0.9]
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan 

{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'min_child_weight': 1, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 123, 'subsample': 0.6, 'verbosity': 0}
0.8439849624060151
noRus: 0.8439849624060151
******************
***************************************************************************
Tree**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 77
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 20
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 60
Fitting 5 folds for each of 22 candidates, totalling 110 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.03, 'min_samples_leaf': 40, 'random_state': 123}
0.5625
noRus: 0.5
******************
RF**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 77
aggressive_elimination: False
factor: 3
----------
iter: 



----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'min_child_weight': 1, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 123, 'subsample': 1, 'verbosity': 0}
0.59375
noRus: 0.59375
******************
***************************************************************************
Tree**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 162
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 64
n_resources: 20
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 1
n_candidates: 22
n_resources: 60
Fitting 5 folds for each of 22 candidates, totalling 110 fits
{'ccp_alpha': 0, 'min_impurity_decrease': 0.03, 'min_samples_leaf': 40, 'random_state': 123}
0.5709459459459459
noRus: 0.47297297297297297
******************
RF**************
n_iterations: 2
n_required_iterations: 4
n



----------
iter: 1
n_candidates: 16
n_resources: 60
Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'algorithm': 'ball_tree', 'leaf_size': 20, 'n_jobs': -1, 'n_neighbors': 7, 'p': 2}
0.4864864864864865
noRus: 0.4864864864864865
******************
XGB**************
n_iterations: 2
n_required_iterations: 4
n_possible_iterations: 2
min_resources_: 20
max_resources_: 162
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 72
n_resources: 20
Fitting 5 folds for each of 72 candidates, totalling 360 fits
----------
iter: 1
n_candidates: 24
n_resources: 60
Fitting 5 folds for each of 24 candidates, totalling 120 fits




{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'min_child_weight': 1, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 123, 'subsample': 1, 'verbosity': 0}
0.4864864864864865
noRus: 0.4864864864864865
******************


In [29]:
dict_models2

{'Tree': {'t': [0.9939759036144578,
   0.926873759321852,
   0.9797297297297297,
   0.9954113405440839,
   1.0,
   0.6522453450164294,
   0.5961538461538461,
   0.9480249480249481,
   0.8264604810996563,
   0.7773809523809523,
   0.8722222222222222,
   0.9019607843137255,
   0.8439849624060151,
   0.5625,
   0.5709459459459459],
  'nt': [0.9939759036144578,
   0.9304415472933096,
   0.9784410699359153,
   0.9958538184201902,
   1.0,
   0.6670317634173055,
   0.6057692307692308,
   0.9507276507276508,
   0.7448453608247423,
   0.6976190476190476,
   0.8722222222222222,
   0.9019607843137255,
   0.8439849624060151,
   0.5,
   0.47297297297297297],
  'size': []},
 'RF': {'t': [0.9939759036144578,
   0.9539004238424809,
   0.9932432432432432,
   0.9940839069157653,
   1.0,
   0.4939759036144578,
   0.7403846153846154,
   1.0,
   0.6649484536082474,
   0.7142857142857143,
   0.8861111111111111,
   0.9019607843137255,
   0.8439849624060151,
   0.5,
   0.5],
  'nt': [0.9939759036144578,
   0.

In [None]:
for path in paths:
    X_train, X_test, y_train, y_test = read_split(path,0.2)
    results = []
    names = []
    for name, model in models:
        print(f'{name}**************')
        model_grid = HalvingGridSearchCV(model,
                                         parameters[name],
                                         cv = 5,
                                         verbose=True,
                                         scoring='balanced_accuracy')
        model_grid.fit(X_train, y_train)
        probs = np.array(model_grid.predict_proba(X_train)[:,0])
        perfor=[]
        for theta in thetas:
            pred = 1*(probs>=theta)
            perfor.append(met.balanced_accuracy_score(y_train,pred))
        perfor=np.array(perfor,dtype=np.float32)
        list1 = np.array(perfor)
        k = 7
        pos = []
        for i in range(k):
            max_v = np.argmax(list1)
            x=list1[max_v]
            for j in range(1,4):
                list1[max_v-j]=-1*(list1[max_v-j]==x)+list1[max_v-j]*(list1[max_v-j]!=x)
                list1[max_v+j]=-1*(list1[max_v+j]==x)+list1[max_v+j]*(list1[max_v+j]!=x)
            pos.append(max_v)
            list1[max_v] = -1
        x=abs((thetas[pos]-0.5)*(thetas[pos]>=0.5))+thetas[pos]*(thetas[pos]<0.5)
        tOptim=pos[np.where(x==np.max(x))[0][0]]/100
        print(tOptim)
        print(model_grid.best_params_)
        model.parameters = model_grid.best_params_
#     cv_results = cross_val_score(model,X_train, y_train, cv=5,scoring='balanced_accuracy')
#     results.append(cv_results)
#     names.append(name)
#     msg= '%s: %f (%f)' %(name, cv_results.mean(), cv_results.std())
#     print(msg)
        model.fit(X_train,y_train)
        probs=np.array(model.predict_proba(X_test)[:,0])
        b_a_score = met.balanced_accuracy_score(y_test, model.predict(X_test))
        print('noTruncated:',b_a_score)
        dict_models3[name]['nt'].append(b_a_score)
        b_a_score = met.balanced_accuracy_score(y_test, 1*(probs>=tOptim))
        print('Truncated',b_a_score)
        dict_models3[name]['t'].append(b_a_score)
        print('******************')

In [None]:
dict_models3

In [None]:
# pd.DataFrame.from_dict(dict_models, orient='index')

In [None]:
for key,value in dict_models1.items():
    plt.plot(dict_models1[key]['t'],'o',label=key+'t')
    plt.plot(dict_models1[key]['nt'],'o',label=key+'nt')
    plt.legend()
    plt.show()

In [None]:
parameters = []
parameters.append(('Tree',{'class_weight':['balanced'],
                           'min_samples_leaf':[20,30,40,50],
                           'min_impurity_decrease':[0,0.01,0.03,0.05],
                           'ccp_alpha':[0,0.01,0.03,0.05],
                           'random_state':[123]}))

parameters.append(('RF',{'class_weight':['balanced'],
                         'n_estimators':[500],
                         'min_samples_leaf':[20,30,40],
                         'min_impurity_decrease':[0,0.03,0.05],
                         'ccp_alpha':[0,0.03,0.05],
                         'oob_score':[False, True],
                         'random_state':[123],
                         'n_jobs':[-1]}))

parameters.append(('XGB',{'sample_type':['weighted'],
                          'learning_rate': [0.3, 0.5, 0.7],
                          'min_child_weight': [1,3,5],
                          'subsample': [0.5,0.6,0.7,0.8],
                          'verbosity' : [0],
                          'colsample_bytree': [0.7,0.8,0.9,1],
                          'n_estimators': [100,300,500],
                          'random_state':[123],
                          'n_jobs':[-1]}))
parameters = dict(parameters)

In [None]:
models1 = []
models1.append(('Tree',DecisionTreeClassifier()))
models1.append(('RF',RandomForestClassifier()))
models1.append(('XGB',xgb()))

In [None]:
for path in paths:
    X_train, X_test, y_train, y_test = read_split(path,0.2)
    results = []
    names = []
    for name, model in models1:
        model_grid = HalvingGridSearchCV(model,
                                         parameters[name],
                                         cv = 5,
                                         verbose=True,
                                         scoring='balanced_accuracy')
        model_grid.fit(X_train, y_train)
        model.parameters = model_grid.best_params_
    #cv_results = cross_val_score(model,X_train, y_train, cv=5,scoring='balanced_accuracy')
    #results.append(cv_results)
    #names.append(name)
    #msg= '%s: %f (%f)' %(name, cv_results.mean(), cv_results.std())
    #print(msg)
        model.fit(X_train,y_train)
        b_a_score = balanced_accuracy_score(y_test, model.predict(X_test))
        print('weighted: ',b_a_score)
        dict_models4[name]['t'].append(b_a_score)
        if name == 'XGB':
            model.parameters.pop('sample_type','weighted')
        else:
            model.parameters.pop('class_weight','balanced')
        model.fit(X_train,y_train)
    #pick.dump(model,open(f'./weighted_{name}.sav','wb'))
        b_a_score = balanced_accuracy_score(y_test, model.predict(X_test))
        print('no_weighted: ',b_a_score)
        dict_models4[name]['nt'].append(b_a_score)
        print('******************')

In [None]:
dict_models4

In [None]:

x=[np.array(dict_models["Tree"]["t"], dtype=np.float32) * np.array(dict_models["Tree"]["size"],dtype=np.float32),np.array(dict_models["Tree"]["nt"], dtype=np.float32) * np.array(dict_models["Tree"]["size"],dtype=np.float32) ]
x=np.array(x)

In [None]:
stats.chi2_contingency(x)[1]

In [35]:
np.array(dict_models["Tree"]["t"], dtype=np.float32) * np.array(dict_models["Tree"]["size"],dtype=np.float32)

array([26.249998, 50.789997, 17.036001], dtype=float32)

In [None]:
if min(stats.shapiro(DatosMAPETrain).pvalue,stats.shapiro(DatosMAPETest).pvalue)<0.05:
    pval=min(stats.mannwhitneyu(DatosMAPETrain,DatosMAPETest).pvalue,stats.fligner(DatosMAPETrain,DatosMAPETest).pvalue)
else:
    pval=min(stats.ttest_ind(DatosMAPETrain,DatosMAPETest).pvalue,stats.levene(DatosMAPETrain,DatosMAPETest).pvalue) 
DatosMAPE=np.concatenate((DatosMAPETrain, DatosMAPETest), axis=None)
if pval>=0.05:
  print("El modelo es consistente y tiene un MAPE de: ",DatosMAPE.mean())
pval=min(pval,sm.stats.acorr_ljungbox(DatosMAPE)[1][0])
if pval>=0.05:
  print("El modelo es válido para predecir")