# MLBox Tutorial

## Created by: Sergei Issaev

In [32]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

In [33]:
paths = ['train.csv', 'test.csv'] #to modify
target_name = "target" #to modify

In [34]:
data = Reader(sep=",", header = 0, to_hdf5 = True, to_path = 'save', verbose = True).train_test_split(Lpath = paths, target_name = target_name)  #reading


reading csv : train.csv ...
cleaning data ...
CPU time: 0.6281528472900391 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 0.7041749954223633 seconds

> Number of common features : 12

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 10
> Number of numerical features: 2
> Number of training samples : 18314
> Number of test samples : 15021

> Top sparse features (% missing values on train set):
company_type        27.5
company_size        26.0
gender              22.4
major_discipline    15.5
education_level      2.5
dtype: float64

> Task : classification
0.0    15889
1.0     2425
Name: target, dtype: int64

encoding target ...

dumping files into directory : save


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['city', 'company_size', 'company_type', 'education_level', 'enrolled_university', 'experience', 'gender', 'last_new_job', 'major_discipline', 'relevent_experience']]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)


train dumped
test dumped
CPU time: 0.536135196685791 seconds


In [35]:
data['train'].isna().sum()

city                         0
city_development_index       0
company_size              4768
company_type              5028
education_level            457
enrolled_university        342
experience                  59
gender                    4098
last_new_job               367
major_discipline          2835
relevent_experience          0
training_hours               0
dtype: int64

In [36]:
data = Drift_thresholder().fit_transform(data)  #deleting non-stable variables


computing drifts ...
CPU time: 3.320826768875122 seconds

> Top 10 drifts

('major_discipline', 0.012730852551797511)
('company_type', 0.011518042146925866)
('city', 0.007947756553744023)
('education_level', 0.00790611607079672)
('experience', 0.007649773359261758)
('training_hours', 0.005617221463938948)
('gender', 0.005365756588933346)
('last_new_job', 0.005029686781754084)
('relevent_experience', 0.0027116096221917463)
('enrolled_university', 0.00268451371984324)

> Deleted variables : []
> Drift coefficients dumped into directory : save


In [37]:
Optimiser().evaluate(None, data)

  +str(self.to_path)+"/joblib'. Please clear it regularly.")


No parameters set. Default configuration is tested

##################################################### testing hyper-parameters... #####################################################

>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}

>>> CA ENCODER :{'strategy': 'label_encoding'}

>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.9, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}


MEAN SCORE : neg_log_loss = -0.40767551778873967
VARIANCE : 0.002736224296793155 (fold 1 = -0.4049392934919465, fold 2 = -0.4104117420855328)
CPU time: 13.537402391433716 seconds



-0.40767551778873967

In [38]:
opt = Optimiser()

  +str(self.to_path)+"/joblib'. Please clear it regularly.")


In [9]:
space = {

        'ne__numerical_strategy' : {"space" : [0, 'mean', 'median', 'most_frequent']},
        'ne__categorical_strategy' : {"space" : ['<NULL>', 'most_frequent']},

        'ce__strategy' : {"space" : ["label_encoding", "dummification", "random_projection", "entity_embedding"]},

        'fs__strategy' : {"space" : ["variance", "rf_feature_importance", 'l1']},
        'fs__threshold': {"search" : "choice", "space" : [0.1, 0.2, 0.3, 0.4, 0.5]},

        'est__strategy' : {"space" : ["LightGBM", "RandomForest", 'ExtraTrees', 'Tree', 'Bagging', 'AdaBoost', 'Linear']},
        'est__max_depth' : {"search" : "choice", "space" : [1, 2, 4, 8,6, 10, 12, 15, 20, 25, 35, 50]},
        'est__n_estimators' : {"search" : "choice", "space" : [20, 40, 80, 160, 320, 640, 1280, 2560]},
        'est__bootstrap' : {"space" : [True, False]},
        'est__max_features' : {"search" : "choice", "space" : [0.1, 0.3, 0.5, 0.9, 1.0]},
        'est__max_samples' : {"search" : "choice", "space" : [0.5, 0.75, 0.9, 1.0]},
        'est__bootstrap_features' : {"space" : [True, False]},
        'est__random_state' : {"search" : "choice", "space" : [69]},
        'est__verbose' : {"space" : [True]},
        'est__num_leaves' : {"search" : "choice", "space" : [10, 20, 40, 80, 160]},
        'est__min_data_in_leaf' : {"search" : "choice", "space" : [1, 4, 10, 50, 100, 1000]},
        'est__boosting_type' : { "space" : ['gbdt', 'rf', 'goss', 'dart']},
        'est__criterion' : { "space" : ['gini', 'entropy']},
        'est__max_bin' : { "search" : "choice", "space" : [1, 5, 10, 20, 50]},
        'est__num_iterations' : {"search" : "choice", "space" : [20, 50, 100, 150, 200, 400, 800]},
        'est__learning_rate' : {"search" : "choice", "space" : [0.001, 0.010, 0.025, 0.050, 0.075, 0.100, 0.150, 0.250]},
        'est__min_weight_fraction_leaf' : {"search" : "choice", "space" : [0.0, 0.05,0.1, 0.3, 0.5]},
        'est__class_weight' : { "search" : "choice", "space" : [None, 'balanced', 'balanced_subsample']},
        'est__min_samples_split' : {"search" : "choice", "space" : [2, 3, 4, 8, 12]},
        'est__subsample' : {"search" : "choice", "space" : [0.6,0.9]}

        }

best = opt.optimise(space, data, max_evals = 25)

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': 'most_frequent'}                                
                                                                                                                       

  + ". Parameter IGNORED. Check the list of "



>>> CA ENCODER :{'strategy': 'label_encoding'}
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.3}                                          
>>> ESTIMATOR :{'strategy': 'RandomForest', 'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 0.5, 'max_samples': 0.75, 'min_samples_split': 8, 'min_weight_fraction_leaf': 0.05, 'n_estimators': 2560, 'random_state': 69, 'verbose': True, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
  0%|                                                                           | 0/25 [00:00<?, ?trial/s, best loss=?]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:    2.9s

[Parallel(n_jobs=-1)]: Done 1500 tasks      | elapsed:   12.3s

[Parallel(n_jobs=-1)]: Done 2560 out of 2560 | elapsed:   20.4s finished

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s

[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s

[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.7s

[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    1.3s

[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    2.0s

[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:    2.8s

[Parallel(n_jobs=2)]: Done 2446 tasks      | elapsed:    3.7s

[Parallel(n_jobs=2)]: Done 2560 out of 2560 | elapsed:    3.9s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed: 

MEAN SCORE : neg_log_loss = -0.37912332450505876                                                                       
VARIANCE : 0.000433747368419668 (fold 1 = -0.3786895771366391, fold 2 = -0.3795570718734784)                           
CPU time: 75.20691347122192 seconds                                                                                    
  4%|█▉                                              | 1/25 [01:15<30:09, 75.39s/trial, best loss: 0.37912332450505876]

[Parallel(n_jobs=2)]: Done 2560 out of 2560 | elapsed:    4.9s finished



##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': 'most_frequent'}                                
                                                                                                                       

  + ". Parameter IGNORED. Check the list of "



>>> CA ENCODER :{'strategy': 'entity_embedding'}
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.3}                                                       
>>> ESTIMATOR :{'strategy': 'RandomForest', 'bootstrap': True, 'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': 12, 'max_features': 0.3, 'max_samples': 1.0, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.3, 'n_estimators': 1280, 'random_state': 69, 'verbose': True, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
  4%|█▉                                              | 1/25 [01:16<30:09, 75.39s/trial, best loss: 0.37912332450505876]

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

ValueError: `max_samples` must be in range (0, 1) but got value 1.0


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

ValueError: `max_samples` must be in range (0, 1) but got value 1.0



MEAN SCORE : neg_log_loss = nan                                                                                        
VARIANCE : nan (fold 1 = nan, fold 2 = nan)                                                                            
CPU time: 80.29076886177063 seconds                                                                                    
  8%|███▊                                            | 2/25 [02:35<29:29, 76.94s/trial, best loss: 0.37912332450505876]




##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': 'most_frequent'}                                
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                                                       
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.4}                                                             
>>> ESTIMATOR :{'strategy': 'RandomForest', 'bootstrap': False, 'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': 15, 'max_features': 0.1, 'max_samples': 1.0, 'min_samples_split': 12, 'min_weight_fraction_leaf': 0.1, 'n_estimators': 320, 'random_state': 69, 'verbose': True, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
  8%|███▊                    

  + ". Parameter IGNORED. Check the list of "

ValueError: `max_samples` must be in range (0, 1) but got value 1.0




MEAN SCORE : neg_log_loss = nan                                                                                        
VARIANCE : nan (fold 1 = nan, fold 2 = nan)                                                                            
CPU time: 9.661429405212402 seconds                                                                                    
 12%|█████▊                                          | 3/25 [02:45<20:49, 56.81s/trial, best loss: 0.37912332450505876]

ValueError: `max_samples` must be in range (0, 1) but got value 1.0




##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'most_frequent', 'categorical_strategy': '<NULL>'}                              
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                                                       
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.5}                                                             
>>> ESTIMATOR :{'strategy': 'Bagging', 'bootstrap': False, 'bootstrap_features': False, 'max_features': 0.9, 'max_samples': 0.75, 'n_estimators': 20, 'random_state': 69, 'verbose': True, 'base_estimator': None, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
 12%|█████▊                                          | 3/25 [02:46<20:49, 56.81s/trial, best loss: 0.37912332450505876]

  + ". Parameter IGNORED. Check the list of "

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.1s finished

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this

MEAN SCORE : neg_log_loss = -1.0187783120123524                                                                        
VARIANCE : 0.04934719042175545 (fold 1 = -0.969431121590597, fold 2 = -1.0681255024341079)                             
CPU time: 26.575666666030884 seconds                                                                                   
 16%|███████▋                                        | 4/25 [03:12<16:43, 47.78s/trial, best loss: 0.37912332450505876]

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished



##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': 'most_frequent'}                                     
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                                                       
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.1}                                                             
 16%|███████▋                                        | 4/25 [03:12<16:43, 47.78s/trial, best loss: 0.37912332450505876]

  + ". Parameter IGNORED. Check the list of "



>>> ESTIMATOR :{'strategy': 'Bagging', 'bootstrap': False, 'bootstrap_features': True, 'max_features': 0.3, 'max_samples': 0.9, 'n_estimators': 20, 'random_state': 69, 'verbose': True, 'base_estimator': None, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
 16%|███████▋                                        | 4/25 [03:12<16:43, 47.78s/trial, best loss: 0.37912332450505876]

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.5s finished

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.1s finished

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a w

MEAN SCORE : neg_log_loss = -0.43005523623217556                                                                       
VARIANCE : 0.006467643193516032 (fold 1 = -0.43652287942569157, fold 2 = -0.4235875930386595)                          
CPU time: 15.600842714309692 seconds                                                                                   
 20%|█████████▌                                      | 5/25 [03:28<12:43, 38.19s/trial, best loss: 0.37912332450505876]

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished



##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'most_frequent', 'categorical_strategy': 'most_frequent'}                       
>>> CA ENCODER :{'strategy': 'random_projection'}                                                                      
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.4}                                                       
                                                                                                                       

  + ". Parameter IGNORED. Check the list of "



>>> ESTIMATOR :{'strategy': 'RandomForest', 'bootstrap': True, 'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': 1, 'max_features': 0.5, 'max_samples': 1.0, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.05, 'n_estimators': 1280, 'random_state': 69, 'verbose': True, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
 20%|█████████▌                                      | 5/25 [03:28<12:43, 38.19s/trial, best loss: 0.37912332450505876]

ValueError: `max_samples` must be in range (0, 1) but got value 1.0




MEAN SCORE : neg_log_loss = nan                                                                                        
VARIANCE : nan (fold 1 = nan, fold 2 = nan)                                                                            
CPU time: 1.020254373550415 seconds                                                                                    
                                                                                                                       

ValueError: `max_samples` must be in range (0, 1) but got value 1.0


  + ". Parameter IGNORED. Check the list of "



##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': '<NULL>'}                                     
>>> CA ENCODER :{'strategy': 'random_projection'}                                                                      
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.4}                                          
>>> ESTIMATOR :{'strategy': 'Bagging', 'bootstrap': True, 'bootstrap_features': True, 'max_features': 0.5, 'max_samples': 0.75, 'n_estimators': 20, 'random_state': 69, 'verbose': True, 'base_estimator': None, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
 24%|███████████▌                                    | 6/25 [03:29<08:34, 27.09s/trial, best loss: 0.37912332450505876]

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.0s finished

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a w

                                                                                                                       


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s

[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.1s finished



MEAN SCORE : neg_log_loss = -0.47650565788700183
VARIANCE : 0.012618218132245035 (fold 1 = -0.48912387601924684, fold 2 = -0.46388743975475677)                         
CPU time: 11.01676893234253 seconds                                                                                    
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}                                       
>>> CA ENCODER :{'strategy': 'label_encoding'}                                                                         
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.2}                                          
>>> ESTIMATOR :{'strategy': 'Linear', 'class_weight': 'balanced', 'random_state': 69, 'verbose': True, 'C': 1.0, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'mult

  + ". Parameter IGNORED. Check the list of "



 28%|█████████████▍                                  | 7/25 [03:40<06:41, 22.30s/trial, best loss: 0.37912332450505876]

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.



MEAN SCORE : neg_log_loss = -0.665628124352591                                                                         
VARIANCE : 0.002275979662162586 (fold 1 = -0.6633521446904285, fold 2 = -0.6679041040147536)                           
                                                                                                                       

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished



CPU time: 9.961446523666382 seconds
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': 'most_frequent'}                              
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.1}                                          
>>> ESTIMATOR :{'strategy': 'AdaBoost', 'learning_rate': 0.001, 'n_estimators': 20, 'random_state': 69, 'algorithm': 'SAMME.R', 'base_estimator': None}
 32%|███████████████▎                                | 8/25 [03:50<05:16, 18.64s/trial, best loss: 0.37912332450505876]

  + ". Parameter IGNORED. Check the list of "

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS

MEAN SCORE : neg_log_loss = -inf                                                                                       
VARIANCE : nan (fold 1 = -inf, fold 2 = -inf)                                                                          
CPU time: 27.985506534576416 seconds                                                                                   
 32%|███████████████▎                                | 8/25 [04:18<05:16, 18.64s/trial, best loss: 0.37912332450505876]


  x = asanyarray(arr - arrmean)



##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': 'most_frequent'}                                     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                                                         
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.1}                                          
>>> ESTIMATOR :{'strategy': 'ExtraTrees', 'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 1, 'max_features': 0.9, 'max_samples': 0.5, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 40, 'random_state': 69, 'verbose': True, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
 36%|█████████████████▎                      

  + ". Parameter IGNORED. Check the list of "

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.2s finished

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:    0.0s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.2s finished

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:    0.0s finished



MEAN SCORE : neg_log_loss = -0.6741686277886157                                                                        
VARIANCE : 0.001322475261654521 (fold 1 = -0.6754911030502703, fold 2 = -0.6728461525269612)                           
CPU time: 6.567481517791748 seconds                                                                                    
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'most_frequent', 'categorical_strategy': '<NULL>'}                              
>>> CA ENCODER :{'strategy': 'label_encoding'}                                                                         
 40%|██████████████████▊                            | 10/25 [04:25<04:15, 17.05s/trial, best loss: 0.37912332450505876]

  + ". Parameter IGNORED. Check the list of "



>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.5}                                                             
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'goss', 'class_weight': 'balanced_subsample', 'learning_rate': 0.075, 'max_depth': 20, 'n_estimators': 2560, 'num_leaves': 40, 'random_state': 69, 'subsample': 0.9, 'colsample_bytree': 0.8, 'importance_type': 'split', 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_jobs': -1, 'objective': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
 40%|██████████████████▊                            | 10/25 [04:25<04:15, 17.05s/trial, best loss: 0.37912332450505876]

ValueError: The only valid preset for class_weight is "balanced". Given "balanced_subsample".




MEAN SCORE : neg_log_loss = nan                                                                                        
VARIANCE : nan (fold 1 = nan, fold 2 = nan)                                                                            
CPU time: 4.596156358718872 seconds                                                                                    
 44%|████████████████████▋                          | 11/25 [04:30<03:06, 13.35s/trial, best loss: 0.37912332450505876]

ValueError: The only valid preset for class_weight is "balanced". Given "balanced_subsample".




##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': 'most_frequent'}                                
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.5}                                                       
>>> ESTIMATOR :{'strategy': 'Linear', 'class_weight': 'balanced', 'random_state': 69, 'verbose': True, 'C': 1.0, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': -1, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001, 'warm_start': False}
 44%|████████████████████▋                          | 11/25 [04:30<03:06, 13.35s/trial, best loss: 0.37912332450505876]

  + ". Parameter IGNORED. Check the list of "

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.2s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.7s finished



                                                                                                                       

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  )[list(self.__Lnum)+sorted(missing_var+sub_var)])

Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  default_fill_value=self._default_fill_value,




MEAN SCORE : neg_log_loss = -inf
VARIANCE : nan (fold 1 = -inf, fold 2 = -inf)                                                                          
CPU time: 14.576349973678589 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
                                                                                                                       

  x = asanyarray(arr - arrmean)

  + ". Parameter IGNORED. Check the list of "



>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.1}                                          
>>> ESTIMATOR :{'strategy': 'Tree', 'class_weight': None, 'criterion': 'entropy', 'max_depth': 25, 'max_features': 0.9, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.5, 'random_state': 69, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'presort': False, 'splitter': 'best'}
 48%|██████████████████████▌                        | 12/25 [04:45<02:58, 13.76s/trial, best loss: 0.37912332450505876]

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])



Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  )[list(self.__Lnum)+sorted(missing_var+sub_var)])

Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  default_fil

MEAN SCORE : neg_log_loss = -inf                                                                                       
VARIANCE : nan (fold 1 = -inf, fold 2 = -inf)                                                                          
CPU time: 18.679676055908203 seconds                                                                                   
 52%|████████████████████████▍                      | 13/25 [05:03<03:03, 15.27s/trial, best loss: 0.37912332450505876]


  x = asanyarray(arr - arrmean)

  + ". Parameter IGNORED. Check the list of "



##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}                                       
>>> CA ENCODER :{'strategy': 'random_projection'}                                                                      
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.3}                                                             
>>> ESTIMATOR :{'strategy': 'Tree', 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 12, 'max_features': 0.3, 'min_samples_split': 8, 'min_weight_fraction_leaf': 0.3, 'random_state': 69, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'presort': False, 'splitter': 'best'}
 52%|████████████████████████▍                      | 13/25 [05:04<03:03, 15.27s/trial, best loss: 0.37912332450505876]





MEAN SCORE : neg_log_loss = -0.6889479642728042                                                                        
VARIANCE : 0.0011594297027251455 (fold 1 = -0.687788534570079, fold 2 = -0.6901073939755293)                           
CPU time: 6.1303322315216064 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}                                            
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
                                                                                                                       

  + ". Parameter IGNORED. Check the list of "



>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.5}
>>> ESTIMATOR :{'strategy': 'RandomForest', 'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 20, 'max_features': 0.3, 'max_samples': 0.75, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.1, 'n_estimators': 640, 'random_state': 69, 'verbose': True, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
 56%|██████████████████████████▎                    | 14/25 [05:10<02:18, 12.57s/trial, best loss: 0.37912332450505876]

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.7s

[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   14.6s finished

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s

[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.4s

[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s

[Parallel(n_jobs=2)]: Done 640 out of 640 | elapsed:    1.9s finished

If this happens often in your code, it can cause performance problems 
(results will

MEAN SCORE : neg_log_loss = -inf                                                                                       
                                                                                                                       


  x = asanyarray(arr - arrmean)



VARIANCE : nan (fold 1 = -inf, fold 2 = -inf)
CPU time: 58.453548431396484 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': 'most_frequent'}                                
>>> CA ENCODER :{'strategy': 'label_encoding'}                                                                         
                                                                                                                       

  + ". Parameter IGNORED. Check the list of "



>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.2}
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'gbdt', 'class_weight': None, 'learning_rate': 0.01, 'max_depth': 20, 'n_estimators': 320, 'num_leaves': 80, 'random_state': 69, 'subsample': 0.9, 'colsample_bytree': 0.8, 'importance_type': 'split', 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_jobs': -1, 'objective': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
MEAN SCORE : neg_log_loss = -0.38560264789739207                                                                       
VARIANCE : 0.0011288806127289497 (fold 1 = -0.3844737672846631, fold 2 = -0.386731528510121)                           
CPU time: 10.131822109222412 seconds                                                                                   
##################################################### testing hyper-parameters...

  + ". Parameter IGNORED. Check the list of "



MEAN SCORE : neg_log_loss = -0.38651634764125187                                                                       
VARIANCE : 0.0011060482176758757 (fold 1 = -0.38541029942357596, fold 2 = -0.3876223958589277)                         
CPU time: 3.7109344005584717 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'most_frequent', 'categorical_strategy': 'most_frequent'}                       
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.5}                                                             
>>> ESTIMATOR :{'strategy': 'Linear', 'class_weight': None, 'random_state': 69, 'verbose': True, 'C': 1.0, 'dual': False, 'fit_intercept': True,

  + ". Parameter IGNORED. Check the list of "

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.4s finished

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent worke

MEAN SCORE : neg_log_loss = -inf                                                                                       
VARIANCE : nan (fold 1 = -inf, fold 2 = -inf)                                                                          
CPU time: 46.81703567504883 seconds                                                                                    
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': '<NULL>'}                                     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                                                         
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.1}                                                             
>>> ESTIMATOR :{'strategy': 'RandomForest', 'bootstrap': True, 'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 25, 'm


  x = asanyarray(arr - arrmean)

  + ". Parameter IGNORED. Check the list of "

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.4s finished

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:    0.0s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.4s finished

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.

[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:    0.0s finished


MEAN SCORE : neg_log_loss = -0.6494637667346028                                                                        
VARIANCE : 0.0021115471105425665 (fold 1 = -0.6515753138451453, fold 2 = -0.6473522196240602)                          
CPU time: 6.162477731704712 seconds                                                                                    
 76%|███████████████████████████████████▋           | 19/25 [07:16<01:58, 19.72s/trial, best loss: 0.37912332450505876]




##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'most_frequent', 'categorical_strategy': 'most_frequent'}                       
>>> CA ENCODER :{'strategy': 'random_projection'}                                                                      
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2}                                                             
>>> ESTIMATOR :{'strategy': 'Linear', 'class_weight': None, 'random_state': 69, 'verbose': True, 'C': 1.0, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': -1, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001, 'warm_start': False}
 76%|███████████████████████████████████▋           | 19/25 [07:16<01:58, 19.72s/trial, best loss: 0.37912332450505876]

  + ". Parameter IGNORED. Check the list of "

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.2s finished

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent worke

MEAN SCORE : neg_log_loss = -0.38467076925177623                                                                       
VARIANCE : 0.001109041839868985 (fold 1 = -0.38356172741190725, fold 2 = -0.3857798110916452)                          
CPU time: 12.74704122543335 seconds                                                                                    
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': 'most_frequent'}                                     
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.3}                                                       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'goss', 'class_weight': 'balanced', 'learning_rate': 0.15, 'max_depth': 1, 'n_estimator

  + ". Parameter IGNORED. Check the list of "

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  )[list(self.__Lnum)+sorted(missing_var+sub_var)])

Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  default_fill_value=self._default_fill_value,



MEAN SCORE : neg_log_loss = -0.668421612987514                                                                         
VARIANCE : 0.0020516132642978135 (fold 1 = -0.6704732262518118, fold 2 = -0.6663699997232162)                          
CPU time: 7.0787365436553955 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': 'most_frequent'}                                     
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.5}                                          
>>> ESTIMATOR :{'strategy': 'Linear', 'class_weight': 'balanced', 'random_state': 69, 'verbose': True, 'C': 1.0, 'dual': False, 'fit_intercept':

  + ". Parameter IGNORED. Check the list of "

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.7s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.5s finished

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  )[list(self.__Lnum)+sorted(missing_var+sub_var)])

Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  default_fill_value=self._default_f

MEAN SCORE : neg_log_loss = -inf                                                                                       
VARIANCE : nan (fold 1 = -inf, fold 2 = -inf)                                                                          
CPU time: 24.064779043197632 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': '<NULL>'}                                     
                                                                                                                       


  x = asanyarray(arr - arrmean)

  + ". Parameter IGNORED. Check the list of "



>>> CA ENCODER :{'strategy': 'dummification'}
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.5}                                                             
>>> ESTIMATOR :{'strategy': 'RandomForest', 'bootstrap': True, 'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 25, 'max_features': 0.9, 'max_samples': 1.0, 'min_samples_split': 4, 'min_weight_fraction_leaf': 0.3, 'n_estimators': 1280, 'random_state': 69, 'verbose': True, 'ccp_alpha': 0.0, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'n_jobs': -1, 'oob_score': False, 'warm_start': False}
 88%|█████████████████████████████████████████▎     | 22/25 [08:00<00:52, 17.41s/trial, best loss: 0.37912332450505876]

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

ValueError: `max_samples` must be in range (0, 1) but got value 1.0




MEAN SCORE : neg_log_loss = nan                                                                                        
VARIANCE : nan (fold 1 = nan, fold 2 = nan)                                                                            
                                                                                                                       

ValueError: `max_samples` must be in range (0, 1) but got value 1.0




CPU time: 27.17669367790222 seconds
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}                                       
 92%|███████████████████████████████████████████▏   | 23/25 [08:27<00:40, 20.40s/trial, best loss: 0.37912332450505876]

  + ". Parameter IGNORED. Check the list of "



>>> CA ENCODER :{'strategy': 'random_projection'}                                                                      
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.2}                                                       
>>> ESTIMATOR :{'strategy': 'Linear', 'class_weight': 'balanced', 'random_state': 69, 'verbose': True, 'C': 1.0, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': -1, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001, 'warm_start': False}
 92%|███████████████████████████████████████████▏   | 23/25 [08:28<00:40, 20.40s/trial, best loss: 0.37912332450505876]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.2s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.2s finished



MEAN SCORE : neg_log_loss = -0.6699184984412908                                                                        
VARIANCE : 0.00215737542467187 (fold 1 = -0.6720758738659627, fold 2 = -0.6677611230166189)                            
CPU time: 4.318085193634033 seconds                                                                                    
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': 'most_frequent'}                                
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.4}                                                             
                                                                                                                       

  + ". Parameter IGNORED. Check the list of "



>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'dart', 'class_weight': 'balanced', 'learning_rate': 0.25, 'max_depth': 10, 'n_estimators': 160, 'num_leaves': 80, 'random_state': 69, 'subsample': 0.6, 'colsample_bytree': 0.8, 'importance_type': 'split', 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_jobs': -1, 'objective': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
MEAN SCORE : neg_log_loss = -inf                                                                                       
VARIANCE : nan (fold 1 = -inf, fold 2 = -inf)                                                                          
CPU time: 9.456376314163208 seconds                                                                                    
100%|███████████████████████████████████████████████| 25/25 [08:41<00:00, 20.87s/trial, best loss: 0.37912332450505876]


~~~~~~~~~~~~~~~~~~


  x = asanyarray(arr - arrmean)



In [39]:
space = {

        'ne__numerical_strategy' : {"space" : [0, 'mean', 'median', 'most_frequent']},
        'ne__categorical_strategy' : {"space" : ['<NULL>', 'most_frequent']},

        'ce__strategy' : {"space" : ["label_encoding", "dummification", "random_projection", "entity_embedding"]},

        'fs__strategy' : {"space" : ["variance", "rf_feature_importance", 'l1']},
        'fs__threshold': {"search" : "choice", "space" : [0.1, 0.2, 0.3, 0.4, 0.5]},

        'est__strategy' : {"space" : ["LightGBM"]},

        'est__boosting_type' : { "space" : ['gbdt', 'goss', 'dart']},
        'est__class_weight' : { "search" : "choice", "space" : [None, 'balanced']},
        'est__importance_type' : { "space" : ['split', 'gain']},
        'est__min_split_gain' : { "search" : "choice", "space" : [0, 0.001, 0.01, 0.1, 0.2]},
        'est__max_depth' : { "search" : "choice", "space" : [1, 2, 3, 4, 6, 12, 24, 48, -1]},
        'est__n_estimators' : { "search" : "choice", "space" : [20, 50, 100, 200, 400, 800, 1600]},
        'est__num_leaves' : { "search" : "choice", "space" : [2, 25, 31, 36, 75, 100]},
        'est__min_child_weight' : { "search" : "choice", "space" : [0.0, 0.0001, 0.001, 0.1]},
        'est__random_state' : { "space" : [69]},
        'est__reg_alpha' : { "search" : "choice", "space" : [0.0, 0.0001, 0.001, 0.1, 0.25]},
        'est__reg_lambda' : { "search" : "choice", "space" : [0.0, 0.0001, 0.001, 0.1, 0.25]},
        'est__min_child_samples' : {"search" : "choice", "space" : [5, 10, 15, 20, 25, 40, 60]},
        'est__subsample' : {"search" : "choice", "space" : [0.5, 0.8, 0.85, 0.9, 0.95, 1.0]},
        'est__subsample_for_bin' : {"search" : "choice", "space" : [15000, 19000, 20000, 21000, 25000]},
        'est__learning_rate' : {"search" : "choice", "space" : [0.0001, 0.001, 0.010, 0.025, 0.050, 0.075, 0.100, 0.150, 0.250]},
        'est__colsample_bytree' : { "search" : "choice", "space" : [0.75, 0.8, 0.85, 0.95, 1.0]}

        }

best = opt.optimise(space, data, max_evals = 5)

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'most_frequent', 'categorical_strategy': 'most_frequent'}                       
>>> CA ENCODER :{'strategy': 'dummification'}                                                                          
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.1}                                                       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.75, 'importance_type': 'split', 'learning_rate': 0.025, 'max_depth': 3, 'min_child_samples': 60, 'min_child_weight': 0.0, 'min_split_gain': 0.2, 'n_estimators': 50, 'num_leaves': 36, 'random_state': 69, 'reg_alpha': 0.0, 'reg_lambda': 0.1, 'subsample': 1.0, 'subsample_for_bin': 20000, 'n_jobs': -1, 'objective': None, 'silent': True, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
  0%|  

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  )[list(self.__Lnum)+sorted(missing_var+sub_var)])

Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  default_fill_value=self._default_fill_value,



MEAN SCORE : neg_log_loss = -0.6631320555275615                                                                        
VARIANCE : 0.0021947899936144988 (fold 1 = -0.665326845521176, fold 2 = -0.660937265533947)                            
CPU time: 16.461137533187866 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}                                            
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                                                       
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.4}                                          
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'dart', 'class_weight': None, 'colsample_bytree': 0.95, 'importance_type': 'gain', 'lea

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide 

MEAN SCORE : neg_log_loss = -0.3842501620912133                                                                        
VARIANCE : 0.0007145822771107424 (fold 1 = -0.3835355798141026, fold 2 = -0.38496474436832406)                         
CPU time: 241.23604106903076 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}                                       
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                                                       
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.2}                                          
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.95, 'importance_type': 'split

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])



MEAN SCORE : neg_log_loss = -0.6547377929580593                                                                        
VARIANCE : 0.0035356906965919865 (fold 1 = -0.6582734836546513, fold 2 = -0.6512021022614674)                          
CPU time: 18.365607500076294 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'most_frequent', 'categorical_strategy': 'most_frequent'}                       
>>> CA ENCODER :{'strategy': 'random_projection'}                                                                      
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.5}                                                       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'dart', 'class_weight': 'balanced', 'colsample_bytree': 0.85, 'importance_type': 'split

In [40]:
Predictor().fit_predict(best, data)


fitting the pipeline ...


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


CPU time: 18.728700876235962 seconds


  + " is probably a leak ! "



predicting ...
CPU time: 0.6271548271179199 seconds

> Overview on predictions : 

        0.0       1.0  target_predicted
0  0.712310  0.287690                 0
1  0.877586  0.122414                 0
2  0.851214  0.148786                 0
3  0.880878  0.119122                 0
4  0.893192  0.106808                 0
5  0.893192  0.106808                 0
6  0.783511  0.216489                 0
7  0.879133  0.120867                 0
8  0.893192  0.106808                 0
9  0.876287  0.123713                 0

dumping predictions into directory : save ...


<mlbox.prediction.predictor.Predictor at 0x1d87673cc88>

In [41]:
df = pd.read_csv('save/target_predictions.csv')

In [42]:
sub = pd.read_csv('sample_submission_sxfcbdx.csv')

In [43]:
sub.head()

Unnamed: 0,enrollee_id,target
0,16548,0
1,12036,0
2,11061,0
3,5032,0
4,17599,0


In [44]:
indices = sub.index.values

In [45]:
for i in range(len(indices)):
    sub.loc[i, 'target'] = df.loc[i, '1.0']

In [46]:
sos = sub.to_csv('1.0.csv', index = False)