In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
import matplotlib.pyplot as plt
from model_dictionaries import model_dictionary
from preprocessing_functions import basic_processing, data_preprocessing, split_dataset
from preprocessing_dictionaries import category_strategy_dict, numerical_strategy_dict
from sklearn.utils.validation import check_is_fitted
from hpbandster_sklearn import HpBandSterSearchCV
import hpbandster.core.nameserver as hpns
import hpbandster.core.result as hpres

In [2]:
churn_data = arff.loadarff("../data/chrun.arff")
churn_df = pd.DataFrame(churn_data[0])

In [3]:
churn_df["class"] = churn_df["class"].astype(int)
churn_df["number_customer_service_calls"] = churn_df["number_customer_service_calls"].astype(int)

# Preprocess

In [4]:
X_complete = churn_df.drop(columns="class").copy()
y_complete = np.reshape(churn_df[["class"]].values, X_complete.shape[0])

In [5]:
separated_datasets = split_dataset(X_complete, y_complete, 0.1, 42)

In [6]:
separated_datasets["X_validation_clean"] = basic_processing(separated_datasets["X_validation"])

In [7]:
encoded_data, encoder_objs = data_preprocessing(separated_datasets["X_validation_clean"], ["OHE", "SUM"], 
                                                ("SSE", "MMS"), category_strategy_dict, numerical_strategy_dict)

# Experiments

In [8]:
rf = model_dictionary["RF"]["model"]
param_search = model_dictionary["RF"]["param_config"]

In [9]:
search = HpBandSterSearchCV(rf, 
                            param_search,
                            random_state=0, 
                            n_jobs=50, 
                            n_iter=1, 
                            verbose=0,
                            resource_name = "n_samples",
                            cv=4,
                            eta=4,
                            scoring='neg_log_loss',
                            refit=True, 
                            optimizer = 'hyperband').fit(encoded_data["SUM-SSE"], 
                                                         separated_datasets["y_validation"])

OSError: [Errno 98] Address already in use

In [13]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,run,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_samples,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0,5079,0.651755,0.101773,0.073869,0.019340,True,gini,4,0.0880315,67,"{'bootstrap': True, 'criterion': 'gini', 'max_...",-0.748251,-0.738407,-0.745757,-0.740913,-0.743332,0.003879,71
1,1,0,5079,0.424412,0.084448,0.052440,0.006635,False,gini,8,0.227145,9,"{'bootstrap': False, 'criterion': 'gini', 'max...",-0.713896,-0.727734,-0.701889,-0.710404,-0.713481,0.009316,42
2,2,0,5079,1.144231,0.054633,0.101985,0.007696,True,gini,7,0.276704,75,"{'bootstrap': True, 'criterion': 'gini', 'max_...",-0.720029,-0.718849,-0.718606,-0.716451,-0.718484,0.001291,50
3,3,0,5079,2.415834,0.347342,0.101515,0.025632,False,gini,5,0.16746,81,"{'bootstrap': False, 'criterion': 'gini', 'max...",-0.725622,-0.728532,-0.729137,-0.731311,-0.728650,0.002031,62
4,4,0,5079,3.884070,0.306212,0.163471,0.036074,False,gini,10,0.617342,96,"{'bootstrap': False, 'criterion': 'gini', 'max...",-0.697520,-0.696001,-0.697932,-0.695238,-0.696673,0.001097,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,61,0,5079,1.688498,0.424388,0.053120,0.018902,False,entropy,9,0.56328,56,"{'bootstrap': False, 'criterion': 'entropy', '...",-0.697925,-0.699321,-0.701874,-0.699415,-0.699634,0.001422,16
81,61,1,5079,1.428867,0.368074,0.047593,0.026449,False,entropy,9,0.56328,56,"{'bootstrap': False, 'criterion': 'entropy', '...",-0.704310,-0.698298,-0.703652,-0.690721,-0.699245,0.005446,15
82,62,0,5079,0.836229,0.193159,0.087545,0.044065,True,entropy,6,0.2408,71,"{'bootstrap': True, 'criterion': 'entropy', 'm...",-0.723235,-0.733659,-0.729844,-0.722443,-0.727295,0.004665,58
83,63,0,5079,1.191454,0.317888,0.029180,0.016634,False,entropy,8,0.872603,56,"{'bootstrap': False, 'criterion': 'entropy', '...",-0.714722,-0.707961,-0.700335,-0.701647,-0.706166,0.005720,31


In [14]:
search.best_estimator_ 

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10,
                       max_samples=0.20000363458756698, n_estimators=39)

In [15]:
search.best_score_

-0.6920483346046165