In [7]:
import pandas as pd
from scipy.io import arff
import numpy as np
from sklearn.model_selection import train_test_split

dataarff = arff.loadarff("phpMawTba.arff")
adult_census = pd.DataFrame(dataarff[0])
adult_census[list(adult_census.columns.values)] = adult_census[list(adult_census.columns.values)].applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
target_name = "class"

target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])
data.head()

  adult_census[list(adult_census.columns.values)] = adult_census[list(adult_census.columns.values)].applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)


Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25.0,Private,226802.0,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States
1,38.0,Private,89814.0,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
2,28.0,Local-gov,336951.0,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
3,44.0,Private,160323.0,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States
4,18.0,?,103497.0,Some-college,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

preprocessor = ColumnTransformer([("categorical_preprocessor", categorical_preprocessor, categorical_columns)], remainder="passthrough")

In [10]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4))
])

In [11]:
from scipy.stats import loguniform

class loguinform_int:
    """Integer valued version of the log-uniform distribution"""
    def __init__(self, a, b):
        self._distribution = loguniform(a, b)
    def rvs(self, *arg, **kwargs):
        """Random variable sample"""
        return self._distribution.rvs(*arg, **kwargs).astype(int)

In [14]:
%%time
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "classifier__l2_regularization": loguniform(1e-6, 1e3),
    "classifier__learning_rate": loguniform(0.001, 10),
    "classifier__max_leaf_nodes": loguinform_int(2, 256),
    "classifier__min_samples_leaf": loguinform_int(1, 100),
    "classifier__max_bins": loguinform_int(2, 255),
}

model_random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    verbose=1,
)
model_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: total: 3min 28s
Wall time: 18.3 s


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [15]:
accuracy = model_random_search.score(X_test, y_test)
print(f"The test accuracy score of the best model is {accuracy:.2f}")

The test accuracy score of the best model is 0.88


In [16]:
from pprint import pprint
print("The best parameters are: ")
pprint(model_random_search.best_params_)

The best parameters are: 
{'classifier__l2_regularization': 0.14529067143132915,
 'classifier__learning_rate': 0.026121135425655567,
 'classifier__max_bins': 180,
 'classifier__max_leaf_nodes': 152,
 'classifier__min_samples_leaf': 54}


In [17]:
column_results = [f"param_{name}" for name in param_distributions.keys()]
column_results += ["mean_test_score", "std_test_score", "rank_test_score"]

cv_results = pd.DataFrame(model_random_search.cv_results_)
cv_results = cv_results[column_results].sort_values(
    "mean_test_score", ascending=False
)

def shorten_param(param_name):
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name

cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

Unnamed: 0,l2_regularization,learning_rate,max_leaf_nodes,min_samples_leaf,max_bins,mean_test_score,std_test_score,rank_test_score
8,0.145291,0.026121,152,54,180,0.864923,0.002262,1
3,6.476788,1.52419,230,58,143,0.840627,0.004316,2
0,0.002369,0.006641,33,23,28,0.805547,0.002592,3
6,369.341912,0.587916,5,2,2,0.802681,0.003887,4
4,0.00229,0.01542,2,1,95,0.78742,0.002026,5
1,0.000169,1.456967,12,3,4,0.78712,0.004076,6
2,0.129657,5.181247,22,4,16,0.740519,0.030861,7
7,0.054481,3.834079,94,13,19,0.704784,0.03087,8
9,2e-06,3.421609,7,28,50,0.681803,0.05526,9
5,0.254942,9.769594,14,24,15,0.511678,0.228197,10


In [18]:
cv_results = pd.read_csv(
    "randomized_search_results.csv", index_col=0
)

cv_results=cv_results[column_results].rename(shorten_param, axis=1).sort_values("mean_test_score", ascending=False)
cv_results.head()

Unnamed: 0,l2_regularization,learning_rate,max_leaf_nodes,min_samples_leaf,max_bins,mean_test_score,std_test_score,rank_test_score
208,0.011775,0.076653,24,2,155,0.871393,0.001588,1
343,0.000404,0.244503,15,15,229,0.871339,0.002741,2
21,4.994918,0.077047,53,7,192,0.870793,0.001993,3
328,2.036232,0.224702,28,49,236,0.869837,0.000808,4
327,4.733808,0.036786,61,5,241,0.869673,0.002417,5
