In [1]:
import pandas as pd
from scipy.io import arff
import numpy as np
from sklearn.model_selection import train_test_split

# Load the adult census dataset from its ARFF export. `loadarff` returns a
# pair whose first item holds the records; only that item is used here.
dataarff = arff.loadarff("phpMawTba.arff")
adult_census = pd.DataFrame(dataarff[0])

# Decode every bytes value to a UTF-8 str and leave all other values as-is.
# `DataFrame.applymap` is deprecated (it emitted a FutureWarning when this
# cell ran), so the element-wise conversion is done column by column with
# `Series.map`, which is available in every pandas version.
adult_census = adult_census.apply(
    lambda column: column.map(
        lambda value: value.decode("utf-8") if isinstance(value, bytes) else value
    )
)

target_name = "class"

# Split the frame into the prediction target and the feature table.
# NOTE(review): "education-num" is dropped together with the target,
# presumably because it duplicates "education" -- confirm.
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])
data.head()

  adult_census[list(adult_census.columns.values)] = adult_census[list(adult_census.columns.values)].applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)


Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25.0,Private,226802.0,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States
1,38.0,Private,89814.0,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
2,28.0,Local-gov,336951.0,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
3,44.0,Private,160323.0,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States
4,18.0,?,103497.0,Some-college,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector as selector

# Every column stored with an object dtype is handled as a categorical feature.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

# Ordinal encoding of the categories; unknown categories are mapped to -1
# instead of raising.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Encode the categorical columns and forward all remaining columns untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_preprocessor", categorical_preprocessor, categorical_columns),
    ],
    remainder="passthrough",
)

In [4]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Two-step pipeline: column preprocessing followed by a histogram-based
# gradient-boosting classifier with a fixed seed.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", HistGradientBoostingClassifier(max_leaf_nodes=4, random_state=42)),
    ]
)
model

In [5]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the untuned pipeline, gathered directly into a
# DataFrame so the name is bound to a single kind of object.
cv_results = pd.DataFrame(cross_validate(model, data, target, cv=5))
cv_results

Unnamed: 0,fit_time,score_time,test_score
0,0.39481,0.03,0.863036
1,0.372053,0.033004,0.860784
2,0.379073,0.02951,0.86036
3,0.349551,0.033002,0.863124
4,0.34991,0.027511,0.867219


In [11]:
# Report the mean and standard deviation of the per-fold test scores.
print(
    "Generalization score without hyperparameters tuning:\n"
    f"{cv_results['test_score'].mean():.3f} +- "
    f"{cv_results['test_score'].std():.3f}"
)

Generalization score without hyperparameters tuning:
0.863 +- 0.003


In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate values to search over. The "classifier__" prefix addresses the
# pipeline step named "classifier".
param_grid = dict(
    classifier__learning_rate=(0.05, 0.5),
    classifier__max_leaf_nodes=(10, 30),
)

# Exhaustive search over the grid with 2-fold cross-validation and 2 workers.
model_grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=2,
    n_jobs=2,
)
model_grid_search.fit(data, target)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
# Tabulate the grid-search results and show only the searched parameters
# together with their mean/std test score and rank.
cv_results = pd.DataFrame(data=model_grid_search.cv_results_)
cv_results.loc[
    :,
    [
        "param_classifier__learning_rate",
        "param_classifier__max_leaf_nodes",
        "mean_test_score",
        "std_test_score",
        "rank_test_score",
    ],
]

Unnamed: 0,param_classifier__learning_rate,param_classifier__max_leaf_nodes,mean_test_score,std_test_score,rank_test_score
0,0.05,10,0.864338,0.000369,4
1,0.05,30,0.870665,0.000348,1
2,0.5,10,0.867553,0.001126,2
3,0.5,30,0.867,0.000164,3


In [14]:
# Display the parameter combination selected by the grid search.
model_grid_search.best_params_

{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}

In [15]:
from sklearn.model_selection import train_test_split

# Keep 20% of the samples aside as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Re-run the grid search on the training part only (`fit` returns the fitted
# search object), then score it on the held-out samples.
accuracy = model_grid_search.fit(X_train, y_train).score(X_test, y_test)
print(f"Accuracy on test set: {accuracy:.3f}")

Accuracy on test set: 0.879


In [16]:
# Cross-validate the grid search itself (5 outer folds, 2 workers) and keep
# the estimator fitted on each fold for later inspection.
cv_results = cross_validate(
    estimator=model_grid_search,
    X=data,
    y=target,
    cv=5,
    n_jobs=2,
    return_estimator=True,
)

In [17]:
# Convert the cross-validation output to a DataFrame and summarize the
# per-fold test scores as mean +- standard deviation.
cv_results = pd.DataFrame(data=cv_results)
cv_test_scores = cv_results["test_score"]
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{cv_test_scores.mean():.3f}"
    " +- "
    f"{cv_test_scores.std():.3f}"
)

Generalization score with hyperparameters tuning:
0.871 +- 0.003


In [19]:
# Show which hyperparameters the grid search selected in each outer fold
# (folds are numbered from 1 in the printed message).
for fold_number, fold_search in enumerate(cv_results["estimator"], start=1):
    print(
        f"Best hyperparameters for fold #{fold_number}: \n"
        f"{fold_search.best_params_}"
    )

Best hyperparameters for fold #1: 
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #2: 
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #3: 
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #4: 
{'classifier__learning_rate': 0.5, 'classifier__max_leaf_nodes': 10}
Best hyperparameters for fold #5: 
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
