# 📝 Exercise M3.01

The goal is to write an exhaustive search to find the best parameters
combination maximizing the model generalization performance.

Here we use a small subset of the Adult Census dataset to make the code
faster to execute. Once your code works on the small subset, try to
change `train_size` to a larger value (e.g. 0.8 for 80% instead of
20%).

In [14]:
import pandas as pd

from sklearn.model_selection import train_test_split

adult_census = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough', sparse_threshold=0)

# This line is currently required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", HistGradientBoostingClassifier(random_state=42))
])


Use the previously defined model (called `model`) and using two nested `for`
loops, make a search of the best combinations of the `learning_rate` and
`max_leaf_nodes` parameters. In this regard, you will need to train and test
the model by setting the parameters. The evaluation of the model should be
performed using `cross_val_score`. We will use the following parameters
search:
- `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls
  the ability of a new tree to correct the error of the previous sequence of
  trees
- `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the
  depth of each tree.

In [3]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                     transformers=[('cat-preprocessor',
                                    OrdinalEncoder(handle_unknown='use_encoded_value',
                                                   unknown_value=-1),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x0000028781C9B670>)])),
  ('classifier', HistGradientBoostingClassifier(random_state=42))],
 'verbose': False,
 'preprocessor': ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                   transformers=[('cat-preprocessor',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000028781C9B670>)]),
 'classifier': HistGradientBoostingClassifier(r

In [6]:
from sklearn.model_selection import cross_val_score

learning_rate_range = (.01, .1, 1, 10)
max_leaf_nodes_range= (3, 10, 30)

In [15]:
def cv_model_hyperparameters():
    for learning_rate in learning_rate_range:
        for max_leaf_nodes in max_leaf_nodes_range:

            model.set_params(
                classifier__learning_rate = learning_rate,
                classifier__max_leaf_nodes = max_leaf_nodes
            )

            score = cross_val_score(
                model,
                data,
                target,
            )

            print(f"CV score of {score.mean():.3f} obtained for learning_rate={learning_rate} and max_leaf_nodes={max_leaf_nodes}")

In [17]:
cv_model_hyperparameters()

CV score of 0.799 obtained for learning_rate=0.01 and max_leaf_nodes=3
CV score of 0.820 obtained for learning_rate=0.01 and max_leaf_nodes=10
CV score of 0.848 obtained for learning_rate=0.01 and max_leaf_nodes=30
CV score of 0.856 obtained for learning_rate=0.1 and max_leaf_nodes=3
CV score of 0.870 obtained for learning_rate=0.1 and max_leaf_nodes=10
CV score of 0.874 obtained for learning_rate=0.1 and max_leaf_nodes=30
CV score of 0.870 obtained for learning_rate=1 and max_leaf_nodes=3
CV score of 0.867 obtained for learning_rate=1 and max_leaf_nodes=10
CV score of 0.860 obtained for learning_rate=1 and max_leaf_nodes=30
CV score of 0.281 obtained for learning_rate=10 and max_leaf_nodes=3
CV score of 0.761 obtained for learning_rate=10 and max_leaf_nodes=10
CV score of 0.616 obtained for learning_rate=10 and max_leaf_nodes=30


In [None]:
cv_model_hyperparameters

score of 0.874 obtained for learning_rate=0.1 and max_leaf_nodes=30