# 📝 Exercise M3.01

The goal is to write an exhaustive search to find the best parameters
combination maximizing the model generalization performance.

Here we use a small subset of the Adult Census dataset to make the code
faster to execute. Once your code works on the small subset, try to
change `train_size` to a larger value (e.g. 0.8 for 80% instead of
20%).

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

adult_census = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

adult_census = pd.read_csv()

target_name = 'class'
y = adult_census[target_name]
X = adult_census.drop(columns = [target, 'education-num'])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat_preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough', sparse_threshold=0)

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", HistGradientBoostingClassifier(random_state=42))
])

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

cat_preprocessor = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

preprocessor = ColumnTransformer(
    [('cat_preprocessor', cat_preprocessor, selector(dtype_include=object))],
    remainder='passthrough', sparse_threshold=0
)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', HistGradientBoostingClassifier(random_state=42))
])


Use the previously defined model (called `model`) and using two nested `for`
loops, make a search of the best combinations of the `learning_rate` and
`max_leaf_nodes` parameters. In this regard, you will need to train and test
the model by setting the parameters. The evaluation of the model should be
performed using `cross_val_score` on the training set. We will use the
following parameters search:
- `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls
  the ability of a new tree to correct the error of the previous sequence of
  trees
- `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the
  depth of each tree.

In [6]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                     transformers=[('cat_preprocessor',
                                    OrdinalEncoder(handle_unknown='use_encoded_value',
                                                   unknown_value=-1),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x0000022E6F5B8A00>)])),
  ('classifier', HistGradientBoostingClassifier(random_state=42))],
 'verbose': False,
 'preprocessor': ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                   transformers=[('cat_preprocessor',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000022E6F5B8A00>)]),
 'classifier': HistGradientBoostingClassifier(r

In [15]:
from sklearn.model_selection import cross_val_score

learning_rate = [0.01, 0.1, 1, 10]
max_depth = [3, 10, 30]

score_dict = {'depth' : [], 'learning_rate' : [], 'mean' : []}

for depth in max_depth:
    for learning in learning_rate:
        model.set_params(classifier__learning_rate = learning, classifier__max_depth = depth)
        scores = cross_val_score(model, data, target)
        score_dict['depth'].append(depth)
        score_dict['learning_rate'].append(learning)
        score_dict['mean'].append(round(scores.mean(),3))
        print(f'depth: {depth}, learning_rate: {learning}, mean: {scores.mean():.3f}')
    print('\n')

depth: 3, learning_rate: 0.01, mean: 0.813
depth: 3, learning_rate: 0.1, mean: 0.866
depth: 3, learning_rate: 1, mean: 0.871
depth: 3, learning_rate: 10, mean: 0.346


depth: 10, learning_rate: 0.01, mean: 0.848
depth: 10, learning_rate: 0.1, mean: 0.874
depth: 10, learning_rate: 1, mean: 0.864
depth: 10, learning_rate: 10, mean: 0.388


depth: 30, learning_rate: 0.01, mean: 0.849
depth: 30, learning_rate: 0.1, mean: 0.874
depth: 30, learning_rate: 1, mean: 0.863
depth: 30, learning_rate: 10, mean: 0.402




In [18]:
pd.DataFrame(score_dict).sort_values('mean', ascending = False)

Unnamed: 0,depth,learning_rate,mean
5,10,0.1,0.874
9,30,0.1,0.874
2,3,1.0,0.871
1,3,0.1,0.866
6,10,1.0,0.864
10,30,1.0,0.863
8,30,0.01,0.849
4,10,0.01,0.848
0,3,0.01,0.813
11,30,10.0,0.402



Now use the test set to score the model using the best parameters
that we found using cross-validation in the training set.

In [22]:
from sklearn.model_selection import cross_validate

max_depth = [10, 30]

for depth in max_depth:
    model.set_params(classifier__learning_rate = 0.10, classifier__max_depth = depth)
    cv_result = cross_validate(model, data, target)
    scores = cv_result['test_score']
    
    print(f'depth: {depth}, learning_rate: 0.01, scores: {scores.mean():.4f} +- {scores.std():.4f}')


depth: 10, learning_rate: 0.01, scores: 0.8738 +- 0.0022
depth: 30, learning_rate: 0.01, scores: 0.8736 +- 0.0024
