# G - End-to-End ML Part 6: Hyperparametertuning mit Grid Search

Environment für dieses Notebook: `ads-ml-full` (*requirements-py3.11-ads-ml-full.txt*).


## Daten einlesen

In [1]:
from repml.datasets.trees_ber import prepare_trees_ber

# prepare_trees_ber() returns three objects; only the second one is kept (as
# `labeled`), the first and third are discarded.
# NOTE(review): presumably the discarded items are other variants of the tree
# dataset — confirm against repml.datasets.trees_ber.
_, labeled, _ = prepare_trees_ber()

In [2]:
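# Disabled alternative: read the data subset directly from a parquet file.
# Note: `pd` is not imported at this point — add `import pandas as pd` before enabling.
# labeled = pd.read_parquet("../data/subset_baumkataster_berlin_2023.parquet")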

## Trainings- und Testdaten

In [3]:
from sklearn.model_selection import train_test_split

# Name of the target column, and the names of all remaining columns of `labeled`.
y = "pflanzjahr"
X = [*labeled.columns]
X.remove(y)

# Hold out 20 % of the rows as test data; the split is stratified on the
# "gattung_deutsch" column and seeded for reproducibility.
train_data, test_data = train_test_split(
    labeled,
    test_size=0.2,
    random_state=42,
    stratify=labeled["gattung_deutsch"],
)

## Preprocessing Pipeline (wie 4E)

In [4]:
# Numeric feature columns of feature set 6.
num_cols_6 = ["kronedurch", "stammumfg", "baumhoehe", "hoehe_zu_krone", "hoehe_zu_stamm"]

# Categorical feature columns of feature set 6.
cat_cols_6 = [
    "bezirk",
    "baumart",
    "art_dtsch",
    "art_bot",
    "gattung_deutsch",
    "gattung",
    "art_dtsch_infrequent",
    "art_bot_infrequent",
    "gattung_deutsch_infrequent",
    "gattung_infrequent",
    "namenr",
    "namenr_nonum",
    "lat_lon_tile",
]

# Full feature set: numeric columns first, categorical columns after them.
cols_set_6 = num_cols_6 + cat_cols_6

# Positions of each column group inside `cols_set_6`; the ColumnTransformers
# address their columns by these integer positions.
num_cols_6_ix = list(map(cols_set_6.index, num_cols_6))
cat_cols_6_ix = list(map(cols_set_6.index, cat_cols_6))

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Preprocessing step A: impute missing values in the numeric columns (seeded
# IterativeImputer); every other column is passed through unchanged.
ct_6a = ColumnTransformer(
    [("imp", IterativeImputer(random_state=42), num_cols_6_ix)],
    remainder="passthrough",
)

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing step B: standardise the numeric columns and one-hot encode the
# categorical ones. Categories not seen during fit are ignored at transform time,
# and the encoder emits a sparse matrix.
ct_6b = ColumnTransformer(
    [
        ("scale", StandardScaler(), num_cols_6_ix),
        (
            "ohe",
            OneHotEncoder(handle_unknown="ignore", sparse_output=True),
            cat_cols_6_ix,
        ),
    ]
)

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor with fixed split/leaf size limits and a fixed seed.
est_6 = GradientBoostingRegressor(
    min_samples_split=25,
    min_samples_leaf=10,
    random_state=42,
)

In [8]:
from sklearn.pipeline import Pipeline

# Full pipeline: both preprocessing stages followed by the regressor. The step
# names are the prefixes used in the hyperparameter grid (e.g. "model__...").
pipeline_6 = Pipeline(
    [
        ("prep_a", ct_6a),
        ("prep_b", ct_6b),
        ("model", est_6),
    ]
)

## Grid

In [9]:
# # smaller parameter grid
# parameters = {
#     "model__learning_rate": [0.01, 0.05, 0.1],
#     "model__subsample": [0.8, 1.0],
# }

# Hyperparameter grid for the "model" pipeline step: 2 values for each of
# 4 parameters, i.e. 2 * 2 * 2 * 2 = 16 candidate combinations.
parameters = dict(
    model__n_estimators=[100, 500],
    model__learning_rate=[0.05, 0.1],
    model__subsample=[0.7, 0.9],
    model__max_depth=[5, 7],
)

In [10]:
from sklearn.model_selection import ParameterGrid

# Expand the grid into the explicit list of parameter combinations.
list(ParameterGrid(parameters))

[{'model__learning_rate': 0.05,
  'model__max_depth': 5,
  'model__n_estimators': 100,
  'model__subsample': 0.7},
 {'model__learning_rate': 0.05,
  'model__max_depth': 5,
  'model__n_estimators': 100,
  'model__subsample': 0.9},
 {'model__learning_rate': 0.05,
  'model__max_depth': 5,
  'model__n_estimators': 500,
  'model__subsample': 0.7},
 {'model__learning_rate': 0.05,
  'model__max_depth': 5,
  'model__n_estimators': 500,
  'model__subsample': 0.9},
 {'model__learning_rate': 0.05,
  'model__max_depth': 7,
  'model__n_estimators': 100,
  'model__subsample': 0.7},
 {'model__learning_rate': 0.05,
  'model__max_depth': 7,
  'model__n_estimators': 100,
  'model__subsample': 0.9},
 {'model__learning_rate': 0.05,
  'model__max_depth': 7,
  'model__n_estimators': 500,
  'model__subsample': 0.7},
 {'model__learning_rate': 0.05,
  'model__max_depth': 7,
  'model__n_estimators': 500,
  'model__subsample': 0.9},
 {'model__learning_rate': 0.1,
  'model__max_depth': 5,
  'model__n_estimators':

In [11]:
# Number of candidate combinations. ParameterGrid supports len() directly, so
# the grid does not have to be materialised into a list just to count it.
len(ParameterGrid(parameters))

16

In [12]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over `parameters` on the full pipeline, evaluated with
# 3-fold cross-validation and 4 parallel jobs. No `scoring` is passed, so the
# pipeline's own `score` method is used to rank candidates.
cv_folds = 3
search = GridSearchCV(
    estimator=pipeline_6,
    param_grid=parameters,
    cv=cv_folds,
    n_jobs=4,
)

In [13]:
%%time
import os

import mlflow
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Select the tracking store from LOCAL_MLFLOW_TRACKING_URI (None if the variable
# is unset) and the experiment that the runs are recorded under.
mlflow.set_tracking_uri(os.environ.get("LOCAL_MLFLOW_TRACKING_URI"))
mlflow.set_experiment("4G-grid-search")


# Run the grid search inside one parent MLflow run and mirror every candidate
# from `cv_results_` into its own nested child run.
with mlflow.start_run():
    # Enable scikit-learn autologging before fitting.
    mlflow.sklearn.autolog()

    search.fit(X=train_data[cols_set_6], y=train_data[y])

    cv_results = search.cv_results_
    for idx, params in enumerate(cv_results["params"]):
        with mlflow.start_run(nested=True):
            # Log the candidate's full parameter combination in one call.
            mlflow.log_params(params)

            # Log all available metrics for each candidate. The per-fold keys
            # are named "split0_test_score", "split1_test_score", ... (no
            # underscore directly after "split"), so the prefix must be
            # "split" — "split_" would silently skip every per-fold score.
            for metric_name, values in cv_results.items():
                if metric_name.startswith(("mean_", "std_", "split", "rank_")):
                    mlflow.log_metric(metric_name, values[idx])

2024/03/06 18:53:27 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/03/06 18:53:27 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

CPU times: user 6min 26s, sys: 14 s, total: 6min 40s
Wall time: 1h 10min 26s


In [14]:
# Parameter combination with the best mean cross-validated score.
search.best_params_

{'model__learning_rate': 0.1,
 'model__max_depth': 7,
 'model__n_estimators': 500,
 'model__subsample': 0.7}

In [15]:
# Mean cross-validated score of the best parameter combination. GridSearchCV was
# created without `scoring`, so this is the pipeline's default score
# (R² for a regressor as final step).
search.best_score_

0.8445172069220656

In [16]:
import pandas as pd

# Show the complete cross-validation result table, best-ranked candidate first,
# with pandas' row/column display limits lifted for this one output only.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        pd.DataFrame(search.cv_results_)
        .sort_values(by="rank_test_score")
    )

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,param_model__max_depth,param_model__n_estimators,param_model__subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
14,366.630171,0.993649,2.105348,0.023255,0.1,7,500,0.7,"{'model__learning_rate': 0.1, 'model__max_dept...",0.847525,0.843654,0.842373,0.844517,0.00219,1
15,343.459974,52.992389,1.809363,0.116013,0.1,7,500,0.9,"{'model__learning_rate': 0.1, 'model__max_dept...",0.847126,0.842571,0.842298,0.843998,0.002214,2
6,1026.446267,44.959886,2.136245,0.046495,0.05,7,500,0.7,"{'model__learning_rate': 0.05, 'model__max_dep...",0.835906,0.832157,0.830315,0.832793,0.002327,3
7,615.700763,275.321394,2.211728,0.045552,0.05,7,500,0.9,"{'model__learning_rate': 0.05, 'model__max_dep...",0.834648,0.831062,0.830065,0.831925,0.001968,4
10,249.512712,3.691105,1.555621,0.017137,0.1,5,500,0.7,"{'model__learning_rate': 0.1, 'model__max_dept...",0.834487,0.831026,0.829191,0.831568,0.002196,5
11,302.282419,1.016751,1.544975,0.013064,0.1,5,500,0.9,"{'model__learning_rate': 0.1, 'model__max_dept...",0.832784,0.830787,0.827475,0.830349,0.002189,6
2,492.564237,106.356047,4.109883,0.169584,0.05,5,500,0.7,"{'model__learning_rate': 0.05, 'model__max_dep...",0.81982,0.816868,0.815385,0.817358,0.001844,7
13,100.105783,0.422505,0.71864,0.022024,0.1,7,100,0.9,"{'model__learning_rate': 0.1, 'model__max_dept...",0.818183,0.815102,0.813994,0.815759,0.001772,8
3,722.602838,73.200855,1.700941,0.04358,0.05,5,500,0.9,"{'model__learning_rate': 0.05, 'model__max_dep...",0.817948,0.815311,0.813504,0.815588,0.001824,9
12,92.46479,2.227899,0.741102,0.049837,0.1,7,100,0.7,"{'model__learning_rate': 0.1, 'model__max_dept...",0.817783,0.815547,0.813078,0.815469,0.001922,10


## Performance

### R-squared

In [17]:
# R² of the refitted best pipeline on the held-out test data. The features are
# selected with `cols_set_6`, i.e. exactly the columns (and column order) the
# pipeline was fitted on, instead of "all columns except the target".
search.score(X=test_data[cols_set_6], y=test_data[y])

0.8441130550617467

### Mean Absolute Error

In [18]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the held-out test data, in units of the target
# ("pflanzjahr"). Predictions use `cols_set_6`, i.e. exactly the feature columns
# the pipeline was fitted on.
mean_absolute_error(
    y_true=test_data[y],
    y_pred=search.predict(test_data[cols_set_6]),
)



7.316012833153506