# Auto-ajuste de hiperparámetros por búsqueda de cuadrícula (grid-search)

## Nuestro modelo predictivo

In [None]:
from sklearn import set_config

# Ask scikit-learn to display estimators using its "diagram" representation.
set_config(display="diagram")

In [None]:
import pandas as pd

# Load the adult-census data used throughout the first part of the notebook.
# NOTE(review): the path says "numeric", yet later cells select object-dtype
# columns and drop "education-num" -- confirm this file contains them.
adult_census = pd.read_csv("../../data/adult-census-numeric/full.csv")

In [None]:
# Extract the column that holds the prediction target.
target_name = "class"
target = adult_census[target_name]
target

In [None]:
# Remove the target and the "education-num" column from the features (per the
# original author's note, that column duplicates "education").

columns_to_drop = [target_name, "education-num"]
data = adult_census.drop(columns=columns_to_drop)
data.head()

In [None]:
# Hold out a test set; the fixed random_state makes the split repeatable.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(data, target, random_state=42)
data_train, data_test, target_train, target_test = split_parts

In [None]:
# We will define a pipeline that handles both numerical and categorical features.

# First, select every categorical (object-dtype) column.

from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

Aquí usaremos un modelo basado en árboles como clasificador (`HistGradientBoostingClassifier`).
Esto significa que:
- las variables numéricas no necesitan escalado;
- las variables categóricas se pueden tratar con un codificador ordinal (`OrdinalEncoder`) incluso si el orden de codificación no es significativo;
- para los modelos basados en árboles, `OrdinalEncoder` evita tener representaciones de alta dimensión.

In [None]:
# Build the OrdinalEncoder. No explicit category list is passed; instead,
# categories unknown at transform time are mapped to the sentinel value -1.

from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)

In [None]:
# Apply the ordinal encoder to the categorical columns only; every other
# column is passed through untouched.

from sklearn.compose import ColumnTransformer

categorical_step = ('cat_preprocessor', categorical_preprocessor, categorical_columns)
preprocessor = ColumnTransformer(
    transformers=[categorical_step],
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
# Chain the preprocessing with a histogram gradient-boosting classifier that
# predicts whether a person earns more than 50 k$ a year.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

classifier = HistGradientBoostingClassifier(max_leaf_nodes=4, random_state=42)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", classifier),
])

model

## Ajustar con búsqueda de cuadrícula
- Usamos el **estimador GridSearchCV** para hacer la búsqueda.
    - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- Dado que la búsqueda de la cuadrícula será costosa, solo exploraremos la combinación `learning_rate` y `max_leaf_nodes`.

In [None]:
%%time
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__learning_rate': (0.01, 0.1, 1, 10),
    'classifier__max_leaf_nodes': (3, 10, 31, 50)
}

model_grid_search = GridSearchCV(model, param_grid=param_grid, cv=2)

model_grid_search.fit(data_train,target_train)

In [None]:
# Finally, score the fitted grid search on the held-out test set.

accuracy = model_grid_search.score(data_test, target_test)

accuracy

In [None]:
# Display the parameters of the grid-search object.
model_grid_search.get_params()

Una vez que la búsqueda de cuadrícula está ajustada, se puede usar como cualquier otro predictor llamando a `predict` y `predict_proba`.

Internamente, utilizará el modelo con los mejores parámetros encontrados durante el ajuste.

In [None]:
# Predict the first 5 test samples using the estimator with the best parameters found.
model_grid_search.predict(data_test.iloc[0:5])

In [None]:
# The winning hyperparameter combination is available in the best_params_ attribute.

model_grid_search.best_params_


In [None]:
# Every evaluated candidate is stored in the cv_results_ attribute of the
# search. Load it into a DataFrame and order it by mean test score, highest
# first.

cv_results = pd.DataFrame(model_grid_search.cv_results_)
cv_results = cv_results.sort_values(by="mean_test_score", ascending=False)
cv_results

In [None]:
# Keep only the most informative columns: one "param_<name>" column per tuned
# hyperparameter, followed by the test-score summary columns.

score_columns = ["mean_test_score", "std_test_score", "rank_test_score"]
column_results = [f"param_{name}" for name in param_grid] + score_columns
cv_results = cv_results[column_results]

In [None]:
def shorten_param(param_name):
    """Return the part of *param_name* that follows its last ``__``.

    Names that contain no ``__`` separator are returned unchanged.
    """
    _, separator, suffix = param_name.rpartition("__")
    return suffix if separator else param_name


# Shorten the column labels for readability by applying shorten_param to each one.
cv_results = cv_results.rename(columns=shorten_param)
cv_results

Podemos visualizar la búsqueda de la cuadrícula como un mapa de calor.
Necesitamos transformar nuestros cv_results en un DataFrame, donde:
- Las filas corresponderán a los valores de tasa de aprendizaje;
- Las columnas corresponderán a max_leaf_nodes;
- El contenido del DataFrame serán los puntajes medios de prueba.

In [None]:
# Reshape the results into a learning_rate x max_leaf_nodes table whose cells
# hold the mean test score.
pivoted_cv_results = cv_results.pivot_table(
    index=["learning_rate"],
    columns=["max_leaf_nodes"],
    values="mean_test_score",
)

pivoted_cv_results

In [None]:
# Heat map of the mean test scores over the hyperparameter grid.
import seaborn as sns

heatmap_options = {"annot": True, "cmap": "YlGnBu", "vmin": 0.7, "vmax": 0.9}
ax = sns.heatmap(pivoted_cv_results, **heatmap_options)
# Flip the y axis so the rows read in the opposite vertical order.
ax.invert_yaxis()

## **Ejercicio**

Para el dataset "house-prices/full.csv" y usando [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html): 
- Autoajusta los hiperparámetros usando grid search.

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [53]:
# Load the house-prices dataset.
# NOTE(review): "haouses" looks like a typo for "houses"; later cells use the
# same name, so any rename has to be applied to them together.
haouses = pd.read_csv("../../data/house-prices/full.csv")
haouses.columns
# haouses.shape

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [54]:
# Separate the regression target (sale price) from the feature columns.
# Note: this rebinds target_name, which the adult-census cells set to "class".
target_name = "SalePrice"

y = haouses[target_name]
X = haouses.drop(columns=target_name)

In [55]:
# Display the target values.
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [None]:
# Summarise the feature matrix: size, column names and per-column information.
print("Características del dataset:")
print(f"- Número de muestras: {X.shape[0]}")
print(f"- Número de características: {X.shape[1]}")
print(f"\nNombres de las características:\n{X.columns}")
# DataFrame.info() writes its report itself and returns None, so it has to be
# called as a statement. Interpolating it inside the f-string emitted the
# report before the heading and then printed "None...".
print("\nDescripción de las características:")
X.info()

# Preview the first rows.
X.head()

In [56]:
from sklearn.compose import make_column_selector as selector

# One selector per dtype family: object columns are treated as categorical,
# everything else as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Resolve both selectors against the feature matrix to get the column names.
numerical_columns, categorical_columns = (
    numerical_columns_selector(X),
    categorical_columns_selector(X),
)

In [57]:
# Standardise the numerical columns and one-hot encode the categorical ones;
# categories unseen during fit are ignored at transform time.
# NOTE(review): the column listing shows an 'Id' column; if it is numeric it is
# scaled and fed to the model like any other feature -- consider dropping it.
numeric_step = ('num', StandardScaler(), numerical_columns)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])



In [58]:
from sklearn.model_selection import train_test_split

# Hold out a test set for the final evaluation; the seed fixes the split.
split_parts = train_test_split(X, y, random_state=42)
data_train, data_test, target_train, target_test = split_parts

## Lasso

In [59]:
# Preprocessing followed by a Lasso regressor; alpha=1.0 is only the starting
# value, the grid search below overrides it.
regressor = Lasso(alpha=1.0)
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

In [60]:
%%time
from sklearn.model_selection import GridSearchCV

param_grid = {
    'regressor__alpha': (0.01, 0.1, 1, 10,100),
    'regressor__max_iter': (10, 100, 1000)
}

# model_grid_search = GridSearchCV(lasso_pipeline, param_grid=param_grid, cv=2) // scoring por defecto: r2
model_grid_search = GridSearchCV(lasso_pipeline, param_grid=param_grid, cv=2, scoring='neg_mean_absolute_error')

model_grid_search.fit(data_train,target_train)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


CPU times: user 9.89 s, sys: 52 ms, total: 9.94 s
Wall time: 10 s


  model = cd_fast.sparse_enet_coordinate_descent(


In [61]:
# Score the fitted search on the held-out test set. The search was built with
# scoring='neg_mean_absolute_error', which is why the recorded value is negative.
result = model_grid_search.score(data_test, target_test)
result

-17217.130477046943

In [62]:
# Show the hyperparameter combination selected by the search.
model_grid_search.best_params_

{'regressor__alpha': 100, 'regressor__max_iter': 100}

In [63]:
# List the fields recorded for each evaluated candidate.
model_grid_search.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_regressor__alpha', 'param_regressor__max_iter', 'params', 'split0_test_score', 'split1_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [64]:

# Tabulate the cross-validation results, best mean test score first.
cv_results = pd.DataFrame(model_grid_search.cv_results_)
cv_results = cv_results.sort_values(by="mean_test_score", ascending=False)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__alpha,param_regressor__max_iter,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
13,0.09266,0.000949,0.021023,0.00026,100.0,100,"{'regressor__alpha': 100, 'regressor__max_iter...",-20006.116126,-16311.853575,-18158.984851,1847.131275,1
14,0.346186,0.00858,0.019878,8.3e-05,100.0,1000,"{'regressor__alpha': 100, 'regressor__max_iter...",-19989.608427,-16346.148827,-18167.878627,1821.7298,2
12,0.03728,0.001655,0.01986,0.000156,100.0,10,"{'regressor__alpha': 100, 'regressor__max_iter...",-20512.571229,-18555.551475,-19534.061352,978.509877,3
11,0.75562,0.107197,0.020644,0.000245,10.0,1000,"{'regressor__alpha': 10, 'regressor__max_iter'...",-22816.430098,-20335.467965,-21575.949032,1240.481066,4
10,0.12125,0.00986,0.020775,0.000431,10.0,100,"{'regressor__alpha': 10, 'regressor__max_iter'...",-22967.363769,-20215.055465,-21591.209617,1376.154152,5
9,0.064588,0.001113,0.030361,0.006791,10.0,10,"{'regressor__alpha': 10, 'regressor__max_iter'...",-29532.004764,-28409.412173,-28970.708468,561.296296,6
8,0.91919,0.078729,0.036766,0.000679,1.0,1000,"{'regressor__alpha': 1, 'regressor__max_iter':...",-30415.82766,-30001.306074,-30208.566867,207.260793,7
7,0.111061,0.00182,0.022144,0.002027,1.0,100,"{'regressor__alpha': 1, 'regressor__max_iter':...",-32449.858207,-32415.256333,-32432.55727,17.300937,8
6,0.036543,0.00078,0.020492,0.000403,1.0,10,"{'regressor__alpha': 1, 'regressor__max_iter':...",-36268.561873,-35979.694195,-36124.128034,144.433839,9
3,0.036656,3.9e-05,0.020147,0.000521,0.1,10,"{'regressor__alpha': 0.1, 'regressor__max_iter...",-37357.490329,-37052.423009,-37204.956669,152.53366,10
