# Auto-ajuste de hiperparámetros por búsqueda de cuadrícula (grid-search)

## Nuestro modelo predictivo

In [None]:
from sklearn import set_config

# Select scikit-learn's "diagram" display mode for estimators.
set_config(display="diagram")

In [None]:
import pandas as pd

# Load the adult census dataset from the shared data directory.
adult_census = pd.read_csv("../../data/adult-census-numeric/full.csv")

In [None]:
# Extract the column that holds the prediction target.
target_name = "class"
target = adult_census[target_name]
target

In [None]:
# Remove the target from the features, together with "education-num"
# (a duplicate of the "education" column).

columns_to_drop = [target_name, "education-num"]
data = adult_census.drop(columns=columns_to_drop)
data.head()

In [None]:
# Split the data into a training set and a test set.

from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)

In [None]:
# We will define a pipeline that handles both numerical and categorical
# features.

# Select the categorical columns, identified here by their object dtype.

from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

Aquí usaremos un modelo basado en árboles como clasificador (`HistGradientBoostingClassifier`).
Esto significa que:
- las variables numéricas no necesitan escalado;
- las variables categóricas se pueden tratar con un codificador ordinal (`OrdinalEncoder`) incluso si el orden de codificación no es significativo;
- para los modelos basados en árboles, `OrdinalEncoder` evita tener representaciones de alta dimensión.

In [None]:
# Build the OrdinalEncoder. No explicit category list is passed here;
# instead, categories not seen during fit are encoded with the value
# given by unknown_value (-1).

from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)

In [None]:
# Route the categorical columns through the ordinal encoder; every other
# column is passed through unchanged.

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("cat_preprocessor", categorical_preprocessor, categorical_columns),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

In [None]:
# Finally, chain the preprocessing with a tree-based classifier
# (histogram gradient boosting) to predict whether a person earns more
# than 50 k$ a year.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    (
        "classifier",
        HistGradientBoostingClassifier(max_leaf_nodes=4, random_state=42),
    ),
])

model

## Ajustar con búsqueda de cuadrícula
- Usamos el **estimador GridSearchCV** para hacer la búsqueda.
    - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- Dado que la búsqueda de la cuadrícula será costosa, solo exploraremos la combinación `learning_rate` y `max_leaf_nodes`.

In [None]:
%%time
from sklearn.model_selection import GridSearchCV

# Candidate values to explore. The "classifier__" prefix addresses the
# parameters of the pipeline step named "classifier".
param_grid = {
    'classifier__learning_rate': (0.01, 0.1, 1, 10),
    'classifier__max_leaf_nodes': (3, 10, 31, 50)
}

# Search over every combination of the grid, cross-validating with cv=2.
model_grid_search = GridSearchCV(model, param_grid=param_grid, cv=2)

model_grid_search.fit(data_train,target_train)

In [None]:
# Finally, check the accuracy of our model on the test set.

accuracy = model_grid_search.score(data_test, target_test)

accuracy

In [None]:
# Show the parameters of the grid-search estimator itself.
model_grid_search.get_params()

Una vez que la búsqueda de cuadrícula está ajustada, se puede usar como cualquier otro predictor llamando a `predict` y `predict_proba`.

Internamente, utilizará el modelo con los mejores parámetros encontrados durante el ajuste.

In [None]:
# Get predictions for the first 5 test samples using the estimator with the
# best parameters.
model_grid_search.predict(data_test.iloc[0:5])

In [73]:
# Those parameters can be read from the best_params_ attribute.

model_grid_search.best_params_


{'regressor__alpha': 100, 'regressor__max_iter': 1000}

In [None]:
# We can also inspect all the results stored in the cv_results_ attribute of
# the grid search. They are loaded into a DataFrame and sorted by decreasing
# mean test score.

cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False)
cv_results

In [None]:
# Focus on the most interesting columns: one "param_<name>" column per
# searched parameter, followed by the test-score summary columns.

parameter_columns = [f"param_{name}" for name in param_grid]
score_columns = ["mean_test_score", "std_test_score", "rank_test_score"]
column_results = parameter_columns + score_columns
cv_results = cv_results[column_results]

In [None]:
def shorten_param(param_name):
    """Return the part of *param_name* that follows its last ``"__"``.

    Names that do not contain ``"__"`` are returned unchanged, e.g.
    ``"param_classifier__learning_rate"`` becomes ``"learning_rate"`` while
    ``"mean_test_score"`` stays as it is.
    """
    if "__" not in param_name:
        return param_name
    # rpartition splits on the right-most separator: (head, sep, tail).
    return param_name.rpartition("__")[2]


# Apply shorten_param to every column label (axis=1).
cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

Podemos visualizar la búsqueda de la cuadrícula como un mapa de calor.
Necesitamos transformar nuestros cv_results en un DataFrame, donde:
- Las filas corresponderán a los valores de tasa de aprendizaje;
- Las columnas corresponderán a max_leaf_nodes;
- El contenido del DataFrame serán los puntajes medios de prueba.

In [None]:
# Reshape the results into a table with one row per learning_rate, one
# column per max_leaf_nodes and the mean test score as cell value.
pivoted_cv_results = cv_results.pivot_table(
    values="mean_test_score", index=["learning_rate"],
    columns=["max_leaf_nodes"])

pivoted_cv_results

In [None]:
# Heat map of the pivoted mean test scores.
import seaborn as sns

# The colour scale is clamped to the [0.7, 0.9] score range.
ax = sns.heatmap(pivoted_cv_results, annot=True, cmap="YlGnBu", vmin=0.7,
                 vmax=0.9)
# Flip the y axis of the plot.
ax.invert_yaxis()

## **Ejercicio**

Para el dataset "house-prices/full.csv" y usando [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html):
- Autoajusta los hiperparámetros usando grid search.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the house-prices data and list its columns.
haouses = pd.read_csv("../../data/house-prices/full.csv")
haouses.columns
# haouses.shape

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
target_name = "SalePrice"

y = haouses[target_name]
X = haouses.drop(columns=target_name)

In [None]:
# Display the target values.
y

In [None]:
# Show basic information about the dataset.
print("Características del dataset:")
print(f"- Número de muestras: {X.shape[0]}")
print(f"- Número de características: {X.shape[1]}")
print(f"\nNombres de las características:\n{X.columns}")
# DataFrame.info() prints its report itself and returns None, so it is called
# as a statement after the header instead of being interpolated into an
# f-string (which printed the report first and then "None...").
print("\nDescripción de las características:")
X.info()

# Preview the first rows.
X.head()

In [None]:
from sklearn.compose import make_column_selector as selector

# Columns with object dtype are treated as categorical; all the others are
# treated as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Resolve both selectors against the feature table to get column names.
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

In [None]:
# Standardize the numerical columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set; random_state is fixed so the split is repeatable.
data_train, data_test, target_train, target_test = train_test_split(
    X, y, random_state=42)

## Lasso

In [None]:
# Baseline pipeline: preprocessing followed by a Lasso regressor (alpha=1.0).
lasso_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Lasso(alpha=1.0)),
])

In [None]:
from sklearn.model_selection import cross_validate, ShuffleSplit

# 10 random train/validation splits, each holding out 20% of the samples.
# random_state is fixed (as in the train_test_split calls of this notebook)
# so the splits, and therefore the grid-search ranking, are reproducible
# from one run to the next.
cvss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [None]:
%%time
from sklearn.model_selection import GridSearchCV

# Candidate values to explore. The "regressor__" prefix addresses the
# parameters of the pipeline step named "regressor".
param_grid = {
    'regressor__alpha': (0.01, 0.1, 1, 10,100),
    'regressor__max_iter': (10, 100, 1000, 10000)
}

# Alternative kept for reference (2-fold CV with the default scoring, r2):
# model_grid_search = GridSearchCV(lasso_pipeline, param_grid=param_grid, cv=2)
# Here the candidates are ranked by negated mean absolute error over the
# cvss splits.
model_grid_search = GridSearchCV(lasso_pipeline, param_grid=param_grid, cv=cvss, scoring='neg_mean_absolute_error')

model_grid_search.fit(data_train,target_train)

In [None]:
# Score the tuned search on the held-out test set.
# NOTE(review): the search is configured with
# scoring='neg_mean_absolute_error', so this is expected to be a negated
# MAE (closer to 0 is better) rather than an R2 value - confirm.
result = model_grid_search.score(data_test, target_test)
result

In [70]:
# Parameter combination selected by the search.
model_grid_search.best_params_

{'regressor__alpha': 100, 'regressor__max_iter': 1000}

In [None]:
# List the fields available in cv_results_.
model_grid_search.cv_results_.keys()

In [72]:

# Load the search results into a DataFrame sorted by decreasing mean test
# score.
cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__alpha,param_regressor__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
14,0.546417,0.065882,0.01635,0.003244,100.0,1000,"{'regressor__alpha': 100, 'regressor__max_iter...",-17884.100095,-14376.714638,-18127.497208,-16013.255696,-14431.407716,-17053.133194,-21034.041481,-16726.936716,-18349.392258,-16938.245598,-17093.47246,1859.946749,1
13,0.16814,0.036818,0.021782,0.010555,100.0,100,"{'regressor__alpha': 100, 'regressor__max_iter...",-17892.204975,-14366.595867,-18132.47514,-16017.859719,-14432.159987,-17059.225893,-21081.959047,-16711.568306,-18343.839873,-16949.132637,-17098.702144,1871.681357,2
12,0.046367,0.001914,0.015448,0.000658,100.0,10,"{'regressor__alpha': 100, 'regressor__max_iter...",-20446.812223,-16605.109401,-20928.447914,-17776.555649,-16086.105909,-18125.124269,-23984.024668,-17492.507652,-20331.289028,-18355.490005,-19013.146672,2271.55697,3
10,0.186945,0.020627,0.019978,0.004698,10.0,100,"{'regressor__alpha': 10, 'regressor__max_iter'...",-19902.796851,-16489.298208,-19985.498251,-18034.789781,-17455.807248,-23122.688007,-24047.375922,-18465.872003,-20368.736754,-19255.024066,-19712.788709,2258.798075,4
11,1.108094,0.319726,0.016292,0.001903,10.0,1000,"{'regressor__alpha': 10, 'regressor__max_iter'...",-20124.663112,-16503.552763,-19944.648373,-18132.901367,-17531.894414,-23147.017552,-24152.26552,-18441.066303,-20713.946156,-19383.579524,-19807.553508,2278.508064,5
9,0.048813,0.001594,0.015694,0.000866,10.0,10,"{'regressor__alpha': 10, 'regressor__max_iter'...",-26189.560274,-21953.806702,-26303.295645,-24282.332076,-22130.672174,-27020.17923,-29089.026967,-23922.047316,-26741.304844,-24148.038148,-25178.026338,2157.436248,6
8,1.583903,0.065764,0.018572,0.003432,1.0,1000,"{'regressor__alpha': 1, 'regressor__max_iter':...",-27376.754452,-25542.179334,-25900.734866,-22876.387732,-24338.111879,-29882.306305,-31886.428928,-23320.717566,-27161.700056,-23376.766899,-26166.208802,2819.384317,7
7,0.182624,0.002287,0.018536,0.004034,1.0,100,"{'regressor__alpha': 1, 'regressor__max_iter':...",-27682.082571,-24879.529615,-26077.039201,-23247.372221,-24991.389055,-30075.962288,-31143.913694,-23674.040497,-26579.104359,-23498.141884,-26184.857539,2596.562795,8
5,1.574736,0.070549,0.017481,0.003101,0.1,1000,"{'regressor__alpha': 0.1, 'regressor__max_iter...",-30700.700357,-28820.169042,-27998.716469,-24653.378339,-26704.651362,-32427.571788,-34550.101683,-26446.071449,-29812.679682,-26118.95167,-28823.299184,2929.370562,9
4,0.195714,0.021088,0.018479,0.003351,0.1,100,"{'regressor__alpha': 0.1, 'regressor__max_iter...",-31774.797462,-27550.73998,-28314.892695,-25834.813707,-27405.244536,-32703.618841,-33620.257212,-26110.533272,-28937.560542,-26028.680182,-28828.113843,2734.605194,10


In [71]:
# Pipeline with explicitly chosen Lasso settings (alpha=100, max_iter=1000).
lasso_pipeline_c1 = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Lasso(alpha=100, max_iter=1000)),
])