In [381]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

data = {'col1': ['cat', 'dog', 'cat', 'bird', 'dog']}
df = pd.DataFrame(data)

In [382]:
encoder = OrdinalEncoder()
df['col1_encoded'] = encoder.fit_transform(df[['col1']])

In [383]:
df2 = pd.DataFrame(data)
df2['col1'] = encoder.fit_transform(df[['col1']])

# Improving my model with Imputation and Categorical Variable

In [426]:
# Importamos librerías necesarias
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import pandas as pd


# Cargamos y leemos el archivo de datos
file = "/Volumes/Memory/kaggle/machine_learning/melb_data.csv"
data = pd.read_csv(file)

# Seleccionamos la variable objetivo y los predictores
y = data.Price
X = data.drop(['Price'], axis = 1)

# Selecionamos las columnas con baja cardinalidad
low_cardinality_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and X[cname].dtype == 'object']

# Selecionamos las columnas numericas
num_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

# Mantenemos solo las columnas con baja cardinalidad y las columnas numéricas
my_cols = low_cardinality_cols + num_cols
X = X[my_cols]

# Dividimos los datos en conjuntos de entrenamiento y validación
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 0)

# Dividimos las columnas categóricas y numéricas en conjuntos de entrenamiento y validación
X_train_cat = X_train[low_cardinality_cols]
X_valid_cat = X_valid[low_cardinality_cols]
X_train_num = X_train[num_cols]
X_valid_num = X_valid[num_cols]

# Imputamos los valores faltantes de nuestras columnas numéricas
num_imputer = SimpleImputer(strategy = 'mean')
imputed_X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train_num), columns = num_cols)
imputed_X_valid_num = pd.DataFrame(num_imputer.transform(X_valid_num), columns = num_cols)

# Imputamos y codificamos las columnas categóricas
cat_imputer = SimpleImputer(strategy = 'most_frequent')
imputed_X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train_cat), columns = low_cardinality_cols)
imputed_X_valid_cat = pd.DataFrame(cat_imputer.transform(X_valid_cat), columns = low_cardinality_cols)

# Aplicamos el OrdinalEnconder a las columnas categóricas
ordinal_encoder = OrdinalEncoder()
encoded_X_train_cat = pd.DataFrame(ordinal_encoder.fit_transform(imputed_X_train_cat), columns = low_cardinality_cols)
encoded_X_valid_cat = pd.DataFrame(ordinal_encoder.transform(imputed_X_valid_cat), columns = low_cardinality_cols)

# Unimos las columnas numéricas y categóricas para formar los conjuntos finales de entrenamiento y validación
X_train_full = pd.concat([imputed_X_train_num, encoded_X_train_cat], axis = 1)
X_valid_full = pd.concat([imputed_X_valid_num, encoded_X_valid_cat], axis = 1)

# Definimos y entrenamos el modelo
model = RandomForestRegressor(n_estimators = 100, random_state = 0)
model.fit(X_train_full, y_train)

# Realizamos predicciones en el conjunto de validación
preds = model.predict(X_valid_full)

# Calculamos el error absoluto medio (MAE)
mae = mean_absolute_error(y_valid, preds)
mae = f'{mae:,.2f}'
print(mae)

162,460.28


In [429]:
# Importamos librerías necesarias
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Cargamos y leemos el archivo de datos
file = "/Volumes/Memory/kaggle/machine_learning/melb_data.csv"
data = pd.read_csv(file)

# Seleccionamos la variable objetivo y los predictores
y = data.Price
X = data.drop(['Price'], axis=1)

# Seleccionamos las columnas categóricas con baja cardinalidad
low_cardinality_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and X[cname].dtype == 'object']

# Seleccionamos las columnas numéricas
num_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

# Mantener solo las columnas con baja cardinalidad y las columnas numéricas
my_cols = low_cardinality_cols + num_cols
X = X[my_cols]

# Dividimos los datos en conjuntos de entrenamiento y validación
X_train_full, X_valid_full, y_train_full, y_valid_full = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Dividimos las columnas categóricas y numéricas en conjuntos de entrenamiento y validación
X_train_cat = X_train_full[low_cardinality_cols]
X_valid_cat = X_valid_full[low_cardinality_cols]
X_train_num = X_train_full[num_cols]
X_valid_num = X_valid_full[num_cols]

# Imputamos los valores faltantes de nuestras columnas numéricas
num_imputer = SimpleImputer(strategy='mean')
imputed_X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train_num), columns=num_cols)
imputed_X_valid_num = pd.DataFrame(num_imputer.transform(X_valid_num), columns=num_cols)

# Imputamos y codificamos las columnas categóricas
cat_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train_cat), columns=low_cardinality_cols)
imputed_X_valid_cat = pd.DataFrame(cat_imputer.transform(X_valid_cat), columns=low_cardinality_cols)

# Aplicamos el OrdinalEncoder a las columnas categóricas
ordinal_encoder = OrdinalEncoder()
encoded_X_train_cat = pd.DataFrame(ordinal_encoder.fit_transform(imputed_X_train_cat), columns=low_cardinality_cols)
encoded_X_valid_cat = pd.DataFrame(ordinal_encoder.transform(imputed_X_valid_cat), columns=low_cardinality_cols)

# Unimos las columnas numéricas y categóricas para formar los conjuntos finales de entrenamiento y validación
imputed_X_train = pd.concat([imputed_X_train_num, encoded_X_train_cat], axis=1)
imputed_X_valid = pd.concat([imputed_X_valid_num, encoded_X_valid_cat], axis=1)

# Definimos el modelo
model = RandomForestRegressor(random_state=0)

# Ajuste de hiperparámetros usando GridSearchCV
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_features': ['sqrt', 'log2', 0.2, 0.5, 0.8],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(imputed_X_train, y_train_full)

# Mejor modelo encontrado por GridSearchCV
best_model = grid_search.best_estimator_

# Realizamos predicciones en el conjunto de validación con el mejor modelo
preds = best_model.predict(imputed_X_valid)

# Calculamos el error absoluto medio
mae = mean_absolute_error(y_valid_full, preds)
print(f"Mean Absolute Error: {mae:,.2f}")

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5,