In [1]:
import platform
platform.architecture()

('64bit', 'Mach-O')

In [2]:
import os, sys, glob
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import numpy as np
import json
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
# En un notebook, __file__ no está definido. Usamos os.getcwd() en su lugar.
# os.getcwd() devuelve el directorio de trabajo actual.
current_working_dir = os.getcwd()
print(f"Directorio de trabajo actual: {current_working_dir}")

if os.path.basename(current_working_dir) == 'notebooks':
    project_root = os.path.dirname(current_working_dir)

print(f"Añadiendo al path: {project_root}") 
sys.path.append(project_root)

Directorio de trabajo actual: /Users/ricardoalvarez/CodeProyectos/demos/streamlit_demos/ml_development/notebooks
Añadiendo al path: /Users/ricardoalvarez/CodeProyectos/demos/streamlit_demos/ml_development


In [4]:
import utils

In [5]:
df = pd.read_csv("../data/panama_real_estate_ficticio.csv")
df.head(3)

Unnamed: 0,location,building,bedrooms,bathrooms,parking_spaces,size_m2,has_pool,has_photos,price_usd
0,Isla Perico,360 Rooftop Casco,2,2,2,163,1,1,448903
1,Obarrio,Villa Magna,5,5,2,563,0,0,1745616
2,Calle 50,Trump,2,2,1,176,0,1,477119


In [6]:
num_features = ['size_m2', 'bedrooms', 'bathrooms', 'parking_spaces']
cat_features = ['has_photos', 'has_pool', 'location', 'building']
target = 'price_usd'


In [7]:
X = df[cat_features+num_features]
y = df[target]

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [8]:
# 1. Función personalizada para convertir columnas a minúsculas
def to_lowercase(dataframe):
    if not isinstance(dataframe, np.ndarray):
        return dataframe  # No hacer nada si no es un array

    # Crear un nuevo array para los resultados para no modificar el original
    result = np.empty_like(dataframe, dtype=object)
    for i, val in np.ndenumerate(dataframe):
        if isinstance(val, str):
            result[i] = val.lower()
        else:
            result[i] = val  # Mantener otros tipos (números, None, etc.)
    return result
    #return dataframe.apply(lambda x: x.str.lower() if x.dtype == "object" else x)

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    #('lowercase', FunctionTransformer(to_lowercase, validate=False, feature_names_out='one-to-one')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # <-- add sparse_output=False
])

In [9]:
# Definir pasos de preprocesamiento
# Codificación One-Hot para la columna 'zone'
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, cat_features),
        ('num', StandardScaler(), num_features)
    ],
    remainder='drop' # Mantiene las columnas no transformadas
)

In [10]:
rf = RandomForestRegressor(
    n_estimators=1000,
    max_features = 0.8,
    max_depth = 3,
    min_samples_leaf=2, 
    min_samples_split=5)

hgb = HistGradientBoostingRegressor(
    max_iter=1000,
    max_depth=3,
    learning_rate=0.1,
    min_samples_leaf=2,
    max_leaf_nodes=31,  # opcional, similar a controlar la complejidad
    random_state=42
)


# Crear un pipeline que primero preprocesa y luego entrena el modelo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', hgb)
])

In [11]:
# --- K-Fold Cross-Validation para calcular el RMSE promedio ---
kf = KFold(n_splits=10, shuffle=True, random_state=42)
rmse_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_pipeline.fit(X_train, y_train)
    preds = model_pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    rmse_scores.append(rmse)
    print(f"RMSE : {rmse}")
    print('---')
    average_rmse = np.mean(rmse_scores)
print(f"RMSE Promedio de 5-Fold CV: {average_rmse}")


RMSE : 108720.64428371312
---
RMSE : 110248.53735982461
---
RMSE : 94131.69694771695
---
RMSE : 118159.09462222255
---
RMSE : 147483.11922763847
---
RMSE : 129645.40353552594
---
RMSE : 125312.53576373843
---
RMSE : 171629.5319230309
---
RMSE : 105614.13078418837
---
RMSE : 131514.73854350424
---
RMSE Promedio de 5-Fold CV: 124245.94329911035


In [12]:
y_predict = model_pipeline.predict(X_test)

r2_score(y_test,y_predict), mean_squared_error(y_test,y_predict), mean_absolute_error(y_test,y_predict),

(0.9550589230998143, 17296126454.166283, 83302.86981380204)

In [13]:
model_pipeline.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,1000
,max_leaf_nodes,31
,max_depth,3
,min_samples_leaf,2
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [14]:
# --- Extracción de la configuración ---
# Obtenemos listas de valores únicos del DataFrame de entrenamiento
unique_locations = sorted(X['location'].unique().tolist())
unique_buildings = sorted(X['building'].unique().tolist())

In [15]:
# Creamos el diccionario de configuración
model_config = {
    'average_rmse': average_rmse,
    'locations': unique_locations,
    'buildings': unique_buildings
}

In [16]:
# Guardar el pipeline entrenado (opcional, para despliegue real)
joblib.dump(model_pipeline, 'real_estate_model_pipeline_v2.pkl')

['real_estate_model_pipeline_v2.pkl']

In [17]:
# 2. Guardar el archivo de configuración JSON
with open('model_config_v2.json', 'w') as f:
    json.dump(model_config, f, indent=4)
print("✅ Configuración del modelo guardada en 'model_config.json'")

✅ Configuración del modelo guardada en 'model_config.json'
