# House Prices — Starter Notebook

Objetivo: construir un baseline de regresión, evaluarlo con RMSE y luego mejorarlo.

In [None]:
# 1) Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor


In [None]:
# 2) Load the example dataset: housing.csv from the handson-ml2 repository.
#    Swap this for the Ames data from Kaggle if you prefer.
url = (
    "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
    "datasets/housing/housing.csv"
)
df = pd.read_csv(filepath_or_buffer=url)
# Show the first rows as the cell output for a quick sanity check.
df.head()

In [None]:
# 3) Quick setup: target/feature split, per-dtype preprocessing, and the model.

# The target is the 'median_house_value' column; every other column is a feature.
X = df.drop(columns=['median_house_value'])
y = df['median_house_value']

# Numeric columns are detected by dtype; anything else is treated as categorical.
num_cols = list(X.select_dtypes(include='number').columns)
cat_cols = [col for col in X.columns if col not in num_cols]

# Numeric features: fill missing values with the column median.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps predict() from raising
# on categories that were not present when the encoder was fitted.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oh', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline; columns listed in neither
# group are dropped.
pre = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ],
    remainder='drop',
)

# Full estimator: preprocessing followed by a random forest with a fixed seed.
model = Pipeline(steps=[
    ('pre', pre),
    ('rf', RandomForestRegressor(n_estimators=200, random_state=42)),
])


In [None]:
# 4) Simple validation: a single 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the whole pipeline (preprocessing + model) on the training split only,
# so the imputers and the encoder never see the test rows.
model.fit(X_train, y_train)
pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))
rmse

## Próximos pasos
- Cambiar a **Ames Housing** oficial de Kaggle
- Comparar con `LinearRegression` y XGBoost
- Tuning con GridSearchCV