# **Housing Prices**  

@sofiagerard  
febrero 2025



**Factorizing notebook in order to transform it into a pipeline/script and
keeping just the necessary, aka removing xgboost**

## Libraries

In [4]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import logging



2025-02-15 16:49:49,913 - INFO - NumExpr defaulting to 8 threads.


In [5]:
import logging

# Configurar logging para que muestre mensajes en el notebook
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Probarlo
logging.info("Este es un mensaje informativo en el notebook.")
logging.warning("¡Cuidado! Este es un mensaje de advertencia.")
logging.error("Esto es un error.")


2025-02-15 16:49:51,647 - INFO - Este es un mensaje informativo en el notebook.
2025-02-15 16:49:51,648 - ERROR - Esto es un error.


## Data

In [7]:
def load_data(train_path="../data/raw/train.csv", test_path="../data/raw/test.csv"):
    """
    Carga los datasets de entrenamiento y prueba desde archivos CSV.

    Args:
    - train_path (str): Ruta al archivo CSV de entrenamiento.
    - test_path (str): Ruta al archivo CSV de prueba.

    Returns:
    - tuple: (df_train, df_test) si tiene éxito, o None si falla.
    """
    try:
        # Verificar si los archivos existen
        if not os.path.exists(train_path):
            raise FileNotFoundError(f"❌ No se encontró el archivo: {train_path}")
        if not os.path.exists(test_path):
            raise FileNotFoundError(f"❌ No se encontró el archivo: {test_path}")

        # Cargar datos
        df_train = pd.read_csv(train_path)
        df_test = pd.read_csv(test_path)

        # Verificar si están vacíos
        if df_train.empty:
            raise ValueError("⚠️ El DataFrame de entrenamiento está vacío.")
        if df_test.empty:
            raise ValueError("⚠️ El DataFrame de prueba está vacío.")

        print("✅ Datos cargados correctamente.")
        return df_train, df_test

    except Exception as e:
        logging.error(f"Error al cargar los datos: {e}")
        print(f"❌ Error al cargar los datos: {e}")
        return None, None




---

## Preprocessing

In [8]:

def fill_categorical_na(df_train, df_test, categorical_cols):
    """Rellena nulos en columnas categóricas con 'None'."""
    try:
        for col in categorical_cols:
            df_train[col] = df_train[col].fillna("None")
            df_test[col] = df_test[col].fillna("None")
        logging.info(f"✅ Nulos categóricos rellenados para columnas: {categorical_cols}")
    except Exception as e:
        logging.error(f"❌ Error al rellenar nulos categóricos: {e}")
    return df_train, df_test


def fill_numerical_na(df_train, df_test, numerical_cols):
    """Rellena nulos en columnas numéricas con la mediana."""
    try:
        for col in numerical_cols:
            median_value = df_train[col].median()
            df_train[col] = df_train[col].fillna(median_value)
            df_test[col] = df_test[col].fillna(median_value)
        logging.info(f"✅ Nulos numéricos rellenados para columnas: {numerical_cols}")
    except Exception as e:
        logging.error(f"❌ Error al rellenar nulos numéricos: {e}")
    return df_train, df_test


def fill_mode_na(df_train, df_test, column):
    """Rellena nulos en una columna específica con el valor más frecuente (moda)."""
    try:
        mode_value = df_train[column].mode()[0]
        df_train[column] = df_train[column].fillna(mode_value)
        df_test[column] = df_test[column].fillna(mode_value)
        logging.info(f"✅ Nulos rellenados con la moda en columna: '{column}'")
    except Exception as e:
        logging.error(f"❌ Error al rellenar nulos con moda: {e}")
    return df_train, df_test


# 🛠️ **Uso de las funciones**

# Listas de columnas con nulos
categorical_na = [
    "Alley", "MasVnrType", "BsmtQual", "BsmtCond", "BsmtExposure", 
    "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", 
    "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"
]

numerical_na = ["LotFrontage", "MasVnrArea", "GarageYrBlt"]
mode_column = "Electrical"

# Cargar datos
df_train = pd.read_csv("../data/raw/train.csv")
df_test = pd.read_csv("../data/raw/test.csv")

# Aplicar funciones
df_train, df_test = fill_categorical_na(df_train, df_test, categorical_na)
df_train, df_test = fill_numerical_na(df_train, df_test, numerical_na)
df_train, df_test = fill_mode_na(df_train, df_test, mode_column)

# Revisar resultados
print(df_train.head())
print(df_test.head())



2025-02-15 16:50:05,489 - INFO - ✅ Nulos categóricos rellenados para columnas: ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
2025-02-15 16:50:05,491 - INFO - ✅ Nulos numéricos rellenados para columnas: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
2025-02-15 16:50:05,493 - INFO - ✅ Nulos rellenados con la moda en columna: 'Electrical'


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave  None      Reg   
1   2          20       RL         80.0     9600   Pave  None      Reg   
2   3          60       RL         68.0    11250   Pave  None      IR1   
3   4          70       RL         60.0     9550   Pave  None      IR1   
4   5          60       RL         84.0    14260   Pave  None      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0   None  None        None       0      2   
1         Lvl    AllPub  ...        0   None  None        None       0      5   
2         Lvl    AllPub  ...        0   None  None        None       0      9   
3         Lvl    AllPub  ...        0   None  None        None       0      2   
4         Lvl    AllPub  ...        0   None  None        None       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

---

## Feature Engineering

In [None]:
def create_engineered_features(df_train, df_test):
    """
    Crea nuevas características relevantes para el análisis de precios de viviendas.
    
    Características creadas:
    - HouseAge: Años desde la construcción o última remodelación.
    - TotalSF: Superficie total habitable (sobre nivel del suelo + sótano).
    - TotalBath: Suma de baños completos y medios baños (0.5 cada medio baño).
    """
    try:
        # HouseAge: Años desde construcción o remodelación
        df_train["HouseAge"] = df_train["YrSold"] - df_train[["YearBuilt", "YearRemodAdd"]].max(axis=1)
        df_test["HouseAge"] = df_test["YrSold"] - df_test[["YearBuilt", "YearRemodAdd"]].max(axis=1)

        # TotalSF: Superficie total habitable
        df_train["TotalSF"] = df_train["GrLivArea"] + df_train["TotalBsmtSF"]
        df_test["TotalSF"] = df_test["GrLivArea"] + df_test["TotalBsmtSF"]

        # TotalBath: Suma de baños completos y medios baños (medio baño = 0.5)
        df_train["TotalBath"] = (
            df_train["FullBath"] + (df_train["HalfBath"] * 0.5) +
            df_train["BsmtFullBath"] + (df_train["BsmtHalfBath"] * 0.5)
        )
        df_test["TotalBath"] = (
            df_test["FullBath"] + (df_test["HalfBath"] * 0.5) +
            df_test["BsmtFullBath"] + (df_test["BsmtHalfBath"] * 0.5)
        )

        logging.info("✅ Ingeniería de características completada exitosamente.")
    except Exception as e:
        logging.error(f"❌ Error durante la ingeniería de características: {e}")

    return df_train, df_test


# 🛠️ **Uso de la función**

# Aplicar ingeniería de características
df_train, df_test = create_engineered_features(df_train, df_test)

# Revisar resultados
print(df_train[["HouseAge", "TotalSF", "TotalBath"]].head())
print(df_test[["HouseAge", "TotalSF", "TotalBath"]].head())


2025-02-15 16:53:11,830 - INFO - ✅ Ingeniería de características completada exitosamente.


   HouseAge  TotalSF  TotalBath
0         5     2566        3.5
1        31     2524        2.5
2         6     2706        3.5
3        36     2473        2.0
4         8     3343        3.5
   HouseAge  TotalSF  TotalBath
0        49   1778.0        1.0
1        52   2658.0        1.5
2        12   2557.0        2.5
3        12   2530.0        2.5
4        18   2560.0        2.0


---

## Models

In [11]:

def select_features(df_train, df_test):
    """Selecciona y devuelve las variables relevantes."""
    numerical_features = [
        "OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars", "GarageArea",
        "1stFlrSF", "FullBath", "TotRmsAbvGrd", "YearBuilt", "YearRemodAdd",
        "HouseAge", "TotalSF", "TotalBath"
    ]

    categorical_features = [
        col for col in df_train.columns if col.startswith(("MSZoning_", "Neighborhood_", 
                                                           "BldgType_", "HouseStyle_", "SaleCondition_"))
    ]

    selected_features = numerical_features + categorical_features

    X_train = df_train[selected_features]
    y_train = df_train["SalePrice"]
    X_test = df_test[selected_features]

    print(f"Número de variables finales: {len(selected_features)}")
    print("Primeras filas de X_train:")
    print(X_train.head())

    return X_train, y_train, X_test, numerical_features, categorical_features


def prepare_pipeline(numerical_features, categorical_features):
    """Prepara y devuelve el pipeline de preprocesamiento."""
    num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_pipeline = Pipeline([
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", num_pipeline, numerical_features),
        ("cat", cat_pipeline, categorical_features)
    ])

    return preprocessor


def split_data(X_train, y_train, test_size=0.2, random_state=42):
    """Divide los datos en entrenamiento y validación."""
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=test_size, random_state=random_state)
    print(f"X_train_sub: {X_train_sub.shape}, X_val: {X_val.shape}")
    return X_train_sub, X_val, y_train_sub, y_val


# Ejecución
# df_train y df_test deben estar previamente cargados
X_train, y_train, X_test, numerical_features, categorical_features = select_features(df_train, df_test)
preprocessor = prepare_pipeline(numerical_features, categorical_features)
X_train_sub, X_val, y_train_sub, y_val = split_data(X_train, y_train)


Número de variables finales: 13
Primeras filas de X_train:
   OverallQual  GrLivArea  TotalBsmtSF  GarageCars  GarageArea  1stFlrSF  \
0            7       1710          856           2         548       856   
1            6       1262         1262           2         460      1262   
2            7       1786          920           2         608       920   
3            7       1717          756           3         642       961   
4            8       2198         1145           3         836      1145   

   FullBath  TotRmsAbvGrd  YearBuilt  YearRemodAdd  HouseAge  TotalSF  \
0         2             8       2003          2003         5     2566   
1         2             6       1976          1976        31     2524   
2         2             6       2001          2002         6     2706   
3         1             7       1915          1970        36     2473   
4         2             9       2000          2000         8     3343   

   TotalBath  
0        3.5  
1        2.5  


## Linear Regression

In [14]:
def train_linear_regression_pipeline(preprocessor, X_train, y_train):
    """Entrena un pipeline con regresión lineal."""
    try:
        lr_pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("model", LinearRegression())
        ])
        lr_pipeline.fit(X_train, y_train)
        logging.info("✅ Modelo entrenado correctamente")
        return lr_pipeline
    except Exception as e:
        logging.error(f"❌ Error al entrenar el modelo: {e}")
        return None

def evaluate_model(model, X_val, y_val):
    """Evalúa el modelo con métricas estándar."""
    try:
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        r2 = r2_score(y_val, y_pred)
        logging.info("📊 Evaluación del modelo:")
        logging.info(f"MAE: {mae:.2f}")
        logging.info(f"RMSE: {rmse:.2f}")
        logging.info(f"R²: {r2:.4f}")
        return mae, rmse, r2
    except Exception as e:
        logging.error(f"❌ Error al evaluar el modelo: {e}")
        return None, None, None

def make_predictions(model, X_test):
    """Genera predicciones utilizando un modelo entrenado."""
    try:
        predictions = model.predict(X_test)
        logging.info("✅ Predicciones realizadas correctamente.")
        return predictions
    except Exception as e:
        logging.error(f"❌ Error al hacer predicciones: {e}")
        return None


# 📌 Ejecutar el flujo completo en el notebook
model = train_linear_regression_pipeline(preprocessor, X_train_sub, y_train_sub)

if model:
    evaluate_model(model, X_val, y_val)
    predictions = make_predictions(model, X_test)
    if predictions is not None:
        print("🔍 Primeras predicciones:", predictions[:5])


2025-02-15 17:10:28,797 - INFO - ✅ Modelo entrenado correctamente
2025-02-15 17:10:28,801 - INFO - 📊 Evaluación del modelo:
2025-02-15 17:10:28,801 - INFO - MAE: 24623.25
2025-02-15 17:10:28,801 - INFO - RMSE: 39445.56
2025-02-15 17:10:28,801 - INFO - R²: 0.7971
2025-02-15 17:10:28,806 - INFO - ✅ Predicciones realizadas correctamente.


🔍 Primeras predicciones: [101326.67622267 153570.18494751 164859.23143874 185151.55439356
 212049.05004195]
