In [18]:
import os
import yaml
import logging
from contextlib import contextmanager, nullcontext

import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from xgboost import XGBRegressor

In [19]:
print("Setting up logger")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

mlflow_tracking_uri = 'http://localhost:5555'

Setting up logger


In [20]:
print("Fetching the data")
data_path = '../../data/processed/featured_house_data.csv'

data = pl.read_csv(data_path)
X = data.drop('price')
y = data.select(pl.col('price'))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Fetching the data


In [21]:
X.shape, X.columns

((84, 14),
 ['sqft',
  'bedrooms',
  'bathrooms',
  'location_Downtown',
  'location_Mountain',
  'location_Rural',
  'location_Suburb',
  'location_Urban',
  'location_Waterfront',
  'year_built',
  'condition',
  'house_age',
  'price_per_sqft',
  'bed_bath_ratio'])

In [22]:
X

sqft,bedrooms,bathrooms,location_Downtown,location_Mountain,location_Rural,location_Suburb,location_Urban,location_Waterfront,year_built,condition,house_age,price_per_sqft,bed_bath_ratio
i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64
1527,2,1.5,0,0,0,1,0,0,1956,1,70,324.165029,1.333333
2526,3,2.5,1,0,0,0,0,0,1998,3,28,297.70388,1.2
1622,2,1.5,0,0,1,0,0,0,1975,2,51,196.670777,1.333333
3102,4,3.0,0,0,0,0,0,1,2005,3,21,390.070922,1.333333
1835,2,2.0,0,0,0,0,1,0,1982,1,44,251.771117,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
2080,3,2.0,0,0,0,0,1,0,1991,1,35,254.807692,1.5
1640,2,1.5,0,0,0,1,0,0,1963,2,63,226.829268,1.333333
2220,3,2.0,1,0,0,0,0,0,1985,1,41,266.666667,1.5
1730,2,1.5,0,0,1,0,0,0,1965,2,61,189.595376,1.333333


In [23]:
if mlflow_tracking_uri:
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    mlflow.set_experiment("House Price Prediction Experiment")

In [24]:
# Define models and hyperparameter grids
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror')
}

model_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 150],
        'max_depth': [None, 10, 20]
    },
    'GradientBoosting': {
        'n_estimators': [100, 250],
        'learning_rate': [0.1, 0.05],
        'max_depth': [3, 10]
    },
    'XGBoost': {
        'n_estimators': [100, 150],
        'learning_rate': [0.1, 0.05],
        'max_depth': [3, 10]
    }
}


In [25]:
print("Doing hyperparameter tuning")

def evaluate_model_with_grid_search(name, model, grid, X_train, X_test, y_train, y_test):
    if grid:
        rf = GridSearchCV(model, grid, cv=3, scoring='r2', n_jobs=-1)
        rf.fit(X_train, y_train)
        best_model = rf.best_estimator_
        best_params = rf.best_params_

    else:
        model.fit(X_train, y_train)
        best_model = model
        best_params = model.get_params()

    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)


    return {
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'r2': r2,
        'model': best_model,
        'params': best_params
    }

print("MLFlow tracking URI:", mlflow_tracking_uri)

Doing hyperparameter tuning
MLFlow tracking URI: http://localhost:5555


In [26]:
results = {}

with mlflow.start_run(run_name ="model_comparison") if mlflow_tracking_uri else nullcontext():
    for name, model in models.items():
        logger.info(f"Training model {name}")
        with mlflow.start_run(run_name = name, nested = True) if mlflow_tracking_uri else nullcontext():
            evaluation = evaluate_model_with_grid_search(name, model, model_grids[name], X_train, X_test, y_train, y_test)
            results[name] = evaluation

            if mlflow_tracking_uri:
                mlflow.log_params(evaluation['params'])
                mlflow.log_metrics({
                    'mae': evaluation['mae'],
                    'mse': evaluation['mse'],
                    'rmse': evaluation['rmse'],
                    'r2': evaluation['r2']
                })
            mlflow.sklearn.log_model(evaluation['model'], artifact_path=name.lower().replace(" ", "_"))

        print(f"{name} R2: {evaluation['r2']:.4f}, RMSE: {evaluation['rmse']:.2f}")

INFO:root:Training model LinearRegression
INFO:root:Training model RandomForest


LinearRegression R2: 0.9550, RMSE: 58235.90


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
INFO:root:Training model GradientBoosting
  y = column_or_1d(y, warn=True)
  y = column_o

RandomForest R2: 0.9656, RMSE: 50934.03


INFO:root:Training model XGBoost


GradientBoosting R2: 0.9353, RMSE: 69838.48




XGBoost R2: 0.9897, RMSE: 27898.65
