# House Prices Advanced Regression Techniques - XGBoost

- Using XGBoost;
- Filling all missing values;
- Work with all values;
- **Target**: *SalePrice*

[Kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview)

In [22]:
# Imports
import os 
from datetime import datetime
import pickle

import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, \
    mean_absolute_percentage_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Configs

# Set Seaborn Style
sns.set_style("darkgrid")

# Define a function to measure results
def results_regression(y_test_, y_pred_, _print=False):
    mse = mean_squared_error(y_test_, y_pred_)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_, y_pred_)
    mape = mean_absolute_percentage_error(y_test_, y_pred_)
    r2 = r2_score(y_test_, y_pred_)

    if _print:
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"MAE: {mae}")
        print(f"MAPE: {mape}")
        print(f"R2_SCORE {r2}")
    else:
        return mse, rmse, mae, mape, r2


In [24]:
data = pd.read_csv("./../data/house-prices-all-columns-filled.csv")
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.315789,0.75,0.207668,0.039258,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.999005,1.0,0.8,0.276159
1,0.105263,0.75,0.255591,0.0446,1.0,1.0,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.416667,0.998507,1.0,0.8,0.240397
2,0.315789,0.75,0.217252,0.052266,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.75,0.999005,1.0,0.8,0.296026
3,0.368421,0.75,0.191693,0.044368,1.0,0.0,1.0,0.0,0.0,0.0,...,0.492754,0.0,0.0,0.0,0.0,0.166667,0.99801,1.0,0.0,0.18543
4,0.315789,0.75,0.268371,0.06625,1.0,0.0,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.999005,1.0,0.8,0.331126


In [25]:
X = data.drop(columns=["SalePrice"], axis=1)
y = data["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [26]:
params = {
    "seed": [42],
    "n_estimators": range(100, 500, 100),
    "learning_rate": [0.3, 0.5],
    "max_depth": [4],
    "min_split_loss": [0],
    "lambda": [1],
    "reg_alpha": [8],
    "objective": [
        "reg:squarederror",
        "reg:squaredlogerror",
        "reg:squarederror_lad"
    ],
    "eval_metric": ["rmse", "rmsle", "mae", "mape", "mphe", "logloss", "error"],
}


model = GridSearchCV(XGBRegressor(), params, scoring='r2', verbose=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display(model.best_estimator_, model.best_params_, model.best_score_)

results_regression(y_test, y_pred, True)

Fitting 5 folds for each of 168 candidates, totalling 840 fits
[CV 1/5; 1/168] START eval_metric=rmse, lambda=1, learning_rate=0.3, max_depth=4, min_split_loss=0, n_estimators=100, objective=reg:squarederror, reg_alpha=8, seed=42
[CV 1/5; 1/168] END eval_metric=rmse, lambda=1, learning_rate=0.3, max_depth=4, min_split_loss=0, n_estimators=100, objective=reg:squarederror, reg_alpha=8, seed=42;, score=0.747 total time=   7.3s
[CV 2/5; 1/168] START eval_metric=rmse, lambda=1, learning_rate=0.3, max_depth=4, min_split_loss=0, n_estimators=100, objective=reg:squarederror, reg_alpha=8, seed=42
[CV 2/5; 1/168] END eval_metric=rmse, lambda=1, learning_rate=0.3, max_depth=4, min_split_loss=0, n_estimators=100, objective=reg:squarederror, reg_alpha=8, seed=42;, score=0.667 total time=   2.7s
[CV 3/5; 1/168] START eval_metric=rmse, lambda=1, learning_rate=0.3, max_depth=4, min_split_loss=0, n_estimators=100, objective=reg:squarederror, reg_alpha=8, seed=42
[CV 3/5; 1/168] END eval_metric=rmse, la

280 fits failed out of a total of 840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/xgboost/sklearn.py", line 1025, in fit
    self._Booster = train(
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/h

{'eval_metric': 'rmse',
 'lambda': 1,
 'learning_rate': 0.3,
 'max_depth': 4,
 'min_split_loss': 0,
 'n_estimators': 100,
 'objective': 'reg:squarederror',
 'reg_alpha': 8,
 'seed': 42}

0.7466150781810267

MSE: 0.0026778128222004417
RMSE: 0.05174758759788172
MAE: 0.033495113431158664
MAPE: 0.16883551434677857
R2_SCORE 0.7812554106797587
