# House Prices Advanced Regression Techniques - Random Forest Regressor

- Using Random Forest Regressor;
- Filling all missing values;
- Work with all values;
- **Target**: *SalePrice*

[Kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview)

In [1]:
# Imports
import os 

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MaxAbsScaler, LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [2]:
# Function to measure results
def results_regression(y_test_ ,y_pred_):
    mse = mean_squared_error(y_test_ ,y_pred_)
    print(f"MSE: {mse}")
    
    rmse = np.sqrt(mse)
    print(f"RMSE: {rmse}")
    
    mae = mean_absolute_error(y_test_ ,y_pred_)
    print(f"MAE: {mae}")
    
    mape = mean_absolute_percentage_error(y_test_ ,y_pred_)
    print(f"MAPE: {mape}")
    
    r2 = r2_score(y_test_ ,y_pred_)
    print(f"R2_SCORE {r2}")

In [3]:
# Loading train dataset
data = pd.read_csv("./../data/house-prices-all-columns-filled.csv")
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.315789,0.75,0.207668,0.039258,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.999005,1.0,0.8,0.276159
1,0.105263,0.75,0.255591,0.0446,1.0,1.0,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.416667,0.998507,1.0,0.8,0.240397
2,0.315789,0.75,0.217252,0.052266,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.75,0.999005,1.0,0.8,0.296026
3,0.368421,0.75,0.191693,0.044368,1.0,0.0,1.0,0.0,0.0,0.0,...,0.492754,0.0,0.0,0.0,0.0,0.166667,0.99801,1.0,0.0,0.18543
4,0.315789,0.75,0.268371,0.06625,1.0,0.0,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.999005,1.0,0.8,0.331126


In [4]:
# Validate missing values
data.isna().sum().sum()

0

## Analyze

In [5]:
# Matrix Correlation
mc = data.corr()

In [6]:
# Select features with corr > threshold
threshold = .25
best_features = mc[(mc['SalePrice'] >= threshold) | (mc['SalePrice'] <= threshold * -1)]['SalePrice'].index[:-1]
best_features


Index(['LotFrontage', 'LotArea', 'LotShape', 'OverallQual', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'Foundation', 'BsmtQual',
       'BsmtExposure', 'BsmtFinSF1', 'TotalBsmtSF', 'HeatingQC', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'HalfBath',
       'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF'],
      dtype='object')

## Test model

In [7]:
X = data[best_features]
y = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

## Run model

In [10]:
# Test all params
params_to_test = {
    "n_estimators": range(1, 502, 100),
    "criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    "max_features": ["sqrt", "log2", None],
}

model = GridSearchCV(RandomForestRegressor(), params_to_test, scoring='r2', verbose=10)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display("Best params:", model.best_params_)
display("Best score:", model.best_score_)

results_regression(y_test, y_pred)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5; 1/72] START criterion=squared_error, max_features=sqrt, n_estimators=1.
[CV 1/5; 1/72] END criterion=squared_error, max_features=sqrt, n_estimators=1;, score=0.750 total time=   0.0s
[CV 2/5; 1/72] START criterion=squared_error, max_features=sqrt, n_estimators=1.
[CV 2/5; 1/72] END criterion=squared_error, max_features=sqrt, n_estimators=1;, score=0.494 total time=   0.0s
[CV 3/5; 1/72] START criterion=squared_error, max_features=sqrt, n_estimators=1.
[CV 3/5; 1/72] END criterion=squared_error, max_features=sqrt, n_estimators=1;, score=0.640 total time=   0.0s
[CV 4/5; 1/72] START criterion=squared_error, max_features=sqrt, n_estimators=1.
[CV 4/5; 1/72] END criterion=squared_error, max_features=sqrt, n_estimators=1;, score=0.370 total time=   0.0s
[CV 5/5; 1/72] START criterion=squared_error, max_features=sqrt, n_estimators=1.
[CV 5/5; 1/72] END criterion=squared_error, max_features=sqrt, n_estimators=1;, score=0.5

'Best params:'

{'criterion': 'friedman_mse', 'max_features': 'sqrt', 'n_estimators': 101}

'Best score:'

0.8277264198422097

MSE: 0.0012751250140608087
RMSE: 0.0357088926468017
MAE: 0.02219098973673109
MAPE: 0.10230296223530118
R2_SCORE 0.8958378661793486
