# Building a regression model for predicting house sale prices

In [None]:
import pickle
import pathlib

import numpy as np
import pandas as pd

In [None]:
# Data lives in a "data" folder under the directory the notebook is run from.
DATA_DIR = pathlib.Path.cwd().joinpath('data')
print(DATA_DIR)

In [None]:
# Pickled modelling dataset stored under data/processed.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_model_data.pkl')

In [None]:
# Load the pickled modelling dataset.
# NOTE: unpickling can execute arbitrary code, so only load files this project produced.
with clean_data_path.open('rb') as file:
    model_data = pickle.load(file)

## Train-test splitting

In [None]:
# Separate the target column (SalePrice) from the predictors; both are copies,
# so later edits never touch model_data.
y = model_data['SalePrice'].copy()
X = model_data.drop(columns='SalePrice').copy()

In [None]:
# Display the feature matrix and the target side by side as a quick sanity check.
X, y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed passed as random_state to the train/test split and the random
# forest so that results are reproducible between runs.
RANDOM_SEED = 42  # The value itself is arbitrary; any integer works.

In [None]:
# Hold out 25% of the rows for the final evaluation; seeded for repeatability.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, random_state=RANDOM_SEED, test_size=0.25
)


In [None]:
# Shapes of the full, training and test feature matrices.
tuple(frame.shape for frame in (X, Xtrain, Xtest))

In [None]:
# Shapes of the full, training and test target vectors.
tuple(target.shape for target in (y, ytrain, ytest))

## First experiment: scaling and OOB (out-of-bag) score

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Standardise only the float64 columns; every other column passes through untouched.
float64_columns = Xtrain.select_dtypes(include='float64').columns

col = ColumnTransformer(
    transformers=[('scale', StandardScaler(), float64_columns)],
    remainder='passthrough',
)

# Learn the scaling statistics from the training split only, then reuse the
# fitted transformer on the test split so no test information leaks in.
Xtrain_scaled = col.fit_transform(Xtrain)
Xtest_scaled = col.transform(Xtest)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Single-candidate grid: GridSearchCV is used here for its 5-fold
# cross-validation and refit machinery rather than to compare alternatives.
param_grid = dict(
    n_estimators=[800],
    max_depth=[15],
    min_samples_split=[2],
    min_samples_leaf=[1],
)

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(oob_score=True, random_state=RANDOM_SEED),
    param_grid=param_grid,
    n_jobs=-1,  # run the CV fits on all available cores
    cv=5,
    verbose=2,
)

grid_search.fit(Xtrain_scaled, ytrain)


In [None]:
# Report the hyper-parameters of the refit estimator (the grid holds a single
# candidate, so this simply echoes it).
print(f'Best parameters: {grid_search.best_params_}')

In [None]:
# Out-of-bag RMSE of the refit random forest.
# Calling predict() on the training rows only measures how well the forest
# memorised them, which is not a validation score. oob_prediction_ scores each
# training row using only the trees whose bootstrap sample left that row out,
# so it is an honest estimate. It is available because the forest is created
# with oob_score=True.
rmse = np.sqrt(
    mean_squared_error(ytrain, grid_search.best_estimator_.oob_prediction_)
)
# NOTE(review): 10**rmse presumes the target is log10(SalePrice) — confirm.
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

In [None]:
# Held-out evaluation: score the refit forest on the untouched test split.
val = grid_search.predict(Xtest_scaled)
rmse = np.sqrt(mean_squared_error(y_true=ytest, y_pred=val))
# NOTE(review): 10**rmse presumes the target is log10(SalePrice) — confirm.
error_percent = (10**rmse - 1) * 100
print(f'Average error is {error_percent:.2f}%')

## Second experiment: skewness

In [None]:
from scipy.stats import skew

# Skewness (scipy.stats.skew) of every numeric column, measured on the
# training split only.
skewness = Xtrain.select_dtypes(include=np.number).apply(skew, axis=0)

skewness

In [None]:
# Keep only the strongly skewed columns (|skewness| > 3).
# NOTE(review): abs() also selects strongly left-skewed columns, while the
# log1p transform applied afterwards is normally meant for right-skewed data —
# confirm this is intended.
skewness = skewness[skewness.abs() > 3]

skew_features = Xtrain.loc[:, skewness.index]

skew_features.columns

In [None]:
def log_tf(feature):
    """Return log(1 + feature), element-wise, via ``np.log1p``."""
    logged = np.log1p(feature)
    return logged

# Build log-transformed copies of both splits; Xtrain / Xtest stay untouched.
# The column list was chosen from the training split and is reused for the test split.
Xtrain_skew = Xtrain.copy()
Xtrain_skew[skew_features.columns] = Xtrain[skew_features.columns].apply(log_tf)

Xtest_skew = Xtest.copy()
Xtest_skew[skew_features.columns] = Xtest[skew_features.columns].apply(log_tf)

In [None]:
# Re-fit the shared ColumnTransformer on the log-transformed training split.
# This overwrites Xtrain_scaled / Xtest_scaled from the first experiment.
# NOTE(review): col still targets the float64 columns selected from the
# untransformed Xtrain; any integer column that log1p turned into floats is
# passed through unscaled — confirm this is acceptable.
Xtrain_scaled = col.fit_transform(Xtrain_skew)
Xtest_scaled = col.transform(Xtest_skew)

In [None]:
# Re-fit the same GridSearchCV object on the skew-corrected features; this
# replaces the fitted results of the first experiment.
grid_search.fit(Xtrain_scaled, ytrain)

In [None]:
# Out-of-bag RMSE of the refit random forest.
# GridSearchCV scores regressors with R^2 by default, but RMSE is available by
# passing scoring='neg_root_mean_squared_error'. Here the forest's own
# out-of-bag predictions are used instead of predict() on the training rows,
# which would only measure how well the forest memorised them. Each row is
# scored by the trees whose bootstrap sample left it out; oob_prediction_ is
# available because the forest is created with oob_score=True.
rmse = np.sqrt(
    mean_squared_error(ytrain, grid_search.best_estimator_.oob_prediction_)
)
# NOTE(review): 10**rmse presumes the target is log10(SalePrice) — confirm.
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

In [None]:
# Held-out evaluation of the skew-corrected model on the untouched test split.
val = grid_search.predict(Xtest_scaled)
rmse = np.sqrt(mean_squared_error(y_true=ytest, y_pred=val))
# NOTE(review): 10**rmse presumes the target is log10(SalePrice) — confirm.
error_percent = (10**rmse - 1) * 100
print(f'Average error is {error_percent:.2f}%')

## Still worse than the baseline