# Building a regression model for predicting house sale prices

In [1]:
import pickle
import pathlib

import numpy as np
import pandas as pd

In [2]:
# Data folder, resolved against the kernel's current working directory.
DATA_DIR = pathlib.Path.cwd().joinpath('data')
print(DATA_DIR)

c:\Insper\6SEM\ml\HousePriceRegressor\data


In [3]:
# Location of the pickled model dataset inside the 'processed' subfolder.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_model_data.pkl')

In [4]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file.
# This is presumably safe because the pickle is produced by this project's
# own processing step — confirm before pointing it at files from elsewhere.
with clean_data_path.open('rb') as pickle_file:
    model_data = pickle.load(pickle_file)

In [5]:
# Summarise the loaded frame: index range, column count, dtypes, memory use.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Columns: 165 entries, Lot.Frontage to Exterior_Other
dtypes: bool(119), float64(34), int64(12)
memory usage: 1.4 MB


### Justificativa para manter grande parte da feature engineering anterior e opções para explorar

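**TODO:** escrever a justificativa para manter grande parte da feature engineering anterior e listar as opções a explorar.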

## Train-test splitting

In [6]:
# Separate the target column from the predictors. Both are copied so that
# later edits to X or y cannot reach back into model_data.
y = model_data.loc[:, 'SalePrice'].copy()
X = model_data.drop('SalePrice', axis=1).copy()

In [7]:
# Quick look at predictors and target. Because the cell value is a tuple,
# both are shown with their plain-text repr rather than the rich table.
X, y

(      Lot.Frontage  Lot.Area  Lot.Shape  Land.Slope  Overall.Qual  \
 0            141.0   31770.0          1           0             5   
 1             80.0   11622.0          0           0             4   
 2             81.0   14267.0          1           0             5   
 3             93.0   11160.0          0           0             6   
 4             74.0   13830.0          1           0             4   
 ...            ...       ...        ...         ...           ...   
 2925          37.0    7937.0          1           0             5   
 2926          68.0    8885.0          1           1             4   
 2927          62.0   10441.0          0           0             4   
 2928          77.0   10010.0          0           1             4   
 2929          74.0    9627.0          0           1             6   
 
       Overall.Cond  Mas.Vnr.Area  Exter.Qual  Exter.Cond  BsmtFin.SF.1  ...  \
 0                4         112.0           2           2         639.0  ...  

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Seed handed to train_test_split as random_state.
RANDOM_SEED = 42  # Any number here, really.

In [None]:
# Hold out 25% of the rows as the test split; random_state pins the split.
split_parts = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)
Xtrain, Xtest, ytrain, ytest = split_parts


In [None]:
# Sanity check: shapes of the full predictor frame and of its two splits.
X.shape, Xtrain.shape, Xtest.shape

In [None]:
# Sanity check: shapes of the full target and of its two splits.
y.shape, ytrain.shape, ytest.shape

## Primeiro experimento: modelo linear com scaling e cross-validation

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


# Standardise only the float64 columns; every other column is passed
# through unchanged.
# NOTE(review): the scaler is fitted on all of Xtrain before GridSearchCV
# splits it into folds, so each validation fold has already contributed to
# the scaling statistics. Putting scaler + model in a Pipeline would avoid it.
float64_columns = Xtrain.select_dtypes(include='float64').columns

col = ColumnTransformer(
    transformers=[('scale', StandardScaler(), float64_columns)],
    remainder='passthrough',
)

# Fit on the training split only, then reuse the fitted transformer on test.
Xtrain_scaled = col.fit_transform(Xtrain)
Xtest_scaled = col.transform(Xtest)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares, with 5-fold CV choosing whether to fit
# an intercept. Scores are negated MSE, so values closer to 0 are better.
lr = LinearRegression()
grid = {'fit_intercept': [True, False]}

grid_search = GridSearchCV(
    estimator=lr,
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

In [None]:
# Run the cross-validated search on the scaled training split.
grid_search.fit(Xtrain_scaled, ytrain)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

# best_score_ is a negated MSE (scoring='neg_mean_squared_error'), so the
# sign is flipped before taking the root.
rmse = np.sqrt(-grid_search.best_score_)
# NOTE(review): 10**rmse - 1 treats the RMSE as a multiplicative error, which
# presumably assumes SalePrice is stored as log10(price) — confirm upstream.
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

## Segundo experimento: Modelo Lasso e scaling

In [None]:
from sklearn.linear_model import Lasso

# Lasso search: 200 log-spaced alphas between 1e-8 and 1e-3, 5-fold CV.
# Rebinds `grid` and `grid_search` from the previous experiment.
lasso = Lasso()
grid = {'alpha': np.logspace(-8, -3, num=200)}

grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

In [None]:
# Fit the current grid search on the scaled training split and report the
# winning hyper-parameters with their mean cross-validated score.
grid_search.fit(Xtrain_scaled, ytrain)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

In [None]:
# Convert the negated-MSE CV score into an RMSE, then into a percentage.
rmse = np.sqrt(-grid_search.best_score_)
# NOTE(review): the 10**rmse form presumably assumes a log10 target — confirm.
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Percebe-se que o modelo Lasso tem erro menor que o modelo linear.

## Terceiro experimento: Lasso + fazer transformação de log para features com skewness (assimetria)

A transformação logarítmica é conhecida por aproximar da normal a distribuição de features assimétricas, em especial as de assimetria positiva (cauda longa à direita).

In [None]:
# Compute the skewness of every numeric column of the training split.
# (Columns are only filtered in the next cell, which keeps |skewness| > 3.)
from scipy.stats import skew

skewness = Xtrain.select_dtypes(np.number).apply(skew)

skewness

In [None]:
# Keep only the strongly skewed columns (|skewness| > 3, either direction).
# NOTE(review): this also selects left-skewed columns, for which a log
# transform does not reduce the asymmetry — confirm that is intended.
skewness = skewness.loc[skewness.abs() > 3]

skew_features = Xtrain.loc[:, skewness.index]

skew_features.columns

In [None]:
def log_tf(feature):
    """Return log(1 + feature), computed element-wise with ``np.log1p``."""
    transformed = np.log1p(feature)
    return transformed

# Work on copies so the untransformed splits stay available.
Xtrain_skew, Xtest_skew = Xtrain.copy(), Xtest.copy()

# Apply the same log1p transform to the selected columns of both splits.
skew_columns = skew_features.columns
for split_frame in (Xtrain_skew, Xtest_skew):
    split_frame[skew_columns] = split_frame[skew_columns].apply(log_tf)


In [None]:
# Refit the same ColumnTransformer on the log-transformed training split.
# This rebinds Xtrain_scaled / Xtest_scaled, so every cell below works with
# the log-transformed features.
Xtrain_scaled = col.fit_transform(Xtrain_skew)
Xtest_scaled = col.transform(Xtest_skew)

In [None]:
# Re-run the search on the log-transformed features. grid_search is whatever
# was bound most recently — the Lasso search when run top to bottom.
grid_search.fit(Xtrain_scaled, ytrain)

In [None]:
# Cross-validated error after the log transform: negated MSE -> RMSE -> %.
rmse = np.sqrt(-grid_search.best_score_)
# NOTE(review): the 10**rmse form presumably assumes a log10 target — confirm.
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

In [None]:
# Score the refitted best estimator on the held-out test split.
# NOTE(review): this reads the test split before the final model is chosen
# further down; scoring it only once, at the end, would keep it unbiased —
# confirm this intermediate check is intended.
val = grid_search.best_estimator_.predict(Xtest_scaled)
rmse = np.sqrt(mean_squared_error(ytest, val))
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Percebe-se uma melhora com esse feature engineering

## Quarto experimento: Elastic net

Elastic net é um modelo que combina as regularizações Ridge (L2) e Lasso (L1).

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net search: 10 log-spaced alphas (1e-8 to 1e-3) crossed with 50
# l1_ratio values from 0.01 to 1 — 500 candidates, each scored by 5-fold CV.
# Rebinds `grid` and `grid_search` from the previous experiment.
elastic = ElasticNet()
grid = {
    'alpha': np.logspace(-8, -3, num=10),
    'l1_ratio': np.linspace(0.01, 1, num=50),
}

grid_search = GridSearchCV(
    estimator=elastic,
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

In [None]:
# Fit the elastic net search on the current scaled training features and
# report the winning hyper-parameters with their mean CV score.
grid_search.fit(Xtrain_scaled, ytrain)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

In [None]:
# Cross-validated error of the best elastic net: negated MSE -> RMSE -> %.
rmse = np.sqrt(-grid_search.best_score_)
# NOTE(review): the 10**rmse form presumably assumes a log10 target — confirm.
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

## Considerando todas as opções, Elastic net foi a que teve menor erro

#### Teste final

In [None]:
# Final check: predict the held-out test split with the best estimator of
# the most recent search and report its error in the same percentage form.
# Despite its name, `val` holds test-split predictions here.
val = grid_search.best_estimator_.predict(Xtest_scaled)
rmse = np.sqrt(mean_squared_error(ytest, val))
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')