# Building a regression model for predicting house sale prices

In [1]:
import pickle
import pathlib

import numpy as np
import pandas as pd

In [2]:
DATA_DIR = pathlib.Path.cwd() / 'data'
print(DATA_DIR)

c:\Insper\6SEM\ml\HousePriceRegressor\data


In [3]:
clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'

In [4]:
with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Data columns (total 70 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   MS.SubClass      2877 non-null   category
 1   MS.Zoning        2877 non-null   category
 2   Lot.Frontage     2877 non-null   float64 
 3   Lot.Area         2877 non-null   float64 
 4   Lot.Shape        2877 non-null   category
 5   Land.Contour     2877 non-null   category
 6   Lot.Config       2877 non-null   category
 7   Land.Slope       2877 non-null   category
 8   Neighborhood     2877 non-null   category
 9   Bldg.Type        2877 non-null   category
 10  House.Style      2877 non-null   category
 11  Overall.Qual     2877 non-null   category
 12  Overall.Cond     2877 non-null   category
 13  Roof.Style       2877 non-null   category
 14  Mas.Vnr.Type     2877 non-null   category
 15  Mas.Vnr.Area     2877 non-null   float64 
 16  Exter.Qual       2877 non-null   category
 17  

In [6]:
model_data = data.copy()

## Encoding categorical variables

Lets identify all categorical variables - both nominal (that is, categoricals without category order) and ordinal.

In [7]:
categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

In [8]:
ordinal_columns

['Lot.Shape',
 'Land.Slope',
 'Overall.Qual',
 'Overall.Cond',
 'Exter.Qual',
 'Exter.Cond',
 'Heating.QC',
 'Electrical',
 'Kitchen.Qual',
 'Functional',
 'Paved.Drive',
 'Fence']

In [9]:
categorical_columns

['MS.SubClass',
 'MS.Zoning',
 'Land.Contour',
 'Lot.Config',
 'Neighborhood',
 'Bldg.Type',
 'House.Style',
 'Roof.Style',
 'Mas.Vnr.Type',
 'Foundation',
 'Bsmt.Qual',
 'Bsmt.Cond',
 'Bsmt.Exposure',
 'BsmtFin.Type.1',
 'BsmtFin.Type.2',
 'Central.Air',
 'Garage.Type',
 'Garage.Finish',
 'Sale.Type',
 'Sale.Condition',
 'Condition',
 'Exterior']

### Encoding ordinal variables 

In [10]:
for col in ordinal_columns:
    codes, _ = pd.factorize(data[col], sort=True)
    model_data[col] = codes

In [11]:
model_data[ordinal_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Lot.Shape     2877 non-null   int64
 1   Land.Slope    2877 non-null   int64
 2   Overall.Qual  2877 non-null   int64
 3   Overall.Cond  2877 non-null   int64
 4   Exter.Qual    2877 non-null   int64
 5   Exter.Cond    2877 non-null   int64
 6   Heating.QC    2877 non-null   int64
 7   Electrical    2877 non-null   int64
 8   Kitchen.Qual  2877 non-null   int64
 9   Functional    2877 non-null   int64
 10  Paved.Drive   2877 non-null   int64
 11  Fence         2877 non-null   int64
dtypes: int64(12)
memory usage: 292.2 KB


In [12]:
data['Lot.Shape'].value_counts()

Lot.Shape
Reg    1825
IR1     960
IR2      76
IR3      16
Name: count, dtype: int64

In [13]:
model_data['Lot.Shape'].value_counts()

Lot.Shape
0    1825
1     960
2      76
3      16
Name: count, dtype: int64

In [14]:
model_data['Exterior'].value_counts()

Exterior
VinylSd    1024
HdBoard     439
MetalSd     432
Wd Sdng     401
Plywood     218
CemntBd     126
BrkFace      86
WdShing      55
Stucco       42
AsbShng      41
Other        13
Name: count, dtype: int64

In [15]:
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
AsbShng,False,False,False,False,False
BrkFace,True,False,False,True,False
CemntBd,False,False,False,False,False
HdBoard,False,False,False,False,False
MetalSd,False,False,False,False,False
Plywood,False,False,False,False,False
Stucco,False,False,False,False,False
VinylSd,False,True,False,False,True
Wd Sdng,False,False,True,False,False
WdShing,False,False,False,False,False


In [16]:
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data, drop_first=True)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
BrkFace,True,False,False,True,False
CemntBd,False,False,False,False,False
HdBoard,False,False,False,False,False
MetalSd,False,False,False,False,False
Plywood,False,False,False,False,False
Stucco,False,False,False,False,False
VinylSd,False,True,False,False,True
Wd Sdng,False,False,True,False,False
WdShing,False,False,False,False,False
Other,False,False,False,False,False


In [17]:
model_data = pd.get_dummies(model_data, drop_first=True)

In [18]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Columns: 165 entries, Lot.Frontage to Exterior_Other
dtypes: bool(119), float64(34), int64(12)
memory usage: 1.4 MB


In [19]:
for cat in categorical_columns:
    dummies = []
    for col in model_data.columns:
        if col.startswith(cat + "_"):
            dummies.append(f'"{col}"')
    dummies_str = ', '.join(dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

From column "MS.SubClass" we made "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_120", "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_OldTown", "Neighborhood_SWISU", "Neighborhood_Sawyer", "Neighborhood_Sa

## Train-test splitting

In [20]:
X = model_data.drop(columns=['SalePrice']).copy()
y = model_data['SalePrice'].copy()

In [21]:
X, y

(      Lot.Frontage  Lot.Area  Lot.Shape  Land.Slope  Overall.Qual  \
 0            141.0   31770.0          1           0             5   
 1             80.0   11622.0          0           0             4   
 2             81.0   14267.0          1           0             5   
 3             93.0   11160.0          0           0             6   
 4             74.0   13830.0          1           0             4   
 ...            ...       ...        ...         ...           ...   
 2925          37.0    7937.0          1           0             5   
 2926          68.0    8885.0          1           1             4   
 2927          62.0   10441.0          0           0             4   
 2928          77.0   10010.0          0           1             4   
 2929          74.0    9627.0          0           1             6   
 
       Overall.Cond  Mas.Vnr.Area  Exter.Qual  Exter.Cond  BsmtFin.SF.1  ...  \
 0                4         112.0           2           2         639.0  ...  

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
RANDOM_SEED = 42  # Any number here, really.

In [24]:
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_SEED,
)


In [25]:
X.shape, Xtrain.shape, Xtest.shape

((2877, 164), (2157, 164), (720, 164))

In [26]:
y.shape, ytrain.shape, ytest.shape

((2877,), (2157,), (720,))

## First Experiment: linear with scaling, cross_validation

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


float64_columns = Xtrain.select_dtypes('float64').columns

col = ColumnTransformer(
    [
        ('scale', StandardScaler(), float64_columns),
    ],
    remainder='passthrough',
)

col.fit(Xtrain)

Xtrain_scaled = col.transform(Xtrain)
Xtest_scaled = col.transform(Xtest)


In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

lr = LinearRegression()

grid = {
    'fit_intercept': [True, False],
}

grid_search = GridSearchCV(
    lr,
    grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

In [29]:
grid_search.fit(Xtrain_scaled, ytrain)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

rmse = np.sqrt(-grid_search.best_score_)
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Best parameters: {'fit_intercept': False}
Best score: -6736423346737.394
Average error is inf%


  error_percent = 100 * (10**rmse - 1)


In [30]:
val = grid_search.best_estimator_.predict(Xtest_scaled)
rmse = np.sqrt(mean_squared_error(ytest, val))
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is inf%


  error_percent = 100 * (10**rmse - 1)


## Second experiment: Lasso and scaling

In [31]:
from sklearn.linear_model import Lasso

grid = {
    'alpha': np.logspace(-8, -3, 200),
}

lasso = Lasso()

grid_search = GridSearchCV(
    lasso,
    grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

In [32]:
grid_search.fit(Xtrain_scaled, ytrain)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best parameters: {'alpha': 8.804883581643464e-05}
Best score: -0.003296308851327888


In [33]:
rmse = np.sqrt(-grid_search.best_score_)
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 14.13%


In [34]:
val = grid_search.best_estimator_.predict(Xtest_scaled)
rmse = np.sqrt(mean_squared_error(ytest, val))
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.30%


## Third experiment: skewness and log

In [45]:
#test skewness for all columns and get index of those with skewness > 0.5
from scipy.stats import skew

skewness = Xtrain.select_dtypes(np.number).apply(skew)

skewness

Lot.Frontage        1.374507
Lot.Area           13.935570
Lot.Shape           1.260202
Land.Slope          5.096601
Overall.Qual        0.238328
Overall.Cond        0.631834
Mas.Vnr.Area        2.584621
Exter.Qual         -0.813090
Exter.Cond         -1.319756
BsmtFin.SF.1        1.013526
BsmtFin.SF.2        4.222772
Bsmt.Unf.SF         0.955611
Total.Bsmt.SF       0.729935
Heating.QC          0.546224
Electrical          4.168022
X1st.Flr.SF         1.263409
X2nd.Flr.SF         0.842831
Low.Qual.Fin.SF    11.186456
Gr.Liv.Area         1.096900
Bsmt.Full.Bath      0.614981
Bsmt.Half.Bath      3.940715
Full.Bath           0.159638
Half.Bath           0.686145
Bedroom.AbvGr       0.408400
Kitchen.AbvGr       4.018729
Kitchen.Qual       -0.456881
TotRms.AbvGrd       0.770448
Functional          5.199897
Fireplaces          0.730059
Garage.Cars        -0.179943
Garage.Area         0.264373
Paved.Drive         3.172562
Wood.Deck.SF        1.974402
Open.Porch.SF       2.676164
Enclosed.Porch

In [46]:
skewness = skewness[abs(skewness) > 3]

skew_features = Xtrain[skewness.index]

skew_features.columns

Index(['Lot.Area', 'Land.Slope', 'BsmtFin.SF.2', 'Electrical',
       'Low.Qual.Fin.SF', 'Bsmt.Half.Bath', 'Kitchen.AbvGr', 'Functional',
       'Paved.Drive', 'Enclosed.Porch', 'X3Ssn.Porch', 'Screen.Porch',
       'Pool.Area', 'Misc.Val'],
      dtype='object')

In [47]:
def log_tf(feature):
    return np.log1p(feature)

Xtrain_skew = Xtrain.copy()
Xtest_skew = Xtest.copy()

Xtrain_skew[skew_features.columns] = Xtrain_skew[skew_features.columns].apply(log_tf)
Xtest_skew[skew_features.columns] = Xtest_skew[skew_features.columns].apply(log_tf)


In [48]:
col.fit(Xtrain_skew)

Xtrain_scaled = col.transform(Xtrain_skew)
Xtest_scaled = col.transform(Xtest_skew)

In [49]:
grid_search.fit(Xtrain_scaled, ytrain)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [40]:
rmse = np.sqrt(-grid_search.best_score_)
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 12.88%


In [41]:
val = grid_search.best_estimator_.predict(Xtest_scaled)
rmse = np.sqrt(mean_squared_error(ytest, val))
error_percent = 100 * (10**rmse - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 14.91%


## Ainda pior que o baseline