# House Prices - Advanced Regression Techniques
## Predict sales prices and practice feature engineering, RFs, and gradient boosting

https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview
    

In [175]:
# Imports
import torch
import math, time
import numpy as np
import pandas as pd
from torch import nn
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

In [145]:
# Read the train and test splits from the local data/ directory
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [146]:
# Peek at the last rows of the test split
test.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
1458,2919,60,RL,74.0,9627,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,11,2006,WD,Normal


In [147]:
# Peek at the last rows of the train split
train.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [148]:
# Compare the number of columns: train has one more column than test
len(train.columns), len(test.columns)

(81, 80)

In [149]:
# Columns present in train but absent from test: only the target, SalePrice
diff = set(train.columns).difference(test.columns)
diff

{'SalePrice'}

In [150]:
# The test file has no target column, so it is presumably meant only for submissions.
# train therefore serves as the working dataset: the features are the columns it
# shares with test, and the target is SalePrice.
dataset = train
X, y = dataset[test.columns], dataset['SalePrice']
X.shape, y.shape

((1460, 80), (1460,))

In [151]:
# Which feature columns contain missing values, and how many does each have?
missing_values = X.columns[X.isna().any()].tolist()
X[missing_values].isna().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [168]:
# Count missing values in the target y
y.isnull().sum()

0

In [152]:
# Baseline experiment: use every feature that is fully populated and see how far
# that gets us. Columns containing missing values are set aside for now.

### Pre-process data

# Xm is X restricted to the columns that have no missing values
Xm = X.drop(columns=missing_values)
print(Xm.columns)

# Split the remaining columns by dtype: numerical (int/float) vs. categorical (object)
numerical_columns = Xm.select_dtypes(include=['int', 'float']).columns
categorical_columns = Xm.select_dtypes(include=['object']).columns
X_num = Xm[numerical_columns]
X_cat = Xm[categorical_columns]

# Sanity check: the two groups together should account for every column of Xm
print('Numerical columns:', len(numerical_columns))
print('Categorical columns:', len(categorical_columns))
print('Total:', len(categorical_columns) + len(numerical_columns), " Valid: ", Xm.shape[1] == len(categorical_columns) + len(numerical_columns))

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition'],
      dtype='object')
Numerical columns: 34
Categorical columns: 27
Total: 61  Vali

In [153]:
# Show the column names in each group (numerical first, then categorical)
numerical_columns, categorical_columns

(Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
        'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
        'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
        'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'],
       dtype='object'),
 Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating',
        'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
        'SaleType', 'SaleCondition'],
       dtype='object'))

- Scaling numerical data is pretty straightforward: we standardize each feature to zero mean and unit variance, which puts the features on a comparable scale so that machine learning estimators can behave as intended.
- Encoding categorical features is a bit more complicated, because we have to decide whether to encode them with an ordinal encoder or not. For this experiment, I'm going to use `get_dummies`, which is a non-ordinal (one-hot) encoding: it creates new columns containing only 0 and 1, instead of turning the labels into a single column with 0, 1, 2, 3... If I did the latter, the model might learn that 3 > 1, which is meaningless for a feature such as color.

In [156]:
# Show a sample of both feature groups before they are transformed
print(X_num[:10], X_cat[:10])

# Categorical features: one-hot encode into indicator columns
X_cat = pd.get_dummies(X_cat)

# Numerical features: standardize with StandardScaler
# NOTE(review): the scaler is fit on every row of X_num, i.e. before the
# train/test split further down, so held-out rows influence the scaling.
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

   Id  MSSubClass  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \
0   1          60     8450            7            5       2003          2003   
1   2          20     9600            6            8       1976          1976   
2   3          60    11250            7            5       2001          2002   
3   4          70     9550            7            5       1915          1970   
4   5          60    14260            8            5       2000          2000   
5   6          50    14115            5            5       1993          1995   
6   7          20    10084            8            5       2004          2005   
7   8          60    10382            7            6       1973          1973   
8   9          50     6120            7            5       1931          1950   
9  10         190     7420            5            6       1939          1950   

   BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  ...  GarageArea  WoodDeckSF  \
0         706           0        150  .

In [160]:
# Print the shapes of both feature blocks, then their first 10 rows
print(X_num.shape, X_cat.shape)
print(X_num[:10], X_cat[:10])

(1460, 34) (1460, 182)
[[-1.73086488  0.07337496 -0.20714171  0.65147924 -0.51719981  1.05099379
   0.87866809  0.57542484 -0.28865283 -0.94459061 -0.45930254 -0.79343379
   1.16185159 -0.12024172  0.37033344  1.10781015 -0.24106104  0.78974052
   1.22758538  0.16377912 -0.21145358  0.91220977 -0.95122649  0.31172464
   0.35100032 -0.75217584  0.21650316 -0.3593249  -0.11633929 -0.27020835
  -0.06869175 -0.08768781 -1.5991111   0.13877749]
 [-1.7284922  -0.87256276 -0.09188637 -0.07183611  2.17962776  0.15673371
  -0.42957697  1.17199212 -0.28865283 -0.64122799  0.46646492  0.25714043
  -0.79516323 -0.12024172 -0.48251191 -0.81996437  3.94880935  0.78974052
  -0.76162067  0.16377912 -0.21145358 -0.31868327  0.60049493  0.31172464
  -0.06073101  1.62619479 -0.70448325 -0.3593249  -0.11633929 -0.27020835
  -0.06869175 -0.08768781 -0.48911005 -0.61443862]
 [-1.72611953  0.07337496  0.07347998  0.65147924 -0.51719981  0.9847523
   0.83021457  0.09290718 -0.28865283 -0.30164298 -0.31336875 

In [167]:
# Merge the scaled numerical block and the one-hot encoded categorical block
# side by side. X_num is a bare NumPy array after scaling, so it is re-wrapped
# with the row index of X_cat: pd.concat(axis=1) aligns rows on index labels, and
# a fresh default index would only line up when X_cat's index happens to be 0..n-1.
X_merged = pd.concat(
    [pd.DataFrame(X_num, columns=numerical_columns, index=X_cat.index), X_cat],
    axis=1,
)
print(X_merged.shape, X_merged)

(1460, 216)             Id  MSSubClass   LotArea  OverallQual  OverallCond  YearBuilt  \
0    -1.730865    0.073375 -0.207142     0.651479    -0.517200   1.050994   
1    -1.728492   -0.872563 -0.091886    -0.071836     2.179628   0.156734   
2    -1.726120    0.073375  0.073480     0.651479    -0.517200   0.984752   
3    -1.723747    0.309859 -0.096897     0.651479    -0.517200  -1.863632   
4    -1.721374    0.073375  0.375148     1.374795    -0.517200   0.951632   
...        ...         ...       ...          ...          ...        ...   
1455  1.721374    0.073375 -0.260560    -0.071836    -0.517200   0.918511   
1456  1.723747   -0.872563  0.266407    -0.071836     0.381743   0.222975   
1457  1.726120    0.309859 -0.147810     0.651479     3.078570  -1.002492   
1458  1.728492   -0.872563 -0.080160    -0.795151     0.381743  -0.704406   
1459  1.730865   -0.872563 -0.058112    -0.795151     0.381743  -0.207594   

      YearRemodAdd  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  ...  Sal

In [169]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X_merged,
    y,
    test_size=0.2,
    random_state=0,
)

In [170]:
# Print the shapes of the four pieces returned by the split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1168, 216) (292, 216) (1168,) (292,)


In [173]:
# Choose a regression algorithm: plain LinearRegression as the baseline model
regressor = LinearRegression()

# Train the regression model on the training split
regressor.fit(X_train, y_train)

In [235]:
# Predict on the held-out rows and line the predictions up with the actual prices
y_pred = regressor.predict(X_test)
results = pd.DataFrame(
    np.column_stack((y_test.to_numpy(), y_pred)),
    columns=["Test Data", "Predicted"],
)

# Score the predictions with mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 4.323736233256736e+27


In [244]:
# Display the results DataFrame (actual vs. predicted values)
results

Unnamed: 0,Test Data,Predicted
0,200624.0,275109.0
1,133000.0,145210.0
2,110000.0,117224.0
3,192000.0,216340.0
4,88000.0,98508.0
...,...,...
287,324000.0,284284.0
288,555000.0,460834.0
289,136000.0,264300.0
290,82500.0,127678.0


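- So, we get an MSE of about 4.32e+27 with this simple approach, where we just dropped all the features with missing values and did only minimal pre-processing before fitting a regression model.
- The MSE is the average squared difference between the predicted and the actual values. Sale prices are on the order of 1e5, so a value of 4.32e+27 is far too large to come from ordinary errors: most of the rows shown above look reasonable, so a few predictions are presumably off by many orders of magnitude and dominate the average.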