# Real estate price prediction via supervised learning

In this notebook we use Linear Regression to predict real estate prices. This implementation is kept fairly simple where we use only the numerical features and ignore categorical variables.

In [86]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer, StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score

In [87]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [88]:
y = train.SalePrice
X = train.loc[:,'MSSubClass':'SaleCondition'] #Remove id column
print X.shape
X.head()

(1460, 79)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [89]:
# pick only numerical features and ignore all categorical variables 
# this is not ideal! - more exhaustive approach in another notebook
X = X.select_dtypes(exclude=['object'])
print X.shape
X.head()

(1460, 36)


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,2008


In [90]:
# we repeat the same steps with our test data
test = test.loc[:,'MSSubClass':'SaleCondition'] #Remove id column
test = test.select_dtypes(exclude=['object'])
print test.shape
test.head()

(1459, 36)


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,506.0,0,82,0,0,144,0,0,1,2010


In [91]:
# impute missing values in training set
my_imputer = Imputer()
X_arr = my_imputer.fit_transform(X) #returns an ndarray
X = pd.DataFrame(X_arr, columns=X.columns)

# impute missing values in test set
test_arr = my_imputer.transform(test) #returns an ndarray
test = pd.DataFrame(test_arr, columns=test.columns)

In [92]:
# Check how much in mae
def get_mae(X, y, model):
    # multiply by -1 to make positive MAE score instead of neg value returned as sklearn convention
    return -1 * cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error').mean()

In [93]:
#Choose regressor models

#naive linear regressor
lm = LinearRegression(copy_X=False, fit_intercept=True, normalize=False)

#linear regressor with regularization
lm_reg = Ridge(copy_X=False, fit_intercept=True, alpha=500)

#random forest regressor
rf = RandomForestRegressor(75)

In [94]:
# Check CV scores with these two models
mae = get_mae(X,y, lm)
print('CV mean absolute error (linear regression:', mae)
mae = get_mae(X,y, lm_reg)
print('CV mean absolute error (linear regression with regularization):', mae)
mae = get_mae(X,y, rf)
print('CV mean absolute error (random forest):', mae)

('CV mean absolute error (linear regression:', 22496.21328126865)
('CV mean absolute error (linear regression with regularization):', 22039.824974498679)
('CV mean absolute error (random forest):', 18290.400020937614)


In [95]:
# scale features to zero mean and unit variance values in training set
scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
X_arr = scaler.fit_transform(X)
X = pd.DataFrame(X_arr, columns=test.columns)

# transform the test data
test_arr = scaler.transform(test) #returns an ndarray
test = pd.DataFrame(test_arr, columns=test.columns)

In [96]:
# Check CV scores with these two models
mae = get_mae(X,y, lm)
print('CV mean absolute error (linear regression:', mae)
mae = get_mae(X,y, lm_reg)
print('CV mean absolute error (linear regression with regularization):', mae)
mae = get_mae(X,y, rf)
print('CV mean absolute error (random forest):', mae)

('CV mean absolute error (linear regression:', 22503.281714547458)
('CV mean absolute error (linear regression with regularization):', 21825.033828436543)
('CV mean absolute error (random forest):', 18497.847815442674)


In [97]:
# include polynomial features
poly = PolynomialFeatures(degree=2)
X_arr = poly.fit_transform(X)
X = pd.DataFrame(X_arr, columns=poly.get_feature_names(X.columns))

# transform the test data
test_arr = poly.transform(test) #returns an ndarray
test = pd.DataFrame(test_arr, columns=poly.get_feature_names(X.columns))

In [98]:
# Check CV scores with these two models
mae = get_mae(X,y, lm)
print('CV mean absolute error (linear regression:', mae)
mae = get_mae(X,y, lm_reg)
print('CV mean absolute error (linear regression with regularization):', mae)
mae = get_mae(X,y, rf)
print('CV mean absolute error (random forest):', mae)

('CV mean absolute error (linear regression:', 897994599588776.62)
('CV mean absolute error (linear regression with regularization):', 19528.042440925285)
('CV mean absolute error (random forest):', 18528.819112921326)


## Conclusions:

* Random Forest without scaling and without including has the best CV score (18290)

* Linear Regression with regulation does better than its naive implementation

* Feature scaling and polynomial features improves the Linear Regression model