In [299]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

Predicting House Sale Price using Decision Tree Regressor

This notebook is part of a Kaggle "for fun" competition.

The data is provided by https://www.kaggle.com

link : https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview

In [300]:
# Labelled training data: one row per house, including its SalePrice.
df_house_train = pd.read_csv(filepath_or_buffer='train.csv')

# Competition data without SalePrice; predictions for it are made later on.
df_house_predict = pd.read_csv(filepath_or_buffer='test.csv')

# Preview the first rows of the training data.
df_house_train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [301]:
# List every column name in the training data so candidate features can be chosen.
df_house_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [302]:
# Columns used as model inputs, each with a short description.
features = [
    'LotArea',       # lot size in square feet; larger lots usually sell for more
    'YearBuilt',     # year the house was built; older houses usually sell for less
    'OverallCond',   # overall condition rating after inspection, 1 to 10 (10 is best)
    '1stFlrSF',      # first-floor size in square feet
    '2ndFlrSF',      # second-floor size in square feet
    'FullBath',      # number of bathrooms
    'BedroomAbvGr',  # number of bedrooms
    'TotRmsAbvGrd',  # total rooms in the house, not counting bathrooms
]

# Target (dependent variable) the model is trained to predict.
y = df_house_train.loc[:, 'SalePrice']

In [303]:
# Preview the selected feature columns of the training data.
df_house_train.loc[:, features].head(n=5)

Unnamed: 0,LotArea,YearBuilt,OverallCond,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,5,856,854,2,3,8
1,9600,1976,8,1262,0,2,3,6
2,11250,2001,5,920,866,2,3,6
3,9550,1915,5,961,756,1,3,7
4,14260,2000,5,1145,1053,2,4,9


In [304]:
# A little data cleaning comes next.

# Confirm that every selected feature is stored as a numeric type.
df_house_train.loc[:, features].dtypes
# All of them are numeric.

LotArea         int64
YearBuilt       int64
OverallCond     int64
1stFlrSF        int64
2ndFlrSF        int64
FullBath        int64
BedroomAbvGr    int64
TotRmsAbvGrd    int64
dtype: object

In [305]:
# Look for missing values in the selected features; NaNs would interfere
# with model training, so this has to be checked first.
df_house_train.loc[:, features].isna().to_numpy().any()
# False means none of the selected features contains a NaN.
# That may simply be because this is a curated Kaggle data set;
# real-world data could well have missing values.
# Nothing to clean here, so we move on.

False

In [306]:
# Summary statistics of the selected features, as a sanity check on the values.
df_house_train.loc[:, features].describe()

Unnamed: 0,LotArea,YearBuilt,OverallCond,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,5.575342,1162.626712,346.992466,1.565068,2.866438,6.517808
std,9981.264932,30.202904,1.112799,386.587738,436.528436,0.550916,0.815778,1.625393
min,1300.0,1872.0,1.0,334.0,0.0,0.0,0.0,2.0
25%,7553.5,1954.0,5.0,882.0,0.0,1.0,2.0,5.0
50%,9478.5,1973.0,5.0,1087.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,6.0,1391.25,728.0,2.0,3.0,7.0
max,215245.0,2010.0,9.0,4692.0,2065.0,3.0,8.0,14.0


In [307]:
# Hold out part of the labelled data so the model can be scored on rows it
# did not see during training. A fixed random_state gives the same split on
# every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    df_house_train[features],
    y,
    random_state=1,
)

# Standardizing the features (a StandardScaler fitted on the training split and
# then applied to the held-out split) was tried as well. It gave a lower score,
# so the features are left unscaled.

In [308]:
# Decision-tree regressor; random_state is fixed so the fitted tree is reproducible.
house_predict = DecisionTreeRegressor(random_state=1)

# Fit on the training split.
house_predict.fit(X=train_features, y=train_labels)

# Score on the held-out split. For a regressor, score() is the coefficient of
# determination (R^2), not a percentage of correct predictions.
house_predict.score(X=test_features, y=test_labels)
# roughly 0.76

0.7628402884641607

In [309]:
# Mean absolute error on the held-out split: the average absolute difference
# between the predicted and the actual sale price.
val_predictions = house_predict.predict(test_features)
print(mean_absolute_error(y_true=test_labels, y_pred=val_predictions))

26889.909589041097


In [310]:
# Back to the competition data frame that was loaded at the start.
# These are the houses whose sale price gets predicted with the model
# trained above; the predictions form the Kaggle submission.
df_house_predict.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [311]:
# The competition data has the same structure, so the same feature columns are used.
df_house_predict.loc[:, features].head(n=5)

Unnamed: 0,LotArea,YearBuilt,OverallCond,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,11622,1961,6,896,0,1,2,5
1,14267,1958,6,1329,0,1,3,6
2,13830,1997,5,928,701,2,3,6
3,9978,1998,6,926,678,2,3,7
4,5005,1992,5,1280,0,2,2,5


In [312]:
# Repeat the light data cleaning for the competition data.

# Confirm that every selected feature is stored as a numeric type.
df_house_predict.loc[:, features].dtypes
# All of them are numeric.

LotArea         int64
YearBuilt       int64
OverallCond     int64
1stFlrSF        int64
2ndFlrSF        int64
FullBath        int64
BedroomAbvGr    int64
TotRmsAbvGrd    int64
dtype: object

In [313]:
# Look for missing values in the competition features as well; NaNs would
# interfere with the model's predictions.
df_house_predict.loc[:, features].isna().to_numpy().any()
# False means none of the selected features contains a NaN.


False

In [314]:
# Summary statistics of the competition features, as a sanity check on the values.
df_house_predict.loc[:, features].describe()

Unnamed: 0,LotArea,YearBuilt,OverallCond,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,9819.161069,1971.357779,5.553804,1156.534613,325.967786,1.570939,2.85401,6.385195
std,4955.517327,30.390071,1.11374,398.16582,420.610226,0.55519,0.829788,1.508895
min,1470.0,1879.0,1.0,407.0,0.0,0.0,0.0,3.0
25%,7391.0,1953.0,5.0,873.5,0.0,1.0,2.0,5.0
50%,9399.0,1973.0,5.0,1079.0,0.0,2.0,3.0,6.0
75%,11517.5,2001.0,6.0,1382.5,676.0,2.0,3.0,7.0
max,56600.0,2010.0,9.0,5095.0,1862.0,4.0,6.0,15.0


In [315]:
# Predict sale prices for the competition data.
# The already-fitted house_predict model is reused here; no new estimator is
# created, because an unfitted DecisionTreeRegressor cannot predict and would
# only suggest that a second model is involved.
house_price_prediction = house_predict.predict(df_house_predict[features])

# Show the predicted prices (one value per row of the competition data).
house_price_prediction

array([149000., 176000., 181000., ..., 137500., 145000., 225000.])

In [316]:
# Put each predicted price next to the Id of the house it belongs to,
# which is easier to read than the bare array.
new_dataframe_house = pd.DataFrame(
    {
        'Id': df_house_predict['Id'],
        'SalePrice': house_price_prediction,
    }
)
new_dataframe_house.head(n=5)

Unnamed: 0,Id,SalePrice
0,1461,149000.0
1,1462,176000.0
2,1463,181000.0
3,1464,181000.0
4,1465,180000.0


In [317]:
# Save the Id / SalePrice pairs as a CSV file for the Kaggle submission;
# index=False keeps the row index out of the file.
new_dataframe_house.to_csv(path_or_buf='house_price_submission.csv', index=False)