In [1]:
#in this second part of the Predict House Prices with DecisionTreeRegressor experiment, we shall split the training data
#into input and validation data, this is so that we can test the accuracy of our model against the output of the training data
#since the test data does not come with an output with which to test the accuracy of our model

In [2]:
import pandas as pd

In [3]:
#read the training CSV (path is relative to the notebook's working directory) into a DataFrame
house_data_train = pd.read_csv(filepath_or_buffer="Data/train.csv")

In [5]:
from sklearn.model_selection import train_test_split
#train_test_split comes from the sklearn library; it splits a dataset into a training part and a test (validation) part

In [6]:
#X holds the features (the model inputs); the output y is defined in the next cell
#the feature columns are listed once here so the same selection can be reused
house_data_train_features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = house_data_train.loc[:, house_data_train_features]
X

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9
...,...,...,...,...,...,...,...
1455,7917,1999,953,694,2,3,7
1456,13175,1978,2073,0,2,3,7
1457,9042,1941,1188,1152,2,4,9
1458,9717,1950,1078,0,1,2,5


In [7]:
#y is the output we want to predict: the SalePrice column of the training data
y = house_data_train["SalePrice"]
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [15]:
#time to split our data into a part used for fitting and a held-out part used for checking the model
#test_size=0.25 is spelled out here; it is the same proportion train_test_split uses when no size is given
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
#note: what is split here is the original *training* dataset (the one that has SalePrice), not the test dataset
#we now have features for both training and testing, and outputs for both training and testing

In [17]:
#build the model and fit it on the training split only (X_test / y_test are kept aside for checking)
from sklearn.tree import DecisionTreeRegressor
house_model = DecisionTreeRegressor(random_state=1)
house_model.fit(X=X_train, y=y_train)

DecisionTreeRegressor(random_state=1)

In [18]:
#first, predict on the first five rows of the *training* features (data the model has already seen)
price_predictions = house_model.predict(X_train.head(n=5))
price_predictions

array([107500., 160000., 145000., 192140., 181000.])

In [25]:
#actual sale prices for the same five training rows, for comparison with the predictions
y_train.head(n=5)

1292    107500
1018    160000
1213    145000
1430    192140
810     181000
Name: SalePrice, dtype: int64

In [26]:
#not surprising, our predictions match the actual values exactly for the training dataset, since the model was fitted on these rows

In [27]:
#now lets try with our test dataset, i.e. rows that were held out and not used for fitting
#the variable is named after the data it holds: predictions for the first five rows of X_test
price_predictions_on_test = house_model.predict(X_test.head())
price_predictions_on_test

array([335000., 205000., 139000., 205000.,  89500.])

In [28]:
#actual sale prices for the same five held-out rows, for comparison with the predictions
y_test.head(n=5)

529    200624
491    133000
459    110000
279    192000
655     88000
Name: SalePrice, dtype: int64

In [32]:
#as you can see, our values are a bit off, but the good thing here is that we can actually test by how much on average
#our values are off
from sklearn.metrics import mean_absolute_error

In [34]:
#measure the error of our predictions against the actual values on the held-out test split
#mean absolute error is an error measure, so lower is better
mean_absolute_error(y_true=y_test, y_pred=house_model.predict(X_test))

32966.449315068494

In [36]:
#there you go, we have a mean absolute error MAE of approx. 33000, this means that on average, 
#our prices are off by about $33000

In [37]:
#out of curiosity, compute the same error measure on the training split the model was fitted on
mean_absolute_error(y_true=y_train, y_pred=house_model.predict(X_train))

72.78904109589041

In [38]:
#ok, this is far lower (about $73 versus about $33000 on the test split), which is no surprise,
#since it was trained on training data. LOL. The large gap suggests the tree has memorised (overfit) the training rows