In [71]:
import pandas as pd
# Load the Melbourne housing dataset from the CSV file (path is relative to
# the notebook's working directory).
melb_data = pd.read_csv('datasheets/melb_data.csv')

# A notebook cell only renders its final expression, so a bare
# `melb_data.describe()` placed before it would be computed and discarded.
# The cell therefore ends with the single value it is meant to show: the
# column index, used to pick the prediction target and the features.
melb_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [72]:
# Features for model
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

# DataFrame.dropna() returns a NEW frame and leaves melb_data untouched, so the
# result has to be kept; calling it as a bare statement drops nothing.
# Only the columns the model actually reads (the features plus the Price
# target) are checked, so a gap in an unrelated column does not discard a row.
# The cleaned frame gets its own name, which keeps this cell idempotent and
# leaves the raw melb_data available to other cells.
melb_data_clean = melb_data.dropna(axis=0, subset=melbourne_features + ['Price'])

# Prediction target; by convention it is called y.
y = melb_data_clean.Price

# Feature matrix, taken from the same cleaned frame so its rows stay aligned
# with y.
x = melb_data_clean[melbourne_features]
x.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


In [73]:
# Preview the first eight rows of the feature matrix.
x.head(n=8)

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.7996,144.9984
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
3,3,2.0,94.0,-37.7969,144.9969
4,4,1.0,120.0,-37.8072,144.9941
5,2,1.0,181.0,-37.8041,144.9953
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [74]:

# Decision-tree regressor from scikit-learn.
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the fitted tree the same on every run.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Train the model on the feature matrix x against the target y.
melbourne_model.fit(x, y)

# Show the first five houses, followed by the model's predictions for them.
first_five_houses = x.head()
print("Making prediciton for these 5 house ")
print(first_five_houses)

print("The price predicitons are: ")
first_five_predictions = melbourne_model.predict(first_five_houses)
print(first_five_predictions)

Making prediciton for these 5 house 
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
0      2       1.0     202.0   -37.7996    144.9984
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
3      3       2.0      94.0   -37.7969    144.9969
4      4       1.0     120.0   -37.8072    144.9941
The price predicitons are: 
[1480000. 1035000. 1465000.  850000. 1600000.]


In [75]:
from sklearn.metrics import mean_absolute_error

# Put the model's predictions for the first five houses next to their actual
# prices.
print("Predicted prices vs. Actual Prices: ")
head_predictions = melbourne_model.predict(x.head())
print(head_predictions)
print(y.head())

# Mean absolute error over the full x / y. If melbourne_model is still the
# one fitted on this same x and y, this is an in-sample score and will look
# optimistic.
predicted_home_prices = melbourne_model.predict(x)
# Final expression, so the notebook renders the MAE as the cell output.
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

Predicted prices vs. Actual Prices: 
[1480000. 1035000. 1465000.  850000. 1600000.]
0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64


1125.1804614629357

In [76]:
# TODO: Look up more
from sklearn.model_selection import train_test_split

# Split both the features and the target into training and validation sets.
# The split is driven by a random number generator; passing a fixed
# random_state gives the same split on every run.
train_X, val_X, train_y, val_y = train_test_split(x, y, random_state=0)

# Seed the tree as well. An unseeded DecisionTreeRegressor can fit a different
# tree on each run even on identical data, so the validation MAE below would
# not be reproducible despite the fixed split.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit on the training portion only, so the validation rows stay unseen.
melbourne_model.fit(train_X, train_y)

# Mean absolute error on the held-out validation rows.
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

248621.68748159058
