### Data exploration

In [2]:
import pandas as pd
def _three_decimal_places(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the formatter for every float pandas displays in this notebook.
pd.set_option('display.float_format', _three_decimal_places)

In [3]:
# IPython shell escape: list the files in the current working directory
# (melb_data.csv, loaded in the following cells, should appear here).
!ls

cross-validation.ipynb  ml_intro_0.ipynb        ml_missing_values.ipynb
melb_data.csv           ml_intro_1.ipynb        xgboost.ipynb
ml_cat_variables.ipynb  ml_intro_2.ipynb


In [4]:
# Path to the dataset CSV, relative to the notebook's working directory.
melbourne_file_path = 'melb_data.csv'

In [5]:
# Read the CSV into a DataFrame, then display its (rows, columns) shape.
melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data.shape

(13580, 21)

In [6]:
# Summary statistics for the numeric columns. A count below the total row
# count (here Car, BuildingArea and YearBuilt) means the column has missing values.
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.938,1075684.079,10.138,3105.302,2.915,1.534,1.61,558.416,151.968,1964.684,-37.809,144.995,7454.417
std,0.956,639310.724,5.869,90.677,0.966,0.692,0.963,3990.669,541.015,37.274,0.079,0.104,4378.582
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.183,144.432,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.857,144.93,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802,145.0,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.756,145.058,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.409,145.526,21650.0


In [7]:
# Show every column name. Note the dataset's own spellings 'Lattitude' and
# 'Longtitude' -- code that selects these columns must use them exactly as written.
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [8]:
# Preview the first five rows (head() defaults to n=5).
melbourne_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.8,144.998,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.808,144.993,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.809,144.994,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.797,144.997,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.807,144.994,Northern Metropolitan,4019.0


### Data preparation

In [9]:
# Keep only the rows that have a value in every column: a single missing
# value anywhere in a row is enough to discard it.
melbourne_data = melbourne_data.dropna(axis="index", how="any")

In [10]:
# Prediction target: the Price column of the cleaned frame.
y = melbourne_data["Price"]
y.describe()

count      6196.000
mean    1068828.202
std      675156.428
min      131000.000
25%      620000.000
50%      880000.000
75%     1325000.000
max     9000000.000
Name: Price, dtype: float64

In [11]:
# Columns fed to the model. 'Lattitude' and 'Longtitude' are spelled the way
# the dataset spells them.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
X = melbourne_data.loc[:, melbourne_features]
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931,1.576,471.007,-37.808,144.99
std,0.971,0.711,897.45,0.076,0.099
min,1.0,1.0,0.0,-38.165,144.542
25%,2.0,1.0,152.0,-37.855,144.926
50%,3.0,1.0,373.0,-37.802,144.996
75%,4.0,2.0,628.0,-37.758,145.053
max,8.0,8.0,37000.0,-37.457,145.526


### Data modeling

In [12]:
from sklearn.tree import DecisionTreeRegressor

# Build the tree with a fixed random_state so every run produces the same
# model, and train it on the selected features (X) against the target (y).
# fit() returns the estimator itself, so it can be chained onto construction.
melbourne_model = DecisionTreeRegressor(random_state=1).fit(X, y)

# Display the fitted estimator.
melbourne_model

DecisionTreeRegressor(random_state=1)

In [13]:
# Take the first five feature rows once and reuse them for both the
# printout and the prediction call.
first_five_houses = X.head()

print("Making predictions for the following 5 houses:")
print(first_five_houses)
print("")
print("The predictions are")
print(melbourne_model.predict(first_five_houses))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2     1.000   156.000    -37.808     144.993
2      3     2.000   134.000    -37.809     144.994
4      4     1.000   120.000    -37.807     144.994
6      3     2.000   245.000    -37.802     144.999
7      2     1.000   256.000    -37.806     144.995

The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


### Simple evaluation

In [15]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the model's predictions against the known prices.
# NOTE: the model was fit on this same X and y, so this is an in-sample
# (training) error and is likely to understate the error on unseen homes.
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

1115.7467183128902