In [1]:
import pandas as pd

### I. Play with dataset and feature selection

In [2]:
# Relative path: resolved against the directory the notebook kernel runs in.
melbourne_file_path = "data/melb_data.csv"
melb_data = pd.read_csv(melbourne_file_path)

# Summary statistics of the numeric columns. Columns whose `count` is lower
# than the others contain missing values.
melb_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [4]:
# List all column names (numeric and non-numeric) before picking the target and features.
melb_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
# Drop every row that has a missing value in any column (axis=0 selects rows).
# NOTE: this rebinds `melb_data`, so every later cell works on the reduced frame
# and the raw data is only recoverable by re-running the load cell. In the
# recorded describe() outputs the row count goes from 13580 to 6196.
melb_data = melb_data.dropna(axis=0)


In [6]:
# Re-check the summary after dropping incomplete rows: all counts should now match.
melb_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1068828.0,9.751097,3101.947708,2.902034,1.57634,1.573596,471.00694,141.568645,1964.081988,-37.807904,144.990201,7435.489509
std,0.971079,675156.4,5.612065,86.421604,0.970055,0.711362,0.929947,897.449881,90.834824,38.105673,0.07585,0.099165,4337.698917
min,1.0,131000.0,0.0,3000.0,0.0,1.0,0.0,0.0,0.0,1196.0,-38.16492,144.54237,389.0
25%,2.0,620000.0,5.9,3044.0,2.0,1.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198,4383.75
50%,3.0,880000.0,9.0,3081.0,3.0,1.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958,6567.0
75%,4.0,1325000.0,12.4,3147.0,3.0,2.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527,10175.0
max,8.0,9000000.0,47.4,3977.0,9.0,8.0,10.0,37000.0,3112.0,2018.0,-37.45709,145.52635,21650.0


In [7]:
# Prediction target: the `Price` column of the cleaned frame.
y = melb_data["Price"]

In [8]:
# Feature matrix for the first model. 'Lattitude' and 'Longtitude' are spelled
# exactly as they appear in the dataset's column index.
X = melb_data[ ['Rooms','Bathroom', 'Landsize', 'Lattitude', 'Longtitude'] ]

In [9]:
# Sanity-check the selected features: value ranges and identical counts per column.
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


### II. Model Building

In [10]:
# Decision Tree example: fit a single regression tree on the five selected features.

from sklearn.tree import DecisionTreeRegressor

# random_state is fixed so that refitting produces the same tree on every run.
melb_model = DecisionTreeRegressor(random_state = 1)

# Fitted on all of X and y; no rows are held out for validation in this cell.
melb_model.fit(X, y)


In [11]:
# Show the first five rows of X and the model's predictions for them.
# These rows were part of the data melb_model was fitted on, so this is a
# smoke test of predict(), not an estimate of accuracy on unseen houses.
print("Making predictions for the following 5 houses: ")

print(X.head())
print()

print("The predictions are: ")

print(melb_model.predict( X.head() ) )

Making predictions for the following 5 houses: 
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954

The predictions are: 
[1035000. 1465000. 1600000. 1876000. 1636000.]


### III. Model Validation

In [12]:
from sklearn.metrics import mean_absolute_error as MAE


In [13]:
# Wider feature set: the five columns used for X plus BuildingArea and YearBuilt.
X2 = melb_data[ ['Rooms','Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude'] ]

# Seed the tree (same seed as melb_model) so that re-running the notebook
# rebuilds the same model; an unseeded tree may break split ties differently
# from run to run.
melb_model2 = DecisionTreeRegressor(random_state=1)
melb_model2.fit(X2, y)

# In-sample predictions: the model predicts the very rows it was fitted on.
pred_home_prices = melb_model2.predict(X2)


In [14]:
# In-sample MAE: `pred_home_prices` was predicted on the same rows the model was
# fitted on, so this figure is far more optimistic than the hold-out MAE
# computed after the train/test split further down.
MAE(y, pred_home_prices)

434.71594577146544

In [15]:
from sklearn.model_selection import train_test_split as TTS


In [17]:
# Split features and target into training and validation parts using the
# splitter's default proportions; random_state=0 keeps the split repeatable.
train_X2, test_X2, train_y, test_y = TTS(X2, y, random_state=0)

# Seed the tree as well: with only the split seeded, the fitted tree (and the
# MAE printed below) could still differ between runs.
melb_model3 = DecisionTreeRegressor(random_state=1)

melb_model3.fit(train_X2, train_y)

# Predict only the held-out rows, which the model has never seen.
test_X2_preds = melb_model3.predict( test_X2 )

# Out-of-sample mean absolute error.
print( MAE( test_y, test_X2_preds) )

263679.1562298257


### IV. Underfitting and Overfitting Data

In [18]:
def get_mae(max_leaf_value, train_X_data, test_X_data, train_y_data, test_y_data):
    '''
    Fit a size-limited decision tree and score it on held-out data.

    A DecisionTreeRegressor capped at `max_leaf_value` leaf nodes (with
    random_state=0) is fitted on `train_X_data` / `train_y_data`. The return
    value is the mean absolute error of its predictions for `test_X_data`,
    measured against `test_y_data`.
    '''
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_value)
    tree.fit(train_X_data, train_y_data)
    return MAE(test_y_data, tree.predict(test_X_data))


In [20]:
# Sweep the leaf-node cap from very small trees to very large ones and print
# the hold-out MAE for each setting, to see where the error bottoms out.
for MLN in [5, 50, 100, 250, 500, 1000, 2500, 5000]:
    # my_mae is rebound on every pass, so no explicit `del` is needed.
    my_mae = get_mae(MLN, train_X2, test_X2, train_y, test_y)

    # %d truncates the float MAE to a whole number for display.
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(MLN, my_mae) )
    

Max leaf nodes: 5 		 Mean Absolute Error: 347380
Max leaf nodes: 50 		 Mean Absolute Error: 258171
Max leaf nodes: 100 		 Mean Absolute Error: 248734
Max leaf nodes: 250 		 Mean Absolute Error: 247206
Max leaf nodes: 500 		 Mean Absolute Error: 243495
Max leaf nodes: 1000 		 Mean Absolute Error: 247378
Max leaf nodes: 2500 		 Mean Absolute Error: 255481
Max leaf nodes: 5000 		 Mean Absolute Error: 255575


### V. The Random Forest

In [21]:
from sklearn.ensemble import RandomForestRegressor


In [22]:
# Random forest with default hyperparameters; random_state=1 makes the fit repeatable.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X2, train_y)
melb_preds_fm = forest_model.predict(test_X2)

# Scored on the same hold-out split as the decision trees, so the MAE values
# can be compared directly.
print(MAE(test_y, melb_preds_fm))

191669.7536453626
