In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor 

In [3]:
# Keep the dataset location in a single variable so the path is spelled out only once.
melbourne_file_path = 'melb_data.csv'
# Read through the path variable rather than repeating the literal, so changing
# melbourne_file_path actually changes which file is loaded.
melbourne_data = pd.read_csv(melbourne_file_path)
print(melbourne_data.shape)
# Last expression of the cell: displays the column index.
melbourne_data.columns

(13580, 21)


Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
# Discard every row that has a missing value in any column, then report
# how many rows and columns remain.
melbourne_data = melbourne_data.dropna(axis='index', how='any')
remaining_shape = melbourne_data.shape
print(remaining_shape)

(6196, 21)


In [8]:
# Prediction target: the sale price column.
y = melbourne_data['Price']
print(y.shape)
# Show the first five target values.
print(y.head(5))

(6196,)
1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64


In [6]:
# Columns used as model inputs (spelled exactly as they appear in the CSV header).
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

# Feature matrix: all rows, only the selected columns.
X = melbourne_data.loc[:, melbourne_features]
X.shape

(6196, 5)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for each selected feature column.
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [11]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [12]:
# Fix random_state so the fitted tree is the same on every run.
# fit() returns the estimator itself, so construction and training can be chained.
melbourne_model = DecisionTreeRegressor(random_state=1).fit(X, y)
# Last expression of the cell: displays the fitted estimator.
melbourne_model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [13]:
# Take the first five houses once and reuse them for both the listing and the prediction.
first_houses = X.head()
first_predictions = melbourne_model.predict(first_houses)
print(f'predictions for the following 5 houses: {first_houses}')
print(f'the predicitons are {first_predictions}')

predictions for the following 5 houses:    Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
the predicitons are [1035000. 1465000. 1600000. 1876000. 1636000.]


In [14]:
# Predictions for the first five houses, followed by their actual prices.
# The model was fitted on these same rows, so this is an in-sample comparison.
print(melbourne_model.predict(X.iloc[:5]))
print(y.head(5))

[1035000. 1465000. 1600000. 1876000. 1636000.]
1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64


In [15]:
# In-sample error: predictions are made on the same rows the model was trained on.
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

1115.7467183128902

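Now let's repeat the evaluation with the data split into training and validation sets, so the error is measured on houses the model was not fitted on: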

In [16]:
# Hold out part of the data for validation; random_state fixes which rows land in each part.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Define the model. random_state is set (same value as the first tree in this
# notebook) so the fitted tree, and therefore the validation MAE printed below,
# is the same on every run; without it, tie-breaking between equally good
# splits can differ from run to run.
melbourne_model = DecisionTreeRegressor(random_state = 1)

# Fit on the training rows only.
melbourne_model.fit(train_X, train_y)

# Predict prices for the held-out rows and report the mean absolute error.
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

274364.91542930924


The next step is to avoid overfitting and underfitting.
For a decision tree, one way to do this is to limit the number of leaf nodes:

In [17]:
# make the function

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A ``DecisionTreeRegressor`` with the given ``max_leaf_nodes`` (and
    ``random_state=0``) is fitted on ``train_X`` / ``train_y``; its
    predictions for ``val_X`` are then scored against ``val_y``.

    Parameters
    ----------
    max_leaf_nodes : value passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y : training features and targets.
    val_X, val_y : validation features and targets.

    Returns
    -------
    The value returned by ``mean_absolute_error(val_y, predictions)``.
    """
    # fit() returns the estimator, so the tree can be built and trained in one expression.
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [18]:
# Sweep several tree sizes and report the validation MAE for each one.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
    print(f'max leaf nodes: {max_leaf_nodes}, mean absolute error: {my_mae}')

max leaf nodes: 5, mean absolute error: 385696.54278937966
max leaf nodes: 50, mean absolute error: 279794.61143891385
max leaf nodes: 500, mean absolute error: 261718.1134423186
max leaf nodes: 5000, mean absolute error: 271996.1207230471


Now it is time to try a random forest:

In [12]:
# Re-split the data. NOTE: this split uses random_state=1, whereas the
# decision-tree cells split with random_state=0, so the MAE below is measured
# on a different set of validation rows than the tree results.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# fit() returns the estimator, so construction and training are chained.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Score the forest on the held-out rows.
melb_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, melb_preds)
print(forest_mae)

198635.2808478588


