# Random Forest - Melbourne Housing

In [40]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
# Path of the file to read
iowa_file_path = 'melb_data.csv'

home_data = pd.read_csv(iowa_file_path)
home_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [12]:
home_data

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


In [6]:
home_data_dummy = pd.get_dummies(home_data)
home_data_dummy

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,...,CouncilArea_Yarra,CouncilArea_Yarra Ranges,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
0,2,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,,,...,1,0,0,0,1,0,0,0,0,0
1,2,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,...,1,0,0,0,1,0,0,0,0,0
2,3,1465000.0,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,...,1,0,0,0,1,0,0,0,0,0
3,3,850000.0,2.5,3067.0,3.0,2.0,1.0,94.0,,,...,1,0,0,0,1,0,0,0,0,0
4,4,1600000.0,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1245000.0,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,...,0,0,0,0,0,0,1,0,0,0
13576,3,1031000.0,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,...,0,0,0,0,0,0,0,0,1,0
13577,3,1170000.0,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,...,0,0,0,0,0,0,0,0,1,0
13578,4,2500000.0,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,...,0,0,0,0,0,0,0,0,1,0


In [54]:
def define_test_train_validation(df, features):
    # Target Objective
    y = df.Price
    # Features - fill in nans with mean, code string data
    missing_val_count_by_column = (df.isnull().sum())
    missing_values = missing_val_count_by_column[missing_val_count_by_column > 0].apply(str) + f'/{df.shape[0]} missing'

    print(f'There are {df.shape[0]} rows and {df.shape[1]} columns.\n')
    print('Missing Values:\n')
    print(missing_values)
    X = pd.get_dummies(df[features]).apply(lambda x: x.fillna(x.mean()),axis=0)
    # Split Data
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    return train_X, val_X, train_y, val_y


def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)

    print(f"Validation MAE for best value of {max_leaf_nodes} max leaf nodes: {mae:,.0f}")
    return mae


def find_best_tree_size(candidate_max_leaf_nodes):
    # Write loop to find the ideal tree size from candidate_max_leaf_nodes
    scores = {leaf_size:get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
    # Store the best value of max_leaf_nodes
    best_tree_size = min(scores, key=scores.get)
    return best_tree_size

def final_model(best_tree_size):
    final_model = RandomForestRegressor(max_leaf_nodes=best_tree_size, random_state=1)
    final_model.fit(train_X, train_y)
    final_model_predictions = final_model.predict(val_X)
    final_model_mae = mean_absolute_error(final_model_predictions, val_y)
    print(f"Validation MAE for Random Forest Model: {final_model_mae}")
    return final_model_predictions


features = ['Rooms', 'YearBuilt', 'Bathroom', 'Car', 'Landsize', 
            'BuildingArea', 'Lattitude', 'Longtitude', 'Postcode',
            'Method', 'SellerG'
            ]
train_X, val_X, train_y, val_y = define_test_train_validation(home_data, features)

candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500, 1000]
#best_tree_size = find_best_tree_size(candidate_max_leaf_nodes)

#final_model_predictions = final_model(best_tree_size)

There are 13580 rows and 21 columns.

Missing Values:

Car               62/13580 missing
BuildingArea    6450/13580 missing
YearBuilt       5375/13580 missing
CouncilArea     1369/13580 missing
dtype: object


In [30]:
final_model_predictions

array([1862460.,  842780., 2220310., ..., 1723195., 1273740.,  500610.])

In [36]:
val_y

321      1640000.0
4003      675000.0
13348    2800000.0
2697      615000.0
12600    2700000.0
           ...    
11703    1515000.0
7756      775000.0
4076     1630000.0
12923    1240000.0
6892      378000.0
Name: Price, Length: 3395, dtype: float64