In [1]:
# Roadmap for the notebook: the six stages of building a machine learning model,
# kept as an ordered list of human-readable step descriptions.
building_machine_learning_model = [
    '1. Set up the development environment',
    '2. Import the dataset',
    '3. Scrub the dataset',
    '4. Split the data into training and test data',
    '5. Select an algorithm and configure its hyperparameters',
    '6. Evaluate the results',
]

# Bare expression so the list is rendered as the cell output.
building_machine_learning_model

['1. Set up the development environment',
 '2. Import the dataset',
 '3. Scrub the dataset',
 '4. Split the data into training and test data',
 '5. Select an algorithm and configure its hyperparameters',
 '6. Evaluate the results']

In [2]:
# Import requisite modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn # Import sci-kit learn

In [3]:
# The dataset is distributed as a zip archive, so unpack it before loading.
from zipfile import ZipFile

# Extract every member of the archive into the 'melbourne_housing_data' directory;
# the context manager closes the archive once extraction is done.
with ZipFile('Melbourne_housing_FULL.csv.zip') as archive:
    archive.extractall(path='melbourne_housing_data')
    


In [4]:
# Load the extracted Melbourne housing CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='melbourne_housing_data/melbourne_housing_full.csv')

# Bare expression: render a preview of the raw table as the cell output.
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


In [5]:
# Remove columns that the rest of the notebook does not use.
# The deletion happens in place on `df`, one column at a time and in this order.
# ('Lattitude' and 'Longtitude' are spelled this way in the dataset's own header.)
for column in (
    'Address',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'Lattitude',
    'Longtitude',
    'Regionname',
    'Propertycount',
):
    del df[column]

In [6]:
# Drop, in place, every row that has a missing value in any remaining column.
# dropna's defaults (axis=0, how='any', subset=None) already describe exactly that.
df.dropna(inplace=True)

# Show the cleaned table.
df

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
6,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council
11,Abbotsford,3,h,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra City Council
14,Abbotsford,2,h,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra City Council
...,...,...,...,...,...,...,...,...,...,...,...,...
34847,Wollert,3,h,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,Whittlesea City Council
34849,Wollert,3,h,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,Whittlesea City Council
34853,Yarraville,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council
34854,Yarraville,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council


In [7]:
# Re-read the untouched CSV to get the row count of the dataset before any cleaning.
df_original = pd.read_csv('melbourne_housing_data/melbourne_housing_full.csv')

# Number of rows in the original data.
df_original.shape[0]

34857

In [8]:
# Row count of the working DataFrame now that rows with NaN values have been removed.
df.shape[0]

8895

In [9]:
# List the column labels that are left in the working DataFrame.
df.columns.tolist()

['Suburb',
 'Rooms',
 'Type',
 'Price',
 'Distance',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'CouncilArea']

In [10]:
# One-hot encode the three text-valued columns; each is replaced by
# indicator columns prefixed with the original column name.
features_df = pd.get_dummies(
    data=df,
    columns=['Suburb', 'CouncilArea', 'Type'],
)

# Preview the encoded frame.
features_df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,...,0,0,0,0,0,1,0,1,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,...,0,0,0,0,0,1,0,1,0,0
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,...,0,0,0,0,0,1,0,1,0,0
11,3,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,...,0,0,0,0,0,1,0,1,0,0
14,2,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34847,3,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,0,...,0,0,0,1,0,0,0,1,0,0
34849,3,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,0,...,0,0,0,1,0,0,0,1,0,0
34853,2,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,0,...,0,0,0,0,0,0,0,1,0,0
34854,2,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Price is the value to be predicted (the target), so remove it from the
# feature frame in place; `df` still holds the Price column.
features_df.drop(columns='Price', inplace=True)

In [12]:
# Independent variables (the features).
# Plain assignment: X is another name for features_df, not a copy of it.
X = features_df

# Display the feature matrix.
X

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,Suburb_Aberfeldie,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
2,2,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,0,...,0,0,0,0,0,1,0,1,0,0
4,3,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,0,...,0,0,0,0,0,1,0,1,0,0
6,4,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,0,...,0,0,0,0,0,1,0,1,0,0
11,3,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,0,...,0,0,0,0,0,1,0,1,0,0
14,2,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34847,3,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,0,0,...,0,0,0,1,0,0,0,1,0,0
34849,3,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,0,0,...,0,0,0,1,0,0,0,1,0,0
34853,2,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,0,0,...,0,0,0,0,0,0,0,1,0,0
34854,2,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [13]:
# Dependent variable (the target): the Price column of the cleaned DataFrame.
y = df.loc[:, 'Price']

# Display the target series.
y

2        1035000.0
4        1465000.0
6        1600000.0
11       1876000.0
14       1636000.0
           ...    
34847     500000.0
34849     570000.0
34853     888000.0
34854     705000.0
34856    1020000.0
Name: Price, Length: 8895, dtype: float64

In [14]:
# Split features and target into training and test sets.
from sklearn.model_selection import train_test_split

# test_size=0.3 holds out 30% of the rows for testing;
# random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [15]:
# Import Gradient Boosting Regression model
from sklearn.ensemble import GradientBoostingRegressor

In [16]:
# Seed NumPy's global random generator; no random_state is passed to the
# estimator, so this is the only seeding done in this cell.
np.random.seed(42)

# Hyperparameters for the first gradient boosting model.
gbr_params = dict(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=30,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='huber',
)
model = GradientBoostingRegressor(**gbr_params)

# Fit the model to our training data.
model.fit(X_train, y_train)

# Score on the held-out test split (for a regressor this is the R^2 value).
model.score(X_test, y_test)

In [17]:
# model.fit(X_train, y_train) # Fit the model to our training data

In [18]:
# Calculate the mean absolute error (MAE) of our model on the training data.
# NOTE: the result is stored under the name `mse`, which is kept unchanged,
# but the value is a mean ABSOLUTE error, not a mean squared error.
from sklearn.metrics import mean_absolute_error as mae

# Predict on the same rows the model was fitted on, then compare with the true prices.
train_predictions = model.predict(X_train)
mse = mae(y_train, train_predictions)
print("Training Set Mean Absolute Error:%.2f"%mse)

Training Set Mean Absolute Error:29314.72


In [19]:
# Calculate the mean absolute error (MAE) on the testing data.
# NOTE: despite the name `mse_test` (kept unchanged), the value is a mean
# ABSOLUTE error, computed with the `mae` alias of sklearn's mean_absolute_error.

# Predict on the held-out rows, then compare with the true prices.
test_predictions = model.predict(X_test)
mse_test = mae(y_test, test_predictions)
print("Testing set mean absolute error\n",
     mse_test)

Testing set mean absolute error
 164159.23338633685


In [20]:
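# Test-set score of the model (currently commented out).
# NOTE(review): the value displayed below this cell appears to come from an earlier run in which the line was active -- confirm.
# model.score(X_test, y_test)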

0.8200792852990676

In [22]:
## Using random forest regressor
# Reuse the same inputs for the second model:
#   X2 -> the one-hot encoded feature frame (same object as features_df)
#   y2 -> the Price column of the cleaned DataFrame (the target)
X2, y2 = features_df, df['Price']



In [23]:
# Random forest regressor from scikit-learn's ensemble module.
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random generator; neither the split nor the estimator
# below is given a random_state of its own.
np.random.seed(42)

# Hold out 20% of the rows for testing.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2)

# Build the forest with default settings and fit it on the training rows
# (fit returns the estimator itself). Note that this rebinds the name `model`.
model = RandomForestRegressor().fit(X_train2, y_train2)

# Score on the held-out rows (R^2 for a regressor).
model.score(X_test2, y_test2)



0.8094978629223825

In [24]:
# Mean absolute error of the random forest on its own training rows.
from sklearn.metrics import mean_absolute_error as mae

# The name says "mse", but the metric computed here is the mean absolute error.
rf_train_predictions = model.predict(X_train2)
mse_random_forest = mae(y_train2, rf_train_predictions)
print("Training Set Mean Absolute Error for Random Forest:\n", mse_random_forest)

Training Set Mean Absolute Error for Random Forest:
 63797.850688990606


In [25]:
# Mean absolute error of the random forest on the held-out test rows
# (the variable name says "mse", but the metric is the mean absolute error).
rf_test_predictions = model.predict(X_test2)
mse_random_forest2 = mae(y_test2, rf_test_predictions)
print("Testing Set Mean Absolute Error for Random Forest:\n", mse_random_forest2)

Testing Set Mean Absolute Error for Random Forest:
 166550.81081104957


## Model optimization

In [26]:
# Independent variables (the features) for the optimization run.
# Plain assignment: X refers to the same object as features_df, not a copy.
X = features_df

# Display the feature matrix.
X

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,Suburb_Aberfeldie,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
2,2,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,0,...,0,0,0,0,0,1,0,1,0,0
4,3,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,0,...,0,0,0,0,0,1,0,1,0,0
6,4,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,0,...,0,0,0,0,0,1,0,1,0,0
11,3,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,0,...,0,0,0,0,0,1,0,1,0,0
14,2,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34847,3,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,0,0,...,0,0,0,1,0,0,0,1,0,0
34849,3,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,0,0,...,0,0,0,1,0,0,0,1,0,0
34853,2,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,0,0,...,0,0,0,0,0,0,0,1,0,0
34854,2,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [27]:
# Dependent variable (the target) for the optimization run: the Price column.
y = df.loc[:, 'Price']

# Display the target series.
y

2        1035000.0
4        1465000.0
6        1600000.0
11       1876000.0
14       1636000.0
           ...    
34847     500000.0
34849     570000.0
34853     888000.0
34854     705000.0
34856    1020000.0
Name: Price, Length: 8895, dtype: float64

In [32]:
# Split features and target into training and test sets for the optimization run.
from sklearn.model_selection import train_test_split

# test_size=0.3 holds out 30% of the rows; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [36]:
# Gradient boosting regressor from scikit-learn's ensemble module.
from sklearn.ensemble import GradientBoostingRegressor

# Seed NumPy's global random generator; no random_state is passed to the estimator.
np.random.seed(42)

# Hyperparameters for this run: 250 boosting stages with trees of depth 5.
gbr_params = dict(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='huber',
)
model = GradientBoostingRegressor(**gbr_params)

# Fit the model to our training data.
model.fit(X_train, y_train)

# Score on the held-out test split (for a regressor this is the R^2 value).
model.score(X_test, y_test)

0.831956903732383

In [37]:
# Calculate the mean absolute error (MAE) of our model on the training data.
# NOTE: the result is stored under the name `mse`, which is kept unchanged,
# but the value is a mean ABSOLUTE error, not a mean squared error.
from sklearn.metrics import mean_absolute_error as mae

# Predict on the same rows the model was fitted on, then compare with the true prices.
train_predictions = model.predict(X_train)
mse = mae(y_train, train_predictions)
print("Training Set Mean Absolute Error:%.2f"%mse)

Training Set Mean Absolute Error:124068.34


In [38]:
# Calculate the mean absolute error (MAE) on the testing data.
# NOTE: despite the name `mse_test` (kept unchanged), the value is a mean
# ABSOLUTE error, computed with the `mae` alias of sklearn's mean_absolute_error.

# Predict on the held-out rows, then compare with the true prices.
test_predictions = model.predict(X_test)
mse_test = mae(y_test, test_predictions)
print("Testing set mean absolute error\n",
     mse_test)

Testing set mean absolute error
 158497.09334912864
