In [1]:
#!/usr/bin/env python
import pandas as pd

In [2]:
# Path to the Melbourne housing CSV file
melbourne_file_path = "melb_data.csv"
# Load the CSV into a pandas DataFrame
melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data.columns  # show the column labels (the available features) of the dataset

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
# dropna(axis=0) removes every ROW that contains at least one missing value
# (think of "na" as "not available"); the set of columns is left unchanged.
# NOTE: melbourne_data is rebound here, so the unfiltered frame is no longer
# reachable under this name after the cell has run.
melbourne_data = melbourne_data.dropna(axis=0) 
print(melbourne_data)

             Suburb          Address  Rooms Type      Price Method  \
1        Abbotsford  25 Bloomburg St      2    h  1035000.0      S   
2        Abbotsford     5 Charles St      3    h  1465000.0     SP   
4        Abbotsford      55a Park St      4    h  1600000.0     VB   
6        Abbotsford     124 Yarra St      3    h  1876000.0      S   
7        Abbotsford    98 Charles St      2    h  1636000.0      S   
...             ...              ...    ...  ...        ...    ...   
12205    Whittlesea    30 Sherwin St      3    h   601000.0      S   
12206  Williamstown      75 Cecil St      3    h  1050000.0     VB   
12207  Williamstown    2/29 Dover Rd      1    u   385000.0     SP   
12209       Windsor  201/152 Peel St      2    u   560000.0     PI   
12212    Yarraville  54 Pentland Pde      6    h  2450000.0     VB   

             SellerG        Date  Distance  Postcode  ...  Bathroom  Car  \
1             Biggin   4/02/2016       2.5    3067.0  ...       1.0  0.0   
2      

In [6]:
# Prediction target: the sale price of each house.
# The 'Price' column is pulled out of the DataFrame by its label.
y = melbourne_data["Price"]
print(y)

1        1035000.0
2        1465000.0
4        1600000.0
6        1876000.0
7        1636000.0
           ...    
12205     601000.0
12206    1050000.0
12207     385000.0
12209     560000.0
12212    2450000.0
Name: Price, Length: 6196, dtype: float64


In [11]:
# Columns used as model inputs (features).
# 'Lattitude' and 'Longtitude' are spelled exactly as they appear in the
# dataset's column index, so they must not be "corrected" here.
melbourne_feature = [
    "Rooms",
    "Bathroom",
    "Landsize",
    "Lattitude",
    "Longtitude",
]

In [12]:
# X holds only the selected feature columns of melbourne_data.
# melbourne_data itself is not modified and still contains every column.
X = melbourne_data[melbourne_feature]
print(X)  # check that only the selected feature columns remain



       Rooms  Bathroom  Landsize  Lattitude  Longtitude
1          2       1.0     156.0  -37.80790   144.99340
2          3       2.0     134.0  -37.80930   144.99440
4          4       1.0     120.0  -37.80720   144.99410
6          3       2.0     245.0  -37.80240   144.99930
7          2       1.0     256.0  -37.80600   144.99540
...      ...       ...       ...        ...         ...
12205      3       2.0     972.0  -37.51232   145.13282
12206      3       1.0     179.0  -37.86558   144.90474
12207      1       1.0       0.0  -37.85588   144.89936
12209      2       1.0       0.0  -37.85581   144.99025
12212      6       3.0    1087.0  -37.81038   144.89389

[6196 rows x 5 columns]


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column in X
X.describe()


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [14]:
X.head()  # display the first rows of X (five by default)


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [15]:
# Import the decision-tree regressor from scikit-learn (package name: sklearn).
# This cell only imports the class; the model is defined and fitted in later cells.
from sklearn.tree import DecisionTreeRegressor


In [16]:
# Define the model. Specifying random_state ensures the same results on each run.
# Any number can be used; model quality won't depend meaningfully on exactly which value is chosen.
melbourne_model = DecisionTreeRegressor(random_state = 1)

In [17]:
# Fit the decision tree on the feature columns X, with the house prices y as the target
melbourne_model.fit(X,y)

In [18]:
# Predict prices for the first five rows of X and print them next to their inputs.
# NOTE: these rows are part of the data the model was fitted on, so this is an
# in-sample check only; it does not measure accuracy on houses the model has not seen.
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(melbourne_model.predict(X.head()))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]
