In [2]:
import pandas as pd

# Read the housing records from melb_data.csv into a DataFrame. The path is
# relative, so the file must be in the notebook's working directory.
melb_housing = pd.read_csv("melb_data.csv")
# Bare last expression: the notebook renders the column labels as cell output.
melb_housing.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [3]:
# Print a per-column summary (non-null count and dtype). The non-null counts
# show which columns have missing values, e.g. BuildingArea and YearBuilt.
melb_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [8]:
# Year against which a construction year is converted into an age in years.
REFERENCE_YEAR = 2024

# Range of room counts.
min_room = melb_housing["Rooms"].min()
max_room = melb_housing["Rooms"].max()

# Sale price: extremes and arithmetic mean.
min_price = melb_housing["Price"].min()
avg_price = melb_housing["Price"].mean()
max_price = melb_housing["Price"].max()

# Land size: extremes and arithmetic mean.
min_land = melb_housing["Landsize"].min()
avg_land = melb_housing["Landsize"].mean()
max_land = melb_housing["Landsize"].max()

# The earliest construction year gives the oldest house, the latest the youngest.
# NOTE(review): the recorded output reports an oldest age of 828.0 years, which
# implies a minimum YearBuilt of 1196 -- looks like a data-entry error; confirm
# before relying on this figure.
oldest = REFERENCE_YEAR - melb_housing["YearBuilt"].min()
youngest = REFERENCE_YEAR - melb_housing["YearBuilt"].max()

# Emit the plain "label value" lines in a fixed order.
for _label, _value in (
    ("Min No. of Rooms:", min_room),
    ("Max No. of Rooms:", max_room),
    ("Min Price:", min_price),
    ("Average Price:", avg_price),
    ("Max Price:", max_price),
    ("Min Landsize:", min_land),
    ("Average Landsize:", avg_land),
    ("Max Landsize:", max_land),
):
    print(_label, _value)

print("The oldest house is", oldest, "years old")
print("The youngest house is", youngest, "years old")

Min No. of Rooms: 1
Max No. of Rooms: 10
Min Price: 85000.0
Average Price: 1075684.079455081
Max Price: 9000000.0
Min Landsize: 0.0
Average Landsize: 558.4161266568483
Max Landsize: 433014.0
The oldest house is 828.0 years old
The youngest house is 6.0 years old


In [9]:
# Prediction target: the Price column of the housing frame.
y = melb_housing["Price"]
# Bare last expression so the notebook displays the target series.
y

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
           ...    
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, Length: 13580, dtype: float64

In [11]:
# Feature selection: the columns used as model inputs. The names must match the
# CSV header exactly, including its "Lattitude" / "Longtitude" spellings.
melb_features = [
    "Rooms",
    "Bathroom",
    "Landsize",
    "Lattitude",
    "Longtitude",
]
# Feature matrix restricted to the selected columns.
X = melb_housing[melb_features]
# Bare last expression so the notebook displays the feature frame.
X

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.79960,144.99840
1,2,1.0,156.0,-37.80790,144.99340
2,3,2.0,134.0,-37.80930,144.99440
3,3,2.0,94.0,-37.79690,144.99690
4,4,1.0,120.0,-37.80720,144.99410
...,...,...,...,...,...
13575,4,2.0,652.0,-37.90562,145.16761
13576,3,2.0,333.0,-37.85927,144.87904
13577,3,2.0,436.0,-37.85274,144.88738
13578,4,1.0,866.0,-37.85908,144.89299


In [12]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor; random_state is fixed so repeated runs build the same tree.
melb_model = DecisionTreeRegressor(random_state=1)
# Fit on the complete feature matrix X and target y -- no rows are held out in this cell.
melb_model.fit(X, y)

In [18]:
from sklearn.metrics import mean_absolute_error

# Show the feature frame the predictions are made from (header, then the frame).
print("Making predictions based on the following feature selected dataset:", X, sep="\n")

# Predict a price for every row of X.
# NOTE: when melb_model is the tree fitted on (X, y) in the earlier cell, these
# are the same rows the model was trained on, so the error below is an
# in-sample score and understates the error to expect on unseen houses.
predictions = melb_model.predict(X)
print("Predicted values of the house price:", predictions, sep="\n")

# Mean absolute difference between the actual and predicted prices.
error = mean_absolute_error(y_true=y, y_pred=predictions)
print("You are off by $", round(error, 2))

Making predictions based on the following feature selected dataset:
       Rooms  Bathroom  Landsize  Lattitude  Longtitude
0          2       1.0     202.0  -37.79960   144.99840
1          2       1.0     156.0  -37.80790   144.99340
2          3       2.0     134.0  -37.80930   144.99440
3          3       2.0      94.0  -37.79690   144.99690
4          4       1.0     120.0  -37.80720   144.99410
...      ...       ...       ...        ...         ...
13575      4       2.0     652.0  -37.90562   145.16761
13576      3       2.0     333.0  -37.85927   144.87904
13577      3       2.0     436.0  -37.85274   144.88738
13578      4       1.0     866.0  -37.85908   144.89299
13579      4       1.0     362.0  -37.81188   144.88449

[13580 rows x 5 columns]
Predicted values of the house price:
[1480000. 1035000. 1465000. ... 1170000. 2500000. 1285000.]
You are off by $ 1125.18


In [19]:
from sklearn.model_selection import train_test_split

# Split features and target into training and validation parts; the fixed
# random_state makes the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Rebind melb_model to a fresh tree trained on the training part only
# (fit returns the estimator itself, so the call can be chained).
melb_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
# Predict prices for the rows the model did not see during fitting.
predictions = melb_model.predict(val_X)

# Validation mean absolute error; bare last expression so the notebook displays it.
mean_absolute_error(y_true=val_y, y_pred=predictions)

241632.16966126655