
<div align="center">
  <h1>Melbourne Housing Prediction</h1>
</div>


### Importing the Dependencies

In [1]:
#  importing all the required libraries
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#Loading dataset 
df = pd.read_csv('./datasets/melb_data.csv')

pd.set_option('display.max_columns', 100)

In [3]:
#checking head count
df.head


<bound method NDFrame.head of               Suburb           Address  Rooms Type      Price Method  \
0         Abbotsford      85 Turner St      2    h  1480000.0      S   
1         Abbotsford   25 Bloomburg St      2    h  1035000.0      S   
2         Abbotsford      5 Charles St      3    h  1465000.0     SP   
3         Abbotsford  40 Federation La      3    h   850000.0     PI   
4         Abbotsford       55a Park St      4    h  1600000.0     VB   
...              ...               ...    ...  ...        ...    ...   
13575  Wheelers Hill      12 Strada Cr      4    h  1245000.0      S   
13576   Williamstown     77 Merrett Dr      3    h  1031000.0     SP   
13577   Williamstown       83 Power St      3    h  1170000.0      S   
13578   Williamstown      96 Verdon St      4    h  2500000.0     PI   
13579     Yarraville        6 Agnes St      4    h  1285000.0     SP   

        SellerG        Date  Distance  Postcode  Bedroom2  Bathroom  Car  \
0        Biggin   3/12/2016  

In [4]:
#checking shape & is there any null values 
df.shape #(13580, 21)
df.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [5]:
#describe table
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,13580.0,2.937997,0.955748,1.0,2.0,3.0,3.0,10.0
Price,13580.0,1075684.0,639310.724296,85000.0,650000.0,903000.0,1330000.0,9000000.0
Distance,13580.0,10.13778,5.868725,0.0,6.1,9.2,13.0,48.1
Postcode,13580.0,3105.302,90.676964,3000.0,3044.0,3084.0,3148.0,3977.0
Bedroom2,13580.0,2.914728,0.965921,0.0,2.0,3.0,3.0,20.0
Bathroom,13580.0,1.534242,0.691712,0.0,1.0,1.0,2.0,8.0
Car,13518.0,1.610075,0.962634,0.0,1.0,2.0,2.0,10.0
Landsize,13580.0,558.4161,3990.669241,0.0,177.0,440.0,651.0,433014.0
BuildingArea,7130.0,151.9676,541.014538,0.0,93.0,126.0,174.0,44515.0
YearBuilt,8205.0,1964.684,37.273762,1196.0,1940.0,1970.0,1999.0,2018.0


In [6]:
#desribe with info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

* spam - 0
 
* ham - 1

In [7]:
# number of unique values in each column
df.nunique()

Suburb             314
Address          13378
Rooms                9
Type                 3
Price             2204
Method               5
SellerG            268
Date                58
Distance           202
Postcode           198
Bedroom2            12
Bathroom             9
Car                 11
Landsize          1448
BuildingArea       602
YearBuilt          144
CouncilArea         33
Lattitude         6503
Longtitude        7063
Regionname           8
Propertycount      311
dtype: int64

In [8]:
#dropping coloumns
df = df.drop(columns=['BuildingArea', 'YearBuilt', 'CouncilArea'])

In [9]:
df.isnull().sum()

Suburb            0
Address           0
Rooms             0
Type              0
Price             0
Method            0
SellerG           0
Date              0
Distance          0
Postcode          0
Bedroom2          0
Bathroom          0
Car              62
Landsize          0
Lattitude         0
Longtitude        0
Regionname        0
Propertycount     0
dtype: int64

In [10]:
#Dropping the car column
df = df.drop(columns=['Car'])

In [11]:
low_cardinality_col = [col for col in df.columns if df[col].dtype ==
object and df[col].nunique() < 10]
num_col = [col for col in df.columns if df[col].dtype in [int, float]]

In [12]:
X = df[low_cardinality_col + num_col].drop(columns='Price')
X.tail()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
13575,h,S,South-Eastern Metropolitan,4,16.7,3150.0,4.0,2.0,652.0,-37.90562,145.16761,7392.0
13576,h,SP,Western Metropolitan,3,6.8,3016.0,3.0,2.0,333.0,-37.85927,144.87904,6380.0
13577,h,S,Western Metropolitan,3,6.8,3016.0,3.0,2.0,436.0,-37.85274,144.88738,6380.0
13578,h,PI,Western Metropolitan,4,6.8,3016.0,4.0,1.0,866.0,-37.85908,144.89299,6380.0
13579,h,SP,Western Metropolitan,4,6.3,3013.0,4.0,1.0,362.0,-37.81188,144.88449,6543.0


In [13]:
X[low_cardinality_col].nunique()

Type          3
Method        5
Regionname    8
dtype: int64

In [14]:
#LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['Type'] = le.fit_transform(df['Type'])
X['Method'] = le.fit_transform(df['Method'])
X['Regionname'] = le.fit_transform(df['Regionname'])

In [16]:
# sklearn.model_selection
from sklearn.model_selection import train_test_split
# ensure target vector `y` is defined before splitting
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)
X_train.shape, X_test.shape #((10864, 12), (2716, 12))
y_train.shape, y_test.shape #((10864,), (2716,))

((10864,), (2716,))

In [19]:
#y = df['Price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)



In [20]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_prediction = dtr.predict(X_test)

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, dtr_prediction)
mae

221954.68335787923

In [21]:
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

def rf_mae(X_train, X_test, y_train, y_test):
	rfr = RandomForestRegressor()
	rfr.fit(X_train, y_train)
	prediction = rfr.predict(X_test)
	rfr_mae = mean_absolute_error(y_test, prediction)
	return rfr_mae


In [22]:
rf_mae(X_train, X_test, y_train, y_test)

169853.96979469105