## Importing the libraries

In [1]:
# Importing libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report

## Importing the data

In [2]:
# Load the full Melbourne housing dataset (rows with and without a Price) from a
# path relative to the notebook's working directory.
data = pd.read_csv("Dataset/Melbourne_housing_dataset_full.csv")
# Display (rows, columns) of the raw frame.
data.shape

(34857, 21)

In [3]:
# Preview the first rows of the raw dataset.
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


## Splitting data into test and train

In [4]:
# Rows that have a Price value form the training split; rows with an empty
# Price are placed in the test split in a separate cell.
has_price = data['Price'].notna()
train_data = data[has_price]
train_data.shape

(27247, 21)

In [5]:
# Rows whose Price is missing form the test split.
price_missing = data['Price'].isnull()
test_data = data[price_missing]
test_data.shape

(7610, 21)

In [6]:
# Saving train_data to file.
# The output directory is relative to the notebook's working directory (like the
# dataset read) rather than a user-specific absolute path, so the notebook runs
# on any machine. The trailing separator keeps `path + filename` working.
path = os.path.join('Output', '')
os.makedirs(path, exist_ok=True)  # create the directory on first run
train_data.to_csv(path+'train_data.csv', index=False)
print('Saved file to disk.')

Saved file to disk.


In [7]:
# Saving test_data to file.
# The output directory is relative to the notebook's working directory (like the
# dataset read) rather than a user-specific absolute path, so the notebook runs
# on any machine. The trailing separator keeps `path + filename` working.
path = os.path.join('Output', '')
os.makedirs(path, exist_ok=True)  # create the directory on first run
test_data.to_csv(path+'test_data.csv', index=False)
print('Saved file to disk.')

Saved file to disk.


## Inspecting the data

In [8]:
# Preview the first rows of the training split (rows with a Price).
train_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019.0
6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019.0


In [9]:
# List the dtype of every column in the training split.
train_data.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [10]:
# Preview the first rows of the test split (rows without a Price).
test_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
7,Abbotsford,16 Maugie St,4,h,,SN,Nelson,6/08/2016,2.5,3067.0,...,2.0,2.0,400.0,220.0,2006.0,Yarra City Council,-37.7965,144.9965,Northern Metropolitan,4019.0
8,Abbotsford,53 Turner St,2,h,,S,Biggin,6/08/2016,2.5,3067.0,...,1.0,2.0,201.0,,1900.0,Yarra City Council,-37.7995,144.9974,Northern Metropolitan,4019.0
9,Abbotsford,99 Turner St,2,h,,S,Collins,6/08/2016,2.5,3067.0,...,2.0,1.0,202.0,,1900.0,Yarra City Council,-37.7996,144.9989,Northern Metropolitan,4019.0


In [11]:
# Count missing values per column in the training split.
train_data.isna().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Price                0
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          6441
Bathroom          6447
Car               6824
Landsize          9265
BuildingArea     16591
YearBuilt        15163
CouncilArea          3
Lattitude         6254
Longtitude        6254
Regionname           3
Propertycount        3
dtype: int64

In [12]:
# Count missing values per column in the test split.
test_data.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price            7610
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2         1776
Bathroom         1779
Car              1904
Landsize         2545
BuildingArea     4524
YearBuilt        4143
CouncilArea         0
Lattitude        1722
Longtitude       1722
Regionname          0
Propertycount       0
dtype: int64

In [13]:
# Count distinct values per column in the training split.
train_data.nunique()

Suburb             345
Address          26751
Rooms               12
Type                 3
Price             2871
Method               5
SellerG            349
Date                78
Distance           213
Postcode           209
Bedroom2            14
Bathroom            10
Car                 13
Landsize          1557
BuildingArea       662
YearBuilt          151
CouncilArea         33
Lattitude        11366
Longtitude       12275
Regionname           8
Propertycount      336
dtype: int64

In [14]:
# Count distinct values per column in the test split.
test_data.nunique()

Suburb            312
Address          7532
Rooms              11
Type                3
Price               0
Method              9
SellerG           258
Date               78
Distance          197
Postcode          193
Bedroom2           11
Bathroom            9
Car                14
Landsize         1140
BuildingArea      448
YearBuilt         137
CouncilArea        33
Lattitude        4329
Longtitude       4463
Regionname          8
Propertycount     309
dtype: int64

## Data Preprocessing

In [15]:
# Columns to drop before modelling. In the training split, 'BuildingArea' and
# 'YearBuilt' are missing in more than half of the rows, and 'Address' is
# almost unique per row (see the isna/nunique summaries).
col_list = ['Address', 'BuildingArea', 'YearBuilt']

In [16]:
# Keep a copy of the columns that are about to be dropped, taken from the raw frame.
drop_data = data[col_list]
# Saving dropped data to file.
# The output directory is relative to the notebook's working directory (like the
# dataset read) rather than a user-specific absolute path, so the notebook runs
# on any machine. The trailing separator keeps `path + filename` working.
path = os.path.join('Output', '')
os.makedirs(path, exist_ok=True)  # create the directory on first run
drop_data.to_csv(path+'drop.csv', index=False)
print('Saved file to disk.')

Saved file to disk.


In [17]:
# Remove the columns listed in col_list from both splits. The test split also
# loses 'Price', which is empty for every test row.
train_data = train_data.drop(columns=col_list)
test_data = test_data.drop(columns=col_list).drop(columns='Price')

In [18]:
# Function to convert date strings to datetime in both splits
def conv_date(date_list):
    """Convert the given columns of train_data and test_data to datetime.

    The dataset stores dates day-first (e.g. ``3/09/2016`` is 3 September 2016),
    so ``dayfirst=True`` is required; without it pandas reads ambiguous values
    month-first and the derived month/year features are wrong.

    Parameters
    ----------
    date_list : list of str
        Names of the columns to convert. Each must exist in both the global
        ``train_data`` and ``test_data`` frames.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    for col in date_list:
        train_data[col] = pd.to_datetime(train_data[col], dayfirst=True)
        test_data[col] = pd.to_datetime(test_data[col], dayfirst=True)

In [19]:
# 'Date' is stored as an object (string) column; convert it to datetime in both splits.
date_list = ['Date']
conv_date(date_list)

In [20]:
# Function to obtain the year from datetime data
def get_year(colname, dfcolname):
    """Add a year column derived from a datetime column to both splits.

    Uses the vectorised ``.dt`` accessor instead of a per-row ``apply``.

    Parameters
    ----------
    colname : str
        Name of the new column that receives the year.
    dfcolname : str
        Name of an existing datetime column in the global ``train_data`` and
        ``test_data`` frames.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    train_data[colname] = train_data[dfcolname].dt.year
    test_data[colname] = test_data[dfcolname].dt.year
def get_month(colname, dfcolname):
    """Add a month-number column derived from a datetime column to both splits.

    Uses the vectorised ``.dt`` accessor instead of a per-row ``apply``.

    Parameters
    ----------
    colname : str
        Name of the new column that receives the month (1-12).
    dfcolname : str
        Name of an existing datetime column in the global ``train_data`` and
        ``test_data`` frames.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    train_data[colname] = train_data[dfcolname].dt.month
    test_data[colname] = test_data[dfcolname].dt.month

In [21]:
# Add 'CurrentYear' and 'CurrentMonth' columns, derived from 'Date', to both splits.
get_year('CurrentYear', 'Date')
get_month('CurrentMonth', 'Date')

In [22]:
# Year and month have been extracted, so the 'Date' column is removed from both splits.
train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

In [23]:
# Function to impute missing values by mean
def impute_mean(imp_mean_list):
    """Fill missing values in the given columns with the training-split mean.

    The mean is computed once from the training column before it is modified,
    and that same value is used for both splits, so the test split never
    influences the statistic and both splits get exactly the same fill value.

    Parameters
    ----------
    imp_mean_list : list of str
        Names of numeric columns present in both the global ``train_data`` and
        ``test_data`` frames.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    for col in imp_mean_list:
        train_mean = train_data[col].mean()
        train_data[col] = train_data[col].fillna(train_mean)
        test_data[col] = test_data[col].fillna(train_mean)

In [24]:
# Columns whose missing values are filled with the training-split mean.
imp_mean_list = ['Distance', 'Lattitude', 'Longtitude']
impute_mean(imp_mean_list)

In [25]:
# Function to impute missing values by median
def impute_median(imp_median_list):
    """Fill missing values in the given columns with the training-split median.

    The median is computed once from the training column before it is modified,
    and that same value is used for both splits, so the test split never
    influences the statistic and both splits get exactly the same fill value.

    Parameters
    ----------
    imp_median_list : list of str
        Names of numeric columns present in both the global ``train_data`` and
        ``test_data`` frames.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    for col in imp_median_list:
        train_median = train_data[col].median()
        train_data[col] = train_data[col].fillna(train_median)
        test_data[col] = test_data[col].fillna(train_median)

In [26]:
# Columns whose missing values are filled with the training-split median.
imp_median_list = ['Bedroom2', 'Bathroom', 'Car', 'Propertycount', 'Landsize']
impute_median(imp_median_list)

In [27]:
# Function to replace missing values with the string 'None'
def impute_none(imp_none_list):
    """Replace missing entries in the given columns with the literal string 'None'.

    Operates on the global ``train_data`` and ``test_data`` frames, column by
    column, and modifies both in place.

    Parameters
    ----------
    imp_none_list : list of str
        Names of columns present in both frames.
    """
    for column in imp_none_list:
        for frame in (train_data, test_data):
            frame[column] = frame[column].fillna('None')

In [28]:
# Columns whose missing values are replaced by the string 'None'.
# NOTE(review): 'Postcode' is float64 in train_data.dtypes, so filling it with a
# string leaves a column mixing floats and a str — confirm this is intended.
imp_none_list = ['CouncilArea', 'Postcode', 'Regionname']
impute_none(imp_none_list)

In [29]:
# Check the per-column missing-value counts in the training split after imputation.
train_data.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
CurrentYear      0
CurrentMonth     0
dtype: int64

In [30]:
# Check the per-column missing-value counts in the test split after imputation.
test_data.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
CurrentYear      0
CurrentMonth     0
dtype: int64

In [31]:
# Stack the imputed train and test splits into one frame for export.
# train_data still has 'Price' while test_data does not, so the columns are not
# aligned. sort=False is passed explicitly to keep the existing column order on
# every pandas version and to avoid the FutureWarning about the changing default.
# Test rows get NaN in 'Price'.
clean_data = pd.concat([train_data, test_data], sort=False)
# Saving clean data to file.
# The output directory is relative to the notebook's working directory (like the
# dataset read) rather than a user-specific absolute path, so the notebook runs
# on any machine. The trailing separator keeps `path + filename` working.
path = os.path.join('Output', '')
os.makedirs(path, exist_ok=True)  # create the directory on first run
clean_data.to_csv(path+'mmClean.csv', index=False)
print('Saved file to disk.')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Saved file to disk.


In [32]:
# Select the columns that went through imputation (the 'None', median and mean
# column lists) from the combined frame.
extracted_data = clean_data[['CouncilArea', 'Postcode', 'Regionname', 'Bedroom2', 'Bathroom', 'Car', 'Propertycount', 
                             'Landsize', 'Distance', 'Lattitude', 'Longtitude']]
# Saving extracted data to file.
# The output directory is relative to the notebook's working directory (like the
# dataset read) rather than a user-specific absolute path, so the notebook runs
# on any machine. The trailing separator keeps `path + filename` working.
path = os.path.join('Output', '')
os.makedirs(path, exist_ok=True)  # create the directory on first run
extracted_data.to_csv(path+'extracted.csv', index=False)
print('Saved file to disk.')

Unnamed: 0,CouncilArea,Postcode,Regionname,Bedroom2,Bathroom,Car,Propertycount,Landsize,Distance,Lattitude,Longtitude
1,Yarra City Council,3067,Northern Metropolitan,2.0,1.0,1.0,4019.0,202.0,2.5,-37.799600,144.998400
2,Yarra City Council,3067,Northern Metropolitan,2.0,1.0,0.0,4019.0,156.0,2.5,-37.807900,144.993400
4,Yarra City Council,3067,Northern Metropolitan,3.0,2.0,0.0,4019.0,134.0,2.5,-37.809300,144.994400
5,Yarra City Council,3067,Northern Metropolitan,3.0,2.0,1.0,4019.0,94.0,2.5,-37.796900,144.996900
6,Yarra City Council,3067,Northern Metropolitan,3.0,1.0,2.0,4019.0,120.0,2.5,-37.807200,144.994100
10,Yarra City Council,3067,Northern Metropolitan,2.0,1.0,0.0,4019.0,181.0,2.5,-37.804100,144.995300
11,Yarra City Council,3067,Northern Metropolitan,4.0,2.0,0.0,4019.0,245.0,2.5,-37.802400,144.999300
14,Yarra City Council,3067,Northern Metropolitan,2.0,1.0,2.0,4019.0,256.0,2.5,-37.806000,144.995400
15,Yarra City Council,3067,Northern Metropolitan,3.0,1.0,2.0,4019.0,512.0,2.5,-37.806963,144.996711
16,Yarra City Council,3067,Northern Metropolitan,3.0,1.0,2.0,4019.0,512.0,2.5,-37.806963,144.996711
