In [699]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [700]:
#Loading data and getting some information about it

In [701]:
df = pd.read_csv('melb_data.csv')
print(df) 

              Suburb           Address  Rooms Type      Price Method  \
0         Abbotsford      85 Turner St      2    h  1480000.0      S   
1         Abbotsford   25 Bloomburg St      2    h  1035000.0      S   
2         Abbotsford      5 Charles St      3    h  1465000.0     SP   
3         Abbotsford  40 Federation La      3    h   850000.0     PI   
4         Abbotsford       55a Park St      4    h  1600000.0     VB   
...              ...               ...    ...  ...        ...    ...   
13575  Wheelers Hill      12 Strada Cr      4    h  1245000.0      S   
13576   Williamstown     77 Merrett Dr      3    h  1031000.0     SP   
13577   Williamstown       83 Power St      3    h  1170000.0      S   
13578   Williamstown      96 Verdon St      4    h  2500000.0     PI   
13579     Yarraville        6 Agnes St      4    h  1285000.0     SP   

        SellerG        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  \
0        Biggin   3/12/2016       2.5    3067.0  ...  

In [702]:
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [703]:
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [704]:
#df.Price, df.BuildingArea, df.Propertycount

In [705]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [706]:
df.shape

(13580, 21)

In [707]:
df.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [708]:
#Task-1: Filling the missing values in the dataset

In [709]:
missingValueImputerMean = SimpleImputer()
missingValueImputerFrequency = SimpleImputer(strategy='most_frequent')

In [710]:
#Filling missing values of "Car"
X = df.loc[:, ['Car']]
y = df.Price
df.Car = missingValueImputerMean.fit_transform(X, y)
df.Car = df.Car.astype(int)

#Filling missing values of "BuildingArea"
X = df.loc[:, ['BuildingArea']]
y = df.Price
df.BuildingArea = missingValueImputerMean.fit_transform(X, y)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1,202.0,151.96765,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1,94.0,151.96765,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [711]:
#Filling missing values of "YearBuilt"
X = df.loc[:, ['YearBuilt']]
y = df.Price
df.YearBuilt = missingValueImputerFrequency.fit_transform(X, y)

#Filling missing values of "BuildingArea"
X = df.loc[:, ['CouncilArea']]
y = df.Price
df.CouncilArea = missingValueImputerFrequency.fit_transform(X, y)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1,202.0,151.96765,1970.0,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1,94.0,151.96765,1970.0,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [712]:
df.isnull().sum()

Suburb           0
Address          0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64

In [713]:
#Task - 2:Replacing the categorical/nominal attributes

In [714]:
print(len(df.Suburb.unique()))
print(len(df.Address.unique()))
print(len(df.YearBuilt.unique()))
print(len(df.Date.unique()))
print(len(df.SellerG.unique()))
print(len(df.Postcode.unique()))
print(len(df.CouncilArea.unique()))
print(len(df.Regionname.unique()))
print(len(df.Type.unique()))
print(len(df.Method.unique()))

314
13378
144
58
268
198
33
8
3
5


In [715]:
#df = df.drop(['Suburb', 'Address', 'YearBuilt', 'Date', 'SellerG'], axis = 1)
df = df.drop(['Address', 'Date', 'SellerG'], axis = 1)

In [716]:
#Replacing "Type"
X = df.loc[:, ['Type']]
y = df.Price
OHEncoder = ce.OneHotEncoder(cols = ['Type'])
newCols = OHEncoder.fit_transform(X, y)
df = df.drop(['Type'], axis = 1)
df = df.join(newCols)

#Replacing "Method"
X = df.loc[:, ['Method']]
y = df.Price
OHEncoder = ce.OneHotEncoder(cols = ['Method'])
newCols = OHEncoder.fit_transform(X, y)
df = df.drop(['Method'], axis = 1)
df = df.join(newCols)

#Replacing "Regionname"
X = df.loc[:, ['Regionname']]
y = df.Price
OHEncoder = ce.OneHotEncoder(cols = ['Regionname'])
newCols = OHEncoder.fit_transform(X, y)
df = df.drop(['Regionname'], axis = 1)
df = df.join(newCols)

#Replacing "CouncilArea"
X = df.loc[:, ['CouncilArea']]
y = df.Price
OHEncoder = ce.OneHotEncoder(cols = ['CouncilArea'])
newCols = OHEncoder.fit_transform(X, y)
df = df.drop(['CouncilArea'], axis = 1)
df = df.join(newCols)

#Replacing "Suburb"
X = df.loc[:, ['Suburb']]
y = df.Price
OHEncoder = ce.OneHotEncoder(cols = ['Suburb'])
newCols = OHEncoder.fit_transform(X, y)
df = df.drop(['Suburb'], axis = 1)
df = df.join(newCols)

#Replacing "Postcode"
X = df.loc[:, ['Postcode']]
y = df.Price
OHEncoder = ce.OneHotEncoder(cols = ['Postcode'])
newCols = OHEncoder.fit_transform(X, y)
df = df.drop(['Postcode'], axis = 1)
df = df.join(newCols)

#Replacing "SellerG"
# X = df.loc[:, ['SellerG']]
# y = df.Price
# OHEncoder = ce.OneHotEncoder(cols = ['SellerG'])
# newCols = OHEncoder.fit_transform(X, y)
# df = df.drop(['SellerG'], axis = 1)
# df = df.join(newCols)

#Replacing "YearBuilt"
X = df.loc[:, ['YearBuilt']]
y = df.Price
OHEncoder = ce.OneHotEncoder(cols = ['YearBuilt'])
newCols = OHEncoder.fit_transform(X, y)
df = df.drop(['YearBuilt'], axis = 1)
df = df.join(newCols)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [717]:
df.columns, df.columns.shape

(Index(['Rooms', 'Price', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
        'BuildingArea', 'Lattitude', 'Longtitude',
        ...
        'YearBuilt_135', 'YearBuilt_136', 'YearBuilt_137', 'YearBuilt_138',
        'YearBuilt_139', 'YearBuilt_140', 'YearBuilt_141', 'YearBuilt_142',
        'YearBuilt_143', 'YearBuilt_144'],
       dtype='object', length=716),
 (716,))

In [718]:
#Task - 3

In [719]:
#Turning it to a 5-class classification problem

In [720]:
df = df.sort_values(by = "Price")
df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,...,YearBuilt_135,YearBuilt_136,YearBuilt_137,YearBuilt_138,YearBuilt_139,YearBuilt_140,YearBuilt_141,YearBuilt_142,YearBuilt_143,YearBuilt_144
2652,1,85000.0,6.4,1.0,1.0,0,0.0,151.96765,-37.79110,144.89000,...,0,0,0,0,0,0,0,0,0,0
1805,4,131000.0,8.9,4.0,1.0,2,499.0,155.00000,-37.88640,145.02420,...,0,0,0,0,0,0,0,0,0,0
7303,1,145000.0,13.9,2.0,1.0,1,36.0,151.96765,-37.78330,144.82660,...,0,0,0,0,0,0,0,0,0,0
1927,4,145000.0,7.8,3.0,1.0,1,536.0,164.00000,-37.75550,144.96580,...,0,0,0,0,0,0,0,0,0,0
7940,1,160000.0,4.6,1.0,1.0,0,322.0,151.96765,-37.81980,145.03730,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12557,5,6400000.0,3.0,5.0,2.0,1,553.0,308.00000,-37.84908,144.95753,...,0,0,0,0,0,0,0,0,0,0
3616,6,6500000.0,5.6,6.0,6.0,3,1334.0,365.00000,-37.80290,145.02670,...,0,0,0,0,0,0,0,0,0,0
9575,4,7650000.0,5.3,4.0,2.0,4,1690.0,284.00000,-37.82652,145.03052,...,0,0,0,0,0,0,0,0,0,0
7692,5,8000000.0,9.0,5.0,5.0,4,2079.0,464.30000,-37.81790,145.06940,...,0,0,0,0,0,0,0,0,0,0


In [721]:
df = df.reset_index(drop=True)

In [722]:
minVal = df.Price.min()
maxVal = df.Price.max()
rangeVal = maxVal - minVal
increment = rangeVal / 5
minVal, maxVal, rangeVal, increment

(85000.0, 9000000.0, 8915000.0, 1783000.0)

In [723]:
for i in range(df.shape[0]):
    if df.Price[i] >= minVal and df.Price[i] <= (minVal + increment):
        df.Price[i] = 'bottom value'
    elif df.Price[i] > (minVal + increment) and df.Price[i] <= (minVal + 2 * increment):
        df.Price[i] = 'low value'
    elif df.Price[i] > (minVal + 2 * increment) and df.Price[i] <= (minVal + 3 * increment):
        df.Price[i] = 'medium value'
    elif df.Price[i] > (minVal + 3 * increment) and df.Price[i] <= (minVal + 4 * increment):
        df.Price[i] = 'high value'
    else:
        df.Price[i] = 'top value'
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Price[i] = 'bottom value'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Price[i] = 'low value'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Price[i] = 'medium value'
A value is trying to be set on a co

In [724]:
#K-Nearest Neighbor

In [732]:
predictionAccuraciesKnn = []
bestPredAccuracy = 0.0
bestK = 0
testScore = 0.0
acc = [0.]*6
for i in range(1, 11):
    print(f'Iteration {i}/10:')
    print('----------------')
    for k in range(5, 11):
        #Splitting into train, test and validation set
        train, test = train_test_split(df, test_size=0.25, random_state=k*i, shuffle=True)
        test, val = train_test_split(test, test_size=0.4, random_state=k*i, shuffle=True)

        y_train = train['Price'].values
        y_test = test['Price'].values
        y_val = val['Price'].values

        X_train = train.drop(columns=['Price'])
        X_test = test.drop(columns=['Price'])
        X_val = val.drop(columns=['Price'])

        #Normalizing data:
        scaler = RobustScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        X_val = scaler.transform(X_val)


        knnClassifier = KNeighborsClassifier(n_neighbors = k)
        knnClassifier.fit(X_train, y_train)
        y_pred = knnClassifier.predict(X_val) 
        

        predAccuracy = accuracy_score(y_val,y_pred)
        acc[k - 5] += predAccuracy
        print(f'k = {k}, Accuracy: {round((predAccuracy * 100), 7)}%')
        if predAccuracy > bestPredAccuracy:
            bestK = k
            bestPredAccuracy = predAccuracy
            y_pred_test = knnClassifier.predict(X_test)
            testScore = accuracy_score(y_test,y_pred_test)

    print('')
    print('Best results found for k =', bestK)
    print(f'Test set accuracy for k = {bestK} is {round((testScore * 100), 57)} %')
    print('')
    predictionAccuraciesKnn.append(testScore)
    
print(f'Mean of test set accuracies : {np.mean(predictionAccuraciesKnn)}')
print(f'Standard deviation of test set accuracies : {np.std(predictionAccuraciesKnn)}')

print('')
for k in range(5, 11):
    print(f'Average accuracy for k = {k} is {round(((acc[k - 5] / 10) * 100), 7)} %')

Iteration 1/10:
----------------
k = 5, Accuracy: 91.7525773%
k = 6, Accuracy: 93.5198822%
k = 7, Accuracy: 93.5935199%
k = 8, Accuracy: 92.7835052%
k = 9, Accuracy: 92.2680412%
k = 10, Accuracy: 93.0044183%

Best results found for k = 7
Test set accuracy for k = 7 is 92.78350515463917 %

Iteration 2/10:
----------------
k = 5, Accuracy: 92.7835052%
k = 6, Accuracy: 92.7098675%
k = 7, Accuracy: 93.3726068%
k = 8, Accuracy: 93.6671576%
k = 9, Accuracy: 92.3416789%
k = 10, Accuracy: 92.7098675%

Best results found for k = 8
Test set accuracy for k = 8 is 93.17623956799214 %

Iteration 3/10:
----------------
k = 5, Accuracy: 93.0044183%
k = 6, Accuracy: 92.562592%
k = 7, Accuracy: 92.8571429%
k = 8, Accuracy: 93.6671576%
k = 9, Accuracy: 93.3726068%
k = 10, Accuracy: 93.4462445%

Best results found for k = 8
Test set accuracy for k = 8 is 93.17623956799214 %

Iteration 4/10:
----------------
k = 5, Accuracy: 92.3416789%
k = 6, Accuracy: 93.5198822%
k = 7, Accuracy: 92.6362297%
k = 8, Accu

In [None]:
#Random Forest

In [731]:
a = [0.]*6
print(a)
a[5] += 1
print(a)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
