In [142]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from utils import *
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback

%matplotlib inline

print('Libraries Imported')


Libraries Imported


In [143]:
df = pd.read_csv("houseprices.csv")
df.head()

Unnamed: 0,ADDRESS,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK
0,1 Acorn Place,South Lake,565000,4,2,2.0,600,160,2003.0,18300,Cockburn Central Station,1800,09-2018\r,6164,-32.1159,115.84245,LAKELAND SENIOR HIGH SCHOOL,0.828339,
1,1 Addis Way,Wandi,365000,3,2,2.0,351,139,2013.0,26900,Kwinana Station,4900,02-2019\r,6167,-32.19347,115.859553,ATWELL COLLEGE,5.524324,129.0
2,1 Ainsley Court,Camillo,287000,3,1,1.0,719,86,1979.0,22600,Challis Station,1900,06-2015\r,6111,-32.120578,115.993579,KELMSCOTT SENIOR HIGH SCHOOL,1.649178,113.0
3,1 Albert Street,Bellevue,255000,2,1,2.0,651,59,1953.0,17900,Midland Station,3600,07-2018\r,6056,-31.900547,116.038009,SWAN VIEW SENIOR HIGH SCHOOL,1.571401,
4,1 Aman Place,Lockridge,325000,4,1,2.0,466,131,1998.0,11200,Bassendean Station,2000,11-2016\r,6054,-31.88579,115.94778,KIARA COLLEGE,1.514922,


In [144]:
df.isnull().sum()

ADDRESS                 0
SUBURB                  0
PRICE                   0
BEDROOMS                0
BATHROOMS               0
GARAGE               2478
LAND_AREA               0
FLOOR_AREA              0
BUILD_YEAR           3155
CBD_DIST                0
NEAREST_STN             0
NEAREST_STN_DIST        0
DATE_SOLD               0
POSTCODE                0
LATITUDE                0
LONGITUDE               0
NEAREST_SCH             0
NEAREST_SCH_DIST        0
NEAREST_SCH_RANK    10952
dtype: int64

In [145]:
df.describe()

Unnamed: 0,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN_DIST,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,NEAREST_SCH_RANK
count,33656.0,33656.0,33656.0,31178.0,33656.0,33656.0,30501.0,33656.0,33656.0,33656.0,33656.0,33656.0,33656.0,22704.0
mean,637072.0,3.65911,1.823063,2.199917,2740.644016,183.501545,1989.706436,19777.374465,4523.371494,6089.420074,-31.960664,115.879265,1.815268,72.672569
std,355825.6,0.752038,0.587427,1.365225,16693.513215,72.102982,20.96433,11364.415413,4495.064024,62.167921,0.17778,0.118137,1.746,40.639795
min,51000.0,1.0,1.0,1.0,61.0,1.0,1868.0,681.0,46.0,6003.0,-32.472979,115.58273,0.070912,1.0
25%,410000.0,3.0,1.0,2.0,503.0,130.0,1978.0,11200.0,1800.0,6050.0,-32.068437,115.789763,0.880568,39.0
50%,535500.0,4.0,2.0,2.0,682.0,172.0,1995.0,17500.0,3200.0,6069.0,-31.933231,115.854198,1.34552,68.0
75%,760000.0,4.0,2.0,2.0,838.0,222.25,2005.0,26600.0,5300.0,6150.0,-31.843818,115.970722,2.097225,105.0
max,2440000.0,10.0,16.0,99.0,999999.0,870.0,2017.0,59800.0,35500.0,6558.0,-31.45745,116.343201,23.254372,139.0


In [146]:
df.shape

(33656, 19)

In [147]:
df.dtypes

ADDRESS              object
SUBURB               object
PRICE                 int64
BEDROOMS              int64
BATHROOMS             int64
GARAGE              float64
LAND_AREA             int64
FLOOR_AREA            int64
BUILD_YEAR          float64
CBD_DIST              int64
NEAREST_STN          object
NEAREST_STN_DIST      int64
DATE_SOLD            object
POSTCODE              int64
LATITUDE            float64
LONGITUDE           float64
NEAREST_SCH          object
NEAREST_SCH_DIST    float64
NEAREST_SCH_RANK    float64
dtype: object

Data Preprocessing 

In [148]:
df.columns

Index(['ADDRESS', 'SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK'],
      dtype='object')

In [153]:
def preprocess(df):
    df = df.copy()
    
    #Drop columnn with high cardinality
    df = df.drop('ADDRESS', axis = 1)
    
    df = df.drop('NEAREST_STN', axis = 1)
    
    #Drop columns which have more than 25% missing in the dataset
    val = (round(df.isnull().sum()/len(df),2)>0.25)
    col = list(val[val==True].index)
    df = df.drop(col, axis =1)
    
    df['BUILD_YEAR'] = df['BUILD_YEAR'].fillna(df['BUILD_YEAR'].median())
    
    df['DATE_SOLD'] = pd.to_datetime(df['DATE_SOLD'])
    df['YEAR'] = df['DATE_SOLD'].apply(lambda x:x.year)
    df['MONTH'] =  df['DATE_SOLD'].apply(lambda x:x.month)
    

In [154]:
preprocess(df)

In [156]:
df.dtypes

SUBURB               object
PRICE                 int64
BEDROOMS              int64
BATHROOMS             int64
GARAGE              float64
LAND_AREA             int64
FLOOR_AREA            int64
BUILD_YEAR          float64
CBD_DIST              int64
NEAREST_STN          object
NEAREST_STN_DIST      int64
DATE_SOLD            object
POSTCODE              int64
LATITUDE            float64
LONGITUDE           float64
NEAREST_SCH          object
NEAREST_SCH_DIST    float64
NEAREST_SCH_RANK    float64
dtype: object

In [155]:
df = df.iloc[:,1:]
df_norm = (df-df.mean())/df.std()
df_norm

TypeError: unsupported operand type(s) for -: 'str' and 'float'

In [None]:
df_norm