PREDICTING THE PRICE OF THE HOUSE USING NEURAL NETWORK

PS: Please like or upvote my kernel if you liked my work or learnt something new from it. Thank you.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, leaf_names in os.walk('/kaggle/input'):
    for leaf in leaf_names:
        print(os.path.join(folder, leaf))

# Any results you write to the current directory are saved as output.

Importing the required libraries

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Load the dataset into a DataFrame named 'df'

In [None]:
# Read the King County house-sales CSV from the Kaggle input directory into `df`.
df = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')

In [None]:
df # notebook display: a first look at the dataset

Describing and understanding the dataset. 

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

In [None]:
plt.figure(figsize=(10, 6))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement that draws the same kind of figure.
sns.histplot(data=df, x='price', kde=True, stat='density')
# checking the price column distribution
# most of the house prices fall between 0 - 1.5 million dollars

In [None]:
# Pass the column by keyword: newer seaborn releases treat the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='bedrooms', data=df)
# most of the houses have between 2 - 5 bedrooms

In [None]:
# Pairwise correlations between the numeric columns. The non-numeric 'date'
# column is excluded explicitly because DataFrame.corr() no longer drops
# non-numeric columns silently in pandas >= 2.0 and would raise instead.
df.select_dtypes(include='number').corr()

In [None]:
# Correlation of every numeric column with 'price', sorted in ascending order.
# Restricting to numeric columns first keeps this working on pandas >= 2.0,
# where corr() raises on the string-valued 'date' column.
df.select_dtypes(include='number').corr()['price'].sort_values()

In [None]:
# Price (x-axis) against living area (y-axis); the correlation table above
# lists these two columns as strongly correlated.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='price', y='sqft_living')

In [None]:
# Price (x-axis) against number of bathrooms (y-axis).
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='price', y='bathrooms')

In [None]:
# Price distribution for each bedroom count.
sns.boxplot(data=df, x='bedrooms', y='price')

In [None]:
# Price (x-axis) against longitude (y-axis): prices vary a lot with the
# longitude of the house.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='price', y='long')

In [None]:
# Price (x-axis) against latitude (y-axis): expensive houses cluster at some
# particular latitudes, suggesting an expensive neighbourhood at a certain
# combination of lat and long.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='price', y='lat')

In [None]:
# Plot every house at its coordinates, coloured by price. The point cloud
# traces the outline of King County (the county that contains Seattle);
# darker points are the more expensive houses.
plt.figure(figsize=(15, 12))
sns.scatterplot(data=df, x='long', y='lat', hue='price')

In [None]:
# The 20 most expensive listings. Only about 20 houses fall in the
# 3 - 7 million dollar range, so they can be considered outliers.
df.sort_values(by='price', ascending=False).iloc[:20]

In [None]:
# Number of rows: 21613 houses in the dataset, so 1% is about 216 houses.
df.shape[0]

Dealing with the outliers

In [None]:
# Size of the most expensive 1% of listings, derived from the data rather than
# hard-coded (216 for the 21613-row dataset).
n_outliers = int(len(df) * 0.01)
# A price-sorted copy without that top 1%. `df` itself is left unchanged.
bottom_99_percent = df.sort_values('price', ascending=False).iloc[n_outliers:]

In [None]:
# notebook display: the listings left after removing the most expensive 1%
bottom_99_percent

In [None]:
# Same map as before but without the top 1% of prices and with a
# red-yellow-green palette, which gives a much clearer colour spread.
plt.figure(figsize=(15, 12))
sns.scatterplot(data=bottom_99_percent, x='long', y='lat',
                hue='price', palette='RdYlGn')

In [None]:
# Price distribution for waterfront vs. non-waterfront houses; waterfront
# houses are more expensive.
sns.boxplot(data=df, x='waterfront', y='price')

Dropping columns that are not necessary

In [None]:
# notebook display: inspect the columns to decide which are not needed
df
# the 'id' column can be dropped

In [None]:
# Remove the 'id' column. Reassigning instead of using inplace=True matches
# how the other columns are dropped in this notebook.
df = df.drop('id', axis=1)

In [None]:
# Convert the 'date' column from strings to datetime values so that parts
# such as the year and month can be extracted from it.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# notebook display: the parsed datetime column, ready for feature extraction
df['date']

In [None]:
# Year of sale, taken with the vectorised .dt accessor instead of a
# row-by-row Python lambda.
df['year'] = df['date'].dt.year

In [None]:
# notebook display: the new 'year' column
df['year']

In [None]:
# Month of sale (1-12), taken with the vectorised .dt accessor instead of a
# row-by-row Python lambda.
df['month'] = df['date'].dt.month

In [None]:
# notebook display: the new 'month' column
df['month']

In [None]:
# notebook display: df now includes the 'year' and 'month' columns;
# exploratory analysis below checks whether they are useful.
df

In [None]:
# Average sale price per month, to check for a relationship between the month
# sold and the price. Selecting 'price' before aggregating avoids computing
# the mean of every other column.
df.groupby('month')['price'].mean().plot()
# about $60k price difference during the spring and summer months.

In [None]:
# Average sale price per year. Selecting 'price' before aggregating avoids
# computing the mean of every other column.
df.groupby('year')['price'].mean().plot()
# average sale price increases from one year to the next

In [None]:
# The raw 'date' column is no longer needed once year and month are extracted.
df = df.drop(columns='date')

In [None]:
# notebook display: df after removing the 'date' column
df

In [None]:
# Number of listings per zipcode.
df['zipcode'].value_counts()
# Zipcodes cannot be left as numerical values; they would have to be treated
# as a categorical variable, which means creating dummy variables for 70
# zipcode categories.

In [None]:
# Drop 'zipcode' rather than encoding it as dummy variables.
df = df.drop(columns='zipcode')

In [None]:
# notebook display: the columns that will be used for modelling
df


In [None]:
# Target: the sale price (kept as a pandas Series).
y = df['price']
# Features: every remaining column, as a plain NumPy array.
X = df.drop(columns='price').to_numpy()

In [None]:
# notebook display: the feature array
X

In [None]:
# notebook display: the target Series
y

Splitting the dataset into Training set and Test set (test set = 30% of the dataset)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

Scaling the features using MinMaxScaler. The scaler is fitted on the training values only and then applied to both the training and test sets.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scaler with default settings; it is fitted on the training features below.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training features only, then rescale them.
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
# Rescale the test features with the scaler fitted on the training set
# (transform only, no refit), so no test-set statistics leak into the scaling.
X_test = scaler.transform(X_test)

In [None]:
# notebook display: (rows, features) of the training set
X_train.shape

In [None]:
# notebook display: (rows, features) of the test set
X_test.shape

Importing libraries needed to create a neural network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

Creating the layers of the neural network along with the activation function and optimizer.

In [None]:
# Feed-forward regressor: four hidden layers of 19 ReLU units each, followed
# by a single output neuron with no activation for the predicted price.
hidden_layers = [Dense(19, activation='relu') for _ in range(4)]
model = Sequential(hidden_layers + [Dense(1)])

# Adam optimizer, trained against mean squared error.
model.compile(loss='mse', optimizer='adam')

Fitting the model to the training set and also providing the test set values to validate the model performance

In [None]:
# Train for 400 epochs in mini-batches of 128 rows. The test split is passed
# as validation data, so its loss is reported after every epoch.
model.fit(
    x=X_train,
    y=y_train.values,
    epochs=400,
    batch_size=128,
    validation_data=(X_test, y_test.values),
)

Plotting the cost/loss function for the training and test set.

In [None]:
# Collect the per-epoch losses recorded during training into a DataFrame.
losses = pd.DataFrame(model.history.history)
# loss: loss on the training set
# val_loss: loss on the test set (passed as validation data to fit)

In [None]:
# Plot the training and validation loss curves against the epoch number.
losses.plot()
# Both curves decrease and stay close to each other, which suggests the
# model is not overfitting.

Importing libraries to evaluate the performance of the model.

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score

In [None]:
# Predicted prices for the (scaled) test features.
y_pred = model.predict(X_test)

In [None]:
# Mean absolute error on the test set, in the same units as price.
mean_absolute_error(y_test,y_pred)

In [None]:
# Root mean squared error on the test set, in the same units as price.
mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)

In [None]:
# Explained variance score of the predictions on the test set.
explained_variance_score(y_test,y_pred)

In [None]:
# Mean sale price, for comparison with the error metrics above.
df['price'].mean()

In [None]:
# Median sale price, for comparison with the error metrics above.
df['price'].median()

In the scatter plot below, each point is a predicted price plotted against the actual price, and the red line marks the actual values (where a perfect prediction would fall). The predictions follow this line roughly linearly and fall very close to the actual values. The model's performance was reduced by the presence of outliers (houses worth more than $4 million). This model explains about 80 percent of the variance.

In [None]:
# Predicted price against actual price for every test row.
plt.scatter(x=y_test, y=y_pred)

# Reference line y = x: where perfect predictions would lie.
plt.plot(y_test, y_test, color='r')