In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the house-sales CSV from the Kaggle input directory into `df`,
# the frame every later cell works on.
df=pd.read_csv('/kaggle/input/kc-housesales-data/kc_house_data.csv')

In [None]:
# Preview the first rows to see which columns the file provides.
df.head()

In [None]:
# Column dtypes and non-null counts; used here to check for missing values.
df.info()

#### Yipee!! No Missing Data :D
#### Let's see what our target label looks like

In [None]:
# Summary statistics of the target column `price`.
df['price'].describe()

## EDA
Let's analyse our dataset a bit to get a wider insight into it.

In [None]:
# Distribution of the target column. `sns.distplot` is deprecated in seaborn,
# so use `histplot` with a KDE overlay instead; stat='density' keeps the y-axis
# on a density scale, as distplot did.
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='price', kde=True, stat='density')

This shows that most houses are priced between 0 and 2 million.

Now let's find the correlation of the other features with our target feature 'price'.

In [None]:
# Rank every numeric feature by its correlation with `price` and draw a bar chart.
# Only numeric columns are used: `date` is still a string column at this point,
# and pandas >= 2.0 raises on non-numeric columns instead of silently skipping them.
price_corr = df.select_dtypes(include='number').corr()['price']
price_corr.drop('price').sort_values(ascending=False).plot(kind='bar')

#### This shows that the price of a house in King County is most highly correlated with sqft_living and least with zipcode

In [None]:
# Price plotted against living area.
sns.scatterplot(data=df, x='sqft_living', y='price')

Here we can see a linear relation between the two, which is affected by some outliers with higher price or sqft_living.

In [None]:
# Number of houses per bedroom count. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally, so the positional form
# `countplot('bedrooms', data=df)` fails there.
sns.countplot(x='bedrooms', data=df)

This shows that most houses have 3 bedrooms.

In [None]:
# Price spread for each bedroom count, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 12))
sns.boxplot(data=df, x='bedrooms', y='price', ax=ax)

Here we can observe outliers in price among houses with the same number of bedrooms; thus, other factors also affect the house price. Let's explore that.

In [None]:
# Price spread for each value of `grade`.
plt.figure(figsize=(10, 9))
sns.boxplot(data=df, x='grade', y='price')

Let's explore latitude and longitude

In [None]:
# Latitude plotted against price.
sns.scatterplot(data=df, x='price', y='lat')

In [None]:
# Longitude plotted against price.
sns.scatterplot(data=df, x='price', y='long')

Let's try to visualise this as a map of KC!

In [None]:
# Plot every house at its longitude/latitude, coloured by price.
plt.figure(figsize=(12, 10))
sns.scatterplot(
    data=df,
    x='long',
    y='lat',
    hue='price',
    palette='RdYlGn',
    edgecolor=None,
    alpha=0.4,
)

To get a better colour scale, let's set aside the highest-priced houses. We remove the top 5% most expensive houses, i.e. create a new df containing the remaining 95% of the dataset (the variable below is nevertheless named `new_99_df`).

In [None]:
# Number of rows that make up 5% of the dataset.
len(df) * 0.05

In [None]:
# Drop the most expensive houses so the colour scale of the price map is not
# dominated by a few extreme values. The cut-off is derived from the frame
# length instead of the hard-coded 1080 (roughly 5% of the rows), so it stays
# correct if the dataset changes.
# NOTE(review): the name `new_99_df` suggests a 1% cut, but about 5% is removed;
# the name is kept because later cells reference it.
TOP_FRACTION_DROPPED = 0.05
n_dropped = round(TOP_FRACTION_DROPPED * len(df))
new_99_df = df.sort_values('price', ascending=False).iloc[n_dropped:]

In [None]:
# Row count left after trimming the most expensive houses.
new_99_df.shape[0]

In [None]:
# Same longitude/latitude map as before, drawn from the trimmed frame.
plt.figure(figsize=(12, 10))
sns.scatterplot(
    data=new_99_df,
    x='long',
    y='lat',
    hue='price',
    palette='RdYlGn',
    edgecolor=None,
    alpha=0.9,
)

The plot above shows that houses are priced higher on the waterfront, i.e. the houses lying on the waterside have prices > 450k (approx.).

Let's analyse waterfront and price

In [None]:
# Price spread for each value of `waterfront`, using the trimmed frame.
sns.boxplot(data=new_99_df, x='waterfront', y='price')

In [None]:
# Number of houses per `view` value. Passed as `x=`/`data=` because
# seaborn >= 0.12 treats the first positional argument as `data`, not as
# the x variable.
sns.countplot(x='view', data=df)

## Feature Engineering

In [None]:
# Re-check column dtypes before feature engineering.
df.info()

There's just one object-dtype feature, i.e. date. Let's convert it into a datetime object.

In [None]:
# Look at the raw `date` values before conversion.
df['date'].head()

In [None]:
# Parse the `date` column into pandas datetimes so that the month and year
# can be extracted from it in the cells below.
df['date']=pd.to_datetime(df['date'])

In [None]:
# Same preview after conversion, to confirm the new dtype.
df['date'].head()

Date column converted !

In [None]:
# List the columns currently in the frame.
df.columns

Let's create new columns of month and year sold in our dataset.

In [None]:
# Derive the sale month and year from the datetime `date` column.
# The vectorised `.dt` accessor replaces a row-by-row `apply(lambda ...)`;
# it yields the same values without calling a Python function per row.
df['month_sold'] = df['date'].dt.month
df['year_sold'] = df['date'].dt.year

In [None]:
# Check the first values of the new month column.
df['month_sold'].head()

In [None]:
# Display the new year column (pandas shows a truncated view of the Series).
df['year_sold']

In [None]:
# Number of sales per month. Passed as `x=`/`data=` because seaborn >= 0.12
# treats the first positional argument as `data`, not as the x variable.
sns.countplot(x='month_sold', data=df)

Although sales are not much affected by the month, May saw the most house sales.

In [None]:
# Number of sales per year. Passed as `x=`/`data=` because seaborn >= 0.12
# treats the first positional argument as `data`, not as the x variable.
sns.countplot(x='year_sold', data=df)

Let's analyse zipcode feature too, is it of any benefit for us in dataset to predict price of house?

In [None]:
# Count how many houses fall under each distinct zipcode.
df['zipcode'].value_counts()

There are 70 different zipcodes :( which we can't reasonably encode into 70 different columns. Therefore, for the time being we'll drop it, along with the other columns not in use.

In [None]:
# Remove the columns that are not fed to the model.
unused_columns = ['date', 'id', 'zipcode']
df = df.drop(columns=unused_columns)

In [None]:
# Confirm which columns remain after the drop.
df.columns

## Model Creation

In [None]:
# Split the frame into a feature matrix (every column except the target)
# and the target vector, both as NumPy arrays.
X = df.drop(columns='price').to_numpy()
y = df['price'].to_numpy()

In [None]:
# Check the container type of the feature matrix.
type(X)

Let's split our dataset into training and test set.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=100,
    test_size=0.3,
)

Let's build our Neural Network

In [None]:
# (rows, features) of the training matrix.
x_train.shape

Let's preprocess our data because it's always a good practice ;)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training rows only, then apply the
# same scaling to both splits so nothing from the test set leaks into the fit.
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Seed NumPy and TensorFlow so that weight initialisation and batch shuffling
# (and hence the loss curves and metrics in the following cells) are as
# repeatable as possible on Restart & Run All.
SEED = 100
np.random.seed(SEED)
tf.random.set_seed(SEED)

N_HIDDEN_LAYERS = 4
HIDDEN_UNITS = 19

# Four identical fully connected ReLU layers followed by a single-unit output
# layer without activation, which produces the predicted price.
model = Sequential()
for _layer in range(N_HIDDEN_LAYERS):
    model.add(Dense(HIDDEN_UNITS, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

# NOTE(review): the test split is also passed as validation data, so the
# validation curve and the final test metrics come from the same rows.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    batch_size=128,
    epochs=400,
)

In [None]:
# Put the per-epoch metrics recorded by the fit call above into a DataFrame
# (presumably one column each for training and validation loss — confirm by
# inspecting `loss.columns`).
loss=pd.DataFrame(model.history.history)

In [None]:
# Line chart of every recorded metric against the epoch index.
loss.plot(kind='line')

Woah !! Our model didn't overfit..so this works well for our model ;)

Let's evaluate how model performs.. on test set

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

# Predict on the held-out rows and report the error metrics; the MSE is
# computed once and reused for the RMSE.
predictions = model.predict(x_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

In [None]:
# Explained variance of the predictions on the test split.
explained_variance_score(y_true=y_test, y_pred=predictions)

In [None]:
# Predicted against true prices; the red y = x line marks perfect predictions.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, predictions)
ax.plot(y_test, y_test, 'r')

Red line shows ideal model 

## Conclusion
A model with 5 layers (4 layers of 19 neurons each and a single-neuron output layer) achieves an explained variance score of about 0.80 when predicting the price of a house in King County.