In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the house-sales CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../input/housesalesprediction/kc_house_data.csv")

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# no missing data

In [None]:
# Print pandas' concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Summary statistics, transposed so that each column of the data is one row.
df.describe().T

# Exploratory Data Analysis

In [None]:
# Distribution of sale prices.
# `sns.distplot` is deprecated (seaborn >= 0.11); `histplot` with a KDE overlay
# on a density axis draws the equivalent figure.
plt.figure(figsize=(14, 8))
sns.histplot(data=df, x='price', bins=40, kde=True, stat='density')

In [None]:
# Most houses are priced below 2,000,000, so it is better to exclude the outliers when building the model

In [None]:
# Number of houses per bedroom count.
# The variable is passed by keyword: in newer seaborn releases the first
# positional argument is `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='bedrooms', data=df)

In [None]:
# most of the houses have 2-5 bedrooms

In [None]:
# Correlation of every numeric column with price, sorted ascending.
# Numeric columns are selected explicitly because `date` has not been parsed yet
# at this point, and recent pandas versions raise on non-numeric columns instead
# of silently skipping them.
df.select_dtypes(include='number').corr()['price'].sort_values()

In [None]:
# Sale price against living-area square footage.
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=df, x='price', y='sqft_living', ax=ax)

In [None]:
# Sale price against latitude.
fig, ax = plt.subplots(figsize=(14, 6))
sns.scatterplot(data=df, x='price', y='lat', ax=ax)

In [None]:
# Sale price against longitude.
fig, ax = plt.subplots(figsize=(14, 6))
sns.scatterplot(data=df, x='price', y='long', ax=ax)

In [None]:
# at a certain combination of latitude and longitude we have an expensive housing area

In [None]:
# Plot every house at its coordinates, coloured by sale price.
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=df, x='long', y='lat', hue='price', ax=ax)

In [None]:
# it resembles the map of King County

In [None]:
# Rank the listings from most to least expensive.
df.sort_values(by='price', ascending=False)

In [None]:
# it is better to drop the outliers

In [None]:
# How many rows make up 1% of the data set?
0.01 * len(df)

In [None]:
# ~20 outliers is far less than 1% of the rows (about 216), so dropping the top 1% won't make much of a difference

In [None]:
# Keep everything except the most expensive 1% of houses.
# The cut-off is derived from the frame's length rather than a hard-coded row
# count, so it stays correct if the data set changes size.
top_1_perc_count = int(len(df) * 0.01)
bottom_99_perc = df.sort_values('price', ascending=False).iloc[top_1_perc_count:]

In [None]:
# Same map as before, restricted to the rows kept in `bottom_99_perc`;
# borderless, semi-transparent markers keep dense areas readable.
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(
    data=bottom_99_perc,
    x='long',
    y='lat',
    hue='price',
    palette='viridis',
    edgecolor=None,
    alpha=0.2,
    ax=ax,
)

In [None]:
# properties near water are more expensive

In [None]:
# Price distribution split by the `waterfront` flag (outlier-trimmed data).
fig, ax = plt.subplots(figsize=(12, 10))
sns.boxplot(data=bottom_99_perc, x='waterfront', y='price', ax=ax)

# feature engineering

In [None]:
# Look at the raw columns again before engineering features.
df.head(n=5)

In [None]:
# Remove the `id` column so it is not used as a model feature.
df = df.drop(columns='id')

In [None]:
# Convert the `date` column to pandas datetimes.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Inspect the converted column.
df.loc[:, 'date']

In [None]:
# Year of sale, taken with the vectorised `.dt` accessor instead of a
# row-by-row Python lambda.
df['year'] = df['date'].dt.year

In [None]:
# Month of sale, taken with the vectorised `.dt` accessor instead of a
# row-by-row Python lambda.
df['month'] = df['date'].dt.month

In [None]:
# Price distribution for each month of sale.
fig, ax = plt.subplots(figsize=(14, 10))
sns.boxplot(data=df, x='month', y='price', ax=ax)

In [None]:
# not a significant difference as far as month is considered

In [None]:
# Mean sale price per year.
# `price` is selected before aggregating so that only that column is averaged,
# rather than computing the mean of every column and discarding the rest.
df.groupby('year')['price'].mean().plot()

In [None]:
# it makes sense considering the inflation

In [None]:
# `year` and `month` have been extracted, so drop the `date` column itself.
df = df.drop(columns='date')

In [None]:
# List the columns that remain after the drops above.
df.columns

In [None]:
# Remove `zipcode` from the feature set.
df = df.drop(columns='zipcode')

# model training

In [None]:
# Target vector (price) and feature matrix (all other columns) as NumPy arrays.
y = df['price'].to_numpy()
X = df.drop(columns='price').to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with its default target range spelled out.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Learn the per-feature min/max from the training rows only, then scale them.
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
# The scaler is NOT re-fitted on the test set: reusing the training min/max keeps
# information about the test rows out of preprocessing and prevents data leakage.
X_test = scaler.transform(X=X_test)

# creating the model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Shape of the scaled training feature matrix (rows, features).
X_train.shape

In [None]:
# Fully connected regression network: four ReLU hidden layers of 19 units each,
# followed by one linear output unit for the predicted price.
hidden_layers = [Dense(19, activation='relu') for _ in range(4)]
model = Sequential(hidden_layers + [Dense(1)])

# Adam optimiser minimising mean squared error.
model.compile(loss='mse', optimizer='adam')

In [None]:
# Train for 400 epochs in mini-batches of 128. The held-out split is passed as
# validation data, so its loss is recorded alongside the training loss each epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=400,
)

In [None]:
# Collect the per-epoch training history into a DataFrame.
losses = pd.DataFrame(data=model.history.history)

In [None]:
# Columns of the training-history frame (one per recorded metric).
losses.columns

In [None]:
# Training vs. validation loss per epoch.
# `DataFrame.plot()` opens its own figure, so a preceding `plt.figure(figsize=...)`
# only produces an extra empty figure; the size is passed to `.plot` directly.
losses.plot(figsize=(12, 6))

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

In [None]:
# Predicted prices for the held-out rows.
predictions = model.predict(x=X_test)

In [None]:
# Root mean squared error on the test set.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))

In [None]:
# Mean absolute error on the test set.
mean_absolute_error(y_true=y_test, y_pred=predictions)

In [None]:
# Explained-variance score of the predictions on the test set.
explained_variance_score(y_true=y_test, y_pred=predictions)

In [None]:
# its not that good and not that bad either

# testing on new value

In [None]:
# Take the first row's features (everything except price) as the example house.
single_house = df.drop(columns='price').iloc[0]

In [None]:
# Scale the example with the scaler fitted on the training data.
# `reshape(1, -1)` builds the single-row 2-D array the scaler expects without
# hard-coding the number of features.
single_house = scaler.transform(single_house.values.reshape(1, -1))

In [None]:
# Predicted price for the scaled example house.
model.predict(x=single_house)