In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Display the reference map of King County so it can be compared with the
# latitude/longitude plots of the data.
im = plt.imread("../input/king-county-map/king county.jpg")
plt.figure(figsize=(15, 18))
plt.imshow(im)

In [None]:
# Load the King County house sales data and preview the first five rows.
df = pd.read_csv("../input/housesalesprediction/kc_house_data.csv")
df.head(5)

# 1. Exploratory Data Analysis:

In [None]:
# Count the missing values per column; all zeros means the data set has no gaps.
df.isna().sum()

In [None]:
# Overall statistical description of the data set, one row per column.
df.describe().T

In [None]:
# Distribution of the target variable (house price).
sns.set_style("darkgrid")
plt.figure(figsize=(10, 8))
sns.distplot(a=df["price"])

In the distribution plot above, we see that the house prices are mainly distributed between 0 and 1 million dollars, and there are some extreme outliers that we can leave out in order to prevent their influence on our deep learning model.

In [None]:
# Count plots of the number of bedrooms and floors, stacked in two rows.
# The column is passed through the `x=` keyword: newer seaborn releases
# interpret the first positional argument as `data`, so the positional form
# no longer counts the values of the Series.
fig, ax = plt.subplots(2)
sns.countplot(x=df["bedrooms"], ax=ax[0])
sns.countplot(x=df["floors"], ax=ax[1])



In [None]:
# Pairwise correlations between the numeric columns.
# The `date` column is still a string at this point; recent pandas versions
# raise on non-numeric columns instead of silently skipping them, so the
# numeric columns are selected explicitly.
df.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# Non-numeric columns (the string `date`) are excluded explicitly because
# recent pandas versions raise on them in `corr()`.
plt.figure(figsize=(15, 10))
sns.heatmap(df.select_dtypes(include="number").corr(), linewidths=0.5, annot=True)

Our target is the house price, so we can also look specifically at the correlation of the price with the other features.

In [None]:
# Correlation of every numeric feature with the price, strongest first.
# Non-numeric columns are excluded explicitly because recent pandas versions
# raise on them in `corr()`.
df.select_dtypes(include="number").corr()["price"].sort_values(ascending=False)
# Here we can clearly see that there is a high positive correlation between house prices and sqft_living (square footage of the interior living space)

In [None]:
# Relation between house price and the square footage of the living area.
plt.figure(figsize=(15, 10))
sns.scatterplot(data=df, x="sqft_living", y="price", color="red")

In [None]:
# House price against longitude.
plt.figure(figsize=(15, 10))
sns.scatterplot(data=df, x="long", y="price", color="red")

From the plot above, we understand that the longitudes between -122.4 and -122.0 have the most expensive houses, and the longitude around -121.4 has the lowest house prices.

In [None]:
# House price against latitude.
plt.figure(figsize=(15, 10))
sns.scatterplot(data=df, x="lat", y="price", color="green")

From the plot above, we understand that the latitudes between 47.5 and 47.7 have the most expensive houses, and the latitudes between 47.2 and 47.4 have the lowest house prices.

In [None]:
# Plot every house at its longitude/latitude, coloured by price and sized by
# grade, which reproduces the outline of the county.
df.plot.scatter(
    x="long",
    y="lat",
    c="price",
    s=df["grade"],
    alpha=0.5,
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    figsize=(20, 15),
)
# We can easily see that the area between latitude 47.5 and 47.7 has the highest prices

In [None]:
# The actual map of the county again; it matches the shape of the
# longitude/latitude plot of the data.
im = plt.imread("../input/king-county-map/king county.jpg")
plt.figure(figsize=(15, 20))
plt.imshow(im)

In [None]:
# Longitude/latitude scatter with the price encoded as colour.
plt.figure(figsize=(15, 10))
sns.scatterplot(
    data=df,
    x="long",
    y="lat",
    hue="price",
    palette="rocket",
    alpha=0.8,
)

In order to get a clearer view of the price distribution, we can drop some outliers.

In [None]:
# Summary statistics of the house prices. The author treats prices above
# roughly 3 million dollars as outliers.
df["price"].describe()

In [None]:
# Houses priced above 3 million dollars. Only a small number of rows qualify
# (about 40 according to the original run), so a new data frame without these
# outliers is created next and the geographical maps are drawn again.
df.loc[df["price"] > 3000000]

In [None]:
# Build a data frame without the price outliers.
# The rows are filtered on the 3 million dollar threshold itself instead of
# slicing off a hard-coded number of rows, so the result always matches the
# stated cut-off. The frame stays sorted by descending price, as before.
df_without_outliers = df[df["price"] <= 3000000].sort_values("price", ascending=False)
df_without_outliers.head()
# Now we have a new data frame that only contains house prices up to 3 million dollars

In [None]:
# Same longitude/latitude scatter as before, now without the price outliers.
plt.figure(figsize=(15, 10))
sns.scatterplot(
    data=df_without_outliers,
    x="long",
    y="lat",
    hue="price",
    palette="rocket",
    alpha=0.8,
)

Now we can see the price distribution by latitude and longitude more clearly.

In [None]:
# Geographic price map drawn from the outlier-free data frame.
df_without_outliers.plot.scatter(
    x="long",
    y="lat",
    c="price",
    alpha=0.5,
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    figsize=(20, 15),
)


In [None]:
# Price distribution for houses with and without a waterfront.
plt.figure(figsize=(14, 6))
sns.boxplot(data=df, x="waterfront", y="price")

The boxplot above shows that houses on the waterfront tend to have higher prices.

# 2. Feature Engineering:

In [None]:
# Preview the data frame before any columns are changed.
df.head()

We can just drop the id column because it has no special meaning for predicting house prices.

In [None]:
# Drop the `id` column, which carries no information for predicting prices.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to run more than once.
df = df.drop(columns="id", errors="ignore")
df.head(3)

In [None]:
# Convert the date column to datetime so that calendar features can be
# extracted from it.
df = df.assign(date=pd.to_datetime(df["date"]))
df.head(3)

In [None]:
# Create two new columns, year and month of sale, so the data can be analysed
# by time. The vectorised `.dt` accessor replaces the per-row Python lambda.
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month

In [None]:
# The new features that were hidden in the date column.
df.loc[:, ["year", "month"]].head()

In [None]:
# Average price per month. The price column is selected before aggregating,
# so the mean is not computed for every other column first.
df.groupby("month")["price"].mean()

In [None]:
# Average price per month (left) and per year (right).
# The price column is selected before aggregating so that only the needed
# mean is computed, and the figure size is set once when the axes are created.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
df.groupby("month")["price"].mean().plot(ax=ax[0], c="red")
df.groupby("year")["price"].mean().plot(ax=ax[1], c="red")
# In the plot below, we see that the prices tend to become higher from March to July
# The housing prices rise from 2014 to 2015

In [None]:
# The date column is no longer needed: the useful information was extracted
# into the year and month columns. Reassigning with errors="ignore" keeps this
# cell safe to run more than once.
df = df.drop(columns="date", errors="ignore")

In [None]:
# Preview the data frame after the date column was replaced by year and month.
df.head()

In [None]:
df["zipcode"].value_counts()
# We need to drop the zipcode column because the ML algorithm would treat it as a continuous value, which would cause wrong predictions
# We do not turn it into dummy variables because there are 70 different zip codes

In [None]:
# Drop the zipcode column. Reassigning with errors="ignore" keeps this cell
# safe to run more than once.
df = df.drop(columns="zipcode", errors="ignore")
df.head()

Below we will do some feature engineering on the yr_renovated column, because the majority of the houses have not been renovated.

In [None]:
def renovation(feature):
    """Collapse a renovation year into a renovated / not-renovated flag.

    Returns 1 when ``feature`` is greater than 0; otherwise ``feature`` is
    returned unchanged, so a value of 0 (not renovated) stays 0.
    """
    return 1 if feature > 0 else feature
        

In [None]:
# Replace the renovation year with the binary renovated flag.
df["yr_renovated"] = df["yr_renovated"].map(renovation)

In [None]:
# Now there are just two classes: 0 for non-renovated and 1 for renovated houses.
df["yr_renovated"].value_counts().head(70)

Now our data is ready for the machine learning algorithm.

# 3. Splitting Data and Training the Algorithm:

In [None]:
# Split the frame into the feature matrix X (every column except price) and
# the target vector y (the price), both as NumPy arrays.
y = df["price"].values
X = df.drop(columns="price").values

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector of house prices.
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the data into a training set (70%) and a test set (30%).
# A fixed random_state makes the split, and therefore the metrics reported
# further down, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

The next step is to check whether the shapes of our data in the train and test sets are consistent with each other.

In [None]:
# Shape of the training features.
X_train.shape

In [None]:
# Shape of the training target; its length should match the rows of X_train.
y_train.shape

In [None]:
# Shape of the test features.
X_test.shape

In [None]:
# Shape of the test target; its length should match the rows of X_test.
y_test.shape

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [None]:
# Fully connected regression network: four hidden ReLU layers with 19 neurons
# each (one per input feature) followed by a single linear output neuron that
# predicts the house price.
model = Sequential(
    [Dense(19, activation="relu") for _ in range(4)] + [Dense(1)]
)


In [None]:
# Use mean squared error as the loss function and Adam as the optimizer.
model.compile(loss="mse", optimizer="adam")

In [None]:
# Train for 300 epochs with a batch size of 128; the test set doubles as the
# validation data that is evaluated after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=300,
    batch_size=128,
)

# 4. Predicting and Evaluation of the Model's Performance

In [None]:
# Training loss and validation loss (on the test data) for every epoch.
pd.DataFrame.from_dict(model.history.history)

In [None]:

# Both the training and the validation loss decrease up to a certain point and
# become stable after about the 60th epoch. There is no sign of overfitting
# because the two curves follow each other closely.
pd.DataFrame(model.history.history).plot.line(figsize=(15, 10))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score

In [None]:
# Let the trained network predict the prices of the test set.
predictions = model.predict(x=X_test)

In [None]:
# Report MAE, MSE and RMSE of the first model on the test set.
# The third line prints the square root of the MSE, so it is labelled as the
# root mean squared error instead of repeating the MSE label.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
print("The absolute mean error :", mae)
print("The squared mean error :", mse)
print("The root mean squared error :", np.sqrt(mse))


In [None]:
# Put the mean absolute error next to the average house price for scale.
print("The mean of the real data: ", df["price"].mean())
print(
    "The absolute mean error :",
    mean_absolute_error(y_true=y_test, y_pred=predictions),
)

> The mean absolute error is about 149,000 dollars, which means that our model makes an error of roughly 20% relative to the mean house price.

In [None]:
# The explained variance score shows which share of the variance in the prices
# the model can explain; the author reports about 58% for this model.
print(
    "The Variance Score :",
    explained_variance_score(y_true=y_test, y_pred=predictions),
)

In [None]:
# Distribution of the residuals (actual minus predicted price).
# `predictions` comes from a network with a single output neuron and has shape
# (n, 1), while `y_test` is one-dimensional; the predictions are flattened
# first, otherwise the subtraction broadcasts to an (n, n) matrix.
sns.distplot(y_test - predictions.ravel(), color="red")

In [None]:
# Predicted against actual prices; the red points mark the ideal diagonal.
# The model predicts low and normal house prices well but the outliers poorly,
# so the outliers hurt the performance of the model.
plt.figure(figsize=(18, 8))
plt.scatter(x=y_test, y=predictions)
plt.scatter(x=y_test, y=y_test, color="red")

Because the predictions of our model are not good enough, we will standardize our features and retrain the model.

# 5. Retraining Our Model

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler=StandardScaler()

In [None]:
scaler.fit(X_train)

In [None]:
scaler.transform(X_train)

In [None]:
scaler.transform(X_test)

Now we have rescaled our features, and they are ready for the ML algorithm.

In [None]:
# Second network with the same architecture as the first one: four hidden ReLU
# layers with 19 neurons each and a single linear output neuron for the price.
model2 = Sequential(
    [Dense(19, activation="relu") for _ in range(4)] + [Dense(1)]
)

In [None]:
# Use mean squared error as the loss function and Adam as the optimizer.
model2.compile(loss="mse", optimizer="adam")

In [None]:
# Train the second model for 300 epochs, this time with a batch size of 64.
model2.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=300,
    batch_size=64,
)

In [None]:
# Training and validation loss of the second model for every epoch.
pd.DataFrame.from_dict(model2.history.history)

In [None]:
# Loss curves of the second model.
pd.DataFrame(model2.history.history).plot.line(figsize=(15, 8))

In [None]:
# Predictions of the second model on the test set.
predictions2 = model2.predict(x=X_test)

In [None]:
# Report MAE, MSE and RMSE of the second model on the test set.
# The third line prints the square root of the MSE, so it is labelled as the
# root mean squared error instead of repeating the MSE label.
mae2 = mean_absolute_error(y_test, predictions2)
mse2 = mean_squared_error(y_test, predictions2)
print("The absolute mean error :", mae2)
print("The squared mean error :", mse2)
print("The root mean squared error :", np.sqrt(mse2))


In [None]:
# Put the mean absolute error of the second model next to the average price.
print("The mean of the real data: ", df["price"].mean())
print(
    "The absolute mean error :",
    mean_absolute_error(y_true=y_test, y_pred=predictions2),
)

Now our model has decreased the mean absolute error from 149,000 dollars to 135,000 dollars.

In [None]:
# The explained variance score shows which share of the variance in the prices
# the second model can explain.
print(
    "The Variance Score :",
    explained_variance_score(y_true=y_test, y_pred=predictions2),
)

The explained variance score also increased from 58% to 65%.

In [None]:

# Predictions of the second model (default colour) and of the first model
# (green) against the actual prices; the red points mark the ideal diagonal.
plt.figure(figsize=(10, 15))
plt.scatter(x=y_test, y=predictions2)
plt.scatter(x=y_test, y=predictions, color="green")
plt.scatter(x=y_test, y=y_test, color="red")

Now we will use linear regression:

In [None]:
# Rebuild features and target from the data frame, this time keeping them as
# pandas objects instead of NumPy arrays.
y = df["price"]
X = df.drop(columns="price")

In [None]:
# New train/test split with the default test size. A fixed random_state makes
# the split, and therefore the metrics reported below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create a linear regression model with its default settings.
model3=LinearRegression()

In [None]:
# Fit the linear regression on the training data.
model3.fit(X_train,y_train)

In [None]:
# Predict the prices of the test set with the linear regression.
predictions3=model3.predict(X_test)

In [None]:
# The printed value is the mean absolute error, not the predictions
# themselves, so the label names the metric.
print("Mean absolute error of Linear Regression:", mean_absolute_error(y_test, predictions3))

In [None]:

# Linear regression predictions (green) against the actual prices; the red
# points mark the ideal diagonal.
plt.figure(figsize=(10, 15))
plt.scatter(x=y_test, y=predictions3, color="green")
plt.scatter(x=y_test, y=y_test, color="red")


Linear Regression performs better than our deep learning model

In [None]:
# Third network with the same architecture as before: four hidden ReLU layers
# with 19 neurons each and a single linear output neuron for the price.
model4 = Sequential(
    [Dense(19, activation="relu") for _ in range(4)] + [Dense(1)]
)

In [None]:
# Same loss as before (mean squared error), but with the RMSprop optimizer.
model4.compile(loss="mse", optimizer="rmsprop")

In [None]:
# Train for 300 epochs with a batch size of 64, validating on the test set.
model4.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=300,
    batch_size=64,
)

In [None]:
# Predictions of the RMSprop-trained model on the test set.
predictions4 = model4.predict(x=X_test)

In [None]:
# Mean absolute error of the RMSprop-trained model on the test set.
print(mean_absolute_error(y_true=y_test, y_pred=predictions4))

In [None]:
# Predictions of the RMSprop-trained model (green) against the actual prices;
# the red points mark the ideal diagonal.
plt.figure(figsize=(10, 15))
plt.scatter(x=y_test, y=predictions4, color="green")
plt.scatter(x=y_test, y=y_test, color="red")


We changed the optimizer from Adam to RMSprop, but the predictions are worse than before.

The next step is to drop the outliers and train the model again