I have applied a deep neural network, a decision tree, and a linear regression model to predict house prices.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the CSV once and keep the untouched frame in `raw_data`;
# all later work happens on the copy `df`.
csv_path = '../input/housesalesprediction/kc_house_data.csv'
raw_data = pd.read_csv(csv_path)
df = raw_data.copy()

In [None]:
# Print the column names, dtypes and non-null counts of the working frame.
df.info()

* So, our data has no missing values, as the number of non-null values is the same in every column.

In [None]:
# Remove the 'id' column. Reassigning (instead of dropping in place) with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Distribution of sale prices over the full dataset.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# documented replacement and draws the same density histogram + KDE curve.
plt.figure(figsize=(10,6))
sns.histplot(df['price'], bins=30, kde=True, stat='density')

There are some outliers. Let's remove them for better analysis.

In [None]:
# Keep only the rows priced below 2,000,000; copy so later column
# assignments do not touch `df`.
below_price_cap = df['price'] < 2e6
new_df = df.loc[below_price_cap].copy()

In [None]:
# Distribution of sale prices after the outlier filter.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# documented replacement and draws the same density histogram + KDE curve.
plt.figure(figsize=(10,6))
sns.histplot(new_df['price'], bins=30, kde=True, stat='density')

In [None]:
# Share of rows (in percent) discarded by the price filter.
rows_removed = len(df) - len(new_df)
rows_removed * 100 / len(df)

So, we removed less than 1% of our dataset.

In [None]:
# Correlation of every numeric column with price, sorted ascending.
# 'date' is still a string column at this point; pandas >= 2.0 no longer
# drops non-numeric columns silently in corr(), so select numeric columns
# explicitly before correlating.
new_df.select_dtypes(include='number').corr()['price'].sort_values()

Let's visually explore highly correlated factors.

In [None]:
%matplotlib inline

In [None]:
# Number of houses per 'bathrooms' value.
fig, ax = plt.subplots(figsize=(10,5))
sns.countplot(x='bathrooms', data=new_df, ax=ax)

In [None]:
# One stacked scatter panel per square-footage column, each plotted
# against price (price on the x-axis, the column on the y-axis).
plt.figure(figsize=(8,24))
sqft_columns = ['sqft_living15', 'sqft_living', 'sqft_above']
for panel_no, column in enumerate(sqft_columns, start=1):
    plt.subplot(3, 1, panel_no)
    plt.scatter(y=column, x='price', data=new_df, s=1)
    plt.ylabel(column)
    plt.xlabel('price')

In [None]:
# Number of houses per 'grade' value.
fig, ax = plt.subplots(figsize=(10,5))
sns.countplot(x='grade', data=new_df, ax=ax)

Feature Engineering Section:

In [None]:
# Show the first rows of the filtered frame before engineering new features.
new_df.head()

Let's explore the date column.

In [None]:
# Preview the raw date strings with their last 7 characters stripped
# (display only; nothing is assigned).
new_df['date'].map(lambda raw_date: raw_date[:-7])

In [None]:
# Parse the 'date' column into pandas datetimes, overwriting the original values.
new_df['date'] = pd.to_datetime(new_df['date'])

In [None]:
# Derive calendar features from the parsed sale date. The vectorised `.dt`
# accessor replaces the per-row Python lambda and yields the same values.
new_df['month'] = new_df['date'].dt.month
new_df['year'] = new_df['date'].dt.year

In [None]:
# Re-check correlations with price now that 'month' and 'year' exist.
# Numeric columns are selected explicitly so the non-numeric 'date' column is
# always excluded, regardless of the pandas version's `numeric_only` default.
new_df.select_dtypes(include='number').corr()['price'].sort_values()

Ah! The attributes month and year do not seem important, but I will still keep them to analyse their effect on my model.
I will, however, drop the zipcode attribute.

In [None]:
# Remove 'zipcode'. Reassigning with errors='ignore' keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
new_df = new_df.drop(columns='zipcode', errors='ignore')

In [None]:
# Mean sale price per month, drawn as a line plot.
monthly_mean_price = new_df.groupby('month')['price'].mean()
monthly_mean_price.plot()

In [None]:
# Mean sale price per year, drawn as a line plot.
yearly_mean_price = new_df.groupby('year')['price'].mean()
yearly_mean_price.plot()

In [None]:
# Remove 'date' now that 'month' and 'year' have been derived from it.
# Reassigning with errors='ignore' keeps this cell safe to re-run.
new_df = new_df.drop(columns='date', errors='ignore')

In [None]:
# Frequency of each distinct 'sqft_basement' value.
new_df['sqft_basement'].value_counts()

I will categorize this column as having a basement (1) or not (0).

In [None]:
def convert_to_dummy(value):
    """Return 0 when `value` equals zero, otherwise 1."""
    return 0 if value == 0 else 1
# Binary flag: 0 where 'sqft_basement' is zero, 1 otherwise.
basement_flags = new_df['sqft_basement'].map(convert_to_dummy)
new_df['basement'] = basement_flags
new_df['basement'].value_counts()

In [None]:
# Remove 'sqft_basement'. Reassigning with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
new_df = new_df.drop(columns='sqft_basement', errors='ignore')

In [None]:
# Frequency of each distinct 'yr_renovated' value.
new_df['yr_renovated'].value_counts()

In [None]:
# Binary flag: 0 where 'yr_renovated' is zero, 1 otherwise
# (presumably 0 means "never renovated" -- confirm against the data dictionary).
# `convert_to_dummy` is already defined in the basement cell earlier in the
# notebook; it is reused here instead of redefining an identical copy.
new_df['renovated']=new_df['yr_renovated'].apply(convert_to_dummy)
new_df['renovated'].value_counts()

In [None]:
# Remove 'yr_renovated'. Reassigning with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
new_df = new_df.drop(columns='yr_renovated', errors='ignore')

In [None]:
# List the columns that remain after feature engineering.
new_df.columns

# Using Deep Neural Network

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is 'price'; every other column is a feature.
# 70/30 train/test split with a fixed seed.
y = new_df['price']
X = new_df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with default settings; it is fitted on the training features below.
scaler=MinMaxScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits. Both names are overwritten with the scaled
# arrays.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import tensorflow as tf

In [None]:
# Fully connected regression network: six ReLU layers followed by a single
# linear output unit. The first width (19) presumably matches the number of
# input features -- confirm against X_train.shape[1].
relu_layer_widths = [19, 50, 50, 50, 50, 50]
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(width, activation='relu') for width in relu_layer_widths]
    + [tf.keras.layers.Dense(1)]
)

In [None]:
# Mean squared error loss, optimised with Adam.
model.compile(
    loss='mse',
    optimizer='adam',
)

In [None]:
# Train on the scaled training features for a fixed number of epochs;
# verbose=2 prints one line per epoch.
num_epochs = 100
model.fit(
    x=X_train,
    y=y_train.values,
    epochs=num_epochs,
    batch_size=128,
    verbose=2,
)

In [None]:
# Per-epoch metrics recorded by the last `fit` call, as a DataFrame.
training_history = model.history.history
loss_df = pd.DataFrame(training_history)

In [None]:
# Plot the recorded training metrics against the epoch index.
loss_df.plot()

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

In [None]:
# Network predictions for the scaled test features.
predictions = model.predict(X_test)

In [None]:
# Predicted price (y-axis) against actual price (x-axis)
plt.scatter(x=y_test, y=predictions)

# Red reference line where prediction equals the actual price
plt.plot(y_test, y_test, 'r')

In [None]:
# Summary row for the neural network: label, MAE, RMSE, explained variance.
deep_net_mae = mean_absolute_error(y_test, predictions)
deep_net_rmse = np.sqrt(mean_squared_error(y_test, predictions))
deep_net_evs = explained_variance_score(y_test, predictions)
Deep_Net = ['Deep Net', deep_net_mae, deep_net_rmse, deep_net_evs]

In [None]:
# Empty comparison table; one row per model is appended in later cells.
metric_columns = [
    'model',
    'mean absolute error',
    'root mean squared error',
    'Explained variance score',
]
error_metrics = pd.DataFrame({column: [] for column in metric_columns})

In [None]:
# Store the neural-network metrics as row 0 of the comparison table.
error_metrics.loc[0]= Deep_Net

In [None]:
# Display the comparison table so far.
error_metrics

# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Price is a continuous target, so this is a regression problem: use
# DecisionTreeRegressor. A classifier would treat every distinct price as its
# own class label. random_state makes the fitted tree reproducible across runs.
dtree = DecisionTreeRegressor(random_state=101)
dtree.fit(X_train,y_train)

In [None]:
# Decision-tree predictions for the scaled test features.
pred = dtree.predict(X_test)

In [None]:
# Predicted price (y-axis) against actual price (x-axis)
plt.scatter(x=y_test, y=pred)

# Red reference line where prediction equals the actual price
plt.plot(y_test, y_test, 'r')

In [None]:
# Summary row for the decision tree: label, MAE, RMSE, explained variance.
dtree_mae = mean_absolute_error(y_test, pred)
dtree_rmse = np.sqrt(mean_squared_error(y_test, pred))
dtree_evs = explained_variance_score(y_test, pred)
Decision_Tree = ['Decision Tree', dtree_mae, dtree_rmse, dtree_evs]

In [None]:
# Store the decision-tree metrics as row 1 of the comparison table.
error_metrics.loc[1] = Decision_Tree

In [None]:
# Display the comparison table so far.
error_metrics

# Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model with default settings.
lm=LinearRegression()

In [None]:
# Fit on the training split (X_train was min-max scaled earlier in the notebook).
lm.fit(X_train,y_train)

In [None]:
# Linear-regression predictions for the scaled test features.
pred_lm = lm.predict(X_test)

In [None]:
# Predicted price (y-axis) against actual price (x-axis)
plt.scatter(x=y_test, y=pred_lm)

# Red reference line where prediction equals the actual price
plt.plot(y_test, y_test, 'r')

In [None]:
# Summary row for linear regression: label, MAE, RMSE, explained variance.
lm_mae = mean_absolute_error(y_test, pred_lm)
lm_rmse = np.sqrt(mean_squared_error(y_test, pred_lm))
lm_evs = explained_variance_score(y_test, pred_lm)
Linear_model = ['Linear Regression', lm_mae, lm_rmse, lm_evs]

# Store the linear-regression metrics as row 2 of the comparison table.
error_metrics.loc[2] = Linear_model

# Display the final comparison of all three models.
error_metrics