In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import sklearn




In [None]:
# Read the house-sales CSV, convert the raw 'date' strings to datetimes and
# use the 'id' column as the row index.
df = (
    pd.read_csv('../input/housesalesprediction/kc_house_data.csv')
    .assign(date=lambda raw: pd.to_datetime(raw['date']))
    .set_index('id')
)

In [None]:
#df.describe()

In [None]:
# 'price' and 'sqft_living' take much larger values than the other columns,
# so both are replaced by their natural logarithm. Every later cell therefore
# sees log-scale values in these two columns.
#
# The transform overwrites the columns in place, so running this cell twice
# would take the log of a log. A marker stored on the frame turns a re-run
# into a no-op; reloading the CSV creates a fresh frame without the marker.
LOG_COLUMNS = ['price', 'sqft_living']

if not df.attrs.get('log_transformed', False):
    df[LOG_COLUMNS] = np.log(df[LOG_COLUMNS])
    df.attrs['log_transformed'] = True





In [None]:
# Summary statistics of the frame; 'price' and 'sqft_living' are shown on the
# log scale because they were log-transformed above.
df.describe()


In [None]:
#check for missing or null values in the data set. Looks like everything is in place.

In [None]:
# Boolean frame with the same shape as df: True wherever a value is missing.
null = df.isna()

In [None]:
# Render the missing-value mask as an image so any missing cell shows up as a
# contrasting mark; row labels and the colour bar are switched off.
sns.heatmap(data=null, cmap='viridis', cbar=False, yticklabels=False)


In [None]:
# There are no missing values.

In [None]:
# I created a correlation matrix to check which variables are most strongly correlated with the target variable 'price'.

In [None]:
# Pearson correlation between every pair of numeric columns, drawn as an
# annotated heatmap so the columns most related to 'price' stand out.
#
# Only numeric columns are passed to corr(): the frame also holds the
# datetime 'date' column, which recent pandas versions no longer drop
# silently before computing correlations.
numeric_df = df.select_dtypes(include='number')

# Draw on an explicit axes instead of relying on pyplot's "current axes".
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    numeric_df.corr(method='pearson'),
    annot=True,
    fmt="1.1f",
    cmap='coolwarm',
    vmax=1.0,
    square=True,
    linecolor="black",
    linewidths=0.25,
    ax=ax,
)
ax.set_title("Data Correlation", fontsize=50)
plt.show()


In [None]:
# Data Visualizations

In [None]:
#Let's have a look at the different variables and their relation with the target variable 'price'

In [None]:
# Scatter 'price' against each square-footage column, one panel per column.
# The column list is defined once and passed to pairplot; it is deliberately
# not called `filter`, which would shadow the Python builtin.
sqft_columns = ['sqft_living', 'sqft_above', 'sqft_living15', 'sqft_lot15']
sns.pairplot(data=df, x_vars=sqft_columns, y_vars='price', kind='scatter')
plt.show()

In [None]:
# sqft_living has a stronger correlation with the price compared to the other variables. Let's focus on sqft_living.

In [None]:
# Joint distribution of living area and price for all sales. Both columns
# were log-transformed earlier in the notebook, hence the log(...) labels.
#
# jointplot() builds its own figure, so no plt.figure() call precedes it
# (that would only leave an empty extra figure behind). The axis labels are
# set through the returned grid so they land on the central scatter axes
# rather than on whichever axes pyplot happens to consider current.
grid = sns.jointplot(x='sqft_living', y='price', data=df, alpha=0.5)
grid.set_axis_labels('log(Sqft Living)', 'log(Sale Price)')
plt.show()

In [None]:
# Let's check the 'zipcode' variable and how important the zip code is for the relation between sqft_living and price.

In [None]:
# Number of distinct zip codes present in the data.
df["zipcode"].nunique()

In [None]:
# Number of sales per zip code (value_counts lists the most frequent first).
df['zipcode'].value_counts()

In [None]:
# Zip code 98103 has the most houses sold.

In [None]:
# Average (log) price per zip code, listed from the most to the least
# expensive zip code.
(
    df.groupby('zipcode', as_index=False)['price']
    .mean()
    .sort_values(by='price', ascending=False)
)

In [None]:
#Most expensive zipcode is 98039

In [None]:
#let's plot sqft_living  for both the zipcodes

In [None]:
# Boolean row masks: the zip code with the most sales (98103) and the zip code
# with the highest average price (98039).
zip98103 = df['zipcode'].eq(98103)
zip98039 = df['zipcode'].eq(98039)

In [None]:
# Living area vs. price restricted to zip code 98103. Both columns were
# log-transformed earlier in the notebook, hence the log(...) labels.
#
# jointplot() builds its own figure, so no plt.figure() call precedes it
# (that would only leave an empty extra figure behind). The axis labels are
# set through the returned grid so they land on the central scatter axes
# rather than on whichever axes pyplot happens to consider current.
grid = sns.jointplot(x='sqft_living', y='price', data=df[zip98103], alpha=0.5)
grid.set_axis_labels('log(Sqft Living)', 'log(Sale Price)')
plt.show()

In [None]:
# The plot of sqft_living vs. price for zip code 98103 is almost identical to the original plot for all zip codes.

In [None]:
# Living area vs. price restricted to zip code 98039. Both columns were
# log-transformed earlier in the notebook, hence the log(...) labels.
#
# jointplot() builds its own figure, so no plt.figure() call precedes it
# (that would only leave an empty extra figure behind). The axis labels are
# set through the returned grid so they land on the central scatter axes
# rather than on whichever axes pyplot happens to consider current.
grid = sns.jointplot(x='sqft_living', y='price', data=df[zip98039], alpha=0.5)
grid.set_axis_labels('log(Sqft Living)', 'log(Sale Price)')
plt.show()

In [None]:
# Zip code 98039 has an interesting plot: sqft_living vs. price shows a strong positive correlation for this zip code.
# We will only include zip code 98039 in our prediction model to get a better result.
# (NOTE: the models fitted further down use all rows of df, not just this zip code.)

In [None]:
# Bedrooms: spread of price within each bedroom count (left panel) and the
# number of houses per bedroom count (right panel).
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(25, 5))
left_ax, right_ax = axes
sns.countplot(data=df, x='bedrooms', ax=right_ax)
sns.boxplot(data=df, x='bedrooms', y='price', ax=left_ax)

In [None]:
# Bathrooms: spread of price within each bathroom count (left panel) and the
# number of houses per bathroom count (right panel).
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(25, 5))
left_ax, right_ax = axes
sns.countplot(data=df, x='bathrooms', ax=right_ax)
sns.boxplot(data=df, x='bathrooms', y='price', ax=left_ax)

In [None]:
# Grade: spread of price within each grade (left panel) and the number of
# houses per grade (right panel).
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(25, 5))
left_ax, right_ax = axes
sns.countplot(data=df, x='grade', ax=right_ax)
sns.boxplot(data=df, x='grade', y='price', ax=left_ax)

In [None]:
# Price by 'waterfront' on a dedicated 5x5 figure, shown immediately; then
# price by 'view', drawn after plt.show() on whatever axes pyplot provides next.
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
sns.boxplot(data=df, x='waterfront', y='price', ax=axes)
plt.show()
sns.boxplot(data=df, x='view', y='price')

In [None]:
# Let's check the location variable 'long' and how it is related to 'price'.

In [None]:
# Joint distribution of price (x axis, log scale) and the 'long' column
# (y axis).
#
# The axis labels now describe the columns actually plotted. They are set
# through the returned grid so they land on the central scatter axes, and no
# plt.figure() call precedes jointplot(), which creates its own figure.
grid = sns.jointplot(x='price', y='long', data=df, alpha=0.5)
grid.set_axis_labels('log(Sale Price)', 'Longitude (long)')
plt.show()

In [None]:
# HOUSE PREDICTIONS

In [None]:
#Now we will use some model to Predict house prices.

In [None]:
# First we try a Linear Regression model to predict the prices.
# Three candidate feature sets of increasing size are compared; all of them
# start from the same six core columns.
_core_features = ['sqft_living', 'grade', 'bathrooms', 'sqft_above', 'sqft_living15', 'lat']

features1 = _core_features + ['sqft_lot15']
features2 = _core_features + ['view', 'bedrooms', 'condition']
features3 = features2 + ['yr_built', 'sqft_lot15', 'floors', 'waterfront', 'zipcode']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression








In [None]:
x= df[features1]
y=df['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [None]:
lm=LinearRegression()

In [None]:
lm.fit(X_train, y_train)

In [None]:
Score1=lm.score(X_test, y_test)

In [None]:
round(Score1,2)

In [None]:
# The predicition score for feature 1 is 71% which is weak so we will try feature 2.

In [None]:
x= df[features2]
y=df['price']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [None]:
lm=LinearRegression()

In [None]:
lm.fit(X_train, y_train)

In [None]:
Score2=lm.score(X_test, y_test)

In [None]:
round(Score2,2)

In [None]:
# The predicition score for feature 2 is 74% . Let see what will be the score with feature3?

In [None]:
x= df[features3]
y=df['price']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [None]:
lm=LinearRegression()

In [None]:
lm.fit(X_train, y_train)

In [None]:
Score3=lm.score(X_test, y_test)

In [None]:
round(Score3,2)

In [None]:
# The prediction score for feature set 3 is 77%, which is stronger than the previous two feature sets, so we will use this one for prediction.

In [None]:
# Intercept of the most recently fitted model (`lm`).
print(lm.intercept_)

In [None]:
# Raw coefficient array of `lm`, in the column order of `x`.
lm.coef_

In [None]:
# Same coefficients as a labelled table: one row per feature column of `x`.
pd.DataFrame({'coef': lm.coef_}, index=x.columns)

In [None]:
# Model predictions for the held-out rows; these are on the log-price scale
# because the target column was log-transformed.
Prediction1=lm.predict(X_test)

In [None]:
# Display the predicted values.
Prediction1

In [None]:
# Display the actual held-out target values for comparison with the predictions.
y_test

In [None]:
from sklearn import metrics


In [None]:
# Mean absolute error between held-out targets and predictions, rounded to two
# decimals (in log-price units, since the target was log-transformed).
round(metrics.mean_absolute_error(y_test, Prediction1),2)

In [None]:
# Mean squared error between held-out targets and predictions, rounded to two
# decimals (in squared log-price units).
round(metrics.mean_squared_error(y_test, Prediction1),2)

In [None]:
# Root mean squared error: square root of the MSE above, rounded to two
# decimals (back in log-price units).
round(np.sqrt(metrics.mean_squared_error(y_test, Prediction1)),2)

In [None]:
# Predicted vs. actual values for the held-out rows. The dashed diagonal spans
# the full range of the target `y` and marks where a perfect prediction would
# fall.
fig, ax = plt.subplots()
ax.scatter(y_test, Prediction1)
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, 'k--', lw=4)
ax.set(xlabel='Price', ylabel='Predicted')
plt.show()
# NOTE: this runs after the figure above has been shown, so it only changes
# the default size of figures created later.
plt.rcParams['figure.figsize'] = (25, 5)

In [None]:
# Distribution of the residuals (actual minus predicted, on the log-price
# scale). A roughly symmetric, zero-centred shape suggests the model is not
# systematically over- or under-predicting.
residuals = y_test - Prediction1

# distplot() is deprecated in recent seaborn releases; histplot() with a KDE
# overlay on a density scale is its replacement. Older seaborn versions that
# lack histplot() fall back to the original call.
if hasattr(sns, 'histplot'):
    sns.histplot(residuals, kde=True, stat='density')
else:
    sns.distplot(residuals)

In [None]:
# Side-by-side table of predicted and actual values (both on the log-price
# scale), indexed like y_test; show the first ten rows.
compare = (
    y_test.to_frame(name='Test Data')
    .assign(Prediction=Prediction1)
    [['Prediction', 'Test Data']]
)
compare.head(10)