In [None]:
#https://www.kaggle.com/harlfoxem/housesalesprediction/tasks?taskId=955

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, mean_absolute_error

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Render every float in DataFrame/Series output with two decimal places.
pd.set_option('display.float_format', "{:.2f}".format)

In [None]:
# Load the house-sales CSV from the Kaggle input directory.
csv_path = '../input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Peek at the first two rows of the raw data.
df.iloc[:2]

In [None]:
# df1 is the working frame: the raw data without the 'id' and 'date' columns.
print('Creating a new dataframe by dropping ID and Date Columns.')
df1 = df.drop(columns=['id', 'date'])

In [None]:
# Summary statistics of every column, rounded to one decimal.
df1.describe().round(1)

Checking for Null items in the entire dataset

In [None]:
# Total number of null cells across the whole frame.
df1.isna().to_numpy().sum()

Checking for NaN items in the entire dataset

In [None]:
# Total number of NaN cells across the whole frame.
np.isnan(df1).to_numpy().sum()

So, there are no NaN values or Null cells in the dataset

## Exploratory Data Analysis

In [None]:
# Horizontal box plot of sale prices, annotated with its summary statistics and
# with the number of houses on either side of the upper limit q3 + 1.5*IQR.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
# Pass the series as x= : the positional form was deprecated in seaborn 0.11 and
# is read as `data` from 0.12 on. ax=ax pins the plot to the axes annotated below.
sns.boxplot(x=df1['price'], ax=ax)
ax.set(title='Box Plot')
ax.set_ylim(-0.7, 0.7)

q1 = df1['price'].quantile(0.25)
q2 = df1['price'].quantile(0.5)
q3 = df1['price'].quantile(0.75)
iqr = q3 - q1
# Named price_min / price_max so the built-in min() and max() are not shadowed
# for the rest of the kernel session.
price_min = df1['price'].min()
max_iqr = q3 + 1.5*iqr
price_max = df1['price'].max()

ax.annotate(str(price_min), (price_min-90000, 0.23))
ax.annotate(str(q1), (q1, 0.43))
ax.annotate(str(q2), (q2, -0.45))
ax.annotate(str(q3), (q3, 0.2))
ax.annotate(str(max_iqr), (max_iqr, 0.2))
ax.annotate(str(price_max), (price_max-40000, 0.05))
ax.annotate(str('4000000'), (4000000, 0.05))

# Number of values which are below and above MAX (q3+1.5iqr)
below_maxiqr = df1['price'][df1['price'] <= max_iqr].count()
above_maxiqr = df1['price'][df1['price'] > max_iqr].count()

ax.annotate(str(below_maxiqr)+' Houses below '+str(max_iqr), (price_min, -0.6))
ax.annotate(str(above_maxiqr)+' Houses above '+str(max_iqr)+' : Outliers', (1500000, -0.6))

Price Column Outliers Analysis

In [None]:
# Percentage of houses priced above the upper limit q3 + 1.5*IQR. Uses the
# counts computed from the data (below_maxiqr / above_maxiqr) rather than
# numbers copied in by hand, so the figure stays correct if the data changes.
outliers = above_maxiqr / (below_maxiqr + above_maxiqr) * 100
print('There are '+str(round(outliers,3))+'% outliers in the house prices.')

Removing the rows in the dataset where the price is more than 4000000 - as they are outliers from the above Box Plot:

In [None]:
# Keep only houses priced below 4,000,000.
df1 = df1.loc[df1['price'].lt(4000000)]

In [None]:
# Horizontal box plot of the remaining sale prices, annotated with its summary
# statistics and the number of houses on either side of q3 + 1.5*IQR.
fig, ax = plt.subplots(1, 1, figsize=(23, 6))
# Pass the series as x= : the positional form was deprecated in seaborn 0.11 and
# is read as `data` from 0.12 on. ax=ax pins the plot to the axes annotated below.
sns.boxplot(x=df1['price'], ax=ax)
ax.set(title='Box Plot')
ax.set_ylim(-0.7, 0.7)

q1 = df1['price'].quantile(0.25)
q2 = df1['price'].quantile(0.5)
q3 = df1['price'].quantile(0.75)
iqr = q3 - q1
# Named price_min / price_max so the built-in min() and max() are not shadowed
# for the rest of the kernel session.
price_min = df1['price'].min()
max_iqr = q3 + 1.5*iqr
price_max = df1['price'].max()

ax.annotate(str(price_min), (price_min, 0.23))
ax.annotate(str(q1), (q1, 0.43))
ax.annotate(str(q2), (q2, -0.45))
ax.annotate(str(q3), (q3, 0.2))
ax.annotate(str(max_iqr), (max_iqr, 0.2))
ax.annotate(str(round(price_max, 1)), (price_max, -0.25))

# Number of values which are below and above MAX (q3+1.5iqr)
below_maxiqr = df1['price'][df1['price'] <= max_iqr].count()
above_maxiqr = df1['price'][df1['price'] > max_iqr].count()

ax.annotate(str(below_maxiqr)+' Houses below '+str(max_iqr), (price_min, -0.6))
ax.annotate(str(above_maxiqr)+' Houses above '+str(max_iqr)+' : Outliers', (700000, -0.6))

Removed 12 houses whose prices are outliers (>4mil)

In [None]:
# Number of rows removed so far. Row counts are taken with len() because
# `count()[0]` relies on positional [] access to a label-indexed Series, which
# pandas has deprecated.
len(df) - len(df1)

In [None]:
# Annotated heatmap of the pairwise correlations between all df1 columns.
fig, ax = plt.subplots(figsize=(15, 10))
corr_matrix = df1.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdYlGn', ax=ax)

The above heatmap gives the correlation of Price with other factors. Factors like number of bathrooms, living area, area above the ground, grade and sqft living15 are highly positively correlated with Price.

Other factors like number of bedrooms, number of floors, waterfront, view, basement area and latitude are weakly positively correlated with Price.

We will use this information in our model building at a later stage.

In [None]:
from scipy import stats

# Pearson correlation with price (rounded to 3 decimals) and its p-value
# (rounded to 5) for every column of df1, indexed by feature name.
records = []
for col in df1:
    result = stats.pearsonr(df1['price'], df1[col])
    records.append({
        'correlation': round(result[0], 3),
        'p-value': round(result[1], 5),
        'feature': col,
    })
corr_df = pd.DataFrame(records, columns=['correlation', 'p-value', 'feature'])
corr_df = corr_df.set_index('feature')

In [None]:
# Features ordered from the strongest positive correlation downwards.
corr_df.sort_values('correlation', ascending=False)

The correlation values of various factors with price in descending order

In [None]:
# Distinct values per column, fewest first.
unique_counts = df1.nunique()
unique_counts.sort_values()

Number of unique values in each factor column. This gives a good indication of which variables are categorical: the five columns with the fewest unique values (the top 5 rows above) could be treated as categorical variables.

**Data visualisation of each column, and outlier elimination**

Note: I have used the 'df' dataframe, not 'df1', to visualise the original data, but performed the outlier elimination on 'df1'.

In [None]:
# Number of houses per bedroom count in the original data, each bar labelled.
fig, ax = plt.subplots(figsize=(20, 6))
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['bedrooms'], ax=ax)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+100))

In [None]:
# Keep houses with fewer than 9 bedrooms.
has_under_9_bedrooms = df1['bedrooms'] < 9
df1 = df1.loc[has_under_9_bedrooms]
print('Removed data of houses where number of bedrooms is 9 or more')

In [None]:
# Keep houses with at least one bedroom.
has_bedrooms = df1['bedrooms'] > 0
df1 = df1.loc[has_bedrooms]
print('Removed data of houses where number of bedrooms is 0 or less')

In [None]:
# Number of houses per bathroom count in the original data, each bar labelled.
fig, ax = plt.subplots(figsize=(20, 6))
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['bathrooms'], ax=ax)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+100))

In [None]:
# Keep houses with more than 0.5 and fewer than 4.75 bathrooms.
below_upper = df1['bathrooms'] < 4.75
above_lower = df1['bathrooms'] > 0.5
df1 = df1.loc[below_upper & above_lower]
print('Removed data of houses where number of bathrooms is 4.75 or more')
print('Removed data of houses where number of bathrooms is 0.5 or less')

In [None]:
# Histogram of sqft_living in the original data: x ticks every tenth of the
# value range, each bar labelled with its height.
x = 'sqft_living'
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(df[x])
ax.set_xlabel(x)
miny, maxy = df[x].min(), df[x].max()
ticksize = (maxy - miny) / 10
ax.xaxis.set_ticks(np.arange(miny, maxy + ticksize, ticksize))
for bar in ax.patches:
    label_xy = (bar.get_x() + ticksize/3, bar.get_height() + 100)
    ax.annotate(str(bar.get_height()), label_xy)

In [None]:
# Keep houses whose sqft_living is below the cut-off.
x, val = 'sqft_living', 7850
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Histogram of sqft_lot in the original data: x ticks every tenth of the
# value range, each bar labelled with its height.
x = 'sqft_lot'
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(df[x])
ax.set_xlabel(x)
miny, maxy = df[x].min(), df[x].max()
ticksize = (maxy - miny) / 10
ax.xaxis.set_ticks(np.arange(miny, maxy + ticksize, ticksize))
for bar in ax.patches:
    label_xy = (bar.get_x() + ticksize/3, bar.get_height() + 100)
    ax.annotate(str(bar.get_height()), label_xy)

In [None]:
# Keep houses whose sqft_lot is below the cut-off.
# 444233 rather than 496000: with a 496000 cut-off some outliers beyond
# 444233 still remained.
x, val = 'sqft_lot', 444233
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Number of houses per floor count in the original data, each bar labelled.
fig, ax = plt.subplots()
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['floors'], ax=ax)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+100))

In [None]:
# Keep houses with fewer than 3.5 floors.
x, val = 'floors', 3.5
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Number of houses per waterfront value in the original data, each bar labelled.
fig, ax = plt.subplots()
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['waterfront'], ax=ax)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+200))

In [None]:
# Number of houses per view value in the original data, each bar labelled.
fig, ax = plt.subplots()
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['view'], ax=ax)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+200))

In [None]:
# Number of houses per condition value in the original data, each bar labelled.
fig, ax = plt.subplots()
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['condition'], ax=ax)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+100))

In [None]:
# Number of houses per grade in the original data, each bar labelled.
fig, ax = plt.subplots()
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['grade'], ax=ax)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+100))

In [None]:
# Keep houses graded above 3.
x, val = 'grade', 3
df1 = df1.loc[df1[x] > val]
print('Removed data of houses where '+x+' is '+str(val)+' or less.')

In [None]:
# Keep houses graded below 13.
x, val = 'grade', 13
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Histogram of sqft_above in the original data: x ticks every tenth of the
# value range, each bar labelled with its height.
x = 'sqft_above'
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(df[x])
ax.set_xlabel(x)
miny, maxy = df[x].min(), df[x].max()
ticksize = (maxy - miny) / 10
ax.xaxis.set_ticks(np.arange(miny, maxy + ticksize, ticksize))
for bar in ax.patches:
    label_xy = (bar.get_x() + ticksize/3, bar.get_height() + 100)
    ax.annotate(str(bar.get_height()), label_xy)

In [None]:
# Keep houses whose sqft_above is below the cut-off.
x, val = 'sqft_above', 6674
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Histogram of sqft_basement in the original data: x ticks every tenth of the
# value range, each bar labelled with its height.
x = 'sqft_basement'
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(df[x])
ax.set_xlabel(x)
miny, maxy = df[x].min(), df[x].max()
ticksize = (maxy - miny) / 10
ax.xaxis.set_ticks(np.arange(miny, maxy + ticksize, ticksize))
for bar in ax.patches:
    label_xy = (bar.get_x() + ticksize/3, bar.get_height() + 100)
    ax.annotate(str(bar.get_height()), label_xy)

In [None]:
# Keep houses whose sqft_basement is below the cut-off.
# 2410 rather than 2892: more outliers were spotted after removing only
# those above 2892.
x, val = 'sqft_basement', 2410
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Number of houses per construction year in the original data, each bar labelled.
fig, ax = plt.subplots(figsize=(20, 6))
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['yr_built'], ax=ax)
ax.xaxis.set_tick_params(rotation=90)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+1))

In [None]:
# Number of houses per yr_renovated value in the original data, each bar labelled.
fig, ax = plt.subplots(figsize=(20, 6))
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['yr_renovated'], ax=ax)
ax.xaxis.set_tick_params(rotation=90)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+1))

Year Renovated and Year Built are left as they are, without removing outliers: their correlation with price is very low, so both columns will be dropped in model building.

In [None]:
# Number of houses per zipcode in the original data, each bar labelled.
fig, ax = plt.subplots(figsize=(20, 6))
# x= keyword: passing the data positionally was deprecated in seaborn 0.11
# and is no longer read as x from 0.12 on.
sns.countplot(x=df['zipcode'], ax=ax)
ax.xaxis.set_tick_params(rotation=90)
for bar in ax.patches:
    ax.annotate(str(bar.get_height()), (bar.get_x(), bar.get_height()+1))

In [None]:
# Histogram of sqft_living15 in the original data: x ticks every tenth of the
# value range, each bar labelled with its height.
x = 'sqft_living15'
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(df[x])
ax.set_xlabel(x)
miny, maxy = df[x].min(), df[x].max()
ticksize = (maxy - miny) / 10
ax.xaxis.set_ticks(np.arange(miny, maxy + ticksize, ticksize))
for bar in ax.patches:
    label_xy = (bar.get_x() + ticksize/3, bar.get_height() + 80)
    ax.annotate(str(bar.get_height()), label_xy)

In [None]:
# Keep houses whose sqft_living15 is below the cut-off.
x, val = 'sqft_living15', 5048
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Histogram of sqft_lot15 in the original data: x ticks every tenth of the
# value range, each bar labelled with its height.
x = 'sqft_lot15'
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(df[x])
ax.set_xlabel(x)
miny, maxy = df[x].min(), df[x].max()
ticksize = (maxy - miny) / 10
ax.xaxis.set_ticks(np.arange(miny, maxy + ticksize, ticksize))
for bar in ax.patches:
    label_xy = (bar.get_x() + ticksize/3, bar.get_height() + 100)
    ax.annotate(str(bar.get_height()), label_xy)

In [None]:
# Keep houses whose sqft_lot15 is below the cut-off.
x, val = 'sqft_lot15', 261816
df1 = df1.loc[df1[x] < val]
print('Removed data of houses where '+x+' is '+str(val)+' or more.')

In [None]:
# Scatter of every house by latitude (x) and longitude (y), coloured by price.
# Axis labels are spelled correctly and the colour bar is labelled so the
# figure can be read on its own.
fig, ax = plt.subplots(figsize=(20, 7))
points = ax.scatter(df['lat'], df['long'], c=df['price'], cmap='RdYlGn')
ax.set_xlabel('Latitude')
ax.set_ylabel('Longitude')
fig.colorbar(points, ax=ax, label='price')

The houses between Lat: 47.5 and 47.75, Long: -122.4 and -121.8 are more expensive than others as given in the Price Colour bar

## Price vs Other factors - visualisation

In [None]:
# Price distribution for each bedroom count (cleaned data).
fig, ax = plt.subplots(figsize=(20, 7))
x = df1['bedrooms']
y = df1['price']
# x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
sns.boxplot(x=x, y=y, ax=ax)

The price seems to increase as the number of bedrooms increases

In [None]:
# Price distribution for each bathroom count (cleaned data).
fig, ax = plt.subplots(figsize=(20, 7))
x = df1['bathrooms']
y = df1['price']
# x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
sns.boxplot(x=x, y=y, ax=ax)

The price seems to increase with number of bathrooms

In [None]:
# Price distribution for each floor count (cleaned data).
fig, ax = plt.subplots(figsize=(20, 7))
x = df1['floors']
y = df1['price']
# x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
sns.boxplot(x=x, y=y, ax=ax)

The price seems to increase very slightly with number of floors, except for floors >=3

In [None]:
# Price distribution for each view value (cleaned data).
fig, ax = plt.subplots(figsize=(20, 7))
x = df1['view']
y = df1['price']
# x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
sns.boxplot(x=x, y=y, ax=ax)

In [None]:
# Price distribution for each condition value (cleaned data).
fig, ax = plt.subplots(figsize=(20, 7))
x = df1['condition']
y = df1['price']
# x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
sns.boxplot(x=x, y=y, ax=ax)

In [None]:
# Price distribution for each grade (cleaned data).
fig, ax = plt.subplots(figsize=(20, 7))
x = df1['grade']
y = df1['price']
# x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
sns.boxplot(x=x, y=y, ax=ax)

Grade is highly positively correlated with Price.

In [None]:
# Price against sqft_living with a fitted regression line.
sns.lmplot(data=df1, x='sqft_living', y='price')

In [None]:
# Price against sqft_lot with a fitted regression line.
sns.lmplot(data=df1, x='sqft_lot', y='price')

In [None]:
# Price against sqft_above with a fitted regression line.
sns.lmplot(data=df1, x='sqft_above', y='price')

In [None]:
# Price against sqft_basement with a fitted regression line.
sns.lmplot(data=df1, x='sqft_basement', y='price')

In [None]:
# Price against sqft_living15 with a fitted regression line.
sns.lmplot(data=df1, x='sqft_living15', y='price')

In [None]:
# Price against sqft_lot15 with a fitted regression line.
sns.lmplot(data=df1, x='sqft_lot15', y='price')

In [None]:
# Price against yr_built with a fitted regression line.
sns.lmplot(data=df1, x='yr_built', y='price')

Almost no correlation between year built and price

In [None]:
# Price against yr_renovated with a fitted regression line.
sns.lmplot(data=df1, x='yr_renovated', y='price')

The correlation between year renovated and price is 0.124 - very low, so this column can be dropped

## Model Building

In [None]:
# Columns used as model inputs.
model_ft = [
    'sqft_living',
    'grade',
    'sqft_living15',
    'sqft_above',
    'bathrooms',
    'view',
    'lat',
    'bedrooms',
    'sqft_basement',
    'floors',
    'waterfront',
]
n_features = len(model_ft)
print('We will use these Features to build the model : '+str(model_ft))
print('Number of features: '+str(n_features))

In [None]:
# Feature matrix and target, then a 75/25 train/test split with a fixed seed.
X, y = df1[model_ft], df1['price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## Multi Linear Regression

In [None]:
# Ordinary least-squares model fitted on the training split.
linear = linear_model.LinearRegression()
linear.fit(X=X_train, y=y_train)

In [None]:
# Show the coefficients learned by the fitted linear model.
linear.coef_

In [None]:
# Pair each feature name with its fitted coefficient.
model_coef = pd.DataFrame({
    'Feature': model_ft,
    'Coefficient': linear.coef_,
})
model_coef

In [None]:
# Predicted prices for the held-out test split.
yhat_linear = linear.predict(X=X_test)

In [None]:
# Score the predictions on the test split: R^2, mean absolute error and
# mean squared error.
r2_linear = r2_score(y_test, yhat_linear)
mae_linear = mean_absolute_error(y_test, yhat_linear)
mse_linear = mean_squared_error(y_test, yhat_linear)

print('The r2 value (Goodness of Fit) is: '+str(r2_linear))
print('The Mean Absolute Error value is: '+str(mae_linear))
print('The Mean Squared Error is: '+str(mse_linear))

## Upvote my submission if you found it helpful. Thank you!