Import the required Libraries

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

1. EDA

In [None]:
# Read the house-sales CSV from the input directory and preview the first rows
data_path = "../input/kc_house_data.csv"
house = pd.read_csv(data_path)
house.head()

In [None]:
# 2. Data Understanding

In [None]:
# (rows, columns) of the raw frame
# NOTE(review): confirm the counts quoted in the trailing comment against the displayed output
house.shape # there are 21612 rows with 21 columns

In [None]:
# Column names, non-null counts and dtypes of the raw frame
house.info()

In [None]:
# Summary statistics for every column, including non-numeric ones (include='all')
house.describe(include='all')

In [None]:
# Inspect the structure of the frame, then its shape.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the report).
house.info()
print()  # blank line between the info report and the shape
print(house.shape)

In [None]:
# List the column names of the raw frame
house.columns

In [None]:
# Number of distinct values in each column of the house dataset
house.nunique()

Missing Value Treatment
Let's now move to missing value treatment. 
Let's have a look at the number of missing values

In [None]:
# Count the distinct ids in the house dataset
len(house.id.unique())
# If this is smaller than the row count, some ids repeat and the duplicates have to be dropped.

In [None]:
# Count the rows whose id is not null (compare with the row count to spot missing ids)
house.id.notnull().sum()

In [None]:
# Number of rows whose id already appeared in an earlier row
house.duplicated('id').sum()

In [None]:
# Keep only the first row seen for each id; later rows with a repeated id are discarded
house = house.drop_duplicates(subset=['id'], keep='first')

In [None]:
# Shape after removing rows with duplicate ids
house.shape

In [None]:
# Column-wise count of missing values
house.isnull().sum() # no null values

In [None]:
# Remove the 'id' and 'date' columns, which are not used as predictors of house price
unused_columns = ['id', 'date']
house = house.drop(columns=unused_columns)

In [None]:
# Report the (rows, columns) shape that remains after the drops above
print("So now we are left with",house.shape ,"rows & columns.")

In [None]:
# Check the data type of every remaining column
house.dtypes

Data Exploration & Visualisation
Let's now spend some time doing what is arguably the most important step - understanding the data.
If there is some obvious multicollinearity going on, this is the first place to catch it
Here's where you'll also identify if some predictors directly have a strong association with the outcome variable
We'll visualise our data using matplotlib and seaborn.

In [None]:
#### Visualising Numeric Variables
# Scatter-plot matrix across every column of `house`.
# NOTE(review): this draws one panel per column pair over the whole frame and can be slow;
# consider restricting it to a subset of columns.
sns.pairplot(house)
plt.show()

Visualising Categorical Variables
As you might have noticed, there are a few categorical variables as well. Let's make a boxplot for some of these variables.

In [None]:
# Columns available for the categorical plots below
house.columns

In [None]:
# Preview the first rows of the cleaned frame
house.head()

In [None]:
# Price distribution per level of each discrete feature, one panel per feature on a 2x3 grid
plt.figure(figsize=(20, 12))
for position, column in enumerate(['bedrooms', 'bathrooms', 'floors', 'condition', 'grade'], start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x=column, y='price', data=house)
plt.show()

In [None]:
# Price distribution against waterfront, view, build year and renovation year (2x3 grid, four panels used)
plt.figure(figsize=(20, 12))
for position, column in enumerate(['waterfront', 'view', 'yr_built', 'yr_renovated'], start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x=column, y='price', data=house)
plt.show()

In [None]:
# Column names, for reference when choosing the pairplot variables
house.columns

In [None]:
# Pairwise relationships between the size features and price, coloured by bedroom count.
# `height` is used instead of `size`: seaborn renamed this pairplot keyword in v0.9
# and only keeps the old name as a deprecated alias.
with sns.plotting_context("notebook", font_scale=2.5):
    g = sns.pairplot(house[['sqft_lot','sqft_above','price','sqft_living','bedrooms']],
                     hue='bedrooms', palette='tab20', height=6)
# Blank the x tick labels on every panel of the grid
g.set(xticklabels=[]);

In [None]:
# Pairwise correlation matrix of the columns of `house` (DataFrame.corr with default settings)
house_correlation = house.corr()
house_correlation

In [None]:
# Annotated heatmap of the correlation matrix, labelled with the column names on both axes
f, ax = plt.subplots(figsize=(14, 9))
axis_labels = house_correlation.columns.values
sns.heatmap(
    house_correlation,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    cmap="YlGnBu",
    annot=True,
)
plt.show()

Data Transformation


In [None]:
# Binary indicator columns derived from existing ones. A vectorised comparison replaces
# the per-row lambda; the result is still an integer 0/1 column.
house['basement_present'] = house['sqft_basement'].gt(0).astype('int64')  # 1 if there is a basement, else 0
house['renovated'] = house['yr_renovated'].gt(0).astype('int64')          # 1 if the house has been renovated, else 0

In [None]:
# Preview the frame with the two new indicator columns
house.head()

In [None]:
# Column names after adding the indicator columns
house.columns

Splitting the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split; random_state is fixed so the train and test sets contain the same rows on every run
df_train, df_test = train_test_split(
    house,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

Rescaling the Features
We will use MinMax scaling.

In [None]:
# Preview the unscaled values before rescaling
house.head()

In [None]:
# Min-max scaler instance; it is fitted on the training split in a later cell
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
# Columns to rescale: everything except the 'yes-no' and 'dummy' variables
num_vars = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15', 'price',
]
# Fit the scaler on the training split and overwrite those columns with the scaled values
train_scaled = scaler.fit_transform(df_train[num_vars])
df_train[num_vars] = train_scaled

df_train.head()

Dividing into X and Y sets for the model building

In [None]:
# Separate the target from the predictors. 'price' is removed from df_train in place,
# and X_train is the same object as df_train (not a copy).
y_train = df_train['price']
del df_train['price']
X_train = df_train

Building our model
This time, we will be using the LinearRegression function from SciKit Learn for its compatibility with RFE (which is a utility from sklearn)

RFE
Recursive feature elimination

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# Running RFE with the number of selected variables equal to 10
lm = LinearRegression()

# n_features_to_select is passed by keyword: newer scikit-learn releases make it
# keyword-only, so the positional form RFE(lm, 10) raises a TypeError there.
# The estimator is not fitted beforehand because RFE fits its own clone of it.
rfe = RFE(lm, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Pair every feature name with its RFE selection flag and its RFE ranking
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Features kept by RFE
col = X_train.columns[rfe.support_]
col

In [None]:
# Features eliminated by RFE
X_train.columns[~rfe.support_]

Building model using statsmodel, for the detailed statistics

In [None]:
# Creating the training dataframe restricted to the RFE-selected variables
X_train_rfe = X_train[col]

In [None]:
# Adding a constant (intercept) column, which sm.OLS does not add on its own
import statsmodels.api as sm  
X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
# Fit ordinary least squares on the RFE-selected features; `lm` now refers to the statsmodels result
lm = sm.OLS(y_train,X_train_rfe).fit()   # Running the linear model

In [None]:
# Let's see the summary of our linear model
print(lm.summary())

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = X_train_rfe
# One VIF per column of the design matrix (the 'const' column is part of X_train_rfe here)
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
# Largest VIF first
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Rebuilding the model without const

In [None]:
# Drop the 'const' column from the RFE design matrix
X_train_new = X_train_rfe.drop(["const"], axis = 1)

In [None]:
# Adding a constant variable (this puts an intercept column back onto X_train_new)
import statsmodels.api as sm  
X_train_lm = sm.add_constant(X_train_new)

In [None]:
# Refit ordinary least squares on X_train_lm
lm = sm.OLS(y_train,X_train_lm).fit()   # Running the linear model

In [None]:
# Let's see the summary of the refitted linear model
print(lm.summary())

In [None]:
# Predictor names used by the model (without the constant column)
X_train_new.columns

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

# NOTE(review): X_train_new has no 'const' column; confirm whether the VIFs should be
# computed on a design matrix that includes the intercept.
X = X_train_new
vif = (
    pd.DataFrame({
        'Features': X.columns,
        'VIF': [variance_inflation_factor(X.values, k) for k in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Residual Analysis of the train data

In [None]:
# In-sample predictions from the fitted OLS model, used for the residual analysis below
y_train_price = lm.predict(X_train_lm)

In [None]:
# Importing the required libraries for plots.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Plot the histogram of the error terms (actual minus predicted price on the training split)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; confirm the installed
# version before switching to histplot/displot.
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading 
plt.xlabel('Errors', fontsize = 18)                         # X-label

## Making Predictions

In [None]:
# Same column list that was scaled on the training split
num_vars = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15', 'price',
]

# transform only (no refit): the test split is scaled with the already-fitted scaler
test_scaled = scaler.transform(df_test[num_vars])
df_test[num_vars] = test_scaled

Dividing into X_test and y_test

In [None]:
# Separate the target from the predictors. 'price' is removed from df_test in place,
# and X_test is the same object as df_test (not a copy).
y_test = df_test['price']
del df_test['price']
X_test = df_test

In [None]:
# Now let's use our model to make predictions.

# Keep only the predictors the final model was trained on, then add the constant column
selected_features = X_train_new.columns
X_test_new = sm.add_constant(X_test[selected_features])

In [None]:
# Making predictions on the test split with the fitted OLS model
y_pred = lm.predict(X_test_new)

In [None]:
# Plotting y_test against y_pred to understand the spread.
# Both series are in scaled units, because 'price' is in the num_vars list passed to the scaler.
fig = plt.figure()
plt.scatter(y_test,y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)              # Plot heading 
plt.xlabel('y_test', fontsize=18)                          # X-label
plt.ylabel('y_pred', fontsize=16)                          # Y-label