In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the entries found in the input directory
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the house data CSV from the input directory into a DataFrame
kc = pd.read_csv(filepath_or_buffer='../input/kc_house_data.csv')

In [None]:
# Let pandas render up to 25 columns before truncating wide frames
pd.options.display.max_columns = 25

In [None]:
# Preview the first five rows of the loaded data
kc.head(n=5)

In [None]:
# Inspect the dtype pandas inferred for each column of `kc`
kc.dtypes

Good that all the data is in numerical format of either float64 or int64

In [None]:
# Count missing values per column (a column of zeros means nothing is missing)
kc.isna().sum()

We do not have null values

In [None]:
# 'id' and 'date' are not relevant to this analysis, so remove both columns
kc = kc.drop(['id', 'date'], axis='columns')

In [None]:
# Import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Default figure size (width, height) applied to the seaborn plots below
figure_rc = {'figure.figsize': (20, 10)}
sns.set(rc=figure_rc)

In [None]:
# Pairwise correlation between all remaining columns, drawn as an annotated heatmap
corr_matrix = kc.corr()
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu")

In [None]:
# 'condition', 'yr_renovated' and 'zipcode' do not seem to have high correlation,
# so they are removed from the frame
kc = kc.drop(['condition', 'yr_renovated', 'zipcode'], axis='columns')

In [None]:
# Plot the distribution of the target column 'price'
# NOTE(review): newer seaborn releases deprecate distplot in favour of
# histplot/displot — confirm against the installed seaborn version before migrating.
sns.distplot(kc['price'])

From the heatmap we see that 'price' and 'sqft_living' have a high correlation. Let's explore it with a scatterplot.

In [None]:
# Scatter of 'price' (x-axis) against 'sqft_living' (y-axis)
sns.scatterplot(data=kc, x='price', y='sqft_living')

In [None]:
# Do train test split: 70% train / 30% test, with 'price' as the target.
# random_state is fixed so the split (and therefore every score reported
# further down) is identical on each "Restart & Run All".
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    kc.drop(columns='price'),
    kc.price,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Import Regression algorithms
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, ElasticNet, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error

Check how our regression algorithms perform

In [None]:
# Candidate regressors, all fitted on the same training split and scored on the
# same test split.
# random_state is pinned on the estimators whose fitting involves randomness so
# the printed results are reproducible from run to run. XGBRegressor is left at
# its defaults (presumably already seeded by default — TODO confirm for the
# installed xgboost version).
# NOTE(review): the features are passed in unscaled. SGDRegressor, SVR,
# KNeighborsRegressor and MLPRegressor are sensitive to feature scale, so their
# scores here are likely pessimistic — consider a scaling pipeline for them.
regressors = [
    DecisionTreeRegressor(random_state=42),
    LinearRegression(),
    Lasso(max_iter=2000),
    Ridge(),
    RidgeCV(),
    ElasticNet(max_iter=2000),
    RANSACRegressor(random_state=42),
    SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
    HuberRegressor(),
    BayesianRidge(),
    AdaBoostRegressor(n_estimators=250, learning_rate=0.1, random_state=42),
    GradientBoostingRegressor(random_state=42),
    RandomForestRegressor(n_estimators=100, random_state=42),
    BaggingRegressor(random_state=42),
    ExtraTreesRegressor(n_estimators=100, random_state=42),
    SVR(gamma='scale'),
    KNeighborsRegressor(),
    XGBRegressor(),
    MLPRegressor(max_iter=500, random_state=42),
]

for rgrs in regressors:
    rgrs.fit(X_train, y_train)
    name = rgrs.__class__.__name__
    # Predict once and reuse the predictions for both metrics; calling
    # rgrs.score(X_test, y_test) would run predict a second time to compute
    # the same R^2 value.
    preds = rgrs.predict(X_test)

    print('='*30)
    print(name)
    print('****Results****')
    # 'Score' is the coefficient of determination (R^2) on the test split
    print(f'Score : {r2_score(y_test, preds)}')
    # Square root of the mean squared error, i.e. RMSE in the units of 'price'
    print(f'RMSE : {mean_squared_error(y_test, preds)**0.5}')

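We see that ExtraTreesRegressor performs best, with an R² score of about 0.88 on the test set (the `Score` printed above is the coefficient of determination, not a classification accuracy), while SGDRegressor performs badly. Let's explore which features have the biggest impact on predictions for the best- and worst-performing algorithms.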

In [None]:
# Import PermutationImportance which provides insight into feature importance
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Feature importance for ExtraTreesRegressor
# The model uses the same n_estimators as in the comparison loop. Both the
# forest and the permutation shuffles are seeded so the importance table shown
# below is reproducible from run to run.
etr_model = ExtraTreesRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
# Importances are measured on the held-out test split using the fitted model
perm_etr = PermutationImportance(etr_model, random_state=42).fit(X_test, y_test)
eli5.show_weights(perm_etr, feature_names=X_test.columns.tolist())

We see that features like 'lat', 'grade', 'sqft_living' and 'long' have higher impact when making predictions for ExtraTreesRegressor so it performs well

In [None]:
# Feature importance for SGDRegressor
# Use the same hyperparameters as the SGDRegressor evaluated in the comparison
# loop (max_iter=1000, tol=1e-3) so the importances describe the model whose
# score was reported; on older scikit-learn releases the bare defaults differ.
# Both the model and the permutation shuffles are seeded for reproducibility.
sgdr_model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42).fit(X_train, y_train)
# Importances are measured on the held-out test split using the fitted model
perm_sgdr = PermutationImportance(sgdr_model, random_state=42).fit(X_test, y_test)
eli5.show_weights(perm_sgdr, feature_names=X_test.columns.tolist())

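We see that features like 'sqft_lot' and 'sqft_lot15' have the highest impact on SGDRegressor's predictions, hence it does not perform well. Note that the features were not standardized before fitting, and SGD-based models are sensitive to feature scale, so this likely contributes both to the poor score and to the dominance of these columns.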