In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
import xgboost
import matplotlib. pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let's load the data into a Pandas DataFrame

In [None]:
# Path of the house-sales CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/housesalesprediction/kc_house_data.csv"
df = pd.read_csv(DATA_PATH)


Let's use the head method to get a brief overview

In [None]:
# Peek at the first five rows (five is also head()'s default).
df.head(n=5)

We have 21 columns to work with, but the 'id' and 'date' columns won't be of much use, so let's discard them.

In [None]:
# Remove the identifier and sale-date columns; they are not used as predictors.
unused_columns = ['id', 'date']
df = df.drop(columns=unused_columns)

Let's see the available features that we can work with

In [None]:
# Display the column labels that remain in df at this point.
df.columns

We have 18 features left (plus the target, `price`), all of which appear important, so let's do some quick EDA to determine what to prioritize. All features relating to the size of the house should show a sufficiently linear relationship with our target, i.e. the price, so let's see if that's the case.

In [None]:
# Living area against sale price.
sns.scatterplot(data=df, x='sqft_living', y='price')



Our intuition seems right; let's do the same for the rest of the area-based features just to confirm.

In [None]:
# Scatter every area-based feature against price, one panel per feature.
area_features = [
    'sqft_living', 'sqft_lot', 'sqft_above',
    'sqft_basement', 'sqft_living15', 'sqft_lot15',
]

fig, axes = plt.subplots(1, len(area_features), figsize=(10, 6))
for ax, feature in zip(axes, area_features):
    ax.scatter(df[feature], df['price'])
    ax.set_xlabel(feature, fontsize=8)
    ax.set_ylabel('price', fontsize=8)

# The shared title is attached to the third panel; pad=40 lifts it clear of the axes.
axes[2].set_title('Correlation of house area based features', fontsize=15, pad=40)
plt.tight_layout()
plt.show()

The trend seems to hold, so let's move on to the next most important features, the number of bedrooms and bathrooms

Let's examine the `bedrooms` and `bathrooms` columns to see how they are correlated with the target.

In [None]:
# Number of bedrooms against sale price.
sns.scatterplot(data=df, x='bedrooms', y='price')

As expected, having more bedrooms increases the chance of a higher price, but it cannot exactly be called a strong predictor. Nevertheless, there is usable information, so let's proceed.

In [None]:
# Number of bathrooms against sale price.
sns.scatterplot(data=df, x='bathrooms', y='price')

It's the same with bathrooms, although we see higher-priced houses almost exclusively among houses with 3 or more bathrooms.

Grade parameter is very important as it directly gives us information on the overall quality of the house, which means it will play an important role in our predictor, so let's check it out.

In [None]:
# How many houses fall into each grade.
sns.countplot(data=df, x='grade')

In [None]:
# Grade against sale price. The price is read straight from df because the
# separate target Series is only created further down the notebook; referring
# to it here would fail on a fresh kernel run from top to bottom.
sns.scatterplot(x=df['grade'], y=df['price'])

As expected, the higher grades contain many more expensive houses.

In [None]:
# Give each count plot its own axes. Two bare countplot calls in one cell both
# draw onto the current axes, so the bars would be overlaid on a single chart.
fig, (ax_view, ax_waterfront) = plt.subplots(1, 2, figsize=(10, 4))

sns.countplot(x=df['view'], ax=ax_view)
ax_view.set_title("Houses per 'view' value")

sns.countplot(x=df['waterfront'], ax=ax_waterfront)
ax_waterfront.set_title("Houses per 'waterfront' value")

plt.tight_layout()
plt.show()

Only very few houses have a "view" and a "waterfront", so they can be safely ignored and dropped from the dataset.

In [None]:
# Discard the two sparsely populated columns inspected in the count plots.
sparse_columns = ['waterfront', 'view']
df = df.drop(columns=sparse_columns)

Let's plot a correlation table and a heatmap to get a general idea on how the features are correlated

In [None]:
# Pairwise correlation of every remaining column; kept in df_corr for the heatmap.
df_corr = df.corr()
# Show the (transposed) table, reusing the matrix instead of computing it again.
df_corr.T



In [None]:
# Visualise the correlation matrix as a colour-coded grid.
sns.heatmap(data=df_corr)

Let's begin building the model now. First, we separate the target from the dataset, before splitting the data into training and test sets.

In [None]:
# Pull the prediction target out of df, then remove it from the predictors.
target_column = 'price'
df_target = df[target_column]
df.drop(columns=[target_column], inplace=True)

Put all features in one single list

In [None]:
# Predictor columns passed to the models.
# 'price' is excluded because it is the target (and has been removed from df);
# 'waterfront' and 'view' are excluded because they were dropped from df earlier.
# Selecting any of those three from df would raise a KeyError.
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15']

Splitting our dataset

In [None]:
# Hold out 30% of the rows for evaluation.
# The predictors come from df (the only DataFrame this notebook defines), and
# random_state is fixed so that the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df_target, test_size=0.3, random_state=42
)

Create a 'regressors' list whose items are the regressor models we will fit to our data, each paired with a display name.

In [None]:
# Candidate models, each paired with the label used when reporting its score.
regressors = [
    [LinearRegression(), 'Linear Regression'],
    [KNeighborsRegressor(), 'KNeighborsRegressor'],
    [xgboost.XGBRegressor(), 'XGB Regressor'],
]

In [None]:
# Fit every candidate on the training split and report its score on the
# held-out split. For scikit-learn-style regressors, .score() is the
# coefficient of determination (R^2), printed here as a percentage.
score_list = []  # one score per model, in the same order as `regressors`
for model, model_name in regressors:
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    score_list.append(score)

    print(model_name, 'model score: ' + str(round(score*100, 2)) + '%')
    