id - Unique ID for each home sold <br>

date - Date of the home sale <br>

price - Price of each home sold <br>

bedrooms - Number of bedrooms <br>

bathrooms - Number of bathrooms, where .5 accounts for a room with a toilet but no shower <br>

sqft_living - Square footage of the apartment's interior living space <br>

sqft_lot - Square footage of the land space <br>

floors - Number of floors <br>

waterfront - A dummy variable for whether the apartment was overlooking the waterfront or not <br>

view - An index from 0 to 4 of how good the view of the property was <br>

condition - An index from 1 to 5 on the condition of the apartment <br>

grade - An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design. <br>

sqft_above - The square footage of the interior housing space that is above ground level <br>

sqft_basement - The square footage of the interior housing space that is below ground level <br>

yr_built - The year the house was initially built <br>

yr_renovated - The year of the house’s last renovation <br>

zipcode - What zipcode area the house is in <br>

lat - Latitude <br>

long - Longitude <br>

sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors <br>

sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors <br>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the house-sales dataset from its CSV file into a DataFrame.
df = pd.read_csv('../input/kc-housesales-data/kc_house_data.csv')
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
df.info() # Print each column's dtype and non-null count

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Remove the row identifier and the sale date; neither is used as a
# predictor of price in this analysis.
df = df.drop(columns=['id', 'date'])

In [None]:
# Preview the frame again to confirm the id and date columns are gone.
df.head()

In [None]:
# Split the columns into categorical and numerical ones with a cardinality
# heuristic: a column holding fewer than MAX_CATEGORICAL_UNIQUE distinct
# values is treated as categorical and collected in `categorical_columns`;
# every other column is considered numerical by the cells that follow.
MAX_CATEGORICAL_UNIQUE = 90
categorical_columns = []

for column in df.columns:
    # Count the distinct values once and reuse the result for both the
    # threshold test and the printed report.
    unique_values = len(pd.unique(df[column]))
    if unique_values < MAX_CATEGORICAL_UNIQUE:
        print(f"Unique values in {column} are {unique_values}")
        categorical_columns.append(column)
print('Categorical Columns', categorical_columns)
print('No. of categorical columns',len(categorical_columns))

# Data Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Draw one box plot per numerical (non-categorical) column on a two-column
# grid. The number of rows is derived from the number of columns to plot, so
# the grid never runs out of axes and never shows empty trailing panels.
numeric_columns = [name for name in df.columns if name not in categorical_columns]
n_plot_cols = 2  # how many diagrams we want in each row
# Ceiling division; at least one row so that plt.subplots() stays valid.
n_plot_rows = max(1, -(-len(numeric_columns) // n_plot_cols))

# 6 inches of height per row reproduces the original 20x30 figure for 5 rows.
fig, axs = plt.subplots(ncols=n_plot_cols, nrows=n_plot_rows,
                        figsize=(20, 6 * n_plot_rows))
index = 0

axs = axs.flatten()  # 2-D grid of axes -> flat sequence indexed by plot number
print('length after flatten', len(axs))
print(axs[index])
for name in numeric_columns:
    sns.boxplot(y=name, data=df, ax=axs[index])
    index += 1
# Hide the grid cell left over when the number of plots is odd (or zero).
for spare_axis in axs[index:]:
    spare_axis.set_visible(False)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0) #adjusting padding between figures

In [None]:
# Calculating percentage outlier in all numerical columns.
# A value counts as an outlier when it lies outside the 1.5 * IQR fences:
# below q1 - 1.5 * iqr or above q3 + 1.5 * iqr.
for column in df.columns:
    if column in categorical_columns:
        continue

    column_array = np.asarray(df[column])
    q1, q3 = np.percentile(column_array, [25, 75])
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    # Vectorised comparison replaces the per-element Python loop; the cast
    # keeps `count` a plain int so the printed percentage is formatted as before.
    is_outlier = (column_array < lower_fence) | (column_array > upper_fence)
    count = int(np.count_nonzero(is_outlier))

    print(f'Total outliers in {column} = {((count/len(column_array)) * 100)}%')

In [None]:
# Plot the pairwise correlation matrix of all columns as an annotated heatmap.
corrMatrix = df.corr()
plt.figure(figsize=(25,10)) # Plotting the figure of required size
# Correlation coefficients range over [-1, 1]. The colour scale must cover the
# negative half as well; with vmin=0 every negative correlation was clipped to
# the same colour as "no correlation".
ax = sns.heatmap(corrMatrix, vmin=-1, vmax=1, center=0, annot=True,
                 cmap="YlGnBu", linewidths = 1.0,
                 square=True)

plt.show()

# Creating Model


In [None]:
from sklearn import linear_model

In [None]:
# Separate the target from the predictors by column name rather than by
# position, so the split stays correct even if the column order changes.
# Y is the sale price; X is every remaining column.
X = df.drop('price', axis=1)
Y = df['price']

In [None]:
from sklearn.model_selection import train_test_split 

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 41)

In [None]:
# Create a linear regression model and fit it on the training split only.
regression = linear_model.LinearRegression()
regression.fit(X_train, Y_train)

<h2>Evaluating Model</h2>

In [None]:
# Predict the target for the held-out test features.
Y_pred = regression.predict(X_test)

In [None]:
from sklearn import metrics
# Score the predictions against the held-out targets: first the mean
# absolute error, then the mean squared error.
mae = metrics.mean_absolute_error(Y_test, Y_pred)
mse = metrics.mean_squared_error(Y_test, Y_pred)
print(mae)
print(mse)

In [None]:
# Cross-check: compute the mean squared error by hand from the residuals.
residuals = Y_test - Y_pred
MSE = np.square(residuals).mean()
print(MSE)