In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, mean_squared_log_error
%matplotlib inline
# Load the house sales CSV into the working DataFrame used by every later cell.
df = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')
# Render floats in fixed-point notation ('{:f}') in pandas output.
pd.set_option('float_format', '{:f}'.format)

# Exploring The Data

In [None]:
# (row count, column count) of the raw dataset.
df.shape

In [None]:
# Preview the first rows to see the available columns and their values.
df.head()

In [None]:
# Column dtypes and non-null counts (used to check for missing values).
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

* No missing values to deal with
* Dataset has 21,613 rows and 20 columns 
* ID variable doesn't seem to be very helpful for predicting house prices so I will remove it

In [None]:
# Remove the id column in place.
df.drop('id', axis = 1, inplace = True)

# Analyzing date of sale vs price

In [None]:
# Inspect the date column as loaded, before it is parsed.
df['date']

In [None]:
# Keep the leading 8 characters (YYYYMMDD) of each raw date string and parse
# them into datetime values so time patterns can be analyzed.
df['date'] = pd.to_datetime(df['date'].str[:8], format = '%Y%m%d')
first_sale, last_sale = df['date'].min(), df['date'].max()
# Original author's note: the range is about one year, May 2nd, 2014 to May 27th, 2015.
print(first_sale, last_sale)

In [None]:
# Inspect the date column again to check the result of the datetime conversion.
df['date']

In [None]:
# Derive month and day-of-month columns from the parsed sale date.
sale_date = df['date'].dt
df['month'] = sale_date.month
df['day'] = sale_date.day

In [None]:
# Price against sale date.
plt.figure(figsize = (18, 10))
sns.lineplot(data = df, x = 'date', y = 'price')

In [None]:
# Price distribution for each sale month.
plt.figure(figsize = (18, 10))
sns.boxplot(data = df, x = 'month', y = 'price')

In [None]:
# Price distribution for each day of the month.
plt.figure(figsize = (18, 10))
sns.boxplot(data = df, x = 'day', y = 'price')

Looking at these graphs, we can see that the date of house purchase can't really explain the price of a house, therefore we will remove these columns.

In [None]:
# Remove the sale date and the two columns derived from it, in place.
df.drop(['date', 'day', 'month'], axis = 1, inplace = True)

# Distributions Of Each Variable

In [None]:
# One histogram per remaining column on a 5x4 grid; grid cells with no
# column left to plot are removed from the figure.
cols = df.columns
fig, axes = plt.subplots(nrows = 5, ncols = 4, figsize = (18, 18))
panels = axes.ravel()
for panel, col in zip(panels, cols):
    sns.histplot(data = df, x = col, ax = panel)
for panel in panels[len(cols):]:
    fig.delaxes(panel)

1. Variables like waterfront, view, sqft_basement, yr_renovated have a lot of 0s, and some categorical variables like floors and view are unevenly distributed
    
    a. May affect data visualization and model performance
    
    b. Remove variable if there is no significant difference in price between 0s and non 0s
    
    c. Remove some of the categorical variables if seen necessary later on

2. Numeric variables like price, sqft_living, sqft_above are right-skewed (most values are low, with a long tail of large values)
    
    a. Removing outliers may fix the distribution
    
    b. Normalizing the distribution may benefit specific models

# Looking into variables w/ a lot of 0s

In [None]:
# waterfront
zero_share = len(df.loc[df['waterfront'] == 0]) / len(df)
print(zero_share)  # fraction of rows whose waterfront value is 0
sns.boxplot(data = df, x = 'waterfront', y = 'price')

99% of the houses in this dataset are not near waterfronts. The boxplot shows that the difference in price between 0s and 1s is significant enough so we will keep this variable.

In [None]:
# view
zero_share = len(df.loc[df['view'] == 0]) / len(df)
print(zero_share)  # fraction of rows whose view value is 0
sns.boxplot(data = df, x = 'view', y = 'price')

90% of the houses in this dataset have a *view* grade of 0. The boxplot shows that there is a slight increase in price as the view rating increases, so we will keep this variable.

In [None]:
# Price summary for rows with sqft_basement == 0, then for all other rows.
no_basement = df['sqft_basement'] == 0
print(df.loc[no_basement, 'price'].describe())
print('-------------------------')
print(df.loc[~no_basement, 'price'].describe())

We can also see that the difference in mean between houses w/ and w/out basements is around $140000 which is significant enough to keep this variable.

In [None]:
# yr_renovated
print(len(df.loc[df['yr_renovated'] == 0]) / len(df))  # fraction of rows with yr_renovated == 0
# Only rows with a non-zero renovation year go into the regression plot.
renovated = df.loc[df['yr_renovated'] != 0]
sns.regplot(data = renovated, x = 'yr_renovated', y = 'price', scatter_kws = dict(alpha = .3, s = 10))

95% of the houses in this dataset were never renovated, and the regression plot (ignoring houses that haven't been renovated) shows a slightly positive relationship between *price* and *yr_renovated*  

In [None]:
# Price summary for rows with yr_renovated == 0, then for all other rows.
never_renovated = df['yr_renovated'] == 0
print(df.loc[never_renovated, 'price'].describe())
print('--------------------------')
print(df.loc[~never_renovated, 'price'].describe())

Although the count of houses that have been renovated is way less than the count of houses that haven't (5%:95%), we can still see a slightly higher price mean with houses that have been renovated. 

In [None]:
# sqft_basement
print(len(df.loc[df['sqft_basement'] == 0]) / len(df))  # fraction of rows with sqft_basement == 0
# The name df2 is kept because the following cell filters it by price.
df2 = df.loc[df['sqft_basement'] != 0]
sns.regplot(data = df2, x = 'sqft_basement', y = 'price', scatter_kws = dict(alpha = .3, s = 10))

60% of houses in this dataset do not have basements. The regression plot (ignoring houses with no basements) shows that there is a slightly positive relationship between *sqft_basement* and *price*. There are a couple outliers that are way off the other datapoints and the reg line which may affect our model performance later on, so I will remove those. 

In [None]:
# Rows of df2 priced above 7,000,000, displayed to identify the outlier rows.
df2[df2['price'] > 7000000]

In [None]:
# dropping outliers (rows are addressed by index label, removed in place)
df.drop(index = [3914, 7252], inplace = True)

# Bivariate analysis between independent and dependent variables

In [None]:
# Regression plot of price against each square-footage variable on a 2x3 grid;
# any grid cell without a variable is removed from the figure.
num = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement','sqft_living15', 'sqft_lot15']
fig, axes = plt.subplots(nrows = 2, ncols = 3, figsize = (18, 9))
panels = axes.ravel()
for panel, col in zip(panels, num):
    sns.regplot(data = df, x = col, y = 'price', scatter_kws = dict(alpha = .3, s = 10), ax = panel)
for panel in panels[len(num):]:
    fig.delaxes(panel)

Most numerical variables seem to be well linearly correlated with *price*. Big outlier in *sqft_living*, will drop this. *sqft_lot* and *sqft_lot15* do not look like good predictor variables as the data doesn't seem consistent enough with *price*, therefore I will drop these variables. 

In [None]:
# Remove both lot-size columns in place.
df.drop(['sqft_lot', 'sqft_lot15'], axis = 1, inplace = True)

In [None]:
# Rows with sqft_living above 12,000, displayed to identify the outlier row.
df[df['sqft_living'] > 12000]

In [None]:
# Remove the row with index label 12777 in place.
df.drop(index = [12777], inplace = True)

In [None]:
# Box plot of price for each discrete-valued variable on a 2x4 grid; the
# grid cell left without a variable is removed from the figure.
categorical = ['bedrooms', 'floors', 'waterfront', 'view', 'condition', 'grade', 'bathrooms']
fig, axes = plt.subplots(nrows = 2, ncols = 4, figsize = (25, 9))
panels = axes.ravel()
for panel, col in zip(panels, categorical):
    sns.boxplot(data = df, x = col, y = 'price', ax = panel)
for panel in panels[len(categorical):]:
    fig.delaxes(panel)

Most categorical variables seem to be linearly correlated with *price*. There is an outlier in *bedrooms* where a house has 33 bedrooms. *floors* sees an increase in price until it reaches 3 floors, where it unusually drops. We saw in univariate analysis that *floors* was not distributed well, which may have caused the unusual drop in price, therefore I will drop this column. I will also drop the outlier in *bedrooms*.

In [None]:
# Display the row(s) listing 33 bedrooms to find the outlier's index label.
df[df['bedrooms'] == 33]

In [None]:
# drop bedroom outlier (row with index label 15870, removed in place)
df.drop(index = [15870], inplace = True)

In [None]:
# Remove the floors column in place.
df.drop('floors', axis = 1, inplace = True)

# Normalizing distribution of numeric variables

To normalize *price*, *sqft_above*, and *sqft_living*, I will remove some outliers and use log transformation.

In [None]:
# Box plot of the raw price, drawn before any rows are removed.
sns.boxplot(x = df['price'])
# Remove the rows priced above 4,000,000 from df itself. The original cell
# stored the result of the drop in an unused variable, so the outliers were
# never actually removed. Selecting by condition instead of a hard-coded list
# of index labels also keeps the cell valid if earlier drops change.
df.drop(index = df[df['price'] > 4000000].index, inplace = True)
# Log-transform price to reduce its skew; this must run after the threshold
# filter above, which is expressed in untransformed price units.
df['price'] = np.log2(df['price'])

In [None]:
# Distribution of price after the log2 transform.
sns.histplot(data = df, x = 'price')

In [None]:
# Box plot of the raw values, then replace the column with its base-2 log.
sns.boxplot(data = df, x = 'sqft_living')
df['sqft_living'] = df['sqft_living'].pipe(np.log2)

In [None]:
# Distribution of sqft_living after the log2 transform.
sns.histplot(data = df, x = 'sqft_living')

In [None]:
# Box plot of the raw values.
sns.boxplot(data = df, x = 'sqft_above')
# Remove the row with index label 18302 (presumably the sqft_above > 8000
# outlier seen in the box plot), then replace the column with its base-2 log.
df.drop(index = [18302], inplace = True)
df['sqft_above'] = df['sqft_above'].pipe(np.log2)

In [None]:
# Distribution of sqft_above after the log2 transform.
sns.histplot(data = df, x = 'sqft_above')

# Model Testing

In [None]:
from sklearn.metrics import r2_score
# Target is the (log2-transformed) price; every other remaining column is a feature.
y = df['price']
X = df.drop('price', axis = 1)
# Model labels, in the same order in which the scores are appended to `final`.
models = ['Linear', 'Ridge', 'Lasso', 'RandomForest', 'XGBoost']
# Collects one mean cross-validated R2 score per model.
final = []

In [None]:
# Linear Regression
# 10-fold cross-validation; the mean fold R2 is appended to `final`.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
kf = KFold(n_splits = 10)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    mod = LinearRegression().fit(X_train, y_train)
    pred = mod.predict(X_test)
    # r2_score takes (y_true, y_pred). R2 is not symmetric, so passing the
    # predictions first (as the original did) reports a different number.
    scores.append(r2_score(y_test, pred))
final.append(np.mean(scores))

In [None]:
# Ridge Regression
# 10-fold cross-validation with alpha = 2; the mean fold R2 is appended to `final`.
from sklearn import linear_model
kf = KFold(n_splits = 10)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    mod = linear_model.Ridge(alpha = 2).fit(X_train, y_train)
    pred = mod.predict(X_test)
    # r2_score takes (y_true, y_pred). R2 is not symmetric, so passing the
    # predictions first (as the original did) reports a different number.
    scores.append(r2_score(y_test, pred))
final.append(np.mean(scores))

In [None]:
# Lasso Regression
# 10-fold cross-validation with the default Lasso settings; the mean fold R2
# is appended to `final`.
# Import Lasso directly: the original cell imported RandomForestRegressor by
# mistake and only worked because `linear_model` leaked from the Ridge cell.
from sklearn.linear_model import Lasso
kf = KFold(n_splits = 10)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    mod = Lasso().fit(X_train, y_train)
    pred = mod.predict(X_test)
    # r2_score takes (y_true, y_pred). R2 is not symmetric, so passing the
    # predictions first (as the original did) reports a different number.
    scores.append(r2_score(y_test, pred))
final.append(np.mean(scores))

In [None]:
# Random Forest Regression
# 10-fold cross-validation; the mean fold R2 is appended to `final`.
from sklearn.ensemble import RandomForestRegressor
kf = KFold(n_splits = 10)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Fixed seed so the reported score is reproducible across runs.
    mod = RandomForestRegressor(random_state = 0).fit(X_train, y_train)
    pred = mod.predict(X_test)
    # r2_score takes (y_true, y_pred). R2 is not symmetric, so passing the
    # predictions first (as the original did) reports a different number.
    scores.append(r2_score(y_test, pred))
final.append(np.mean(scores))

In [None]:
# XGBoost
# 10-fold cross-validation with default parameters; the mean fold R2 is
# appended to `final`.
from xgboost import XGBRegressor
import warnings
# Silence UserWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=UserWarning)
kf = KFold(n_splits = 10)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    mod = XGBRegressor().fit(X_train, y_train)
    pred = mod.predict(X_test)
    # r2_score takes (y_true, y_pred). R2 is not symmetric, so passing the
    # predictions first (as the original did) reports a different number.
    scores.append(r2_score(y_test, pred))
final.append(np.mean(scores))

In [None]:
# Horizontal bar chart of the mean cross-validated R2 score of each model.
df2 = pd.DataFrame({'Model': models, 'R2 Score': final})
sns.barplot(data = df2, x = 'R2 Score', y = 'Model', orient = 'h')

XGBoost (without any parameter changes) seems to be the best model with an R2 score of .88, so I will optimize its parameters using GridSearch to get the best R2 score

In [None]:
from sklearn.model_selection import GridSearchCV
mod = XGBRegressor(n_jobs = 5)
# Hold out a third of the data; X_test / y_test are not seen by the search.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# 4 x 4 x 4 = 64 parameter combinations, each scored by 10-fold CV on R2.
params = {'n_estimators': [100, 250, 500, 750], 'max_depth': [3, 5, 7, 9], 'learning_rate' : [.1, .3, .5, .7]}
grid = GridSearchCV(mod, scoring = 'r2', cv = 10, param_grid = params, n_jobs = 5)
# Fit on the training split only. The original fit on the full X, y, which
# left the split above unused and no data unseen for a final evaluation.
grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated score (R2, per scoring = 'r2') of the best parameter combination.
print(grid.best_score_)

With GridSearch, my final R2 score was around .89