In [None]:
from math import radians

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error,accuracy_score,mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler


In [None]:
def CorrMtx(df, dropDuplicates = True):
    """Draw an annotated heatmap of a correlation matrix.

    Parameters
    ----------
    df : array-like (typically a pandas DataFrame)
        An already-computed, square correlation matrix, e.g. the result of
        ``DataFrame.corr()``. This function does not compute correlations.
    dropDuplicates : bool, default True
        When True, hide the diagonal and the upper-right triangle so that
        every pair of variables is shown only once.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 22x19 inch figure.
    """
    # Mask the diagonal and upper triangle when duplicates should be hidden.
    # The builtin `bool` is used because the `np.bool` alias was removed from
    # NumPy (1.24) and raises AttributeError there.
    mask = None
    if dropDuplicates:
        mask = np.zeros_like(df, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

    # Set background color / chart style
    sns.set_style(style='white')

    # Set up matplotlib figure
    _, ax = plt.subplots(figsize=(22, 19))

    # Diverging colormap between hue 250 and hue 10
    cmap = sns.diverging_palette(250, 10, as_cmap=True)

    # A single call covers both cases: mask=None draws the full matrix.
    sns.heatmap(df, mask=mask, cmap=cmap,
                square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax, annot=True)


Let's load and examine the data

In [None]:
# Read the sales data once. `checkhouse` is an untouched copy of the raw frame
# that is only used at the end of the notebook to count how many rows the
# cleaning steps removed, so there is no need to parse the CSV a second time.
house = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')
checkhouse = house.copy()

In [None]:
house.head()

In [None]:
house.describe()

In [None]:
#formatting check
(house['date'][0][:8])

In [None]:
# Keep the first 8 characters of the raw date string (presumably the YYYYMMDD
# part -- see the formatting check above). The vectorised `.str` accessor
# avoids a Python-level loop and does not rely on the index being 0..n-1.
house['newdate'] = house['date'].str[:8]

In [None]:
# Per-column count and share of missing values, largest first.
null_flags = house.isnull()
total = null_flags.sum().sort_values(ascending=False)
# The mean of a boolean column is the fraction of True values (sum / row count).
percent = null_flags.mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Great, no missing data!

In [None]:
categorical_feats = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition', 'grade', 'zipcode', 'yr_built', 'yr_renovated']

Let's visually explore the data with graphs

In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x=house.condition, y=house.price)
plt.ylabel('Price in Millions')
plt.ylim(0,1250000)
plt.xlabel('Condition')
plt.show()

In [None]:
plt.figure(figsize = (12,6))
sns.boxplot(x=house.grade, y=house.price)
plt.ylabel('Price in Millions')
plt.ylim(0,6e6)
plt.xlabel('Grade')
plt.show()

In [None]:
plt.figure(figsize = (15,6))
sns.boxplot(x=house.zipcode, y=house.price)
plt.ylabel('Price in Millions')
plt.ylim(0,3.5e6)
plt.xlabel('Zipcode')
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.figure(figsize = (15,6))
sns.scatterplot(x=house.sqft_living, y=house.price, hue=house.bathrooms)
plt.ylabel('Price in Millions')
plt.ylim(0,6e6)
plt.xlim(0,8000)
plt.xlabel('Square Feet of House')
plt.show()

In [None]:
#plt.figure(figsize = (15,6))
#sns.scatterplot(x=house.age, y=house.grade)


Time to get to work! Let's look at some of the outlier data...

In [None]:
questionable = house[house.bedrooms > 10]
questionable

In [None]:
# Remove the listings with more than 10 bedrooms (the rows displayed in
# `questionable` above). Filtering on the condition rather than on hard-coded
# row labels keeps the cell re-runnable (a second run no longer raises
# KeyError) and independent of the row order of the CSV.
house = house.loc[~(house.bedrooms > 10)].copy()

In [None]:
questionable2 = house[house.bedrooms == 0]
questionable2

In [None]:
# Drop every zero-bedroom listing in one vectorised step. The original loop
# copied the whole frame once per removed row and failed with KeyError when
# the cell was run a second time.
house = house.loc[house.bedrooms != 0].copy()

So we have now reduced the dataset by 15 rows to help us get a better prediction model. We eliminated the 33- and 11-bedroom houses,
as these entries appear to be at least partly erroneous. We also removed houses with no bedrooms (not a house!). It is equally hard to believe
that a house has no bathroom, so let's look at that next.

In [None]:
questionable3 = house[house.bathrooms == 0]
questionable3

In [None]:
# Drop every zero-bathroom listing in one vectorised, re-runnable step
# instead of copying the frame once per removed row.
house = house.loc[house.bathrooms != 0].copy()

In [None]:
reno_year = house[house.yr_renovated > 0]
len(reno_year)

It's a bit sad to drop 3 more rows, but 18 out of 20k+ entries is probably insignificant. Let's now work on some feature engineering,
starting with the renovation year. There are 913 entries with renovations, which is not a lot...

In [None]:
# Reference year used for the "years since ..." features.
REFERENCE_YEAR = 2015

# 1 when a renovation year is recorded, else 0 (yr_renovated == 0 is used by
# this notebook as "never renovated").
house['renovated'] = np.where(house.yr_renovated > 0, 1, 0)

# Years since the last renovation; for never-renovated homes fall back to the
# age of the building. Computed in a single vectorised expression instead of a
# Python loop followed by a sentinel comparison against 2015.
house['since_reno'] = np.where(house.yr_renovated == 0,
                               REFERENCE_YEAR - house['yr_built'],
                               REFERENCE_YEAR - house['yr_renovated'])

Maybe having a basement affects price. You can imagine that in a floodplain a basement may not be so great, but other people may appreciate the extra space.

In [None]:
# Basement flag plus area ratios. Note that the two "*_lot_pct" columns are
# percentages of the living area (sqft_living), while PctofLot is the living
# area as a percentage of the lot area.
living_area = house.sqft_living
house['has_basement'] = np.where(house.sqft_basement > 0, 1, 0)
house['basement_lot_pct'] = house.sqft_basement / living_area * 100
house['aboveground_lot_pct'] = house.sqft_above / living_area * 100
house['PctofLot'] = (living_area / house.sqft_lot * 100).round(2)

In [None]:
questionable4 = house[house.PctofLot > 100]
questionable4

In [None]:
#A check to make sure we don't totally destroy the dataset -- no outstanding issues
"""house['likelyApartment'] = np.where(house.PctofLot > 100, 1, 0)
for feat in house.columns:
    plt.figure(figsize = (15,6))
    sns.scatterplot(x=house[feat], y=house.price, hue=house.likelyApartment) 
    plt.ylabel('Price in Millions')
    plt.xlabel(feat.title())
    plt.show()"""

Hmm... so to help our analysis, let's only address conventional homes. It is unfortunate to lose about 800 rows, but you've got to do what you've got to do.
Here, we remove anything that looks like an apartment (living area larger than the lot) and keep what is more likely to be a conventional home: a house on a lot with a lawn, and so on.

In [None]:
# Remove listings whose living area exceeds the lot area (PctofLot > 100) in
# one vectorised, re-runnable step. The original loop re-copied the frame for
# each of the removed rows and raised KeyError on a second run.
house = house.loc[~(house.PctofLot > 100)].copy()

In [None]:
# Age of the building in years, measured against 2015 (the same reference
# year used for `since_reno` above).
house['age'] = 2015 - house.yr_built

Okay, so now we've built some good features. Let's also convert lat/long into radians, which is what the haversine distance metric expects, and feed them into
a K-nearest-neighbors regressor that turns location into a single price-based feature.

In [None]:
# Convert the coordinates from degrees to radians in one vectorised call per
# column (instead of a Python-level apply); the haversine metric used by the
# KNN model below works on [latitude, longitude] pairs given in radians.
house['latRads'] = np.radians(house['lat'])
house['longRads'] = np.radians(house['long'])

# Feature matrix (lat, long in radians) and target (sale price) for the KNN.
knnX = house[['latRads', 'longRads']].values
knny = house.price.values

In [None]:
# Fixed seed so the split (and everything fitted on it) is reproducible after
# Restart Kernel -> Run All; 1 matches the seed used for the random forest.
X_train, X_test, y_train, y_test = train_test_split(knnX, knny, test_size=0.25, random_state=1)

In [None]:
# Distance-weighted K-nearest-neighbours regressor on (lat, long) in radians,
# using the haversine metric; fitted on the training split only.
nbrs = KNeighborsRegressor(metric='haversine', weights='distance').fit(X_train, y_train)

# Predictions for the held-out coordinates.
y_pred = nbrs.predict(X_test)

In [None]:
# Location feature: a KNN price estimate built from neighbouring sales.
#
# Calling nbrs.predict(knnX) would leak the target: `nbrs` was fitted on 75%
# of these very rows, and with weights='distance' a query point that coincides
# with a training point gets that point's own sale price back. The feature
# would then equal `price` for most rows and every downstream model would look
# far better than it is.
#
# Out-of-fold predictions avoid this: each row is estimated by a model that
# never saw that row. (The feature is still derived from other rows' prices,
# so downstream validation scores remain somewhat optimistic.)
geo_knn = KNeighborsRegressor(weights='distance', metric='haversine')
geo_folds = KFold(n_splits=5, shuffle=True, random_state=1)
house['latLongUseable'] = cross_val_predict(geo_knn, knnX, knny, cv=geo_folds)

In [None]:
#A check to see about waterfront's affect on pricing
"""for feat in house.columns:
    plt.figure(figsize = (15,6))
    sns.scatterplot(x=house[feat], y=house.price, hue=house.waterfront) 
    plt.ylabel('Price in Millions')
    plt.xlabel(feat.title())
    plt.show()"""

In [None]:
categorical_feats += ['renovated', 'since_reno', 'has_basement', 'age']

In [None]:
# Correlate numeric columns only. `house` also holds the string columns
# `date` and `newdate`; older pandas silently skipped them, but from
# pandas 2.0 DataFrame.corr() raises on non-numeric data.
houseMatrix = house.select_dtypes(include='number').corr()
CorrMtx(houseMatrix, True)

* bedrooms: bathrooms, sqft_living
* bathrooms: sqft_living, sqft_above, grade
* sqft_living: sqft_living15, sqft_above, grade, bathrooms, bedrooms
* sqft_lot: sqft_lot15
* floors: OK
* waterfront: OK (slight view)
* view: OK
* condition: OK (yr_built)
* grade: sqft_living15, sqft_above, bathrooms, sqft_living
* sqft_above: sqft_living15, grade
* sqft_basement:OK
* yr_built: OK
* yr_renovated: OK
* zip: OK
* lat: OK
* long: OK
* sqft_living15: OK
* sqft_lot15: sqft_lot

So the problematic variables are bathrooms, sqft_above, bedrooms, sqft_living15, sqft_lot15, and, to a lesser extent, grade.

In [None]:
house.columns

In [None]:
nonproblematic = ['price', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'yr_built', 'yr_renovated', 'zipcode',
                 'lat', 'long', 'newdate']

In [None]:
nonProbHouse = house[nonproblematic]
# `nonproblematic` includes the string column `newdate`; restrict to numeric
# columns so DataFrame.corr() also works on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
houseMatrix = nonProbHouse.select_dtypes(include='number').corr()
CorrMtx(houseMatrix, True)

In [None]:
isproblem = nonproblematic + ['renovated', 'since_reno', 'has_basement', 'basement_lot_pct', 'aboveground_lot_pct', 
                              'PctofLot', 'age', 'latRads', 'longRads', 'latLongUseable']

Next we check whether the engineered features are a problem: we add them to the list ("isproblem") and re-check collinearity.

In [None]:
isProbHouse = house[isproblem]
# `isproblem` inherits the string column `newdate` from `nonproblematic`;
# keep numeric columns only so DataFrame.corr() does not raise on pandas >= 2.0.
houseMatrix = isProbHouse.select_dtypes(include='number').corr()
CorrMtx(houseMatrix, True)

Remove: yr_built, yr_renovated, zipcode, lat, long, latRads, longRads

In [None]:
#Another check, removing more variables
isproblem2 = ['price', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'renovated', 'since_reno', 
             'has_basement', 'PctofLot', 'latLongUseable']

In [None]:
# Re-bind `isProbHouse` to the reduced column set; later cells (the sorted
# correlation bar chart) read this frame.
isProbHouse = house[isproblem2]
houseMatrix = isProbHouse.corr()
# Lower-triangle heatmap of the reduced feature set.
CorrMtx(houseMatrix, True)

Okay, looking much better...
There are still some issues with collinearity but we have made some (hopefully) intelligent decisions about what stays in the model. All remaining features ideally should tell us something about houses and, in theory, should be *relatively* independent of each other. Grade / sqft living is troubling, but we want to believe a higher sqft living shouldn't mean a higher grade...

`since_reno` is the number of years since the last renovation, defaulting to the total age of the house when it was never renovated. This assumes that a renovated house ought to be "like new" -- and `age` was an imperfect measure that, when included, led to more collinearity concerns. Hopefully `since_reno` ultimately gives us more information.

In [None]:
# Correlation of every remaining feature with price, strongest first.
# Row 0 of the matrix is the `price` row because `price` is the first column
# of `isProbHouse`; the self-correlation entry is dropped.
price_corr = isProbHouse.corr().iloc[0].drop('price').sort_values(ascending=False)

# Two-column frame: `labels` (feature name) and `price` (its correlation).
barchart = price_corr.reset_index().rename({'index':'labels'}, axis='columns')
barchart.labels

In [None]:
xticks = barchart.index
xlabels = barchart.labels

In [None]:
# Bar chart of each feature's correlation with price, annotated with values.
# (The former plt.clf() ahead of plt.figure() only produced a stray empty
# figure, so it has been dropped.)
plt.figure(figsize=(15,10))
xs = barchart.index
ys = barchart.price
# x/y must be passed by keyword: positional data arguments were deprecated in
# seaborn 0.11 and are rejected from 0.12 on.
sns.barplot(x=xs, y=ys)

for x, y in zip(xs, ys):
    plt.annotate("{:.2f}".format(y),          # text: correlation to 2 decimals
                 (x, y),                      # point being labelled
                 textcoords="offset points",  # xytext is an offset in points
                 xytext=(0,10),               # 10 points above the bar top
                 ha='center')                 # centred horizontally on the bar

plt.xticks(xticks, xlabels, rotation=90)
plt.title('Correlation Coefficient Weights, Sorted')
# The bars show correlation coefficients, not prices.
plt.ylabel('Correlation with Price')
plt.xlabel('Feature')
plt.show()

Let's make our final features and then create some models and plot their residuals

In [None]:
features = isproblem2[1:]

In [None]:
# Feature matrix shared by the models below.
X = house[features]
# statsmodels' OLS does not add an intercept on its own; add_constant
# prepends the constant column for it.
linX = sm.add_constant(X)
# Target: sale price.
y = house.price

In [None]:
# Ordinary least squares fitted on the full dataset (no hold-out split), so
# the R-squared and MAE reported here are in-sample figures.
model = sm.OLS(y,linX).fit()
linPred = model.predict(linX)
linR2 = round(model.rsquared, 2)
print(model.summary())
# scikit-learn metrics take (y_true, y_pred) in that order.
print('MAE: ', mean_absolute_error(y, linPred))

In [None]:
# 80/20 hold-out split with a fixed seed so that scores are reproducible
# after Restart Kernel -> Run All.
train_x,val_x,train_y,val_y=train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=1)

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
RFRmodel = RandomForestRegressor(random_state=1)
RFRmodel.fit(train_x, train_y)
RFRpred = RFRmodel.predict(val_x)
# R-squared on the validation split; computed once and reused below.
rfrScore = RFRmodel.score(val_x, val_y)
# scikit-learn metrics take (y_true, y_pred) in that order.
print("Mean absolute error:", mean_absolute_error(val_y, RFRpred))
print("Model score", rfrScore)

In [None]:
feature_importances = pd.DataFrame(RFRmodel.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

In [None]:
resid = pd.DataFrame()
resid['true'] = house.price

In [None]:
# Predict once and reuse the result for both columns.
lin_fitted = model.predict(linX)
resid['lin_resid'] = house.price - lin_fitted
resid['lin_pred'] = lin_fitted

In [None]:
# Run the forest over the full feature matrix once and reuse the result
# (prediction is the expensive step). Note that X contains the rows the
# forest was trained on, so these residuals are largely in-sample.
rfr_fitted = RFRmodel.predict(X)
resid['rfr_resid'] = house.price - rfr_fitted
resid['rfr_pred'] = rfr_fitted

In [None]:
plt.figure(figsize = (15,10))
plt.text(4,4000000,'R-squared = '+str(linR2), size=20) #HOW TO GET A R2 at a Point
ax1 = plt.subplot()
ax1.axhline(0, ls='solid', color='red')
sns.scatterplot(x=resid.index, y=resid.lin_resid, ax=ax1)
plt.show()

In [None]:
df = resid.lin_resid
plt.figure(figsize = (15,10))
plt.text(2e6,3e-6,'R-squared = '+str(linR2), size=20) #HOW TO GET A R2 at a Point
sns.kdeplot(data=df)
plt.show()

In [None]:
plt.figure(figsize = (15,10))
plt.text(4,2e6,'R-squared = '+str(round(rfrScore,2)), size=20) #HOW TO GET A R2 at a Point
ax2 = plt.subplot()
ax2.axhline(0, ls='solid', color='red')
sns.scatterplot(x=resid.index, y=resid.rfr_resid, ax=ax2)
plt.title('RandomForest Residuals')
plt.show()

In [None]:
df = resid.rfr_resid
plt.figure(figsize = (15,10))
plt.text(1e6,1e-5,'R-squared = '+str(round(rfrScore,2)), size=20) #HOW TO GET A R2 at a Point
sns.kdeplot(data=df)
plt.title('RandomForest Residual Distribution')
plt.show()

In [None]:
# Absolute percentage error of each model relative to the true price,
# rounded to two decimals before the sign is discarded.
for resid_col, pct_col in (('lin_resid', 'pctFromLinTrue'), ('rfr_resid', 'pctFromRFRTrue')):
    resid[pct_col] = (resid[resid_col] / resid.true * 100).round(2).abs()

In [None]:
print(max(resid.pctFromLinTrue))
print(max(resid.pctFromRFRTrue))

In [None]:
# Cumulative share of homes whose absolute percentage error is at most
# 0, 1, 2, ... percent. The mean of a boolean Series is the fraction of True
# values, i.e. sum / len(resid), but it is evaluated in vectorised code rather
# than by Python's built-in sum() iterating over every row.
MAX_PCT = 270  # upper bound of the thresholds, kept from the original analysis
linDist = [(resid.pctFromLinTrue <= test).mean() for test in range(0, MAX_PCT)]
rfrDist = [(resid.pctFromRFRTrue <= test).mean() for test in range(0, MAX_PCT)]

In [None]:
linDist = pd.DataFrame(linDist)
rfrDist = pd.DataFrame(rfrDist)

In [None]:
plt.figure(figsize=(15,10))
sns.scatterplot(x=linDist.index, y=linDist[0])
plt.title('Linear Regression Model')
plt.ylabel('Pct of Homes Estimated within each Pct Point')
plt.xlabel('Pct Points Away From True Price')
plt.show()

In [None]:
plt.figure(figsize=(15,10))
sns.scatterplot(x=rfrDist.index, y=rfrDist[0])
plt.title('RandomForest Model')
plt.ylabel('Pct of Homes Estimated within each Pct Point')
plt.xlabel('Pct Points Away From True Price')
plt.show()

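Ridge regression penalizes the size of the coefficients, so the features should be on a common scale before fitting. We therefore min-max scale the data first.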

In [None]:
standardized = features + ['price']

In [None]:
# Min-max scale the model features and the price on a deep copy, leaving
# `house` itself in original units.
# NOTE(review): the scaler is fitted on the whole frame before any
# train/validation split, and the target (price) is scaled as well, so
# anything computed from `houseCopy.price` is in scaled units, not dollars.
min_max_scaler = MinMaxScaler()
houseCopy = house.copy(deep=True)
houseCopy[standardized] = min_max_scaler.fit_transform(houseCopy[standardized])
houseCopy.head()

In [None]:
lNX = houseCopy[features]
lNy = houseCopy.price

In [None]:
# Ridge regression on the min-max-scaled data. The split is seeded so the
# reported scores are reproducible across kernel restarts.
train_x,val_x,train_y,val_y=train_test_split(lNX,lNy,train_size=0.8,test_size=0.2,random_state=1)
ridgeModel = Ridge(alpha=1.0)
ridgeModel.fit(train_x, train_y)
ridgePred = ridgeModel.predict(val_x)
ridgeRsquared = ridgeModel.score(val_x,val_y)
# scikit-learn metrics take (y_true, y_pred); this MAE is in scaled price units.
print("Mean absolute error:",mean_absolute_error(val_y,ridgePred))
print("Model score",ridgeRsquared)

In [None]:
# Express the ridge results in dollars so they are comparable with the other
# models. The ridge model was trained on a min-max-scaled price; dividing a
# scaled residual by a scaled price (which is ~0 for the cheapest homes)
# produced percentage errors in the thousands. Undo the scaling with the
# fitted scaler's own statistics: x = x_scaled * data_range_ + data_min_.
price_pos = standardized.index('price')
price_min = min_max_scaler.data_min_[price_pos]
price_range = min_max_scaler.data_range_[price_pos]

# Column names are kept for the cells below; `std_true` now holds the true
# price in dollars.
resid['std_true'] = house.price
resid['ridge_pred'] = ridgeModel.predict(lNX) * price_range + price_min
resid['ridge_resid'] = resid.std_true - resid.ridge_pred

In [None]:
# Lasso on the unscaled features and dollar prices. The split is seeded so
# the reported scores are reproducible across kernel restarts.
train_x,val_x,train_y,val_y=train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=1)
lassoModel = Lasso(alpha=1.0, tol=.01)
lassoModel.fit(train_x, train_y)
lassoPred = lassoModel.predict(val_x)
lassoRsquared = lassoModel.score(val_x,val_y)
# scikit-learn metrics take (y_true, y_pred) in that order.
print("Mean absolute error:",mean_absolute_error(val_y,lassoPred))
print("Model score",lassoRsquared)

In [None]:
# Lasso predictions for every row, and their residuals against the true price.
lasso_fitted = lassoModel.predict(X)
resid['lasso_pred'] = lasso_fitted
resid['lasso_resid'] = resid.true - resid.lasso_pred

In [None]:
plt.figure(figsize = (15,10))
#plt.text(4,4000000,'R-squared = '+str(ridgeRsquared), size=20) #HOW TO GET A R2 at a Point
ax1 = plt.subplot()
ax1.axhline(0, ls='solid', color='red')
sns.scatterplot(x=resid.index, y=resid.ridge_resid, ax=ax1)
plt.show()

In [None]:
df = resid.ridge_resid
plt.figure(figsize = (15,10))
#plt.text(2e6,3e-6,'R-squared = '+str(ridgeRsquared), size=20) #HOW TO GET A R2 at a Point
sns.kdeplot(data=df)
plt.show()

In [None]:
plt.figure(figsize = (15,10))
#plt.text(4,4000000,'R-squared = '+str(ridgeRsquared), size=20) #HOW TO GET A R2 at a Point
ax1 = plt.subplot()
ax1.axhline(0, ls='solid', color='red')
sns.scatterplot(x=resid.index, y=resid.lasso_resid, ax=ax1)
plt.show()

In [None]:
df = resid.lasso_resid
plt.figure(figsize = (15,10))
#plt.text(2e6,3e-6,'R-squared = '+str(ridgeRsquared), size=20) #HOW TO GET A R2 at a Point
sns.kdeplot(data=df)
plt.show()

In [None]:
resid['pctFromLassoTrue'] = abs(round((resid.lasso_resid / resid.true) *100, 2))

In [None]:
# Absolute percentage error of the ridge model relative to `std_true`.
# Rows where `std_true` is not positive are assigned 0 instead of the ratio,
# so a zero denominator never ends up in the result.
resid['pctFromRidgeTrue'] = np.where(resid.std_true > 0, 
                                     abs(resid.ridge_resid / resid.std_true)*100,
                                     0)

In [None]:
# Cumulative share of homes within each whole-percent error threshold.
# `.mean()` of the boolean comparison equals sum / len(resid) but runs in
# vectorised code; Python's built-in sum() over a Series iterates row by row,
# which is very slow across thousands of thresholds.
lassoDist = [(resid.pctFromLassoTrue <= test).mean() for test in range(0,378)]
ridgeDist = [(resid.pctFromRidgeTrue <= test).mean() for test in range(0, 5921)]

In [None]:
lassoDist = pd.DataFrame(lassoDist)
ridgeDist = pd.DataFrame(ridgeDist)

In [None]:
plt.figure(figsize=(15,10))
sns.scatterplot(x=lassoDist.index, y=lassoDist[0])
plt.title('Lasso Regression Model')
plt.ylabel('Pct of Homes Estimated within each Pct Point')
plt.xlabel('Pct Points Away From True Price')
plt.show()

In [None]:
plt.figure(figsize=(15,10))
sns.scatterplot(x=ridgeDist.index, y=ridgeDist[0])
plt.title('Ridge Regression Model')
plt.ylabel('Pct of Homes Estimated within each Pct Point')
plt.xlabel('Pct Points Away From True Price')
plt.show()

In [None]:
# Side-by-side cumulative error curves, one top-level column key per model.
# The inputs have different lengths; concat aligns them on the integer index
# (the percentage threshold), leaving NaN where a shorter curve has no entry.
compare = pd.concat([linDist, rfrDist, ridgeDist, lassoDist], axis=1, keys=['Linear', 'RandomForest', 'Ridge', 'Lasso'])

In [None]:
compare

In [None]:
plt.figure(figsize=(15,10))
sns.lineplot(data=compare)
plt.title('Comparison of Regression Model')
plt.ylabel('Pct of Homes Estimated within each Pct Point')
plt.xlabel('Pct Points Away From True Price')
plt.xlim(-1,75)
plt.show()

How much of the data is within each percentage point using the RFR model? In the next table, the index is the percentage distance from the true price and the value is the cumulative share of predictions: 0.8% of the predictions are exact (0% error after rounding), 43.78% of the predictions are within 1% of the true price, 59.91% are within 2% of the true price... and so on.

In [None]:
rfrPctCaptured = compare.RandomForest.head(21)
rfrPctCaptured

A check on how many rows have been eliminated, to confirm that we have not discarded a significant share of the data.

In [None]:
len(house)

In [None]:
len(checkhouse)

Some final chart renders for exporting to slides...

In [None]:
plt.figure(figsize = (12,6))
sns.boxplot(x=house.has_basement, y=house.price)
plt.ylim(0,1.25e6)
plt.ylabel('Price')
plt.xlabel('Basement')
plt.show()

In [None]:
plt.figure(figsize = (12,6))

sns.boxplot(x=house.view, y=house.price)
plt.ylabel('Price')
plt.ylim(0,3e6)
plt.xlabel('View')
plt.show()

In [None]:
# Price distribution by number of floors.
plt.figure(figsize = (12,6))
sns.boxplot(x=house.floors, y=house.price)
plt.ylim(0,1.25e6)
plt.ylabel('Price')
# The x axis shows `floors`; the label was a copy-paste of the basement chart.
plt.xlabel('Floors')
plt.show()

In [None]:
plt.figure(figsize = (12,6))
sns.boxplot(x=house.renovated, y=house.price)
plt.ylim(0,1.75e6)
plt.ylabel('Price')
plt.xlabel('Renovated')
plt.show()

In [None]:
# Price against lot size, coloured by living area as a percentage of the lot.
plt.figure(figsize = (12,6))
sns.scatterplot(x=house.sqft_lot, y=house.price, hue=house.PctofLot)
plt.xlim(0, .5e6)
#plt.ylim(0,2e6)
plt.xlabel('Square Feet of Lot')
plt.ylabel('Price')
# Render explicitly, like the other chart cells, so the cell does not end by
# echoing the Text object returned by plt.ylabel().
plt.show()

In [None]:
# Price against lot size, coloured by grade.
plt.figure(figsize = (12,6))
sns.scatterplot(x=house.sqft_lot, y=house.price, hue=house.grade)
plt.xlim(0, 1e5)
plt.ylim(0,4e6)
plt.xlabel('Square Feet of Lot')
plt.ylabel('Price')
# Render explicitly, like the other chart cells, so the cell does not end by
# echoing the Text object returned by plt.ylabel().
plt.show()

In [None]:
# Price against living area, coloured by grade.
plt.figure(figsize = (12,6))
sns.scatterplot(x=house.sqft_living, y=house.price, hue=house.grade)
#plt.xlim(0, 1e5)
#plt.ylim(0,4e6)
# The x axis is `sqft_living`, not the lot size the old label claimed.
plt.xlabel('Square Feet of Living Space')
plt.ylabel('Price')
plt.show()

In [None]:
# Price against construction year, coloured by the renovated flag.
plt.figure(figsize = (12,6))
sns.scatterplot(x=house.yr_built, y=house.price, hue=house.renovated)
#plt.xlim(0, .5e6)
#plt.ylim(0,2e6)
plt.xlabel('Year Built')
plt.ylabel('Price')
# Render explicitly, like the other chart cells, so the cell does not end by
# echoing the Text object returned by plt.ylabel().
plt.show()

Calculations, also for slides...

In [None]:
reno = house[house.renovated == 1]
noReno = house[house.renovated == 0]
print(reno.price.mean())
print(noReno.price.mean())

In [None]:
reno.price.mean() - noReno.price.mean()

In [None]:
(reno.price.mean() - noReno.price.mean()) / noReno.price.mean()

In [None]:
basemnt = house[house.has_basement == 1]
noBasemnt = house[house.has_basement == 0]
print(basemnt.price.mean())
print(noBasemnt.price.mean())

In [None]:
(basemnt.price.mean() - noBasemnt.price.mean()) / noBasemnt.price.mean()

In [None]:
# Despite the names, `view4` holds the homes with a view rating below 4 and
# `view5` the homes rated exactly 4.
view4 = house[house.view < 4]
view5 = house[house.view == 4]
# Mean price of each group: below-4 first, then rated-4.
print(view4.price.mean())
print(view5.price.mean())

In [None]:
(view5.price.mean() - view4.price.mean()) / view4.price.mean()

In [None]:
wtft = house[house.waterfront == 1]
nowtft = house[house.waterfront == 0]
print(wtft.price.mean())
print(nowtft.price.mean())

In [None]:
(wtft.price.mean() - nowtft.price.mean()) / nowtft.price.mean()

In [None]:
meh = house[house.condition < 3]
good = house[house.condition >= 3]
print(meh.price.mean())
print(good.price.mean())

In [None]:
(good.price.mean() - meh.price.mean()) / meh.price.mean()

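This concludes this exercise. We have seen that many factors influence house price, but location (here in latLongUseable KNN points) is the most important determining factor of price, followed by sqft_living. This seems to make sense, as sqft_living also encapsulates bathrooms and bedrooms, with which it had high correlation. One caveat: latLongUseable is itself computed from the sale prices of nearby homes, so part of its importance reflects price information being fed back into the models rather than location alone.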