In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
# Load the modelling dataset (file name suggests it is already cleaned).
# Later cells read its 'price', 'zipcode', 'waterfront', 'greenbelt' and
# '*_norm' columns.
df = pd.read_csv('data/data_clean.csv')

In [2]:
def model_diag(results):
    """
    Print error metrics and the regression summary for a fitted model, then
    draw three residual diagnostics side by side.

    Parameters
    ----------
    results : fitted statsmodels results object
        Must expose ``resid`` (a pandas Series) and ``summary()``.

    Output
    ------
    Prints MAE and RMSE, each rounded to a whole number and then divided by
    1000, followed by the model summary. Shows one figure with:
    residuals vs. ``df['sqft_living_norm']``, a residual histogram with KDE,
    and a normal Q-Q plot of the residuals. Returns None.
    """
    resid = results.resid
    # Use the model's own residual count instead of the notebook-global `y`,
    # which may belong to a different model (or not exist yet) when this runs.
    n_obs = len(resid)
    rmse = round(((resid ** 2).sum() / n_obs) ** 0.5, 0) / 1000
    mae = round(resid.abs().sum() / n_obs, 0) / 1000
    print(f'MAE:{mae}, RMSE: {rmse}')
    print(results.summary())

    fig, ax = plt.subplots(1, 3, figsize=(20, 5))
    # NOTE(review): the x-axis still comes from the global `df`; this assumes
    # the model was fitted on rows of that frame.
    sns.scatterplot(x=df['sqft_living_norm'], y=resid, ax=ax[0])
    sns.histplot(resid, bins=20, element="step", kde=True, ax=ax[1])
    sm.graphics.qqplot(resid, dist=stats.norm, line='45', fit=True, ax=ax[2])
    plt.show()

In [3]:
def result_df(results):
    """
    Summarise a fitted model as a one-row DataFrame.

    Parameters
    ----------
    results : fitted statsmodels results object
        Must expose ``params``, ``rsquared_adj``, ``resid``, ``pvalues``
        (pandas Series where applicable) and ``condition_number``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns:
        ``features``    - number of fitted parameters (``len(results.params)``),
        ``r2_adj``      - adjusted R-squared as a percentage, 2 decimals,
        ``MAE``         - mean absolute residual, rounded to a whole number,
        ``large_pvals`` - count of coefficients whose p-value, rounded to
                          2 decimals, is greater than 0.05,
        ``cond_num``    - the model's condition number.

    Side effects
    ------------
    Prints the names of the coefficients with large p-values; if there are
    none, prints the absolute coefficients sorted ascending instead.
    """
    resid = results.resid
    # p-values are rounded BEFORE the comparison, so anything that rounds to
    # 0.05 is not flagged as large.
    pvals = results.pvalues.apply(lambda x: round(x, 2))
    large_pvals = list(pvals[pvals > 0.05].index)

    res_dic = {}
    res_dic['features'] = len(results.params)
    res_dic['r2_adj'] = round(results.rsquared_adj * 100, 2)
    # Divide by the model's own residual count rather than the notebook-global
    # `y`, so the metric cannot silently use another model's sample size.
    res_dic['MAE'] = round(resid.abs().sum() / len(resid), 0)
    res_dic['large_pvals'] = (pvals > 0.05).sum()
    res_dic['cond_num'] = results.condition_number

    if len(large_pvals) == 0:
        print(abs(results.params).sort_values())
    else:
        print(f'Large_pvals: {large_pvals}')
    return pd.DataFrame(res_dic, index=[0])

In [None]:
# Find which zipcode has this specific mean sale price.
# NOTE(review): the literal is presumably the median of the per-zipcode mean
# prices (model #1 drops "the zipcode with the median average price") - confirm.
REFERENCE_ZIP_MEAN_PRICE = 956097.5794491526

# Select 'price' before aggregating: averaging every column is wasted work and
# raises on non-numeric columns in pandas >= 2.0.
zip_mean_price = df.groupby('zipcode')['price'].mean().sort_values()

# Tolerant comparison instead of exact float equality against a literal.
zip_mean_price[np.isclose(zip_mean_price, REFERENCE_ZIP_MEAN_PRICE, rtol=1e-9, atol=0)]

# Baseline

In [None]:
# Baseline: sqft_living has the strongest correlation with price, so start
# with a simple linear regression on that single feature.
y = df['price']
X = df['sqft_living_norm']

# add_constant supplies the intercept column for the OLS fit.
model = sm.OLS(endog=y, exog=sm.add_constant(X))
baseline_results = model.fit()

model_diag(baseline_results)

# Model Iteration

In [None]:
#1 Starting with all features selected based on EDA
keep_X = df[[
    'sqft_living_norm', 'bathrooms_norm', 'bedrooms_norm', 'view_norm',
    'sqft_basement_norm', 'floors_norm', 'greenbelt', 'waterfront', 'zipcode'
]]
y = df['price']

# Creating dummy variables for categorical features. The reference categories
# dropped are: properties not on the greenbelt, not on the waterfront, and the
# zipcode with the median average price.
# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['greenbelt', 'waterfront', 'zipcode'], dtype=float
).drop(columns=['greenbelt_0.0', 'waterfront_0.0', 'zipcode_98059.0'])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# First row of the comparison table that the following iterations extend.
res_df = result_df(results)
res_df

In [None]:
#2 Dropping large p-values.
keep_X = df[[
    'sqft_living_norm', 'bathrooms_norm', 'bedrooms_norm', 'view_norm',
    'sqft_basement_norm', 'floors_norm', 'greenbelt', 'waterfront', 'zipcode'
]]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['greenbelt', 'waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'greenbelt_0.0', 'waterfront_0.0', 'zipcode_98059.0',
    'zipcode_98019.0', 'zipcode_98146.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#3 Dropping bathrooms
keep_X = df[[
    'sqft_living_norm', 'bedrooms_norm', 'view_norm',
    'sqft_basement_norm', 'floors_norm', 'greenbelt', 'waterfront', 'zipcode'
]]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['greenbelt', 'waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'greenbelt_0.0', 'waterfront_0.0', 'zipcode_98059.0',
    'zipcode_98019.0', 'zipcode_98146.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#4 Dropping floors
keep_X = df[[
    'sqft_living_norm', 'bedrooms_norm', 'view_norm', 'sqft_basement_norm',
    'greenbelt', 'waterfront', 'zipcode'
]]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['greenbelt', 'waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'greenbelt_0.0', 'waterfront_0.0', 'zipcode_98059.0',
    'zipcode_98019.0', 'zipcode_98146.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#5 Dropping greenbelt
keep_X = df[[
    'sqft_living_norm', 'bedrooms_norm', 'view_norm', 'sqft_basement_norm',
    'waterfront', 'zipcode'
]]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'waterfront_0.0', 'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#6 Dropping bedrooms
keep_X = df[[
    'sqft_living_norm', 'view_norm', 'sqft_basement_norm',
    'waterfront', 'zipcode'
]]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'waterfront_0.0', 'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#7 Dropping sqft_basement
keep_X = df[['sqft_living_norm', 'view_norm', 'waterfront', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'waterfront_0.0', 'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#8 Dropping large p-values
keep_X = df[['sqft_living_norm', 'view_norm', 'waterfront', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'waterfront_0.0', 'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0',
    'zipcode_98045.0', 'zipcode_98106.0', 'zipcode_98108.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#9 Dropping view
keep_X = df[['sqft_living_norm', 'waterfront', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'waterfront_0.0', 'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0',
    'zipcode_98045.0', 'zipcode_98106.0', 'zipcode_98108.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#10 Dropping large p-value
keep_X = df[['sqft_living_norm', 'waterfront', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['waterfront', 'zipcode'], dtype=float
).drop(columns=[
    'waterfront_0.0', 'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0',
    'zipcode_98045.0', 'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#11a Testing what it would look like to drop zipcode
keep_X = df[['sqft_living_norm', 'waterfront']]
y = df['price']

# dtype=float keeps the dummy numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['waterfront'], dtype=float
).drop(columns=['waterfront_0.0'])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# Displayed only: this trial is not added to the res_df comparison table.
result_df(results)

In [None]:
#11b Testing what it would look like to drop waterfront
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df
# I'm choosing to drop waterfront given this model has a significantly
# higher adjusted R-squared and lower Mean Absolute Error

In [None]:
#12 Dropping large p-values
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df


In [None]:
#13 Including top 50 zipcodes
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0', 'zipcode_98056.0', 'zipcode_98178.0',
    'zipcode_98065.0', 'zipcode_98118.0', 'zipcode_98133.0', 'zipcode_98057.0',
    'zipcode_98155.0', 'zipcode_98126.0', 'zipcode_98070.0', 'zipcode_98168.0',
    'zipcode_98148.0', 'zipcode_98198.0', 'zipcode_98028.0', 'zipcode_98058.0',
    'zipcode_98188.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df



In [None]:
#14 Including top 40 zipcodes
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0', 'zipcode_98056.0', 'zipcode_98178.0',
    'zipcode_98065.0', 'zipcode_98118.0', 'zipcode_98133.0', 'zipcode_98057.0',
    'zipcode_98155.0', 'zipcode_98126.0', 'zipcode_98070.0', 'zipcode_98168.0',
    'zipcode_98148.0', 'zipcode_98198.0', 'zipcode_98028.0', 'zipcode_98058.0',
    'zipcode_98188.0', 'zipcode_98024.0', 'zipcode_98125.0', 'zipcode_98038.0',
    'zipcode_98055.0', 'zipcode_98354.0', 'zipcode_98047.0', 'zipcode_98032.0',
    'zipcode_98002.0', 'zipcode_98072.0', 'zipcode_98011.0'
])

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#15 Including top 30 zipcodes
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0', 'zipcode_98056.0', 'zipcode_98178.0',
    'zipcode_98065.0', 'zipcode_98118.0', 'zipcode_98133.0', 'zipcode_98057.0',
    'zipcode_98155.0', 'zipcode_98126.0', 'zipcode_98070.0', 'zipcode_98168.0',
    'zipcode_98148.0', 'zipcode_98198.0', 'zipcode_98028.0', 'zipcode_98058.0',
    'zipcode_98188.0', 'zipcode_98024.0', 'zipcode_98125.0', 'zipcode_98038.0',
    'zipcode_98055.0', 'zipcode_98354.0', 'zipcode_98047.0', 'zipcode_98032.0',
    'zipcode_98002.0', 'zipcode_98072.0', 'zipcode_98011.0', 'zipcode_98031.0',
    'zipcode_98003.0', 'zipcode_98022.0', 'zipcode_98030.0', 'zipcode_98042.0',
    'zipcode_98144.0', 'zipcode_98010.0', 'zipcode_98177.0', 'zipcode_98001.0',
    'zipcode_98023.0'
])
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#16 Including top 20 zipcodes
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0', 'zipcode_98056.0', 'zipcode_98178.0',
    'zipcode_98065.0', 'zipcode_98118.0', 'zipcode_98133.0', 'zipcode_98057.0',
    'zipcode_98155.0', 'zipcode_98126.0', 'zipcode_98070.0', 'zipcode_98168.0',
    'zipcode_98148.0', 'zipcode_98198.0', 'zipcode_98028.0', 'zipcode_98058.0',
    'zipcode_98188.0', 'zipcode_98024.0', 'zipcode_98125.0', 'zipcode_98038.0',
    'zipcode_98055.0', 'zipcode_98354.0', 'zipcode_98047.0', 'zipcode_98032.0',
    'zipcode_98002.0', 'zipcode_98072.0', 'zipcode_98011.0', 'zipcode_98031.0',
    'zipcode_98003.0', 'zipcode_98022.0', 'zipcode_98030.0', 'zipcode_98042.0',
    'zipcode_98144.0', 'zipcode_98010.0', 'zipcode_98177.0', 'zipcode_98001.0',
    'zipcode_98023.0', 'zipcode_98092.0', 'zipcode_98136.0', 'zipcode_98077.0',
    'zipcode_98027.0', 'zipcode_98117.0', 'zipcode_98116.0', 'zipcode_98107.0',
    'zipcode_98053.0', 'zipcode_98034.0', 'zipcode_98115.0'
])
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#17 Including top 10 zipcodes
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0', 'zipcode_98056.0', 'zipcode_98178.0',
    'zipcode_98065.0', 'zipcode_98118.0', 'zipcode_98133.0', 'zipcode_98057.0',
    'zipcode_98155.0', 'zipcode_98126.0', 'zipcode_98070.0', 'zipcode_98168.0',
    'zipcode_98148.0', 'zipcode_98198.0', 'zipcode_98028.0', 'zipcode_98058.0',
    'zipcode_98188.0', 'zipcode_98024.0', 'zipcode_98125.0', 'zipcode_98038.0',
    'zipcode_98055.0', 'zipcode_98354.0', 'zipcode_98047.0', 'zipcode_98032.0',
    'zipcode_98002.0', 'zipcode_98072.0', 'zipcode_98011.0', 'zipcode_98031.0',
    'zipcode_98003.0', 'zipcode_98022.0', 'zipcode_98030.0', 'zipcode_98042.0',
    'zipcode_98144.0', 'zipcode_98010.0', 'zipcode_98177.0', 'zipcode_98001.0',
    'zipcode_98023.0', 'zipcode_98092.0', 'zipcode_98136.0', 'zipcode_98077.0',
    'zipcode_98027.0', 'zipcode_98117.0', 'zipcode_98116.0', 'zipcode_98107.0',
    'zipcode_98053.0', 'zipcode_98034.0', 'zipcode_98115.0', 'zipcode_98103.0',
    'zipcode_98122.0', 'zipcode_98029.0', 'zipcode_98074.0', 'zipcode_98007.0',
    'zipcode_98075.0', 'zipcode_98052.0', 'zipcode_98105.0', 'zipcode_98199.0',
    'zipcode_98119.0'
])
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

In [None]:
#18 Including top 5 zipcodes
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0', 'zipcode_98056.0', 'zipcode_98178.0',
    'zipcode_98065.0', 'zipcode_98118.0', 'zipcode_98133.0', 'zipcode_98057.0',
    'zipcode_98155.0', 'zipcode_98126.0', 'zipcode_98070.0', 'zipcode_98168.0',
    'zipcode_98148.0', 'zipcode_98198.0', 'zipcode_98028.0', 'zipcode_98058.0',
    'zipcode_98188.0', 'zipcode_98024.0', 'zipcode_98125.0', 'zipcode_98038.0',
    'zipcode_98055.0', 'zipcode_98354.0', 'zipcode_98047.0', 'zipcode_98032.0',
    'zipcode_98002.0', 'zipcode_98072.0', 'zipcode_98011.0', 'zipcode_98031.0',
    'zipcode_98003.0', 'zipcode_98022.0', 'zipcode_98030.0', 'zipcode_98042.0',
    'zipcode_98144.0', 'zipcode_98010.0', 'zipcode_98177.0', 'zipcode_98001.0',
    'zipcode_98023.0', 'zipcode_98092.0', 'zipcode_98136.0', 'zipcode_98077.0',
    'zipcode_98027.0', 'zipcode_98117.0', 'zipcode_98116.0', 'zipcode_98107.0',
    'zipcode_98053.0', 'zipcode_98034.0', 'zipcode_98115.0', 'zipcode_98103.0',
    'zipcode_98122.0', 'zipcode_98029.0', 'zipcode_98074.0', 'zipcode_98007.0',
    'zipcode_98075.0', 'zipcode_98052.0', 'zipcode_98105.0', 'zipcode_98199.0',
    'zipcode_98119.0', 'zipcode_98004.0', 'zipcode_98005.0', 'zipcode_98006.0',
    'zipcode_98008.0', 'zipcode_98033.0'
])
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res_df = pd.concat([res_df, result_df(results)], ignore_index=True)
res_df

# Final Model

In [None]:
#12 Final
# Same specification as iteration #12: sqft_living plus zipcode dummies, with
# the reference zipcode and the large-p-value zipcodes dropped.
keep_X = df[['sqft_living_norm', 'zipcode']]
y = df['price']

# dtype=float keeps the dummies numeric: pandas >= 2.0 emits bool columns by
# default, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(
    keep_X, columns=['zipcode'], dtype=float
).drop(columns=[
    'zipcode_98059.0', 'zipcode_98019.0', 'zipcode_98146.0', 'zipcode_98045.0',
    'zipcode_98106.0', 'zipcode_98108.0', 'zipcode_98166.0', 'zipcode_98014.0',
    'zipcode_98051.0', 'zipcode_98288.0'
])

model = sm.OLS(y, sm.add_constant(X))
final_results = model.fit()
model_diag(final_results)

# Model Evaluation

In [None]:
# List the names of the matplotlib style sheets available in this environment.
plt.style.available

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later dropped the old aliases; prefer the new name and fall back to the
# legacy one on older installations.
muted_style = (
    "seaborn-v0_8-muted"
    if "seaborn-v0_8-muted" in plt.style.available
    else "seaborn-muted"
)
plt.style.use(muted_style)


In [None]:
# Observed vs. fitted price against sqft_living_norm for the baseline model.
fig, ax = plt.subplots(figsize=(5, 5))
# (Removed a bare `ax.scatter` expression: it referenced the method without
# calling it, so it had no effect.)
sm.graphics.plot_fit(baseline_results, "sqft_living_norm", ax=ax, marker='.')
plt.show()

In [None]:
# Raw data with the baseline regression line drawn on top.
fig, ax = plt.subplots()
df.plot.scatter(
    x="sqft_living_norm",
    y="price",
    label="Data points",
    ax=ax,
    alpha=0.1,
)
sm.graphics.abline_plot(
    model_results=baseline_results,
    label="Regression line",
    ax=ax,
    color="black",
)
ax.legend();

In [None]:
# Baseline residuals against the observed price, with a zero reference line.
# NOTE(review): the x-axis is the observed target, not the fitted values.
fig, ax = plt.subplots()

ax.axhline(y=0, color="black")
ax.scatter(df["price"], baseline_results.resid, alpha=0.2)
ax.set(xlabel="price", ylabel="residuals");

In [None]:
# Observed vs. fitted price against sqft_living_norm for the final model.
sm.graphics.plot_fit(
    results=final_results,
    exog_idx="sqft_living_norm",
    marker='.',
)
plt.show()

In [None]:
# Regression diagnostic panel for sqft_living_norm in the final model.
fig = plt.figure(figsize=(12, 10))
sm.graphics.plot_regress_exog(
    results=final_results,
    exog_idx="sqft_living_norm",
    fig=fig,
)
plt.show()

In [None]:
# Same diagnostic panel for the baseline model, for comparison.
fig = plt.figure(figsize=(12, 10))
sm.graphics.plot_regress_exog(
    results=baseline_results,
    exog_idx="sqft_living_norm",
    fig=fig,
)
plt.show()