In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
df = pd.read_csv('data/data_clean.csv')
df['zipcode'] = df['zipcode'].astype('str')
def model_diag(results):
    mae = int(round(results.resid.abs().sum() / len(y),0))
    print(f'MAE:{mae}')
    fig, ax = plt.subplots(1,3,figsize=(20,5))
    sns.scatterplot(x= df['sqft_living_norm'],y= results.resid, ax=ax[0])
    sns.histplot(results.resid, bins=20, element="step", kde=True, ax=ax[1])
    sm.graphics.qqplot(results.resid, dist=stats.norm, line='45', fit=True, ax=ax[2])
    plt.show();
    print(results.summary())
def result_df(results):
    """
    This function takes a results object and creates a dataframe with a single row
    including adjusted R-squared, Mean Absolute Error, the number of p-values greater
    than 0.05 from the coefficients, and Conditional Number. 
    """
    res_dic = {}
    res_dic['features'] = len(results.params)
    res_dic['r2_adj'] = round(results.rsquared_adj * 100, 2)
    res_dic['MAE'] = round(results.resid.abs().sum() / len(y),0)
    res_dic['large_pvals'] = (results.pvalues.apply(lambda x: round(x, 2))
                              > 0.05).sum()
    res_dic['cond_num'] = results.condition_number
    pvals = results.pvalues.apply(lambda x: round(x, 2))
    large_pvals = list(pvals[pvals > 0.05].index)
    if len(large_pvals) == 0:
        print(abs(results.params).sort_values())
    else:
        print(f'Large_pvals: {list(pvals[pvals>0.05].index)}')
    return pd.DataFrame(res_dic, index=[0])
def RFE_df(df,
           features=[
               'sqft_living_norm', 'bathrooms_norm', 'bedrooms_norm',
               'view_norm', 'sqft_basement_norm', 'greenbelt',
               'waterfront', 'zipcode'
           ],
           to_drop=['zipcode_98059'],
           start=0,
           stop=10):
    """
    This function goes runs MLR models and removes the any p-values larger
    than 0.05. If there are none, it removes the feature with the smallest 
    absolute coefficient. It then adds a row and creates a dataframe with 
    relevent metrics. 
    """
    pd.set_option('display.max_rows', None)
    res_df = pd.DataFrame({})
    dropped = None
    while start < stop:
        y = df['price']
        X = pd.get_dummies(df[features]).drop(columns=to_drop)
        model = sm.OLS(y, sm.add_constant(X))
        results = model.fit()
        res_dic = {}
        num_features = len(results.params)
        res_dic['num_features'] = num_features
        res_dic['r2_adj'] = round(results.rsquared_adj * 100, 2)
        res_dic['f_pvalue'] = results.f_pvalue
        res_dic['MAE'] = round(results.resid.abs().sum() / len(y), 0)
        res_dic['large_pvals'] = (results.pvalues.apply(lambda x: round(x, 2))
                                  > 0.01).sum()
        res_dic['cond_num'] = results.condition_number
        res_dic['dropped'] = dropped
        pvals = results.pvalues.apply(lambda x: round(x, 3))
        large_pvals = list(pvals[pvals > 0.01].index)
        
        if len(large_pvals) == 0:
            feat = results.params.abs().sort_values().index[0]
            if feat == 'const':
                dropped = results.params.abs().sort_values().index[1]
            else:
                dropped = results.params.abs().sort_values().index[0] 
            to_drop.append(dropped)
        elif len(large_pvals) > 1:
            dropped = ""
            for i in large_pvals[:-1]:
                dropped += i + ', '
            dropped += large_pvals[-1]
            for i in large_pvals:
                to_drop.append(i)
        else:
            dropped = large_pvals[0]
            to_drop.append(dropped)
        start += 1
        res_df = res_df.append(pd.DataFrame(res_dic, index=[0]),
                               ignore_index=True)
    return res_df,to_drop,results

# Using my RFE Function

In [53]:
dfn,drop_list,results = RFE_df(df,
           features=[
               'sqft_living_norm', 'bathrooms_norm', 'bedrooms_norm',
               'view_norm', 'sqft_basement_norm', 'greenbelt',
               'waterfront', 'zipcode'
           ],
           to_drop=['zipcode_98059'],
           start=0,
           stop=11)
dfn

Unnamed: 0,num_features,r2_adj,f_pvalue,MAE,large_pvals,cond_num,dropped
0,82,73.5,0.0,191949.0,6,96.503169,
1,76,73.5,0.0,191908.0,0,70.084387,"zipcode_98019, zipcode_98045, zipcode_98106, z..."
2,75,73.46,0.0,192002.0,0,62.079918,bedrooms_norm
3,74,73.42,0.0,192066.0,0,57.971083,bathrooms_norm
4,73,72.93,0.0,193310.0,1,51.675064,sqft_basement_norm
5,71,72.93,0.0,193317.0,0,49.5519,"zipcode_98014, zipcode_98051"
6,70,72.02,0.0,196705.0,1,45.152621,view_norm
7,69,72.02,0.0,196702.0,0,45.1331,zipcode_98070
8,68,72.0,0.0,196785.0,0,45.116987,greenbelt
9,67,71.96,0.0,196969.0,0,45.069263,zipcode_98133


In [56]:

pd.set_option('display.max_rows', None)
res_df = pd.DataFrame({})
to_drop=['zipcode_98059']
features=['sqft_living_norm', 'bathrooms_norm', 'bedrooms_norm','view_norm', 'sqft_basement_norm', 'greenbelt','waterfront', 'zipcode']
dropped = None
num_features = 3
count = 0
while count < 9:
    y = df['price']
    X = pd.get_dummies(df[features]).drop(columns=to_drop)
    model = sm.OLS(y, sm.add_constant(X))
    results = model.fit()
    res_dic = {}
    num_features = len(results.params)
    res_dic['num_features'] = num_features
    res_dic['r2_adj'] = str(round(results.rsquared_adj*100, 2))+'%'
    res_dic['f_pvalue'] = round(results.f_pvalue,3)
    mae = str(int(results.resid.abs().sum() / len(y)))
    res_dic['MAE'] = '$' + mae[:3] + ',' + mae[3:]
    res_dic['large_pvals'] = (results.pvalues.apply(lambda x: round(x, 3))
                              > 0.01).sum()
    res_dic['cond_num'] = round(results.condition_number,2)
    res_dic['dropped'] = dropped
    pvals = results.pvalues.apply(lambda x: round(x, 3))
    large_pvals = list(pvals[pvals > 0.01].index)

    if len(large_pvals) == 0:
        feat = results.params.abs().sort_values().index[0]
        if feat == 'const':
            dropped = results.params.abs().sort_values().index[1]
        else:
            dropped = results.params.abs().sort_values().index[0] 
        to_drop.append(dropped)
    elif len(large_pvals) > 1:
        dropped = ""
        for i in large_pvals[:-1]:
            dropped += i + ', '
        dropped += large_pvals[-1]
        for i in large_pvals:
            to_drop.append(i)
    else:
        dropped = large_pvals[0]
        to_drop.append(dropped)
    count += 1
    res_df = res_df.append(pd.DataFrame(res_dic, index=[0]),
                           ignore_index=True)
res_df

Unnamed: 0,num_features,r2_adj,f_pvalue,MAE,large_pvals,cond_num,dropped
0,82,73.5%,0.0,"$191,949",6,96.5,
1,76,73.5%,0.0,"$191,907",0,70.08,"zipcode_98019, zipcode_98045, zipcode_98106, z..."
2,75,73.46%,0.0,"$192,001",0,62.08,bedrooms_norm
3,74,73.42%,0.0,"$192,065",0,57.97,bathrooms_norm
4,73,72.93%,0.0,"$193,309",2,51.68,sqft_basement_norm
5,71,72.93%,0.0,"$193,317",0,49.55,"zipcode_98014, zipcode_98051"
6,70,72.02%,0.0,"$196,704",1,45.15,view_norm
7,69,72.02%,0.0,"$196,702",0,45.13,zipcode_98070
8,68,72.0%,0.0,"$196,785",0,45.12,greenbelt


In [58]:
to_drop

['zipcode_98059',
 'zipcode_98019',
 'zipcode_98045',
 'zipcode_98106',
 'zipcode_98108',
 'zipcode_98146',
 'zipcode_98166',
 'bedrooms_norm',
 'bathrooms_norm',
 'sqft_basement_norm',
 'zipcode_98014',
 'zipcode_98051',
 'view_norm',
 'zipcode_98070',
 'greenbelt',
 'zipcode_98133']

# Model Evaluation

In [None]:
price_by_zips = df.groupby('zipcode').mean()['price'].sort_values()
med_price = price_by_zips.median()
price_by_zips[price_by_zips==med_price]

In [None]:
fig = plt.figure(figsize=(12,10))
sm.graphics.plot_regress_exog(results, "sqft_living_norm", fig=fig)
plt.show()

In [None]:
fig = plt.figure(figsize=(12,10))
sm.graphics.plot_regress_exog(final_results, "sqft_living_norm", fig=fig)
plt.show()

In [None]:
fig = plt.figure(figsize=(12,10))
sm.graphics.plot_regress_exog(final_results, "waterfront", fig=fig)
plt.show()

In [None]:
plt.style.available

In [None]:
plt.style.use("seaborn-muted")


In [None]:
fig,ax = plt.subplots(figsize = (10,10))
ax.scatter
sm.graphics.plot_fit(baseline_results, "sqft_living",ax=ax,marker='.',)
plt.show()

In [None]:
fig, ax = plt.subplots()
df.plot.scatter(x="sqft_living", y="price", label="Data points", ax=ax,alpha=0.1)
sm.graphics.abline_plot(model_results=baseline_results, label="Regression line", ax=ax, color="black")
ax.legend();

In [None]:
fig, ax = plt.subplots()

ax.scatter(df["price"], baseline_results.resid, alpha=0.2)
ax.axhline(y=0, color="black")
ax.set_xlabel("price")
ax.set_ylabel("residuals");

In [None]:
sm.graphics.plot_fit(final_results, "sqft_living_norm",marker='.')
plt.show()

In [None]:
fig = plt.figure(figsize=(12,10))
sm.graphics.plot_regress_exog(final_results, "sqft_living_norm", fig=fig)
plt.show()

In [None]:
fig = plt.figure(figsize=(12,10))
sm.graphics.plot_regress_exog(baseline_results, "sqft_living", fig=fig)
plt.show()