In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
#%run -i utils.py
import seaborn as sns
#import datetime 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error


In [None]:
df_2017 = pd.read_csv('../data/properties_2017.csv')

In [5]:
prop_types = pd.read_excel('../data/zillow_data_dictionary.xlsx', 
                           sheet_name='PropertyLandUseTypeID', 
                           names=['propertylandusetypeid', 'prop_description'])

df_2017 = df_2017.join(
    prop_types.set_index('propertylandusetypeid'),
    on='propertylandusetypeid')

In [18]:
print("Number of counties: {}".format(len(df_2017.regionidcounty.value_counts())))
print("Number of cities: {}".format(len(df_2017.regionidcity.value_counts())))
print("Number of zips: {}".format(len(df_2017.regionidzip.value_counts())))
print("Number of neighborhoods: {}".format(len(df_2017.regionidneighborhood.value_counts())))

Number of counties: 3
Number of cities: 186
Number of zips: 403
Number of neighborhoods: 529


In [21]:
features = [
#     'fips', # Federal Information Processing Standard code
#     'propertylandusetypeid', # Categorical - 16 values
    'prop_description', # Categorical - 16 values
#     'rawcensustractandblock',
#     'regionidcounty',
#     'longitude',
#     'latitude',
#     'assessmentyear', #Almost 100% from 2016
    'bedroomcnt', # Exclude count > 6
    'bathroomcnt', # Exclude count > 10
    'roomcnt', # Exclude count > 20
#     'propertycountylandusecode', - Long tail, examine in more detail
#     'regionidzip',
#     'taxamount', #Data leakage?
#     'calculatedfinishedsquarefeet', #Seems to be identical to finishedsquarefeet12
    'yearbuilt',
    'regionidcity',
#     'censustractandblock',
#     'fullbathcnt', #Consider for addback
#     'calculatedbathnbr', #Consider for addback
    'finishedsquarefeet12',
    'lotsizesquarefeet'
]

labels = [
    'taxvaluedollarcnt',
    'structuretaxvaluedollarcnt',
    'landtaxvaluedollarcnt',
]

In [22]:
# Filter on total assessed value
df = df_2017[features + labels]
df = df[(df.taxvaluedollarcnt > 100000) & (df.taxvaluedollarcnt < 1000000)]
df.describe()
df.shape

(2430767, 11)

In [23]:
#Filter on Single Family Homes 
df.prop_description.value_counts()
df = df[df.prop_description == 'Single Family Residential']
df.shape

(1741590, 11)

In [24]:
# Filter on bedroom, bathroom, and total room counts
df = df[(df.bedroomcnt <= 6) & (df.bedroomcnt >= 1)]
df = df[(df.bathroomcnt <= 10) & (df.bathroomcnt >= 1)]
df = df[(df.roomcnt <= 20)]
df.shape

(1732506, 11)

In [25]:
#Drop all rows with null values
tmp = df.dropna()
print('{:.2f}% dropped'.format((1 - len(tmp) / len(df)) * 100))
df = tmp
df.shape

2.70% dropped


(1685811, 11)

In [27]:
df = pd.get_dummies(df, columns=['regionidcity'])
df.head()

Unnamed: 0,prop_description,bedroomcnt,bathroomcnt,roomcnt,yearbuilt,finishedsquarefeet12,lotsizesquarefeet,taxvaluedollarcnt,structuretaxvaluedollarcnt,landtaxvaluedollarcnt,...,regionidcity_118880.0,regionidcity_118895.0,regionidcity_118914.0,regionidcity_118994.0,regionidcity_272578.0,regionidcity_396053.0,regionidcity_396054.0,regionidcity_396550.0,regionidcity_396551.0,regionidcity_396556.0
20,Single Family Residential,4.0,2.0,0.0,2005.0,3633.0,9826.0,296425.0,222321.0,74104.0,...,0,0,0,0,0,0,0,0,0,0
33,Single Family Residential,3.0,2.0,0.0,1926.0,2077.0,6490.0,646760.0,210192.0,436568.0,...,0,0,0,0,0,0,0,0,0,0
110,Single Family Residential,3.0,1.0,0.0,1950.0,1244.0,6021.0,169471.0,108040.0,61431.0,...,0,0,0,0,0,0,0,0,0,0
111,Single Family Residential,3.0,2.0,0.0,1950.0,1300.0,4917.0,233266.0,77415.0,155851.0,...,0,0,0,0,0,0,0,0,0,0
112,Single Family Residential,3.0,2.0,0.0,1951.0,1222.0,5500.0,290492.0,117839.0,172653.0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df.isnull().sum().sum()

0

In [41]:
features = df[[c for c in df.columns if c not in ['prop_description','taxvaluedollarcnt', 'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt']]]
features['constant'] = 1
target = df['taxvaluedollarcnt']

model = sm.OLS(target, features)
model.fit().summary()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0,1,2,3
Dep. Variable:,taxvaluedollarcnt,R-squared:,0.317
Model:,OLS,Adj. R-squared:,0.317
Method:,Least Squares,F-statistic:,4156.0
Date:,"Thu, 13 Dec 2018",Prob (F-statistic):,0.0
Time:,17:59:36,Log-Likelihood:,-22683000.0
No. Observations:,1685811,AIC:,45370000.0
Df Residuals:,1685622,BIC:,45370000.0
Df Model:,188,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
bedroomcnt,-1.402e+04,206.844,-67.772,0.000,-1.44e+04,-1.36e+04
bathroomcnt,1.336e+04,288.337,46.339,0.000,1.28e+04,1.39e+04
roomcnt,-8943.9342,93.948,-95.201,0.000,-9128.069,-8759.799
yearbuilt,978.2746,9.358,104.534,0.000,959.932,996.617
finishedsquarefeet12,108.7387,0.330,329.988,0.000,108.093,109.385
lotsizesquarefeet,0.0017,0.002,0.958,0.338,-0.002,0.005
regionidcity_3491.0,1.144e+05,1.14e+04,10.052,0.000,9.21e+04,1.37e+05
regionidcity_3980.0,1.976e+05,8.39e+04,2.355,0.019,3.31e+04,3.62e+05
regionidcity_4406.0,-2.856e+04,1946.667,-14.670,0.000,-3.24e+04,-2.47e+04

0,1,2,3
Omnibus:,85399.103,Durbin-Watson:,1.878
Prob(Omnibus):,0.0,Jarque-Bera (JB):,204058.795
Skew:,0.311,Prob(JB):,0.0
Kurtosis:,4.587,Cond. No.,1.05e+16


In [42]:
X1, X2, y1, y2 = train_test_split(features, target, random_state=0, train_size=.7)

model = LinearRegression().fit(X1, y1)
print('OLS Training set r2: {:.3f}'.format(model.score(X1, y1)))

y_score = model.predict(X2)
print('OLS Test set r2: {:.3f}'.format(r2_score(y2, y_score)))

OLS Training set r2: 0.317
OLS Test set r2: 0.315


In [44]:
rf_model = RandomForestRegressor(
    random_state=0,
    criterion="mse",
    min_samples_leaf=20,
    n_estimators=20, 
    max_depth=10,
)
                           
rf_model.fit(X1, y1)

print('RF Training set r2: {:.3f}'.format(rf_model.score(X1, y1)))
y_score = rf_model.predict(X2)
print('RF Test set r2: {:.3f}'.format(r2_score(y2, y_score)))

print("Root Mean Squared Error: {:,.0f}".format(mean_squared_error(y2, y_score)**0.5))

RF Training set r2: 0.304
RF Test set r2: 0.301
Root Mean Squared Error: 170,792
