In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env
import acquire
import wrangle
from scipy import stats
import sklearn.preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from numpy import percentile

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, f_regression

    Filter the data in the SQL query: keep only rows where propertylandusetypeid = 261 and transactiondate falls in 2017.

In [2]:
# Load the Zillow data through the acquire module imported above; `df` is kept
# as the raw frame that later cells copy from.
df = acquire.get_zillow_data()

Getting a fresh copy from SQL database...
Saving to csv...


In [3]:
# Summarise the raw frame: column dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56080 entries, 0 to 56079
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            56080 non-null  int64  
 1   bedroomcnt                    56080 non-null  float64
 2   bathroomcnt                   56080 non-null  float64
 3   calculatedfinishedsquarefeet  55849 non-null  float64
 4   poolcnt                       10951 non-null  float64
 5   fireplacecnt                  7631 non-null   float64
 6   garagecarcnt                  18247 non-null  float64
 7   yearbuilt                     55831 non-null  float64
 8   lotsizesquarefeet             55730 non-null  float64
 9   fips                          56080 non-null  float64
 10  taxvaluedollarcnt             56072 non-null  float64
 11  transactiondate               56080 non-null  object 
 12  propertylandusetypeid         56080 non-null  float64
dtypes

In [4]:
# Work on an independent (deep) copy so the raw frame `df` is left untouched,
# then tally the missing values in each column.
df_prep = df.copy()
df_prep.isna().sum()

id                                  0
bedroomcnt                          0
bathroomcnt                         0
calculatedfinishedsquarefeet      231
poolcnt                         45129
fireplacecnt                    48449
garagecarcnt                    37833
yearbuilt                         249
lotsizesquarefeet                 350
fips                                0
taxvaluedollarcnt                   8
transactiondate                     0
propertylandusetypeid               0
dtype: int64

    Replace zero values in bathroomcnt and bedroomcnt with NaN so they are treated as missing.

In [5]:
# Treat a count of 0 bedrooms/bathrooms as missing data.
# `np.nan` is used instead of `np.NaN`: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
bed_bath_0_columns = ['bedroomcnt', 'bathroomcnt']
df_prep[bed_bath_0_columns] = df_prep[bed_bath_0_columns].replace(0, np.nan)
# The replacement does not add or remove rows; the reset only guarantees a
# fresh 0..n-1 index on the frame.
df_prep = df_prep.reset_index(drop=True)

In [6]:
# Re-count missing values per column after zeros in bed/bath became NaN.
df_prep.isnull().sum()

id                                  0
bedroomcnt                        359
bathroomcnt                       366
calculatedfinishedsquarefeet      231
poolcnt                         45129
fireplacecnt                    48449
garagecarcnt                    37833
yearbuilt                         249
lotsizesquarefeet                 350
fips                                0
taxvaluedollarcnt                   8
transactiondate                     0
propertylandusetypeid               0
dtype: int64

    Replace NaN values with 0 for poolcnt, fireplacecnt, and garagecarcnt (a missing count is treated as zero).

In [7]:
# For the pool, fireplace and garage counts a missing value is filled with 0.
# `fillna(0)` replaces the original `.replace(np.NaN, 0)`: the `np.NaN` alias
# was removed in NumPy 2.0, and `fillna` needs no NaN sentinel at all.
fire_garge_pool_0_columns = ['poolcnt', 'fireplacecnt', 'garagecarcnt']
df_prep[fire_garge_pool_0_columns] = df_prep[fire_garge_pool_0_columns].fillna(0)
# Filling does not add or remove rows; the reset only guarantees a fresh
# 0..n-1 index on the frame.
df_prep = df_prep.reset_index(drop=True)


In [8]:
# Re-count missing values per column after pool/fireplace/garage NaNs were set to 0.
df_prep.isnull().sum()

id                                0
bedroomcnt                      359
bathroomcnt                     366
calculatedfinishedsquarefeet    231
poolcnt                           0
fireplacecnt                      0
garagecarcnt                      0
yearbuilt                       249
lotsizesquarefeet               350
fips                              0
taxvaluedollarcnt                 8
transactiondate                   0
propertylandusetypeid             0
dtype: int64

In [9]:
# (rows, columns) before rows with missing values are dropped.
df_prep.shape

(56080, 13)

In [10]:
# Discard every row that still has a missing value in any column, then
# renumber the surviving rows from 0.
df_prep = df_prep.dropna().reset_index(drop=True)

In [11]:
# (rows, columns) after rows with missing values were dropped.
df_prep.shape

(55359, 13)

In [12]:
# Preview up to the first 50 rows of the prepared frame.
df_prep.head(50)

Unnamed: 0,id,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,poolcnt,fireplacecnt,garagecarcnt,yearbuilt,lotsizesquarefeet,fips,taxvaluedollarcnt,transactiondate,propertylandusetypeid
0,20,4.0,2.0,3633.0,0.0,0.0,0.0,2005.0,9826.0,6037.0,296425.0,2017-01-02,261.0
1,33,3.0,2.0,2077.0,0.0,0.0,0.0,1926.0,6490.0,6037.0,646760.0,2017-01-02,261.0
2,110,3.0,1.0,1244.0,0.0,0.0,0.0,1950.0,6021.0,6037.0,169471.0,2017-01-03,261.0
3,111,3.0,2.0,1300.0,0.0,0.0,0.0,1950.0,4917.0,6037.0,233266.0,2017-01-03,261.0
4,112,3.0,2.0,1222.0,1.0,0.0,0.0,1951.0,5500.0,6037.0,290492.0,2017-01-03,261.0
5,244,3.0,2.5,1821.0,0.0,0.0,2.0,2013.0,2442.0,6059.0,537949.0,2017-01-03,261.0
6,245,4.0,3.5,2684.0,0.0,0.0,2.0,2014.0,3510.0,6059.0,644990.0,2017-01-03,261.0
7,246,3.0,2.5,2284.0,0.0,0.0,2.0,2014.0,3398.0,6059.0,934353.0,2017-01-03,261.0
8,248,3.0,3.0,2342.0,0.0,0.0,2.0,2014.0,3508.0,6059.0,1170140.0,2017-01-03,261.0
9,249,4.0,3.0,2666.0,0.0,0.0,2.0,2015.0,5145.0,6059.0,926448.0,2017-01-03,261.0
