In [1]:
# IMPORTS
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from env import host, username, password

import acquire
import warnings
# NOTE(review): "ignore" with no category filter silences every warning for
# the rest of the session, including pandas deprecation notices; consider
# narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# load the Zillow dataset as a DataFrame through the project's acquire module
df = acquire.get_zillow_data()

In [3]:
# (number of rows, number of columns) of the raw frame
df.shape

(77380, 68)

In [4]:
# list every column label in the raw frame
df.columns

Index(['id', 'parcelid', 'airconditioningtypeid', 'architecturalstyletypeid',
       'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fireplacecnt',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcnt', 'yardbuildingsqft17', 'yardb

In [5]:
# preview the first five rows of the raw frame
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [6]:
# data type of each column in the raw frame
df.dtypes

id                            int64
parcelid                      int64
airconditioningtypeid       float64
architecturalstyletypeid    float64
basementsqft                float64
                             ...   
buildingclassdesc            object
heatingorsystemdesc          object
propertylandusedesc          object
storydesc                    object
typeconstructiondesc         object
Length: 68, dtype: object

In [7]:
# prints the frame's shape followed by the pandas info summary
# (index range, column labels, non-null count and dtype per column)
acquire.overview(df)

--- Shape: (77380, 68)
--- Info
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77380 entries, 0 to 77379
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77380 non-null  int64  
 1   parcelid                      77380 non-null  int64  
 2   airconditioningtypeid         24953 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77380 non-null  float64
 6   bedroomcnt                    77380 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49671 non-null  float64
 9   calculatedbathnbr             76771 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6023 non-null   float64
 12  calculatedfinishedsquarefeet

In [8]:
# per-column missing-value summary: the count of null cells in each column
# and that count as a fraction of all rows
acquire.nulls_by_columns(df)

Unnamed: 0,num_cols_missing,pct_cols_missing
id,0,0.000000
parcelid,0,0.000000
airconditioningtypeid,52427,0.677526
architecturalstyletypeid,77174,0.997338
basementsqft,77330,0.999354
...,...,...
buildingclassdesc,77365,0.999806
heatingorsystemdesc,27941,0.361088
propertylandusedesc,0,0.000000
storydesc,77330,0.999354


In [9]:
# per-row missing-value summary: groups rows by how many of their columns are
# null (also given as a fraction of all columns) and counts the rows in each group
# NOTE(review): the header labels in the output look shifted relative to the
# values — confirm the column naming inside acquire.nulls_by_rows
acquire.nulls_by_rows(df)

num_rows_missing  percent_missing
23                0.338235               2
24                0.352941              13
25                0.367647              24
26                0.382353              65
27                0.397059             316
28                0.411765             455
29                0.426471            5270
30                0.441176            3455
31                0.455882            9891
32                0.470588           12579
33                0.485294           14782
34                0.500000           13326
35                0.514706            5148
36                0.529412            5775
37                0.544118            3620
38                0.558824            1925
39                0.573529             285
40                0.588235             230
41                0.602941              29
42                0.617647              23
43                0.632353              28
44                0.647059              78
45                0.

In [10]:
# NOTE(review): imported mid-notebook; the other imports live in the first cell
import prepare

# minimum number of non-null values a column must have to be kept:
# 50% of the row count, so columns that are more than half empty get dropped
threshold = df.shape[0] * .50

# drop columns (axis=1) that have fewer than `threshold` non-null values
df = df.dropna(axis=1, thresh=threshold)

# NOTE(review): the log printed by prepare.prep_zillow in the next cell reports
# dropping 34 columns for missing more than 50% of data — confirm whether this
# manual drop is still needed or is redundant with that step
df.columns

In [11]:
# run the project's preparation step and rebind df to the result; per its
# printed log it drops columns and rows missing more than 50% of their data
# and converts yearbuilt to age
df = prepare.prep_zillow(df)

The following 34 columns were dropped because they were missing more    than 50.0% of data: 
['airconditioningtypeid', 'architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedfloor1squarefeet', 'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50', 'finishedsquarefeet6', 'fireplacecnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa', 'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'regionidneighborhood', 'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid', 'yardbuildingsqft17', 'yardbuildingsqft26', 'numberofstories', 'fireplaceflag', 'taxdelinquencyflag', 'taxdelinquencyyear', 'airconditioningdesc', 'architecturalstyledesc', 'buildingclassdesc', 'storydesc', 'typeconstructiondesc']

0 rows were dropped because they were missing more than          50.0% of data
yearbuilt converted to age. 



In [12]:
# first five prepared rows, transposed so each column reads as a row
df.head().T

Unnamed: 0,28,44,67,70,86
parcelid,12036177,12106936,11016518,11018202,12579560
bathroomcnt,2.0,2.0,2.0,1.0,1.0
bedroomcnt,3,3,4,2,2
calculatedfinishedsquarefeet,1851,1447,1625,812,1027
fips,6037,6037,6037,6037,6037
latitude,34103373.0,34166370.0,34282275.0,34276431.0,33816016.0
longitude,-118293280.0,-118151336.0,-118492692.0,-118447368.0,-118271776.0
lotsizesquarefeet,6714,7283,8427,7150,5574
propertycountylandusecode,0100,0100,0100,0100,0100
rawcensustractandblock,60371892,60374616,60371066,60371095,60375437


In [13]:
# data types of the columns that remain after preparation
df.dtypes

parcelid                          int64
bathroomcnt                     float64
bedroomcnt                        int64
calculatedfinishedsquarefeet      int64
fips                              int64
latitude                        float64
longitude                       float64
lotsizesquarefeet                 int64
propertycountylandusecode        object
rawcensustractandblock            int64
regionidcity                      int64
regionidzip                       int64
structuretaxvaluedollarcnt        int64
taxvaluedollarcnt                 int64
landtaxvaluedollarcnt             int64
taxamount                         int64
censustractandblock               int64
logerror                        float64
transactiondate                  object
age                               int64
county                           object
dtype: object

### Explore leftover nulls

In [14]:
# count the null values left in each column after preparation
df.isnull().sum()

parcelid                        0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
fips                            0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
rawcensustractandblock          0
regionidcity                    0
regionidzip                     0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
logerror                        0
transactiondate                 0
age                             0
county                          0
dtype: int64