In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from env import get_db_url

# Acquire data from SQL database

In [9]:
# Pull the 2017 "Single Family Residential" transactions and rename the raw
# columns to readable names.
#
# Filters applied in the WHERE clause:
#   * propertylandusetypeid = 261 only (no other land-use codes are included)
#   * at least one bedroom and at least one bathroom
#   * transaction date within calendar year 2017 (inclusive bounds)
#
# The upper bound is '2017-12-31'; the earlier '2017-12-32' is not a valid date
# and depends on how the server coerces invalid date strings.
#
# NOTE(review): the tables are joined on the row-level `id` column. Confirm that
# properties_2017.id and predictions_2017.id identify the same record; if the
# tables share a parcel identifier, that is probably the intended key.
# NOTE(review): `taxamount` is exposed as `tax_value` and `taxvaluedollarcnt` as
# `assessed_value`; the aliases are kept as-is because later cells rely on them.
sql = """
SELECT bedroomcnt AS bedrooms, bathroomcnt AS bathrooms, `calculatedfinishedsquarefeet` AS square_feet,  
`lotsizesquarefeet` AS lot_size, poolcnt AS pool, `regionidzip` AS zip_code, yearbuilt AS year_built, latitude, 
longitude, fips, `taxvaluedollarcnt` AS assessed_value, `taxamount` AS tax_value, transactiondate AS transaction_date
FROM properties_2017
JOIN predictions_2017 USING (id)
WHERE propertylandusetypeid = 261 AND bedroomcnt > 0 AND bathroomcnt > 0
AND transactiondate >= '2017-01-01' AND transactiondate <= '2017-12-31'
"""
# Run the query against the "zillow" database and preview the first rows.
df = pd.read_sql(sql, get_db_url("zillow"))
df.head()

Unnamed: 0,bedrooms,bathrooms,square_feet,lot_size,pool,zip_code,year_built,latitude,longitude,fips,assessed_value,tax_value,transaction_date
0,4.0,2.0,3633.0,9826.0,,97329.0,2005.0,34560018.0,-118169806.0,6037.0,296425.0,6941.39,2017-01-02
1,3.0,4.0,1620.0,,,96047.0,2011.0,33996200.0,-118438000.0,6037.0,847770.0,10244.94,2017-01-02
2,3.0,2.0,2077.0,6490.0,,96152.0,1926.0,34012977.0,-118479243.0,6037.0,646760.0,7924.68,2017-01-02
3,3.0,1.0,1244.0,6021.0,,96201.0,1950.0,33953559.0,-118083855.0,6037.0,169471.0,2532.88,2017-01-03
4,3.0,2.0,1300.0,4917.0,,96193.0,1950.0,33897134.0,-118102953.0,6037.0,233266.0,3110.99,2017-01-03


In [10]:
# Report the frame's dimensions as a (rows, columns) tuple.
df.shape

(55692, 13)

In [11]:
# Display the column labels of the acquired frame (the aliases chosen in the SQL query).
df.columns

Index(['bedrooms', 'bathrooms', 'square_feet', 'lot_size', 'pool', 'zip_code',
       'year_built', 'latitude', 'longitude', 'fips', 'assessed_value',
       'tax_value', 'transaction_date'],
      dtype='object')

In [12]:
# Tally the missing values column by column.
df.isna().sum()

bedrooms                0
bathrooms               0
square_feet             5
lot_size              300
pool                44757
zip_code               64
year_built             29
latitude                0
longitude               0
fips                    0
assessed_value          2
tax_value              72
transaction_date        0
dtype: int64

In [13]:
# For every column print, one value per line and in this order:
# column name, null count, null percentage, minimum, maximum.
# The row count is taken from the frame itself so the percentages stay correct
# if the query result changes (it was previously hard-coded as 55692).
total_rows = len(df)
for col in df.columns:
    null_count = df[col].isnull().sum()  # computed once, reused for the percentage
    print(col)
    print(null_count)
    print((null_count / total_rows) * 100)
    print(df[col].min())
    print(df[col].max())

bedrooms
0
0.0
1.0
25.0
bathrooms
0
0.0
1.0
20.0
square_feet
5
0.008977950154420744
320.0
26345.0
lot_size
300
0.5386770092652445
500.0
6971010.0
pool
44757
80.36522301228185
1.0
1.0
zip_code
64
0.11491776197658551
95982.0
399675.0
year_built
29
0.05207211089564031
1862.0
2016.0
latitude
0
0.0
33340711.0
34806946.0
longitude
0
0.0
-119448392.0
-117555933.0
fips
0
0.0
6037.0
6111.0
assessed_value
2
0.0035911800617682973
7584.0
30166843.0
tax_value
72
0.12928248222365868
20.24
342940.38
transaction_date
0
0.0
2017-01-02
2017-09-21
