## Data Exploration
To interact with this notebook, run `jupyter notebook viz_prod.ipynb` from the command line.

In [6]:
import pandas as pd
import build_db
from core.data.socrata import soda_data, socrata_api_requests
from core.data import dbclient, daily_case_data_by_zip
from core.data.groundtruth import process_ground_truth_data

In [7]:
# Create the database client, then list the tables it exposes.
# `sqlite_master` is SQLite's built-in schema catalogue; filtering on
# type='table' skips indexes, views and triggers.
db = dbclient.DBClient()
list_tables_sql = "SELECT name FROM sqlite_master WHERE type='table';"
db.cursor.execute(list_tables_sql)
print(db.cursor.fetchall())

[('VACCINATIONS',), ('DAILY_COVID_CASE_DATA',), ('DAILY_FOOT_TRAFFIC_DATA',), ('TRAFFIC_CRASH_DATA',), ('DEMOGRAPHICS',)]


### Daily Case Data

In [8]:
# Read every row of the case table (name taken from build_db.CASE_TBL)
# into a DataFrame. The query is echoed so the resolved table name is
# visible in the cell output.
query = f"select * from {build_db.CASE_TBL}"
print(f"query = {query}")
covid_case_df = pd.read_sql_query(sql=query, con=db.conn)
# include='all' summarises non-numeric columns (count/unique/top/freq)
# alongside the numeric statistics.
covid_case_df.describe(include='all')

query = select * from DAILY_COVID_CASE_DATA


Unnamed: 0,index,STD_DATE,ZIPCODE,confirmed_cases,confirmed_cases_change,total_tested,total_tested_change,AVG7DAY_confirmed_cases,AVG7DAY_total_tested
count,333098.0,333098,332995.0,333098.0,333098.0,333098.0,333098.0,324332.0,324332.0
unique,,323,1446.0,6895.0,195.0,39670.0,1776.0,,
top,,2021-03-13 00:00:00,60133.0,6.0,0.0,147.0,0.0,,
freq,,1430,323.0,9014.0,149887.0,346.0,28360.0,,
mean,166548.5,,,,,,,478.640146,6376.932092
std,96157.254321,,,,,,,999.274745,18170.056928
min,0.0,,,,,,,6.0,10.428571
25%,83274.25,,,,,,,26.0,387.0
50%,166548.5,,,,,,,99.571429,1232.285714
75%,249822.75,,,,,,,447.857143,5750.607143


### Vaccination Data

In [9]:
# Read every row of the vaccination table (name taken from
# build_db.VACC_TBL) into a DataFrame. The query is echoed so the
# resolved table name is visible in the cell output.
query = f"select * from {build_db.VACC_TBL}"
print(f"query = {query}")
vacc_df = pd.read_sql_query(sql=query, con=db.conn)
# include='all' summarises non-numeric columns (count/unique/top/freq)
# alongside the numeric statistics.
vacc_df.describe(include='all')

query = select * from VACCINATIONS


Unnamed: 0,index,ZIPCODE,STD_DATE,total_doses_daily,total_doses_cumulative,vaccine_series_completed_daily,vaccine_series_completed_percent_population,population,AVG7DAY_total_doses_daily,AVG7DAY_vaccine_series_completed_daily
count,5000.0,4919.0,5000,5000.0,5000.0,5000.0,5000.0,5000.0,4565.0,4565.0
unique,,59.0,91,,,,,,,
top,,60615.0,2021-03-14 00:00:00,,,,,,,
freq,,91.0,60,,,,,,,
mean,2499.5,,,143.7686,4530.2546,51.5104,0.033289,46211.6418,146.053294,52.594148
std,1443.520003,,,162.413316,5334.012868,72.468759,0.046259,26625.453119,134.711408,60.168219
min,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1249.75,,,22.0,516.0,0.0,0.0,28569.0,38.285714,3.714286
50%,2499.5,,,89.5,2275.0,20.0,0.014,46591.0,110.714286,30.0
75%,3749.25,,,214.25,7000.75,73.0,0.049,67711.0,217.857143,80.142857


### Foot Traffic Data

In [10]:
# Read every row of the foot-traffic table (name taken from
# build_db.FOOT_TRAFF_TBL) into a DataFrame. The query is echoed so the
# resolved table name is visible in the cell output.
query = f"select * from {build_db.FOOT_TRAFF_TBL}"
print(f"query = {query}")
foot_traffic_df = pd.read_sql_query(sql=query, con=db.conn)
# include='all' summarises non-numeric columns (count/unique/top/freq)
# alongside the numeric statistics.
foot_traffic_df.describe(include='all')

query = select * from DAILY_FOOT_TRAFFIC_DATA


Unnamed: 0,index,STD_DATE,ZIPCODE,AIRPORTS_TRANSIT_CENTERS,BARS,BEAUTY_WELLNESS,FITNESS_CENTERS,GROCERY,MASS_MERCH,PARKS_BEACHES,...,RESTAURANT,RETAIL,SCHOOLS_LIBRARIES,SHOPPING_CENTERS_MALLS,TOURIST_ATTRACTIONS,AVG7DAY_BARS,AVG7DAY_GROCERY,AVG7DAY_RESTAURANT,AVG7DAY_PARKS_BEACHES,AVG7DAY_SCHOOLS_LIBRARIES
count,25724.0,25724,25724.0,17004.0,25288.0,25724.0,22236.0,17004.0,16132.0,23108.0,...,24852.0,19620.0,24852.0,22236.0,19620.0,24940.0,16770.0,24510.0,22790.0,24510.0
unique,,436,59.0,,,,,,,,...,,,,,,,,,,
top,,2020-05-24 00:00:00,60601.0,,,,,,,,...,,,,,,,,,,
freq,,59,436.0,,,,,,,,...,,,,,,,,,,
mean,12861.5,,,50.086568,55.053622,64.913855,57.422513,83.968243,61.747149,78.690454,...,93.205054,60.165698,54.164172,64.963752,57.582518,54.730089,83.862348,92.768578,78.489826,53.680993
std,7426.023498,,,56.654978,38.023444,28.041231,51.594104,33.390125,36.472314,120.264241,...,274.585147,40.78787,43.181279,28.638627,50.36151,32.572987,30.068362,260.071679,95.741359,34.879655
min,0.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.428571,0.0
25%,6430.75,,,21.0,30.0,47.0,32.0,62.0,41.0,41.0,...,47.0,31.0,28.0,46.0,23.0,32.428571,63.428571,48.714286,45.285714,30.714286
50%,12861.5,,,33.0,47.0,63.0,46.0,80.0,63.0,64.0,...,68.0,57.0,48.0,63.0,50.0,47.857143,81.428571,69.0,66.428571,48.428571
75%,19292.25,,,64.0,71.0,81.0,69.0,102.0,84.0,95.0,...,89.0,83.0,68.0,81.0,81.0,71.428571,101.714286,89.857143,91.285714,67.428571


### Traffic Crash Data

In [11]:
# Read every row of the traffic-crash table (name taken from
# build_db.CRASHES_TBL) into a DataFrame. The query is echoed so the
# resolved table name is visible in the cell output.
query = f"select * from {build_db.CRASHES_TBL}"
print(f"query = {query}")
crashes_df = pd.read_sql_query(sql=query, con=db.conn)
# include='all' summarises non-numeric columns (count/unique/top/freq)
# alongside the numeric statistics.
crashes_df.describe(include='all')

query = select * from TRAFFIC_CRASH_DATA


Unnamed: 0,index,STD_DATE,ZIPCODE,crash_count,AVG7DAY_crash_count
count,43049.0,43049,42381.0,43049.0,41979.0
unique,,796,70.0,,
top,,2019-08-15 00:00:00,60639.0,,
freq,,62,796.0,,
mean,21524.0,,,5.260401,5.317969
std,12427.320206,,,3.597711,2.777847
min,0.0,,,1.0,1.0
25%,10762.0,,,2.0,3.0
50%,21524.0,,,5.0,5.0
75%,32286.0,,,7.0,7.285714


### Demographics

In [12]:
# Read every row of the demographics table (name taken from
# build_db.CENSUS_TBL) into a DataFrame. The query is echoed so the
# resolved table name is visible in the cell output.
query = f"select * from {build_db.CENSUS_TBL}"
print(f"query = {query}")
census_df = pd.read_sql_query(sql=query, con=db.conn)
# include='all' summarises non-numeric columns (count/unique/top/freq)
# alongside the numeric statistics.
census_df.describe(include='all')

query = select * from DEMOGRAPHICS


Unnamed: 0,index,zcta,hhold_size,fam_size,unemploy_rate,median_income,pct_below_poverty_lvl,median_age,pct_65_or_older,pct_hispanic,...,pct_white,pct_pacific_islander,pct_american_indian,pct_other_race,pct_high_school_grad,pct_hholds_w_computer,pct_hholds_w_internet,pct_w_health_insur,state,ZIPCODE
count,58.0,58,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,...,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0
unique,,58,,,,,,,,,...,,,,,,,,,,58.0
top,,ZCTA5 60634,,,,,,,,,...,,,,,,,,,,60629.0
freq,,1,,,,,,,,,...,,,,,,,,,,1.0
mean,28.5,,2.416724,3.222759,5.306897,69298.913793,13.234483,35.377586,12.405172,21.343103,...,38.406897,0.012069,0.2,0.210345,20.458621,88.603448,79.363793,91.806897,17.0,
std,16.886879,,0.545085,0.527662,3.301986,35812.102415,9.850827,3.657678,4.305705,21.570831,...,27.44321,0.032861,0.861659,0.213319,11.979237,7.252391,9.459622,4.489983,0.0,
min,0.0,,1.49,2.14,0.2,22158.0,0.0,30.2,0.5,1.3,...,1.0,0.0,0.0,0.0,0.0,69.5,56.6,79.8,17.0,
25%,14.25,,1.965,2.8725,2.65,41563.5,5.0,32.3,9.9,5.725,...,7.9,0.0,0.0,0.1,8.425,83.45,73.75,88.175,17.0,
50%,28.5,,2.5,3.355,4.2,62631.5,10.35,34.5,11.85,11.5,...,44.65,0.0,0.1,0.1,21.7,90.1,79.45,91.7,17.0,
75%,42.75,,2.7875,3.6075,7.425,94877.0,19.2,37.95,15.35,38.1,...,62.075,0.0,0.1,0.3,30.15,93.85,87.4,95.675,17.0,
