# Storing vaccination center data

This notebook parses data on COVID-19 vaccination centers and saves it to the database. The original source data was retrieved from the [CDC website](https://data.cdc.gov/Vaccinations/Vaccines-gov-COVID-19-vaccinating-provider-locatio/5jp2-pgaw).

In [1]:
import pandas as pd
import psycopg2 as pg
import geopandas as gpd
import h3pandas
import h3
from psycopg2.extras import execute_values
import itertools as itt
import configparser

# Relative path to the CSV export of the CDC provider-locations dataset.
data_path = '../../../data/vaccinating_provider_locations.csv'

# H3 resolution used when indexing each location into a hexagonal cell.
H3_LEVEL = 9

# Database connection settings are read from the [DB] section of the config file.
config_path = "../../../config/config.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, so verify that the
# file was actually loaded instead of failing later with an opaque KeyError.
if not config.read(config_path):
    raise FileNotFoundError("Could not read configuration file: {}".format(config_path))
db_params = config['DB']

## Setting up the POI table

In [2]:

# DDL for the points-of-interest table and its two hash indexes. Every statement
# is guarded with IF NOT EXISTS, so re-running this cell is harmless.
# NOTE(review): Lat/Long are declared as `real` (single precision) - confirm this
# is precise enough for the coordinates being stored.
table_sql = """
    CREATE TABLE IF NOT EXISTS public.pois
    (
        POIID serial NOT NULL,
        Name varchar(150) NOT NULL,
        H3ID char(15) NOT NULL,
        Category varchar(50) NOT NULL,
        Lat real NOT NULL,
        Long real NOT NULL,
        PRIMARY KEY (POIID)
    );

    CREATE INDEX IF NOT EXISTS poi_h3_index ON public.pois USING HASH (H3ID);

    CREATE INDEX IF NOT EXISTS poi_category_index ON public.pois USING HASH (Category);
"""

# In psycopg2, `with connection:` only commits (or rolls back) the transaction;
# it does not close the connection. Close it explicitly to avoid leaking it.
conn = pg.connect(**db_params)
try:
    with conn:
        with conn.cursor() as cur:
            cur.execute(table_sql)
finally:
    conn.close()

## Saving vaccination center information

In [23]:
# Load only the columns used by the report below and by the later DB insert.
df = pd.read_csv(data_path, usecols=['loc_name', 'latitude', 'longitude', 'in_stock', 'Category'])
print("Total vaccination centers across US: {}".format(len(df)))

missing_coords = df['latitude'].isna() | df['longitude'].isna()
print("Number of locations with missing coordinates: {} (will be skipped)".format(missing_coords.sum()))

# A bare dropna() would also discard rows whose only gap is in `in_stock` or
# `Category`, without that being reported. Neither value is stored as read
# (Category is overwritten later, in_stock is never inserted), so only the
# columns the insert actually needs are required to be present.
missing_name = df['loc_name'].isna() & ~missing_coords
if missing_name.any():
    print("Number of locations with missing name: {} (will be skipped)".format(missing_name.sum()))
df = df.dropna(subset=['loc_name', 'latitude', 'longitude'])

# Element-wise comparison is kept (rather than `~`) because the column may hold
# missing values and therefore not be of boolean dtype.
print("Number of locations marked as out of stock: {} (will be included)".format((df['in_stock'] == False).sum()))
print("Vaccination centers to be inserted into DB: {}".format(len(df)))

Total vaccination centers across US: 310490
Number of locations with missing coordinates: 42 (will be skipped)
Number of locations marked as out of stock: 178585 (will be included)
Vaccination centers to be inserted into DB: 310448


In [25]:
# Build point geometries (x = longitude, y = latitude) and attach the H3 cell id
# of each point at the configured resolution.
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
gdf = gdf.h3.geo_to_h3(resolution=H3_LEVEL, set_index=False)

# Every row loaded here is stored under the same POI category.
gdf['Category'] = 'Vaccination centre'

# The H3 column is named after the zero-padded resolution (e.g. "h3_09"), so
# derive it from H3_LEVEL instead of hard-coding it. errors="raise" makes a
# missing column fail here rather than as a KeyError in a later cell.
h3_column = "h3_{:02d}".format(H3_LEVEL)
gdf = gdf.rename(columns={h3_column: "H3ID", "loc_name": "Name"}, errors="raise")

In [26]:
# Batching helper: splits any iterable into consecutive fixed-size lists.
def grouper(n, iterable):
    """Yield successive lists of at most ``n`` items taken from ``iterable``.

    The final list may be shorter than ``n``. Nothing is yielded once the
    source is exhausted (or when a chunk comes back empty).
    """
    source = iter(iterable)
    # Two-argument iter(): keep calling the lambda until it returns the
    # sentinel, i.e. an empty list, which marks the end of the data.
    for piece in iter(lambda: list(itt.islice(source, n)), []):
        yield piece

# In psycopg2, `with connection:` only commits (or rolls back) the transaction;
# it does not close the connection. Close it explicitly to avoid leaking it.
conn = pg.connect(**db_params)
try:
    with conn:
        with conn.cursor() as cur:
            # Check whether vaccination centers already exist - if so, skip the import.
            # LIMIT 1: a single row is enough to answer the question, so there is
            # no need to transfer every matching id to the client.
            cur.execute(""" SELECT POIID FROM public.pois WHERE Category = %s LIMIT 1 """, ("Vaccination centre",))
            if cur.fetchone() is None:
                print("Inserting vaccination centers")

                # Save information into the DB in batches of `batch_size` locations.
                batch_size = 100
                # The column order here must match the column list of the INSERT below.
                tuple_iterator = gdf[['Name', 'H3ID', 'latitude', 'longitude', 'Category']].itertuples(index=False, name=None)
                for i, batch in enumerate(grouper(batch_size, tuple_iterator)):
                    # Report progress once every 1000 batches.
                    if i % 1000 == 0:
                        print("Saving item #{}".format(i * batch_size))
                    execute_values(cur, 'INSERT INTO public.pois (Name, H3ID, Lat, Long, Category) VALUES %s', batch)
            else:
                print("Vaccination centers already exist - skipping")
finally:
    conn.close()


Inserting vaccination centers
Saving item #0
Saving item #100000
Saving item #200000
Saving item #300000
