Helpful links:
- https://towardsdatascience.com/data-visualization-with-bokeh-in-python-part-ii-interactions-a4cf994e2512
- https://realpython.com/lessons/using-groupfilter-and-cdsview/

In [None]:
import pandas
import math

In [None]:
# Full state / territory name -> two-letter postal abbreviation.
# Covers the 50 states, the District of Columbia and several territories;
# entries are kept in alphabetical order, one initial letter per row.
state_to_abbrev = dict([
    ('Alabama', 'AL'), ('Alaska', 'AK'), ('American Samoa', 'AS'), ('Arizona', 'AZ'), ('Arkansas', 'AR'),
    ('California', 'CA'), ('Colorado', 'CO'), ('Connecticut', 'CT'),
    ('Delaware', 'DE'), ('District of Columbia', 'DC'),
    ('Federated States of Micronesia', 'FM'), ('Florida', 'FL'),
    ('Georgia', 'GA'), ('Guam', 'GU'),
    ('Hawaii', 'HI'),
    ('Idaho', 'ID'), ('Illinois', 'IL'), ('Indiana', 'IN'), ('Iowa', 'IA'),
    ('Kansas', 'KS'), ('Kentucky', 'KY'),
    ('Louisiana', 'LA'),
    ('Maine', 'ME'), ('Marshall Islands', 'MH'), ('Maryland', 'MD'), ('Massachusetts', 'MA'),
    ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'), ('Missouri', 'MO'), ('Montana', 'MT'),
    ('Nebraska', 'NE'), ('Nevada', 'NV'), ('New Hampshire', 'NH'), ('New Jersey', 'NJ'),
    ('New Mexico', 'NM'), ('New York', 'NY'), ('North Carolina', 'NC'), ('North Dakota', 'ND'),
    ('Northern Mariana Islands', 'MP'),
    ('Ohio', 'OH'), ('Oklahoma', 'OK'), ('Oregon', 'OR'),
    ('Palau', 'PW'), ('Pennsylvania', 'PA'), ('Puerto Rico', 'PR'),
    ('Rhode Island', 'RI'),
    ('South Carolina', 'SC'), ('South Dakota', 'SD'),
    ('Tennessee', 'TN'), ('Texas', 'TX'),
    ('Utah', 'UT'),
    ('Vermont', 'VT'), ('Virgin Islands', 'VI'), ('Virginia', 'VA'),
    ('Washington', 'WA'), ('West Virginia', 'WV'), ('Wisconsin', 'WI'), ('Wyoming', 'WY'),
])

In [None]:
# County population data from the US Census:
#     https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html#par_textimage_70769902
#     https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv

# NOTE(review): the file is decoded as IBM850; confirm that non-ASCII county
# names come out correctly with this encoding.
all_pop_data = pandas.read_csv('./co-est2019-alldata.csv', encoding='IBM850')

# Keep only the SUMLEV == 50 rows (presumably the county summary level -- confirm
# against the census file layout) and the columns used further down.
# A single .loc selection plus an explicit .copy() gives an independent frame, so
# adding the 'fips' column below does not write into a slice of all_pop_data
# (which would raise SettingWithCopyWarning).
county_pop_data = all_pop_data.loc[
    all_pop_data.SUMLEV == 50,
    ['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2019'],
].copy()

# Combined code: state code in the thousands, county code in the low three digits.
county_pop_data['fips'] = county_pop_data.STATE * 1000 + county_pop_data.COUNTY

# Spot check a single county by its combined fips code.
county_pop_data[county_pop_data.fips == 6037]

In [None]:
# Reduce the census table to a single 'population' column indexed by fips.
# NOTE(review): this rebinds county_pop_data, so the cell cannot be re-run on its
# own -- after set_index('fips') there is no 'fips' column left to select.
county_pop_data = (
    county_pop_data[['fips', 'POPESTIMATE2019']]
    .rename(columns={'POPESTIMATE2019': 'population'})
    .set_index('fips')
)

# Spot check the same county as before, now looked up through the index.
county_pop_data[county_pop_data.index == 6037]

In [None]:
# County-level COVID-19 time series published by the New York Times on GitHub.
nytimes_counties_url = (
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
)

# Some rows carry no fips code, e.g. New York City and the per-state "Unknown" groupings.
counties_raw_data = pandas.read_csv(
    nytimes_counties_url,
    parse_dates=['date'],
)

In [None]:
# New York City is a special case: the city is split across five counties (one per
# borough), but the New York Times reports a single "New York City" entry without a
# fips code. Give the city a made-up fips and a population summed over its counties.

new_york_burroughs = [
    'New York County',
    'Kings County',
    'Bronx County',
    'Richmond County',
    'Queens County',
]

# Made-up fips code for the city; -1 cannot collide with a STATE * 1000 + COUNTY code.
NYCITY_FIPS = -1

# Add up the 2019 estimate of each borough county; each name must match exactly one row.
in_new_york_state = all_pop_data.STNAME == 'New York'
nycity_pop = 0
for burrough in new_york_burroughs:
    burrough_data = all_pop_data[in_new_york_state & (all_pop_data.CTYNAME == burrough)]
    assert len(burrough_data) == 1
    nycity_pop += burrough_data.POPESTIMATE2019.iat[0]

# Register the city in the population table under its made-up fips.
county_pop_data.loc[NYCITY_FIPS] = nycity_pop

# Tag the New York Times rows for the city with the same fips so they survive the
# later "fips is not missing" filter.
is_nycity = counties_raw_data.county == 'New York City'
counties_raw_data.loc[is_nycity, 'fips'] = NYCITY_FIPS
# Not the last expression of the cell, so by default the notebook does not display it.
counties_raw_data[is_nycity]

# Show the population row that was just added.
county_pop_data[county_pop_data.index == NYCITY_FIPS]

In [None]:
# Keep only rows that have a fips code and store the code as an integer so it can be
# matched against the integer index of county_pop_data.
counties_with_fips = counties_raw_data[counties_raw_data.fips.notna()].astype({'fips': int})

# Attach the population of each county. No `how` is given, so pandas performs its
# default inner join: rows whose fips has no population entry are dropped here.
counties_data = pandas.merge(
    counties_with_fips,
    county_pop_data,
    left_on='fips',
    right_on=county_pop_data.index,
)

# Normalize the case and death counts to "per one million residents".
millions_of_people = counties_data.population / 1e6
counties_data['cases_per_million'] = counties_data.cases / millions_of_people
counties_data['deaths_per_million'] = counties_data.deaths / millions_of_people

In [None]:
# Confirm all counties in nytimes data have population data.
# NOTE(review): counties_data is already the result of merging with county_pop_data
# (pandas.merge defaults to an inner join), so every fips left in it matched a
# population row and this check holds by construction. Comparing the pre-merge fips
# codes instead would make it meaningful -- verify it still passes before switching.
counties_fips = set(counties_data.fips.unique())
pop_fips = set(county_pop_data.index.unique())
assert counties_fips.issubset(pop_fips)

In [None]:
# Every state name that appears in the merged nytimes data must have an entry in
# state_to_abbrev, otherwise the abbreviation lookup further down would fail.
counties_states = set(counties_data.state.unique())
abbrev_states = set(state_to_abbrev.keys())
assert not (counties_states - abbrev_states)

In [None]:
# (county, state) pairs to plot, spelled the way they appear in the nytimes data.
counties_to_graph = [
    ('Los Angeles', 'California'),
    ('Orange', 'California'),
    ('Middlesex', 'Massachusetts'),
    ('New York City', 'New York'),
]

# Rows of each selected county, keyed by (county name, state abbreviation).
counties_selected_data = {}

for county, state in counties_to_graph:
    state_abbrev = state_to_abbrev[state]
    in_state = counties_data.state == state
    in_county = counties_data.county == county
    county_data = counties_data[in_state & in_county]
    # A misspelled or missing county should fail loudly rather than plot nothing.
    assert len(county_data) > 0
    counties_selected_data[(county, state_abbrev)] = county_data

# Preview one of the selections.
counties_selected_data[('New York City', 'NY')]

In [None]:
def get_data_since(data, condition_func):
    """Select the rows matching a condition and count days from the first of them.

    Args:
        data: DataFrame with a datetime ``date`` column.
        condition_func: callable that takes ``data`` and returns a boolean mask
            (one entry per row) marking the rows to keep.

    Returns:
        A new DataFrame holding only the selected rows, re-indexed from 0, with an
        added ``days`` column: the number of whole days between each row's date
        and the earliest selected date. ``data`` itself is not modified.
    """
    condition = condition_func(data)
    # reset_index gives a fresh frame, so adding a column does not touch `data`.
    since_data = data[condition].reset_index(drop=True)
    day0 = since_data.date.min()
    # Vectorized timedelta accessor instead of a per-element Python lambda.
    since_data['days'] = (since_data.date - day0).dt.days
    return since_data

def deaths_per_mill_greater_1(data):
    """Return a boolean mask of rows with at least one death per million.

    Despite the name, the comparison is inclusive (``>= 1.0``).
    """
    threshold = 1.0
    per_million = data.deaths_per_million
    return per_million >= threshold

# For every selected county, keep the rows from the first day it reached one death
# per million and add the day counter used as the x axis of the plot.
counties_since_data = {
    county_state: get_data_since(selected, deaths_per_mill_greater_1)
    for county_state, selected in counties_selected_data.items()
}

# Preview one of the results.
counties_since_data[('Orange', 'CA')]

In [None]:
from collections import OrderedDict

# High-contrast color palette as (red, green, blue) tuples, in priority order.
# Thanks to Kenneth Kelly + Ohad Schneider:
# https://stackoverflow.com/a/13781114/920545
kelly_colors_dict = OrderedDict([
    ('black', (0, 0, 0)),
    ('vivid_yellow', (255, 179, 0)),
    ('strong_purple', (128, 62, 117)),
    ('vivid_orange', (255, 104, 0)),
    ('very_light_blue', (166, 189, 215)),
    ('vivid_red', (193, 0, 32)),
    ('grayish_yellow', (206, 162, 98)),
    ('medium_gray', (129, 112, 102)),

    # these aren't good for people with defective color vision:
    ('vivid_green', (0, 125, 52)),
    ('strong_purplish_pink', (246, 118, 142)),
    ('strong_blue', (0, 83, 138)),
    ('strong_yellowish_pink', (255, 122, 92)),
    ('strong_violet', (83, 55, 122)),
    ('vivid_orange_yellow', (255, 142, 0)),
    ('strong_purplish_red', (179, 40, 81)),
    ('vivid_greenish_yellow', (244, 200, 0)),
    ('strong_reddish_brown', (127, 24, 13)),
    ('vivid_yellowish_green', (147, 170, 0)),
    ('deep_yellowish_brown', (89, 51, 21)),
    ('vivid_reddish_orange', (241, 58, 19)),
    ('dark_olive_green', (35, 44, 22)),
])

# The same colors as a plain list, so they can be picked by position.
kelly_colors = [rgb for rgb in kelly_colors_dict.values()]

In [None]:
from bokeh.plotting import figure, output_file, output_notebook, show

# Display output inline in the notebook.
output_notebook()

# Also register a static HTML file as an output target.
output_file("deaths_million_since_1.html")

# Log-scaled y axis; the x axis counts days since each county reached 1 death/million.
plot = figure(
    title="Covid 19 - deaths since 1/million",
    x_axis_label='Days since 1 death/million',
    y_axis_label='Deaths/million',
    y_axis_type='log',
)

# One line per county, each with the next palette color and a "County, ST" legend entry.
for color_index, (county_state, since_data) in enumerate(counties_since_data.items()):
    plot.line(
        x='days',
        y='deaths_per_million',
        source=since_data,
        line_width=3,
        color=kelly_colors[color_index],
        legend_label=', '.join(county_state),
    )

# Show the results.
show(plot)