# **ROB'S NYTIMES COVID ANALYSIS (PER COUNTY)**


- import the NYTIMES covid  data as a pandas frame
- massage and plot
- **To Execute**:
 - select Runtime->Run all
 - or Type Shift-Return on each cell to execute it



In [53]:
import numpy as np
import pandas as pd

# I'd prefer matplotlib notebook for the backend, but it doesnt work under colab
#%matplotlib inline

# reload modules without reloading explicitly
import importlib
%load_ext autoreload
%autoreload 2

import rycovid as cvd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
%pwd


'/Users/ryu/Documents/p2021/covid'

## RAW DATA

see https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92



Load the NYTIMES data from the git repo raw view

In [55]:
#
# set the URL for the "raw view" of the data
NYTIMES_COUNTY_URL="https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"


In [56]:
# get raw data
READ_DATA_FROM_WEB = False

local_nyt_fname = 'us-counties-localcopy.csv'
if READ_DATA_FROM_WEB:
    nyt_df = pd.read_csv(NYTIMES_COUNTY_URL)  # "df" = "data frame"

    # and write it again locally
    tmpfile = open(local_nyt_fname,mode='w',newline='')
    nyt_df.to_csv(tmpfile)
    tmpfile.close()
else:
    nyt_df = pd.read_csv(local_nyt_fname)
#end

## read county population data from the US Census excel spreadsheet

In [57]:
!ls
pop_df = pd.read_excel('co-est2019-annres.xlsx',header=3,nrows=3147-4)
pop_df = pop_df.rename(columns={"Unnamed: 0":"county+state"})
pop_df = pop_df.drop([0]).reset_index()  # first row is overall US population
pop_df.head()
print(len(pop_df))


README.md                          normalized_nyt.csv
[34m__pycache__[m[m                        normalized_pop.csv
analyze-nytimes-covid-county.ipynb rycovid.py
cases-vs-date.png                  state-geocodes-v2017.xlsx
co-est2019-annres.xlsx             us-counties-localcopy.csv
3142


## NORMALIZE DATA SETS

In [148]:
# 
# in census data, split "county,state" columns into separate columns
pop_df1=pop_df.copy()
pop_df1 = cvd.split_county_state(pop_df1)
assert 'county' in pop_df1
assert 'state' in pop_df1

3142
3142


In [149]:
# 
# census data: bunch of fixup (reformatting, normalization, add missing entries)
pop_df2 = pop_df1.copy()
pop_df2=cvd.strip_leading_dot(pop_df2,'county')
pop_df2 = cvd.to_lower(pop_df2,"county")
pop_df2 = cvd.to_lower(pop_df2,"state")
#pop_df2 = strip_trailing_county(pop_df2,'county')
pop_df2 = cvd.strip_spaces(pop_df2,'county')
pop_df2 = cvd.strip_spaces(pop_df2,'state')

pop_df2 = cvd.do_misc_census_fixup(pop_df2)


index=cvd.get_index_county_state(pop_df2, 'new york city', 'new york')
assert len(pop_df2[index]['county'])==1

In [166]:
#
#  NYT data: reformatting and normalization
nyt_df1 = nyt_df.copy()
nyt_df1 = cvd.to_lower(nyt_df1,'county')
nyt_df1 = cvd.to_lower(nyt_df1,'state')

In [167]:
#
# drop all county='unknown' entries
unknown_indexes = nyt_df1[nyt_df1.county=='unknown'].index    
nyt_df1 = nyt_df1.drop(index=unknown_indexes)
nyt_df1[nyt_df1.county=='unknown']

Unnamed: 0.1,Unnamed: 0,date,county,state,fips,cases,deaths


In [174]:
# convert 'date' to timestamp and daynum
# this may take a while...
nyt_df1 = cvd.fix_date(nyt_df1)


0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 420 440 460 480 500 520 540 560 580 600 


In [175]:
nyt_df1.head()

Unnamed: 0.1,Unnamed: 0,date,county,state,fips,cases,deaths,tstamp,daynum
0,0,2020-01-21,snohomish,washington,53061.0,1,0.0,2020-01-21 00:00:00,18282.0
1,1,2020-01-22,snohomish,washington,53061.0,1,0.0,2020-01-22 00:00:00,18283.0
2,2,2020-01-23,snohomish,washington,53061.0,1,0.0,2020-01-23 00:00:00,18284.0
3,3,2020-01-24,cook,illinois,17031.0,1,0.0,2020-01-24 00:00:00,18285.0
4,4,2020-01-24,snohomish,washington,53061.0,1,0.0,2020-01-24 00:00:00,18285.0


In [176]:
# misc data fixes
nyt_df1 = cvd.do_misc_nyt_fixup(nyt_df1)

assert len(nyt_df1[nyt_df1.county=='new york'])==0
#print(type(nyt_df1['tstamp'][0]))

In [177]:
#
# save off data at this point
import pickle
normalized_nyt_fname = "normalized_nyt.pickle"
normalized_pop_fname = "normalized_pop.pickle"
cvd.pickle_dataframes(nyt_df1, normalized_nyt_fname,
                     pop_df1, normalized_pop_fname)


!ls *.pickle

filtered_nyt.pickle   normalized_nyt.pickle
filtered_pop.pickle   normalized_pop.pickle


In [107]:
#
#verify that every county, state in NYT exists in census data
#
mismatch_list = cvd.validate_county_match(nyt_df1,pop_df2)
assert len(mismatch_list)==0, f"found {len(mismatch_list)} (county,state) mismatches"


AssertionError: found 85 (county,state) mismatches

## FILTER BY COUNTY

In [178]:
READ_DF_FROM_FILE=True
normalized_nyt_fname = "normalized_nyt.pickle"
normalized_pop_fname = "normalized_pop.pickle"
if READ_DF_FROM_FILE:
    nyt_df1, pop_df1 = cvd.unpickle_dataframes(normalized_nyt_fname, 
                                              normalized_pop_fname)

#end
nyt_df1.head()
#print(type(nyt_df1['tstamp'][0]))

Unnamed: 0.1,Unnamed: 0,date,county,state,fips,cases,deaths,tstamp,daynum
0,0,2020-01-21,snohomish,washington,53061.0,1,0.0,2020-01-21 00:00:00,18282.0
1,1,2020-01-22,snohomish,washington,53061.0,1,0.0,2020-01-22 00:00:00,18283.0
2,2,2020-01-23,snohomish,washington,53061.0,1,0.0,2020-01-23 00:00:00,18284.0
3,3,2020-01-24,cook,illinois,17031.0,1,0.0,2020-01-24 00:00:00,18285.0
4,4,2020-01-24,snohomish,washington,53061.0,1,0.0,2020-01-24 00:00:00,18285.0


In [271]:
# generate county-state filter 
# Uncomment the county selection criteria below
COUNTY_CRITERIA="manual_selection"
#COUNTY_CRITERIA="all"

if COUNTY_CRITERIA=='all':
    just_sc_df = nyt_df1.loc[:,['county','state']]
    just_sc_df = just_sc_df.drop_duplicates()
    cs_list = [(row['county'],row['state']) for index, row in just_sc_df.iterrows()]
elif COUNTY_CRITERIA=='manual_selection':    
    cs_list = [('santa clara', 'california'),('bernalillo','new mexico'),('alachua','florida')]
    #cs_list = [('santa clara', 'california')]
    #cs_list = [('lane', 'oregon')]
else:
    assert False, "invalid selection"
#end

print(f"selected {len(cs_list)} counties")
assert len(cs_list) > 0

selected 3 counties


In [272]:
#
# discard (county,state) rows not in our list
nyt_df2 = cvd.filter_rows_by_state_county(nyt_df1, cs_list).reset_index()
print(f"found {len(nyt_df2)} rows after filtering")

print(nyt_df2['county'].unique())

found 1721 rows after filtering
['santa clara' 'bernalillo' 'alachua']



## FILTER BY DATE

In [273]:
#
# print earliest, latest dates in dataframe
# assumes dataframe sorted by timestamp
print(f"earliest entry: {nyt_df2.loc[0].tstamp}")
print(f"latest entry: {nyt_df2.iloc[-1].tstamp}")

earliest entry: 2020-01-31 00:00:00
latest entry: 2021-09-21 00:00:00


In [274]:
#
# specify START_DATE, find closest match and filter
START_DATE='2021-05-01'

start_dt=pd.to_datetime(START_DATE)  # find index corresponding to START_DATE

#nyt_df3 = nyt_df2[nyt_df2.tstamp >= start_dt]
#nyt_df3 = nyt_df3.reset_index(drop=True)

#assert nyt_df2.loc[0,'tstamp']==START_DATE, "first entry does not correspond to desired date'

In [275]:
#
# NYT: determine STOP_DATE
#STOP_DATE = '2021-09-23'
STOP_DATE='latest'

if STOP_DATE=='latest':
    stop_dt = nyt_df3.iloc[-1]['tstamp']
    print(f"latest date in dataframe is {stop_dt}")
else:
    stop_dt = pd.to_datetime(STOP_DATE)
#end
    
assert start_dt < stop_dt
#nyt_df3 = nyt_df2[(nyt_df2.tstamp >= start_dt) and (nyt_df2.tstamp <= stop_dt)]
nyt_df3 = nyt_df2[(nyt_df2.tstamp >= start_dt) & (nyt_df2.tstamp <= stop_dt)]
nyt_df3 = nyt_df3.reset_index(drop=True)




latest date in dataframe is 2021-09-21 00:00:00


In [276]:
print(f"earliest entry: {nyt_df2.loc[0].tstamp}")
print(f"latest entry: {nyt_df2.iloc[-1].tstamp}")

earliest entry: 2020-01-31 00:00:00
latest entry: 2021-09-21 00:00:00


In [277]:
# save off data
#
# save off data at this point
import pickle

nyt_fname = "filtered_nyt.pickle"
pop_fname = "filtered_pop.pickle"

cvd.pickle_dataframes(nyt_df3, nyt_fname,
                     pop_df2, pop_fname)


!ls *.pickle

filtered_nyt.pickle   normalized_nyt.pickle
filtered_pop.pickle   normalized_pop.pickle


## MASSAGE DATA

In [278]:
READ_DF_FROM_FILE=True
nyt_fname = "filtered_nyt.pickle"
pop_fname = "filtered_pop.pickle"
if READ_DF_FROM_FILE:
    nyt_df3, pop_df3 = cvd.unpickle_dataframes(nyt_fname, 
                                               pop_fname)

#end
nyt_df3.head()

Unnamed: 0.1,index,Unnamed: 0,date,county,state,fips,cases,deaths,tstamp,daynum
0,1274524,1274524,2021-05-01,santa clara,california,6085.0,118369,2069.0,2021-05-01 00:00:00,18748.0
1,1277771,1277771,2021-05-02,santa clara,california,6085.0,118424,2069.0,2021-05-02 00:00:00,18749.0
2,1281018,1281018,2021-05-03,santa clara,california,6085.0,118479,2070.0,2021-05-03 00:00:00,18750.0
3,1284265,1284265,2021-05-04,santa clara,california,6085.0,118541,2071.0,2021-05-04 00:00:00,18751.0
4,1287511,1287511,2021-05-05,santa clara,california,6085.0,118593,2074.0,2021-05-05 00:00:00,18752.0


In [279]:
#
# compute norm_deaths_per_county_df and norm_cases_per_county_df
nyt_df4 = cvd.normalize_cases_deaths(nyt_df3, pop_df3) 

nyt_df4.tail()

created column norm_deaths
created column norm_cases


Unnamed: 0.1,index,Unnamed: 0,date,county,state,fips,cases,deaths,tstamp,daynum,norm_cases,norm_deaths
427,1725959,1725959,2021-09-17,alachua,florida,12001.0,37590,344.0,2021-09-17 00:00:00,18887.0,0.139717,0.001279
428,1729208,1729208,2021-09-18,alachua,florida,12001.0,37590,463.0,2021-09-18 00:00:00,18888.0,0.139717,0.001721
429,1732457,1732457,2021-09-19,alachua,florida,12001.0,37590,463.0,2021-09-19 00:00:00,18889.0,0.139717,0.001721
430,1735706,1735706,2021-09-20,alachua,florida,12001.0,37590,463.0,2021-09-20 00:00:00,18890.0,0.139717,0.001721
431,1738954,1738954,2021-09-21,alachua,florida,12001.0,37590,463.0,2021-09-21 00:00:00,18891.0,0.139717,0.001721


## FINALLY, PLOTS

In [280]:
import plotly.graph_objects as go
import plotly.subplots as subplots

In [281]:
nytplot_df = nyt_df4.copy()

In [283]:
def moving_average(x, w):
    return np.convolve(x, (1.0/w) * np.ones(w), 'valid')

In [284]:
def show_graph():
    x_axis=dict(
        showgrid=True,
        title='day')
    y_axis=dict(
        showgrid=True,
        title='cases')

    layout=go.Layout(
        title={
            'text':GRAPH_TITLE,
            'x':0.5  # center
        },
        xaxis=x_axis,
        #yaxis_type="log",
        yaxis=y_axis,
        width=1000,
        showlegend=True
    )
    fig = go.Figure( data=graph_list, layout=layout)
    fig.update_yaxes(gridcolor='black')
    fig.update_xaxes(gridcolor='black')
    fig.show()

In [289]:
avg_days = 7
GRAPH_TITLE=f"new deaths vs date - {avg_days} day avg"
graph_list=[]

def moving_average(x, w):
    return np.convolve(x, (1.0/w) * np.ones(w), 'valid')

for county,state in cs_list:
    indices=(nytplot_df.county==county) & (nytplot_df.state==state)
    tmp_df = nytplot_df[indices]
    ts = tmp_df['tstamp'].to_list()
    deaths = tmp_df['deaths'].to_numpy()
    #print(deaths)
    ddeaths = deaths[1:] - deaths[0:-1]
    ddeaths = np.maximum(ddeaths, np.zeros(len(ddeaths)))  # can't have less than 0 new deaths per day
    #print(ddeaths)
    avg_ddeaths = moving_average(ddeaths,avg_days)
    graph_list.append(go.Scatter(name=f"{county}, {state}",
                               x=ts[avg_days:],
                               y=avg_ddeaths,
                               mode='lines+markers'
                               ))
#end
show_graph()

In [290]:
avg_days = 7
GRAPH_TITLE=f"new cases vs date - {avg_days} day avg"
graph_list=[]

for county,state in cs_list:
    indices=(nytplot_df.county==county) & (nytplot_df.state==state)
    tmp_df = nytplot_df[indices]
    ts = tmp_df['tstamp'].to_list()
    cases = tmp_df['cases'].to_numpy()
    dcases = cases[1:] - cases[0:-1]
    dcases = np.maximum(dcases, np.zeros(len(dcases)))  # can't have less than 0 new deaths per day
    avg_dcases = moving_average(dcases,avg_days)
    graph_list.append(go.Scatter(name=f"{county}, {state}",
                               x=ts[avg_days:],
                               y=avg_dcases,
                               mode='lines+markers'
                               ))
#end
show_graph()


In [287]:
#
# plot new cases vs total cases
# this plot supposedly highlights when a population drops off the curve
GRAPH_TITLE=f"cases-per-day vs total_cases - {COUNTY_CRITERIA}"
graph_list=[]
for county,state in cs_list:
    indices=(nytplot_df.county==county) & (nytplot_df.state==state)
    tmp_df = nytplot_df[indices]
    graph_list.append(go.Scatter(name=f"{county}, {state}",
                               x=tmp_df['cases'],
                               y=cvd.avg_over_daynum(tmp_df, county,state,'dcases',7),
                               mode='lines+markers'
                               ))
x_axis=dict(
    showgrid=True,
    title='total cases')
y_axis=dict(
    showgrid=True,
    title='new cases')
layout=go.Layout(
    title={
        'text':GRAPH_TITLE,
        'x':0.5  # center
    },
    xaxis=x_axis,
    xaxis_type="log",
    yaxis=y_axis,
    yaxis_type="log",
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()

AssertionError: 

In [32]:
#
# new deaths per day
GRAPH_TITLE=f"new cases vs date - {COUNTY_CRITERIA}"
graph_list=[]
for county,state in cs_list:
    indices=(nytplot_df.county==county) & (nytplot_df.state==state)
    tmp_df = nytplot_df[indices]
    graph_list.append(go.Scatter(name=f"{county}, {state}",
                               x=tmp_df['tstamp'],
                               y=tmp_df['dcases'],
                               mode='lines+markers'
                               ))
x_axis=dict(
    showgrid=True,
    title='day')
y_axis=dict(
    showgrid=True,
    title='cases',
    range=[0,3])

layout=go.Layout(
    title={
        'text':GRAPH_TITLE,
        'x':0.5  # center
    },
    xaxis=x_axis,
    #yaxis_type="log",
    yaxis=y_axis,
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()

In [33]:
#
# growthfactor per day
GRAPH_TITLE=f"growthfactor vs date - {COUNTY_CRITERIA}"
graph_list=[]
for county,state in cs_list:
    indices=(nytplot_df.county==county) & (nytplot_df.state==state)
    tmp_df = nytplot_df[indices]
    graph_list.append(go.Scatter(name=f"{county}, {state}",
                               x=tmp_df['tstamp'],
                               y=cvd.avg_over_daynum(tmp_df, county, state, 'growthfactor',14),
                               mode='lines+markers'
                               ))
x_axis=dict(
    showgrid=True,
    title='day')
y_axis=dict(
    showgrid=True,
    range=[0,3],
    title='growthf')

layout=go.Layout(
    title={
        'text':GRAPH_TITLE,
        'x':0.5  # center
    },
    xaxis=x_axis,
    #yaxis_type="log",
    yaxis=y_axis,
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()