# **ROB'S NYTIMES COVID ANALYSIS (PER COUNTY)**


- Import the NYTimes COVID-19 data as a pandas DataFrame
- Clean it up and plot it
- **To execute**:
 - select Runtime -> Run all
 - or press Ctrl-Return in each cell to execute it



In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as subplots
# I'd prefer the "matplotlib notebook" backend, but it doesn't work under Colab
#%matplotlib inline

# reload modules without reloading explicitly
import importlib
%load_ext autoreload
%autoreload 2


In [6]:
# Line magic that queries the kernel's working directory. It is not the last
# expression of the cell, so its value is not displayed.
# NOTE(review): presumably a reminder that rycovid.py must sit in this directory — confirm.
%pwd
# Project-local helper module; its functions are called below through the `cvd` alias.
import rycovid as cvd

# LOAD DATA

See https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92 for ways to load CSV files into Colab.



## Load the NYTIMES data from the git repo raw view

In [7]:
#
# URL of the "raw view" of the NYTimes per-county CSV on GitHub (master branch),
# which is passed to pd.read_csv in the next cell.
NYTIMES_COUNTY_URL="https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"


In [8]:
# Download the county-level CSV straight from the URL into a DataFrame.
nyt_df = pd.read_csv(NYTIMES_COUNTY_URL)
nyt_df.tail()  # show the tail so we can see the latest date in the data

Unnamed: 0,date,county,state,fips,cases,deaths
30838,2020-04-03,Sublette,Wyoming,56035.0,1,0
30839,2020-04-03,Sweetwater,Wyoming,56037.0,3,0
30840,2020-04-03,Teton,Wyoming,56039.0,32,0
30841,2020-04-03,Uinta,Wyoming,56041.0,1,0
30842,2020-04-03,Washakie,Wyoming,56043.0,2,0


 ## read county population data from the US Census excel spreadsheet

In [9]:
!ls
pop_df = pd.read_excel('co-est2019-annres.xlsx',header=3,nrows=3147-4)
pop_df = pop_df.rename(columns={"Unnamed: 0":"county+state"})
pop_df = pop_df.drop([0]).reset_index()  # first row is overall US population
pop_df.head()
print(len(pop_df))


[34m__pycache__[m[m                        rycovid.py~
analyze-nytimes-covid-county.ipynb state-geocodes-v2017.xlsx
co-est2019-annres.xlsx             ~$co-est2019-annres.xlsx
rycovid.py
3142


 ## prep the data

In [18]:
# Work on a copy so the raw census frame stays untouched, and let the helper
# split the combined label into separate 'county' and 'state' columns.
pop_df1 = cvd.split_county_state(pop_df.copy())
# Spot check, e.g.: print(pop_df1[pop_df1['state']=='Alabama'])
pop_df1.head()

3142
3142


Unnamed: 0,index,Census,Estimates Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,county,state
0,1,54571,54597,54773,55227,54954,54727,54893,54864,55243,55390,55533,55869,.Autauga County,Alabama
1,2,182265,182265,183112,186558,190145,194885,199183,202939,207601,212521,217855,223234,.Baldwin County,Alabama
2,3,27457,27455,27327,27341,27169,26937,26755,26283,25806,25157,24872,24686,.Barbour County,Alabama
3,4,22915,22915,22870,22745,22667,22521,22553,22566,22586,22550,22367,22394,.Bibb County,Alabama
4,5,57322,57322,57376,57560,57580,57619,57526,57526,57494,57787,57771,57826,.Blount County,Alabama


In [130]:
# Normalise the census county/state names so they can be matched against the NYT data.
# Same order as before: strip the leading dot, lower-case both columns,
# strip spaces from both columns, then apply the census-specific fix-ups.
pop_df2 = cvd.strip_leading_dot(pop_df1.copy(), 'county')

for _name_col in ('county', 'state'):
  pop_df2 = cvd.to_lower(pop_df2, _name_col)

for _name_col in ('county', 'state'):
  pop_df2 = cvd.strip_spaces(pop_df2, _name_col)

pop_df2 = cvd.do_misc_census_fixup(pop_df2)

# Show the resulting column labels. Other handy checks:
#   pop_df2.head()
#   pop_df2[pop_df2['state'].str.match('alabama')]
#   pop_df2['county'].unique()
pop_df2.columns

Index([         'index',         'Census', 'Estimates Base',             2010,
                   2011,             2012,             2013,             2014,
                   2015,             2016,             2017,             2018,
                   2019,         'county',          'state'],
      dtype='object')

In [127]:
# Census county names vary ("blah county", "blah county census area", ...), so
# look a county up with a substring or "smart" match instead of an exact comparison, e.g.
#   pop_df2[(pop_df2.county.str.contains('anchorage')) & (pop_df2.state=='alaska')]
#   pop_df2[(pop_df2.county.str.contains('virginia beach city')) & (pop_df2.state=='virginia')]
# Other pairs tried with the helper: snohomish/washington, anchorage/alaska, smith/texas.
ford_kansas_rows = cvd.get_index_county_state(pop_df2, 'ford', 'kansas', smartmatch=True)
pop_df2[ford_kansas_rows]





Unnamed: 0,index,Census,Estimates Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,county,state
916,917.0,33848.0,33844.0,34008.0,34372.0,34665.0,34879.0,34930.0,34629.0,34552.0,34218.0,33877.0,33619,ford county,kansas


      index   Census  Estimates Base     2010     2011     2012     2013  \
1858   1859  1585873         1586381  1588767  1608293  1623911  1627491   

         2014     2015     2016     2017     2018     2019         county  \
1858  1630678  1636063  1635443  1630698  1629055  1628706  new york city   

         state  
1858  new york  


In [13]:
#
# Fix up the NYT data: work on a copy and lower-case the county/state names so
# they compare equal to the cleaned census names.
nyt_df1 = nyt_df.copy()
nyt_df1 = cvd.to_lower(nyt_df1,'county')
nyt_df1 = cvd.to_lower(nyt_df1,'state')

#
# Drop all county='unknown' entries.
unknown_indexes = nyt_df1[nyt_df1.county=='unknown'].index
nyt_df1 = nyt_df1.drop(index=unknown_indexes)

# Verify the drop. The original check was a bare expression in the middle of the
# cell, which is discarded and never displayed; an assertion fails loudly instead.
assert not (nyt_df1.county=='unknown').any(), "county=='unknown' rows survived the drop"

nyt_df1.tail()

Unnamed: 0,date,county,state,fips,cases,deaths
30838,2020-04-03,sublette,wyoming,56035.0,1,0
30839,2020-04-03,sweetwater,wyoming,56037.0,3,0
30840,2020-04-03,teton,wyoming,56039.0,32,0
30841,2020-04-03,uinta,wyoming,56041.0,1,0
30842,2020-04-03,washakie,wyoming,56043.0,2,0


In [131]:
# Cross-check the cleaned NYT frame against the cleaned census frame and report
# how many entries the helper returns.
# NOTE(review): presumably each entry is a (county, state) pair with no census match — confirm in rycovid.
mismatch_list = cvd.validate_county_match(nyt_df1,pop_df2)
print(f"found {len(mismatch_list)} (county,state) mismatches")

found 0 (county,state) mismatches


# **MANIPULATE DATA**

In [None]:
# DATE FIXUP
# convert date to timestamp for easier manipulation
# NOTE(review): `states_df` is not assigned anywhere in the cells above (they build
# nyt_df / nyt_df1), so this cell raises NameError on a fresh kernel — confirm which
# frame was intended.
# NOTE(review): datestring_to_timestamp / datestring_to_daynum are called without the
# `cvd.` prefix and are not defined in this notebook — presumably rycovid helpers; confirm.
# The guard makes a re-run a no-op once the 'date' column has been dropped below.
if 'date' in states_df:
  states_df['tstamp'] = [datestring_to_timestamp(d) for d in states_df['date']]
  # not the last expression of the cell, so this head() is not displayed
  states_df.head()

  # add day-number column
  states_df['daynum'] = [datestring_to_daynum(d) for d in states_df['date'] ]

  states_df=states_df.drop(columns=['date'])  # 'date' is a string, not very useful
#end

In [None]:
# Keep only the rows dated on or after START_DATE, renumbered from 0.
START_DATE = pd.to_datetime('2020-03-10')
slice_df = states_df.loc[states_df.tstamp >= START_DATE].reset_index(drop=True)
slice_df.head()


In [None]:
# Choose which states to plot: flip the flag to True to take every state present
# in slice_df, otherwise the hand-picked list below is used.
INCLUDE_ALL_STATES = False

if INCLUDE_ALL_STATES:
  states = slice_df['state'].drop_duplicates()
else:
  # earlier selections: ['California','New York','New Jersey','Florida']
  states = [
      'Louisiana', 'Oklahoma', 'South Carolina',
      'Tennessee', 'Michigan', 'Washington',
  ]

len(states)

In [None]:
#
# select counties here; every cell below loops over this list
# TODO: sort by number of cases and select top 5, etc.
# NOTE(review): these names are capitalized, while the cleanup cell lower-cases the
# 'county' column of nyt_df1 — confirm which frame feeds slice_df so that the
# `county == countyname` comparisons below can match.
county_list=['Santa Clara','Alachua','Alameda']

In [None]:
# Independent copy of the date-filtered frame; the derived-column cells below start from it.
slice_df1 = slice_df.copy()


In [None]:
# Per-county day-over-day deltas: 'cases' -> 'dcases' (i.e. Nd) and 'deaths' -> 'ddeaths'.
slice_df2 = slice_df1.copy()

delta_columns = (('cases', 'dcases'), ('deaths', 'ddeaths'))
for countyname in county_list:
  print(countyname)
  for source_col, delta_col in delta_columns:
    slice_df2 = calc_delta_over_daynum(slice_df2, countyname, source_col, delta_col)

# preview one county's rows
slice_df2[slice_df2.county=='Alameda'].head()

In [None]:

# Run the growth-factor helper once per selected county, threading the frame
# through each call so the results accumulate in slice_df3.
# NOTE(review): calc_growthfactor is not defined in this notebook and is called
# without the `cvd.` prefix; it presumably adds the 'growthf' column read by the
# averaging cell — confirm.
slice_df3 = slice_df2.copy()
for countyname in county_list:
  slice_df3 = calc_growthfactor(slice_df3,countyname)


In [None]:
# Per county, derive 'avggrowthf' from 'growthf' with the averaging helper.
# NOTE(review): the 3 is presumably the averaging window in days — confirm against
# avg_over_daynum, which is not defined in this notebook.
slice_df4 = slice_df3.copy()
for countyname in county_list:
  slice_df4 = avg_over_daynum(slice_df4, countyname, "growthf", 3, "avggrowthf")


In [None]:
# graph: total cases per day, one trace per selected county (log-scaled y axis)
graph_cases_list=[]
for countyname in county_list:
  # Build the mask from the frame being indexed. The original built it from
  # slice_df, which only selects the right rows while both frames happen to
  # share exactly the same index.
  tmp_df = slice_df3[slice_df3.county==countyname]
  graph_cases_list.append(go.Scatter(name=countyname,
                               x=tmp_df['tstamp'],
                               y=tmp_df['cases'],
                               mode='lines+markers'
                               ))

x_axis=dict(
    showgrid=True,
    title='day')
y_axis=dict(
    showgrid=True,
    title='cases')

layout=go.Layout(
    title={
        'text':'total cases vs Date',
        'x':0.5  # center
    },
    xaxis=x_axis,
    yaxis_type="log",
    yaxis=y_axis,
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_cases_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()


In [None]:
#
# new cases per day, one trace per selected county (linear y axis)
graph_new_list=[]
for countyname in county_list:
  # Build the mask from the frame being indexed (the original used slice_df,
  # which relies on both frames sharing exactly the same index).
  tmp_df = slice_df3[slice_df3.county==countyname]
  graph_new_list.append(go.Scatter(name=countyname,
                               x=tmp_df['tstamp'],
                               y=tmp_df['dcases'],
                               mode='lines+markers'
                               ))

# Axis settings are loop-invariant. x_axis used to be indented into the loop
# body, so it was rebuilt on every pass and never set when county_list was empty.
x_axis=dict(
    showgrid=True,
    title='day')
y_axis=dict(
    showgrid=True,
    title='new cases')  # the y values are 'dcases', i.e. new cases, not totals
layout=go.Layout(
    title={
        'text':'New Cases vs date',
        'x':0.5  # center
    },
    xaxis=x_axis,
    #yaxis_type="log",
    yaxis=y_axis,
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_new_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()

In [None]:
#
# plot new cases vs total cases (log-log)
# this plot supposedly highlights when a population drops off the curve
graph_new_list=[]
for countyname in county_list:
  # Build the mask from the frame being indexed (the original used slice_df,
  # which relies on both frames sharing exactly the same index).
  tmp_df = slice_df3[slice_df3.county==countyname]
  graph_new_list.append(go.Scatter(name=countyname,
                               x=tmp_df['cases'],
                               y=tmp_df['dcases'],
                               mode='lines+markers'
                               ))

# Axis settings are loop-invariant. x_axis used to be indented into the loop
# body, so it was rebuilt on every pass and never set when county_list was empty.
x_axis=dict(
    showgrid=True,
    title='total cases')
y_axis=dict(
    showgrid=True,
    title='new cases')
layout=go.Layout(
    title={
        'text':'New Cases vs total cases',
        'x':0.5  # center
    },
    xaxis=x_axis,
    xaxis_type="log",
    yaxis=y_axis,
    yaxis_type="log",
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_new_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()

In [None]:
#
# new deaths per day, one trace per selected county (linear y axis)
graph_deaths_list=[]
for countyname in county_list:
  # Build the mask from the frame being indexed (the original used slice_df,
  # which relies on both frames sharing exactly the same index).
  tmp_df = slice_df3[slice_df3.county==countyname]
  graph_deaths_list.append(go.Scatter(name=countyname,
                               x=tmp_df['tstamp'],
                               y=tmp_df['ddeaths'],
                               mode='lines+markers'
                               ))
x_axis=dict(
    showgrid=True,
    title='day')
y_axis=dict(
    showgrid=True,
    title='deaths')  # was 'cases', but this figure plots 'ddeaths'

layout=go.Layout(
    title={
        'text':'daily deaths vs date',
        'x':0.5  # center
    },
    xaxis=x_axis,
    #yaxis_type="log",
    yaxis=y_axis,
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_deaths_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()

In [None]:
#
# averaged growth factor per day, one trace per selected county
graph_growth_list=[]
for countyname in county_list:
  # Build the mask from the frame being indexed (the original used slice_df
  # against slice_df4, which relies on both frames sharing exactly the same index).
  tmp_df = slice_df4[slice_df4.county==countyname]
  graph_growth_list.append(go.Scatter(name=countyname,
                               x=tmp_df['tstamp'],
                               y=tmp_df['avggrowthf'],
                               mode='lines+markers'
                               ))

# Axis settings are loop-invariant. x_axis used to be indented into the loop
# body, so it was rebuilt on every pass and never set when county_list was empty.
x_axis=dict(
    showgrid=True,
    title='day')
y_axis=dict(
    showgrid=True,
    range=[0,3],  # fixed y range for the growth-factor axis
    title='growthf')

layout=go.Layout(
    title={
        'text':'# Growth Factor d(N)/d(N-1)',
        'x':0.5  # center
    },
    xaxis=x_axis,
    #yaxis_type="log",
    yaxis=y_axis,
    width=1000,
    showlegend=True
)
fig = go.Figure( data=graph_growth_list, layout=layout)
fig.update_yaxes(gridcolor='black')
fig.update_xaxes(gridcolor='black')
fig.show()