# Data Cleaning 1: JHU COVID-19 USA dataset

Data cleaning of JHU COVID-19 USA dataset
Aggregated by state
* Confirmed = cumulative confirmed cases
* Deaths = cumulative deaths
* Population = state population
* Date
* new_confirmed = new confirmed cases each day
* new_deaths = new death cases each day

In [None]:
import pandas as pd
import numpy as np
import re
import os
import datetime as dt

In [None]:
raw_df = pd.read_csv('https://raw.githubusercontent.com/datasets/covid-19/master/data/us_simplified.csv')

print(raw_df.shape)
raw_df.head()

(928520, 6)


Unnamed: 0,Date,Admin2,Province/State,Confirmed,Deaths,Country/Region
0,2020-01-22,Autauga,Alabama,0,0,US
1,2020-01-23,Autauga,Alabama,0,0,US
2,2020-01-24,Autauga,Alabama,0,0,US
3,2020-01-25,Autauga,Alabama,0,0,US
4,2020-01-26,Autauga,Alabama,0,0,US


In [None]:
# Drop Admin2 and Country/Region: the data is US-only and is aggregated to state level below
raw_df = raw_df.drop(['Admin2', 'Country/Region'], axis=1)
raw_df.head()

Unnamed: 0,Date,Province/State,Confirmed,Deaths
0,2020-01-22,Alabama,0,0
1,2020-01-23,Alabama,0,0
2,2020-01-24,Alabama,0,0
3,2020-01-25,Alabama,0,0
4,2020-01-26,Alabama,0,0


https://raw.githubusercontent.com/datasets/covid-19/master/data/us_simplified.csv

In [None]:
# Aggregate by date and province/state: sum Confirmed and Deaths over all rows of a state
# (this dataset has no Population column)

df = raw_df.groupby(['Date', 'Province/State']).agg('sum').reset_index()
df.head()

Unnamed: 0,Date,Province/State,Confirmed,Deaths
0,2020-01-22,Alabama,0,0
1,2020-01-22,Alaska,0,0
2,2020-01-22,American Samoa,0,0
3,2020-01-22,Arizona,0,0
4,2020-01-22,Arkansas,0,0


In [None]:
set(df['Province/State'].values.tolist()) # print out states, delete irrelevant states

{'Alabama',
 'Alaska',
 'American Samoa',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Diamond Princess',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Grand Princess',
 'Guam',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Northern Mariana Islands',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Puerto Rico',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virgin Islands',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming'}

In [None]:
# Filter out territories and cruise ships: they are not states and are not relevant here
# NOTE(review): 'Virgin Islands' and 'District of Columbia' are kept by this list - confirm that is intended
not_state = ['American Samoa', 'Diamond Princess', 'Grand Princess', 'Guam', 'Northern Mariana Islands', 'Puerto Rico']
# .copy() makes df_clean an independent frame, so assigning columns to it later
# does not raise SettingWithCopyWarning
df_clean = df[~df['Province/State'].isin(not_state)].copy()
df_clean.tail()

Unnamed: 0,Date,Province/State,Confirmed,Deaths
16119,2020-10-25,Virginia,172774,3575
16120,2020-10-25,Washington,102913,2296
16121,2020-10-25,West Virginia,21907,425
16122,2020-10-25,Wisconsin,198166,1778
16123,2020-10-25,Wyoming,11041,68


In [None]:
df_clean[(df_clean['Date']=='2020-10-17') & (df_clean['Province/State'] == 'New York')] # sanity check

Unnamed: 0,Date,Province/State,Confirmed,Deaths
15638,2020-10-17,New York,482891,33347


In [None]:
import datetime as dt
# Parse the 'YYYY-MM-DD' strings of df_clean itself rather than those of df, so the
# result does not depend on index alignment with another frame.
# assign() returns a new frame, so no value is set on a slice, and pd.to_datetime
# accepts already-parsed dates, so the cell can be re-run safely.
df_clean = df_clean.assign(Date=pd.to_datetime(df_clean['Date'], format='%Y-%m-%d'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
df_clean = df_clean.sort_values(by=['Date', 'Province/State'])
df_clean.tail()

Unnamed: 0,Date,Province/State,Confirmed,Deaths
16119,2020-10-25,Virginia,172774,3575
16120,2020-10-25,Washington,102913,2296
16121,2020-10-25,West Virginia,21907,425
16122,2020-10-25,Wisconsin,198166,1778
16123,2020-10-25,Wyoming,11041,68


In [None]:
####### Below is working function, please leave it
def new_cases(dataframe, state):
  """Return the rows of one state with daily new cases and deaths added.

  Args:
    dataframe: DataFrame with 'Province/State', 'Confirmed' and 'Deaths'
      columns. Rows are differenced in the order given, so each state's rows
      must already be in chronological order.
    state: value of 'Province/State' to select.

  Returns:
    A new DataFrame holding only the rows of `state`, with two extra columns:
    'new_confirmed' and 'new_deaths'. The first row's daily value equals its
    cumulative value. `dataframe` itself is not modified. If no row matches,
    an empty frame with the two extra columns is returned.
  """
  # .copy() so the new columns are written to an independent frame, not to a slice
  tmp_df = dataframe[dataframe['Province/State'] == state].copy()
  for cumulative_col, daily_col in (('Confirmed', 'new_confirmed'), ('Deaths', 'new_deaths')):
    cumulative = tmp_df[cumulative_col]
    # subtract the previous row's total; fill_value=0 stands in for "before the first row"
    tmp_df[daily_col] = cumulative - cumulative.shift(1, fill_value=0)

  return tmp_df # returns subsetted df with daily new confirmed and daily new deaths

In [None]:
states = sorted(set(df_clean['Province/State'].values.tolist()))

# Build every per-state frame first and concatenate once: DataFrame.append was
# removed in pandas 2.0 and copied the accumulated frame on every iteration.
state_frames = [new_cases(df_clean, state=state) for state in states]
# pd.concat raises on an empty list, so fall back to an empty frame like before
df_final = pd.concat(state_frames, ignore_index=True) if state_frames else pd.DataFrame()

df_final.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,Date,Province/State,Confirmed,Deaths,new_confirmed,new_deaths
14451,2020-10-21,Wyoming,9848,61,322,0
14452,2020-10-22,Wyoming,10119,68,271,7
14453,2020-10-23,Wyoming,10545,68,426,0
14454,2020-10-24,Wyoming,10805,68,260,0
14455,2020-10-25,Wyoming,11041,68,236,0


In [None]:
# Rolling 7-day mean of new_confirmed and new_deaths, computed separately within each state
for daily_col, avg_col in (('new_confirmed', 'new_confirmed_avg'), ('new_deaths', 'new_deaths_avg')):
  weekly_window = df_final.groupby('Province/State')[daily_col].rolling(7)
  # drop the group level of the result's index so the values align with df_final's own index
  df_final[avg_col] = weekly_window.mean().reset_index(0, drop=True)

In [None]:
df_final.tail()

Unnamed: 0,Date,Province/State,Confirmed,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
14451,2020-10-21,Wyoming,9848,61,322,0,238.714286,0.571429
14452,2020-10-22,Wyoming,10119,68,271,7,249.142857,1.571429
14453,2020-10-23,Wyoming,10545,68,426,0,268.571429,1.571429
14454,2020-10-24,Wyoming,10805,68,260,0,284.142857,1.571429
14455,2020-10-25,Wyoming,11041,68,236,0,288.0,1.571429


In [None]:
# sanity check: most recent NY state data
df_final[(df_final['Date'] == max(df_final['Date'])) & (df_final['Province/State'] == 'New York')]

Unnamed: 0,Date,Province/State,Confirmed,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
9173,2020-10-25,New York,495464,33422,1632,4,1597.571429,9.285714


In [None]:
#filter to NY only
df_final[(df_final['Province/State'] == 'New York')]

Unnamed: 0,Date,Province/State,Confirmed,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
8896,2020-01-22,New York,0,0,0,0,,
8897,2020-01-23,New York,0,0,0,0,,
8898,2020-01-24,New York,0,0,0,0,,
8899,2020-01-25,New York,0,0,0,0,,
8900,2020-01-26,New York,0,0,0,0,,
...,...,...,...,...,...,...,...,...
9169,2020-10-21,New York,488506,33371,2026,5,1509.428571,7.857143
9170,2020-10-22,New York,490134,33396,1628,25,1533.428571,8.428571
9171,2020-10-23,New York,491771,33418,1637,22,1523.428571,11.571429
9172,2020-10-24,New York,493832,33418,2061,0,1563.000000,10.142857


In [None]:
# export (write to csv, or put this as an .py initialization and make it to return the data.)


This result seems right. End of data cleaning pipeline for JHU COVID-19.

---
---


# Data Cleaning 2: Oxford Policy Dataset (US ONLY)

https://raw.githubusercontent.com/OxCGRT/USA-covid-policy/master/data/OxCGRT_US_latest.csv

Dataset information:
* Aggregated by state
* Reported daily

Columns:

C1_School closing,C1_Flag,C1_Notes,
C2_Workplace closing,C2_Flag,C2_Notes,
C3_Cancel public events,C3_Flag,C3_Notes,
C4_Restrictions on gatherings,C4_Flag,C4_Notes,
C5_Close public transport,C5_Flag,C5_Notes,
C6_Stay at home requirements,C6_Flag,C6_Notes,
C7_Restrictions on internal movement,C7_Flag,C7_Notes,
C8_International travel controls,C8_Notes,

E1_Income support,E1_Flag,E1_Notes,
E2_Debt/contract relief,E2_Notes,
E3_Fiscal measures,E3_Notes,
E4_International support,E4_Notes,

H1_Public information campaigns,H1_Flag,H1_Notes,
H2_Testing policy,H2_Notes,
H3_Contact tracing,H3_Notes,
H4_Emergency investment in healthcare,H4_Notes,
H5_Investment in vaccines,H5_Notes,

M1_Wildcard,M1_Notes,

ConfirmedCases,ConfirmedDeaths,
StringencyIndex,StringencyIndexForDisplay,
StringencyLegacyIndex,StringencyLegacyIndexForDisplay,
GovernmentResponseIndex,GovernmentResponseIndexForDisplay,
ContainmentHealthIndex,ContainmentHealthIndexForDisplay,
EconomicSupportIndex,EconomicSupportIndexForDisplay


In [None]:
# low_memory=False makes pandas read the file in one pass and infer a single dtype per column,
# instead of inferring chunk by chunk and warning about mixed-type columns (DtypeWarning)
oxford_raw = pd.read_csv('https://raw.githubusercontent.com/OxCGRT/USA-covid-policy/master/data/OxCGRT_US_latest.csv', quotechar='"',skipinitialspace=True, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
oxford_raw.shape # reads in all the data

(16308, 66)

In [None]:
oxford_raw.tail()

Unnamed: 0,CountryName,CountryCode,RegionName,RegionCode,Jurisdiction,Date,C1_School closing,C1_Flag,C1_Notes,C2_Workplace closing,C2_Flag,C2_Notes,C3_Cancel public events,C3_Flag,C3_Notes,C4_Restrictions on gatherings,C4_Flag,C4_Notes,C5_Close public transport,C5_Flag,C5_Notes,C6_Stay at home requirements,C6_Flag,C6_Notes,C7_Restrictions on internal movement,C7_Flag,C7_Notes,C8_International travel controls,C8_Notes,E1_Income support,E1_Flag,E1_Notes,E2_Debt/contract relief,E2_Notes,E3_Fiscal measures,E3_Notes,E4_International support,E4_Notes,H1_Public information campaigns,H1_Flag,H1_Notes,H2_Testing policy,H2_Notes,H3_Contact tracing,H3_Notes,H4_Emergency investment in healthcare,H4_Notes,H5_Investment in vaccines,H5_Notes,H6_Facial Coverings,H6_Flag,H6_Notes,M1_Wildcard,M1_Notes,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
16303,United States,USA,Wyoming,US_WY,STATE_ALL,20201024,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10805.0,68.0,,40.74,,49.52,,45.24,,48.61,,25.0
16304,United States,USA,Wyoming,US_WY,STATE_ALL,20201025,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11041.0,68.0,,40.74,,49.52,,45.24,,48.61,,25.0
16305,United States,USA,Wyoming,US_WY,STATE_ALL,20201026,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11477.0,77.0,,40.74,,49.52,,45.24,,48.61,,25.0
16306,United States,USA,Wyoming,US_WY,STATE_ALL,20201027,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11806.0,77.0,,40.74,,49.52,,45.24,,48.61,,25.0
16307,United States,USA,Wyoming,US_WY,STATE_ALL,20201028,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,40.74,,49.52,,45.24,,48.61,,25.0


In [None]:
# Prune columns that are not needed: free-text notes, country name/code (all USA),
# confirmed cases/deaths, and the "ForDisplay" copies of the indexes
unneeded_fragments = ['_Notes', 'Country', 'Confirmed', 'ForDisplay']
keep_mask = ~oxford_raw.columns.str.contains('|'.join(unneeded_fragments))
oxford_raw = oxford_raw.loc[:, keep_mask]
# region code and jurisdiction are dropped by exact name
oxford_raw = oxford_raw.drop(columns=['RegionCode', 'Jurisdiction'])
oxford_raw.tail()

Unnamed: 0,RegionName,Date,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,M1_Wildcard,StringencyIndex,StringencyLegacyIndex,GovernmentResponseIndex,ContainmentHealthIndex,EconomicSupportIndex
16303,Wyoming,20201024,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
16304,Wyoming,20201025,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
16305,Wyoming,20201026,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
16306,Wyoming,20201027,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
16307,Wyoming,20201028,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Work on a copy so oxford_raw keeps its original YYYYMMDD dates
oxford = oxford_raw.copy()
# The dates are read as values like 20201024: render them as text, then parse as year-month-day
date_text = oxford['Date'].astype(str)
oxford['Date'] = pd.to_datetime(date_text, format='%Y%m%d')

In [None]:
# sanity check: end of March in NY - everything should be closed
oxford[(oxford['Date']=='2020-03-31')&(oxford['RegionName']=='New York')]

Unnamed: 0,RegionName,Date,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,M1_Wildcard,StringencyIndex,StringencyLegacyIndex,GovernmentResponseIndex,ContainmentHealthIndex,EconomicSupportIndex
10962,New York,2020-03-31,3.0,1.0,3.0,1.0,2.0,1.0,4.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,,0.0,0.0,,,79.63,84.52,71.43,66.67,100.0


In [None]:
# sanity check: most recent date in NY - the newest rows may not have policy values recorded yet
oxford[(oxford['Date']==max(oxford['Date']))&(oxford['RegionName']=='New York')]

Unnamed: 0,RegionName,Date,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,E1_Income support,E1_Flag,E2_Debt/contract relief,E3_Fiscal measures,E4_International support,H1_Public information campaigns,H1_Flag,H2_Testing policy,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,H6_Facial Coverings,H6_Flag,M1_Wildcard,StringencyIndex,StringencyLegacyIndex,GovernmentResponseIndex,ContainmentHealthIndex,EconomicSupportIndex
11173,New York,2020-10-28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


* **Notes**:
    - More recent dates do not have updated information. 

* **Merging**:  `RegionName` and `Date` combined as the index

In [None]:
# export to csv ## SAVES TO SYDNEY's LOCAL!!!
#from google.colab import drive
#drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
#oxford.to_csv('/gdrive/My Drive/0_Fall_2020/practicum/oxford_20201027.csv')

# Data Cleaning 3: JHU COVID-19 Global Dataset

Data cleaning of JHU COVID-19 global dataset
Aggregated by country
* Confirmed = cumulative confirmed cases
* Deaths = cumulative deaths
* Population = state population
* Date
* new_confirmed = new confirmed cases each day
* new_deaths = new death cases each day

In [None]:
# importing country lists and details from YYG
# NOTE(review): "YYG" presumably refers to an external projections project - confirm the source.
# ALL_COUNTRIES is used further down to filter the global dataset; the other lists are
# not referenced in the visible part of this notebook.

# Regional groupings
EU_COUNTRIES = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
    'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden'
]
LATIN_AMERICA_COUNTRIES = [
    'Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Cuba', 'Dominican Republic',
    'Ecuador', 'Honduras', 'Mexico', 'Panama', 'Peru',
]
AFRICAN_COUNTRIES = ['Algeria', 'Egypt', 'Morocco', 'Nigeria', 'South Africa']
ASIAN_COUNTRIES = ['Bangladesh', 'China', 'Iran', 'Israel', 'Japan', 'Indonesia', 'India', 'Kuwait',
    'Malaysia', 'Pakistan', 'Philippines', 'Russia', 'Saudi Arabia', 'South Korea', 'Turkey',
    'United Arab Emirates']
# EU members plus the additional European countries listed here
EUROPEAN_COUNTRIES = EU_COUNTRIES + [
    'United Kingdom', 'Switzerland', 'Norway',
    'Belarus', 'Iceland', 'Moldova', 'Serbia', 'Ukraine']
OTHER_COUNTRIES = ['Australia', 'Canada']

# Every non-US country above, then the full list including the US
ADDL_COUNTRIES_SUPPORTED = EUROPEAN_COUNTRIES + LATIN_AMERICA_COUNTRIES + \
    AFRICAN_COUNTRIES + ASIAN_COUNTRIES + OTHER_COUNTRIES
ALL_COUNTRIES = ADDL_COUNTRIES_SUPPORTED + ['US']

# Thematic groupings
DASH_REGIONS = ['Miami-Dade']
NON_SEASONAL_COUNTRIES = ['Indonesia', 'Philippines', 'India', 'Malaysia', 'Nigeria',
    'Bolivia', 'Colombia', 'Cuba', 'Dominican Republic', 'Ecuador', 'Honduras', 'Panama', 'Peru', 'Brazil']
SOUTHERN_HEMISPHERE_COUNTRIES = ['Argentina', 'Australia', 'Chile', 'South Africa']
# Four non-EU countries plus every EU country except Bulgaria
HIGH_INCOME_EUROPEAN_COUNTRIES = ['Iceland', 'Norway', 'Switzerland', 'United Kingdom'] + \
    [c for c in EU_COUNTRIES if c not in ['Bulgaria']]
HIGH_INCOME_COUNTRIES = ['US', 'Australia', 'Canada', 'Chile', 'Israel', 'Japan', 'South Korea',
    'Kuwait', 'Panama', 'Saudi Arabia', 'United Arab Emirates'] + HIGH_INCOME_EUROPEAN_COUNTRIES
EARLY_IMPACTED_COUNTRIES = ['US', 'Canada', 'China', 'Japan', 'South Korea', 'Israel', 'Iran'] + EUROPEAN_COUNTRIES
NO_LOCKDOWN_COUNTRIES = ['Sweden', 'Belarus']
SECOND_LOCKDOWN_COUNTRIES = ['Australia', 'Israel']

In [None]:
#read 
raw_global = pd.read_csv("https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv")

In [None]:
raw_global.head()

Unnamed: 0,Date,Country/Region,Province/State,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,,0,0.0,0
1,2020-01-23,Afghanistan,,0,0.0,0
2,2020-01-24,Afghanistan,,0,0.0,0
3,2020-01-25,Afghanistan,,0,0.0,0
4,2020-01-26,Afghanistan,,0,0.0,0


In [None]:
# Drop Lat/Long when present. The combined time-series file loaded above has no such
# columns, so errors='ignore' turns the drop into a no-op instead of a KeyError.
raw_global = raw_global.drop(['Lat', 'Long'], axis=1, errors='ignore')
raw_global.head()

KeyError: ignored

In [None]:
# Aggregate by date and country/region: sum Confirmed, Recovered and Deaths over provinces.
# The numeric columns are selected explicitly so the text column Province/State is never
# summed (pandas >= 2.0 no longer drops such columns silently).
value_cols = ['Confirmed', 'Recovered', 'Deaths']
df_global = raw_global.groupby(['Date', 'Country/Region'])[value_cols].sum().reset_index()
df_global.head()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,0,0.0,0
1,2020-01-22,Albania,0,0.0,0
2,2020-01-22,Algeria,0,0.0,0
3,2020-01-22,Andorra,0,0.0,0
4,2020-01-22,Angola,0,0.0,0


In [None]:
# Filter to the most relevant countries (ALL_COUNTRIES).
# .copy() makes df_global_clean an independent frame, so assigning columns to it later
# does not raise SettingWithCopyWarning.

df_global_clean = df_global[df_global['Country/Region'].isin(ALL_COUNTRIES)].copy()
df_global_clean.head()


Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths
2,2020-01-22,Algeria,0,0.0,0
6,2020-01-22,Argentina,0,0.0,0
8,2020-01-22,Australia,0,0.0,0
9,2020-01-22,Austria,0,0.0,0
13,2020-01-22,Bangladesh,0,0.0,0


In [None]:
# Parse the 'YYYY-MM-DD' strings in one vectorised call. assign() returns a new frame,
# so no value is set on a slice, and pd.to_datetime accepts already-parsed dates,
# so the cell can be re-run safely.
df_global_clean = df_global_clean.assign(Date=pd.to_datetime(df_global_clean['Date'], format='%Y-%m-%d'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df_global_clean = df_global_clean.sort_values(by=['Date', 'Country/Region'])
df_global_clean.tail()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths
52527,2020-10-25,Turkey,361801,314390.0,9799
52528,2020-10-25,US,8635966,3422878.0,225229
52530,2020-10-25,Ukraine,353723,147240.0,6566
52531,2020-10-25,United Arab Emirates,125123,118931.0,477
52532,2020-10-25,United Kingdom,876840,2685.0,44986


In [None]:
####### Below is working function, please leave it
def new_cases_country(dataframe, country):
  """Return the rows of one country with daily new cases and deaths added.

  Args:
    dataframe: DataFrame with 'Country/Region', 'Confirmed' and 'Deaths'
      columns. Rows are differenced in the order given, so each country's
      rows must already be in chronological order.
    country: value of 'Country/Region' to select.

  Returns:
    A new DataFrame holding only the rows of `country`, with two extra
    columns: 'new_confirmed' and 'new_deaths'. The first row's daily value
    equals its cumulative value. `dataframe` itself is not modified. If no
    row matches, an empty frame with the two extra columns is returned.
  """
  # .copy() so the new columns are written to an independent frame, not to a slice
  tmp_df = dataframe[dataframe['Country/Region'] == country].copy()
  for cumulative_col, daily_col in (('Confirmed', 'new_confirmed'), ('Deaths', 'new_deaths')):
    cumulative = tmp_df[cumulative_col]
    # subtract the previous row's total; fill_value=0 stands in for "before the first row"
    tmp_df[daily_col] = cumulative - cumulative.shift(1, fill_value=0)

  return tmp_df # returns subsetted df with daily new confirmed and daily new deaths

In [None]:
countries = sorted(set(df_global_clean['Country/Region'].values.tolist()))

# Build every per-country frame first and concatenate once: DataFrame.append was
# removed in pandas 2.0 and copied the accumulated frame on every iteration.
country_frames = [new_cases_country(df_global_clean, country=country) for country in countries]
# pd.concat raises on an empty list, so fall back to an empty frame like before
df_country_final = pd.concat(country_frames, ignore_index=True) if country_frames else pd.DataFrame()

df_country_final.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths,new_confirmed,new_deaths
19455,2020-10-21,United Kingdom,792194,2636.0,44248,26707,191
19456,2020-10-22,United Kingdom,813451,2652.0,44437,21257,189
19457,2020-10-23,United Kingdom,834010,2657.0,44661,20559,224
19458,2020-10-24,United Kingdom,857043,2676.0,44835,23033,174
19459,2020-10-25,United Kingdom,876840,2685.0,44986,19797,151


In [None]:
# Rolling 7-day mean of new_confirmed and new_deaths, computed separately within each country
for daily_col, avg_col in (('new_confirmed', 'new_confirmed_avg'), ('new_deaths', 'new_deaths_avg')):
  weekly_window = df_country_final.groupby('Country/Region')[daily_col].rolling(7)
  # drop the group level of the result's index so the values align with df_country_final's own index
  df_country_final[avg_col] = weekly_window.mean().reset_index(0, drop=True)

In [None]:
df_country_final.tail()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
19455,2020-10-21,United Kingdom,792194,2636.0,44248,26707,191,19247.857143,143.285714
19456,2020-10-22,United Kingdom,813451,2652.0,44437,21257,189,19570.857143,150.571429
19457,2020-10-23,United Kingdom,834010,2657.0,44661,20559,224,20271.142857,163.142857
19458,2020-10-24,United Kingdom,857043,2676.0,44835,23033,174,21249.285714,166.571429
19459,2020-10-25,United Kingdom,876840,2685.0,44986,19797,151,21649.714286,178.571429


# Next Cleaning Task - JHU Datasets (Updated Daily)

In [None]:
raw_cases_us = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")
raw_deaths_us = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv")

# Global cases must come from raw.githubusercontent.com like the other three files:
# a github.com/.../blob/... address serves an HTML page, not the CSV, which is why
# the parser used to skip lines. With the raw file, error_bad_lines (removed in
# pandas 2.0) is no longer needed.
raw_cases_global = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
raw_deaths_global = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")

b'Skipping line 50: expected 1 fields, saw 2\nSkipping line 55: expected 1 fields, saw 2\nSkipping line 56: expected 1 fields, saw 2\nSkipping line 70: expected 1 fields, saw 2\nSkipping line 155: expected 1 fields, saw 3\nSkipping line 167: expected 1 fields, saw 6\nSkipping line 168: expected 1 fields, saw 3\nSkipping line 192: expected 1 fields, saw 4\nSkipping line 196: expected 1 fields, saw 2\nSkipping line 198: expected 1 fields, saw 2\nSkipping line 199: expected 1 fields, saw 2\nSkipping line 200: expected 1 fields, saw 2\nSkipping line 201: expected 1 fields, saw 2\nSkipping line 202: expected 1 fields, saw 2\nSkipping line 203: expected 1 fields, saw 2\nSkipping line 204: expected 1 fields, saw 2\nSkipping line 205: expected 1 fields, saw 2\nSkipping line 206: expected 1 fields, saw 2\nSkipping line 210: expected 1 fields, saw 2\nSkipping line 211: expected 1 fields, saw 2\nSkipping line 217: expected 1 fields, saw 2\nSkipping line 220: expected 1 fields, saw 2\nSkipping lin

## 1. Data Cleaning: US

In [None]:
raw_cases_us.tail()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,...,9/18/20,9/19/20,9/20/20,9/21/20,9/22/20,9/23/20,9/24/20,9/25/20,9/26/20,9/27/20,9/28/20,9/29/20,9/30/20,10/1/20,10/2/20,10/3/20,10/4/20,10/5/20,10/6/20,10/7/20,10/8/20,10/9/20,10/10/20,10/11/20,10/12/20,10/13/20,10/14/20,10/15/20,10/16/20,10/17/20,10/18/20,10/19/20,10/20/20,10/21/20,10/22/20,10/23/20,10/24/20,10/25/20,10/26/20,10/27/20
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,"Teton, Wyoming, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,490,493,498,504,506,517,529,532,536,554,560,560,577,591,597,602,616,625,629,633,645,656,658,662,675,679,686,687,692,692,700,711,716,715,727,740,743,747,756,761
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,"Uinta, Wyoming, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,327,330,334,335,339,342,347,348,350,352,354,354,357,356,358,359,360,365,368,368,373,378,379,380,381,384,385,388,392,401,401,403,404,407,410,425,428,431,442,451
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,"Unassigned, Wyoming, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,"Washakie, Wyoming, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,113,113,113,113,113,113,113,114,114,115,115,116,116,116,117,118,119,121,120,119,123,123,124,125,126,127,132,132,133,133,134,135,139,142,143,144,145,145,146,151
3339,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,23,23,23,23,26,25,27,35,36,36,36,37,39,39,39,41,42,44,46,50,51,54,58,58,58,62,66,71,81,88,93,99,101,103,115,121,127,127,132,139


In [None]:
raw_deaths_us.tail()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,...,9/18/20,9/19/20,9/20/20,9/21/20,9/22/20,9/23/20,9/24/20,9/25/20,9/26/20,9/27/20,9/28/20,9/29/20,9/30/20,10/1/20,10/2/20,10/3/20,10/4/20,10/5/20,10/6/20,10/7/20,10/8/20,10/9/20,10/10/20,10/11/20,10/12/20,10/13/20,10/14/20,10/15/20,10/16/20,10/17/20,10/18/20,10/19/20,10/20/20,10/21/20,10/22/20,10/23/20,10/24/20,10/25/20,10/26/20,10/27/20
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,"Teton, Wyoming, US",23464,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,"Uinta, Wyoming, US",20226,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,"Unassigned, Wyoming, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,"Washakie, Wyoming, US",7805,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
3339,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",6927,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# clean us cases and deaths
def us_col_clean(case_df, death_df):
  """Strip the location metadata from the US case and death tables and stack them.

  Args:
    case_df: wide table of confirmed cases (one column per date).
    death_df: wide table of deaths; it carries an extra 'Population' column,
      which is dropped as well.

  Returns:
    One DataFrame with the case rows followed by the death rows, a fresh
    0..n-1 index, and an 'indicator' column set to 'Confirmed' or 'Deaths'.
  """
  metadata_cols = ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Country_Region', 'Lat', 'Long_', 'Combined_Key']
  # drop() returns new frames, so the inputs are left untouched
  cases = case_df.drop(columns=metadata_cols).assign(indicator='Confirmed')
  deaths = death_df.drop(columns=metadata_cols + ['Population']).assign(indicator='Deaths')

  return pd.concat([cases, deaths], axis=0, ignore_index=True)

In [None]:
us_raw = us_col_clean(raw_cases_us, raw_deaths_us)
us_raw.tail()

Unnamed: 0,Province_State,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,2/20/20,2/21/20,2/22/20,2/23/20,2/24/20,2/25/20,2/26/20,2/27/20,2/28/20,2/29/20,...,9/19/20,9/20/20,9/21/20,9/22/20,9/23/20,9/24/20,9/25/20,9/26/20,9/27/20,9/28/20,9/29/20,9/30/20,10/1/20,10/2/20,10/3/20,10/4/20,10/5/20,10/6/20,10/7/20,10/8/20,10/9/20,10/10/20,10/11/20,10/12/20,10/13/20,10/14/20,10/15/20,10/16/20,10/17/20,10/18/20,10/19/20,10/20/20,10/21/20,10/22/20,10/23/20,10/24/20,10/25/20,10/26/20,10/27/20,indicator
6675,Wyoming,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,Deaths
6676,Wyoming,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,Deaths
6677,Wyoming,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,Deaths
6678,Wyoming,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,Deaths
6679,Wyoming,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Deaths


In [None]:
def us_shape_clean(df):
  """Reshape the stacked wide US table into one row per state and date.

  Args:
    df: DataFrame with 'Province_State' and 'indicator' columns plus one
      column per date, labelled like '1/22/20' (month/day/2-digit year).

  Returns:
    DataFrame with 'Province_State', 'Date' (parsed to datetimes) and one
    column per indicator value, each holding the sum over all input rows of
    that state and date. Entries listed in `not_state` are removed and the
    rows are sorted by date, then state.
  """
  long_df = pd.melt(df, id_vars=['Province_State', 'indicator'], var_name='Date', value_name='Value')
  # One vectorised parse of the date labels instead of a Python strptime call per melted row
  long_df['Date'] = pd.to_datetime(long_df['Date'], format='%m/%d/%y')
  # The string 'sum' selects pandas' own sum; passing np.sum is deprecated in newer pandas
  wide_df = pd.pivot_table(long_df, index=['Province_State', 'Date'], columns='indicator', values='Value', aggfunc='sum').reset_index()
  # Remove non-states
  not_state = ['American Samoa', 'Diamond Princess', 'Grand Princess', 'Guam', 'Northern Mariana Islands', 'Puerto Rico']
  wide_df = wide_df[~wide_df['Province_State'].isin(not_state)]
  wide_df = wide_df.sort_values(by=['Date', 'Province_State'])

  return wide_df

In [None]:
us_clean = us_shape_clean(us_raw)

In [None]:
us_clean.tail()

indicator,Province_State,Date,Confirmed,Deaths
15119,Virginia,2020-10-27,174786,3595
15399,Washington,2020-10-27,104027,2337
15679,West Virginia,2020-10-27,22710,434
15959,Wisconsin,2020-10-27,206311,1852
16239,Wyoming,2020-10-27,11806,77


In [None]:
def calculate_daily(df, state):
  """Return one state's rows with day-over-day new cases and new deaths.

  Args:
    df: DataFrame with 'Province_State', 'Date', 'Confirmed' and 'Deaths'
      columns, where 'Confirmed' and 'Deaths' are cumulative counts.
    state: Value of 'Province_State' to select.

  Returns:
    A new DataFrame (the input is not modified) holding only the rows for
    `state`, sorted by 'Date', with two added columns:
      * new_confirmed: difference in 'Confirmed' from the previous row
      * new_deaths: difference in 'Deaths' from the previous row
    The first row keeps its cumulative value. If `state` has no rows, an
    empty frame with the two extra columns is returned.
  """
  # Explicit copy: adding columns to a filtered slice is what triggered
  # pandas' SettingWithCopyWarning.
  state_df = df[df['Province_State'] == state].sort_values(by=['Date']).copy()
  # shift(fill_value=0) makes the first difference equal the first cumulative
  # value and keeps integer columns integer (diff() would upcast to float).
  state_df['new_confirmed'] = state_df['Confirmed'] - state_df['Confirmed'].shift(1, fill_value=0)
  state_df['new_deaths'] = state_df['Deaths'] - state_df['Deaths'].shift(1, fill_value=0)

  return state_df # returns subsetted df with daily new confirmed and daily new deaths

In [None]:
# Compute the daily figures state by state, then concatenate once.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# re-copied every accumulated row on each iteration.
states = sorted(set(us_clean['Province_State'].values.tolist()))
state_frames = [calculate_daily(us_clean, state=state) for state in states]

# pd.concat raises on an empty list; fall back to an empty frame in that case.
us_final = pd.concat(state_frames, ignore_index=True) if state_frames else pd.DataFrame()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [None]:
# Preview the last five rows after adding the daily columns.
us_final.tail(n=5)

indicator,Province_State,Date,Confirmed,Deaths,new_confirmed,new_deaths
14555,Wyoming,2020-10-23,10545,68,426,0
14556,Wyoming,2020-10-24,10805,68,260,0
14557,Wyoming,2020-10-25,11041,68,236,0
14558,Wyoming,2020-10-26,11477,77,436,9
14559,Wyoming,2020-10-27,11806,77,329,0


In [None]:
# Per-state 7-row rolling means. Keys are the new columns, values the source
# columns; insertion order matches the order the columns are added.
rolling_targets = {
    'new_confirmed_avg': 'new_confirmed',
    'new_deaths_avg': 'new_deaths',
    'cum_confirmed_avg': 'Confirmed',
    'cum_death_avg': 'Deaths',
}
for avg_col, src_col in rolling_targets.items():
  per_state_mean = us_final.groupby('Province_State')[src_col].rolling(7).mean()
  # Drop the group level so the result aligns with us_final's own row index.
  us_final[avg_col] = per_state_mean.reset_index(0, drop=True)

In [None]:
# Preview the last ten rows, including the rolling-average columns.
us_final.tail(n=10)

indicator,Province_State,Date,Confirmed,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg,cum_confirmed_avg,cum_death_avg
14550,Wyoming,2020-10-18,9025,57,209,0,202.0,0.428571,8403.428571,56.571429
14551,Wyoming,2020-10-19,9311,57,286,0,215.571429,0.428571,8619.0,57.0
14552,Wyoming,2020-10-20,9526,61,215,4,223.142857,0.571429,8842.142857,57.571429
14553,Wyoming,2020-10-21,9848,61,322,0,238.714286,0.571429,9080.857143,58.142857
14554,Wyoming,2020-10-22,10119,68,271,7,249.142857,1.571429,9330.0,59.714286
14555,Wyoming,2020-10-23,10545,68,426,0,268.571429,1.571429,9598.571429,61.285714
14556,Wyoming,2020-10-24,10805,68,260,0,284.142857,1.571429,9882.714286,62.857143
14557,Wyoming,2020-10-25,11041,68,236,0,288.0,1.571429,10170.714286,64.428571
14558,Wyoming,2020-10-26,11477,77,436,9,309.428571,2.857143,10480.142857,67.285714
14559,Wyoming,2020-10-27,11806,77,329,0,325.714286,2.285714,10805.857143,69.571429


* **Add population information**
  - Data: US Census, 2019 estimate, state total population
  - Note: The data file is hosted on Sydney's personal GitHub page, so it can be downloaded directly from the web

In [None]:
# Load the state population table and discard rows with any missing value.
population_url = 'https://raw.githubusercontent.com/bolimsydneyson/sydney_bson/master/Practicum/us_state_population.csv'
us_state_pop = pd.read_csv(population_url)
us_state_pop = us_state_pop.dropna()
us_state_pop.tail()

Unnamed: 0,Region,Population
52,Washington,7614893
53,West Virginia,1792147
54,Wisconsin,5822434
55,Wyoming,578759
56,Puerto Rico,3193694


In [None]:
# Left-join the population table on state name, then drop the duplicate key
# column; states without a match in the population table get NaN.
us_total = pd.merge(
    us_final,
    us_state_pop,
    how='left',
    left_on='Province_State',
    right_on='Region',
).drop(columns=['Region'])

In [None]:
# Population is read as strings: remove any ',' characters, then cast to float.
population_text = us_total['Population'].str.replace(',', '')
us_total['Population'] = population_text.astype(float)
us_total.tail()

Unnamed: 0,Province_State,Date,Confirmed,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg,cum_confirmed_avg,cum_death_avg,Population
14555,Wyoming,2020-10-23,10545,68,426,0,268.571429,1.571429,9598.571429,61.285714,578759.0
14556,Wyoming,2020-10-24,10805,68,260,0,284.142857,1.571429,9882.714286,62.857143,578759.0
14557,Wyoming,2020-10-25,11041,68,236,0,288.0,1.571429,10170.714286,64.428571,578759.0
14558,Wyoming,2020-10-26,11477,77,436,9,309.428571,2.857143,10480.142857,67.285714,578759.0
14559,Wyoming,2020-10-27,11806,77,329,0,325.714286,2.285714,10805.857143,69.571429,578759.0


In [None]:
# Confirm the Population column's dtype after the cast.
us_total.dtypes['Population']

dtype('float64')

# Final US Data output is `us_total` with the below columns:

* Province_State: state
* Date
* Confirmed: cumulative confirmed cases
* Deaths: cumulative confirmed deaths
* new_confirmed: new confirmed cases
* new_deaths: new confirmed deaths
* new_confirmed_avg: rolling 7-day average of new confirmed cases
* new_deaths_avg: rolling 7-day average of new confirmed deaths
* cum_confirmed_avg: rolling 7-day average of cumulative confirmed cases
* cum_death_avg: rolling 7-day average of cumulative confirmed deaths
  - Note: each rolling average is NaN for a state's first 6 days, because a full 7-day window is required
* Population: 2019 population estimate, from U.S. Census data

In [None]:
# Write the US and Oxford tables to CSV and trigger a browser download (Colab only).
from google.colab import files


def _save_and_download(frame, csv_name):
  """Write `frame` to `csv_name` in the working directory, then download it."""
  frame.to_csv(csv_name)
  files.download(csv_name)


_save_and_download(us_total, 'us_total.csv')
_save_and_download(oxford, 'oxford.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>