# Data Cleaning for KPMG COVID-19 Projection Model

### Datasets used:
* Data 1: JHU COVID-19 US (county level)
* Data 2: Oxford policy data (state level)
* Data 3: US Census demographic/population -ACS estimate (county level)
* Data 4: Hospital beds, esri [link](https://coronavirus-resources.esri.com/datasets/1044bb19da8d4dbfb6a96eb1b4ebf629_0/data) (county level)

In [1]:
# load packages
import pandas as pd
import numpy as np
import re
import os
import datetime as dt


# 1. JHU COVID-19 USA

In [2]:
raw_cases_us = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")
raw_deaths_us = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv")

raw_cases_us.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,11/2/20,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20,11/9/20,11/10/20,11/11/20
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,2186,2197,2212,2230,2242,2267,2283,2304,2328,2351
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,6985,6995,7061,7097,7134,7188,7226,7263,7348,7409
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,1065,1074,1079,1080,1090,1092,1095,1098,1107,1112
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,883,890,897,907,917,924,926,932,948,961
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,2108,2162,2188,2222,2253,2286,2297,2335,2378,2400


In [3]:
raw_deaths_us.tail()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,11/2/20,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20,11/9/20,11/10/20,11/11/20
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,...,1,2,2,2,2,2,2,2,2,2
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,3,3,3,3,3,3,4,4,4,4
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,...,10,0,0,0,0,9,0,0,0,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,7,7,7,7,7,7,7,7,7,7
3339,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,...,0,0,1,1,1,1,0,0,0,0


In [4]:
# clean us cases and deaths
def us_col_clean(case_df, death_df):
    cols_to_drop = ['UID', 'iso2', 'iso3', 'code3', 'Admin2', 'Country_Region', 'Lat', 'Long_', 'Combined_Key']
    tmp_case = case_df.drop(cols_to_drop, axis=1)
    tmp_death = death_df.drop(cols_to_drop + ['Population'], axis=1)
    tmp_case['indicator'] = 'Confirmed'
    tmp_death['indicator'] = 'Deaths'
    tmp = pd.concat([tmp_case, tmp_death], axis=0, ignore_index=True)
  
    return tmp

# clean the FIPS codes to 5 digits
def fips_codes(df):
    df = df[pd.notnull(df['FIPS'])] # no NA for FIPS
    df['FIPS'] = df['FIPS'].astype(int)
    df['FIPS'] = ['0'+str(i) if len(str(i))==4 else str(i) for i in df['FIPS']]
    return df

In [5]:
us_raw = us_col_clean(raw_cases_us, raw_deaths_us)
us_raw = fips_codes(us_raw) # FIPS to 5 digit string
us_raw.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0,FIPS,Province_State,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20,11/9/20,11/10/20,11/11/20,indicator
0,1001,Alabama,0,0,0,0,0,0,0,0,...,2197,2212,2230,2242,2267,2283,2304,2328,2351,Confirmed
1,1003,Alabama,0,0,0,0,0,0,0,0,...,6995,7061,7097,7134,7188,7226,7263,7348,7409,Confirmed
2,1005,Alabama,0,0,0,0,0,0,0,0,...,1074,1079,1080,1090,1092,1095,1098,1107,1112,Confirmed
3,1007,Alabama,0,0,0,0,0,0,0,0,...,890,897,907,917,924,926,932,948,961,Confirmed
4,1009,Alabama,0,0,0,0,0,0,0,0,...,2162,2188,2222,2253,2286,2297,2335,2378,2400,Confirmed


In [24]:
def us_shape_clean(df):
    df = pd.melt(df, id_vars=['FIPS', 'indicator', 'Province_State'], var_name='Date', value_name='Value')
    #df.head()
    df['Date'] = df['Date'].apply(lambda x: dt.datetime.strptime(str(x), '%m/%d/%y'))
    df = pd.pivot_table(df, index = ['Date', 'FIPS', 'Province_State'], columns='indicator', values = 'Value', aggfunc=np.sum).reset_index()
    # Remove non-states
    not_state = ['American Samoa', 'Diamond Princess', 'Grand Princess', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands']
    df = df[~df['Province_State'].isin(not_state)]
    df = df.sort_values(by=['Date', 'FIPS'])

    return df

In [25]:
%%time
us_clean = us_shape_clean(us_raw)

Wall time: 18.2 s


In [26]:
# Columns: FIPS, Date, Province_State, cases, deaths
us_clean.head()

indicator,Date,FIPS,Province_State,Confirmed,Deaths
0,2020-01-22,1001,Alabama,0,0
1,2020-01-22,1003,Alabama,0,0
2,2020-01-22,1005,Alabama,0,0
3,2020-01-22,1007,Alabama,0,0
4,2020-01-22,1009,Alabama,0,0


In [27]:
us_clean.tail()

indicator,Date,FIPS,Province_State,Confirmed,Deaths
982344,2020-11-11,90051,Virginia,0,0
982345,2020-11-11,90053,Washington,416,3
982346,2020-11-11,90054,West Virginia,0,0
982347,2020-11-11,90055,Wisconsin,0,0
982348,2020-11-11,90056,Wyoming,0,0


In [None]:
####### Below is working function, please leave it
def new_cases_country(dataframe, country):
  confirmed = dataframe[dataframe['Country/Region'] == country]['Confirmed'].values.tolist()
  deaths = dataframe[dataframe['Country/Region'] == country]['Deaths'].values.tolist()
  tmp_df = dataframe[dataframe['Country/Region'] == country]
  new_confirmed = [confirmed[0]]
  new_deaths = [deaths[0]]
  for i in range(len(tmp_df)-1):
    new_confirmed.append(confirmed[i+1]-confirmed[i])
    new_deaths.append(deaths[i+1]-deaths[i])
  tmp_df['new_confirmed'] = new_confirmed
  tmp_df['new_deaths'] = new_deaths

  return tmp_df # returns subsetted df with daily new confirmed and daily new deaths

In [29]:
# Compute rolling 7 day average for new_confirmed and new_death for each county
us_clean['new_confirmed_avg'] = us_clean.groupby('FIPS')['Confirmed'].rolling(7).mean().reset_index(0, drop=True)
us_clean['new_deaths_avg'] = us_clean.groupby('FIPS')['Deaths'].rolling(7).mean().reset_index(0, drop=True)

In [32]:
# sanity check
us_clean[us_clean['FIPS'] == "36061"].tail(10) ## manhattan

indicator,Date,FIPS,Province_State,Confirmed,Deaths,new_confirmed_avg,new_deaths_avg
950908,2020-11-02,36061,New York,36814,3200,36377.857143,3197.714286
954238,2020-11-03,36061,New York,36980,3200,36532.571429,3198.428571
957568,2020-11-04,36061,New York,37100,3200,36678.857143,3199.142857
960898,2020-11-05,36061,New York,37278,3201,36829.428571,3199.571429
964228,2020-11-06,36061,New York,37430,3200,36976.571429,3199.857143
967558,2020-11-07,36061,New York,37641,3199,37136.428571,3200.0
970888,2020-11-08,36061,New York,37940,3200,37311.857143,3200.0
974218,2020-11-09,36061,New York,38171,3200,37505.714286,3200.0
977548,2020-11-10,36061,New York,38384,3198,37706.285714,3199.714286
980878,2020-11-11,36061,New York,38668,3200,37930.285714,3199.714286


In [33]:
us_clean[us_clean['FIPS'] == "90051"].tail(10) ## random county in virginia

indicator,Date,FIPS,Province_State,Confirmed,Deaths,new_confirmed_avg,new_deaths_avg
952374,2020-11-02,90051,Virginia,0,0,0.0,0.0
955704,2020-11-03,90051,Virginia,0,0,0.0,0.0
959034,2020-11-04,90051,Virginia,0,0,0.0,0.0
962364,2020-11-05,90051,Virginia,0,0,0.0,0.0
965694,2020-11-06,90051,Virginia,0,0,0.0,0.0
969024,2020-11-07,90051,Virginia,0,0,0.0,0.0
972354,2020-11-08,90051,Virginia,0,0,0.0,0.0
975684,2020-11-09,90051,Virginia,0,0,0.0,0.0
979014,2020-11-10,90051,Virginia,0,0,0.0,0.0
982344,2020-11-11,90051,Virginia,0,0,0.0,0.0


In [34]:
us_clean[us_clean['FIPS'] == "90053"].tail(10) ## random county in washington state

indicator,Date,FIPS,Province_State,Confirmed,Deaths,new_confirmed_avg,new_deaths_avg
952375,2020-11-02,90053,Washington,371,3,382.142857,3.0
955705,2020-11-03,90053,Washington,373,3,383.285714,3.0
959035,2020-11-04,90053,Washington,484,3,394.571429,3.0
962365,2020-11-05,90053,Washington,375,3,388.285714,3.0
965695,2020-11-06,90053,Washington,424,3,393.857143,3.0
969025,2020-11-07,90053,Washington,451,3,405.857143,3.0
972355,2020-11-08,90053,Washington,419,3,413.857143,3.0
975685,2020-11-09,90053,Washington,388,3,416.285714,3.0
979015,2020-11-10,90053,Washington,416,3,422.428571,3.0
982345,2020-11-11,90053,Washington,416,3,412.714286,3.0


In [None]:
# export if necessary

file_name = 'us_daily_' + str(max(us_final['Date'])) +'.csv'
us_final.to_csv(file_name, index=False)

# 2: Oxford policy data (state level)

https://raw.githubusercontent.com/OxCGRT/USA-covid-policy/master/data/OxCGRT_US_latest.csv

Dataset information:
* Aggregated by state
* Reported daily

Columns:

C1_School closing,C1_Flag,C1_Notes,
C2_Workplace closing,C2_Flag,C2_Notes,
C3_Cancel public events,C3_Flag,C3_Notes,
C4_Restrictions on gatherings,C4_Flag,C4_Notes,
C5_Close public transport,C5_Flag,C5_Notes,
C6_Stay at home requirements,C6_Flag,C6_Notes,
C7_Restrictions on internal movement,C7_Flag,C7_Notes,
C8_International travel controls,C8_Notes,

E1_Income support,E1_Flag,E1_Notes,
E2_Debt/contract relief,E2_Notes,
E3_Fiscal measures,E3_Notes,
E4_International support,E4_Notes,

H1_Public information campaigns,H1_Flag,H1_Notes,
H2_Testing policy,H2_Notes,
H3_Contact tracing,H3_Notes,
H4_Emergency investment in healthcare,H4_Notes,
H5_Investment in vaccines,H5_Notes,

M1_Wildcard,M1_Notes,

ConfirmedCases,ConfirmedDeaths,
StringencyIndex,StringencyIndexForDisplay,
StringencyLegacyIndex,StringencyLegacyIndexForDisplay,
GovernmentResponseIndex,GovernmentResponseIndexForDisplay,
ContainmentHealthIndex,ContainmentHealthIndexForDisplay,
EconomicSupportIndex,EconomicSupportIndexForDisplay


In [None]:
# Read in the data
oxford_raw = pd.read_csv('https://raw.githubusercontent.com/OxCGRT/USA-covid-policy/master/data/OxCGRT_US_latest.csv',
                 usecols=range(59))

In [None]:
oxford_raw.tail() # check

In [None]:
# Drop columns not necessary
cols_to_drop = ['CountryName', 'CountryCode','ConfirmedCases', 'ConfirmedDeaths', 'StringencyIndex','StringencyLegacyIndex','GovernmentResponseIndex']
oxford_raw = oxford_raw.drop(cols_to_drop, axis=1)
oxford_raw.info()

In [None]:
oxford = oxford_raw.copy()
del(oxford_raw) # delete, save space
oxford['Date'] = oxford['Date'].apply(lambda x: dt.datetime.strptime(str(x), '%Y%m%d'))

In [None]:
# sanity check: end of March in NY - everything should be closed
oxford[(oxford['Date']=='2020-03-31')&(oxford['RegionName']=='New York')]

In [None]:
# sanity check: Most recent in NY - everything should be closed
oxford[(oxford['Date']==max(oxford['Date']))&(oxford['RegionName']=='New York')]

In [None]:
## ready for merging
# check
jhu_states = sorted(set(us_final['Province_State'].values.tolist()))
oxford_states = sorted(set(oxford['RegionName'].values.tolist()))

# same length?
len(jhu_states) == len(oxford_states)

# 3. US Census demographic/population -ACS estimate (county level)

ACS survey gives estimate of the total population in a county


# 4. Hospital beds, esri

Downloaded, already available in .csv format

