# Data Cleaning 1: JHU COVID-19 USA dataset

Data cleaning of JHU COVID-19 USA dataset
Aggregated by state
* Confirmed = cumulative confirmed cases
* Deaths = cumulative deaths
* Population = state population
* Date
* new_confirmed = new confirmed cases each day
* new_deaths = new death cases each day

In [1]:
import pandas as pd
import numpy as np
import re
import os
import datetime as dt

In [None]:
raw_df = pd.read_csv('https://raw.githubusercontent.com/datasets/covid-19/master/data/us_simplified.csv')

print(raw_df.shape)
raw_df.head()

(901800, 8)


Unnamed: 0,Date,FIPS,Admin2,Province/State,Confirmed,Deaths,Population,Country/Region
0,2020-01-22,1001.0,Autauga,Alabama,0,0,55869,US
1,2020-01-23,1001.0,Autauga,Alabama,0,0,55869,US
2,2020-01-24,1001.0,Autauga,Alabama,0,0,55869,US
3,2020-01-25,1001.0,Autauga,Alabama,0,0,55869,US
4,2020-01-26,1001.0,Autauga,Alabama,0,0,55869,US


In [None]:
# drop US, FIPS, Admin2 (only us data)
raw_df = raw_df.drop(['FIPS', 'Admin2', 'Country/Region'], axis=1)
raw_df.head()

Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population
0,2020-01-22,Alabama,0,0,55869
1,2020-01-23,Alabama,0,0,55869
2,2020-01-24,Alabama,0,0,55869
3,2020-01-25,Alabama,0,0,55869
4,2020-01-26,Alabama,0,0,55869


https://raw.githubusercontent.com/datasets/covid-19/master/data/us_simplified.csv

In [None]:
# aggregate by date and province/state, get Confirmed, Deaths, Population

df = raw_df.groupby(['Date', 'Province/State']).agg('sum').reset_index()
df.head()

Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population
0,2020-01-22,Alabama,0,0,4903185
1,2020-01-22,Alaska,0,0,731545
2,2020-01-22,American Samoa,0,0,55641
3,2020-01-22,Arizona,0,0,7278717
4,2020-01-22,Arkansas,0,0,3017804


In [None]:
set(df['Province/State'].values.tolist()) # print out states, delete irrelevant states

{'Alabama',
 'Alaska',
 'American Samoa',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Diamond Princess',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Grand Princess',
 'Guam',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Northern Mariana Islands',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Puerto Rico',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virgin Islands',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming'}

In [None]:
# Filter out  because they are not relevant, not a state
not_state = ['American Samoa', 'Diamond Princess', 'Grand Princess', 'Guam', 'Northern Mariana Islands', 'Puerto Rico']
df_clean = df[~df['Province/State'].isin(not_state)]
df_clean.tail()

Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population
15655,2020-10-17,Virginia,164795,3419,8535519
15656,2020-10-17,Washington,97671,2239,7614893
15657,2020-10-17,West Virginia,19805,401,1792147
15658,2020-10-17,Wisconsin,166186,1574,5822434
15659,2020-10-17,Wyoming,8816,57,578759


In [None]:
df_clean[(df_clean['Date']=='2020-10-17') & (df_clean['Province/State'] == 'New York')] # sanity check

Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population
15638,2020-10-17,New York,482891,33347,19453561


In [None]:
import datetime as dt
df_clean['Date'] = df['Date'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
df_clean = df_clean.sort_values(by=['Date', 'Province/State'])
df_clean.tail()

Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population
15655,2020-10-17,Virginia,164795,3419,8535519
15656,2020-10-17,Washington,97671,2239,7614893
15657,2020-10-17,West Virginia,19805,401,1792147
15658,2020-10-17,Wisconsin,166186,1574,5822434
15659,2020-10-17,Wyoming,8816,57,578759


In [None]:
####### Below is working function, please leave it
def new_cases(dataframe, state):
    confirmed = dataframe[dataframe['Province/State'] == state]['Confirmed'].values.tolist()
    deaths = dataframe[dataframe['Province/State'] == state]['Deaths'].values.tolist()
    tmp_df = dataframe[dataframe['Province/State'] == state]
    new_confirmed = [confirmed[0]]
    new_deaths = [deaths[0]]
    for i in range(len(tmp_df)-1):
        new_confirmed.append(confirmed[i+1]-confirmed[i])
        new_deaths.append(deaths[i+1]-deaths[i])
    tmp_df['new_confirmed'] = new_confirmed
    tmp_df['new_deaths'] = new_deaths

    return tmp_df # returns subsetted df with daily new confirmed and daily new deaths

In [None]:
states = sorted(set(df_clean['Province/State'].values.tolist()))
df_final = pd.DataFrame() # empty dataframe to store information

for state in states:
  df_tmp = new_cases(df_clean, state=state)
  df_final = df_final.append(df_tmp, ignore_index=True)

df_final.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population,new_confirmed,new_deaths
14035,2020-10-13,Wyoming,7964,57,578759,162,3
14036,2020-10-14,Wyoming,8177,57,578759,213,0
14037,2020-10-15,Wyoming,8375,57,578759,198,0
14038,2020-10-16,Wyoming,8665,57,578759,290,0
14039,2020-10-17,Wyoming,8816,57,578759,151,0


In [None]:
# Compute rolling 7 day average for new_confirmed and new_death for each state
df_final['new_confirmed_avg'] = df_final.groupby('Province/State')['new_confirmed'].rolling(7).mean().reset_index(0, drop=True)
df_final['new_deaths_avg'] = df_final.groupby('Province/State')['new_deaths'].rolling(7).mean().reset_index(0, drop=True)

In [None]:
df_final.tail()

Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
14035,2020-10-13,Wyoming,7964,57,578759,162,3,170.571429,0.571429
14036,2020-10-14,Wyoming,8177,57,578759,213,0,182.571429,0.571429
14037,2020-10-15,Wyoming,8375,57,578759,198,0,183.285714,0.428571
14038,2020-10-16,Wyoming,8665,57,578759,290,0,190.0,0.428571
14039,2020-10-17,Wyoming,8816,57,578759,151,0,194.428571,0.428571


In [None]:
# sanity check: most recent NY state data
df_final[(df_final['Date'] == max(df_final['Date'])) & (df_final['Province/State'] == 'New York')]

Unnamed: 0,Date,Province/State,Confirmed,Deaths,Population,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
8909,2020-10-17,New York,482891,33347,19453561,1784,10,1392.571429,7.714286


In [None]:
# export (write to csv, or put this as an .py initialization and make it to return the data.)

This result seems right. End of data cleaning pipeline for JHU COVID-19.

---
---


# Data Cleaning 2: Oxford Policy Dataset (US ONLY)

https://raw.githubusercontent.com/OxCGRT/USA-covid-policy/master/data/OxCGRT_US_latest.csv

Dataset information:
* Aggregated by state
* Reported daily

Columns:

C1_School closing,C1_Flag,C1_Notes,
C2_Workplace closing,C2_Flag,C2_Notes,
C3_Cancel public events,C3_Flag,C3_Notes,
C4_Restrictions on gatherings,C4_Flag,C4_Notes,
C5_Close public transport,C5_Flag,C5_Notes,
C6_Stay at home requirements,C6_Flag,C6_Notes,
C7_Restrictions on internal movement,C7_Flag,C7_Notes,
C8_International travel controls,C8_Notes,

E1_Income support,E1_Flag,E1_Notes,
E2_Debt/contract relief,E2_Notes,
E3_Fiscal measures,E3_Notes,
E4_International support,E4_Notes,

H1_Public information campaigns,H1_Flag,H1_Notes,
H2_Testing policy,H2_Notes,
H3_Contact tracing,H3_Notes,
H4_Emergency investment in healthcare,H4_Notes,
H5_Investment in vaccines,H5_Notes,

M1_Wildcard,M1_Notes,

ConfirmedCases,ConfirmedDeaths,
StringencyIndex,StringencyIndexForDisplay,
StringencyLegacyIndex,StringencyLegacyIndexForDisplay,
GovernmentResponseIndex,GovernmentResponseIndexForDisplay,
ContainmentHealthIndex,ContainmentHealthIndexForDisplay,
EconomicSupportIndex,EconomicSupportIndexForDisplay


In [None]:
# Read in the data
oxford_raw = pd.read_csv('https://raw.githubusercontent.com/OxCGRT/USA-covid-policy/master/data/OxCGRT_US_latest.csv',
                 usecols=range(59))

In [None]:
oxford_raw.tail() # check

Unnamed: 0,CountryName,CountryCode,RegionName,RegionCode,Jurisdiction,Date,C1_School closing,C1_Flag,C1_Notes,C2_Workplace closing,C2_Flag,C2_Notes,C3_Cancel public events,C3_Flag,C3_Notes,C4_Restrictions on gatherings,C4_Flag,C4_Notes,C5_Close public transport,C5_Flag,C5_Notes,C6_Stay at home requirements,C6_Flag,C6_Notes,C7_Restrictions on internal movement,C7_Flag,C7_Notes,C8_International travel controls,C8_Notes,E1_Income support,E1_Flag,E1_Notes,E2_Debt/contract relief,E2_Notes,E3_Fiscal measures,E3_Notes,E4_International support,E4_Notes,H1_Public information campaigns,H1_Flag,H1_Notes,H2_Testing policy,H2_Notes,H3_Contact tracing,H3_Notes,H4_Emergency investment in healthcare,H4_Notes,H5_Investment in vaccines,H5_Notes,M1_Wildcard,M1_Notes,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay
15925,United States,USA,Wyoming,US_WY,STATE_ALL,20201017,1.0,1.0,,1.0,1.0,,1.0,1.0,,,,,2.0,0.0,,0.0,,,0.0,,,0.0,,0.0,,,1.0,,,,,,2.0,1.0,,2.0,,2.0,,,,,,,,8816.0,57.0,,40.74,47.62,47.62,,44.87
15926,United States,USA,Wyoming,US_WY,STATE_ALL,20201018,1.0,1.0,,1.0,1.0,,1.0,1.0,,,,,2.0,0.0,,0.0,,,0.0,,,0.0,,0.0,,,1.0,,,,,,2.0,1.0,,2.0,,2.0,,,,,,,,9025.0,57.0,,40.74,47.62,47.62,,44.87
15927,United States,USA,Wyoming,US_WY,STATE_ALL,20201019,1.0,1.0,,1.0,1.0,,1.0,1.0,,,,,2.0,0.0,,0.0,,,0.0,,,0.0,,0.0,,,1.0,,,,,,2.0,1.0,,2.0,,2.0,,,,,,,,9311.0,57.0,,40.74,47.62,47.62,,44.87
15928,United States,USA,Wyoming,US_WY,STATE_ALL,20201020,1.0,1.0,,1.0,1.0,,1.0,1.0,,,,,2.0,0.0,,0.0,,,0.0,,,0.0,,0.0,,,1.0,,,,,,2.0,1.0,,2.0,,2.0,,,,,,,,9526.0,61.0,,40.74,47.62,47.62,,44.87
15929,United States,USA,Wyoming,US_WY,STATE_ALL,20201021,1.0,1.0,,1.0,1.0,,1.0,1.0,,,,,2.0,0.0,,0.0,,,0.0,,,0.0,,0.0,,,1.0,,,,,,2.0,1.0,,2.0,,2.0,,,,,,,,,,,40.74,47.62,47.62,,44.87


In [None]:
# Drop columns not necessary
oxford_raw = oxford_raw.drop(['CountryName', 'CountryCode','ConfirmedCases', 'ConfirmedDeaths', 'StringencyIndex','StringencyLegacyIndex','GovernmentResponseIndex'], axis=1)
oxford_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15930 entries, 0 to 15929
Data columns (total 52 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   RegionName                             15340 non-null  object 
 1   RegionCode                             15340 non-null  object 
 2   Jurisdiction                           15930 non-null  object 
 3   Date                                   15930 non-null  int64  
 4   C1_School closing                      15391 non-null  float64
 5   C1_Flag                                11208 non-null  float64
 6   C1_Notes                               795 non-null    object 
 7   C2_Workplace closing                   15405 non-null  float64
 8   C2_Flag                                10348 non-null  float64
 9   C2_Notes                               778 non-null    object 
 10  C3_Cancel public events                15403 non-null  float64
 11  C3

In [None]:
oxford = oxford_raw.copy()
oxford['Date'] = oxford['Date'].apply(lambda x: dt.datetime.strptime(str(x), '%Y%m%d'))

In [None]:
# sanity check: end of March in NY - everything should be closed
oxford[(oxford['Date']=='2020-03-31')&(oxford['RegionName']=='New York')]

Unnamed: 0,RegionName,RegionCode,Jurisdiction,Date,C1_School closing,C1_Flag,C1_Notes,C2_Workplace closing,C2_Flag,C2_Notes,C3_Cancel public events,C3_Flag,C3_Notes,C4_Restrictions on gatherings,C4_Flag,C4_Notes,C5_Close public transport,C5_Flag,C5_Notes,C6_Stay at home requirements,C6_Flag,C6_Notes,C7_Restrictions on internal movement,C7_Flag,C7_Notes,C8_International travel controls,C8_Notes,E1_Income support,E1_Flag,E1_Notes,E2_Debt/contract relief,E2_Notes,E3_Fiscal measures,E3_Notes,E4_International support,E4_Notes,H1_Public information campaigns,H1_Flag,H1_Notes,H2_Testing policy,H2_Notes,H3_Contact tracing,H3_Notes,H4_Emergency investment in healthcare,H4_Notes,H5_Investment in vaccines,H5_Notes,M1_Wildcard,M1_Notes,StringencyIndexForDisplay,StringencyLegacyIndexForDisplay,GovernmentResponseIndexForDisplay
10710,New York,US_NY,STATE_ALL,2020-03-31,3.0,1.0,,3.0,1.0,,2.0,1.0,,4.0,1.0,,1.0,1.0,,2.0,1.0,,1.0,1.0,,2.0,,2.0,1.0,,2.0,,0.0,,0.0,,2.0,1.0,,1.0,,1.0,,,,,,,,79.63,84.52,76.92


In [None]:
# sanity check: Most recent in NY - everything should be closed
oxford[(oxford['Date']==max(oxford['Date']))&(oxford['RegionName']=='New York')]

Unnamed: 0,RegionName,RegionCode,Jurisdiction,Date,C1_School closing,C1_Flag,C1_Notes,C2_Workplace closing,C2_Flag,C2_Notes,C3_Cancel public events,C3_Flag,C3_Notes,C4_Restrictions on gatherings,C4_Flag,C4_Notes,C5_Close public transport,C5_Flag,C5_Notes,C6_Stay at home requirements,C6_Flag,C6_Notes,C7_Restrictions on internal movement,C7_Flag,C7_Notes,C8_International travel controls,C8_Notes,E1_Income support,E1_Flag,E1_Notes,E2_Debt/contract relief,E2_Notes,E3_Fiscal measures,E3_Notes,E4_International support,E4_Notes,H1_Public information campaigns,H1_Flag,H1_Notes,H2_Testing policy,H2_Notes,H3_Contact tracing,H3_Notes,H4_Emergency investment in healthcare,H4_Notes,H5_Investment in vaccines,H5_Notes,M1_Wildcard,M1_Notes,StringencyIndexForDisplay,StringencyLegacyIndexForDisplay,GovernmentResponseIndexForDisplay
10914,New York,US_NY,STATE_ALL,2020-10-21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,69.91,74.52,75.32


# Data Cleaning 3: JHU COVID-19 Global Dataset

Data cleaning of JHU COVID-19 USA dataset
Aggregated by state
* Confirmed = cumulative confirmed (cases?)
* Deaths = cumulative deaths
* Population = state population
* Date
* new_confirmed = new confirmed cases each day
* new_deaths = new death cases each day

In [None]:
# importing country lists and details from YYG

EU_COUNTRIES = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
    'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden'
]
LATIN_AMERICA_COUNTRIES = [
    'Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Cuba', 'Dominican Republic',
    'Ecuador', 'Honduras', 'Mexico', 'Panama', 'Peru',
]
AFRICAN_COUNTRIES = ['Algeria', 'Egypt', 'Morocco', 'Nigeria', 'South Africa']
ASIAN_COUNTRIES = ['Bangladesh', 'China', 'Iran', 'Israel', 'Japan', 'Indonesia', 'India', 'Kuwait',
    'Malaysia', 'Pakistan', 'Philippines', 'Russia', 'Saudi Arabia', 'South Korea', 'Turkey',
    'United Arab Emirates']
EUROPEAN_COUNTRIES = EU_COUNTRIES + [
    'United Kingdom', 'Switzerland', 'Norway',
    'Belarus', 'Iceland', 'Moldova', 'Serbia', 'Ukraine']
OTHER_COUNTRIES = ['Australia', 'Canada']

ADDL_COUNTRIES_SUPPORTED = EUROPEAN_COUNTRIES + LATIN_AMERICA_COUNTRIES + \
    AFRICAN_COUNTRIES + ASIAN_COUNTRIES + OTHER_COUNTRIES
ALL_COUNTRIES = ADDL_COUNTRIES_SUPPORTED + ['US']

DASH_REGIONS = ['Miami-Dade']
NON_SEASONAL_COUNTRIES = ['Indonesia', 'Philippines', 'India', 'Malaysia', 'Nigeria',
    'Bolivia', 'Colombia', 'Cuba', 'Dominican Republic', 'Ecuador', 'Honduras', 'Panama', 'Peru', 'Brazil']
SOUTHERN_HEMISPHERE_COUNTRIES = ['Argentina', 'Australia', 'Chile', 'South Africa']
HIGH_INCOME_EUROPEAN_COUNTRIES = ['Iceland', 'Norway', 'Switzerland', 'United Kingdom'] + \
    [c for c in EU_COUNTRIES if c not in ['Bulgaria']]
HIGH_INCOME_COUNTRIES = ['US', 'Australia', 'Canada', 'Chile', 'Israel', 'Japan', 'South Korea',
    'Kuwait', 'Panama', 'Saudi Arabia', 'United Arab Emirates'] + HIGH_INCOME_EUROPEAN_COUNTRIES
EARLY_IMPACTED_COUNTRIES = ['US', 'Canada', 'China', 'Japan', 'South Korea', 'Israel', 'Iran'] + EUROPEAN_COUNTRIES
NO_LOCKDOWN_COUNTRIES = ['Sweden', 'Belarus']
SECOND_LOCKDOWN_COUNTRIES = ['Australia', 'Israel']

In [None]:
#read 
raw_global = pd.read_csv("https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv")

In [None]:
raw_global.head()

Unnamed: 0,Date,Country/Region,Province/State,Lat,Long,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,,33.93911,67.709953,0,0.0,0
1,2020-01-23,Afghanistan,,33.93911,67.709953,0,0.0,0
2,2020-01-24,Afghanistan,,33.93911,67.709953,0,0.0,0
3,2020-01-25,Afghanistan,,33.93911,67.709953,0,0.0,0
4,2020-01-26,Afghanistan,,33.93911,67.709953,0,0.0,0


In [None]:
#drop lat/long
raw_global = raw_global.drop(['Lat', 'Long'], axis=1)
raw_global.head()

Unnamed: 0,Date,Country/Region,Province/State,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,,0,0.0,0
1,2020-01-23,Afghanistan,,0,0.0,0
2,2020-01-24,Afghanistan,,0,0.0,0
3,2020-01-25,Afghanistan,,0,0.0,0
4,2020-01-26,Afghanistan,,0,0.0,0


In [None]:
# aggregate by date and country/region, get Confirmed, Deaths, Population

df_global = raw_global.groupby(['Date', 'Country/Region']).agg('sum').reset_index()
df_global.head()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,0,0.0,0
1,2020-01-22,Albania,0,0.0,0
2,2020-01-22,Algeria,0,0.0,0
3,2020-01-22,Andorra,0,0.0,0
4,2020-01-22,Angola,0,0.0,0


In [None]:
# filter to most relevant countries

df_global_clean = df_global[df_global['Country/Region'].isin(ALL_COUNTRIES)]
df_global_clean.head()


Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths
2,2020-01-22,Algeria,0,0.0,0
6,2020-01-22,Argentina,0,0.0,0
8,2020-01-22,Australia,0,0.0,0
9,2020-01-22,Austria,0,0.0,0
13,2020-01-22,Bangladesh,0,0.0,0


In [None]:
df_global_clean['Date'] = df_global_clean['Date'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df_global_clean = df_global_clean.sort_values(by=['Date', 'Country/Region'])
df_global_clean.tail()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths
51015,2020-10-17,Turkey,345678,302499.0,9224
51016,2020-10-17,US,8106384,3220573.0,219286
51018,2020-10-17,Ukraine,301856,129033.0,5669
51019,2020-10-17,United Arab Emirates,114387,106354.0,459
51020,2020-10-17,United Kingdom,708298,2572.0,43669


In [None]:
####### Below is working function, please leave it
def new_cases_country(dataframe, country):
  confirmed = dataframe[dataframe['Country/Region'] == country]['Confirmed'].values.tolist()
  deaths = dataframe[dataframe['Country/Region'] == country]['Deaths'].values.tolist()
  tmp_df = dataframe[dataframe['Country/Region'] == country]
  new_confirmed = [confirmed[0]]
  new_deaths = [deaths[0]]
  for i in range(len(tmp_df)-1):
    new_confirmed.append(confirmed[i+1]-confirmed[i])
    new_deaths.append(deaths[i+1]-deaths[i])
  tmp_df['new_confirmed'] = new_confirmed
  tmp_df['new_deaths'] = new_deaths

  return tmp_df # returns subsetted df with daily new confirmed and daily new deaths

In [None]:
countries = sorted(set(df_global_clean['Country/Region'].values.tolist()))
df_country_final = pd.DataFrame() # empty dataframe to store information

for country in countries:
  df_tmp = new_cases_country(df_global_clean, country=country)
  df_country_final = df_country_final.append(df_tmp, ignore_index=True)

df_country_final.tail()

In [None]:
# Compute rolling 7 day average for new_confirmed and new_death for each state
df_country_final['new_confirmed_avg'] = df_country_final.groupby('Country/Region')['new_confirmed'].rolling(7).mean().reset_index(0, drop=True)
df_country_final['new_deaths_avg'] = df_country_final.groupby('Country/Region')['new_deaths'].rolling(7).mean().reset_index(0, drop=True)

In [None]:
df_country_final.tail()

Unnamed: 0,Date,Country/Region,Confirmed,Recovered,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
18895,2020-10-13,United Kingdom,637708,2535.0,43108,17250,143,14989.857143,81.857143
18896,2020-10-14,United Kingdom,657459,2551.0,43245,19751,137,15786.714286,91.428571
18897,2020-10-15,United Kingdom,676455,2561.0,43383,18996,138,15993.285714,100.142857
18898,2020-10-16,United Kingdom,692112,2565.0,43519,15657,136,16246.0,107.142857
18899,2020-10-17,United Kingdom,708298,2572.0,43669,16186,150,16390.428571,117.0


# Next Cleaning Task - JHU Datasets (Updated Daily)

In [2]:
raw_cases_us = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")
raw_deaths_us = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv")

raw_cases_global = pd.read_csv("https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv", error_bad_lines=False)
raw_deaths_global = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")

b'Skipping line 51: expected 1 fields, saw 2\nSkipping line 56: expected 1 fields, saw 2\nSkipping line 57: expected 1 fields, saw 2\nSkipping line 71: expected 1 fields, saw 2\nSkipping line 110: expected 1 fields, saw 2\nSkipping line 156: expected 1 fields, saw 3\nSkipping line 168: expected 1 fields, saw 6\nSkipping line 169: expected 1 fields, saw 3\nSkipping line 193: expected 1 fields, saw 4\nSkipping line 197: expected 1 fields, saw 2\nSkipping line 199: expected 1 fields, saw 2\nSkipping line 200: expected 1 fields, saw 2\nSkipping line 201: expected 1 fields, saw 2\nSkipping line 202: expected 1 fields, saw 2\nSkipping line 203: expected 1 fields, saw 2\nSkipping line 204: expected 1 fields, saw 2\nSkipping line 205: expected 1 fields, saw 2\nSkipping line 206: expected 1 fields, saw 2\nSkipping line 207: expected 1 fields, saw 2\nSkipping line 211: expected 1 fields, saw 2\nSkipping line 212: expected 1 fields, saw 2\nSkipping line 218: expected 1 fields, saw 2\nSkipping lin

## 1. Data Cleaning: US

In [3]:
raw_cases_us.tail()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,10/12/20,10/13/20,10/14/20,10/15/20,10/16/20,10/17/20,10/18/20,10/19/20,10/20/20,10/21/20
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,...,675,679,686,687,692,692,700,711,716,715
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,381,384,385,388,392,401,401,403,404,407
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,126,127,132,132,133,133,134,135,139,142
3339,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,...,58,62,66,71,81,88,93,99,101,103


In [4]:
raw_deaths_us.tail()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,10/12/20,10/13/20,10/14/20,10/15/20,10/16/20,10/17/20,10/18/20,10/19/20,10/20/20,10/21/20
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,...,1,1,1,1,1,1,1,1,1,1
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,2,2,2,2,2,2,2,2,2,2
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,6,7,7,7,7,7,7,7,7,7
3339,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# clean us cases and deaths
def us_col_clean(case_df, death_df):
    cols_to_drop = ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Country_Region', 'Lat', 'Long_', 'Combined_Key']
    tmp_case = case_df.drop(cols_to_drop, axis=1)
    tmp_death = death_df.drop(cols_to_drop + ['Population'], axis=1)
    tmp_case['indicator'] = 'Confirmed'
    tmp_death['indicator'] = 'Deaths'
    tmp = pd.concat([tmp_case, tmp_death], axis=0, ignore_index=True)
  
    return tmp

In [6]:
us_raw = us_col_clean(raw_cases_us, raw_deaths_us)
us_raw.tail()

Unnamed: 0,Province_State,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,10/13/20,10/14/20,10/15/20,10/16/20,10/17/20,10/18/20,10/19/20,10/20/20,10/21/20,indicator
6675,Wyoming,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,Deaths
6676,Wyoming,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,Deaths
6677,Wyoming,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Deaths
6678,Wyoming,0,0,0,0,0,0,0,0,0,...,7,7,7,7,7,7,7,7,7,Deaths
6679,Wyoming,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Deaths


In [16]:
def us_shape_clean(df):
    df = pd.melt(df, id_vars=['Province_State', 'indicator'], var_name='Date', value_name='Value')
    #df.head()
    df['Date'] = df['Date'].apply(lambda x: dt.datetime.strptime(str(x), '%m/%d/%y'))
    df = pd.pivot_table(df, index = ['Province_State', 'Date'], columns='indicator', values = 'Value', aggfunc=np.sum).reset_index()
    # Remove non-states
    not_state = ['American Samoa', 'Diamond Princess', 'Grand Princess', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands']
    df = df[~df['Province_State'].isin(not_state)]
    df = df.sort_values(by=['Date', 'Province_State'])

    return df

In [17]:
us_clean = us_shape_clean(us_raw)

In [18]:
us_clean.tail()

indicator,Province_State,Date,Confirmed,Deaths
14795,Virginia,2020-10-21,168260,3511
15069,Washington,2020-10-21,99874,2286
15343,West Virginia,2020-10-21,20735,414
15617,Wisconsin,2020-10-21,182687,1681
15891,Wyoming,2020-10-21,9848,61


In [19]:
def calculate_daily(df, state):
    df = df.sort_values(by=['Date'])
    confirmed = df[df['Province_State'] == state]['Confirmed'].values.tolist()
    deaths = df[df['Province_State'] == state]['Deaths'].values.tolist()
    tmp_df = df[df['Province_State'] == state]
    new_confirmed = [confirmed[0]]
    new_deaths = [deaths[0]]
    for i in range(len(tmp_df)-1):
        new_confirmed.append(confirmed[i+1]-confirmed[i])
        new_deaths.append(deaths[i+1]-deaths[i])
    tmp_df['new_confirmed'] = new_confirmed
    tmp_df['new_deaths'] = new_deaths

    return tmp_df # returns subsetted df with daily new confirmed and daily new deaths

In [20]:
states = sorted(set(us_clean['Province_State'].values.tolist()))
us_final = pd.DataFrame()

for state in states:
    tmp = calculate_daily(us_clean, state=state)
    us_final = us_final.append(tmp, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [21]:
us_final.tail()

indicator,Province_State,Date,Confirmed,Deaths,new_confirmed,new_deaths
13969,Wyoming,2020-10-17,8816,57,151,0
13970,Wyoming,2020-10-18,9025,57,209,0
13971,Wyoming,2020-10-19,9311,57,286,0
13972,Wyoming,2020-10-20,9526,61,215,4
13973,Wyoming,2020-10-21,9848,61,322,0


In [22]:
# Compute rolling 7 day average for new_confirmed and new_death for each state
us_final['new_confirmed_avg'] = us_final.groupby('Province_State')['new_confirmed'].rolling(7).mean().reset_index(0, drop=True)
us_final['new_deaths_avg'] = us_final.groupby('Province_State')['new_deaths'].rolling(7).mean().reset_index(0, drop=True)

In [23]:
us_final.tail(10)

indicator,Province_State,Date,Confirmed,Deaths,new_confirmed,new_deaths,new_confirmed_avg,new_deaths_avg
13964,Wyoming,2020-10-12,7802,54,191,0,167.571429,0.142857
13965,Wyoming,2020-10-13,7964,57,162,3,170.571429,0.571429
13966,Wyoming,2020-10-14,8177,57,213,0,182.571429,0.571429
13967,Wyoming,2020-10-15,8375,57,198,0,183.285714,0.428571
13968,Wyoming,2020-10-16,8665,57,290,0,190.0,0.428571
13969,Wyoming,2020-10-17,8816,57,151,0,194.428571,0.428571
13970,Wyoming,2020-10-18,9025,57,209,0,202.0,0.428571
13971,Wyoming,2020-10-19,9311,57,286,0,215.571429,0.428571
13972,Wyoming,2020-10-20,9526,61,215,4,223.142857,0.571429
13973,Wyoming,2020-10-21,9848,61,322,0,238.714286,0.571429


In [24]:
us_final.to_csv('us_daily_10_22.csv', index=False)