# Data Processing Code for COVID-19 JHU Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

#### Overwrite existing data.

The JHU repository is updated daily, so the local copy is deleted and cloned again to pick up the latest reports.

In [2]:
!sudo rm -r ../data/raw/covid19-JHU/

In [3]:
!git clone https://github.com/CSSEGISandData/COVID-19.git ../data/raw/covid19-JHU

Cloning into '../data/raw/covid19-JHU'...
remote: Enumerating objects: 21632, done.[K
remote: Total 21632 (delta 0), reused 0 (delta 0), pack-reused 21632[K
Receiving objects: 100% (21632/21632), 92.13 MiB | 38.63 MiB/s, done.
Resolving deltas: 100% (11584/11584), done.


#### Load into a df

Input and output file locations

In [4]:
# Directory holding the JHU daily report CSVs inside the cloned repository.
# The trailing slash matters: file names are appended to it with plain `+`.
mypath = '../data/raw/covid19-JHU/csse_covid_19_data/csse_covid_19_daily_reports/'
# Population table; it is read with pandas and joined on its `Country` column.
population_data = '../data/raw/population/populations.csv'
# Destination of the merged per-country CSV written by the last cell.
covid19_out = '../data/processed/covid19-JHU/covid19-JHU.csv'

In [5]:
from os import listdir
from os.path import isfile, join

# Keep only the daily report CSVs. Filtering on the extension (instead of
# excluding README.md / .gitignore by name) means any other stray file in the
# folder cannot reach the date parsing further down.
# Sort by the date encoded in the file name: the names are MM-DD-YYYY, so a
# plain string sort would interleave different years.
onlyfiles = sorted(
    (f for f in listdir(mypath) if f.endswith('.csv') and isfile(join(mypath, f))),
    key=lambda f: datetime.strptime(f[:-4], '%m-%d-%Y'),
)

Check the first and last file names to confirm the date range covered.

In [6]:
# Show the first and last entries of the sorted file listing.
print(*(onlyfiles[pos] for pos in (0, -1)))

01-22-2020.csv 04-21-2020.csv


In [7]:
# Build one frame per daily report (counts summed per country, tagged with the
# report date) and stack them into `dfs`.

# Only these columns are aggregated. Selecting them explicitly keeps the text
# columns (province, last-update timestamp) out of the sum, which newer pandas
# versions no longer drop silently.
count_cols = ['Confirmed', 'Deaths', 'Recovered']

daily_frames = []
for f in onlyfiles:
    report_date = f[:-4]  # file name without the '.csv' suffix, MM-DD-YYYY
    df = pd.read_csv(mypath + f)
    # Reports from 03-22-2020 onward name the column 'Country_Region';
    # earlier ones use 'Country/Region'. rename() ignores missing keys, so
    # one call normalises both layouts.
    df = df.rename(columns={'Country_Region': 'Country/Region'})
    df = df.groupby('Country/Region')[count_cols].sum()
    df['Date'] = report_date
    daily_frames.append(df)
    print(f, ' loaded')

# Concatenate once at the end instead of growing `dfs` inside the loop.
dfs = pd.concat(daily_frames)

01-22-2020.csv  loaded
01-23-2020.csv  loaded
01-24-2020.csv  loaded
01-25-2020.csv  loaded
01-26-2020.csv  loaded
01-27-2020.csv  loaded
01-28-2020.csv  loaded
01-29-2020.csv  loaded
01-30-2020.csv  loaded
01-31-2020.csv  loaded
02-01-2020.csv  loaded
02-02-2020.csv  loaded
02-03-2020.csv  loaded
02-04-2020.csv  loaded
02-05-2020.csv  loaded
02-06-2020.csv  loaded
02-07-2020.csv  loaded
02-08-2020.csv  loaded
02-09-2020.csv  loaded
02-10-2020.csv  loaded
02-11-2020.csv  loaded
02-12-2020.csv  loaded
02-13-2020.csv  loaded
02-14-2020.csv  loaded
02-15-2020.csv  loaded
02-16-2020.csv  loaded
02-17-2020.csv  loaded
02-18-2020.csv  loaded
02-19-2020.csv  loaded
02-20-2020.csv  loaded
02-21-2020.csv  loaded
02-22-2020.csv  loaded
02-23-2020.csv  loaded
02-24-2020.csv  loaded
02-25-2020.csv  loaded
02-26-2020.csv  loaded
02-27-2020.csv  loaded
02-28-2020.csv  loaded
02-29-2020.csv  loaded
03-01-2020.csv  loaded
03-02-2020.csv  loaded
03-03-2020.csv  loaded
03-04-2020.csv  loaded
03-05-2020.

#### Modify country names

Some of the country names are reported differently in different JHU files.

Regional distinctions are merged under one unifying country name.

Some names are altered to merge with the population data.

This may need to be altered over time if JHU country names change again in future files.

In [8]:
# Map every JHU spelling / regional label to the single country name used for
# the analysis and for the join with the population table. No target name is
# itself a key, so one lookup per row is equivalent to applying the rules in
# sequence.
country_aliases = {
    'Mainland China': 'China',
    'Republic of Korea': 'South Korea',
    'Korea, South': 'South Korea',
    'Iran (Islamic Republic of)': 'Iran',
    'occupied Palestinian territory': 'Israel',
    'United Kingdom': 'UK',
    'Hong Kong SAR': 'Hong Kong',
    'Taipei and environs': 'Taiwan',
    'Taiwan*': 'Taiwan',
    'Republic of Ireland': 'Ireland',
    'Czechia': 'Czech Republic',
    'Macau': 'Macao',
    'Macao SAR': 'Macao',
    'Republic of Moldova': 'Moldova',
    'Cote d\'Ivoire': 'Ivory Coast',
    'Viet Nam': 'Vietnam',
    'Russian Federation': 'Russia',
    'Congo (Kinshasa)': 'Congo',
    'DR Congo': 'Congo',
    'North Ireland': 'Ireland',
    'St. Martin': 'Saint Martin',
}

# Strip stray whitespace first so padded names hit the same alias / country.
names = [x.strip() for x in dfs.index.tolist()]
names = [country_aliases.get(x, x) for x in names]
dfs.index = names
dfs.index.rename('Country', inplace=True)
dfs['Date'] = pd.to_datetime(dfs['Date'])

# Renaming can leave several rows for the same country on the same date
# (e.g. 'North Ireland' and 'Republic of Ireland' both become 'Ireland').
# Sum them so each (Country, Date) pair is a single row, keeping the original
# column order and the first-appearance row order.
column_order = list(dfs.columns)
dfs = (
    dfs.groupby(['Country', 'Date'], sort=False)
    .sum()
    .reset_index('Date')[column_order]
)

#### Merge with the population data

In [9]:
# Population table indexed by its `Country` column (the column itself is
# removed from the data and its name becomes the index name).
dfp = pd.read_csv(population_data).set_index('Country')
dfp.head()

Unnamed: 0_level_0,Population
Country,Unnamed: 1_level_1
China,1439323776
India,1380004385
US,331002651
Indonesia,273523615
Pakistan,220892340


In [10]:
# Outer join on the shared `Country` index keeps unmatched rows from both
# sides; Population is then coerced to a numeric dtype.
dfsj = dfs.join(dfp, how='outer').assign(
    Population=lambda joined: pd.to_numeric(joined['Population'])
)

#### ***IMPORTANT ERROR CHECK STEP***

Important error check step to see if JHU country names have changed in recent files and to see which JHU reporting regions have been left out of the final analysis. Sometimes new countries will show up in the list as well. As needed, the names of these countries will need to be modified above in order to merge properly.

In [11]:
# Rows with no population value, reduced to the per-country maximum of each
# column and listed with the highest Confirmed value first.
(
    dfsj.loc[dfsj['Population'].isna()]
    .groupby('Country')
    .max()
    .sort_values('Confirmed', ascending=False)
)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Date,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cruise Ship,712.0,8.0,567.0,2020-03-23,
Diamond Princess,712.0,13.0,644.0,2020-04-21,
Others,706.0,6.0,40.0,2020-03-10,
Kosovo,510.0,12.0,93.0,2020-04-21,
West Bank and Gaza,466.0,4.0,71.0,2020-04-21,
Congo (Brazzaville),165.0,6.0,16.0,2020-04-21,
Burma,121.0,5.0,7.0,2020-04-21,
Reunion,45.0,0.0,0.0,2020-03-21,
Saint Kitts and Nevis,15.0,0.0,0.0,2020-04-21,
Saint Vincent and the Grenadines,12.0,0.0,2.0,2020-04-21,


Drop those that don't merge properly

In [12]:
# Discard every row that has a missing value in any column, then express
# confirmed cases and deaths as a percentage of the population.
dfsj = dfsj.dropna().assign(
    PercPopConfirmed=lambda kept: 100 * kept['Confirmed'] / kept['Population'],
    PercPopDeaths=lambda kept: 100 * kept['Deaths'] / kept['Population'],
)

In [13]:
from os import makedirs
from os.path import dirname

# to_csv does not create missing folders, so make sure the output directory
# exists (e.g. on a fresh checkout) before writing.
out_dir = dirname(covid19_out)
if out_dir:
    makedirs(out_dir, exist_ok=True)
dfsj.to_csv(covid19_out)