# Prepare Datasets for Novel Coronavirus (COVID-19) Outbreak

In [1]:
import pandas as pd
import dateutil

In [2]:
# Render complete DataFrames in cell output instead of pandas' truncated preview.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

### Read number of COVID-19 cases
Data are provided by [Coronavirus COVID-19 Global Cases by Johns Hopkins CSSE](https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6)

Check this [git repository](https://github.com/CSSEGISandData/COVID-19/tree/master/daily_case_updates) for the latest available dataset and adjust ```daily_update``` accordingly.

In [3]:
# File name (without the ".csv" extension) of the snapshot to load from the
# daily_case_updates/ folder. The value appears to follow MM-DD-YYYY_HHMM --
# confirm against the repository listing when changing it.
daily_update = '02-12-2020_1020'

In [4]:
# Load the daily snapshot selected by `daily_update` straight from the raw GitHub URL.
cases = pd.read_csv(
    f"https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/daily_case_updates/{daily_update}.csv"
)

Clean up missing data and update data types

In [5]:
# Replace missing Province/State values with '' so later string handling
# only ever sees str values.
cases['Province/State'] = cases['Province/State'].fillna('')
# Parse the timestamps with pandas' vectorized converter instead of calling
# dateutil.parser.parse once per row. This also removes the reliance on the
# `dateutil.parser` submodule, which a bare `import dateutil` does not import
# explicitly.
cases['Last Update'] = pd.to_datetime(cases['Last Update'])

In [6]:
cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  73 non-null     object        
 1   Country/Region  73 non-null     object        
 2   Last Update     73 non-null     datetime64[ns]
 3   Confirmed       73 non-null     int64         
 4   Deaths          73 non-null     int64         
 5   Recovered       73 non-null     int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 3.5+ KB


In [7]:
cases

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Hubei,Mainland China,2020-02-12 14:13:08,33366,1068,2686
1,Guangdong,Mainland China,2020-02-12 12:23:09,1219,1,275
2,Henan,Mainland China,2020-02-12 14:13:08,1135,8,246
3,Zhejiang,Mainland China,2020-02-12 10:13:20,1131,0,321
4,Hunan,Mainland China,2020-02-12 12:43:03,946,2,304
5,Anhui,Mainland China,2020-02-12 10:13:20,889,4,127
6,Jiangxi,Mainland China,2020-02-12 01:23:06,844,1,152
7,Jiangsu,Mainland China,2020-02-12 14:13:08,543,0,125
8,Chongqing,Mainland China,2020-02-12 08:53:03,509,3,87
9,Shandong,Mainland China,2020-02-12 11:13:05,497,2,92


### Read time series file 
We use this file to retrieve the Latitude and Longitude of the outbreak locations.

In [8]:
# Confirmed-case time series. The datasets exported further down take only the
# Lat/Long columns from this file; the per-timestamp counts are merged in but
# not written out.
time_series = pd.read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/time_series/time_series_2019-ncov-Confirmed.csv")

Clean up missing data and update data types

In [9]:
# Replace missing Province/State values with '' so later string handling
# only ever sees str values.
time_series['Province/State'] = time_series['Province/State'].fillna('')
# The first four columns are Province/State, Country/Region, Lat and Long;
# every column after them holds case counts for one timestamp.
count_columns = time_series.columns[4:]
# Fill gaps with 0 and cast the count columns to int64. An
# `.iloc[:, 4:] = ...` assignment writes into the existing columns, which can
# leave them as float64, so build a correctly typed frame instead.
time_series = (
    time_series
    .fillna(dict.fromkeys(count_columns, 0))
    .astype(dict.fromkeys(count_columns, 'int64'))
)

In [10]:
time_series.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 45 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  73 non-null     object 
 1   Country/Region  73 non-null     object 
 2   Lat             73 non-null     float64
 3   Long            73 non-null     float64
 4   1/21/20 22:00   73 non-null     int64  
 5   1/22/20 12:00   73 non-null     int64  
 6   1/23/20 12:00   73 non-null     int64  
 7   1/24/20 0:00    73 non-null     int64  
 8   1/24/20 12:00   73 non-null     int64  
 9   1/25/20 0:00    73 non-null     int64  
 10  1/25/20 12:00   73 non-null     int64  
 11  1/25/20 22:00   73 non-null     int64  
 12  1/26/20 11:00   73 non-null     int64  
 13  1/26/20 23:00   73 non-null     int64  
 14  1/27/20 9:00    73 non-null     int64  
 15  1/27/20 19:00   73 non-null     int64  
 16  1/27/20 20:30   73 non-null     int64  
 17  1/28/20 13:00   73 non-null     int64

In [11]:
time_series

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/21/20 22:00,1/22/20 12:00,1/23/20 12:00,1/24/20 0:00,1/24/20 12:00,1/25/20 0:00,1/25/20 12:00,1/25/20 22:00,1/26/20 11:00,1/26/20 23:00,1/27/20 9:00,1/27/20 19:00,1/27/20 20:30,1/28/20 13:00,1/28/20 18:00,1/28/20 23:00,1/29/20 13:30,1/29/20 14:30,1/29/20 21:00,1/30/20 11:00,1/31/20 14:00,2/1/20 10:00,2/2/20 21:00,2/3/20 21:00,2/4/20 9:40,2/4/20 22:00,2/5/20 9:00,2/5/20 23:00,2/6/20 9:00,2/6/20 14:20,2/7/20 20:13,2/7/20 22:50,2/8/20 10:24,2/8/20 23:04,2/9/20 10:30,2/9/20 23:20,2/10/20 10:30,2/10/20 19:30,2/11/20 10:50,2/11/20 20:44,2/12/20 10:20
0,Anhui,Mainland China,31.82571,117.2264,0,1,9,15,15,39,39,60,60,70,70,70,106,106,106,152,152,152,200,200,237,297,408,480,480,530,530,591,591,591,665,733,733,779,779,830,830,830,860,889,889
1,Beijing,Mainland China,40.18238,116.4142,10,14,22,26,36,36,41,51,68,68,72,80,80,91,91,91,111,111,111,114,139,168,191,212,212,228,253,274,274,274,297,315,315,326,326,337,337,337,342,342,352
2,Chongqing,Mainland China,30.05718,107.874,5,6,9,27,27,57,57,75,75,110,110,110,132,132,132,147,147,147,165,182,211,247,300,337,337,366,376,389,400,400,415,426,428,446,450,468,473,486,489,505,509
3,Fujian,Mainland China,26.07783,117.9895,0,1,5,5,10,10,18,18,35,35,56,59,59,80,80,82,84,84,101,101,120,144,159,179,179,194,205,215,215,215,224,239,239,250,250,261,261,261,267,272,272
4,Gansu,Mainland China,36.0611,103.8343,0,0,2,2,2,4,4,7,7,14,14,14,19,19,19,24,24,24,26,26,29,35,51,55,55,57,57,62,62,62,67,71,79,79,79,83,83,86,86,86,86
5,Guangdong,Mainland China,23.33841,113.422,17,26,32,53,53,78,78,98,111,146,151,151,151,207,207,241,277,277,311,354,436,535,683,725,797,870,895,944,970,970,1034,1075,1095,1120,1131,1151,1159,1159,1177,1219,1219
6,Guangxi,Mainland China,23.82908,108.7881,0,2,5,13,23,23,23,33,36,46,46,46,51,51,51,58,58,58,78,78,87,100,127,139,139,150,150,168,168,168,172,183,183,195,195,210,210,210,215,222,222
7,Guizhou,Mainland China,26.81536,106.8748,0,1,3,3,3,4,4,5,5,7,7,7,9,9,9,9,9,9,12,12,29,29,46,56,56,64,64,69,71,71,81,89,89,96,99,109,109,109,127,131,133
8,Hainan,Mainland China,19.19673,109.7455,0,4,5,8,8,17,19,19,22,22,33,33,33,40,40,43,43,43,43,46,52,62,71,79,79,91,91,100,106,106,117,124,124,128,131,136,138,138,144,151,157
9,Hebei,Mainland China,38.0428,114.5149,0,1,1,2,2,8,8,13,13,18,18,18,33,33,33,48,48,48,65,65,82,96,113,126,126,135,135,157,157,157,172,195,195,206,206,218,218,218,239,251,251


### Merge the two dataframes

In [12]:
# Inner join on the shared location keys: each case row gains Lat/Long and the
# time-series columns; rows without a match on both keys are dropped.
df = cases.merge(
    time_series,
    how='inner',
    on=['Province/State', 'Country/Region'],
)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 0 to 72
Data columns (total 49 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  73 non-null     object        
 1   Country/Region  73 non-null     object        
 2   Last Update     73 non-null     datetime64[ns]
 3   Confirmed       73 non-null     int64         
 4   Deaths          73 non-null     int64         
 5   Recovered       73 non-null     int64         
 6   Lat             73 non-null     float64       
 7   Long            73 non-null     float64       
 8   1/21/20 22:00   73 non-null     int64         
 9   1/22/20 12:00   73 non-null     int64         
 10  1/23/20 12:00   73 non-null     int64         
 11  1/24/20 0:00    73 non-null     int64         
 12  1/24/20 12:00   73 non-null     int64         
 13  1/25/20 0:00    73 non-null     int64         
 14  1/25/20 12:00   73 non-null     int64         
 15  1/25/20 

### Split City, State string into separate columns

In [14]:
# Lookup table from two-letter abbreviation to full name. get_state() consults
# it to expand the "XX" suffix of "City, XX" values in the Province/State column.
states = {
        # US: states, District of Columbia and territories
        # (also carries the 'NA' -> 'National' entry)
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming',
        # Canada: provinces and territories
        'AB': 'Alberta',
        'BC': 'British Columbia',
        'MB': 'Manitoba',
        'NB': 'New Brunswick',
        'NL': 'Newfoundland and Labrador',
        'NT': 'Northwest Territories',
        'NS': 'Nova Scotia',
        'NU': 'Nunavut',
        'ON': 'Ontario',
        'PE': 'Prince Edward Island',
        'QC': 'Quebec',
        'SK': 'Saskatchewan',
        'YT': 'Yukon'
}

In [15]:
def get_country(country):
    """Map a source country label to the name used in the exported datasets.

    'US', 'UK' and 'Mainland China' are rewritten to 'USA', 'United Kingdom'
    and 'China'; every other value is returned unchanged.
    """
    aliases = (
        ('US', 'USA'),
        ('UK', 'United Kingdom'),
        ('Mainland China', 'China'),
    )
    for source_label, export_name in aliases:
        if country == source_label:
            return export_name
    return country

In [16]:
def get_state(state):
    """Return the full state/province name for a Province/State value.

    For a "City, XX" value the text between the first and second comma is
    stripped and looked up in `states`; a value without a comma is looked up
    as-is. When the lookup misses, the original value is returned unchanged.
    """
    text = str(state)
    if "," in text:
        key = text.split(",")[1].strip()
    else:
        key = state
    return states.get(key, state)

In [17]:
def get_city(state):
    """Return the city part of a "City, XX" value, or "" if there is no comma.

    The city is the text before the first comma, with surrounding whitespace
    removed.
    """
    head, comma, _ = str(state).partition(",")
    return head.strip() if comma else ""

In [18]:
# Derive the normalized location columns from the raw source fields.
province_state = df['Province/State']
df['City'] = province_state.apply(get_city)
df['State'] = province_state.apply(get_state)
df['Country'] = df['Country/Region'].apply(get_country)
# Constant label identifying the outbreak in the exported files.
df['Outbreak'] = 'COVID-19'

### Create City Dataset

In [19]:
# Keep only the rows for which a city was parsed, narrowed to the exported columns.
has_city = df['City'].str.len() > 0
df_city = df.loc[
    has_city,
    ['City', 'State', 'Country', 'Long', 'Lat', 'Outbreak', 'Confirmed', 'Deaths', 'Recovered', 'Last Update'],
]
df_city.to_csv("../data/city_COVID-19.csv")
df_city

Unnamed: 0,City,State,Country,Long,Lat,Outbreak,Confirmed,Deaths,Recovered,Last Update
52,Toronto,Ontario,Canada,-79.3832,43.6532,COVID-19,2,0,0,2020-02-04 00:13:06
55,Chicago,Illinois,USA,-89.3985,40.6331,COVID-19,2,0,2,2020-02-09 19:03:03
56,San Benito,California,USA,-120.9876,36.5761,COVID-19,2,0,0,2020-02-03 03:53:02
57,Santa Clara,California,USA,-121.9552,37.3541,COVID-19,2,0,0,2020-02-03 00:43:02
60,London,Ontario,Canada,-81.2453,42.9849,COVID-19,1,0,0,2020-02-04 00:03:11
66,Boston,Massachusetts,USA,-71.0589,42.3601,COVID-19,1,0,0,2020-02-01 19:43:03
67,Los Angeles,California,USA,-118.2437,34.0522,COVID-19,1,0,0,2020-02-01 19:53:03
68,Madison,Wisconsin,USA,-89.4012,43.0731,COVID-19,1,0,0,2020-02-05 21:53:02
69,Orange,California,USA,-117.8531,33.7879,COVID-19,1,0,0,2020-02-01 19:53:03
70,San Diego County,California,USA,-117.1611,32.7157,COVID-19,1,0,0,2020-02-11 01:23:05


### Aggregate data by State
Latitude and longitude are averaged over all locations in a state.

In [20]:
# Per-column reduction applied to each (State, Country) group: coordinates are
# averaged, counts are summed, and the most recent update time is kept.
state_aggregations = {
    'Lat': "mean",
    'Long': 'mean',
    'Outbreak': 'first',
    'Confirmed': "sum",
    "Deaths": 'sum',
    'Recovered': 'sum',
    'Last Update': 'max',
}
df_state = (
    df[df['State'].str.len() > 0]
    .groupby(['State', 'Country'], as_index=False)
    .agg(state_aggregations)
)
df_state.to_csv("../data/state_COVID-19.csv")
df_state

Unnamed: 0,State,Country,Lat,Long,Outbreak,Confirmed,Deaths,Recovered,Last Update
0,Anhui,China,31.82571,117.2264,COVID-19,889,4,127,2020-02-12 10:13:20
1,Arizona,USA,34.0489,-111.094,COVID-19,1,0,0,2020-02-01 19:43:03
2,Beijing,China,40.18238,116.4142,COVID-19,352,3,56,2020-02-12 02:43:03
3,British Columbia,Canada,49.2827,-123.121,COVID-19,4,0,0,2020-02-07 05:43:03
4,California,USA,34.8972,-119.24014,COVID-19,7,0,0,2020-02-11 01:23:05
5,Chongqing,China,30.05718,107.874,COVID-19,509,3,87,2020-02-12 08:53:03
6,Diamond Princess cruise ship,Others,35.4437,129.638,COVID-19,175,0,0,2020-02-12 04:23:38
7,Fujian,China,26.07783,117.9895,COVID-19,272,0,53,2020-02-12 11:53:02
8,Gansu,China,36.0611,103.8343,COVID-19,86,2,29,2020-02-12 15:03:05
9,Guangdong,China,23.33841,113.422,COVID-19,1219,1,275,2020-02-12 12:23:09


### Aggregate data by Country
Latitude and longitude are averaged over all locations in a country.

In [21]:
# Per-column reduction applied to each Country group: coordinates are averaged,
# counts are summed, and the most recent update time is kept.
country_aggregations = {
    'Lat': 'mean',
    'Long': 'mean',
    'Outbreak': 'first',
    'Confirmed': 'sum',
    'Deaths': 'sum',
    'Recovered': 'sum',
    'Last Update': 'max',
}
df_country = (
    df
    .groupby(['Country'], as_index=False)
    .agg(country_aggregations)
)
df_country.to_csv("../data/country_COVID-19.csv")
df_country

Unnamed: 0,Country,Lat,Long,Outbreak,Confirmed,Deaths,Recovered,Last Update
0,Australia,-33.520175,146.94955,COVID-19,15,0,2,2020-02-09 19:33:02
1,Belgium,50.5039,4.4699,COVID-19,1,0,0,2020-02-04 15:43:02
2,Cambodia,12.5657,104.991,COVID-19,1,0,1,2020-02-12 07:43:02
3,Canada,45.306933,-94.583167,COVID-19,7,0,0,2020-02-07 05:43:03
4,China,33.354209,111.565415,COVID-19,44687,1115,5062,2020-02-12 15:03:05
5,Finland,61.9241,25.7482,COVID-19,1,0,1,2020-02-12 00:03:12
6,France,46.2276,2.2137,COVID-19,11,0,0,2020-02-08 09:53:01
7,Germany,51.1657,10.4515,COVID-19,16,0,0,2020-02-11 19:33:03
8,Hong Kong,22.3193,114.1694,COVID-19,50,1,1,2020-02-12 09:53:02
9,India,20.5937,78.9629,COVID-19,3,0,0,2020-02-03 21:43:02


### Add Strain data from Nextstrain.org
Data are provided by [Nextstrain.org](https://nextstrain.org), a resource for real-time tracking of pathogen evolution.

Check this [git repository](https://github.com/nextstrain/ncov) for the latest available dataset.

In [22]:
# The Nextstrain metadata file is tab-separated (.tsv).
strains = pd.read_csv(
    "https://github.com/nextstrain/ncov/raw/master/data/metadata.tsv",
    sep='\t',
)

In [23]:
strains

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,location,segment,host,originating_lab,submitting_lab,authors,url,title
0,Australia/NSW01/2020,ncov,EPI_ISL_407893,?,2020-01-24,Oceania,Australia,New South Wales,Sydney,genome,human,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
1,Australia/QLD01/2020,ncov,EPI_ISL_407894,?,2020-01-28,Oceania,Australia,Queensland,Gold Coast,genome,human,"Pathology Queensland, Gold Coast, Australia","Public Health Virology Laboratory, Brisbane, A...",Huang et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
2,Australia/QLD02/2020,ncov,EPI_ISL_407896,?,2020-01-30,Oceania,Australia,Queensland,Gold Coast,genome,human,"Pathology Queensland, Gold Coast, Australia","Public Health Virology Laboratory, Brisbane, A...",Huang et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
3,Australia/VIC01/2020,ncov,EPI_ISL_406844,?,2020-01-25,Oceania,Australia,Victoria,Clayton,genome,human,"Monash Medical Centre, Melbourne, Australia",Collaboration between the University of Melbou...,Caly et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
4,Beijing/IVDC-BJ-005/2020,ncov,EPI_ISL_408485,?,2020-01-18,China,China,Beijing,Beijing,genome,human,National Institute for Viral Disease Control a...,National Institute for Viral Disease Control a...,Tan et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
5,Belgium/GHB-03021/2020,ncov,EPI_ISL_407976,?,2020-02-03,Europe,Belgium,Flanders,Leuven,genome,human,"KU Leuven, Clinical and Epidemiological Virolo...","KU Leuven, Clinical and Epidemiological Virolo...",Vanmechelen et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
6,Chongqing/IVDC-CQ-001/2020,ncov,EPI_ISL_408481,?,2020-01-18,China,China,Chongqing,Chongqing,genome,human,National Institute for Viral Disease Control a...,National Institute for Viral Disease Control a...,Tan et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
7,Chongqing/YC01/2020,ncov,EPI_ISL_408478,?,2020-01-21,China,China,Chongqing,Yongchuan,genome,human,Yongchuan District Center for Disease Control ...,Chongqing Municipal Center for Disease Control...,Sheng et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
8,Chongqing/ZX01/2020,ncov,EPI_ISL_408479,?,2020-01-23,China,China,Chongqing,Zhongxian,genome,human,Zhongxian Center for Disease Control and Preve...,Chongqing Municipal Center for Disease Control...,Sheng et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"
9,England/01/2020,ncov,EPI_ISL_407071,?,2020-01-29,Europe,United Kingdom,England,England,genome,human,"Respiratory Virus Unit, Microbiology Services ...","Respiratory Virus Unit, Microbiology Services ...",Galiano et al,https://www.gisaid.org,"Newly discovered betacoronavirus, 2019-2020"


In [24]:
# Match strains to city rows on (location, division, country), then keep the
# identifying columns for export.
strains_city_columns = ['City','State','Country','strain','genbank_accession','division','location']
strains_city = df_city.merge(
    strains,
    left_on=['City','State','Country'],
    right_on=['location','division', 'country'],
)[strains_city_columns]
strains_city.to_csv("../data/strains_city_COVID-19.csv")
strains_city

Unnamed: 0,City,State,Country,strain,genbank_accession,division,location
0,Chicago,Illinois,USA,USA/IL1/2020,MN988713,Illinois,Chicago
1,Boston,Massachusetts,USA,USA-MA1/2020,?,Massachusetts,Boston
2,Los Angeles,California,USA,USA/CA1/2020,MN994467,California,Los Angeles
3,Seattle,Washington,USA,USA-WA1/2020,MN985325,Washington,Seattle


In [25]:
# Match strains to state rows on (division, country), then keep the
# identifying columns for export.
strains_state_columns = ['State','Country','strain','genbank_accession','division','location']
strains_state = df_state.merge(
    strains,
    left_on=['State','Country'],
    right_on=['division', 'country'],
)[strains_state_columns]
strains_state.to_csv("../data/strains_state_COVID-19.csv")
strains_state

Unnamed: 0,State,Country,strain,genbank_accession,division,location
0,Arizona,USA,USA/AZ1/2020,MN997409,Arizona,Phoenix
1,Beijing,China,Beijing/IVDC-BJ-005/2020,?,Beijing,Beijing
2,California,USA,USA/CA1/2020,MN994467,California,Los Angeles
3,California,USA,USA/CA2/2020,MN994468,California,Orange County
4,California,USA,USA/CA3/2020,?,California,California
5,California,USA,USA/CA4/2020,?,California,California
6,California,USA,USA/CA5/2020,?,California,California
7,California,USA,USA/CA6/2020,?,California,California
8,Chongqing,China,Chongqing/IVDC-CQ-001/2020,?,Chongqing,Chongqing
9,Chongqing,China,Chongqing/YC01/2020,?,Chongqing,Yongchuan


In [26]:
# Match strains to country rows on the country name, then keep the
# identifying columns for export.
strains_country_columns = ['Country','strain','genbank_accession','division','location']
strains_country = df_country.merge(
    strains,
    left_on='Country',
    right_on='country',
)[strains_country_columns]
strains_country.to_csv("../data/strains_country_COVID-19.csv")
strains_country

Unnamed: 0,Country,strain,genbank_accession,division,location
0,Australia,Australia/NSW01/2020,?,New South Wales,Sydney
1,Australia,Australia/QLD01/2020,?,Queensland,Gold Coast
2,Australia,Australia/QLD02/2020,?,Queensland,Gold Coast
3,Australia,Australia/VIC01/2020,?,Victoria,Clayton
4,Australia,Sydney/2/2020,?,New South Wales,Sydney
5,Australia,Sydney/3/2020,?,New South Wales,Sydney
6,Belgium,Belgium/GHB-03021/2020,?,Flanders,Leuven
7,China,Beijing/IVDC-BJ-005/2020,?,Beijing,Beijing
8,China,Chongqing/IVDC-CQ-001/2020,?,Chongqing,Chongqing
9,China,Chongqing/YC01/2020,?,Chongqing,Yongchuan
