In [2]:
import io

# all of these are not included in "base Python" - they can be installed using the conda package manager though
import pandas as pd
import numpy
import requests
import altair as alt

In [4]:
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
url = 'https://j.mp/covid19casescsvnew'
content_string = requests.get(url).text
raw_data = pd.read_csv(io.StringIO(content_string))
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 76 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  80 non-null     object 
 1   Country/Region  258 non-null    object 
 2   Lat             258 non-null    float64
 3   Long            258 non-null    float64
 4   1/22/20         258 non-null    int64  
 5   1/23/20         258 non-null    int64  
 6   1/24/20         258 non-null    int64  
 7   1/25/20         258 non-null    int64  
 8   1/26/20         258 non-null    int64  
 9   1/27/20         258 non-null    int64  
 10  1/28/20         258 non-null    int64  
 11  1/29/20         258 non-null    int64  
 12  1/30/20         258 non-null    int64  
 13  1/31/20         258 non-null    int64  
 14  2/1/20          258 non-null    int64  
 15  2/2/20          258 non-null    int64  
 16  2/3/20          258 non-null    int64  
 17  2/4/20          258 non-null    int

In [52]:
# first illustration of how to flip the data to long form and clean up column names
value_columns = raw_data.columns[4:]
covid19 = raw_data.drop(
    ['Province/State', 'Lat', 'Long'], axis=1).melt(
        id_vars=['Country/Region'], value_vars=value_columns, var_name='textdate', value_name='Cases')
covid19['Date'] = covid19['textdate'].apply(pd.to_datetime)
covid19 = covid19.sort_values(by=['Date']).drop(['textdate'], axis=1)
covid19.columns = ['Country', 'Cases', 'Date']

In [53]:
sorted(covid19.Country.unique())

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Holy See',


In [54]:
alt.Chart(covid19[covid19.Country.isin(['Spain',
                                        'Korea, South'])]
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [55]:
alt.Chart(covid19[covid19.Country.isin(['Italy'])]
         ).mark_line().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [56]:
alt.Chart(covid19[covid19.Country.isin(['Congo (Kinshasa)'])]
         ).mark_line().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')


In [57]:
alt.Chart(covid19[covid19.Country.isin(['United Arab Emirates'])]
         ).mark_line().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')


In [58]:
alt.Chart(covid19[covid19.Country == 'China']
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [59]:
alt.Chart(covid19[covid19.Country == 'South Africa']
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [60]:
# US data is in multiple states, so to see it all combined need to use groupby and aggregate using sum
value_columns = raw_data.columns[4:]
covid19 = raw_data.drop(
    ['Lat', 'Long'], axis=1).melt(
        id_vars=['Country/Region', 'Province/State'], value_vars=value_columns, var_name='textdate', value_name='Cases')
covid19['Date'] = covid19['textdate'].apply(pd.to_datetime)
covid19 = covid19.sort_values(by=['Date']).drop(['textdate'], axis=1)
covid19.columns = ['Country', 'Province/State', 'Cases', 'Date']
grouped = covid19.groupby(['Date', 'Country'], as_index=False).sum()

In [61]:
alt.Chart(grouped[grouped.Country.isin(['US'])]).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [62]:
alt.Chart(grouped[grouped.Country.isin(['US', 'Italy'])]).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [63]:
alt.Chart(covid19[covid19.Country == 'Switzerland']
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

### Working with time since first case
As [this page](https://uofabioinformaticshub.github.io/COVID19/COVID19_Progression.html) shows, it is useful to examine each country's outbreak from the perspective of the date of the first known case. The following code takes this approach and also scales cases by population size.

In [64]:
# aggregate each country's cases
covid19 = covid19.groupby(['Date', 'Country'], as_index=False).sum()

# find the date of the first case in each country
first_dates = {}
for row in covid19[covid19.Cases > 0].groupby('Country').head(1).loc[:,['Country', 'Date', 'Cases']].iterrows():
    first_dates[row[1].Country] = row[1].Date

adjustdate = lambda row: (row.Date - first_dates[row.Country]).days if row.Country in first_dates else numpy.nan
covid19_with_delta = covid19.assign(DaysSinceFirstCase = lambda df: df.apply(adjustdate, axis=1))

In [65]:
covid19_with_delta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12282 entries, 0 to 12281
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                12282 non-null  datetime64[ns]
 1   Country             12282 non-null  object        
 2   Cases               12282 non-null  int64         
 3   DaysSinceFirstCase  12282 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 479.8+ KB


In [66]:
# discard all data from before cases showed up in a country
positive_cases = covid19_with_delta[covid19_with_delta.DaysSinceFirstCase >= 0]
# alt.Chart(positive_cases[positive_cases.Country.isin(['US', 'UK'])]).mark_point().encode(x='DateSinceFirstCase', y='Cases', color='Country')
alt.Chart(positive_cases[positive_cases.Country.isin(['US', 
                                                      'United Kingdom', 
#                                                       'Iran', 
                                                      'Italy', 
                                                      'Germany', 'France'])]).mark_point().encode(x='DaysSinceFirstCase', y='Cases', color='Country')

In [67]:
# plot *all* countries - gets messy
alt.Chart(positive_cases).mark_point().encode(x='DaysSinceFirstCase', y='Cases', color='Country')

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.Chart(...)

In [68]:
# pick up some data on world population by country
population_file = io.StringIO(requests.get('https://datahub.io/JohnSnowLabs/population-figures-by-country/r/population-figures-by-country-csv.csv').text)

population = pd.read_csv(population_file, index_col=0)

In [69]:
# drop all but the latest population, convert 2016 population to millions
population = population.drop(population.columns[1:-1], axis=1).assign(Population = lambda _: population.Year_2016 / 1000000).drop(['Year_2016'], axis=1)
population

Unnamed: 0_level_0,Country_Code,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Aruba,ABW,0.104822
Afghanistan,AFG,34.656032
Angola,AGO,28.813463
Albania,ALB,2.876101
Andorra,AND,0.077281
...,...,...
Kosovo,XKX,1.816200
"Yemen, Rep.",YEM,27.584213
South Africa,ZAF,55.908865
Zambia,ZMB,16.591390


In [70]:
# map the country names in the positive_cases dataframe to country codes from the population dataframe
population_country_names = set(population.index)
unknown_country_to_code = {
    'Korea, South': 'KOR',
    'Taiwan*': 'TWN',
    'US': 'USA',
    'Russia': 'RUS',
    'Cruise Ship': 'CRU',
    'Egypt': 'EGY',
    'Iran': 'IRN',
    'North Macedonia': 'MKD',
    'Czechia': 'CZE',
    'Holy See': 'VAT',
    'Slovakia': 'SVK',
    'Martinique': 'MTQ',
    'Brunei': 'BRN',
    'Congo (Kinshasa)': 'COD',
    'Eswatini': 'SWZ',
    'Saint Lucia': 'LCA',
    'Saint Vincent and the Grenadines': 'VCT',
    'Congo (Brazzaville)': 'COG',
    'The Bahamas': 'BHS',
    'Kyrgyzstan': 'KGZ',
    'Venezuela': 'VEN',
    'Cape Verde': 'CPV',
    'East Timor': 'TLS',
    'Syria': 'SYR',
    'French Guiana': 'GUF',
    'Guadeloupe': 'GLP',
    'Mayotte': 'MYT',
    'Reunion': 'REU',
    'Diamond Princess': 'CDP',
    'Bahamas': 'BHS',
    'Gambia': 'GMB',
    'Laos': 'LAO',
    'Saint Kitts and Nevis': 'KNA',
    'Burma': 'MMR',
    'MS Zaandam': 'CMZ'
}

country_name_mapping = {}
for name in list(positive_cases.Country.unique()):
    if name in population_country_names:
        country_name_mapping[name] = population.loc['South Africa'].Country_Code
    elif name in unknown_country_to_code:
        country_name_mapping[name] = unknown_country_to_code[name]
    else:
        print('Unknown:', name)

In [71]:
# these country codes exist in the case data but don't exist in the population data
set(country_name_mapping.values()).difference(set(population.Country_Code.unique()))

{'CDP', 'CMZ', 'TWN', 'VAT'}

In [72]:
# scale the cases in a country by population size of that country
def cases_per_million_pop(row):
    if country_name_mapping[row.Country] in population.Country_Code.values:
        return float(row.Cases / population[population.Country_Code == country_name_mapping[row.Country]].Population)
    else:
        return numpy.nan

In [73]:
scaled_positive_cases = positive_cases.assign(CasesPerMillionPop=positive_cases.apply(cases_per_million_pop, axis=1))

In [74]:
# now graph some cases per million population info
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['US', 
                                                                    'United Kingdom', 
#                                                                   'Iran', 
                                                                    'Italy', 
                                                                    'Germany',
                                                                    'France',
                                                                    'South Africa'])]).mark_point().encode(
                                                                        x='DaysSinceFirstCase', y='CasesPerMillionPop', color='Country')

In [75]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['US', 
                                                                    'United Kingdom', 
#                                                                   'Iran', 
                                                                    'Italy', 
                                                                    'Germany',
                                                                    'France',
                                                                    'Korea, South',
                                                                    'South Africa',
                                                                    'Japan'])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [76]:
# another example with linear Y axis but alos tooltips
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['US', 
                                                                    'United Kingdom', 
#                                                                   'Iran', 
                                                                    'Italy', 
                                                                    'Germany',
                                                                    'France',
                                                                    'Korea, South',
                                                                    'South Africa',
                                                                    'Japan'])]).mark_point().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop'), color='Country',
                                                                        tooltip=['Country'])

In [77]:
tweet = "Sat.1: #Africa #COVID19:
Flag of Algeria
82,
Flag of Benin
2,
Flag of Burkina Faso
40,
Flag of Cameroon
27,
Flag of Cape Verde
1,
Flag of Central African Republic
3,
Flag of Chad
 1,
Flag of Congo - Brazzaville
1,
Flag of Côte d’Ivoire
14,
Flag of Congo - Kinshasa
18,
Flag of Djibouti
1,
Flag of Egypt
285,
Flag of Equatorial Guinea
4,
Flag of Swaziland
1,
Flag of Ethiopia
9,
Flag of Gabon
4,
Flag of Gambia
1,
Flag of Ghana
16,
Flag of Guinea
2,
Flag of Kenya
7,
Flag of Liberia
3,
Flag of Madagascar
3,
Flag of Mauritius
12,
Flag of Mauritania
2,
Flag of Morocco
86,
Flag of Mayotte
1,
Flag of Namibia
3,
Flag of Niger
1,
Flag of Nigeria
12, Reunion-9,
Flag of Rwanda
17,
Flag of Senegal
47,
Flag of Seychelles
6,
Flag of Somalia
1,
Flag of South Africa
202,
Flag of Sudan
2,
Flag of Tanzania
6,
Flag of Togo
9,
Flag of Tunisia
69,
Flag of Zambia
2,
Flag of Zimbabwe
1. 39 countries, 2 territories, 1021 cases, 23 deaths, 103 recovered."

SyntaxError: EOL while scanning string literal (<ipython-input-77-a5dac7f8349e>, line 1)

In [78]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'United Kingdom',
                                                                    'Italy'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [79]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'United Kingdom',
                                                                    'Italy'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop'), color='Country')

In [80]:
positive_cases[positive_cases.Country == 'South Africa']

Unnamed: 0,Date,Country,Cases,DaysSinceFirstCase
7804,2020-03-05,South Africa,1,0
7982,2020-03-06,South Africa,1,1
8160,2020-03-07,South Africa,1,2
8338,2020-03-08,South Africa,3,3
8516,2020-03-09,South Africa,3,4
8694,2020-03-10,South Africa,7,5
8872,2020-03-11,South Africa,13,6
9050,2020-03-12,South Africa,17,7
9228,2020-03-13,South Africa,24,8
9406,2020-03-14,South Africa,38,9


In [85]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'United Kingdom',
                                                                    'Mauritius',
                                                                    'Italy',
                                                                    'Czechia'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [82]:
positive_cases[positive_cases.Country == 'Mauritius']

Unnamed: 0,Date,Country,Cases,DaysSinceFirstCase
10076,2020-03-18,Mauritius,3,0
10254,2020-03-19,Mauritius,3,1
10432,2020-03-20,Mauritius,12,2
10610,2020-03-21,Mauritius,14,3
10788,2020-03-22,Mauritius,28,4
10966,2020-03-23,Mauritius,36,5
11144,2020-03-24,Mauritius,42,6
11322,2020-03-25,Mauritius,48,7
11500,2020-03-26,Mauritius,81,8
11678,2020-03-27,Mauritius,94,9


In [83]:
positive_cases[positive_cases.Country == 'Italy'].Cases.diff()

1685        NaN
1863        0.0
2041        0.0
2219        0.0
2397        0.0
2575        0.0
2753        0.0
2931        1.0
3109        0.0
3287        0.0
3465        0.0
3643        0.0
3821        0.0
3999        0.0
4177        0.0
4355        0.0
4533        0.0
4711        0.0
4889        0.0
5067        0.0
5245        0.0
5423       17.0
5601       42.0
5779       93.0
5957       74.0
6135       93.0
6313      131.0
6491      202.0
6669      233.0
6847      240.0
7025      566.0
7203      342.0
7381      466.0
7559      587.0
7737      769.0
7915      778.0
8093     1247.0
8271     1492.0
8449     1797.0
8627      977.0
8805     2313.0
8983        0.0
9161     5198.0
9339     3497.0
9517     3590.0
9695     3233.0
9873     3526.0
10051    4207.0
10229    5322.0
10407    5986.0
10585    6557.0
10763    5560.0
10941    4789.0
11119    5249.0
11297    5210.0
11475    6203.0
11653    5909.0
11831    5974.0
12009    5217.0
12187    4050.0
Name: Cases, dtype: float64

In [84]:
positive_cases[positive_cases.Country == 'South Africa'].Cases.diff()

7804       NaN
7982       0.0
8160       0.0
8338       2.0
8516       0.0
8694       4.0
8872       6.0
9050       4.0
9228       7.0
9406      14.0
9584      13.0
9762      11.0
9940       0.0
10118     54.0
10296     34.0
10474     52.0
10652     38.0
10830     34.0
11008    128.0
11186    152.0
11364    155.0
11542    218.0
11720    243.0
11898     17.0
12076     93.0
12254     46.0
Name: Cases, dtype: float64

In [87]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'Czechia'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop'), color='Country')

In [88]:
positive_cases[positive_cases.Country == 'Czechia'].Cases.diff()

6986       NaN
7164       0.0
7342       2.0
7520       3.0
7698       4.0
7876       6.0
8054       1.0
8232      12.0
8410       0.0
8588      10.0
8766      50.0
8944       3.0
9122      47.0
9300      48.0
9478      64.0
9656      45.0
9834      98.0
10012     68.0
10190    230.0
10368    139.0
10546    162.0
10724    125.0
10902    116.0
11080    158.0
11258    260.0
11436    271.0
11614    354.0
11792    352.0
11970    186.0
12148    184.0
Name: Cases, dtype: float64

In [90]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'Czechia'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [97]:
testing_data = pd.read_csv('https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv')
testing_data

Unnamed: 0,date,YYYYMMDD,cumulative_tests,recovered,hospitalisation,critical_icu,ventilation,deaths,contacts_identified,contacts_traced,scanned_travellers,passengers_elevated_temperature,covid_suspected_criteria
0,11-02-2020,20200211,61.0,0,,0.0,0.0,0,,,,,
1,13-02-2020,20200213,67.0,0,,0.0,0.0,0,,,,,
2,14-02-2020,20200214,71.0,0,,0.0,0.0,0,,,,,
3,19-02-2020,20200219,95.0,0,,0.0,0.0,0,,,,,
4,20-02-2020,20200220,106.0,0,,0.0,0.0,0,,,,,
5,24-02-2020,20200224,116.0,0,,0.0,0.0,0,,,,,
6,26-02-2020,20200226,121.0,0,,0.0,0.0,0,,,,,
7,02-03-2020,20200302,160.0,0,,0.0,0.0,0,,,13731.0,0.0,0.0
8,03-03-2020,20200303,164.0,0,,0.0,0.0,0,,,11025.0,0.0,0.0
9,06-03-2020,20200306,200.0,0,,0.0,0.0,0,,,,,


In [105]:
alt.Chart(testing_data).mark_point().encode(x=alt.X('YYYYMMDD', 
                                                    scale=alt.Scale(domain=(int(testing_data.YYYYMMDD.min()), 
                                                                            int(testing_data.YYYYMMDD.max())))), 
                                            y=alt.Y('cumulative_tests'))

In [106]:
testing_data.cumulative_tests.diff()

0        NaN
1        6.0
2        4.0
3       24.0
4       11.0
5       10.0
6        5.0
7       39.0
8        4.0
9       36.0
10      41.0
11     404.0
12     203.0
13      76.0
14      93.0
15     459.0
16     929.0
17     506.0
18     159.0
19    1762.0
20    1606.0
21     987.0
22    1890.0
23    3500.0
24    2714.0
25       NaN
26       NaN
27    8066.0
28    3426.0
29    3630.0
30    2816.0
31    2663.0
Name: cumulative_tests, dtype: float64