In [1]:
import io

# all of these are not included in "base Python" - they can be installed using the conda package manager though
import pandas as pd
import numpy
import requests
import altair as alt

In [2]:
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
url = 'https://j.mp/covid19casescsvnew'
content_string = requests.get(url).text
raw_data = pd.read_csv(io.StringIO(content_string))
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 96 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  82 non-null     object 
 1   Country/Region  264 non-null    object 
 2   Lat             264 non-null    float64
 3   Long            264 non-null    float64
 4   1/22/20         264 non-null    int64  
 5   1/23/20         264 non-null    int64  
 6   1/24/20         264 non-null    int64  
 7   1/25/20         264 non-null    int64  
 8   1/26/20         264 non-null    int64  
 9   1/27/20         264 non-null    int64  
 10  1/28/20         264 non-null    int64  
 11  1/29/20         264 non-null    int64  
 12  1/30/20         264 non-null    int64  
 13  1/31/20         264 non-null    int64  
 14  2/1/20          264 non-null    int64  
 15  2/2/20          264 non-null    int64  
 16  2/3/20          264 non-null    int64  
 17  2/4/20          264 non-null    int

In [3]:
# first illustration of how to flip the data to long form and clean up column names
value_columns = raw_data.columns[4:]
covid19 = raw_data.drop(
    ['Province/State', 'Lat', 'Long'], axis=1).melt(
        id_vars=['Country/Region'], value_vars=value_columns, var_name='textdate', value_name='Cases')
covid19['Date'] = covid19['textdate'].apply(pd.to_datetime)
covid19 = covid19.sort_values(by=['Date']).drop(['textdate'], axis=1)
covid19.columns = ['Country', 'Cases', 'Date']

In [4]:
sorted(covid19.Country.unique())

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 

In [5]:
alt.Chart(covid19[covid19.Country.isin(['Spain',
                                        'Korea, South'])]
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [6]:
alt.Chart(covid19[covid19.Country.isin(['Italy'])]
         ).mark_line().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [7]:
alt.Chart(covid19[covid19.Country.isin(['Congo (Kinshasa)'])]
         ).mark_line().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')


In [8]:
alt.Chart(covid19[covid19.Country.isin(['United Arab Emirates'])]
         ).mark_line().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')


In [9]:
alt.Chart(covid19[covid19.Country == 'China']
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [10]:
alt.Chart(covid19[covid19.Country == 'South Africa']
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [11]:
# US data is in multiple states, so to see it all combined need to use groupby and aggregate using sum
value_columns = raw_data.columns[4:]
covid19 = raw_data.drop(
    ['Lat', 'Long'], axis=1).melt(
        id_vars=['Country/Region', 'Province/State'], value_vars=value_columns, var_name='textdate', value_name='Cases')
covid19['Date'] = covid19['textdate'].apply(pd.to_datetime)
covid19 = covid19.sort_values(by=['Date']).drop(['textdate'], axis=1)
covid19.columns = ['Country', 'Province/State', 'Cases', 'Date']
grouped = covid19.groupby(['Date', 'Country'], as_index=False).sum()

In [12]:
alt.Chart(grouped[grouped.Country.isin(['US'])]).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [13]:
alt.Chart(grouped[grouped.Country.isin(['US', 'Italy'])]).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

In [14]:
alt.Chart(covid19[covid19.Country == 'Switzerland']
         ).mark_point().encode(x=alt.X('monthdate(Date)', 
                                                  axis=alt.Axis(labelAngle=90)), 
                                         y='Cases', color='Country')

### Working with time since first case
As [this page](https://uofabioinformaticshub.github.io/COVID19/COVID19_Progression.html) shows, it is useful to examine each country's outbreak from the perspective of the date of the first known case. The following code takes this approach and also scales cases by population size.

In [15]:
# aggregate each country's cases
covid19 = covid19.groupby(['Date', 'Country'], as_index=False).sum()

# find the date of the first case in each country
first_dates = {}
for row in covid19[covid19.Cases > 0].groupby('Country').head(1).loc[:,['Country', 'Date', 'Cases']].iterrows():
    first_dates[row[1].Country] = row[1].Date

adjustdate = lambda row: (row.Date - first_dates[row.Country]).days if row.Country in first_dates else numpy.nan
covid19_with_delta = covid19.assign(DaysSinceFirstCase = lambda df: df.apply(adjustdate, axis=1))

In [16]:
covid19_with_delta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17020 entries, 0 to 17019
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                17020 non-null  datetime64[ns]
 1   Country             17020 non-null  object        
 2   Cases               17020 non-null  int64         
 3   DaysSinceFirstCase  17020 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 664.8+ KB


In [17]:
# discard all data from before cases showed up in a country
positive_cases = covid19_with_delta[covid19_with_delta.DaysSinceFirstCase >= 0]
# alt.Chart(positive_cases[positive_cases.Country.isin(['US', 'UK'])]).mark_point().encode(x='DateSinceFirstCase', y='Cases', color='Country')
alt.Chart(positive_cases[positive_cases.Country.isin(['US', 
                                                      'United Kingdom', 
#                                                       'Iran', 
                                                      'Italy', 
                                                      'Germany', 'France'])]).mark_point().encode(x='DaysSinceFirstCase', y='Cases', color='Country')

In [18]:
# plot *all* countries - gets messy
alt.Chart(positive_cases).mark_point().encode(x='DaysSinceFirstCase', y='Cases', color='Country')

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.Chart(...)

In [19]:
# pick up some data on world population by country
population_file = io.StringIO(requests.get('https://datahub.io/JohnSnowLabs/population-figures-by-country/r/population-figures-by-country-csv.csv').text)

population = pd.read_csv(population_file, index_col=0)

In [20]:
# drop all but the latest population, convert 2016 population to millions
population = population.drop(population.columns[1:-1], axis=1).assign(Population = lambda _: population.Year_2016 / 1000000).drop(['Year_2016'], axis=1)
population

Unnamed: 0_level_0,Country_Code,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Aruba,ABW,0.104822
Afghanistan,AFG,34.656032
Angola,AGO,28.813463
Albania,ALB,2.876101
Andorra,AND,0.077281
...,...,...
Kosovo,XKX,1.816200
"Yemen, Rep.",YEM,27.584213
South Africa,ZAF,55.908865
Zambia,ZMB,16.591390


In [21]:
# map the country names in the positive_cases dataframe to country codes from the population dataframe
population_country_names = set(population.index)
unknown_country_to_code = {
    'Korea, South': 'KOR',
    'Taiwan*': 'TWN',
    'US': 'USA',
    'Russia': 'RUS',
    'Cruise Ship': 'CRU',
    'Egypt': 'EGY',
    'Iran': 'IRN',
    'North Macedonia': 'MKD',
    'Czechia': 'CZE',
    'Holy See': 'VAT',
    'Slovakia': 'SVK',
    'Martinique': 'MTQ',
    'Brunei': 'BRN',
    'Congo (Kinshasa)': 'COD',
    'Eswatini': 'SWZ',
    'Saint Lucia': 'LCA',
    'Saint Vincent and the Grenadines': 'VCT',
    'Congo (Brazzaville)': 'COG',
    'The Bahamas': 'BHS',
    'Kyrgyzstan': 'KGZ',
    'Venezuela': 'VEN',
    'Cape Verde': 'CPV',
    'East Timor': 'TLS',
    'Syria': 'SYR',
    'French Guiana': 'GUF',
    'Guadeloupe': 'GLP',
    'Mayotte': 'MYT',
    'Reunion': 'REU',
    'Diamond Princess': 'CDP',
    'Bahamas': 'BHS',
    'Gambia': 'GMB',
    'Laos': 'LAO',
    'Saint Kitts and Nevis': 'KNA',
    'Burma': 'MMR',
    'MS Zaandam': 'CMZ',
    'Western Sahara': 'ESH'
}

country_name_mapping = {}
for name in list(positive_cases.Country.unique()):
    if name in population_country_names:
        country_name_mapping[name] = population.loc['South Africa'].Country_Code
    elif name in unknown_country_to_code:
        country_name_mapping[name] = unknown_country_to_code[name]
    else:
        print('Unknown:', name)

Unknown: Yemen


In [22]:
# these country codes exist in the case data but don't exist in the population data
set(country_name_mapping.values()).difference(set(population.Country_Code.unique()))

{'CDP', 'CMZ', 'ESH', 'TWN', 'VAT'}

In [23]:
# scale the cases in a country by population size of that country
def cases_per_million_pop(row):
    if country_name_mapping[row.Country] in population.Country_Code.values:
        return float(row.Cases / population[population.Country_Code == country_name_mapping[row.Country]].Population)
    else:
        return numpy.nan

In [24]:
scaled_positive_cases = positive_cases.assign(CasesPerMillionPop=positive_cases.apply(cases_per_million_pop, axis=1))

KeyError: 'Yemen'

In [None]:
# now graph some cases per million population info
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['US', 
                                                                    'United Kingdom', 
#                                                                   'Iran', 
                                                                    'Italy', 
                                                                    'Germany',
                                                                    'France',
                                                                    'South Africa'])]).mark_point().encode(
                                                                        x='DaysSinceFirstCase', y='CasesPerMillionPop', color='Country')

In [None]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['US', 
                                                                    'United Kingdom', 
#                                                                   'Iran', 
                                                                    'Italy', 
                                                                    'Germany',
                                                                    'France',
                                                                    'Korea, South',
                                                                    'South Africa',
                                                                    'Japan'])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [None]:
# another example with linear Y axis but alos tooltips
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['US', 
                                                                    'United Kingdom', 
#                                                                   'Iran', 
                                                                    'Italy', 
                                                                    'Germany',
                                                                    'France',
                                                                    'Korea, South',
                                                                    'South Africa',
                                                                    'Japan'])]).mark_point().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop'), color='Country',
                                                                        tooltip=['Country'])

In [None]:
tweet = "Sat.1: #Africa #COVID19:
Flag of Algeria
82,
Flag of Benin
2,
Flag of Burkina Faso
40,
Flag of Cameroon
27,
Flag of Cape Verde
1,
Flag of Central African Republic
3,
Flag of Chad
 1,
Flag of Congo - Brazzaville
1,
Flag of Côte d’Ivoire
14,
Flag of Congo - Kinshasa
18,
Flag of Djibouti
1,
Flag of Egypt
285,
Flag of Equatorial Guinea
4,
Flag of Swaziland
1,
Flag of Ethiopia
9,
Flag of Gabon
4,
Flag of Gambia
1,
Flag of Ghana
16,
Flag of Guinea
2,
Flag of Kenya
7,
Flag of Liberia
3,
Flag of Madagascar
3,
Flag of Mauritius
12,
Flag of Mauritania
2,
Flag of Morocco
86,
Flag of Mayotte
1,
Flag of Namibia
3,
Flag of Niger
1,
Flag of Nigeria
12, Reunion-9,
Flag of Rwanda
17,
Flag of Senegal
47,
Flag of Seychelles
6,
Flag of Somalia
1,
Flag of South Africa
202,
Flag of Sudan
2,
Flag of Tanzania
6,
Flag of Togo
9,
Flag of Tunisia
69,
Flag of Zambia
2,
Flag of Zimbabwe
1. 39 countries, 2 territories, 1021 cases, 23 deaths, 103 recovered."

In [None]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'United Kingdom',
                                                                    'Italy',
                                                                    'US'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [None]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'United Kingdom',
                                                                    'Italy',
                                                                    'US'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop'), color='Country')

In [None]:
positive_cases[positive_cases.Country == 'South Africa']

In [None]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'United Kingdom',
                                                                    'Mauritius',
                                                                    'Italy',
                                                                    'Czechia'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [None]:
positive_cases[positive_cases.Country == 'Mauritius']

In [None]:
positive_cases[positive_cases.Country == 'Italy'].Cases.diff()

In [None]:
positive_cases[positive_cases.Country == 'South Africa'].Cases.diff()

In [None]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'Czechia'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop'), color='Country')

In [None]:
positive_cases[positive_cases.Country == 'Czechia'].Cases.diff()

In [None]:
# and again, but this time with log scale Y axis - a diagonal line heading up and right here is exponential growth in cases
alt.Chart(scaled_positive_cases[scaled_positive_cases.Country.isin(['South Africa', 
                                                                    'Czechia'
                                                                    ])], width=600, height=600).mark_line().encode(
                                                                        x='DaysSinceFirstCase',
                                                                        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')), color='Country')

In [None]:
testing_data = pd.read_csv('https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv')
testing_data

In [None]:
alt.Chart(testing_data).mark_point().encode(x=alt.X('YYYYMMDD', 
                                                    scale=alt.Scale(domain=(int(testing_data.YYYYMMDD.min()), 
                                                                            int(testing_data.YYYYMMDD.max())))), 
                                            y=alt.Y('cumulative_tests'))

In [None]:
testing_data.cumulative_tests.diff()

In [None]:
covid19[covid19.Country == 'France'][covid19.Cases > 1000]

In [None]:
import altair as alt
from vega_datasets import data

cars = data.cars.url

brush = alt.selection_interval()  # selection of type "interval"

alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon:Q',
    y='Horsepower:Q',
    color=alt.condition(brush, 'Origin:N', alt.value('lightgray'))    
).add_selection(
    brush
)


In [None]:
selection = alt.selection_multi(fields=['Origin'])
color = alt.condition(selection,
                      alt.Color('Origin:N', legend=None),
                      alt.value('lightgray'))

scatter = alt.Chart(cars).mark_point().encode(
    x='Horsepower:Q',
    y='Miles_per_Gallon:Q',
    color=color,
    tooltip='Name:N'
)

legend = alt.Chart(cars).mark_point().encode(
    y=alt.Y('Origin:N', axis=alt.Axis(orient='right')),
    color=color
).add_selection(
    selection
)

scatter | legend

In [None]:
import altair as alt
import pandas as pd
import numpy as np

rand = np.random.RandomState(42)

df = pd.DataFrame({
    'xval': range(100),
    'yval': rand.randn(100).cumsum()
})

slider = alt.binding_range(min=0, max=100, step=1, name='cutoff:')
selector = alt.selection_single(name="SelectorName", fields=['cutoff'],
                                bind=slider, init={'cutoff': 50})

alt.Chart(df).mark_point().encode(
    x='xval',
    y='yval',
    color=alt.condition(
        alt.datum.xval < selector.cutoff,
        alt.value('red'), alt.value('blue')
    )
).add_selection(
    selector
)


In [25]:
east_african_countries = [
    'Djibouti',
    'Eritrea',
    'Ethiopia',
    'Somalia',
    'Sudan',
    'South Sudan',
    'Madagascar',
    'Mauritius',
    'Comoros',
    'Seychelles',
    'Uganda',
    'Rwanda',
    'Burundi',
    'Kenya',
    'Tanzania',
]

In [30]:
covid19[covid19.Country.isin(east_african_countries)][covid19.Cases > 0]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Date,Country,Cases
9493,2020-03-13,Ethiopia,1
9524,2020-03-13,Kenya,1
9593,2020-03-13,Sudan,1
9678,2020-03-14,Ethiopia,1
9709,2020-03-14,Kenya,1
...,...,...,...
16988,2020-04-22,Somalia,286
16990,2020-04-22,South Sudan,4
16993,2020-04-22,Sudan,140
16999,2020-04-22,Tanzania,284
