In [321]:
import io

# none of these are included in "base Python" - they can be installed using the conda package manager though
import pandas as pd
import numpy
import requests
import altair as alt

In [322]:
# Download the case-count CSV and parse it into a DataFrame.
url = "http://j.mp/covid19casescsv"
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
content_string = response.text
raw_data = pd.read_csv(io.StringIO(content_string))

In [330]:
# first illustration of how to flip the data to long form and clean up column names
# everything from the fifth column onwards is melted into (textdate, Cases) pairs
value_columns = raw_data.columns[4:]
covid19 = (
    raw_data
    .drop(columns=['Province/State', 'Lat', 'Long'])
    .melt(id_vars=['Country/Region'], value_vars=value_columns,
          var_name='textdate', value_name='Cases')
    # parse the whole column in one vectorised call rather than element by element
    .assign(Date=lambda df: pd.to_datetime(df['textdate']))
    .sort_values(by=['Date'])
    .drop(columns=['textdate'])
    # rename by label so the result does not depend on column position
    .rename(columns={'Country/Region': 'Country'})
)

In [324]:
# alphabetical list of the distinct Country values in the case data
sorted(covid19['Country'].unique())

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cruise Ship',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Djibouti',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'Equatorial Guinea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'France',
 'Gabon',
 'Gambia, The',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Greenland',
 'Guatemala',
 'Guinea',
 'Guyana',
 'Holy See',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan

In [325]:
# cases over time for Spain and South Korea, one colour per country
(
    alt.Chart(covid19[covid19['Country'].isin(['Spain', 'Korea, South'])])
    .mark_point()
    .encode(
        x=alt.X('monthdate(Date)', axis=alt.Axis(labelAngle=90)),
        y='Cases',
        color='Country',
    )
)

In [326]:
# cases over time for Italy, drawn as a line
(
    alt.Chart(covid19[covid19['Country'].isin(['Italy'])])
    .mark_line()
    .encode(
        x=alt.X('monthdate(Date)', axis=alt.Axis(labelAngle=90)),
        y='Cases',
        color='Country',
    )
)

In [327]:
# cases over time for the rows whose Country is China
(
    alt.Chart(covid19.loc[covid19['Country'] == 'China'])
    .mark_point()
    .encode(
        x=alt.X('monthdate(Date)', axis=alt.Axis(labelAngle=90)),
        y='Cases',
        color='Country',
    )
)

In [328]:
# cases over time for the rows whose Country is South Africa
(
    alt.Chart(covid19.loc[covid19['Country'] == 'South Africa'])
    .mark_point()
    .encode(
        x=alt.X('monthdate(Date)', axis=alt.Axis(labelAngle=90)),
        y='Cases',
        color='Country',
    )
)

In [331]:
# US data is in multiple states, so to see it all combined need to use groupby and aggregate using sum
value_columns = raw_data.columns[4:]
covid19 = (
    raw_data
    .drop(columns=['Lat', 'Long'])
    .melt(id_vars=['Country/Region', 'Province/State'], value_vars=value_columns,
          var_name='textdate', value_name='Cases')
    # parse the whole column in one vectorised call rather than element by element
    .assign(Date=lambda df: pd.to_datetime(df['textdate']))
    .sort_values(by=['Date'])
    .drop(columns=['textdate'])
    # rename by label so the result does not depend on column position
    .rename(columns={'Country/Region': 'Country'})
)
# Sum only the Cases column: Province/State holds text, and summing it either
# fails or concatenates strings depending on the pandas version.
grouped = covid19.groupby(['Date', 'Country'], as_index=False)['Cases'].sum()

In [332]:
# country-level (aggregated) cases over time for the US
(
    alt.Chart(grouped[grouped['Country'].isin(['US'])])
    .mark_point()
    .encode(
        x=alt.X('monthdate(Date)', axis=alt.Axis(labelAngle=90)),
        y='Cases',
        color='Country',
    )
)

In [333]:
# country-level (aggregated) cases over time for the US and Italy
(
    alt.Chart(grouped[grouped['Country'].isin(['US', 'Italy'])])
    .mark_point()
    .encode(
        x=alt.X('monthdate(Date)', axis=alt.Axis(labelAngle=90)),
        y='Cases',
        color='Country',
    )
)

In [163]:
# cases over time for the rows whose Country is Switzerland
(
    alt.Chart(covid19.loc[covid19['Country'] == 'Switzerland'])
    .mark_point()
    .encode(
        x=alt.X('monthdate(Date)', axis=alt.Axis(labelAngle=90)),
        y='Cases',
        color='Country',
    )
)

### Working with time since first case
As [this page](https://uofabioinformaticshub.github.io/COVID19/COVID19_Progression.html) shows, it is useful to examine each country's outbreak from the perspective of the date of the first known case. The following code takes this approach and also scales cases by population size.

In [334]:
# aggregate each country's cases
# Only Cases is summed: any remaining text column (e.g. Province/State) cannot be
# summed meaningfully. Selecting Cases also keeps this cell safe to re-run.
covid19 = covid19.groupby(['Date', 'Country'], as_index=False)['Cases'].sum()

# find the date of the first case in each country: the earliest date with Cases > 0
# (taking the minimum does not rely on the rows being sorted by date)
first_dates = covid19[covid19.Cases > 0].groupby('Country')['Date'].min().to_dict()

# Days elapsed since each country's first case, computed column-wise.
# A country with no positive cases has no entry in first_dates; it gets a
# missing value here instead of raising a KeyError.
covid19_with_delta = covid19.assign(
    DaysSinceFirstCase=lambda df: (
        df['Date'] - pd.to_datetime(df['Country'].map(first_dates))
    ).dt.days
)

In [187]:
# print the column dtypes and non-null counts of covid19_with_delta
covid19_with_delta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8664 entries, 0 to 8663
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                8664 non-null   datetime64[ns]
 1   Country             8664 non-null   object        
 2   Cases               8664 non-null   int64         
 3   DaysSinceFirstCase  8664 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 338.4+ KB


In [335]:
# keep only the rows on or after each country's first case
positive_cases = covid19_with_delta[covid19_with_delta['DaysSinceFirstCase'] >= 0]

# raw case counts against days since first case for a handful of countries
(
    alt.Chart(
        positive_cases[positive_cases['Country'].isin(
            ['US', 'United Kingdom', 'Italy', 'Germany', 'France'])]
    )
    .mark_point()
    .encode(x='DaysSinceFirstCase', y='Cases', color='Country')
)

In [336]:
# every country on one chart - crowded, but shows the overall spread
(
    alt.Chart(positive_cases)
    .mark_point()
    .encode(x='DaysSinceFirstCase', y='Cases', color='Country')
)

In [337]:
# pick up some data on world population by country
population_response = requests.get(
    'https://datahub.io/JohnSnowLabs/population-figures-by-country/r/population-figures-by-country-csv.csv',
    timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
population_response.raise_for_status()
# Only the `io` module is imported in this notebook, so StringIO must be qualified.
population_file = io.StringIO(population_response.text)

# the first CSV column becomes the index
population = pd.read_csv(population_file, index_col=0)

In [338]:
# drop all but the latest population, convert 2016 population to millions
# This cell overwrites `population`, so the guard lets it be re-run: once
# Year_2016 has been replaced by Population there is nothing left to do.
if 'Year_2016' in population.columns:
    population = (
        population[['Country_Code', 'Year_2016']]
        .assign(Population=lambda df: df['Year_2016'] / 1000000)
        .drop(columns=['Year_2016'])
    )
population

Unnamed: 0_level_0,Country_Code,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Aruba,ABW,0.104822
Afghanistan,AFG,34.656032
Angola,AGO,28.813463
Albania,ALB,2.876101
Andorra,AND,0.077281
...,...,...
Kosovo,XKX,1.816200
"Yemen, Rep.",YEM,27.584213
South Africa,ZAF,55.908865
Zambia,ZMB,16.591390


In [339]:
# map the country names in the positive_cases dataframe to country codes from the population dataframe
population_country_names = set(population.index)
# Hand-written codes, consulted only for case-data names that are not found
# in the population index.
unknown_country_to_code = {
    'Korea, South': 'KOR',
    'Taiwan*': 'TWN',
    'US': 'USA',
    'Russia': 'RUS',
    'Cruise Ship': 'CRU',
    'Egypt': 'EGY',
    'Iran': 'IRN',
    'North Macedonia': 'MKD',
    'Czechia': 'CZE',
    'Holy See': 'VAT',
    'Slovakia': 'SVK',
    'Martinique': 'MTQ',
    'Brunei': 'BRN',
    'Congo (Kinshasa)': 'COD',
    'Eswatini': 'SWZ',
    'Saint Lucia': 'LCA',
    'Saint Vincent and the Grenadines': 'VCT',
    'Congo (Brazzaville)': 'COG',
    'The Bahamas': 'BHS',
    'Kyrgyzstan': 'KGZ',
    'Venezuela': 'VEN'
}

country_name_mapping = {}
for name in positive_cases.Country.unique():
    if name in population_country_names:
        # Look up this country's own code; a fixed 'South Africa' lookup here
        # would give every matched country the same code.
        country_name_mapping[name] = population.loc[name, 'Country_Code']
    elif name in unknown_country_to_code:
        country_name_mapping[name] = unknown_country_to_code[name]
    else:
        # Not fatal: report the name so it can be added to unknown_country_to_code.
        print('Unknown:', name)

In [292]:
# country codes used by the case-data mapping that have no row in the population data
set(country_name_mapping.values()) - set(population['Country_Code'].unique())

{'CRU', 'MTQ', 'TWN', 'VAT'}

In [340]:
# scale the cases in a country by population size of that country
def cases_per_million_pop(row):
    """Return ``row.Cases`` divided by the population of ``row.Country``.

    The country name is translated to a code through the notebook-level
    ``country_name_mapping`` dict, and the code is looked up in the
    ``Country_Code`` column of the notebook-level ``population`` frame.

    Parameters
    ----------
    row : object with ``Country`` and ``Cases`` attributes, such as a row
        passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        ``Cases / Population`` for the matching population row, or NaN when
        the country has no code in ``country_name_mapping`` or the code has
        no row in ``population``.
    """
    # .get() so that a country missing from the mapping yields NaN, not a KeyError
    code = country_name_mapping.get(row.Country)
    if code is None:
        return numpy.nan
    matches = population.loc[population.Country_Code == code, 'Population']
    if matches.empty:
        return numpy.nan
    # take the scalar explicitly; calling float() on a one-element Series is deprecated
    return float(row.Cases / matches.iloc[0])

In [300]:
# add a CasesPerMillionPop column by applying the scaling function to every row
scaled_positive_cases = positive_cases.assign(
    CasesPerMillionPop=lambda frame: frame.apply(cases_per_million_pop, axis=1)
)

In [341]:
# cases per million population against days since first case
(
    alt.Chart(
        scaled_positive_cases[scaled_positive_cases['Country'].isin(
            ['US', 'United Kingdom', 'Italy', 'Germany', 'France'])]
    )
    .mark_point()
    .encode(x='DaysSinceFirstCase', y='CasesPerMillionPop', color='Country')
)

In [320]:
# the same comparison with more countries, drawn as lines on a log-scale Y axis
(
    alt.Chart(
        scaled_positive_cases[scaled_positive_cases['Country'].isin(
            ['US', 'United Kingdom', 'Italy', 'Germany', 'France',
             'Korea, South', 'South Africa', 'Japan'])],
        width=600,
        height=600,
    )
    .mark_line()
    .encode(
        x='DaysSinceFirstCase',
        y=alt.Y('CasesPerMillionPop', scale=alt.Scale(type='log')),
        color='Country',
    )
)

In [318]:
# linear-scale points for the same countries, with the country name as a tooltip
(
    alt.Chart(
        scaled_positive_cases[scaled_positive_cases['Country'].isin(
            ['US', 'United Kingdom', 'Italy', 'Germany', 'France',
             'Korea, South', 'South Africa', 'Japan'])]
    )
    .mark_point()
    .encode(
        x='DaysSinceFirstCase',
        y=alt.Y('CasesPerMillionPop'),
        color='Country',
        tooltip=['Country'],
    )
)

In [316]:
# alphabetical list of the distinct Country values in the scaled frame
sorted(scaled_positive_cases['Country'].unique())

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cruise Ship',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Djibouti',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'Equatorial Guinea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'France',
 'Gabon',
 'Gambia, The',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Greenland',
 'Guatemala',
 'Guinea',
 'Guyana',
 'Holy See',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan