This is a quick data-visualization notebook and still very much a work in progress.

Currently it shows confirmed cases and fatalities by country/region and province/state.

In [None]:
## Import required packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import tensorflow as tf
import geopandas as geopd
import datetime as dt

from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
## Load data
# Read the competition train/test tables and parse the 'Date' column into datetimes.
train=pd.read_csv("../input/covid19-global-forecasting-week-1/train.csv")
test=pd.read_csv("../input/covid19-global-forecasting-week-1/test.csv")
train['Date']=pd.to_datetime(train['Date'])
test['Date']=pd.to_datetime(test['Date'])

# 'Days' = whole days elapsed since 22 Jan 2020 (day 0).
# .dt.days yields the integer day count directly, avoiding the seconds -> float -> int round trip.
train['Days']=(train['Date'] - dt.datetime(2020,1,22)).dt.days
test['Days']=(test['Date'] - dt.datetime(2020,1,22)).dt.days

Given the exponential nature of the pandemic, the data is probably better expressed on a logarithmic scale.

In [None]:
## Transform to dB scale, base 10
# dB = 10*log10(count). A count of zero has no logarithm, so non-positive counts are
# masked to NaN *before* taking the log. This avoids the divide-by-zero warning and the
# -inf values, and does not depend on the row labels of `train` being 0..n-1.
train['ConfirmedCases_dB']=10*np.log10(train['ConfirmedCases'].where(train['ConfirmedCases']>0))
train['Fatalities_dB']=10*np.log10(train['Fatalities'].where(train['Fatalities']>0))

## Get unique countries and dates
# Both are sorted numpy arrays (np.unique sorts its result).
countryUnique=np.unique(train['Country/Region'])
dateUnique=np.unique(train['Date'])

Plotting confirmed cases and fatalities by country.

The values are shown on a decibel scale, dB = 10·log10(count):

0 dB = 1 case

10 dB = 10 cases

20 dB = 100 cases

30 dB = 1,000 cases

40 dB = 10,000 cases

50 dB = 100,000 cases

In [None]:
## Populate a geopandas world map with CC and F data
# NOTE(review): geopandas.datasets is, as far as I know, no longer shipped with
# geopandas >= 1.0 - confirm; on such versions the Natural Earth file must be read from a local path.
world = geopd.read_file(geopd.datasets.get_path('naturalearth_lowres'))

# Natural Earth country names that are spelled differently in the training data.
# (The Ivory Coast key uses the properly encoded "ô"; a mis-encoded key can never match.)
world_to_train_name = {
    'United States of America': 'US',
    'Dem. Rep. Congo': 'Congo (Kinshasa)',
    'Congo': 'Congo (Brazzaville)',
    'Dominican Rep.': 'Dominican Republic',
    'Côte d\'Ivoire': 'Cote d\'Ivoire',
    'Central African Rep.': 'Central African Republic',
    'Eq. Guinea': 'Equatorial Guinea',
    'Gambia': 'Gambia, The',
    'South Korea': 'Korea, South',
    'Taiwan': 'Taiwan*',
    'Bosnia and Herz.': 'Bosnia and Herzegovina',
}

# Country totals: some countries are split into provinces (several rows per date), so sum
# the raw counts per country and date first, then take the peak over all dates.
# Taking the max over individual rows would report the largest province, not the country.
country_peak = (train.groupby(['Country/Region','Date'])[['ConfirmedCases','Fatalities']].sum()
                     .groupby(level='Country/Region').max())
# Zero counts have no logarithm -> NaN (not drawn as a value on the map).
country_peak_dB = 10*np.log10(country_peak.where(country_peak>0))

# Use the map's own name when the training data knows it, otherwise try the lookup table.
direct_match = world['name'].isin(country_peak.index)
train_name = world['name'].where(direct_match, world['name'].map(world_to_train_name))
matched = train_name.isin(country_peak.index)

# Countries that cannot be matched to the training data keep the value 0.0 (float, so the
# column dtype does not change when real values are written); matched countries without
# any case are NaN.
world['ConfirmedCases_dB']=train_name.map(country_peak_dB['ConfirmedCases']).where(matched, 0.0)
world['Fatalities_dB']=train_name.map(country_peak_dB['Fatalities']).where(matched, 0.0)

In [None]:
## Plot a world map with the most recent data
# One panel per quantity, each with its own colour bar on a shared 0-50 dB colour scale.
fig, ax = plt.subplots(2,1,figsize=(20,10))
for panel, column, title in zip(ax,
                                ['ConfirmedCases_dB','Fatalities_dB'],
                                ['Confirmed Cases (dB)','Fatalities (dB)']):
    # Reserve a narrow strip to the right of the map for the colour bar.
    divider = make_axes_locatable(panel)
    cax = divider.append_axes("right", size="5%", pad=0.1)
    world.plot(column=column, cmap='jet', ax=panel, legend=True, cax=cax, vmin=0, vmax=50)
    panel.set_title(title)


In [None]:
## Plot confirmed cases and fatalities by country
# One small panel per country: confirmed cases and fatalities (both in dB) against the day number.
numCols = 10
# Size the grid from the data so that more countries cannot overflow a fixed number of rows.
numRows = max(1, int(np.ceil(len(countryUnique)/numCols)))
# squeeze=False keeps `ax` two-dimensional even for a single row.
fig, ax = plt.subplots(numRows,numCols,figsize=(20,20*numRows/17),squeeze=False)
fig.tight_layout(pad=1.0)

for country in range(0,len(countryUnique)):
    row, col = np.divmod(country,numCols)

    # Sum up countries that are divided into province: one total per day, on the raw counts
    # (summing raw counts avoids converting dB values back to linear and forth again).
    in_country = train['Country/Region']==countryUnique[country]
    totals = train.loc[in_country].groupby('Days')[['ConfirmedCases','Fatalities']].sum()
    days_unique = totals.index.to_numpy()

    # dB = 10*log10(count); days with a zero total become NaN instead of -inf.
    cc_unique = (10*np.log10(totals['ConfirmedCases'].where(totals['ConfirmedCases']>0))).to_numpy()
    f_unique = (10*np.log10(totals['Fatalities'].where(totals['Fatalities']>0))).to_numpy()
    has_cases = ~np.isnan(cc_unique)

    ax[row,col].set_title(str(countryUnique[country]))
    ax[row,col].set_ylim([-5,50])
    if days_unique.size>0:
        # Plotting
        sn.scatterplot(x=days_unique,y=cc_unique,ax=ax[row,col])
        sn.scatterplot(x=days_unique,y=f_unique,ax=ax[row,col])
        # Start the x-axis at the day of the first confirmed case, but never later than day 50.
        x_lower_lim = min(50, days_unique[has_cases].min()) if has_cases.any() else 0
        ax[row,col].set_xlim([x_lower_lim,days_unique.max()])
    ax[row,col].set_ylabel('CC / F (dB)')
    ax[row,col].set_xlabel('',visible=False)
    ax[row,col].grid(1)

    # set background gray if no cases
    if not has_cases.any():
        ax[row,col].set_facecolor([0.8,0.8,0.8])

# Hide the panels of the last row that have no country assigned.
for unused_panel in ax.ravel()[len(countryUnique):]:
    unused_panel.set_visible(False)


Some of the countries are further sub-divided by province/state: US, Canada, China, Netherlands, Australia, Denmark, UK, France, Cruise Ship. The x-axis for these graphs is wrong.

Also Diamond/Grand Princess (listed as Province/State) has two parts.

Try to subdivide by province/state.

In [None]:
## Get unique countries + province combinations
# Rows that carry a province, renumbered 0..n-1 so that the index is a row position.
with_province = train.dropna(subset=['Province/State']).reset_index(drop=True)

# De-duplicate on the (country, province) pair rather than on the province name alone:
# a province name that occurs under two countries stays two separate entries, each with
# its own country. Sorted by province name, then country.
province_pairs = (with_province[['Country/Region','Province/State']]
                  .drop_duplicates()
                  .sort_values(['Province/State','Country/Region']))
provinceUniqueIdx = province_pairs.index.to_numpy()           # position of each pair's first row
provinceUnique = province_pairs['Province/State'].to_numpy()  # may repeat a name across countries

# Combined, aligned lists: first every country (province = NaN), then every province entry
# together with the country it belongs to.
countryUniqueWithProvince=np.append(countryUnique,province_pairs['Country/Region'].to_numpy())
emptyArray=np.full(countryUnique.shape,np.nan)
provinceUniqueWithProvince=np.append(emptyArray,provinceUnique)

In [None]:
## Plot confirmed cases and fatalities by province
# One small panel per entry of countryUniqueWithProvince / provinceUniqueWithProvince:
# a NaN province means "whole country", otherwise the panel shows that single province.
numCols = 10
numPanels = len(countryUniqueWithProvince)
# Size the grid from the data so that more panels cannot overflow a fixed number of rows.
numRows = max(1, int(np.ceil(numPanels/numCols)))
# squeeze=False keeps `ax` two-dimensional even for a single row.
fig, ax = plt.subplots(numRows,numCols,figsize=(20,35*numRows/30),squeeze=False)
fig.tight_layout(pad=1.0)

# colour code those countries with "provinces" (RGB, 0-255)
province_country_colours = {
    'US': [204,229,255],
    'China': [255,204,204],
    'Canada': [229,255,204],
    'Australia': [255,255,204],
    'United Kingdom': [229,204,255],
    'Netherlands': [255,204,255],
    'Denmark': [255,204,153],
    'France': [204,255,153],
    'Cruise Ship': [229,171,100],
}

for country in range(0,numPanels):
    row, col = np.divmod(country,numCols)
    country_name = countryUniqueWithProvince[country]
    province_name = provinceUniqueWithProvince[country]

    # Select the rows of this panel. A province is matched on country AND province so that
    # rows of a same-named province in another country are not mixed in.
    in_panel = train['Country/Region']==country_name
    if pd.isna(province_name):
        ax[row,col].set_title(str(country_name))
    else:
        ax[row,col].set_title(str(province_name))
        in_panel = in_panel & (train['Province/State']==province_name)

    # One total per day on the raw counts (for a whole country this sums its provinces).
    totals = train.loc[in_panel].groupby('Days')[['ConfirmedCases','Fatalities']].sum()
    days = totals.index.to_numpy()

    # dB = 10*log10(count); days with a zero total become NaN instead of -inf.
    cc = (10*np.log10(totals['ConfirmedCases'].where(totals['ConfirmedCases']>0))).to_numpy()
    f = (10*np.log10(totals['Fatalities'].where(totals['Fatalities']>0))).to_numpy()
    has_cases = ~np.isnan(cc)

    ax[row,col].set_ylim([-5,50])
    if days.size>0:
        # Plotting
        sn.scatterplot(x=days,y=cc,ax=ax[row,col])
        sn.scatterplot(x=days,y=f,ax=ax[row,col])
        # Start the x-axis at the day of the first confirmed case, but never later than day 50;
        # the upper limit is this panel's own last day.
        x_lower_lim = min(50, days[has_cases].min()) if has_cases.any() else 0
        ax[row,col].set_xlim([x_lower_lim,days.max()])
    ax[row,col].set_ylabel('CC / F (dB)')
    ax[row,col].set_xlabel('',visible=False)
    ax[row,col].grid(1)

    if country_name in province_country_colours:
        ax[row,col].set_facecolor(np.array(province_country_colours[country_name])/255)

    # set background gray if no cases
    if not has_cases.any():
        ax[row,col].set_facecolor([0.8,0.8,0.8])

# Hide the panels of the last row that have nothing assigned.
for unused_panel in ax.ravel()[numPanels:]:
    unused_panel.set_visible(False)


Work to be continued.

1) Fix the way countries with provinces are grouped (done)

2) Figure out how best to reshape the dataframe. By country? Or by date?

3) Plot based on date, or date since a certain day (done)