In [None]:
# imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import matplotlib as mpl
import geopandas as gpd
import geoplot as gplot
import gc

#### Define some helper functions first

In [None]:
def extract_date(line, start_year='2020'):
    '''Extract a date from a line that starts with "<day> <month name>".

    Parameters
    ----------
    line : str
        A line of the raw text, e.g. "5 March".
    start_year : str or int
        Year to prepend to the result (the line itself is not searched for one).

    Returns
    -------
    str or bool
        The date as 'YYYYMMDD' (day zero-padded to two digits) when the line
        starts with a day number followed by an English month name,
        otherwise False.
    '''

    MONTHS = {'January' : '01',
              'February' :'02' ,
              'March' : '03',
              'April' : '04',
              'May' : '05',
              'June' : '06',
              'July' : '07',
              'August' : '08',
              'September' : '09',
              'October' : '10',
              'November' : '11',
              'December' : '12'}

    for month_name, month_number in MONTHS.items():
        # raw strings: '\d' and '\s' must reach the regex engine untouched
        if re.match(r'^\d+\s+' + month_name, line):
            day = re.split(r'\s+', line)[0]
            if len(day) < 2:
                day = '0' + day
            # no month name is a prefix of another, so the first hit is the only hit
            return str(start_year) + month_number + day

    return False

### Load data

In [None]:
# city / region / country lookup table, downloaded from
# https://datahub.io/core/world-cities#resource-world-cities
world_data = pd.read_csv('world-cities.csv').astype(
    {'country': str, 'subcountry': str, 'name': str}
)

# distinct country, region and city names found in the lookup table
countries_l, regions_l, cities_l = (
    list(world_data[column].unique())
    for column in ('country', 'subcountry', 'name')
)

#### Ingest data and turn it into list of lines

In [None]:
# read the raw report file
with open('covid19.txt', 'r') as f:
    # the whole file as one string
    text = f.read()
    # one list entry per line, newline characters removed
    lines = text.split('\n')

#### Parse data and attempt to extract meaningful information

In [None]:
# columns of the final dataframe, filled row by row while parsing
parsed = {
    'date': [],
    'time': [],
    'timestamp': [],
    'country': [],
    'region': [],
    'new_cases': [],
    'new_deaths': [],
}

# Regexes are compiled once (not once per line) and written as raw strings so
# that '\d' and '\s' reach the regex engine instead of being invalid string escapes.
timestamp_re = re.compile(r'^(\d+):(\d+)')
cases_re = re.compile(r'(\d+)\s+(?:new\s+)?case')
first_n_cases_re = re.compile(r'First\s(\d+)\scase')
first_case_re = re.compile(r'First\s+case')
deaths_re = re.compile(r'(\d+)\s+(?:new\s+)?death')

# parse data
# NOTE: place names are found by plain substring search, which is a crude heuristic
current_date = None
for line in lines:
    # a line starting with "<day> <month>" sets the date for the lines that follow
    date = extract_date(line)
    if date:
        current_date = date
        continue

    country = 'unknown'
    subregion = 'unknown'
    city = 'unknown'
    new_cases = 0
    new_deaths = 0

    # get country (when several names occur in the line, the last one in the list wins)
    for x in countries_l:
        if x in line:
            country = x

    # get subregion (state / province etc.)
    for x in regions_l:
        if x in line:
            subregion = x

    # get city
    for x in cities_l:
        if x in line:
            city = x

    # try to guess country based on subregion
    if country == 'unknown':
        if subregion != 'unknown':
            country = world_data.loc[world_data['subcountry'] == subregion, 'country'].values[0]

    # try to guess country based on city
    if country == 'unknown':
        if city != 'unknown':
            country = world_data.loc[world_data['name'] == city, 'country'].values[0]

    # give up if country still unknown
    if country == 'unknown':
        print(line)
        continue

    # try to guess subregion based on city
    if subregion == 'unknown':
        if city != 'unknown':
            subregion = world_data.loc[world_data['name'] == city, 'subcountry'].values[0]

    # timestamp: a leading "H:MM" / "HH:MM", defaulting to midnight
    match = timestamp_re.match(line)
    if match:
        raw_timestamp = match.group(0)
        # zero-pad so that date + timestamp always has the same number of digits;
        # otherwise "9:30" would yield a shorter (numerically smaller) time key
        timestamp = match.group(1).zfill(2) + match.group(2).zfill(2)
    else:
        raw_timestamp = '00:00'
        timestamp = '0000'

    # drop the "HH:MM: " prefix so its digits cannot be mistaken for counts
    line = line.replace(raw_timestamp + ': ', '')

    # new cases; later patterns take precedence over earlier ones
    match = cases_re.search(line)
    if match:
        new_cases = int(match.group(1))

    match = first_n_cases_re.search(line)
    if match:
        new_cases = int(match.group(1))

    if first_case_re.search(line):
        new_cases = 1

    # deaths
    match = deaths_re.search(line)
    if match:
        new_deaths = int(match.group(1))

    # counts are ints here, so a parsed "0" is recognised as nothing to record
    if new_cases == 0 and new_deaths == 0:
        continue

    # a record seen before any date header cannot be placed on the timeline
    if current_date is None:
        continue

    parsed['date'].append(current_date)
    parsed['country'].append(country)
    parsed['region'].append(subregion)
    parsed['timestamp'].append(timestamp)
    parsed['time'].append(str(current_date)+str(timestamp))
    parsed['new_cases'].append(new_cases)
    parsed['new_deaths'].append(new_deaths)

### Prepare for plotting

#### Data into dataframe

In [None]:
# build the dataframe and cast every column that holds numbers to int in one step
df = pd.DataFrame(parsed).astype(
    dict.fromkeys(['new_cases', 'new_deaths', 'date', 'time', 'timestamp'], int)
)

#### Load geography maps

In [None]:
# world country outlines
# NOTE(review): gpd.datasets is deprecated in recent geopandas releases — confirm
# against the installed version
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# rename the USA entry to 'United States'
world['name'] = world['name'].replace('United States of America', 'United States')

In [None]:
# USA: geometries from geoplot's bundled 'contiguous_usa' example dataset
contiguous_usa = gpd.read_file(gplot.datasets.get_path('contiguous_usa'))

#### The following 2 functions do the plotting.
Both are very similar, and I guess I could have written just one and made it very configurable, but this keeps the code simpler. Both generate a graphics file and dump it in the current directory. Running them in a loop lets you generate frames, which can afterwards be animated with a utility like ffmpeg.

In [None]:
def render_frame(data=None,
                 map_data=None,
                 time_point=None,
                 feature='new_cases',
                 extent=(-180,-90,180,90),
                 dims=(16,8),
                 dpi=150,
                 img_type='png',
                 title_annot='new cases',
                 cmap='OrRd',
                 color_min=0,
                 color_max=4,
                 verbose=False,
                 save=False,
                 ax=None):
    '''Render one choropleth frame of per-country totals, optionally saving it.

    Parameters
    ----------
    data : DataFrame with 'time', 'country' and `feature` columns.
    map_data : geo dataframe with a 'name' column that is matched against
        the values of data['country'].
    time_point : number; rows with data['time'] <= time_point are summed.
        Its string form is sliced as YYYYMMDD followed by the time for the title.
    feature : name of the column to sum.
    extent : passed through to gplot.choropleth.
    dims : figure size, used only when `ax` is None.
    dpi, img_type : resolution and file extension of the saved image.
    title_annot : text inserted into the plot title.
    cmap : colormap (name or object).
    color_min, color_max : colour scale limits, in log10 units of the sum.
    verbose : print the output file name when saving.
    save : write '<time_point>.<img_type>' to the current directory.
    ax : existing axes to draw on; a new figure is created when None.
    '''

    # normalize colormap
    norm = mpl.colors.Normalize(vmin=color_min, vmax=color_max)
    cmap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap).cmap

    # sum numbers by country up to time_point
    totals = data.loc[data['time'] <= time_point].groupby('country')[feature].sum()
    # log10(0) is -inf (plus a RuntimeWarning); countries whose sum is zero keep
    # the 0.0 default below, exactly like countries that have no rows at all
    sum_by_country = np.log10(totals[totals > 0])

    # merge w/ map data
    geodata = map_data.copy()
    geodata[feature] = np.zeros(geodata.shape[0])
    for country in sum_by_country.index:
        geodata.loc[geodata['name'] == country, feature] = sum_by_country.loc[country]

    # plot on the supplied axes, or on a figure of our own
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(figsize=dims)
    else:
        # previously `fig` was undefined on this path and saving raised NameError
        fig = ax.figure

    gplot.choropleth(geodata,
                     hue=feature,
                     cmap=cmap,
                     ax=ax,
                     norm=norm,
                     extent=extent,
                     legend = True)

    # derive date and time for plot title
    stamp = str(time_point)
    str_date = stamp[0:4] + '-' + stamp[4:6] + '-' + stamp[6:8]
    str_time = stamp[8:]

    ax.set_title('COVID19 '+ title_annot +' over time ' + str_date + ' ' + str_time)

    if save:
        fname = stamp + '.' + img_type

        if verbose:
            print(fname)

        # save this figure explicitly rather than whatever pyplot considers current
        fig.savefig(fname, dpi=dpi)
        if owns_figure:
            # only tear down a figure created here; a caller-supplied one is left intact
            fig.clf()
            plt.close(fig)

In [None]:
def render_frame_usa(data=None,
                     map_data=None,
                     time_point=None,
                     feature='new_cases',
                     extent=(-130,25,-65,55),
                     dims=(16,8),
                     dpi=150,
                     img_type='png',
                     title_annot='new cases',
                     cmap='OrRd',
                     color_min=0,
                     color_max=3,
                     verbose=False,
                     save=False,
                     ax=None):
    '''Render one choropleth frame of per-state totals for the United States.

    Parameters
    ----------
    data : DataFrame with 'time', 'country', 'region' and `feature` columns;
        only rows whose country is 'United States' are used.
    map_data : geo dataframe with a 'state' column that is matched against
        the values of data['region'].
    time_point : number; rows with data['time'] <= time_point are summed.
        Its string form is sliced as YYYYMMDD followed by the time for the title.
    feature : name of the column to sum.
    extent : passed through to gplot.choropleth.
    dims : figure size, used only when `ax` is None.
    dpi, img_type : resolution and file extension of the saved image.
    title_annot : text inserted into the plot title.
    cmap : colormap (name or object).
    color_min, color_max : colour scale limits, in log10 units of the sum.
    verbose : print the output file name when saving.
    save : write '<time_point>.<img_type>' to the current directory.
    ax : existing axes to draw on; a new figure is created when None.
    '''

    # normalize colormap
    norm = mpl.colors.Normalize(vmin=color_min, vmax=color_max)
    cmap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap).cmap

    # sum numbers by state up to time_point
    usa = data.loc[data['country'] == 'United States']
    totals = usa.loc[usa['time'] <= time_point].groupby('region')[feature].sum()
    # log10(0) is -inf (plus a RuntimeWarning); states whose sum is zero keep
    # the 0.0 default below, exactly like states that have no rows at all
    sum_by_state = np.log10(totals[totals > 0])

    # merge w/ map data
    geodata = map_data.copy()
    geodata[feature] = np.zeros(geodata.shape[0])
    for state in sum_by_state.index:
        geodata.loc[geodata['state'] == state, feature] = sum_by_state.loc[state]

    # plot on the supplied axes, or on a figure of our own
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=dims)
    else:
        # previously `fig` was undefined on this path and saving raised NameError
        fig = ax.figure

    gplot.choropleth(geodata,
                     hue=feature,
                     cmap=cmap,
                     ax=ax,
                     norm=norm,
                     extent=extent,
                     legend = True)

    # derive date and time for plot title
    stamp = str(time_point)
    str_date = stamp[0:4] + '-' + stamp[4:6] + '-' + stamp[6:8]
    str_time = stamp[8:]

    ax.set_title('COVID19 USA '+ title_annot +' over time ' + str_date + ' ' + str_time)

    if save:
        fname = stamp + '.' + img_type

        if verbose:
            print(fname)

        # save this figure explicitly rather than whatever pyplot considers current
        fig.savefig(fname, dpi=dpi)
        if owns_figure:
            # only tear down a figure created here; a caller-supplied one is left intact
            fig.clf()
            plt.close(fig)

In [None]:
def render_sub_frame_per_country(params=None):
    '''Draw one per-country choropleth onto an existing axes.

    `params` is a dict that must provide:
      'data'        DataFrame with 'time', 'country' and the feature column
      'map_data'    geo dataframe with a 'name' column matched against countries
      'time_point'  number; rows with data['time'] <= time_point are summed
      'feature'     name of the column to sum
      'ax'          axes to draw on
      'extent'      passed through to gplot.choropleth
      'cmap'        colormap (name or object)
      'color_min', 'color_max'  colour scale limits, in log10 units of the sum
      'title_annot' text inserted into the plot title
    '''
    feature = params['feature']
    time_point = params['time_point']
    data = params['data']

    # normalize colormap
    norm = mpl.colors.Normalize(vmin=params['color_min'], vmax=params['color_max'])
    cmap = mpl.cm.ScalarMappable(norm=norm, cmap=params['cmap']).cmap

    # sum numbers by country up to time_point
    totals = data.loc[data['time'] <= time_point].groupby('country')[feature].sum()
    # log10(0) is -inf (plus a RuntimeWarning); countries whose sum is zero keep
    # the 0.0 default below, exactly like countries that have no rows at all
    sum_ = np.log10(totals[totals > 0])

    # merge w/ map data
    geodata = params['map_data'].copy()
    geodata[feature] = np.zeros(geodata.shape[0])

    for country in sum_.index:
        geodata.loc[geodata['name'] == country, feature] = sum_.loc[country]

    gplot.choropleth(geodata,
                     hue=feature,
                     cmap=cmap,
                     ax=params['ax'],
                     norm=norm,
                     extent=params['extent'],
                     legend = True)

    # derive date and time for plot title
    stamp = str(time_point)
    str_date = stamp[0:4] + '-' + stamp[4:6] + '-' + stamp[6:8]
    str_time = stamp[8:]
    params['ax'].set_title('COVID19 '+ params['title_annot'] +' over time ' + str_date + ' ' + str_time)

In [None]:
def render_sub_frame_per_state(params=None):
    '''Draw one per-state (United States) choropleth onto an existing axes.

    `params` is a dict that must provide:
      'data'        DataFrame with 'time', 'country', 'region' and the feature
                    column; only rows whose country is 'United States' are used
      'map_data'    geo dataframe with a 'state' column matched against regions
      'time_point'  number; rows with data['time'] <= time_point are summed
      'feature'     name of the column to sum
      'ax'          axes to draw on
      'extent'      passed through to gplot.choropleth
      'cmap'        colormap (name or object)
      'color_min', 'color_max'  colour scale limits, in log10 units of the sum
      'title_annot' text inserted into the plot title
    '''
    feature = params['feature']
    time_point = params['time_point']

    # normalize colormap
    norm = mpl.colors.Normalize(vmin=params['color_min'], vmax=params['color_max'])
    cmap = mpl.cm.ScalarMappable(norm=norm, cmap=params['cmap']).cmap

    # sum numbers by state up to time_point
    usa = params['data'].loc[params['data']['country'] == 'United States']
    totals = usa.loc[usa['time'] <= time_point].groupby('region')[feature].sum()
    # log10(0) is -inf (plus a RuntimeWarning); states whose sum is zero keep
    # the 0.0 default below, exactly like states that have no rows at all
    sum_ = np.log10(totals[totals > 0])

    # merge w/ map data
    geodata = params['map_data'].copy()
    geodata[feature] = np.zeros(geodata.shape[0])

    for state in sum_.index:
        geodata.loc[geodata['state'] == state, feature] = sum_.loc[state]

    gplot.choropleth(geodata,
                     hue=feature,
                     cmap=cmap,
                     ax=params['ax'],
                     norm=norm,
                     extent=params['extent'],
                     legend = True)

    # derive date and time for plot title
    stamp = str(time_point)
    str_date = stamp[0:4] + '-' + stamp[4:6] + '-' + stamp[6:8]
    str_time = stamp[8:]
    # 'USA ' needs its trailing space, otherwise the title reads "USAcumulative ..."
    params['ax'].set_title('COVID19 USA '+ params['title_annot'] +' over time ' + str_date + ' ' + str_time)

In [None]:
def plot_by_region(data=None,
                    start_date=None,
                    end_date=None,
                    timestep=200,
                    dims=(12,12),
                    dpi=150,
                    mode='country',
                    params=None):
    '''Render and save one multi-panel frame per time step.

    Wrapper around render_sub_frame_per_country and render_sub_frame_per_state.
    For every date in `data` (within the optional bounds) and every time of day
    in range(0, 2500, timestep), one figure with len(params) stacked panels is
    drawn and written to '<date><time>.png' in the current directory.

    Parameters
    ----------
    data : DataFrame with a 'date' column plus whatever the render function reads.
    start_date, end_date : inclusive bounds on data['date']; default to the
        first / last date present.
    timestep : step between frames, added to a four-digit time of day
        (200 steps from 0000 to 0200, i.e. two hours).
    dims : figure size.
    dpi : resolution of the saved images.
    mode : 'country' renders per country; any other value renders per US state.
    params : list of parameter dicts, one per panel. Each is copied and extended
        with 'data', 'time_point' and 'ax' before rendering; the caller's dicts
        are left unmodified.
    '''

    dates = np.sort(data['date'].unique())
    if start_date is None:
        start_date = dates[0]

    if end_date is None:
        end_date = dates[-1]

    dates = dates[(dates >= start_date) & (dates <= end_date)]
    n_plots = len(params)

    if mode == 'country':
        render = render_sub_frame_per_country
    else:
        render = render_sub_frame_per_state

    for date in dates:
        for time_of_day in range(0, 2500, timestep):
            # date + zero-padded 4-digit time, e.g. 20200305 and 200 -> 202003050200
            this_time = int(str(date) + str(time_of_day).zfill(4))

            # squeeze=False always returns a 2-D array of axes, even for a single
            # panel, so no special-casing of n_plots == 1 is needed
            fig, axs = plt.subplots(n_plots, figsize=dims, squeeze=False)

            for fdict, ax in zip(params, axs.ravel()):
                # work on a copy: the caller's dict stays untouched and does not
                # keep a reference to this frame's axes after the figure is closed
                render(params=dict(fdict, data=data, time_point=this_time, ax=ax))

            # honour the dpi argument (it used to be ignored in favour of a global)
            fig.savefig(str(this_time) + '.png', dpi=dpi)
            fig.clf()
            plt.close(fig)

            # many figures are created in this loop; reclaim their memory eagerly
            gc.collect()

### Plot stuff

In [None]:
# shared settings for the plotting cells below

# column to aggregate and the matching title text
feature_spread = 'new_cases'
feature_deaths = 'new_deaths'
title_spread = 'cumulative cases'
title_deaths = 'cumulative deaths'

# map extents per region
extent_world = (-180, -90, 180, 90)
extent_europe = (-20, 30, 60, 80)
extent_usa = (-130, 25, -65, 55)
extent_africa = (-25, -40, 60, 50)

# upper end of the colour scale per region, as log10 of the count
color_max_world_spread = 4     # 10000
color_max_world_deaths = 3     # 1000
color_max_europe_spread = 3    # 1000
color_max_europe_deaths = 2.7  # 500
color_max_usa_spread = 3       # 1000
color_max_usa_deaths = 1       # 10
color_max_africa_spread = 2    # 100
color_max_africa_deaths = 1    # 10

# figure settings
FIG_DIMS_WORLD_USA = (12, 12)
FIG_DIMS_AFRICA = (8, 12)
FIG_DPI = 150
TIMESTEP = 200  # this is 2 hours, not 200 minutes

In [None]:
# parameter dictionaries for plotting: one per (region, feature) pair.
# Case maps use the 'Greens' colormap, death maps use 'Reds'.

# Europe
europe_spread_dict = dict(color_min=0,
                          color_max=color_max_europe_spread,
                          cmap='Greens',
                          map_data=world,
                          feature=feature_spread,
                          extent=extent_europe,
                          title_annot=title_spread)

europe_death_dict = dict(color_min=0,
                         color_max=color_max_europe_deaths,
                         cmap='Reds',
                         map_data=world,
                         feature=feature_deaths,
                         extent=extent_europe,
                         title_annot=title_deaths)

# world
world_spread_dict = dict(color_min=0,
                         color_max=color_max_world_spread,
                         cmap='Greens',
                         map_data=world,
                         feature=feature_spread,
                         extent=extent_world,
                         title_annot=title_spread)

world_death_dict = dict(color_min=0,
                        color_max=color_max_world_deaths,
                        cmap='Reds',
                        map_data=world,
                        feature=feature_deaths,
                        extent=extent_world,
                        title_annot=title_deaths)

# USA (uses the state-level map)
usa_spread_dict = dict(color_min=0,
                       color_max=color_max_usa_spread,
                       cmap='Greens',
                       map_data=contiguous_usa,
                       feature=feature_spread,
                       extent=extent_usa,
                       title_annot=title_spread)

usa_death_dict = dict(color_min=0,
                      color_max=color_max_usa_deaths,
                      cmap='Reds',
                      map_data=contiguous_usa,
                      feature=feature_deaths,
                      extent=extent_usa,
                      title_annot=title_deaths)

# Africa
africa_spread_dict = dict(color_min=0,
                          color_max=color_max_africa_spread,
                          cmap='Greens',
                          map_data=world,
                          feature=feature_spread,
                          extent=extent_africa,
                          title_annot=title_spread)

africa_death_dict = dict(color_min=0,
                         color_max=color_max_africa_deaths,
                         cmap='Reds',
                         map_data=world,
                         feature=feature_deaths,
                         extent=extent_africa,
                         title_annot=title_deaths)

In [None]:
# render the USA frames: cases in the top panel, deaths in the bottom one
plot_by_region(
    data=df,
    mode='usa',
    params=[usa_spread_dict, usa_death_dict],
    timestep=TIMESTEP,
    dims=FIG_DIMS_WORLD_USA,
    dpi=FIG_DPI,
)