In [None]:
# imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import matplotlib as mpl
import geopandas as gpd
import geoplot as gplot
import gc

#### Define some helper functions first

In [None]:
def extract_date(line, start_year='2020'):
    '''Extract a date header of the form "<day> <Month name>" from a line.

    Parameters
    ----------
    line : str
        One line of the raw report.
    start_year : str
        Year prepended to the result (the header itself carries no year).

    Returns
    -------
    str or bool
        The date as a 'YYYYMMDD' string when the line starts with a day
        number followed by an English month name, otherwise False.
    '''

    MONTHS = {'January' : '01',
              'February' :'02' ,
              'March' : '03',
              'April' : '04',
              'May' : '05',
              'June' : '06',
              'July' : '07',
              'August' : '08',
              'September' : '09',
              'October' : '10',
              'November' : '11',
              'December' : '12'}

    # Capture the whole word after the day number so that only an exact month
    # name counts (a prefix test would accept e.g. "12 Mayor ..." as May).
    header = re.match(r'^(\d+)\s+([A-Za-z]+)', line)
    if not header:
        return False

    day, month_name = header.groups()
    if month_name not in MONTHS:
        return False

    # single-digit days are zero-padded to keep the YYYYMMDD layout
    return start_year + MONTHS[month_name] + day.zfill(2)

### Load data

#### Load geography data

In [None]:
# City / region / country table
# from https://datahub.io/core/world-cities#resource-world-cities
world_data = pd.read_csv('world-cities.csv').astype(
    {'country' : str, 'subcountry' : str, 'name' : str})

# after the cast to str, missing values show up as the literal text 'nan'
for place_column in ('country', 'subcountry', 'name'):
    world_data.loc[world_data[place_column] == 'nan', place_column] = 'unknown'

# strip administrative suffixes from region names, one suffix after the other
for admin_suffix in (' Sheng',
                     ' Shi',
                     ' Zhuangzu',
                     ' Uygur',
                     ' Zizhiqu',
                     ' Autonomous Region',
                     ' Huizu'):
    world_data['subcountry'] = world_data['subcountry'].apply(
        lambda region, suffix=admin_suffix: region.replace(suffix, ''))

# lookup lists scanned by the parser
countries_l = world_data['country'].unique()
regions_l = world_data['subcountry'].unique()
cities_l = world_data['name'].unique()

In [None]:
# Parts of the UK that are merged under a single 'United Kingdom' label
uk_nations = ['England', 'Scotland', 'Wales', 'Northern Ireland']

# China (provinces)
# from https://github.com/deldersveld/topojson
china = gpd.read_file('china-provinces.json')
for old_name, new_name in (('Nei Mongol', 'Inner Mongolia'),
                           ('Xinjiang Uygur', 'Xinjiang'),
                           ('Xizang', 'Tibet'),
                           ('Ningxia Hui', 'Ningxia')):
    china.loc[china['NAME_1'] == old_name, 'NAME_1'] = new_name
china = china.rename(columns={'NAME_1' : 'geounit'})

# world (countries); rows without a geometry are dropped
world = gpd.read_file('world-countries.json')
world = world.drop(world.loc[world['geometry'].is_empty].index)
world = world.rename(columns={'name' : 'geounit'})
world.loc[world['geounit'] == 'United States of America', 'geounit'] = 'United States'
# make the UK
world.loc[world['geounit'].isin(uk_nations), 'geounit'] = 'United Kingdom'

# USA (states)
usa = gpd.read_file('us-albers.json')
usa = usa.drop(usa.loc[usa['geometry'].is_empty].index)
usa = usa.rename(columns={'name' : 'geounit'})

# europe
europe = gpd.read_file('europe.json')
europe = europe.drop(europe.loc[europe['geometry'].is_empty].index)
# remove Russia for now
europe = europe.drop(europe.loc[europe['geounit'] == 'Russia'].index)
# make the UK
europe.loc[europe['geounit'].isin(uk_nations), 'geounit'] = 'United Kingdom'

# africa
africa = gpd.read_file('africa.json')
africa = africa.drop(africa.loc[africa['geometry'].is_empty].index)

# asia
asia = gpd.read_file('asia.json')
asia = asia.drop(asia.loc[asia['geometry'].is_empty].index)
# remove Russia for now
asia = asia.drop(asia.loc[asia['geounit'] == 'Russia'].index)

# south america
samerica = gpd.read_file('south-america.json')
samerica = samerica.drop(samerica.loc[samerica['geometry'].is_empty].index)

#### Ingest COVID19 data and turn it into a list of lines
Source: https://bnonews.com/index.php/2020/02/the-latest-coronavirus-cases/

In [None]:
# ingest data: the whole report is read into one string
with open('covid19.txt') as f:
    text = f.read()

# one list entry per line of the report (newlines removed)
lines = text.split('\n')

#### Parse data and attempt to extract meaningful information

In [None]:
# column name -> list of values; filled by the parser and turned into a
# dataframe later
parsed = {column_name: [] for column_name in ('date',
                                              'time',
                                              'timestamp',
                                              'country',
                                              'region',
                                              'new_cases',
                                              'new_deaths')}

# parse data
# Each line is either a date header (which sets the date for the records that
# follow) or a report line. From a report line we take the leading timestamp,
# the case / death counts and the first country / region / city name found in
# the text before the first period. Lines without counts are skipped silently;
# lines with counts but no usable country or date are printed with a '@ '
# prefix and skipped.

def _count_before(noun, text):
    '''Return the number written in front of `noun` ("12 new cases",
    "1 , 123 deaths") as a digit string, or None when there is none.

    `text` is expected to have its commas already padded with spaces, which
    is why a thousands separator appears as " , ". A comma-grouped number
    takes precedence over a plain one.
    '''
    count = None

    plain = re.search(r'(\d+)\s+(?:new\s+)?' + noun, text)
    if plain:
        count = plain.group(1)

    # capture numbers like 1,123
    grouped = re.search(r'(\d+)\s,\s(\d+)\s+(?:new\s+)?' + noun, text)
    if grouped:
        count = grouped.group(1) + grouped.group(2)

    return count


current_date = None
for line in lines:
    date = extract_date(line)
    if date:
        current_date = date
        continue

    country = 'unknown'
    subregion = 'unknown'
    city = 'unknown'
    new_cases = 0
    new_deaths = 0

    # timestamp: a leading "HH:MM", midnight when the line has none
    stamp = re.match(r'\d+:\d+', line)
    timestamp = stamp.group(0) if stamp else '00:00'

    line = line.replace(timestamp + ': ', '')
    line = line.replace("'s", '')
    # zero-pad to four digits so that date + time always has the same number
    # of digits and therefore sorts correctly as an integer
    timestamp = timestamp.replace(':', '').zfill(4)

    # doctor the line: isolate commas and periods as separate tokens
    line = line.replace(',', ' , ')
    line = line.replace('.', ' . ')

    # only the text before the first period is analysed
    lead = line.split('. ')[0]

    # new cases (kept as digit strings here; the dataframe step casts them)
    cases = _count_before('case', lead)
    if cases is not None:
        new_cases = cases

    first_n = re.search(r'First\s(\d+)\scase', lead)
    if first_n:
        new_cases = first_n.group(1)

    if re.search(r'First\s+case', lead):
        new_cases = 1

    # deaths
    deaths = _count_before('death', lead)
    if deaths is not None:
        new_deaths = deaths

    # nothing to record: skip before doing the (expensive) place lookups
    if new_cases == 0 and new_deaths == 0:
        continue

    # get country, subregion (state / province etc) and city: first name in
    # each list that occurs in the text followed by a space
    country = next((x for x in countries_l if x + ' ' in lead), 'unknown')
    subregion = next((x for x in regions_l if x + ' ' in lead), 'unknown')
    city = next((x for x in cities_l if x + ' ' in lead), 'unknown')

    # try to guess country based on subregion
    if country == 'unknown' and subregion != 'unknown':
        country = world_data.loc[world_data['subcountry'] == subregion, 'country'].values[0]

    # try to guess country based on city
    if country == 'unknown' and city != 'unknown':
        country = world_data.loc[world_data['name'] == city, 'country'].values[0]

    # try to guess subregion based on city
    if subregion == 'unknown' and city != 'unknown':
        subregion = world_data.loc[world_data['name'] == city, 'subcountry'].values[0]

    # give up if country still unknown
    if country == 'unknown':
        # inform on failures
        print('@ ' + line)
        continue

    # a record that appears before any date header cannot be placed in time
    if current_date is None:
        print('@ ' + line)
        continue

    parsed['date'].append(current_date)
    parsed['country'].append(country)
    parsed['region'].append(subregion)
    parsed['timestamp'].append(timestamp)
    parsed['time'].append(str(current_date) + str(timestamp))
    parsed['new_cases'].append(new_cases)
    parsed['new_deaths'].append(new_deaths)
    
    #print('%s\t%s\t%s\t%s\t%s\t%s' %(current_date,timestamp,country,subregion, new_cases, new_deaths))

### Prepare for plotting

#### Data into dataframe

In [None]:
# create dataframe for further use and convert the parsed strings to numbers.
# int64 is requested explicitly: `time` holds 12-digit YYYYMMDDHHMM values,
# which do not fit in 32 bits, and plain `int` is only 32 bits wide on some
# platforms (Windows with NumPy < 2).
df = pd.DataFrame(parsed).astype({'new_cases' : 'int64',
                                  'new_deaths' : 'int64',
                                  'date' : 'int64',
                                  'time' : 'int64',
                                  'timestamp' : 'int64'
                                 })

#### The following 2 functions do the plotting.

In [None]:
def render_sub_frame(params=None):
    '''Draw one choropleth panel of cumulative counts up to a point in time.

    `params` is a dict with the keys:
      color_min, color_max : limits of the colour scale, in log10 units
      cmap                 : colormap passed to matplotlib
      feature              : column of `data` to accumulate
      time_point           : integer date+time; rows with time <= this count
      data                 : dataframe with 'time', 'country' and the feature
      map_data             : geodataframe with 'geounit' and 'geometry'
      ax                   : matplotlib axes to draw on
      annot                : when true, write the raw total on each shape
      title_annot          : text inserted into the panel title

    The map is coloured by log10 of the cumulative total; shapes with a total
    of zero (or without data) get the value 0.
    '''
    feature = params['feature']
    raw_feature = feature + '_raw'
    time_point = params['time_point']
    ax = params['ax']

    # normalize colormap; the colormap object is taken from a ScalarMappable
    # built with the same norm
    norm = mpl.colors.Normalize(vmin=params['color_min'], vmax=params['color_max'])
    cmap = mpl.cm.ScalarMappable(norm=norm, cmap=params['cmap']).cmap

    # cumulative total per country up to (and including) the time point
    data = params['data']
    totals = data.loc[data['time'] <= time_point].groupby('country')[feature].sum()

    # merge w/ map data: shapes whose geounit has no total get 0
    geodata = params['map_data'].copy()
    raw_counts = geodata['geounit'].map(totals).fillna(0).astype(float)
    geodata[raw_feature] = raw_counts
    # log10 only of positive totals; a zero total would give -inf, so it is
    # replaced by 1 first (log10(1) == 0)
    geodata[feature] = np.log10(raw_counts.where(raw_counts > 0, 1))

    gplot.choropleth(geodata,
                     hue=feature,
                     cmap=cmap,
                     ax=ax,
                     norm=norm,
                     legend = True,
                    )

    if params['annot']:
        # write the raw (non-log) total at the centroid of every shape
        for ix, row in geodata.iterrows():
            x0, y0 = row['geometry'].centroid.coords[0][:2]
            ax.text(x0, y0, int(row[raw_feature]), fontsize=10)

    # derive date and time for plot title from the YYYYMMDDHHMM number
    stamp = str(time_point)
    str_date = stamp[0:4] + '-' + stamp[4:6] + '-' + stamp[6:8]
    str_time = stamp[8:]
    ax.set_title('COVID19 '+ params['title_annot'] +' over time ' + str_date + ' ' + str_time)

In [None]:
def plot_by_region(data=None,
                   start_date=None,
                   end_date=None,
                   timestep=200,
                   dims=(12,12),
                   dpi=150,
                   annot=True,
                   save_path=None,
                   mode='global',
                   country=None,
                   params=None,
                   verbose=True):
    '''Render a sequence of map frames with render_sub_frame and save them.

    For every date in `data` between `start_date` and `end_date` (inclusive,
    defaulting to the first / last date present) and every time of day in
    range(0, 2500, timestep), one figure with len(params) stacked panels is
    drawn and written to save_path + '<date><time>.png'.

    data       : dataframe with 'date', 'time', 'country', 'region' columns
    timestep   : step between frames as an HHMM number (200 means 2 hours)
    dims, dpi  : figure size and resolution of the saved image
    annot      : passed on to every panel
    save_path  : prefix for the output file names
    mode       : 'global' aggregates by country; 'country' restricts the data
                 to `country` and aggregates by its regions; any other value
                 produces empty panels
    params     : list of parameter dicts, one per panel (see
                 render_sub_frame); the dicts are not modified
    verbose    : print each file name as it is written
    '''

    dates = np.sort(data['date'].unique())
    if len(dates) == 0:
        # nothing to plot
        return

    if start_date is None:
        start_date = dates[0]

    if end_date is None:
        end_date = dates[-1]

    dates = dates[(dates >= start_date) & (dates <= end_date)]
    n_plots = len(params)

    # the data handed to the panels does not change between frames
    if mode == 'country':
        # regions of the chosen country take the place of the countries
        frame_data = data.loc[data['country'] == country].rename(
            columns={'country' : 'cnt', 'region' : 'country'})
    else:
        frame_data = data

    for date in dates:
        for hhmm in range(0, 2500, timestep):
            # prepare the date+time number, time zero-padded to four digits
            this_time = int(str(date) + str(hhmm).zfill(4))

            # squeeze=False: always an array of axes, even for one panel
            fig, axs = plt.subplots(n_plots, figsize=dims, squeeze=False)

            for ax, fdict in zip(axs.ravel(), params):
                # per-frame copy so the caller's dict is left untouched
                frame_params = dict(fdict,
                                    time_point=this_time,
                                    annot=annot,
                                    ax=ax,
                                    data=frame_data)

                # render map
                if mode in ('global', 'country'):
                    render_sub_frame(params=frame_params)

            fig.tight_layout()
            fname = save_path + str(this_time) + '.png'
            if verbose:
                print(fname)

            fig.savefig(fname, dpi=dpi)
            fig.clf()
            plt.close(fig)

            # maybe running into RAM problems because creating too many figures
            gc.collect()

### Plot stuff

In [None]:
# some defs to use later
# feature_* name the dataframe column that is accumulated, title_* the text
# shown in the plot title.
feature_spread='new_cases'
title_spread = 'cumulative cases'

feature_deaths='new_deaths'
title_deaths='cumulative deaths'

# The color_max_* values below are upper limits of the colour scale in log10
# units (render_sub_frame colours the maps by log10 of the cumulative count);
# the trailing comment gives the corresponding raw count.

# Europe
color_max_europe_spread=4  # 10,000
color_max_europe_deaths=3   # 1,000

# South America
color_max_samerica_spread=2.7  # ~500
color_max_samerica_deaths=1   # 10

# world
color_max_world_spread=4    # 10,000
color_max_world_deaths=3 # 1,000

# USA
color_max_usa_spread=3   # 1,000
color_max_usa_deaths=1   # 10

# Africa
color_max_africa_spread=2   # 100
color_max_africa_deaths=1   # 10

# China
color_max_china_spread=4    # 10,000
color_max_china_deaths=3    # 1,000

# Asia
color_max_asia_spread=4    # 10,000
color_max_asia_deaths=3    # 1,000

# other stuff
# figure sizes and resolution of the saved frames
FIG_DIMS_WORLD=(12,12)
FIG_DIMS_AFRICA_SAMERICA=(8,12)
FIG_DPI=150
# step between frames, written as an HHMM number
TIMESTEP=400 # this is 4 hours, not 400 minutes
#TIMESTEP=100  # 1 hour
#TIMESTEP=200 # 2 hours

In [None]:
# define various parameter dictionaries for plotting
def _plot_params(map_data, color_max, cmap, feature, title_annot):
    '''Build one panel description: colour scale from 0 to `color_max`,
    colormap, map layer, data column to accumulate and title text.'''
    return {
        'color_min' : 0,
        'color_max' : color_max,
        'cmap' : cmap,
        'map_data' : map_data,
        'feature': feature,
        'title_annot' : title_annot
    }


def _spread_params(map_data, color_max):
    '''Panel showing cumulative cases in green.'''
    return _plot_params(map_data, color_max, 'Greens', feature_spread, title_spread)


def _death_params(map_data, color_max):
    '''Panel showing cumulative deaths in red.'''
    return _plot_params(map_data, color_max, 'Reds', feature_deaths, title_deaths)


europe_spread_dict = _spread_params(europe, color_max_europe_spread)
europe_death_dict = _death_params(europe, color_max_europe_deaths)

samerica_spread_dict = _spread_params(samerica, color_max_samerica_spread)
samerica_death_dict = _death_params(samerica, color_max_samerica_deaths)

world_spread_dict = _spread_params(world, color_max_world_spread)
world_death_dict = _death_params(world, color_max_world_deaths)

usa_spread_dict = _spread_params(usa, color_max_usa_spread)
usa_death_dict = _death_params(usa, color_max_usa_deaths)

africa_spread_dict = _spread_params(africa, color_max_africa_spread)
africa_death_dict = _death_params(africa, color_max_africa_deaths)

china_spread_dict = _spread_params(china, color_max_china_spread)
china_death_dict = _death_params(china, color_max_china_deaths)

asia_spread_dict = _spread_params(asia, color_max_asia_spread)
asia_death_dict = _death_params(asia, color_max_asia_deaths)

In [None]:
# Render the Asia frames, one panel for cases and one for deaths, and write
# them to ./asia/
plot_by_region(
    data=df,
    params=[asia_spread_dict, asia_death_dict],
    mode='global',
    #country='China',
    #country='United States',
    #start_date=20200317,
    timestep=TIMESTEP,
    dims=FIG_DIMS_WORLD,
    dpi=FIG_DPI,
    annot=True,
    save_path='./asia/',
)