# Appendix :: Code

## Library Imports

In [2]:
import json
import urllib.request
import sys
import os
import importlib
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import matplotlib
import bisect
import math
from bs4 import BeautifulSoup
import time
import warnings

In [5]:
import fetch_data

ModuleNotFoundError: No module named 'fetch_data'

## Functions

### Utility functions

In [6]:
def reload_module(module,
                 user_module_path = './'):
    # Block to handle reloading user modules
    module_path = os.path.abspath(os.path.join(user_module_path))
    if module_path not in sys.path:
        sys.path.append(module_path)
    importlib.reload(module)
    fetch_data.test_refresh()

In [109]:
def set_warnings(level):
    # Disable warnings
    warnings.filterwarnings(level)

### Plot functions

In [110]:
def plot_df(df, days=30):
    plt.title("Per day")
    plt.plot(df.index[-days:], df['confirmed'][-days:], label="Confirmed")
    plt.plot(df.index[-days:], df['deaths'][-days:], label="Deaths")
    plt.plot(df.index[-days:], df['recovered'][-days:], label="Recovered")
    plt.legend()
    plt.show()
    
#     plt.title("2nd diff")
#     plt.plot(df.index[-days:], df['confirmed_diff_2'][-days:], label="Confirmed")
#     plt.plot(df.index[-days:], df['deaths_diff_2'][-days:], label="Deaths")
#     plt.plot(df.index[-days:], df['recovered_diff_2'][-days:], label="Recovered")
#     plt.legend()
#     plt.show()

    plt.title("Cumulative, Linear")
    plt.plot(df.index[-days:], df['confirmed_total'][-days:], label="Confirmed")
    plt.plot(df.index[-days:], df['deaths_total'][-days:], label="Deaths")
    plt.plot(df.index[-days:], df['recovered_total'][-days:], label="Recovered")
    plt.legend()
    plt.show()

    plt.title("Cumulative, Log")
    plt.plot(df.index[-days:], df['confirmed_log_total'][-days:], label="Confirmed")
    plt.plot(df.index[-days:], df['deaths_log_total'][-days:], label="Deaths")
    plt.plot(df.index[-days:], df['recovered_log_total'][-days:], label="Recovered")
    plt.legend()
    plt.show()
    
    plt.title("Per day, Log")
    plt.plot(df.index[-days:], df['confirmed_log'][-days:], label="Confirmed")
    plt.plot(df.index[-days:], df['deaths_log'][-days:], label="Deaths")
    plt.plot(df.index[-days:], df['recovered_log'][-days:], label="Recovered")
    plt.legend()
    plt.show()

In [111]:
def plot_set(series,
             panels = 3,
             col_width  = 7,
             row_height = 5,
             fig_title="Title"):
    # Calculate the number of rows
    row_max, col_max, del_cells = calc_maxes(len(series), panels)
    fig, axs = plt.subplots(row_max, col_max)
    fig.set_size_inches(col_max * col_width, row_max * row_height)
    fig.suptitle(fig_title)

    row_idx = 0
    col_idx = 0

    # Handle single row case
    if (row_max == 1):
        for s in series:
            axs[col_idx].plot(s)
            col_idx += 1
    # Multirow case
    else:
        for s in series:
            axs[row_idx, col_idx].plot(s)
            col_idx += 1
            if (col_idx == col_max):
                row_idx += 1
                col_idx = 0

    # Hide extra panels
    offset = col_max - del_cells
    for i in range(0, del_cells):
        axs[row_max-1, offset+i].axis('off')  


In [112]:
def rebase_series(series, threshold=0, ret_idx = False, trim_idx = -1):
    if (trim_idx == -1):
        trim_idx = bisect.bisect_left(series, threshold)
    trim_series = np.array(list(series[trim_idx:].values) + [np.nan]*trim_idx)
    if ret_idx:
        return trim_series, trim_idx
    else:
        return trim_series

In [113]:
def threshold_plot(df,
                   countries,
                   threshold=25,
                   x_lim=45,
                   y_lim=None,
                   log_var = False):
    plt.rcParams.update({'font.size': 14})
    plt.subplots_adjust(wspace=0.4)
    fig, axs = plt.subplots(1, 2)
    fig.set_size_inches(16, 8)
    fig.suptitle("Cross Country comparisons of growth rates")
    
    plot_var = "deaths_total"
    if log_var:
        threshold = math.log(threshold)
    #plt.figure(figsize=(10,7))
    #plt.title(f"Total coronavirus deaths for places with at least {threshold} deaths")
    
    deaths_threshold = 25
    confirmed_threshold = 1000
    
    #y_ticks = [1, 2000-threshold, 5000-threshold, 20000-threshold]
    #y_tick_labels = [threshold, 2000, 5000, 20000]
#     max_region_st = 0
    for c in countries:
        country_df = df.loc[df["Country/Region"]==c]
        
        deaths_rebase = rebase_series(country_df['deaths_total'], threshold=deaths_threshold)
        axs[0].plot(deaths_rebase-deaths_threshold, label=c)
        
        confirmed_rebase = rebase_series(country_df['confirmed_total'], threshold=confirmed_threshold)
        axs[1].plot(confirmed_rebase-confirmed_threshold, label=c)
#         if region_st > max_region_st:
#             max_region_st = region_st
    #print(f"region_st {len(region_st)}")
        x = np.linspace(0,len(region_st),len(region_st))

        # Add scale lines
    for idx in [0,1]:
        axs[idx].plot(np.power(2,x), color="#666666")
        axs[idx].plot(np.power(2,x/2), color="#666666")
        axs[idx].plot(np.power(2,x/3), color="#666666")
        axs[idx].plot(np.power(2,x/7), color="#666666")
        axs[idx].plot(np.power(2,x/30), color="#666666")
        axs[idx].set_xlim((0,x_lim))
        axs[idx].set_yscale("log", basey=2)                                 
        axs[idx].legend(loc="upper right")                         
        
    axs[0].set_xlabel(f"Days since {deaths_threshold}th death")
    axs[1].set_xlabel(f"Days since {confirmed_threshold}th confirmed case")

    if y_lim is not None:
        axs[0].set_ylim((1,y_lim))
        axs[1].set_ylim((1,y_lim*8))

    axs[0].set_title(f"Total Deaths")
    axs[1].set_title(f"Total Confirmed Cases")

    axs[0].set_yticks([1,100-deaths_threshold,200-deaths_threshold,500-deaths_threshold,
                       1000-deaths_threshold,2000-deaths_threshold,5000-deaths_threshold,
                       10_000-deaths_threshold,20_000-deaths_threshold])
    axs[0].set_yticklabels([deaths_threshold,100,200,500,1000,2000,5000,10_000,20_000])

    axs[1].set_yticks([1,#100-confirmed_threshold,200-confirmed_threshold,500-confirmed_threshold,
                       2000-confirmed_threshold,5000-confirmed_threshold,
                       10_000-confirmed_threshold,20_000-confirmed_threshold,50_000-confirmed_threshold,
                       100_000-confirmed_threshold,200_000-confirmed_threshold,500_000-confirmed_threshold,
                       1_000_000-confirmed_threshold])
    axs[1].set_yticklabels([confirmed_threshold,2000,5000,10_000,20_000,50_000,100_000,
                            200_000,500_000,1_000_000])
    #axs[idx].get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
    plt.savefig(f'figures/country_set_timeComp_{threshold}.png')
#         plt.show()

In [114]:
def plot_panel(axs, row_idx, col_idx, country_df, plot_vars, threshold, title, log_scale = False, mean_window=1):
    first = True
    for plot_var in plot_vars.keys():
        if first:
            series, trim_idx = rebase_series(country_df[plot_var], threshold=threshold, ret_idx = True)
            first = False
        else:
            series = rebase_series(country_df[plot_var], trim_idx=trim_idx)
        
        series = pd.Series(series).rolling(window=mean_window).mean()
        axs[row_idx, col_idx].plot(series, label=plot_vars[plot_var])
    if log_scale:
        axs[row_idx, col_idx].set_yscale("log", basey=2)                                 
    axs[row_idx, col_idx].set_title(title)
    if row_idx == 1:
        axs[row_idx, col_idx].set_xlabel(f"Days since {threshold}th case")
    axs[row_idx, col_idx].legend(loc='upper left')

In [115]:
def plot_country(full_df, country, threshold, mean_window=3):
    fig, axs = plt.subplots(2, 2)
    plt.rcParams.update({'font.size': 12})
    #plt.subplots_adjust(wspace=0.4,hspace=1.3)
    fig.set_size_inches(16, 10)
    fig.suptitle(f'{country} :: Comparison over time')
    country_df = full_df.loc[full_df["Country/Region"]==country]
    daily_vars = { 'confirmed' : "New Cases",
                 'deaths' : "New Deaths",
                 'recovered' : "New Recovered",
                 'net_cases' : "Net Cases"    
    }
    plot_panel(axs, 0, 0, country_df, daily_vars, threshold, 
               "Daily", mean_window=mean_window)
    plot_panel(axs, 1, 0, country_df, daily_vars, threshold, 
               "Daily (Log)", mean_window=mean_window, log_scale=True)
    cum_vars = { 'confirmed_total' : "Total Confirmed Cases",
                 'deaths_total' : "Total Deaths",
                 'recovered_total' : "Recovered",
                 'active_cases' : "Active Cases"    
    }
    plot_panel(axs, 0, 1, country_df, cum_vars, threshold,
               "Cumulative")
    plot_panel(axs, 1, 1, country_df, cum_vars, threshold,
               "Cumulative (Log)", log_scale=True)
    plt.savefig(f'figures/{country}_timeComp_{threshold}.png')

In [116]:
def plot_country_list(full_df, countries, threshold=25):
    for country in countries:
        plot_country(full_df, country, threshold)

In [117]:
def plot_log_growth(full_df, countries, mean_window=3, days=45):
    #plot_var = 'deaths_log_growth'
    #threshold= 0.5
    # mean_window = 3
    # days = 45

    fig, axs = plt.subplots(1, 2)
    plt.rcParams.update({'font.size': 12})
    #plt.subplots_adjust(wspace=0.4,hspace=1.3)
    fig.set_size_inches(16, 6)

    labels = labels = [i * 15 for i in range(0, days // 15 + 1)]
    for country in countries:
        country_df = full_df.loc[full_df['Country/Region'] == country]
        death_series = country_df['deaths_log_growth'].rolling(window=mean_window).mean()
        confirmed_series = country_df['confirmed_log_growth'].rolling(window=mean_window).mean()
        #series = rebase_series(country_df[plot_var], threshold=threshold)
        axs[0].plot(death_series[-days:], label=country)
        axs[1].plot(confirmed_series[-days:], label=country)
    axs[0].set_title(f'Deaths :: Log Growth (last {days} days)')
    axs[1].set_title(f'Confirmed Cases :: Log Growth (last {days} days)')
    for idx in [0,1]:
        axs[idx].set_xticks(labels)
        axs[idx].set_xticklabels(labels)
    axs[1].legend()
    plt.savefig(f'figures/country_logGrowth_{days}.png')

In [118]:
def plot_normalised(full_df, countries, mean_window=3, days=60):
    fig, axs = plt.subplots(1, 2)
    plt.rcParams.update({'font.size': 12})
    #plt.subplots_adjust(wspace=0.4,hspace=1.3)
    fig.set_size_inches(16, 6)

    labels = [i * 15 for i in range(0, days // 15 + 1)]
    for country in countries:
        country_df = full_df.loc[full_df['Country/Region'] == country]
        death_series = country_df['confirmed_total_norm']
        confirmed_series = country_df['deaths_total_norm']
        #series = rebase_series(country_df[plot_var], threshold=threshold)
        axs[0].plot(death_series[-days:], label=country)
        axs[1].plot(confirmed_series[-days:], label=country)
    axs[0].set_title(f'Deaths, per Million population :: Log Growth (last {days} days)')
    axs[1].set_title(f'Confirmed Cases, per Million population :: Log Growth (last {days} days)')
    for idx in [0,1]:
        axs[idx].set_yscale("log", basey=10) 
        axs[idx].set_xticks(labels)
        axs[idx].set_xticklabels(labels)
    axs[1].legend()
    plt.savefig(f'figures/country_normPop_{days}.png')

## Fetch data

In [548]:
set_warnings('default')

In [549]:
reload_module(fetch_data)

Refreshed at 2020-04-20 17:52:58.811079


In [550]:
fetch_data.map_name_to_ISOcode('World')

>> File  [ ISO_codes.csv ] found. Checking age... 
	+ File  [ ISO_codes.csv ] not expired. Load from disk...


['WL', 'WLD', '000']

---
Fetch global corona virus numbers

In [551]:
PROJECT_FOLDER = '/Users/richie/Dropbox/Records/Education/2019-20_BGSE_MScDataScience/Modules/DS_Trimester02/DataVisualisation/PartB/Project/dashboard/minimal_dash_flask'
# APP_PATH = "dashboard/minimal_dash_flask/"
if os.getcwd() != PROJECT_FOLDER:
    #os.chdir(APP_PATH)
    os.chdir(PROJECT_FOLDER)
    

In [552]:
# country = 'Afghanistan'
# m_df = cv_merged_df.copy()

# first_date = m_df.columns.get_loc("1/22/20")
# sf = m_df.loc[(m_df['Country/Region'] == country) &
#                     (m_df['Province/State'].isna())]
# sf = sf.iloc[:,first_date:].T

# sf['Country/Region'] = country
# codes = map_name_to_ISOcode(country)
# sf['ISO3166_alpha2'] = codes[0]
# sf['ISO3166_alpha3'] = codes[1]
# sf['ISO3166_numeric'] = codes[2]

In [553]:
full_df, cv_merged_df, iso_codes_df, indicator_df = fetch_data.fetch_all(purge=True)


>> Purging cached files...
	+ Removing file [ assets/full_df.csv ]...
	+ Removing file [ assets/merged_df_global.csv ]...
	+ Removing file [ assets/coronaVirus_global.csv ]...
	+ File [ assets/coronaVirus_US.csv ] not found. Cannot remove...
	+ Removing file [ assets/ISO_codes.csv ]...
	+ Removing file [ assets/wb_indicators.csv ]...
>> File [ merged_df_global.csv ] not found. Fetching...
>> File [ coronaVirus_global.csv ] not found. Fetching...
>> File [ ISO_codes.csv ] not found. Fetching...
>> Before name fixes:
	+ [ Bahamas ] not found...
	+ [ Congo (Brazzaville) ] not found...
	+ [ Congo (Kinshasa) ] not found...
	+ [ Diamond Princess ] not found...
	+ [ Gambia ] not found...
	+ [ Holy See ] not found...
	+ [ Taiwan* ] not found...
	+ [ US ] not found...
	+ [ West Bank and Gaza ] not found...
	+ [ MS Zaandam ] not found...
>> After name fixes:
	+ [ Other ] not found...
>> Before summarise countries:


  result = getattr(ufunc, method)(*inputs, **kwargs)
  out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]


	+ [Australia] not found...
	+ [Canada] not found...
	+ [Other] not found...
	+ [China] not found...
>> After summarise countries:
	+ No failures found.
>> Fetching World Bank Indicators...
>> File [ wb_indicators.csv ] not found. Fetching...
>> Merging data...
	+ Saving to [ assets/merged_df_global.csv ]
>> File [ full_df.csv ] not found. Fetching...
>> Building full df...
	+ Complete.


Groups
+ Date
+ Country information
    + Country Name
    + Province / State
    + Continent
    + International Organisation
    + OECD
    + G8
+ Confirmed / Deaths / Recovered / Active
    + Cumulative
    + Delta
    + Cumulative norm
    + Log cumulative
    + Log delta


In [554]:
# full_df_bak = full_df.copy()

In [555]:
full_df = full_df_bak.copy()

In [556]:
full_df

Unnamed: 0,confirmed_total,deaths_total,recovered_total,Province/State,Country/Region,ISO3166_alpha2,ISO3166_alpha3,ISO3166_numeric,confirmed,confirmed_total_norm,...,deaths_log_total,deaths_log,deaths_log_growth,recovered,recovered_total_norm,recovered_log_total,recovered_log,recovered_log_growth,net_cases,active_cases
1/22/20,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,,0.000000,...,-inf,,,,0.000000,-inf,,,,
1/23/20,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,-inf,,,0.0,0.000000,-inf,,,0.0,0.0
1/24/20,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,-inf,,,0.0,0.000000,-inf,,,0.0,0.0
1/25/20,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,-inf,,,0.0,0.000000,-inf,,,0.0,0.0
1/26/20,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,-inf,,,0.0,0.000000,-inf,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4/15/20,2052844.0,134105.0,509266.0,All,World,WL,WLD,000,79822.0,273.382437,...,17.033004,0.090867,0.065010,36481.0,67.820244,18.958060,0.107235,0.077162,35155.0,1408963.0
4/16/20,2149157.0,143724.0,540332.0,All,World,WL,WLD,000,96313.0,286.208683,...,17.132941,0.099938,0.071727,31066.0,71.957382,19.043487,0.085427,0.061002,55628.0,1464591.0
4/17/20,2236854.0,153742.0,566472.0,All,World,WL,WLD,000,87697.0,297.887515,...,17.230152,0.097210,0.069703,26140.0,75.438512,19.111645,0.068159,0.048378,51539.0,1516130.0
4/18/20,2314376.0,159427.0,590406.0,All,World,WL,WLD,000,77522.0,308.211316,...,17.282536,0.052385,0.036978,23934.0,78.625863,19.171348,0.059703,0.042251,47903.0,1564033.0


In [534]:
full_df['Date'] = pd.to_datetime(full_df.index)
full_df.reset_index(inplace=True, drop=True)

In [535]:
full_df

Unnamed: 0,confirmed_total,deaths_total,recovered_total,Province/State,Country/Region,ISO3166_alpha2,ISO3166_alpha3,ISO3166_numeric,confirmed,confirmed_total_norm,...,deaths_log,deaths_log_growth,recovered,recovered_total_norm,recovered_log_total,recovered_log,recovered_log_growth,net_cases,active_cases,Date
0,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,,0.000000,...,,,,0.000000,-inf,,,,,2020-01-22
1,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.000000,-inf,,,0.0,0.0,2020-01-23
2,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.000000,-inf,,,0.0,0.0,2020-01-24
3,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.000000,-inf,,,0.0,0.0,2020-01-25
4,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.000000,-inf,,,0.0,0.0,2020-01-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16371,2052844.0,134105.0,509266.0,All,World,WL,WLD,000,79822.0,273.382437,...,0.090867,0.065010,36481.0,67.820244,18.958060,0.107235,0.077162,35155.0,1408963.0,2020-04-15
16372,2149157.0,143724.0,540332.0,All,World,WL,WLD,000,96313.0,286.208683,...,0.099938,0.071727,31066.0,71.957382,19.043487,0.085427,0.061002,55628.0,1464591.0,2020-04-16
16373,2236854.0,153742.0,566472.0,All,World,WL,WLD,000,87697.0,297.887515,...,0.097210,0.069703,26140.0,75.438512,19.111645,0.068159,0.048378,51539.0,1516130.0,2020-04-17
16374,2314376.0,159427.0,590406.0,All,World,WL,WLD,000,77522.0,308.211316,...,0.052385,0.036978,23934.0,78.625863,19.171348,0.059703,0.042251,47903.0,1564033.0,2020-04-18


In [536]:
full_df.loc[(full_df['Country/Region']=='United States') &
            (full_df['Date']=='2020-04-19')]

Unnamed: 0,confirmed_total,deaths_total,recovered_total,Province/State,Country/Region,ISO3166_alpha2,ISO3166_alpha3,ISO3166_numeric,confirmed,confirmed_total_norm,...,deaths_log,deaths_log_growth,recovered,recovered_total_norm,recovered_log_total,recovered_log,recovered_log_growth,net_cases,active_cases,Date
13883,759086.0,40661.0,70337.0,All,United States,US,USA,840,26889.0,2323.584458,...,0.072655,0.05165,5497.0,215.303615,16.101996,0.1174,0.084778,19395.0,648087.0,2020-04-19


In [537]:
merge_columns = [ 'ISO3166_alpha3', 'population', 'gdp_total',
                  'lifeExp_male', 'lifeExp_female', 'DisplayName']
cv_df = cv_df[merge_columns]
full_df = pd.merge(full_df, cv_df, how='left', on='ISO3166_alpha3')
# Fix the world name
#full_df.loc[full_df["ISO3166_alpha3"] == 'WLD', "Name"] = 'World'


In [538]:
full_df

Unnamed: 0,confirmed_total,deaths_total,recovered_total,Province/State,Country/Region,ISO3166_alpha2,ISO3166_alpha3,ISO3166_numeric,confirmed,confirmed_total_norm,...,recovered_log,recovered_log_growth,net_cases,active_cases,Date,population,gdp_total,lifeExp_male,lifeExp_female,DisplayName
0,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,,0.000000,...,,,,,2020-01-22,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
1,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,,0.000000,...,,,,,2020-01-22,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
2,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,,0.000000,...,,,,,2020-01-22,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
3,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.0,2020-01-23,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
4,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.0,2020-01-23,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69326,2314376.0,159427.0,590406.0,All,World,WL,WLD,000,77522.0,308.211316,...,0.059703,0.042251,47903.0,1564033.0,2020-04-18,7.509056e+09,8.451517e+13,12424.455,13272.667,
69327,2314376.0,159427.0,590406.0,All,World,WL,WLD,000,77522.0,308.211316,...,0.059703,0.042251,47903.0,1564033.0,2020-04-18,7.509056e+09,8.451517e+13,12424.455,13272.667,
69328,2397958.0,164955.0,621959.0,All,World,WL,WLD,000,83582.0,319.342143,...,0.075112,0.053443,46501.0,1610534.0,2020-04-19,7.509056e+09,8.451517e+13,12424.455,13272.667,
69329,2397958.0,164955.0,621959.0,All,World,WL,WLD,000,83582.0,319.342143,...,0.075112,0.053443,46501.0,1610534.0,2020-04-19,7.509056e+09,8.451517e+13,12424.455,13272.667,


In [539]:
tmp_df3 = full_df.drop_duplicates()

In [541]:
tmp_df3

Unnamed: 0,confirmed_total,deaths_total,recovered_total,Province/State,Country/Region,ISO3166_alpha2,ISO3166_alpha3,ISO3166_numeric,confirmed,confirmed_total_norm,...,recovered_log,recovered_log_growth,net_cases,active_cases,Date,population,gdp_total,lifeExp_male,lifeExp_female,DisplayName
0,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,,0.000000,...,,,,,2020-01-22,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
3,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.0,2020-01-23,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
6,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.0,2020-01-24,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
9,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.0,2020-01-25,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
12,0.0,0.0,0.0,All,Afghanistan,AF,AFG,004,0.0,0.000000,...,,,0.0,0.0,2020-01-26,3.717239e+07,1.936297e+10,63.047,66.026,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69316,2052844.0,134105.0,509266.0,All,World,WL,WLD,000,79822.0,273.382437,...,0.107235,0.077162,35155.0,1408963.0,2020-04-15,7.509056e+09,8.451517e+13,12424.455,13272.667,
69319,2149157.0,143724.0,540332.0,All,World,WL,WLD,000,96313.0,286.208683,...,0.085427,0.061002,55628.0,1464591.0,2020-04-16,7.509056e+09,8.451517e+13,12424.455,13272.667,
69322,2236854.0,153742.0,566472.0,All,World,WL,WLD,000,87697.0,297.887515,...,0.068159,0.048378,51539.0,1516130.0,2020-04-17,7.509056e+09,8.451517e+13,12424.455,13272.667,
69325,2314376.0,159427.0,590406.0,All,World,WL,WLD,000,77522.0,308.211316,...,0.059703,0.042251,47903.0,1564033.0,2020-04-18,7.509056e+09,8.451517e+13,12424.455,13272.667,


In [540]:
tmp_df3.loc[tmp_df3['ISO3166_alpha3']=='USA']

Unnamed: 0,confirmed_total,deaths_total,recovered_total,Province/State,Country/Region,ISO3166_alpha2,ISO3166_alpha3,ISO3166_numeric,confirmed,confirmed_total_norm,...,recovered_log,recovered_log_growth,net_cases,active_cases,Date,population,gdp_total,lifeExp_male,lifeExp_female,DisplayName
61588,1.0,0.0,0.0,All,United States,US,USA,840,,0.003061,...,,,,,2020-01-22,326687501.0,2.054434e+13,76.1,81.1,United States
61591,1.0,0.0,0.0,All,United States,US,USA,840,0.0,0.003061,...,,,0.0,0.0,2020-01-23,326687501.0,2.054434e+13,76.1,81.1,United States
61594,2.0,0.0,0.0,All,United States,US,USA,840,1.0,0.006122,...,,,1.0,1.0,2020-01-24,326687501.0,2.054434e+13,76.1,81.1,United States
61597,2.0,0.0,0.0,All,United States,US,USA,840,0.0,0.006122,...,,,0.0,1.0,2020-01-25,326687501.0,2.054434e+13,76.1,81.1,United States
61600,5.0,0.0,0.0,All,United States,US,USA,840,3.0,0.015305,...,,,3.0,4.0,2020-01-26,326687501.0,2.054434e+13,76.1,81.1,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61840,636350.0,28325.0,52096.0,All,United States,US,USA,840,28680.0,1947.885971,...,0.125279,0.090719,21853.0,555928.0,2020-04-15,326687501.0,2.054434e+13,76.1,81.1,United States
61843,667592.0,32916.0,54703.0,All,United States,US,USA,840,31242.0,2043.518647,...,0.070447,0.050042,24044.0,579972.0,2020-04-16,326687501.0,2.054434e+13,76.1,81.1,United States
61846,699706.0,36773.0,58545.0,All,United States,US,USA,840,32114.0,2141.820541,...,0.097926,0.070234,24415.0,604387.0,2020-04-17,326687501.0,2.054434e+13,76.1,81.1,United States
61849,732197.0,38664.0,64840.0,All,United States,US,USA,840,32491.0,2241.276442,...,0.147338,0.107524,24305.0,628692.0,2020-04-18,326687501.0,2.054434e+13,76.1,81.1,United States


In [316]:
for col in cv_merged_df.columns:
    print(col)

Province/State
Country/Region
Latitude
Longitude
Counter
UID
FIPS
CountyName
FullName
GEC
ISO3166_alpha2
ISO3166_alpha3
ISO3166_numeric
STANAG
ccTLDs
name
notes
population
population_notes
DisplayName
gdp_total
gdp_total_notes
lifeExp_male
lifeExp_male_notes
lifeExp_female
lifeExp_female_notes
1/22/20
1/23/20
1/24/20
1/25/20
1/26/20
1/27/20
1/28/20
1/29/20
1/30/20
1/31/20
2/1/20
2/2/20
2/3/20
2/4/20
2/5/20
2/6/20
2/7/20
2/8/20
2/9/20
2/10/20
2/11/20
2/12/20
2/13/20
2/14/20
2/15/20
2/16/20
2/17/20
2/18/20
2/19/20
2/20/20
2/21/20
2/22/20
2/23/20
2/24/20
2/25/20
2/26/20
2/27/20
2/28/20
2/29/20
3/1/20
3/2/20
3/3/20
3/4/20
3/5/20
3/6/20
3/7/20
3/8/20
3/9/20
3/10/20
3/11/20
3/12/20
3/13/20
3/14/20
3/15/20
3/16/20
3/17/20
3/18/20
3/19/20
3/20/20
3/21/20
3/22/20
3/23/20
3/24/20
3/25/20
3/26/20
3/27/20
3/28/20
3/29/20
3/30/20
3/31/20
4/1/20
4/2/20
4/3/20
4/4/20
4/5/20
4/6/20
4/7/20
4/8/20
4/9/20
4/10/20
4/11/20
4/12/20
4/13/20
4/14/20
4/15/20
4/16/20
4/17/20
4/18/20
4/19/20


In [174]:
full_df.head()

Unnamed: 0,confirmed_total,deaths_total,recovered_total,Province/State,Country/Region,confirmed,confirmed_total_norm,confirmed_log_total,confirmed_log,confirmed_log_growth,...,deaths_log,deaths_log_growth,recovered,recovered_total_norm,recovered_log_total,recovered_log,recovered_log_growth,net_cases,active_cases,Date
0,0.0,0.0,0.0,All,Afghanistan,,0.0,-inf,,,...,,,,0.0,-inf,,,,,2020-01-22
1,0.0,0.0,0.0,All,Afghanistan,0.0,0.0,-inf,,,...,,,0.0,0.0,-inf,,,0.0,0.0,2020-01-23
2,0.0,0.0,0.0,All,Afghanistan,0.0,0.0,-inf,,,...,,,0.0,0.0,-inf,,,0.0,0.0,2020-01-24
3,0.0,0.0,0.0,All,Afghanistan,0.0,0.0,-inf,,,...,,,0.0,0.0,-inf,,,0.0,0.0,2020-01-25
4,0.0,0.0,0.0,All,Afghanistan,0.0,0.0,-inf,,,...,,,0.0,0.0,-inf,,,0.0,0.0,2020-01-26


### Threshold plots

In [None]:
countries = ['Spain', 'Italy', 'China', 'Korea, South', 'United States', 'Singapore']

# Project

Richard Keely

## Data Sources
For this project, the principal source used was the John Hopkins University's dataset on corona virus cases. This dataset require a surprising amount of cleaning and heavy reshaping to be usable in the form used for this project. In addition, a number of derived statistics were created to allow things like the growth rate and the active statistics to be examined. In addition, to allow for more useful comparisons, I merged it with two additional datasets, the ISO3166 codes available (via some web-scraping) from the CIA World Factbook and then a number of indicators fetched from the World Bank using an API access.

The idea was to make this into an interactive dashboard, but a rather pernicious bug in my caching functions lead to a quite literal 11th hour switch back to static graphics, which are presented below.

## Visualisations and Analysis

In [None]:
threshold_plot(full_df, countries, y_lim=2**15, x_lim=60)

This plot is a reimplementation of one from the [New York Times' Upshot blog](https://www.nytimes.com/interactive/2020/03/21/upshot/coronavirus-deaths-by-country.html). It allows for a clear view of the growth of the cumulative statistics over time. Here you can see two plots for the countries that I thought would be most interesting to look at, Spain, Italy, China, South Korea, the United States and Singapore (used in all of the plots). Here, it can be seen that the growth rates for most of the countries is now saturating, though the US has not yet reached that point and while Singapore had initially been doing quite well, it appears to now be experiencing a surge in cases.

The code for this plots everything together, so I've inserted the images instead. For all of the plots we can see the daily and cumulative view of our four main variables over time. For the daily variables, a three period rolling average was used to smooth out some of the turbulence caused by the coarseness of the data.

In [None]:
#plot_country_list(full_df, countries, threshold=25)

![China](figures/China_timeComp_25.png)

The plot for China is the most promising and most similar to the expected theory; after an initial surge, the cases are brought under control and the number of active cases decays towards zero.

![SouthKorea](figures/SouthKorea_timeComp_25.png)

South Korea is also quite promising, showing a similar trend to China, but at an earlier point in the cycle.

![Spain](figures/Spain_timeComp_25.png)

![Italy](figures/Italy_timeComp_25.png)

Spain and Italy appear, unfortunately to be still in the midst of the worst part of the curve. While the net increase in the number of active cases is decreasing, the cumulative case load still hasn't peaked.

![United States](figures/UnitedStates_timeComp_25.png)

The United States is even earlier in the cycle and, indeed merits analysis at a more granular level.

![Singapore](figures/Singapore_timeComp_25.png)

Finally Singapore which had appeared to have missed the worst of the epidemic is now experiencing a surge in cases, though whether this will be as bad as that experienced in Europe and the US remains to be seen.

In [None]:
plot_log_growth(full_df, countries, mean_window=3, days=45)

Next, I looked at the log growth of deaths and confirmed cases over time, again using a rolling average to smooth the noise. Here it can be seen that for most countries the last two weeks have been largely in the right direction after much more volatility before that as reporting and testing likely revealed more of the underlying cases. Interestingly again, Singapore appears to be in a much earlier stage of the cycle.

In [None]:
plot_normalised(full_df, countries, mean_window=3, days=75)

Finally, here I look at deaths and cases normalised by population, to see get some understanding of the the true toll of the crisis in different countries. One interesting thing from this plot is that while it took time to do so, Spain appears to be doing worse across both the deaths and confirmed cases than Italy. This suggests that Spain may not have been able to capitalise on the time and learning from Italy. In addition, it appears that the US may surpass both Spain and Italy. Here again, Singapore's later entry into the crisis is visible.

---