# Exploratory Data Analysis

In [1]:
from pathlib import Path
import re

import folium
import ipywidgets as widgets
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm

## Load all data

Here we load all datasets into a dictionary which maps the filename to the corresponding `pandas.DataFrame`.

In [2]:
# Materialise the glob into a sorted list: the load order becomes deterministic
# and tqdm can read the length, so the progress bar shows a total.
csv_paths = sorted(Path('.').glob('*.csv'))
# Maps each CSV file name to the DataFrame parsed from that file.
datasets = {f.name: pd.read_csv(f) for f in tqdm(csv_paths)}

|          | 0/? [00:00<?, ?it/s]

Let's just perform some sanity checks.

In [3]:
len(datasets)

6

In [4]:
datasets.keys()

dict_keys(['cases_country.csv', 'time_series_covid19_confirmed_global.csv', 'time_series_covid19_confirmed_US.csv', 'time_series_covid19_deaths_global.csv', 'time_series_covid19_deaths_US.csv', 'time_series_covid19_recovered_global.csv'])

## Clean the dataframes

### Rename columns for easier handling

In [5]:
# Column names that start with a date-like token (e.g. 1/22/20) are kept verbatim.
DATE_COLUMN = re.compile(r'\d{1,2}/\d{1,2}/\d{1,2}')


def _clean_column_name(column):
    """Return date columns unchanged; otherwise keep the part before the first '/',
    lower-cased and without trailing underscores."""
    if DATE_COLUMN.match(column):
        return column
    return column.split('/')[0].lower().rstrip('_')


datasets = {
    name: frame.rename(columns=_clean_column_name)
    for name, frame in datasets.items()
}

Let's check what the final column names look like.

In [6]:
# Collect every non-date column name across all frames. A negative lookahead is
# used because a character class such as [^\d{1,2}/] treats {1,2} as literal
# characters and therefore does not describe a date; the lookahead applies the
# same "starts with d/d/d" rule that the renaming step uses.
{c for df in datasets.values() for c in df.filter(regex=r'^(?!\d{1,2}/\d{1,2}/\d{1,2})').columns}

{'active',
 'admin2',
 'code3',
 'combined_key',
 'confirmed',
 'country',
 'country_region',
 'deaths',
 'fips',
 'incident_rate',
 'iso2',
 'iso3',
 'last_update',
 'lat',
 'long',
 'mortality_rate',
 'people_hospitalized',
 'people_tested',
 'population',
 'province',
 'province_state',
 'recovered',
 'uid'}

## Explore aggregated data

In [7]:
# Frame loaded from 'cases_country.csv'; the widgets, bar charts and map below read from it.
country_df = datasets['cases_country.csv']

In [8]:
@widgets.interact(num_rows=widgets.IntSlider(min=1, max=len(country_df), continuous_update=False, description='N:'))
def render_df(num_rows):
    """Display the `num_rows` countries with the most confirmed cases as a table
    whose cells are shaded with the 'Reds' colour map."""
    shown_columns = ['country_region', 'confirmed', 'deaths', 'recovered', 'active']
    ranked = country_df.sort_values(by='confirmed', ascending=False)
    top_rows = ranked.head(num_rows).loc[:, shown_columns]
    display(top_rows.style.background_gradient(cmap='Reds'))

interactive(children=(IntSlider(value=1, continuous_update=False, description='N:', max=191, min=1), Output())…

## Create plots

### Worst affected countries

The following plot shows the top N countries in terms of confirmed cases. The size of each bubble represents the number of confirmed cases. The position along the x and y axes shows the corresponding numbers of deaths and recovered cases, respectively. In the rare instances where no value was given, the missing value has been filled with 0.0.

In [9]:
# NOTE(review): this function shares its name with the table widget defined
# earlier in this notebook; the later definition shadows the earlier one.
@widgets.interact(num_rows=widgets.IntSlider(min=1, max=len(country_df), continuous_update=False, description='N:'))
def render_df(num_rows):
    """Show a bubble chart of the `num_rows` countries with the most confirmed cases.

    Deaths are on the x axis, recovered cases on the y axis, and the bubble
    size follows the confirmed count. Missing values are replaced by 0.0.
    """
    ranked = country_df.sort_values(by='confirmed', ascending=False)
    top_rows = ranked.head(num_rows).fillna(0.0)
    bubble_options = dict(
        x='deaths',
        y='recovered',
        size='confirmed',
        color='country_region',
        hover_name='country_region',
    )
    fig = px.scatter(top_rows, **bubble_options)
    fig.show()

interactive(children=(IntSlider(value=1, continuous_update=False, description='N:', max=191, min=1), Output())…

### Daily confirmed and death cases

Here we plot the time series of confirmed and death cases. We aggregate the numbers at the country level by simply adding up the numbers for the individual provinces/states of each country.

In [10]:
# Sum the rows of each country in the global confirmed and deaths time series,
# then turn the 'country' index back into an ordinary column.
confirmed_df, deaths_df = (
    datasets[csv_name].groupby('country').sum().reset_index()
    for csv_name in ('time_series_covid19_confirmed_global.csv',
                     'time_series_covid19_deaths_global.csv')
)

In [11]:
@widgets.interact(country=confirmed_df['country'].unique())
def plot_daily_cases(country):
    """Plot the confirmed and death time series of one country as a line chart.

    Parameters
    ----------
    country : str
        A value of the ``country`` column of ``confirmed_df``; it must be
        present in ``deaths_df`` as well, otherwise the lookup raises KeyError.
    """
    # Pick the date columns by name rather than by position: a positional slice
    # depends on which non-date columns survive the groupby aggregation and can
    # silently drop the first day of the series.
    date_pattern = r'^\d{1,2}/\d{1,2}/\d{1,2}$'
    ts_confirmed = confirmed_df.set_index('country').filter(regex=date_pattern).loc[country, :]
    ts_deaths = deaths_df.set_index('country').filter(regex=date_pattern).loc[country, :]
    # Left join on the date labels, so the confirmed series defines the rows.
    df = (ts_confirmed.rename('confirmed').to_frame()
                      .join(ts_deaths.rename('deaths'))
                      .rename_axis('date')
                      .reset_index())
    df['date'] = pd.to_datetime(df['date'])
    # Plot only the two value columns; 'date' is the x axis and must not be
    # passed as a y series alongside the numeric ones.
    fig = px.line(df, x='date', y=['confirmed', 'deaths'], title=country)
    fig.show()

interactive(children=(Dropdown(description='country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

### Top 10 of worst-hit countries

In [12]:
def plot_top10(metric):
    """Draw a bar chart of the ten rows of `country_df` with the highest `metric`.

    `metric` is a column name of `country_df`; it is used as the sort key, as
    the bar height and in the chart title.
    """
    leaders = country_df.sort_values(by=metric, ascending=False).head(10)
    chart = px.bar(leaders, x='country_region', y=metric, title=f'Top 10 ({metric})')
    chart.show()

In [13]:
plot_top10('confirmed')

In [14]:
plot_top10('deaths')

In [15]:
plot_top10('active')

In [16]:
plot_top10('recovered')

In [17]:
plot_top10('mortality_rate')

### COVID-19 spread on global map

In [18]:
# Computed once up front so the column maximum is not re-evaluated per marker.
max_confirmed = country_df.confirmed.max()


def scale_f(x):
    """Linearly map a confirmed-case count to a marker radius:
    5 for zero cases, 15 for the largest count in `country_df`."""
    return 10 / max_confirmed * x + 5


def _format_count(value):
    """Render a case count as an integer string, or 'n/a' when the value is missing."""
    return 'n/a' if pd.isna(value) else str(int(value))


# NOTE(review): recent folium releases may no longer bundle the "Stamen Toner"
# tiles — confirm against the installed version.
m = folium.Map(tiles="Stamen Toner", zoom_start=13)
# Rows without coordinates cannot be placed on the map and are skipped.
for r in country_df.dropna(subset=['lat', 'long']).itertuples(index=False):
    # A missing confirmed count gets the minimum radius instead of a NaN radius.
    radius = scale_f(0 if pd.isna(r.confirmed) else r.confirmed)
    folium.CircleMarker(
        location=[r.lat, r.long],
        radius=radius,
        popup=f'''
               <table>
                 <tr>
                   <th colspan="2">{r.country_region}</th>
                 </tr>
                 <tr>
                   <td>confirmed:</td>
                   <td>{_format_count(r.confirmed)}</td>
                 </tr>
                 <tr>
                   <td>deaths:</td>
                   <td>{_format_count(r.deaths)}</td>
                 </tr>
                 <tr>
                   <td>death rate:</td>
                   <td>{r.mortality_rate:.3f}</td>
                 </tr>
               </table>
        ''',
        color='crimson',
        fill=True
    ).add_to(m)
m

## Conclusion

China has been identified as the first country to report COVID-19 cases. Accordingly, China exhibits a steep rise in confirmed cases early on, but its trend has since plateaued. Other countries followed soon after. The current analysis shows that some countries managed the pandemic better than others.
Countries such as Germany were able to contain the pandemic during the first wave in summer 2020 but have seen steadily rising numbers of confirmed cases since. Countries such as Singapore managed to fight an initial outbreak and contain the cases.
The same is true for death cases. The US tops the list of confirmed and death cases. However, the numbers for recovered cases are missing. Nearly every country has had exposure to COVID-19 to date.
It would be interesting to correlate the measures each country has taken, such as lockdowns, with the trends in confirmed, death and recovered cases.