In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import datetime
from pathlib import Path

In [None]:
# Directory the notebook reads its input files from.
PATH = Path('../input/covid19-global-forecasting-week-2')

# List every entry directly inside that directory so the available files are visible.
[entry for entry in PATH.glob('*')]

In [None]:
# Read the training table from the input directory.
train_csv = PATH / 'train.csv'
data = pd.read_csv(train_csv)

# Show the first rows as a quick check of the loaded columns.
data.head()

## View daily global totals
Here we collapse all the case totals by day.

In [None]:
# Reducer applied to each column within a group: keep the group's date and
# add up the two count columns.  The 'Date' entry keeps the date available as
# a regular column of the aggregated result.
agg_funcs = {
    'Date': 'first',
    'ConfirmedCases': 'sum',
    'Fatalities': 'sum',
}

# Collapse every row that shares a date into a single row of totals.
rows_by_date = data.groupby(data['Date'])
data_sum = rows_by_date.agg(agg_funcs)
data_sum

Plot both global confirmed cases and global fatalities.

In [None]:
# Draw the two global daily series on one figure, one line trace per column.
fig = go.Figure()
for column, label in (('ConfirmedCases', 'Confirmed Cases'), ('Fatalities', 'Fatalities')):
    trace = go.Scatter(x=data_sum['Date'], y=data_sum[column], mode='lines', name=label)
    fig.add_trace(trace)
fig.show()

## Countries
Aggregate cases and fatalities by country

In [None]:
# Unique country names, in alphabetical order.
# Iterating a bare set of strings yields an order that can differ between
# interpreter runs (string hashing is randomized), and later cells address
# countries by their position in this list.  Sorting makes those positions,
# and the tie-breaking of the rankings built on them, reproducible.
# Missing names are dropped first so the sort never has to compare NaN with str.
countries = sorted(set(data['Country_Region'].dropna()))
print(countries)

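Let's get the total number of confirmed cases for each country and rank the countries by it. A country's total is taken as the highest value reached by its per-day sum. The following will print the top 20.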

In [None]:
# Rank countries by confirmed cases.
# A country's figure is the largest value reached by its per-date total, where
# a date's total adds up all of that country's rows for the date.  One grouped
# aggregation computes this for every country at once, instead of filtering
# and re-grouping the whole frame once per country.
peak_cases_by_country = (
    data.groupby(['Country_Region', 'Date'])['ConfirmedCases'].sum()
        .groupby(level='Country_Region').max()
)

# Keep num_cases positionally aligned with `countries`: both lists are indexed
# with the same positions below and in later cells.
num_cases = peak_cases_by_country.reindex(countries).tolist()

# Positions into `countries`, ordered from most to fewest confirmed cases.
idx_top_by_cases = list(reversed(np.argsort(num_cases)))

# Slice rather than range(20) so that fewer than 20 countries cannot raise IndexError.
for rank, idx_top in enumerate(idx_top_by_cases[:20], start=1):
    print('%d: %s (%d cases)' % (rank, countries[idx_top], num_cases[idx_top]))

The plot below graphs the daily confirmed cases of the top 20 countries by confirmed cases.

In [None]:
# Names of the 20 countries with the most confirmed cases, in rank order.
top_countries = [countries[idx] for idx in idx_top_by_cases[:20]]

# Select their rows with isin() instead of a string-built query(): a country
# name containing an apostrophe (e.g. "Cote d'Ivoire") would terminate the
# quoted literal early and make the query expression invalid.
data_top_countries = data[data['Country_Region'].isin(top_countries)]

# A country may contribute several rows for the same date, so sum them to get
# a single value per country and date; plotting the raw rows would put all of
# those rows on one line instead of drawing the country's total.
daily_cases_top = (
    data_top_countries
    .groupby(['Country_Region', 'Date'], as_index=False)['ConfirmedCases']
    .sum()
)

fig = px.line(daily_cases_top, x="Date", y="ConfirmedCases", color="Country_Region",
              line_group="Country_Region", hover_name="Country_Region",
              title="Daily cases for top 20 countries (with range slider)")
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

Let's get the total number of fatalities for each country and rank the countries by it. A country's total is taken as the highest value reached by its per-day sum. The following will print the top 20.

In [None]:
# Rank countries by fatalities.
# A country's figure is the largest value reached by its per-date total, where
# a date's total adds up all of that country's rows for the date.  One grouped
# aggregation computes this for every country at once, instead of filtering
# and re-grouping the whole frame once per country.
peak_fatalities_by_country = (
    data.groupby(['Country_Region', 'Date'])['Fatalities'].sum()
        .groupby(level='Country_Region').max()
)

# Keep num_fatalities positionally aligned with `countries`: both lists are
# indexed with the same positions below and in later cells.
num_fatalities = peak_fatalities_by_country.reindex(countries).tolist()

# Positions into `countries`, ordered from most to fewest fatalities.
idx_top_by_fatalities = list(reversed(np.argsort(num_fatalities)))

# Slice rather than range(20) so that fewer than 20 countries cannot raise IndexError.
for rank, idx_top in enumerate(idx_top_by_fatalities[:20], start=1):
    print('%d: %s (%d fatalities)' % (rank, countries[idx_top], num_fatalities[idx_top]))

The plot below graphs the daily fatalities of the top 20 countries by fatalities.

In [None]:
# Names of the 20 countries with the most fatalities, in rank order.
top_countries = [countries[idx] for idx in idx_top_by_fatalities[:20]]

# Select their rows with isin() instead of a string-built query(): a country
# name containing an apostrophe (e.g. "Cote d'Ivoire") would terminate the
# quoted literal early and make the query expression invalid.
data_top_countries = data[data['Country_Region'].isin(top_countries)]

# A country may contribute several rows for the same date, so sum them to get
# a single value per country and date; plotting the raw rows would put all of
# those rows on one line instead of drawing the country's total.
daily_fatalities_top = (
    data_top_countries
    .groupby(['Country_Region', 'Date'], as_index=False)['Fatalities']
    .sum()
)

fig = px.line(daily_fatalities_top, x="Date", y="Fatalities", color="Country_Region",
              line_group="Country_Region", hover_name="Country_Region",
              title="Daily fatalities for top 20 countries (with range slider)")
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()