In [1]:
import pandas as pd
import numpy as np
import glob
import re

# Standard plotly imports
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [2]:
path = 'COVID-19/csse_covid_19_data/csse_covid_19_daily_reports'
all_files = glob.glob(path + "/*.csv")

files = []

for filename in all_files:
    date = re.search(r'([0-9]{2}\-[0-9]{2}\-[0-9]{4})', filename)[0]
    df = pd.read_csv(filename, index_col=None, header=0)
    df['date'] = pd.to_datetime(date)
    files.append(df)

df = pd.concat(files, axis=0, ignore_index=True, sort=False)
df = df[['date',
         'Country/Region',
         'Province/State',
         'Confirmed',
         'Deaths',
         'Recovered',
         'Latitude',
         'Longitude']]

df['Confirmed'] = df['Confirmed'].fillna(0).astype(int)
df['Deaths'] = df['Deaths'].fillna(0).astype(int)
df['Recovered'] = df['Recovered'].fillna(0).astype(int)
df['Active'] = df['Confirmed'] - df['Deaths'] - df['Recovered']

In [3]:
# Rename some countries with duplicate naming conventions
df['Country/Region'].replace('Mainland China', 'China', inplace=True)
df['Country/Region'].replace('Hong Kong SAR', 'Hong Kong', inplace=True)
df['Country/Region'].replace(' Azerbaijan', 'Azerbaijan', inplace=True)
df['Country/Region'].replace('Holy See', 'Vatican City', inplace=True)
df['Country/Region'].replace('Iran (Islamic Republic of)', 'Iran', inplace=True)
df['Country/Region'].replace('Taiwan*', 'Taiwan', inplace=True)
df['Country/Region'].replace('Korea, South', 'South Korea', inplace=True)
df['Country/Region'].replace('Viet Nam', 'Vietnam', inplace=True)
df['Country/Region'].replace('Macao SAR', 'Macau', inplace=True)
df['Country/Region'].replace('Russian Federation', 'Russia', inplace=True)
df['Country/Region'].replace('Republic of Moldova', 'Moldova', inplace=True)
df['Country/Region'].replace('Czechia', 'Czech Republic', inplace=True)
df['Country/Region'].replace('Congo (Kinshasa)', 'Congo', inplace=True)
df['Country/Region'].replace('Northern Ireland', 'United Kingdom', inplace=True)
df['Country/Region'].replace('Republic of Korea', 'North Korea', inplace=True)
df['Country/Region'].replace('Others', 'Cruise Ship', inplace=True)
df['Province/State'].replace('Cruise Ship', 'Diamond Princess cruise ship', inplace=True)
df['Province/State'].replace('From Diamond Princess', 'Diamond Princess cruise ship', inplace=True)

In [4]:
# Replace missing values for latitude and longitude
df['Latitude'] = df['Latitude'].fillna(df.groupby('Province/State')['Latitude'].transform('mean'))
df['Longitude'] = df['Longitude'].fillna(df.groupby('Province/State')['Longitude'].transform('mean'))

In [5]:
df

Unnamed: 0,date,Country/Region,Province/State,Confirmed,Deaths,Recovered,Latitude,Longitude,Active
0,2020-01-22,China,Anhui,1,0,0,31.8257,117.2264,1
1,2020-01-22,China,Beijing,14,0,0,40.1824,116.4142,14
2,2020-01-22,China,Chongqing,6,0,0,30.0572,107.8740,6
3,2020-01-22,China,Fujian,1,0,0,26.0789,117.9874,1
4,2020-01-22,China,Gansu,0,0,0,36.2797,103.4873,0
...,...,...,...,...,...,...,...,...,...
5627,2020-03-14,US,"Virgin Islands, U.S.",1,0,0,18.3358,-64.8963,1
5628,2020-03-14,United Kingdom,Gibraltar,1,0,1,36.1408,-5.3536,0
5629,2020-03-14,Australia,Diamond Princess cruise ship,0,0,0,35.4437,139.6380,0
5630,2020-03-14,US,West Virginia,0,0,0,38.4912,-80.9545,0


In [6]:
df['Country/Region'].unique()

array(['China', 'Hong Kong', 'Macau', 'Taiwan', 'US', 'Japan', 'Thailand',
       'South Korea', 'Singapore', 'Philippines', 'Malaysia', 'Vietnam',
       'Australia', 'Mexico', 'Brazil', 'Colombia', 'France', 'Nepal',
       'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast', 'Germany',
       'Finland', 'United Arab Emirates', 'India', 'Italy', 'UK',
       'Russia', 'Sweden', 'Spain', 'Belgium', 'Cruise Ship', 'Egypt',
       'Iran', 'Israel', 'Lebanon', 'Iraq', 'Oman', 'Afghanistan',
       'Bahrain', 'Kuwait', 'Austria', 'Algeria', 'Croatia',
       'Switzerland', 'Pakistan', 'Georgia', 'Greece', 'North Macedonia',
       'Norway', 'Romania', 'Denmark', 'Estonia', 'Netherlands',
       'San Marino', 'Azerbaijan', 'Belarus', 'Iceland', 'Lithuania',
       'New Zealand', 'Nigeria', 'North Ireland', 'Ireland', 'Luxembourg',
       'Monaco', 'Qatar', 'Ecuador', 'Czech Republic', 'Armenia',
       'Dominican Republic', 'Indonesia', 'Portugal', 'Andorra', 'Latvia',
       'Morocco', 'Saud

In [7]:
df[df['Country/Region'] == 'US'].groupby(['date', 'Province/State'])[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered,Active
date,Province/State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-22,Washington,1,0,0,1
2020-01-23,Washington,1,0,0,1
2020-01-24,Chicago,1,0,0,1
2020-01-24,Washington,1,0,0,1
2020-01-25,Illinois,1,0,0,1
...,...,...,...,...,...
2020-03-14,Virginia,41,1,0,40
2020-03-14,Washington,572,37,1,534
2020-03-14,West Virginia,0,0,0,0
2020-03-14,Wisconsin,27,0,1,26


In [8]:
df[df['Country/Region'] == 'US'].groupby('date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum()

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-22,1,0,0,1
2020-01-23,1,0,0,1
2020-01-24,2,0,0,2
2020-01-25,2,0,0,2
2020-01-26,5,0,0,5
2020-01-27,5,0,0,5
2020-01-28,5,0,0,5
2020-01-29,5,0,0,5
2020-01-30,5,0,0,5
2020-01-31,6,0,0,6


In [9]:
# active cases
df[(df['Country/Region'] == 'US') & (df['date'] == df['date'].iloc[-1])]['Confirmed'].sum() - \
df[(df['Country/Region'] == 'US') & (df['date'] == df['date'].iloc[-1])]['Deaths'].sum() - \
df[(df['Country/Region'] == 'US') & (df['date'] == df['date'].iloc[-1])]['Recovered'].sum()

2660

In [10]:
# confirmed
df[(df['Country/Region'] == 'US') & (df['date'] == df['date'].iloc[-1])]['Confirmed'].sum()

2726

In [11]:
# recovered
df[(df['Country/Region'] == 'US') & (df['date'] == df['date'].iloc[-1])]['Recovered'].sum()

12

In [12]:
# deaths
df[(df['Country/Region'] == 'US') & (df['date'] == df['date'].iloc[-1])]['Deaths'].sum()

54

In [13]:
df[df['date'] == df['date'].iloc[-1]]['Confirmed'].sum()

156099

In [14]:
# fatality rate
'{:.2f}%'.format(100 *
                df[df['date'] == df['date'].iloc[-1]]['Deaths'].sum() /
                df[df['date'] == df['date'].iloc[-1]]['Confirmed'].sum())

'3.73%'

In [15]:
fig = go.Figure([go.Scatter(x=df[df['Country/Region'] == 'US'].groupby('date')['date'].first(),
                            y=df[df['Country/Region'] == 'US'].groupby('date')['Active'].sum())])
fig.show()

In [16]:
region = 'US'

print('Fatality rate: {:.2f}%'.format(100 *
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Deaths'].sum() /
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Confirmed'].sum()))

fig = go.Figure()
fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
                name="Active cases",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Confirmed'].sum(),
                name="Confirmed",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Deaths'].sum(),
                name="Deaths",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Recovered'].sum(),
                name="Recovered",
                opacity=0.8))

fig.update_layout(title="COVID-19 infections in {}".format(region),
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

Fatality rate: 1.98%


In [17]:
region = 'China'

print('Fatality rate: {:.2f}%'.format(100 *
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Deaths'].sum() /
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Confirmed'].sum()))

fig = go.Figure()
fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
                name="Active cases",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Confirmed'].sum(),
                name="Confirmed",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Deaths'].sum(),
                name="Deaths",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Recovered'].sum(),
                name="Recovered",
                opacity=0.8))

fig.update_layout(title="COVID-19 infections in {}".format(region),
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

Fatality rate: 3.94%


In [18]:
region = 'Italy'

print('Fatality rate: {:.2f}%'.format(100 *
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Deaths'].sum() /
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Confirmed'].sum()))

fig = go.Figure()
fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
                name="Active cases",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Confirmed'].sum(),
                name="Confirmed",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Deaths'].sum(),
                name="Deaths",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Recovered'].sum(),
                name="Recovered",
                opacity=0.8))

fig.update_layout(title="COVID-19 infections in {}".format(region),
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

Fatality rate: 6.81%


In [19]:
region = 'South Korea'

print('Fatality rate: {:.2f}%'.format(100 *
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Deaths'].sum() /
                                     df[(df['Country/Region'] == region) &
                                        (df['date'] == df['date'].iloc[-1])]['Confirmed'].sum()))

fig = go.Figure()
fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
                name="Active cases",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Confirmed'].sum(),
                name="Confirmed",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Deaths'].sum(),
                name="Deaths",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == region].groupby('date')['Recovered'].sum(),
                name="Recovered",
                opacity=0.8))

fig.update_layout(title="COVID-19 infections in {}".format(region),
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

Fatality rate: 0.89%


In [20]:
print('Fatality rate: {:.2f}%'.format(100 *
                                     df[df['date'] == df['date'].iloc[-1]]['Deaths'].sum() /
                                     df[df['date'] == df['date'].iloc[-1]]['Confirmed'].sum()))

fig = go.Figure()
fig.add_trace(go.Scatter(
                x=df.groupby('date')['date'].first(),
                y=df.groupby('date')['Active'].sum(),
                name="Active cases",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df.groupby('date')['date'].first(),
                y=df.groupby('date')['Confirmed'].sum(),
                name="Confirmed",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df.groupby('date')['date'].first(),
                y=df.groupby('date')['Deaths'].sum(),
                name="Deaths",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df.groupby('date')['date'].first(),
                y=df.groupby('date')['Recovered'].sum(),
                name="Recovered",
                opacity=0.8))

fig.update_layout(title="COVID-19 infections Worldwide",
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

Fatality rate: 3.73%


In [21]:
fig = go.Figure()
fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == 'China'].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == 'China'].groupby('date')['Active'].sum(),
                name="China",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == 'Italy'].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == 'Italy'].groupby('date')['Active'].sum(),
                name="Italy",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == 'South Korea'].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == 'South Korea'].groupby('date')['Active'].sum(),
                name="South Korea",
                opacity=0.8))

fig.add_trace(go.Scatter(
                x=df[df['Country/Region'] == 'US'].groupby('date')['date'].first(),
                y=df[df['Country/Region'] == 'US'].groupby('date')['Active'].sum(),
                name="US",
                opacity=0.8))

fig.update_layout(title="Active COVID-19 cases",
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

In [22]:
fig = go.Figure()
for region in ['China', 'Italy', 'US', 'South Korea']:
    fig.add_trace(go.Scatter(
                    x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                    y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
                    name=region,
                    hoverinfo='x+y+z+text+name',
                    opacity=0.8,
                    stackgroup='one'))

fig.update_layout(title="COVID-19 Active Cases Worldwide",
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

In [23]:
fig = go.Figure()
for region in df['Country/Region'].unique():
    if df[(df['date'] == df['date'].iloc[-1]) & (df['Country/Region'] == region)]['Confirmed'].sum() > 500:
        fig.add_trace(go.Scatter(
                        x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                        y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
                        name=region,
                        hoverinfo='x+y+z+text+name',
                        opacity=0.8,
                        stackgroup='one'))

fig.update_layout(title="COVID-19 Active Cases Worldwide (Countries with greater than 100 confirmed cases)",
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

In [24]:
fig = go.Figure()
for region in df['Country/Region'].unique():
    if df[(df['date'] == df['date'].iloc[-1]) & (df['Country/Region'] == region)]['Confirmed'].sum() > 100:
        fig.add_trace(go.Scatter(
                        x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                        y=df[df['Country/Region'] == region].groupby('date')['Confirmed'].sum(),
                        name=region,
                        hoverinfo='x+y+z+text+name',
                        opacity=0.8,
                        stackgroup='one'))

fig.update_layout(title="COVID-19 Confirmed Cases Worldwide (Countries with greater than 100 confirmed cases)",
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

In [25]:
fig = go.Figure()
for region in df['Country/Region'].unique():
    if df[(df['date'] == df['date'].iloc[-1]) & (df['Country/Region'] == region)]['Confirmed'].sum() > 100:
        fig.add_trace(go.Scatter(
                        x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                        y=df[df['Country/Region'] == region].groupby('date')['Recovered'].sum(),
                        name=region,
                        hoverinfo='x+y+z+text+name',
                        opacity=0.8,
                        stackgroup='one'))

fig.update_layout(title="COVID-19 Recovered Cases Worldwide (Countries with greater than 100 confirmed cases)",
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

In [26]:
fig = go.Figure()
for region in df['Country/Region'].unique():
    if df[(df['date'] == df['date'].iloc[-1]) & (df['Country/Region'] == region)]['Deaths'].sum() > 5:
        fig.add_trace(go.Scatter(
                        x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
                        y=df[df['Country/Region'] == region].groupby('date')['Deaths'].sum(),
                        name=region,
                        hoverinfo='x+y+z+text+name',
                        opacity=0.8,
                        stackgroup='one'))

fig.update_layout(title="COVID-19 Deaths Worldwide (countries with greater than 5 deaths)",
                  xaxis_title="Date",
                  yaxis_title="Number of Individuals")

fig.show()

In [27]:
data = df[df['date'] == df['date'].iloc[-1]].groupby('Country/Region').agg({'Confirmed': 'sum',
                                                                            'Longitude': 'mean',
                                                                            'Latitude': 'mean',
                                                                            'Country/Region': 'first'})
# Manually change some country centroids which are mislocated due to far off colonies
data.loc[data['Country/Region'] == 'US', 'Latitude'] = 39.810489
data.loc[data['Country/Region'] == 'US', 'Longitude'] = -98.555759

data.loc[data['Country/Region'] == 'France', 'Latitude'] = 46.2276
data.loc[data['Country/Region'] == 'France', 'Longitude'] = -3.4360

data.loc[data['Country/Region'] == 'United Kingdom', 'Latitude'] = 55.3781
data.loc[data['Country/Region'] == 'United Kingdom', 'Longitude'] = 2.2137

data.loc[data['Country/Region'] == 'Denmark', 'Latitude'] = 56.2639
data.loc[data['Country/Region'] == 'Denmark', 'Longitude'] = 9.5018

fig = go.Figure(data=go.Scattergeo(
        lon = data['Longitude'],
        lat = data['Latitude'],
        text = data['Country/Region'] + ': ' + data['Confirmed'].astype(str) + ' cases',
        mode = 'markers',
        marker_size = np.sqrt(data['Confirmed'] / 25),
        marker = dict(reversescale = False,
                      autocolorscale = False,
                      symbol = 'circle',
                      line = dict(width=1, color='rgba(102, 102, 102)'),
                      colorscale = 'Reds',
                      cmin = 0,
                      color = data['Confirmed'],
                      cmax = data['Confirmed'].max(),
                      colorbar_title="Confirmed Cases")))

fig.update_layout(title = 'Number of cases by country',
                  geo=dict(scope='world',
                           projection_type="natural earth",
                           showland = True,
                           landcolor = "rgb(100, 125, 100)",
                           showocean = True,
                           oceancolor = "rgb(150, 150, 250)",
                           showcountries=True,
                           showlakes=False,))
fig.show()

In [28]:
df2 = df.copy()

df2.loc[df2['Country/Region'] == 'Georgia', 'Province/State'] = 'Georgia (country)'
df2['Province/State'].fillna(df2['Country/Region'], inplace=True)

data = df2[df2['date'] == df2['date'].iloc[-1]].groupby('Province/State').agg({'Confirmed': 'sum',
                                                                               'Longitude': 'mean',
                                                                               'Latitude': 'mean',
                                                                               'Country/Region': 'first',
                                                                               'Province/State': 'first'})


fig = go.Figure(data=go.Scattergeo(
        lon = data['Longitude'],
        lat = data['Latitude'],
        text = data['Province/State'] + ', ' + data['Country/Region'] + ': ' + data['Confirmed'].astype(str),
        mode = 'markers',
        marker_size = np.sqrt(data['Confirmed'] / 5),
        marker = dict(reversescale = False,
                      autocolorscale = False,
                      symbol = 'circle',
                      line = dict(width=1, color='rgba(102, 102, 102)'),
                      colorscale = 'Reds',
                      cmin = 0,
                      color = data['Confirmed'],
                      cmax = data['Confirmed'].max(),
                      colorbar_title="Confirmed Cases")))

fig.update_layout(title = 'Number of cases by region',
                  geo=dict(scope='world',
                           projection_type="natural earth",
                           showland = True,
                           landcolor = "rgb(100, 125, 100)",
                           showocean = True,
                           oceancolor = "rgb(150, 150, 250)",
                           showcountries=True,
                           showlakes=False,))
fig.show()