In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import plotly.express as px
from plotly.offline import plot

In [28]:
covid_confirmed   = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
covid_deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
covid_recovered = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')


In [29]:
print(covid_confirmed.shape)
print(covid_deaths.shape)
print(covid_recovered.shape)


(254, 73)
(254, 73)
(240, 73)


In [30]:
covid_confirmed_long = pd.melt(covid_confirmed,
                               id_vars=covid_confirmed.iloc[:, :4],
                               var_name='date',
                               value_name='confirmed')

covid_deaths_long = pd.melt(covid_deaths,
                               id_vars=covid_deaths.iloc[:, :4],
                               var_name='date',
                               value_name='deaths')

covid_recovered_long = pd.melt(covid_recovered,
                               id_vars=covid_recovered.iloc[:, :4],
                               var_name='date',
                               value_name='recovered')

print(covid_confirmed_long.head())

#Merge all three dataframes into one for easy plotting
covid_df = covid_confirmed_long
covid_df['deaths'] = covid_deaths_long['deaths']
covid_df['recovered'] = covid_recovered_long['recovered']

print(covid_df.head())

#Add a new column for active cases
covid_df['active'] = covid_df['confirmed'] - covid_df['deaths'] - covid_df['recovered']
print(covid_df.head())

#Since we are reading raw csv files again, let us clean up the data
#Replace mainland china by china and fill null values with 0

covid_df['Country/Region'].replace('Mainland China', 'China', inplace=True)
covid_df[['Province/State']] = covid_df[['Province/State']].fillna('')
covid_df.fillna(0, inplace=True)
print(covid_df.isna().sum().sum())

#Save the data as csv to local drive
covid_df.to_csv('covid_df.csv', index=None) #optional but would be nice to have a local copy

#if saved to local drive reload it.
#pd.read_csv('covid_df.csv')

#Aggregate data by Country/Region and then Province/State
#Find out maximum values as a function of time
covid_countries_df = covid_df.groupby(['Country/Region', 'Province/State']).max().reset_index()

#Group the data by Country/Region, get sum of cases every state in the country.
covid_countries_df = covid_countries_df.groupby('Country/Region').sum().reset_index()

#Remove Lat and Long columns as we would not be using them
covid_countries_df.drop(['Lat', 'Long'], axis=1, inplace=True)

print(covid_countries_df)

# DATA is READY to be plotted.

#Top 10 countries with confirmed cases
top_10_confirmed = covid_countries_df.sort_values(by='confirmed', ascending=False).head(10)

fig = px.bar(top_10_confirmed.sort_values(by='confirmed', ascending=False),
              x="Country/Region", y="confirmed",
             title='Confirmed Cases of COVID-19 of top 10 Countries', 
             text="confirmed", template='plotly_dark')

fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

plot(fig)

  Province/State Country/Region      Lat     Long     date  confirmed
0            NaN    Afghanistan  33.0000  65.0000  1/22/20          0
1            NaN        Albania  41.1533  20.1683  1/22/20          0
2            NaN        Algeria  28.0339   1.6596  1/22/20          0
3            NaN        Andorra  42.5063   1.5218  1/22/20          0
4            NaN         Angola -11.2027  17.8739  1/22/20          0
  Province/State Country/Region      Lat     Long     date  confirmed  deaths  \
0            NaN    Afghanistan  33.0000  65.0000  1/22/20          0       0   
1            NaN        Albania  41.1533  20.1683  1/22/20          0       0   
2            NaN        Algeria  28.0339   1.6596  1/22/20          0       0   
3            NaN        Andorra  42.5063   1.5218  1/22/20          0       0   
4            NaN         Angola -11.2027  17.8739  1/22/20          0       0   

   recovered  
0        0.0  
1        0.0  
2        0.0  
3        0.0  
4        0.0  
  P

'temp-plot.html'

In [32]:
#Top 10 countries with high recovery numbers
top_10_recovered = covid_countries_df.sort_values(by='recovered', ascending=False).head(10)
fig = px.bar(top_10_confirmed.sort_values(by='recovered', ascending=True),
             x="recovered", y="Country/Region",
             title='Recovered Cases', text='recovered',
             template='plotly_dark', orientation='h')

fig.update_traces(marker_color='#2ecc71', textposition='outside')
#fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

plot(fig)

'temp-plot.html'

In [31]:
#Top 10 countries with highest number of deaths
top_10_deaths = covid_countries_df.sort_values(by='deaths', ascending=False).head(10)
fig = px.bar(top_10_confirmed.sort_values(by='deaths', ascending=False),
             y="deaths", x="Country/Region",
             title='Death Cases of COVID-19 of top 10 Countries', 
             template='plotly_dark', text="deaths")

fig.update_traces(textposition='outside',texttemplate='%{text:.2s}',marker_color='#e74c3c')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')


plot(fig)

'temp-plot.html'

In [7]:
#Top 10 countries with highest mortality rate
covid_countries_df['mortality_rate'] = round(covid_countries_df['deaths'] / covid_countries_df['confirmed'] * 100, 2)
temp = covid_countries_df[covid_countries_df['confirmed'] > 100]
top_10_mortality_rate = temp.sort_values(by='mortality_rate', ascending=False).head(10)

fig = px.bar(top_10_mortality_rate.sort_values(by='mortality_rate', ascending=True),
             x="mortality_rate", y="Country/Region",
             title='Mortality rate', text='mortality_rate',
             template='plotly_dark', orientation='h',
             width=700, height=600)

fig.update_traces(marker_color='#c0392b', textposition='inside')

plot(fig)

'temp-plot.html'

In [35]:
top_10_active = covid_countries_df.sort_values(by='active', ascending=False).head(10)
fig = px.bar(top_10_active.sort_values(by='active', ascending=True),
             x="active", y="Country/Region",
             title='Active Cases', text='active',
             template='plotly_dark', orientation='h')

fig.update_traces(marker_color='#2ecc71', textposition='outside')
#fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

plot(fig)

'temp-plot.html'