In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import plotly.express as px
from plotly.offline import plot

In [3]:
covid_confirmed   = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
covid_deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
covid_recovered = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')


In [4]:
print(covid_confirmed.shape)
print(covid_deaths.shape)
print(covid_recovered.shape)


(289, 1147)
(289, 1147)
(274, 1147)


In [5]:
covid_confirmed_long = pd.melt(covid_confirmed,
                               id_vars=covid_confirmed.iloc[:, :4],
                               var_name='date',
                               value_name='confirmed')

covid_deaths_long = pd.melt(covid_deaths,
                               id_vars=covid_deaths.iloc[:, :4],
                               var_name='date',
                               value_name='deaths')

covid_recovered_long = pd.melt(covid_recovered,
                               id_vars=covid_recovered.iloc[:, :4],
                               var_name='date',
                               value_name='recovered')

print(covid_confirmed_long.head())

  Province/State Country/Region       Lat       Long     date  confirmed
0            NaN    Afghanistan  33.93911  67.709953  1/22/20          0
1            NaN        Albania  41.15330  20.168300  1/22/20          0
2            NaN        Algeria  28.03390   1.659600  1/22/20          0
3            NaN        Andorra  42.50630   1.521800  1/22/20          0
4            NaN         Angola -11.20270  17.873900  1/22/20          0


In [6]:
covid_df = covid_confirmed_long
covid_df['deaths'] = covid_deaths_long['deaths']
covid_df['recovered'] = covid_recovered_long['recovered']

print(covid_df.head())

  Province/State Country/Region       Lat       Long     date  confirmed  \
0            NaN    Afghanistan  33.93911  67.709953  1/22/20          0   
1            NaN        Albania  41.15330  20.168300  1/22/20          0   
2            NaN        Algeria  28.03390   1.659600  1/22/20          0   
3            NaN        Andorra  42.50630   1.521800  1/22/20          0   
4            NaN         Angola -11.20270  17.873900  1/22/20          0   

   deaths  recovered  
0       0        0.0  
1       0        0.0  
2       0        0.0  
3       0        0.0  
4       0        0.0  


In [7]:
covid_df['active'] = covid_df['confirmed'] - covid_df['deaths'] - covid_df['recovered']
print(covid_df.head())

#Since we are reading raw csv files again, let us clean up the data
#Replace mainland china by china and fill null values with 0

covid_df['Country/Region'].replace('Mainland China', 'China', inplace=True)
covid_df[['Province/State']] = covid_df[['Province/State']].fillna('')
covid_df.fillna(0, inplace=True)
print(covid_df.isna().sum().sum())

#Save the data as csv to local drive
covid_df.to_csv('covid_df.csv', index=None)

  Province/State Country/Region       Lat       Long     date  confirmed  \
0            NaN    Afghanistan  33.93911  67.709953  1/22/20          0   
1            NaN        Albania  41.15330  20.168300  1/22/20          0   
2            NaN        Algeria  28.03390   1.659600  1/22/20          0   
3            NaN        Andorra  42.50630   1.521800  1/22/20          0   
4            NaN         Angola -11.20270  17.873900  1/22/20          0   

   deaths  recovered  active  
0       0        0.0     0.0  
1       0        0.0     0.0  
2       0        0.0     0.0  
3       0        0.0     0.0  
4       0        0.0     0.0  
0


In [8]:
covid_countries_df = covid_df.groupby(['Country/Region', 'Province/State']).max().reset_index()

#Group the data by Country/Region, get sum of cases every state in the country.
covid_countries_df = covid_countries_df.groupby('Country/Region').sum().reset_index()

#Remove Lat and Long columns as we would not be using them
covid_countries_df.drop(['Lat', 'Long'], axis=1, inplace=True)

print(covid_countries_df)

           Country/Region  confirmed  deaths   recovered    active
0             Afghanistan     209451    7896  15055747.0  199987.0
1                 Albania     334457    3598  29700430.0  330352.0
2                 Algeria     271496    6881   9058822.0  264374.0
3                 Andorra      47890     165  11878958.0   47616.0
4                  Angola     105288    1933  17930960.0  103165.0
..                    ...        ...     ...         ...       ...
196    West Bank and Gaza     703228    5708  10129111.0  697520.0
197  Winter Olympics 2022        535       0  14080089.0     535.0
198                 Yemen      11945    2159  28388100.0    9786.0
199                Zambia     343135    4057   8249579.0  331517.0
200              Zimbabwe     264276    5671  10665161.0  254344.0

[201 rows x 5 columns]



The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [11]:
from IPython.display import display
import plotly.express as px

# Your existing code
top_10_confirmed = covid_countries_df.sort_values(by='confirmed', ascending=False).head(10)

fig = px.bar(top_10_confirmed.sort_values(by='confirmed', ascending=True),
             x="confirmed", y="Country/Region",
             title='Confirmed Cases', text='confirmed',
             template='plotly_dark', orientation='h')

fig.update_traces(marker_color='#3498db', textposition='outside')

# Use display() to show the plot directly in the Colab notebook
display(fig)


In [12]:
from IPython.display import display
import plotly.express as px

top_10_recovered = covid_countries_df.sort_values(by='recovered', ascending=False).head(10)
fig = px.bar(top_10_recovered.sort_values(by='recovered', ascending=True),
             x="recovered", y="Country/Region",
             title='Recovered Cases', text='recovered',
             template='plotly_dark', orientation='h')

fig.update_traces(marker_color='#2ecc71', textposition='outside')

display(fig)

In [13]:
from IPython.display import display
import plotly.express as px

top_10_deaths = covid_countries_df.sort_values(by='deaths', ascending=False).head(10)
fig = px.bar(top_10_confirmed.sort_values(by='deaths', ascending=True),
             x="deaths", y="Country/Region",
             title='Death Cases', text='deaths',
             template='plotly_dark', orientation='h')

fig.update_traces(marker_color='#e74c3c', textposition='outside')
display(fig)

In [14]:
covid_countries_df['mortality_rate'] = round(covid_countries_df['deaths'] / covid_countries_df['confirmed'] * 100, 2)
temp = covid_countries_df[covid_countries_df['confirmed'] > 100]
top_10_mortality_rate = temp.sort_values(by='mortality_rate', ascending=False).head(10)

In [15]:
from IPython.display import display
import plotly.express as px

fig = px.bar(top_10_mortality_rate.sort_values(by='mortality_rate', ascending=True),
             x="mortality_rate", y="Country/Region",
             title='Mortality rate', text='mortality_rate',
             template='plotly_dark', orientation='h',
             width=700, height=600)

fig.update_traces(marker_color='#c0392b', textposition='outside')
display(fig)