<h1>COVID-19 Exploratory data analysis</h1>

In [None]:
!pip install dython

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import dython
from dython.nominal import associations
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import datetime
import seaborn as sns

In [None]:
# Load the COVID-19 case records CSV into the working frame.
DATA_PATH = '../input/novel-corona-virus-2019-dataset/covid_19_data.csv'
df = pd.read_csv(DATA_PATH)

<h2>Bulk data analysis</h2>

In [None]:
# Report the (rows, columns) size of the loaded frame.
print(df.shape)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics with the SNo column excluded.
df.drop(columns=['SNo']).describe()

In [None]:
# Association matrix over every column except SNo and ObservationDate.
excluded = ('SNo', 'ObservationDate')
cols = list(filter(lambda name: name not in excluded, df.columns))
print(associations(df[cols], figsize=(10, 10)))

<h2>Null Values</h2>

In [None]:
# Number of missing entries in each column.
df.isna().sum()

<h2>Percentage of NULL values</h2>

In [None]:
# Missing entries per column, expressed as a percentage of the row count.
df.isnull().sum().div(len(df)).mul(100)

<h2>Fill NULL values</h2>

In [None]:
# Label rows that carry no province/state with an explicit placeholder.
missing_province = df['Province/State'].isna()
df.loc[missing_province, 'Province/State'] = 'Unrecorded'

In [None]:
# Inspect column dtypes before the count columns are cast to int.
df.dtypes

In [None]:
# Cast the three count columns to integers in one assignment.
count_cols = ['Confirmed', 'Deaths', 'Recovered']
df[count_cols] = df[count_cols].astype(int)

In [None]:
# Re-check dtypes after the integer cast of Confirmed/Deaths/Recovered.
df.dtypes

<h2>Unique values</h2>

In [None]:
# Count of distinct values in each column.
df.nunique()

In [None]:
# List the distinct country/region labels present in the data.
df['Country/Region'].unique()

In [None]:
# Fold the 'Mainland China' label into 'China'.
# The result is assigned back explicitly: replace(inplace=True) on a column
# selection is chained assignment, which emits a FutureWarning on pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is enabled.
df['Country/Region'] = df['Country/Region'].replace({'Mainland China': 'China'})

<h2>All columns</h2>

In [None]:
# Show the column labels currently present in the frame.
df.columns

<h2>Feature modeling - live cases</h2>

Live cases = Confirmed cases - (Deaths + Recovered)

In [None]:
# Live cases = confirmed minus everyone who has died or recovered.
closed_cases = df['Deaths'].add(df['Recovered'])
df['Live cases'] = df['Confirmed'].sub(closed_cases)

In [None]:
# Preview the frame with the newly added 'Live cases' column.
df.head()

The current numbers come from the most recent update, so we keep only the rows whose observation date is the latest one in the dataset.

In [None]:
# Every row recorded for the province 'Anhui'.
df.loc[df['Province/State'].eq('Anhui')]

In [None]:
# Keep only the rows of the most recent observation date.
# ObservationDate is loaded as plain text (read_csv is called without date
# parsing), so max() on the raw column compares strings character by character;
# with a month-first layout that would rank 12/31/2020 after 01/01/2021.
# Comparing parsed dates picks the chronologically latest day.
# reset_index() is kept as-is: it adds the old row labels as an 'index' column.
observation_dates = pd.to_datetime(df['ObservationDate'])
DF = df[observation_dates == observation_dates.max()].reset_index()

In [None]:
# Remove the 'index' column left behind by reset_index().
# errors='ignore' plus reassignment keeps the cell safe to re-run; the in-place
# drop raised KeyError on a second execution because the column was already gone.
DF = DF.drop(columns=['index'], errors='ignore')

In [None]:
# Preview the latest-date subset.
DF.head()

In [None]:
# The 'Anhui' row(s) within the latest-date subset.
DF.loc[DF['Province/State'].eq('Anhui')]

<h2>Exploratory Data analysis</h2>

Every row in this subset shares the same observation date, so we can use it as the latest statistic of our dataset.

In [None]:
# Distinct values per column in the latest-date subset.
DF.nunique()

In [None]:
# Totals of each metric per observation date in the latest-date subset.
metric_cols = ['Confirmed', 'Live cases', 'Recovered', 'Deaths']
df1 = DF.groupby(['ObservationDate'], as_index=False)[metric_cols].sum()
df1

In [None]:
# One-row summary table built from the first row of df1.
cols = ['Last Update', 'Confirmed', 'Live cases', 'Recovered', 'Deaths']
latest_row = df1.loc[0, ['ObservationDate', 'Confirmed', 'Live cases', 'Recovered', 'Deaths']]
summary_table = go.Table(header=dict(values=cols), cells=dict(values=latest_row))
splot = go.Figure(data=[summary_table])
splot.update_layout(title='Current COVID-19 statistics around the world')
splot.show()

In [None]:
# Pie chart splitting the first row of df1 into live, recovered and deaths.
cols = ['Live cases', 'Recovered', 'Deaths']
values = df1.loc[0, cols]
fig = px.pie(df1, values=values, names=cols)
fig.update_layout(title=f"Cases to date:{df1['Confirmed'][0]}")
fig.show()

<h2>Coronavirus with respect to time</h2>

In [None]:
# Worldwide totals of each metric for every observation date.
daily_metrics = ['Confirmed', 'Live cases', 'Recovered', 'Deaths']
df2 = df.groupby('ObservationDate', as_index=False)[daily_metrics].sum()

In [None]:
# Preview the per-date totals.
df2.head()

In [None]:
# Order the per-date totals chronologically.
# ObservationDate holds text, so a plain sort is lexicographic and would
# interleave years for any non-ISO layout; the key parses the values to real
# dates for the comparison while leaving the stored column untouched.
# reset_index() is kept as-is: it adds the old row labels as an 'index' column.
df2 = df2.sort_values(by='ObservationDate', key=pd.to_datetime, ascending=True).reset_index()

In [None]:
# Remove the 'index' column left behind by reset_index().
# errors='ignore' plus reassignment keeps the cell safe to re-run; the in-place
# drop raised KeyError on a second execution because the column was already gone.
df2 = df2.drop(columns=['index'], errors='ignore')

In [None]:
# Preview the per-date totals after sorting and index cleanup.
df2.head()

In [None]:
# Line chart of total confirmed cases against the row position in df2.
confirmed_line = go.Scatter(
    x=df2.index,
    y=df2['Confirmed'],
    mode='lines',
    name='Total confirmed cases',
)
splot = go.Figure()
splot.add_trace(confirmed_line)
splot.update_layout(
    title='Rate of infection with respect to time',
    xaxis_title='Days',
    yaxis_title='Total confirmed cases',
    template='plotly_white',
)
splot.show()

In [None]:
# Line chart of live cases against the row position in df2.
live_line = go.Scatter(
    x=df2.index,
    y=df2['Live cases'],
    mode='lines',
    name='Live Cases',
)
splot1 = go.Figure()
splot1.add_trace(live_line)
splot1.update_layout(
    title='Active case rate with respect to time',
    xaxis_title='Days',
    yaxis_title='Total Live cases',
    template='plotly_white',
)
splot1.show()

In [None]:
# Line chart of recovered cases against the row position in df2.
splot3 = go.Figure()
splot3.add_trace(go.Scatter(x=df2.index, y=df2.Recovered, mode='lines', name='Recovered'))
# Title typo fixed ("resepct" -> "respect").
splot3.update_layout(
    title='Recovery rate with respect to time',
    xaxis_title='Days',
    yaxis_title='Recovered',
    template='plotly_white',
)
# Render explicitly, like the other figure cells, instead of relying on the
# cell's last expression being displayed.
splot3.show()

In [None]:
# Line chart of deaths against the row position in df2.
splot3 = go.Figure()
splot3.add_trace(go.Scatter(x=df2.index, y=df2.Deaths, mode='lines', name='Deaths', marker_color='red'))
# Title typo fixed ("resepct" -> "respect").
splot3.update_layout(
    title='Death toll with respect to time',
    xaxis_title='Days',
    yaxis_title='Deaths',
    template='plotly_dark',
)
# Render explicitly, like the other figure cells, instead of relying on the
# cell's last expression being displayed.
splot3.show()

In [None]:
# Bar chart of confirmed cases per observation date.
t1 = go.Figure(go.Bar(x=df2.ObservationDate, y=df2.Confirmed))
# Axis titles were swapped: dates are on x, counts on y.
t1.update_layout(
    title='Confirmed cases with respect to days',
    template='plotly_white',
    xaxis_title='Days',
    yaxis_title='Total confirmed cases',
)
t1.show()

In [None]:
# Bar chart of live cases per observation date.
t1 = go.Figure(go.Bar(x=df2.ObservationDate, y=df2['Live cases']))
# Axis titles were swapped: dates are on x, counts on y.
t1.update_layout(
    title='Live cases with respect to days',
    template='plotly_white',
    xaxis_title='Days',
    yaxis_title='Total live cases',
)
t1.show()

In [None]:
# Bar chart of recovered cases per observation date.
t1 = go.Figure(go.Bar(x=df2.ObservationDate, y=df2.Recovered, marker_color='rgb(17,247,5)'))
# Axis titles were swapped: dates are on x, counts on y.
t1.update_layout(
    title='Recovered cases with respect to days',
    template='plotly_white',
    xaxis_title='Days',
    yaxis_title='Total recovered cases',
)
t1.show()

In [None]:
# Bar chart of deaths per observation date.
t1 = go.Figure(go.Bar(x=df2.ObservationDate, y=df2.Deaths, marker_color='rgb(255,0,0)'))
# Axis titles were swapped: dates are on x, counts on y.
t1.update_layout(
    title='Death toll with respect to days',
    template='plotly_dark',
    xaxis_title='Days',
    yaxis_title='Total deaths',
)
t1.show()

<h2>Cases with respect to Country</h2>

In [None]:
# Group the latest-date snapshot (DF) by country.
# The notebook treats each day's row as a running total: its "current" world
# figures are taken from the latest observation date only. Grouping the full
# history `df` here would add every day's running total together and inflate
# each country's numbers, so the per-country totals start from DF instead.
df3 = DF.groupby('Country/Region')[['Confirmed','Live cases','Recovered','Deaths']]

In [None]:
# Collapse the grouped selection into one summed row per group; reset_index()
# turns the group key back into an ordinary column.
# NOTE(review): this rebinds df3 from the grouped object to a frame, so the cell
# is only meaningful when run once directly after the grouping cell.
df3 = df3.sum().reset_index()

In [None]:
# Rank countries by confirmed cases, highest first, with a fresh 0..n-1 index.
df3 = df3.sort_values(by='Confirmed', ascending=False, ignore_index=True)

In [None]:
# Table of confirmed cases per country, in df3's row order.
confirmed_header = dict(
    values=['<b>Name of country</b>', '<b>Total confirmed cases</b>'],
    fill_color='grey',
    align=['center', 'center'],
    font=dict(color='white', size=14),
)
confirmed_cells = dict(
    values=[df3['Country/Region'], df3['Confirmed']],
    fill_color='white',
    line_color='grey',
    align=['center', 'center'],
    font=dict(color='black', size=12),
)
summ = go.Figure(data=[go.Table(header=confirmed_header, cells=confirmed_cells)])
summ.update_layout(title='Total confirmed cases with respect to country')
summ.show()

In [None]:
# Horizontal bars: confirmed cases for every country in df3.
confirmed_bars = go.Bar(x=df3['Confirmed'], y=df3['Country/Region'], orientation='h')
b1 = go.Figure(confirmed_bars)
b1.update_layout(
    title='Confirmed cases with respect to country',
    template='plotly_white',
    xaxis_title='Confirmed cases',
    yaxis_title='Countries',
)
b1.show()

In [None]:
# Horizontal bars: live cases for every country in df3.
live_bars = go.Bar(x=df3['Live cases'], y=df3['Country/Region'], orientation='h')
b2 = go.Figure(live_bars)
b2.update_layout(
    title='Live cases with respect to country',
    template='plotly_white',
    xaxis_title='Live cases',
    yaxis_title='Countries',
)
b2.show()

In [None]:
# Horizontal bars: recovered cases for every country in df3.
recovered_bars = go.Bar(
    x=df3['Recovered'],
    y=df3['Country/Region'],
    orientation='h',
    marker_color='Green',
)
b3 = go.Figure(recovered_bars)
b3.update_layout(
    title='Recovered cases with respect to country',
    template='plotly_white',
    xaxis_title='Recovered cases',
    yaxis_title='Countries',
)
b3.show()

In [None]:
# Horizontal bars: deaths for every country in df3.
death_bars = go.Bar(
    x=df3['Deaths'],
    y=df3['Country/Region'],
    orientation='h',
    marker_color='Red',
)
b3 = go.Figure(death_bars)
b3.update_layout(
    title='Death toll with respect to country',
    template='plotly_dark',
    xaxis_title='Deaths',
    yaxis_title='Countries',
)
b3.show()

<h2>Map plots</h2>

In [None]:
# World map shaded by confirmed cases, matched on country names.
m1 = px.choropleth(
    df3,
    locations=df3['Country/Region'],
    color=df3['Confirmed'],
    locationmode='country names',
    hover_name=df3['Country/Region'],
    color_continuous_scale=px.colors.sequential.Cividis_r,
)
m1.update_layout(title='Confirmed cases in each country')
m1.show()

In [None]:
# World map shaded by live cases, matched on country names.
m2 = px.choropleth(
    df3,
    locations=df3['Country/Region'],
    color=df3['Live cases'],
    locationmode='country names',
    hover_name=df3['Country/Region'],
)
m2.update_layout(title='Live cases in each country')
m2.show()

In [None]:
# World map shaded by recovered cases, matched on country names.
m3 = px.choropleth(
    df3,
    locations=df3['Country/Region'],
    color=df3['Recovered'],
    locationmode='country names',
    hover_name=df3['Country/Region'],
    color_continuous_scale=px.colors.sequential.Tealgrn,
)
m3.update_layout(title='Recovered cases in each country')
m3.show()

In [None]:
# World map shaded by deaths, matched on country names, on the dark template.
m4 = px.choropleth(
    df3,
    locations=df3['Country/Region'],
    color=df3['Deaths'],
    locationmode='country names',
    hover_name=df3['Country/Region'],
    color_continuous_scale='OrRd',
    template='plotly_dark',
)
m4.update_layout(title='Deaths in each country')
m4.show()

<h2>COVID trends over time</h2>

In [None]:
# Totals of each metric per (country, observation date) pair.
trend_keys = ['Country/Region', 'ObservationDate']
trend_metrics = ['Confirmed', 'Live cases', 'Recovered', 'Deaths']
df4 = df.groupby(trend_keys, as_index=False)[trend_metrics].sum()
df4.head()

In [None]:
# Order the per-country series chronologically; the animated maps step through
# ObservationDate values in the order they appear in this frame.
# ObservationDate holds text, so a plain sort is lexicographic and would
# interleave years for any non-ISO layout; the key parses the values to real
# dates for the comparison while leaving the stored column untouched.
df4 = df4.sort_values('ObservationDate', key=pd.to_datetime, ascending=True).reset_index(drop=True)
df4.head()

In [None]:
# Animated world map of confirmed cases, one frame per observation date.
m1 = px.choropleth(
    df4,
    locations=df4['Country/Region'],
    color=df4['Confirmed'],
    locationmode='country names',
    hover_name=df4['Country/Region'],
    color_continuous_scale=px.colors.sequential.Cividis_r,
    animation_frame='ObservationDate',
)
m1.update_layout(title='Confirmed cases in each country')
m1.show()

In [None]:
# Animated world map of live cases, one frame per observation date.
m2 = px.choropleth(
    df4,
    locations=df4['Country/Region'],
    color=df4['Live cases'],
    locationmode='country names',
    hover_name=df4['Country/Region'],
    animation_frame='ObservationDate',
)
m2.update_layout(title='Live cases in each country')
m2.show()

In [None]:
# Animated world map of recovered cases, one frame per observation date.
m3 = px.choropleth(
    df4,
    locations=df4['Country/Region'],
    color=df4['Recovered'],
    locationmode='country names',
    hover_name=df4['Country/Region'],
    color_continuous_scale=px.colors.sequential.Tealgrn,
    animation_frame='ObservationDate',
)
m3.update_layout(title='Recovered cases in each country')
m3.show()

In [None]:
# Animated world map of deaths, one frame per observation date, dark template.
m4 = px.choropleth(
    df4,
    locations=df4['Country/Region'],
    color=df4['Deaths'],
    locationmode='country names',
    hover_name=df4['Country/Region'],
    color_continuous_scale='OrRd',
    template='plotly_dark',
    animation_frame='ObservationDate',
)
m4.update_layout(title='Deaths in each country')
m4.show()

In [None]:
# Bar chart of the first ten rows of df3, labelled with their confirmed counts.
first_ten = df3.head(10)
confirmed_top_bars = go.Bar(
    x=first_ten['Country/Region'],
    y=first_ten['Confirmed'],
    text=first_ten['Confirmed'],
    marker_color='black',
    textposition='auto',
)
bar1 = go.Figure(data=[confirmed_top_bars])
bar1.update_layout(
    title='Top infected countries',
    xaxis_title='Countries',
    yaxis_title='Confirmed cases',
    template='plotly_white',
)
bar1.show()

In [None]:
# Ten countries with the most live cases.
# Two fixes: the countries are ranked by 'Live cases' itself (the first ten rows
# of df3 are only the top ten when df3 is ordered by this metric), and the bar
# labels show the plotted live-case counts instead of the confirmed counts.
top_live = df3.nlargest(10, 'Live cases')
bar1 = go.Figure(data=[go.Bar(
    x=top_live['Country/Region'],
    y=top_live['Live cases'],
    text=top_live['Live cases'],
    marker_color='black',
    textposition='auto',
)])
bar1.update_layout(title='Top live cases countries',xaxis_title='Countries',yaxis_title='Live case count',template='plotly_white')
bar1.show()

In [None]:
# Ten countries with the most recovered cases.
# Two fixes: the countries are ranked by 'Recovered' itself (the first ten rows
# of df3 are only the top ten when df3 is ordered by this metric), and the bar
# labels show the plotted recovered counts instead of the confirmed counts.
top_recovered = df3.nlargest(10, 'Recovered')
bar1 = go.Figure(data=[go.Bar(
    x=top_recovered['Country/Region'],
    y=top_recovered['Recovered'],
    text=top_recovered['Recovered'],
    marker_color='green',
    textposition='auto',
)])
bar1.update_layout(title='Top recovered cases countries',xaxis_title='Countries',yaxis_title='Recovered cases',template='plotly_white')
bar1.show()

In [None]:
# Ten countries with the highest death counts.
# Three fixes: the countries are ranked by 'Deaths' itself (the first ten rows
# of df3 are only the top ten when df3 is ordered by this metric), the bar
# labels show the plotted death counts instead of the confirmed counts, and the
# bars use red like the other death charts (green was carried over from the
# recovered chart).
top_deaths = df3.nlargest(10, 'Deaths')
bar1 = go.Figure(data=[go.Bar(
    x=top_deaths['Country/Region'],
    y=top_deaths['Deaths'],
    text=top_deaths['Deaths'],
    marker_color='red',
    textposition='auto',
)])
bar1.update_layout(title='Top death tolls',xaxis_title='Countries',yaxis_title='Deaths',template='plotly_white')
bar1.show()

In [None]:
# Bubble chart of the ten countries with the most confirmed cases.
# Colour and bubble area both encode the confirmed count. The previous marker
# colour was 500 unseeded random numbers, so the colour bar carried no meaning
# and changed on every run, and the size relied on a hard-coded divisor that
# only suited one particular magnitude of data.
top_confirmed = df3.nlargest(10, 'Confirmed')
confirmed_sizes = top_confirmed['Confirmed'].clip(lower=0)
max_bubble_px = 60  # diameter of the largest bubble
fig = go.Figure(data=[go.Scatter(
    x=top_confirmed['Country/Region'],
    y=top_confirmed['Confirmed'],
    mode='markers',
    marker=dict(
        color=top_confirmed['Confirmed'],
        size=confirmed_sizes,
        sizemode='area',
        # Plotly's recommended scaling for area-mode bubbles; max(..., 1) avoids
        # a zero sizeref when every value is zero.
        sizeref=2.0 * max(confirmed_sizes.max(), 1) / (max_bubble_px ** 2),
        sizemin=4,
        showscale=True,
    ),
)])
fig.update_layout(title='Top confirmed cases',xaxis_title='Countries',yaxis_title='Confirmed cases',template='plotly_white')
fig.show()

<h2>Live cases</h2>

In [None]:
# Live cases per country, highest first, taken from the latest-date snapshot (DF).
# The notebook treats each day's row as a running total (its "current" world
# figures use the latest observation date only), so summing the full history
# `df` would add every day's value together and inflate each country's figure.
df5 = (
    DF.groupby(["Country/Region"])["Live cases"]
    .sum()
    .reset_index()
    .sort_values("Live cases", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Table of live cases per country, in df5's row order.
live_header = dict(
    values=['<b>Name of country</b>', '<b>Total active cases</b>'],
    fill_color='grey',
    align=['center', 'center'],
    font=dict(color='white', size=14),
)
live_cells = dict(
    values=[df5['Country/Region'], df5['Live cases']],
    fill_color='white',
    line_color='grey',
    align=['center', 'center'],
    font=dict(color='black', size=12),
)
summ = go.Figure(data=[go.Table(header=live_header, cells=live_cells)])
summ.update_layout(title='Total live cases with respect to country')
summ.show()

In [None]:
# Bar chart of the first ten rows of df5, labelled with their live-case counts.
live_first_ten = df5.head(10)
live_top_bars = go.Bar(
    x=live_first_ten['Country/Region'],
    y=live_first_ten['Live cases'],
    text=live_first_ten['Live cases'],
    marker_color='black',
    textposition='auto',
)
bar1 = go.Figure(data=[live_top_bars])
bar1.update_layout(
    title='Top live cases countries',
    xaxis_title='Countries',
    yaxis_title='Live case count',
    template='plotly_white',
)
bar1.show()

In [None]:
# Bubble chart of the ten countries with the most live cases (from df5).
# Colour and bubble area both encode the live-case count. The previous marker
# colour was 500 unseeded random numbers, so the colour bar carried no meaning
# and changed on every run, and the size relied on a hard-coded divisor that
# only suited one particular magnitude of data.
live_top = df5.nlargest(10, 'Live cases')
live_sizes = live_top['Live cases'].clip(lower=0)  # marker sizes must be non-negative
max_bubble_px = 60  # diameter of the largest bubble
fig = go.Figure(data=[go.Scatter(
    x=live_top['Country/Region'],
    y=live_top['Live cases'],
    mode='markers',
    marker=dict(
        color=live_top['Live cases'],
        size=live_sizes,
        sizemode='area',
        # Plotly's recommended scaling for area-mode bubbles; max(..., 1) avoids
        # a zero sizeref when every value is zero.
        sizeref=2.0 * max(live_sizes.max(), 1) / (max_bubble_px ** 2),
        sizemin=4,
        showscale=True,
    ),
)])
fig.update_layout(title='Live cases',xaxis_title='Countries',yaxis_title='Live cases',template='plotly_white')
fig.show()

In [None]:
# Bubble chart of live cases for the first ten rows of df3 (row selection is
# unchanged from the original cell).
# Colour and bubble area both encode the live-case count. The previous marker
# colour was 500 unseeded random numbers, so the colour bar carried no meaning
# and changed on every run, and the size relied on a hard-coded divisor that
# only suited one particular magnitude of data.
df3_first_ten = df3.head(10)
df3_live_sizes = df3_first_ten['Live cases'].clip(lower=0)  # marker sizes must be non-negative
max_bubble_px = 60  # diameter of the largest bubble
fig = go.Figure(data=[go.Scatter(
    x=df3_first_ten['Country/Region'],
    y=df3_first_ten['Live cases'],
    mode='markers',
    marker=dict(
        color=df3_first_ten['Live cases'],
        size=df3_live_sizes,
        sizemode='area',
        # Plotly's recommended scaling for area-mode bubbles; max(..., 1) avoids
        # a zero sizeref when every value is zero.
        sizeref=2.0 * max(df3_live_sizes.max(), 1) / (max_bubble_px ** 2),
        sizemin=4,
        showscale=True,
    ),
)])
fig.update_layout(title='Live cases',xaxis_title='Countries',yaxis_title='Live cases',template='plotly_white')
fig.show()

<h2>Death Toll</h2>

In [None]:
# Deaths per country, highest first, taken from the latest-date snapshot (DF).
# The notebook treats each day's row as a running total (its "current" world
# figures use the latest observation date only), so summing the full history
# `df` would add every day's value together and inflate each country's figure.
df6 = (
    DF.groupby(['Country/Region'])[['Deaths']]
    .sum()
    .reset_index()
    .sort_values(by='Deaths', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Preview the countries at the top of the deaths ranking.
df6.head()

In [None]:
# Table of deaths per country, in df6's row order.
deaths_header = dict(
    values=['<b>Name of country</b>', '<b>Total Deaths</b>'],
    fill_color='grey',
    align=['center', 'center'],
    font=dict(color='white', size=14),
)
deaths_cells = dict(
    values=[df6['Country/Region'], df6['Deaths']],
    fill_color='white',
    line_color='grey',
    align=['center', 'center'],
    font=dict(color='black', size=12),
)
dth = go.Figure(data=[go.Table(header=deaths_header, cells=deaths_cells)])
dth.update_layout(title='Total deaths with respect to country')
dth.show()

In [None]:
# Bar chart of the first ten rows of df6, labelled with their death counts.
deaths_first_ten = df6.head(10)
deaths_top_bars = go.Bar(
    x=deaths_first_ten['Country/Region'],
    y=deaths_first_ten['Deaths'],
    text=deaths_first_ten['Deaths'],
    marker_color='red',
    textposition='auto',
)
bar1 = go.Figure(data=[deaths_top_bars])
bar1.update_layout(
    title='Death tolls',
    xaxis_title='Countries',
    yaxis_title='Death toll',
    template='plotly_dark',
)
bar1.show()

In [None]:
# Bubble chart of the ten countries with the most deaths (from df6).
# Colour and bubble area both encode the death count. The previous marker
# colour was 500 unseeded random numbers, so the colour bar carried no meaning
# and changed on every run, and the size relied on a hard-coded divisor that
# only suited one particular magnitude of data.
deaths_top = df6.nlargest(10, 'Deaths')
death_sizes = deaths_top['Deaths'].clip(lower=0)
max_bubble_px = 60  # diameter of the largest bubble
fig = go.Figure(data=[go.Scatter(
    x=deaths_top['Country/Region'],
    y=deaths_top['Deaths'],
    mode='markers',
    marker=dict(
        color=deaths_top['Deaths'],
        colorscale='Oranges',
        size=death_sizes,
        sizemode='area',
        # Plotly's recommended scaling for area-mode bubbles; max(..., 1) avoids
        # a zero sizeref when every value is zero.
        sizeref=2.0 * max(death_sizes.max(), 1) / (max_bubble_px ** 2),
        sizemin=4,
        showscale=True,
    ),
)])
fig.update_layout(title='Total Deaths with respect to country',xaxis_title='Countries',yaxis_title='Deaths',template='plotly_dark')
fig.show()

<h2>Recovered cases</h2>

In [None]:
# Per-country totals ranked by recovered cases, from the latest-date snapshot (DF).
# Two fixes:
#  * Only the numeric metric columns are summed. An unrestricted .sum() also
#    tries to "add" the text columns (dates, province names), which is slow and
#    either concatenates strings or raises, depending on the pandas version.
#  * The notebook treats each day's row as a running total (its "current" world
#    figures use the latest observation date only), so summing the full history
#    `df` would add every day's value together and inflate each country's figure.
df7 = (
    DF.groupby('Country/Region')[['Confirmed', 'Live cases', 'Recovered', 'Deaths']]
    .sum()
    .reset_index()
    .sort_values(by='Recovered', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Preview the countries at the top of the recovered ranking.
df7.head()

In [None]:
# Table of recovered cases per country, in df7's row order.
recovered_header = dict(
    values=['<b>Name of country</b>', '<b>Total Recovered cases</b>'],
    fill_color='grey',
    align=['center', 'center'],
    font=dict(color='white', size=14),
)
recovered_cells = dict(
    values=[df7['Country/Region'], df7['Recovered']],
    fill_color='white',
    line_color='grey',
    align=['center', 'center'],
    font=dict(color='black', size=12),
)
dth = go.Figure(data=[go.Table(header=recovered_header, cells=recovered_cells)])
# Title corrected: this table lists recovered cases, but the title still said
# "Total deaths" from the deaths table it was copied from.
dth.update_layout(title='Total recovered cases with respect to country')
dth.show()

In [None]:
# Bar chart of the first ten rows of df7, labelled with their recovered counts.
recovered_first_ten = df7.head(10)
bar1 = go.Figure(data=[go.Bar(
    x=recovered_first_ten['Country/Region'],
    y=recovered_first_ten['Recovered'],
    text=recovered_first_ten['Recovered'],
    marker_color='green',
    textposition='auto',
)])
# y-axis title corrected: the bars show recovered cases, not a death toll.
bar1.update_layout(
    title='Top 10 countries - Recovered cases',
    xaxis_title='Countries',
    yaxis_title='Recovered cases',
    template='plotly_white',
)
bar1.show()

In [None]:
# Bubble chart of the ten countries with the most recovered cases (from df7).
# Colour and bubble area both encode the recovered count. The previous marker
# colour was 500 unseeded random numbers, so the colour bar carried no meaning
# and changed on every run, and the size relied on a hard-coded divisor that
# only suited one particular magnitude of data.
recovered_top = df7.nlargest(10, 'Recovered')
recovered_sizes = recovered_top['Recovered'].clip(lower=0)
max_bubble_px = 60  # diameter of the largest bubble
fig = go.Figure(data=[go.Scatter(
    x=recovered_top['Country/Region'],
    y=recovered_top['Recovered'],
    mode='markers',
    marker=dict(
        color=recovered_top['Recovered'],
        colorscale='Greens',
        size=recovered_sizes,
        sizemode='area',
        # Plotly's recommended scaling for area-mode bubbles; max(..., 1) avoids
        # a zero sizeref when every value is zero.
        sizeref=2.0 * max(recovered_sizes.max(), 1) / (max_bubble_px ** 2),
        sizemin=4,
        showscale=True,
    ),
)])
fig.update_layout(title='Total Recovered cases with respect to country',xaxis_title='Countries',yaxis_title='Recovered',template='plotly_white')
fig.show()

<h2>Conclusion</h2>
Through this notebook, we were able to derive key insights into the spread and patterns of the COVID-19 pandemic. Each country can be analyzed further through its own dataset to dig deeper and uncover the patterns or causes behind these trends.