# Introduction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Load the COVID-19 case table from the Kaggle input directory.
covid19 = pd.read_csv('../input/novel-corona-virus-2019-dataset/covid_19_data.csv')

In [None]:
# Column names, dtypes and non-null counts.
covid19.info()

In [None]:
# Active = confirmed cases that are recorded neither as deaths nor as recoveries.
covid19['Active'] = covid19['Confirmed'] - covid19['Deaths'] - covid19['Recovered']

In [None]:
# Parse the month/day/year strings into datetimes so later grouping and plotting use real dates.
covid19['ObservationDate'] = pd.to_datetime(covid19['ObservationDate'],  format='%m/%d/%Y')

In [None]:
# Five random rows as a quick sanity check.
covid19.sample(5)

In [None]:
# Number of distinct countries/regions reporting on each observation date,
# in the order the dates first appear in the table.
observation_dates = covid19['ObservationDate'].unique()
total_countries = [
    covid19.loc[covid19['ObservationDate'] == day, 'Country/Region'].unique().size
    for day in observation_dates
]

In [None]:
# Line chart of how many countries reported cases on each observation date.
observation_dates = covid19['ObservationDate'].unique()
plt.figure(figsize=(20, 5))
plt.plot(observation_dates, total_countries, label='number of countries')
plt.xticks(rotation=75)
plt.legend()
plt.show()

In [None]:
# Worldwide totals per observation date.
# numeric_only=True keeps text columns (province, country, last update) out of the sum,
# and the rows are kept in date order: sorting by 'Confirmed' would scramble the
# timeline whenever a daily total is revised downwards.
by_day = covid19.groupby('ObservationDate').sum(numeric_only=True).sort_index()

In [None]:
# Last five rows of the worldwide daily totals.
by_day.tail()

In [None]:
def plot_cases_by_day(df):
    """Plot confirmed, death and recovered counts on three independent y-axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by observation date with 'Confirmed', 'Deaths' and
        'Recovered' columns (e.g. the result of a per-date groupby sum).

    Notes
    -----
    Tick positions come from ``df`` itself, so the function labels the x-axis
    correctly for any frame passed in, not only for the global ``by_day``.
    """
    fig, ax1 = plt.subplots(1, 1, figsize=(18, 7))
    ax1.plot(df['Confirmed'], label='Confirmed')
    ax1.legend(loc='upper left')
    ax1.set_ylabel('Confirmed', fontsize=15, color='b')
    # One tick per observation; the x-axis is shared with the twin axes below,
    # so setting ticks and rotation once on ax1 is enough.
    ax1.set_xticks(df.index)
    ax1.tick_params(axis='x', labelrotation=75)

    # Deaths get their own scale on the right-hand side.
    ax2 = ax1.twinx()
    ax2.plot(df['Deaths'], 'r', label='Deaths')
    ax2.legend(loc='upper center', bbox_to_anchor=(.3, .998))
    ax2.set_ylabel('Deaths', fontsize=15, color='r')

    # Third scale for recoveries; its spine is pushed outwards so it does not
    # overlap the deaths axis.
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('axes', 1.06))
    ax3.plot(df['Recovered'], 'g', label='Recovered')
    ax3.legend(loc='upper right', bbox_to_anchor=(.6, .998))
    ax3.set_ylabel('Recovered', fontsize=15, color='g')
    

In [None]:
# Confirmed / deaths / recovered curves for the whole world.
plot_cases_by_day(by_day)

1. Confirmed cases plateaued by the end of February, then started increasing again as the disease spread to other countries.
2. Recovered Cases are increasing linearly now.

In [None]:
def plot_ratio(df):
    """Plot the death ratio and the recovered ratio on twin y-axes.

    Both ratios are taken relative to the 'Confirmed' value of the same row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by observation date with 'Confirmed', 'Deaths' and
        'Recovered' columns.
    """
    fig, ax1 = plt.subplots(1, 1, figsize=(20, 7))
    ax1.plot(df['Deaths'] / df['Confirmed'], 'r', label='Death Ratio')
    ax1.legend(loc='upper left')
    ax1.set_ylabel('Death Ratio', fontsize=15, color='r')
    # Let matplotlib label the ticks from df's own dates and only rotate them;
    # forcing labels from another frame would mislabel the axis.
    ax1.tick_params(axis='x', labelrotation=75)

    # The recovered ratio lives on a separate right-hand scale.
    ax2 = ax1.twinx()
    ax2.plot(df['Recovered'] / df['Confirmed'], 'g', label='Recovered Ratio')
    ax2.legend(loc='upper center')
    ax2.set_ylabel('Recovered Ratio', fontsize=15, color='g')


In [None]:
# Death and recovery ratios for the whole world.
plot_ratio(by_day)

1. Deaths as a percentage of confirmed cases stabilized at around 3.4% and then started increasing.
2. Around 30% of the confirmed cases are recovered.

In [None]:
# Stacked daily bars: deaths at the bottom, active cases in the middle, recoveries on top.
days = by_day.index
deaths = by_day['Deaths']
recovered = by_day['Recovered']

plt.figure(figsize=(20, 7))
plt.bar(days, deaths, label='Deaths')
plt.bar(days, by_day['Active'], bottom=deaths, label='Under Treatment')
# Confirmed - Recovered equals Deaths + Active, i.e. the top of the two bars drawn above.
plt.bar(days, recovered, bottom=by_day['Confirmed'] - recovered, label='Recovered')
plt.xticks(rotation=75)
plt.legend()
plt.show()

Calculating the days for all confirmed cases to become either deaths or recovered.

In [None]:
def plot_recovery_days(df):
    """Plot, for each row, how many rows later the closed cases overtake its confirmed count.

    For row ``i`` the lag is the number of rows until the first row ``j >= i``
    where ``Deaths + Recovered`` is strictly greater than ``Confirmed`` of
    row ``i``. Rows are assumed to be one per day in chronological order, so
    the lag reads as a number of days. The curve stops at the first row for
    which no such later row exists yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Confirmed', 'Deaths' and 'Recovered' columns.
    """
    confirmed = df['Confirmed'].to_numpy()
    closed = (df['Deaths'] + df['Recovered']).to_numpy()

    recovery_days = []
    for i in range(len(confirmed)):
        # Search forward from row i itself: the start of the search is a row
        # position, never the previously found lag.
        later_hits = np.flatnonzero(closed[i:] > confirmed[i])
        if later_hits.size == 0:
            # Closed cases have not caught up with this row yet.
            break
        recovery_days.append(int(later_hits[0]))

    plt.plot(recovery_days)
    plt.ylabel('days')

In [None]:
# Lag until closed cases overtake confirmed cases, worldwide.
plot_recovery_days(by_day)

It is taking 14 days to recover on average now

Since many people are still under treatment, the ratio of deaths to closed cases (deaths plus recoveries) gives a better idea of the mortality rate, assuming all infected people are documented.

In [None]:
# Deaths as a share of closed cases (deaths + recoveries), last 40 rows of the daily totals.
closed_cases = by_day['Deaths'] + by_day['Recovered']
(by_day['Deaths'] / closed_cases).tail(40).plot(figsize=(10, 5))

# China Cases

In [None]:
# Daily totals for mainland China only.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
is_china = covid19['Country/Region'] == 'Mainland China'
china_by_day = covid19[is_china].groupby('ObservationDate').sum(numeric_only=True)

In [None]:
# Last five rows of the China daily totals.
china_by_day.tail()

In [None]:
# Confirmed / deaths / recovered curves for China.
plot_cases_by_day(china_by_day)

In [None]:
# Death and recovery ratios for China.
plot_ratio(china_by_day)

1. Confirmed cases in China are almost constant in the past few days as new cases are very low.
2. Around 90% of people are recovered.

In [None]:
# Lag until closed cases overtake confirmed cases in China.
plot_recovery_days(china_by_day)

In [None]:
# Deaths as a share of closed cases (deaths + recoveries) in China, last 25 rows.
(china_by_day['Deaths'] / (china_by_day['Deaths'] + china_by_day['Recovered'])).tail(25).plot(figsize=(10,5))

Mortality Rate is around 4.5% compared to Recovered cases in China.

In [None]:
# Stacked bars of China's daily deaths, active cases and recoveries.
china_traces = [
    go.Bar(name=label, x=china_by_day.index, y=china_by_day[column])
    for label, column in (
        ('Deaths', 'Deaths'),
        ('Under Treatment', 'Active'),
        ('Recovered', 'Recovered'),
    )
]
fig = go.Figure(data=china_traces, layout=go.Layout(height=500, barmode='stack'))
fig.show()

In [None]:
# Daily totals for Hubei province.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
is_hubei = covid19['Province/State'] == 'Hubei'
Hubei_by_day = covid19[is_hubei].groupby('ObservationDate').sum(numeric_only=True)

In [None]:
# Stacked bars of Hubei's daily deaths, active cases and recoveries.
hubei_traces = [
    go.Bar(name=label, x=Hubei_by_day.index, y=Hubei_by_day[column])
    for label, column in (
        ('Deaths', 'Deaths'),
        ('Under Treatment', 'Active'),
        ('Recovered', 'Recovered'),
    )
]
fig = go.Figure(data=hubei_traces, layout=go.Layout(height=500, barmode='stack'))
fig.show()

In [None]:
# Confirmed / deaths / recovered curves for Hubei.
plot_cases_by_day(Hubei_by_day)

In [None]:
# Death and recovery ratios for Hubei.
plot_ratio(Hubei_by_day)

In [None]:
# Deaths as a share of closed cases (deaths + recoveries) in Hubei, last 25 rows.
(Hubei_by_day['Deaths'] / (Hubei_by_day['Deaths'] + Hubei_by_day['Recovered'])).tail(25).plot(figsize=(10,5))

In [None]:
# Daily totals for mainland China excluding Hubei province.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
is_china_outside_hubei = (
    (covid19['Province/State'] != 'Hubei')
    & (covid19['Country/Region'] == 'Mainland China')
)
Non_hubei = covid19[is_china_outside_hubei].groupby('ObservationDate').sum(numeric_only=True)

In [None]:
# Stacked bars of daily deaths, active cases and recoveries in China outside Hubei.
non_hubei_traces = [
    go.Bar(name=label, x=Non_hubei.index, y=Non_hubei[column])
    for label, column in (
        ('Deaths', 'Deaths'),
        ('Under Treatment', 'Active'),
        ('Recovered', 'Recovered'),
    )
]
fig = go.Figure(data=non_hubei_traces, layout=go.Layout(height=500, barmode='stack'))
fig.show()

In [None]:
# Confirmed / deaths / recovered curves for China outside Hubei.
plot_cases_by_day(Non_hubei)

In [None]:
# Death and recovery ratios for China outside Hubei.
plot_ratio(Non_hubei)

In [None]:
# Deaths as a share of closed cases (deaths + recoveries) outside Hubei, last 25 rows.
(Non_hubei['Deaths'] / (Non_hubei['Deaths'] + Non_hubei['Recovered'])).tail(25).plot(figsize=(10,5))

1. Most of the people are recovered in Non-Hubei provinces of China.
2. Mortality Rate is around 0.9%

In [None]:
# Lag until closed cases overtake confirmed cases in China outside Hubei.
plot_recovery_days(Non_hubei)

## Logistic Function

1. Epidemics usually follow a [logistic function](https://en.wikipedia.org/wiki/Logistic_function) pattern: the initial growth is exponential, then becomes linear, and finally plateaus.
2. The midpoint of the logistic function is where the growth factor is 1. The growth factor is defined as the ratio of new cases on a given day to new cases on the previous day. So the final total would be about double the number of cases at the midpoint.
3. Confirmed cases in the non-Hubei provinces of China roughly follow a logistic function.

In [None]:
# Day-over-day growth factor outside Hubei: new cases today / new cases yesterday.
non_hubei_new = Non_hubei['Confirmed'].diff()
non_hubei_growth = non_hubei_new / non_hubei_new.shift(1)

plt.figure(figsize=(15, 5))
# Only the first 30 observations are shown. The slice is applied to the ratio itself,
# not to the denominator alone, so the plotted window is explicit.
plt.plot(non_hubei_growth.iloc[:30])
# Reference line at a growth factor of 1 across the whole date range.
plt.plot(Non_hubei.index, np.ones(len(Non_hubei.index)))

In [None]:
# Growth factor based on two-day changes (smoother), first 30 observations, with a reference line at 1.
two_day_change = Non_hubei['Confirmed'].diff(2)
smoothed_growth = two_day_change / two_day_change.shift(1)

plt.figure(figsize=(15, 5))
plt.plot(smoothed_growth[:30])
plt.plot(Non_hubei.index, [1] * len(Non_hubei.index))

In [None]:
# Confirmed total outside Hubei on the assumed logistic midpoint.
# NOTE(review): the date string is presumably parsed month-first (5 February 2020) — confirm.
Non_hubei.loc['02/05/2020', 'Confirmed']

If we take the midpoint as 02/05/2020, the total number of cases can be expected to be around 15,000.

# Non China Cases

In [None]:
# Daily totals for everything outside mainland China.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
is_outside_china = covid19['Country/Region'] != 'Mainland China'
Non_china = covid19[is_outside_china].groupby('ObservationDate').sum(numeric_only=True)

In [None]:
# Last five rows of the non-China daily totals.
Non_china.tail()

In [None]:
# Confirmed / deaths / recovered curves outside China.
plot_cases_by_day(Non_china)

In [None]:
# Death and recovery ratios outside China.
plot_ratio(Non_china)

In [None]:
# Lag until closed cases overtake confirmed cases outside China.
plot_recovery_days(Non_china)

In [None]:
# Deaths as a share of closed cases (deaths + recoveries) outside China, full history.
(Non_china['Deaths'] / (Non_china['Deaths'] + Non_china['Recovered'])).plot(figsize=(10,5))

In [None]:
# Day-over-day growth factor outside mainland China, with a reference line at 1.
non_china_new = Non_china['Confirmed'].diff()

plt.figure(figsize=(15, 5))
plt.plot(non_china_new / non_china_new.shift(1))
plt.plot(Non_china.index, [1] * len(Non_china.index))

In [None]:
# Growth factor based on three-day changes (smoother), with a reference line at 1.
three_day_change = Non_china['Confirmed'].diff(3)

plt.figure(figsize=(15, 5))
plt.plot(three_day_change / three_day_change.shift())
plt.plot(Non_china.index, [1] * len(Non_china.index))

## Observations

1. Non-China cases are increasing almost exponentially.
2. Only 10% of people are recovered.
3. Mortality rate compared to Recovered people is around 30 %
4. The growth factor is very volatile in the initial days because of the small number of cases. In recent days, even though it touched 1 a few times, it is still above 1 most of the time.
5. So, midpoint of Logistic Function is not reached yet.

In [None]:
# Per-country totals on the most recent observation date, largest outbreaks first.
# The latest date is taken with max() so the result does not depend on the row order
# of the CSV, and the count columns are selected by name rather than by position.
latest_date = covid19['ObservationDate'].max()
by_country = (
    covid19[covid19['ObservationDate'] == latest_date]
    .groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .sum()
    .sort_values(by='Confirmed', ascending=False)
)

In [None]:
# Case fatality rate = deaths divided by confirmed cases, per country.
by_country['Case Fatality Rate'] = by_country['Deaths'] / by_country['Confirmed']

In [None]:
# Move 'Country/Region' from the index into a regular column so plotly express can use it by name.
by_country = by_country.reset_index()

In [None]:
# First 30 rows of by_country (sorted by confirmed cases, descending), log-scaled y-axis.
px.bar(by_country[:30], x = 'Country/Region', y = 'Confirmed', log_y=True, height=500)

In [None]:
# 30 countries with the most active cases, log-scaled y-axis.
px.bar(by_country.sort_values(by='Active', ascending=False)[:30], x = 'Country/Region', y = 'Active', log_y=True, height=500)

In [None]:
# 30 countries with the most deaths, linear y-axis.
px.bar(by_country.sort_values(by='Deaths', ascending=False)[:30], x = 'Country/Region', y = 'Deaths', height=500)

In [None]:
# Highest case fatality rates (top 40) among countries with more than 100 confirmed cases.
px.bar(by_country[by_country['Confirmed']>100].sort_values(by='Case Fatality Rate', ascending=False).iloc[:40, :], x = 'Country/Region', y = 'Case Fatality Rate', height=500)

In [None]:
# Put 'Country/Region' back into the index for label-based lookups below.
by_country = by_country.set_index('Country/Region')

In [None]:
# Deaths as a share of closed cases (deaths + recoveries), only for countries with more than 20 recoveries.
death_ratio = (by_country['Deaths']/(by_country['Deaths'] + by_country['Recovered']))[((by_country['Recovered'])>20)].sort_values(ascending=False)

In [None]:
# Bar chart of that ratio, highest first.
px.bar(x=death_ratio.index, y=death_ratio.values)

In [None]:
# World map shaded by confirmed cases per country.
data = go.Choropleth(
    z=by_country['Confirmed'],
    locations=by_country.index,
    locationmode='country names',
    text='Confirmed',
    colorscale='YlOrRd',
    reversescale=False,
    marker=dict(line=dict(color='darkgray', width=0.5)),
    colorbar=dict(tickprefix='', title='cases'),
)
layout = go.Layout(
    autosize=False,
    width=1000,
    height=500,
    title=dict(text='Confirmed Cases'),
    geo=dict(showframe=True, showcoastlines=True, projection=dict(type='robinson')),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# World map shaded by deaths per country.
data = go.Choropleth(
    z=by_country['Deaths'],
    locations=by_country.index,
    locationmode='country names',
    text='Deaths',
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker=dict(line=dict(color='darkgray', width=0.5)),
    colorbar=dict(tickprefix='', title='Deaths'),
)
layout = go.Layout(
    autosize=False,
    width=1000,
    height=500,
    title=dict(text='Deaths'),
    geo=dict(showframe=True, showcoastlines=True, projection=dict(type='robinson')),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Number of provinces/states reported per country on the most recent observation date.
# max() picks the latest date regardless of row order; numeric_only=True keeps the
# text columns out of the intermediate sum (required on pandas >= 2).
latest_date = covid19['ObservationDate'].max()
(
    covid19[covid19['ObservationDate'] == latest_date]
    .groupby(['Country/Region', 'Province/State'])
    .sum(numeric_only=True)
    .groupby(['Country/Region'])
    .size()
)

In [None]:
# Per-state totals for the US on the most recent observation date, largest first.
# max() picks the latest date regardless of row order, and the count columns are
# selected by name instead of dropping the first column by position.
latest_date = covid19['ObservationDate'].max()
USA = (
    covid19[covid19['ObservationDate'] == latest_date]
    .groupby(['Country/Region', 'Province/State'])[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .sum()
    .xs('US')
    .sort_values('Confirmed', ascending=False)
)

In [None]:
# Confirmed cases per US state/province, in the order USA is sorted.
px.bar(USA, x=USA.index, y='Confirmed')

In [None]:
# Confirmed-case curve per country, excluding mainland China.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
country_daily = (
    covid19[covid19['Country/Region'] != 'Mainland China']
    .groupby(['ObservationDate', 'Country/Region'])
    .sum(numeric_only=True)
    .reset_index()
)
# NOTE(review): the first 1000 (date, country) rows are skipped, presumably to hide the
# sparse early period — confirm the intent of this hard-coded offset.
fig = px.line(country_daily.iloc[1000:, :],
              x="ObservationDate", y="Confirmed", color='Country/Region')
fig.show()

In [None]:
# Totals per (country, observation date).
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
df = covid19.groupby(['Country/Region', 'ObservationDate']).sum(numeric_only=True)

In [None]:
# For every country that has reached 100 confirmed cases, collect its confirmed-case
# curve from the first day at or above 100, re-indexed as 0, 1, 2, ... days.
# The same ">= 100" threshold is used for selecting countries and for filtering rows:
# with "> 100" on the rows, a country sitting at exactly 100 would raise a KeyError.
at_least_100 = df[df['Confirmed'] >= 100]  # loop-invariant, so filtered once
country_100 = [
    pd.Series(at_least_100.loc[country].reset_index()['Confirmed'], name=country)
    for country in by_country.index[by_country['Confirmed'] >= 100]
]

In [None]:
# One column per country, one row per day since that country's curve starts.
country_100 = pd.concat(country_100, axis=1)

In [None]:
# Display the combined table.
country_100

In [None]:
# Reshape to long format (index, Country, Confirmed Cases) and draw one log-scaled line per country.
px.line(pd.melt(country_100.reset_index(), id_vars='index', var_name='Country', value_name='Confirmed Cases'), x='index', y='Confirmed Cases', color='Country', log_y=True)

In [None]:
# Daily totals for South Korea.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
is_south_korea = covid19['Country/Region'] == 'South Korea'
South_Korea = covid19[is_south_korea].groupby('ObservationDate').sum(numeric_only=True)

In [None]:
# Stacked bars of South Korea's daily deaths, active cases and recoveries.
south_korea_traces = [
    go.Bar(name=label, x=South_Korea.index, y=South_Korea[column])
    for label, column in (
        ('Deaths', 'Deaths'),
        ('Under Treatment', 'Active'),
        ('Recovered', 'Recovered'),
    )
]
fig = go.Figure(data=south_korea_traces, layout=go.Layout(height=500, barmode='stack'))
fig.show()

In [None]:
# Day-over-day growth factor for South Korea from the 36th observation on, with a reference line at 1.
south_korea_new = South_Korea['Confirmed'].diff()
south_korea_growth = south_korea_new / south_korea_new.shift(1)

plt.figure(figsize=(15, 5))
plt.plot(south_korea_growth[35:])
plt.plot(South_Korea.index, [1] * len(South_Korea.index))

In [None]:
# Daily totals for Italy.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
is_italy = covid19['Country/Region'] == 'Italy'
Italy = covid19[is_italy].groupby('ObservationDate').sum(numeric_only=True)

In [None]:
# Stacked bars of Italy's daily deaths, active cases and recoveries.
italy_traces = [
    go.Bar(name=label, x=Italy.index, y=Italy[column])
    for label, column in (
        ('Deaths', 'Deaths'),
        ('Under Treatment', 'Active'),
        ('Recovered', 'Recovered'),
    )
]
fig = go.Figure(data=italy_traces, layout=go.Layout(height=500, barmode='stack'))
fig.show()

In [None]:
# Growth factor for Italy based on two-day changes.
italy_two_day = Italy['Confirmed'].diff(2)
italy_growth = italy_two_day / italy_two_day.shift()

plt.figure(figsize=(15, 5))
# Show the ratio from the 31st observation on. The slice is applied to the ratio itself,
# not to the denominator alone, so the plotted window is explicit.
plt.plot(italy_growth.iloc[30:])
# Reference line at a growth factor of 1 across the whole date range.
plt.plot(Italy.index, np.ones(len(Italy.index)))

In [None]:
# Daily totals for India.
# numeric_only=True keeps the text columns out of the sum (required on pandas >= 2).
is_india = covid19['Country/Region'] == 'India'
India = covid19[is_india].groupby('ObservationDate').sum(numeric_only=True)

In [None]:
# Stacked bars of India's daily deaths, active cases and recoveries.
india_traces = [
    go.Bar(name=label, x=India.index, y=India[column])
    for label, column in (
        ('Deaths', 'Deaths'),
        ('Under Treatment', 'Active'),
        ('Recovered', 'Recovered'),
    )
]
fig = go.Figure(data=india_traces, layout=go.Layout(height=500, barmode='stack'))
fig.show()

In [None]:
# Growth factor for India based on two-day changes.
india_two_day = India['Confirmed'].diff(2)
india_growth = india_two_day / india_two_day.shift(1)

plt.figure(figsize=(15, 5))
# Show the ratio from the 36th observation on. The slice is applied to the ratio itself,
# not to the denominator alone, so the plotted window is explicit.
plt.plot(india_growth.iloc[35:])
# Reference line at a growth factor of 1 across the whole date range.
plt.plot(India.index, np.ones(len(India.index)))

In [None]:
# Load the per-patient line list. NOTE: this rebinds `df`, which earlier held the
# (country, date) totals.
df = pd.read_csv('../input/novel-corona-virus-2019-dataset/COVID19_line_list_data.csv')

In [None]:
# Missing-value count per column.
df.isnull().sum()

Mortality Rates by age

In [None]:
df[(df['age']>0) & (df['age']<=65) & (df['death']=='1')].shape[0] / df[(df['age']>0) & (df['age']<=65)].shape[0]

In [None]:
df[(df['age']>55) & (df['age']<=60) & (df['death']=='1')].shape[0] / df[(df['age']>55) & (df['age']<=60)].shape[0]

In [None]:
df[(df['age']>60) & (df['age']<=65) & (df['death']=='1')].shape[0] / df[(df['age']>60) & (df['age']<=65)].shape[0]

In [None]:
df[(df['age']>65) & (df['death']=='1')].shape[0] / df[df['age']>65].shape[0]

In [None]:
df[(df['age']>75) & (df['death']=='1')].shape[0] / df[df['age']>75].shape[0]

In [None]:
# Age distribution of all line-list cases (a histogram does not depend on row order).
df['age'].plot.hist()

In [None]:
# Age distribution of the cases recorded as deaths.
# The column is compared as text, so it holds values other than plain 0/1;
# every non-missing entry other than '0' is counted as a death.
# NOTE(review): assumes those other entries are death dates — confirm against the CSV.
died = df['death'].notna() & (df['death'] != '0')
df.loc[died, 'age'].plot(kind='hist')