In [None]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import scipy as sp
from scipy import integrate, optimize

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"



# Time Series Analysis: Country Level

In [None]:
# Load the state-wise daily case file from the Kaggle "covid19-in-india" input directory.
covid_19_india = pd.read_csv('/kaggle/input/covid19-in-india/covid_19_india.csv')
# Preview the first rows to check which columns were read.
covid_19_india.head()

In [None]:
# Parse the day-first date strings, then collapse all states into one row per date.
covid_19_india['Date'] = pd.to_datetime(covid_19_india['Date'], dayfirst=True)
CS_covid = (
    covid_19_india
    .groupby('Date')[['Confirmed', 'Cured', 'Deaths']]
    .sum()
    .reset_index()
)
CS_covid.tail()

In [None]:
# Active = confirmed cases that are neither cured nor dead.
CS_covid['Active'] = CS_covid['Confirmed'] - CS_covid['Cured'] - CS_covid['Deaths']

# Day-over-day increase of the cumulative confirmed count (NaN on the first row).
daily_new = CS_covid['Confirmed'] - CS_covid['Confirmed'].shift(1)
# Ratio of today's new cases to yesterday's new cases; computed on the raw
# differences so the first rows stay NaN instead of becoming a division by a filled 0.
growth = daily_new / daily_new.shift(1)

# Fill the undefined leading values with a numeric 0.0. The previous code used the
# string '0.0', which turned both columns into object dtype (mixed str/float).
CS_covid['new_case/day'] = daily_new.fillna(0.0)
CS_covid['growth_ratio'] = growth.fillna(0.0)

In [None]:
# Reshape to long format (Date, variable, value) so each metric can be drawn as its own coloured line.
CS_covid_melt_df = pd.melt(CS_covid, id_vars=['Date'], value_vars=['Confirmed','Active','Cured','new_case/day','Deaths'])

In [None]:
# Latest date in the national series; shown in the title.
target_date = CS_covid_melt_df['Date'].max()
# One line per metric on a linear y axis.
fig = px.line(CS_covid_melt_df, x="Date", y="value", color='variable', 
              title=f'All-India Cases as of {target_date}')
fig.show()

# Analysis

*     We can see that the number of cases increases exponentially after 12th April.
*     New cases are not increasing exponentially, which is a good sign.
*     From 29th May the number of cured patients keeps increasing, and there is very little difference between active and cured patients from 30th May to 10th June.
*     After 10th June, the number of cured cases increased further. **#Need to check the reason behind this behaviour (check testing curve between this range)**
*     There is no exponential growth in deaths, which is also a good sign.


In [None]:
# Same long-format series as the previous chart, but with a logarithmic y axis.
fig = px.line(CS_covid_melt_df, x="Date", y="value", color='variable',
              title="All-India Cases Over Time (Log scale)",
             log_y=True)
fig.show()

*    We can see that after 1st March 2020 the growth of confirmed cases increased, and after 12th April the growth became even more rapid.
*    Despite lockdown 1.0, we can see positive growth in the number of new corona cases.

In [None]:
# Mortality here is cumulative deaths divided by cumulative confirmed cases on each date.
CS_covid['mortality'] = CS_covid['Deaths'] / CS_covid['Confirmed']

fig = px.line(CS_covid, x="Date", y="mortality", 
              title="All-India Mortality Rate Over Time")
fig.show()


*  We can see a jump on 14th March, and this might be a reason for announcing the Janta Curfew on 22nd March.
*  We can see that mortality is highest on 13th April and continues to be high; after 6th May we can see a drop in mortality. This may be because of improvements in medical facilities.
*  On 17th April the mortality rate shoots up once again; this is because of the relaxations given by the govt.
*  From 17th June the mortality rate is continuously decreasing over time. **#Need to check the no of testing happening per day as testing is somehow related to mortality.**

***The growth ratio on day N is the number of new cases on day N divided by the number of new cases on day N-1***

In [None]:
# Plot the day-over-day ratio of new cases.
fig = px.line(CS_covid, x="Date", y="growth_ratio", 
              title="All India Growth Factor Over Time")
# Dashed red reference line at 1.0 spanning the full date range.
fig.add_trace(go.Scatter(x=[CS_covid['Date'].min(), CS_covid['Date'].max()], y=[1., 1.], name='Growth factor=1.', line=dict(dash='dash', color=('rgb(255, 0, 0)'))))
# Clip the y axis to [0, 4]; larger values fall outside the visible range.
fig.update_yaxes(range=[0., 4.])
fig.show()


* From 3rd March till 5th April, the growth ratio was very high.
* We can see a few instances where the growth ratio drops, i.e. the number of new cases was decreasing.
* We can see that from 22nd May the growth ratio is around one, i.e. new cases are neither increasing nor decreasing.

# Time Series Analysis: States having confirmed cases more than 35000

In [None]:
# Re-read the case file for the state-level analysis: parse the day-first dates
# and discard the serial-number column.
covid_19_state = (
    pd.read_csv('/kaggle/input/covid19-in-india/covid_19_india.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
    .drop(columns='Sno')
)

In [None]:
# Distinct state / union-territory labels present in the file (raw labels, not normalised).
states = covid_19_state['State/UnionTerritory'].unique()
print(f'There are {len(states)} states data present \nwhich are as follows: {states}')


In [None]:
# One row per (Date, state): sums the three count columns over any duplicate rows for the same pair.
# Note: this re-binds `covid_19_state`, so every later cell sees the aggregated frame.
covid_19_state = covid_19_state.groupby(['Date','State/UnionTerritory'])[['Confirmed', 'Cured','Deaths']].sum().reset_index()
covid_19_state.tail()

In [None]:
# Snapshot date = most recent date in the state-level frame.
target_date = covid_19_state['Date'].max()

print('As per Date:', target_date)
# For each threshold, count the states whose confirmed total on the snapshot date exceeds it.
for i in (1, 10, 100, 1000, 10000, 25000, 35000, 57000):
    above_threshold = covid_19_state.query('(Date == @target_date) & Confirmed > @i')
    n_states = above_threshold.shape[0]
    print(f'{n_states} states have more than {i} confirmed cases')

# Confirmed/Cured/Deaths in states

In [None]:
# States above 35000 confirmed cases on the snapshot date, largest first, with
# the active count derived from the three reported columns.
top_states_df = (
    covid_19_state
    .query('(Date == @target_date) & (Confirmed > 35000)')
    .sort_values('Confirmed', ascending=False)
    .assign(Active=lambda frame: frame['Confirmed'] - frame['Cured'] - frame['Deaths'])
)
# Long format (state, variable, value) for the grouped bar chart.
top_states_melt_df = top_states_df.melt(
    id_vars='State/UnionTerritory',
    value_vars=['Confirmed', 'Active', 'Cured', 'Deaths'],
)

In [None]:
# Horizontal grouped bars; the frame is passed in reversed row order.
fig = px.bar(top_states_melt_df.iloc[::-1],
             x='value', y='State/UnionTerritory', color='variable', barmode='group',
             title=f'Confirmed/Cured/Deaths as on {target_date}', text='value', height=800, orientation='h')
fig.show()

* We have 9 states as of 17th July which are having more than 35000 confirmed cases.


# Mortality rate HIGH

In [None]:
# States above 35000 confirmed cases on the snapshot date. `.copy()` makes this an
# independent frame so adding a column does not write onto a slice of
# `covid_19_state` (SettingWithCopyWarning).
top_state_df = covid_19_state.query('(Date == @target_date) & (Confirmed > 35000)').copy()
# Mortality rate = cumulative deaths / cumulative confirmed, computed from the
# selected rows themselves rather than from the full frame via index alignment.
top_state_df['mortality_rate'] = top_state_df['Deaths'] / top_state_df['Confirmed']
# Highest mortality first.
top_state_df = top_state_df.sort_values('mortality_rate', ascending=False)

In [None]:
# `top_state_df` is sorted by mortality in descending order; it is passed in reversed row order.
fig = px.bar(
    top_state_df.iloc[::-1],
    x='mortality_rate',
    y='State/UnionTerritory',
    title=f'Mortality rate HIGH as on {target_date}',
    text='mortality_rate',
    height=500,
    orientation='h',
)
fig.show()

# Lowest Mortality rate states

In [None]:
# Same frame as the previous chart, passed in its stored (descending-mortality) row order.
fig = px.bar(
    top_state_df,
    x='mortality_rate',
    y='State/UnionTerritory',
    title=f'Lowest Mortality rate states on {target_date}',
    text='mortality_rate',
    height=500,
    orientation='h',
)
fig.show()

* Despite having more confirmed cases, Tamil Nadu has fewer deaths than Delhi. **#Need to check: if testing in Tamil Nadu is going on at a faster rate, then the disease can probably be diagnosed in its early stages**
* Despite having fewer confirmed cases, West Bengal has more deaths than the others.

# Time Series Analysis: State Level

In [None]:
# Work on an explicit copy: `query` returns a slice, and adding columns to a
# slice triggers SettingWithCopyWarning.
covid_19_state_ = covid_19_state.query('Date > "2020-01-01"').copy()
# Previous row's cumulative count within the same state.
covid_19_state_['prev_confirmed'] = covid_19_state_.groupby('State/UnionTerritory')['Confirmed'].shift(1)
# Daily new cases; the first row of each state has no predecessor, so it is set to 0.
# The result of fillna is assigned back because the chained
# `df[col].fillna(..., inplace=True)` form is not guaranteed to update the frame.
covid_19_state_['new_case'] = (
    covid_19_state_['Confirmed'] - covid_19_state_['prev_confirmed']
).fillna(0)

In [None]:
# Previous row's new-case count within the same state.
covid_19_state_['prev_new_case'] = covid_19_state_.groupby('State/UnionTerritory')['new_case'].shift(1)
# Growth factor = today's new cases / yesterday's new cases. Undefined values
# (no previous row, or 0/0) become 0; a division of a positive count by 0 stays inf.
# The result of fillna is assigned back because the chained
# `df[col].fillna(..., inplace=True)` form is not guaranteed to update the frame.
covid_19_state_['growth_factor'] = (
    covid_19_state_['new_case'] / covid_19_state_['prev_new_case']
).fillna(0)


In [None]:
def find_daily_cases(state):
    """Return the per-date totals for one state with a ``new_case`` column added.

    Reads the module-level ``covid_19_state_`` frame.

    Parameters
    ----------
    state : str
        Value to match in the ``State/UnionTerritory`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``state`` summed per ``Date`` (index), plus ``new_case``: the
        first row keeps its ``Confirmed`` value and each later row holds the
        difference from the previous row. An unknown state gives an empty frame
        instead of raising ``IndexError``.
    """
    df_cases_state = covid_19_state_[covid_19_state_['State/UnionTerritory'] == state].groupby('Date').sum()
    # prepend=0 makes the first difference equal to the first cumulative value,
    # which is what the original element-by-element loop produced.
    df_cases_state['new_case'] = np.diff(df_cases_state['Confirmed'].to_numpy(), prepend=0)
    return df_cases_state

In [None]:
def state_wise_patients(name,df):
    """Return the rows of ``df`` for one state, ordered by date.

    Parameters
    ----------
    name : str
        Value to match in the ``State/UnionTerritory`` column.
    df : pandas.DataFrame
        Frame with at least ``State/UnionTerritory`` and ``Date`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        All columns of the matching rows, plus an ``index`` column holding the
        original row labels (from ``reset_index``), sorted by ``Date`` ascending.
    """
    # reset_index returns a new frame, so the writes below never touch a slice of `df`.
    data = df.loc[df['State/UnionTerritory'] == name].reset_index()
    # Only parse when the column is not already datetime; parsing uses the
    # day/month/4-digit-year format the original code specified.
    if not pd.api.types.is_datetime64_any_dtype(data['Date']):
        data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
    return data.sort_values(by=['Date'], ascending=True)

In [None]:
# Map "patients in <state>" -> that state's date-sorted rows.
collection = {
    'patients in ' + str(state_label): state_wise_patients(state_label, covid_19_state_)
    for state_label in covid_19_state_['State/UnionTerritory'].unique()
}

In [None]:
# Trace labels, in the dict's insertion order.
keys = list(collection)

In [None]:
# Visibility mask that shows every trace (one flag per entry of `keys`).
visible_True = [True] * len(keys)

def t2f(i):
    """Return a visibility mask over ``keys`` that is True only at position ``i``."""
    return [position == i for position in range(len(keys))]

In [None]:
def create_buttons(keys,title):
    """Build the dropdown buttons that toggle per-state traces in a plotly figure.

    Parameters
    ----------
    keys : list of str
        Trace labels of the form ``'patients in <state>'``, in the same order as
        the traces were added to the figure.
    title : str
        Title prefix; the state name (or ``' India'`` for the first button) is
        appended to it.

    Returns
    -------
    list of dict
        One ``'All'`` button followed by one button per key. Each button uses
        plotly's ``update`` method with a ``visible`` mask and a new title.
    """
    # Masks are derived from the `keys` argument itself, so the function does not
    # depend on module-level state built from a possibly different key list.
    n_traces = len(keys)
    buttons = [dict(label = 'All',
                    method = 'update',
                    args = [{'visible': [True] * n_traces},
                            {'title': title+' India',
                             'showlegend':True}])]
    for i, key in enumerate(keys):
        # Split on the ' in ' separator once. Splitting on the bare substring 'in'
        # truncated state names that contain it (e.g. "Cases being reassigned to states").
        _, separator, place = key.partition(' in ')
        if not separator:
            # Key does not follow the 'patients in <state>' pattern: show it whole.
            place = key
        buttons.append(dict(label = key,
                            method = 'update',
                            # the index of True aligns with the indices of plot traces
                            args = [{'visible': [j == i for j in range(n_traces)]},
                                    {'title': title + ' ' + place,
                                     'showlegend':True}]))
    return buttons

# Confirmed cases by state in INDIA

In [None]:

# One line trace per state showing cumulative confirmed cases.
fig = go.Figure()
keys = list(collection.keys())
for column in collection:
    fig.add_trace(
        # go.Scatter(mode='lines') replaces the deprecated go.Line trace alias.
        go.Scatter(
            x = collection[column].Date,
            y = collection[column].Confirmed,
            mode = 'lines',
            name = column
        )
    )

# Dropdown to show all states or a single one; button order follows trace order.
fig.update_layout(updatemenus=[go.layout.Updatemenu( active=0,buttons=list(create_buttons(keys,'Confirmed Cases:')))])

fig.show()

* As you can see, in Maharashtra the situation started getting bad in the period of 29 - 3 April, and the near-exponential growth after that suggests that there might have been an outbreak or a system failure
* Though the condition in Uttar Pradesh was not as harsh as it was in Maharashtra, the situation went out of hand in the period of 29 - 5 April, when cluster spreading started
* In Rajasthan the same period of 29 - 5 April is the time when cases start to grow in an exponential manner
* Looking again at the period of 29 - 5 April, here we see a sudden growth from 30 March (67) to 5 April (571), a 9x growth
* Delhi is different, as the period in which the outbreak happened was 1 - 5 April

In [None]:
# Re-read the case file. Note this re-binds `covid_19_india`, and the 'Date'
# column of this fresh copy is left as read from the CSV (not converted here).
covid_19_india = pd.read_csv("../input/covid19-in-india/covid_19_india.csv")
df = covid_19_india
# Keep only the five states under study.
df_top_states = df[df['State/UnionTerritory'].isin(['Delhi', 'Maharashtra', 'Gujarat', 'Uttar Pradesh', 'Tamil Nadu'])]


# Same five-state filter on the state-wise testing file.
df_test = pd.read_csv("../input/covid19-in-india/StatewiseTestingDetails.csv")
df_top_states_test = df_test[df_test['State'].isin(['Delhi', 'Maharashtra', 'Gujarat', 'Uttar Pradesh', 'Tamil Nadu'])]


In [None]:
# --- Delhi: cumulative confirmed cases with lockdown milestones ---
df = df_top_states
df_delhi = df[(df['State/UnionTerritory']=='Delhi')]
plt.figure(figsize=(8,8))
plt.plot(df_delhi['Date'],df_delhi['Confirmed'],'-r')
plt.rc("xtick",labelsize=9)
plt.rc("ytick",labelsize=10)
ax = plt.gca()
# Thin out the x ticks: one tick every 12 axis units across the current limits.
start, end = ax.get_xlim()
ax.xaxis.set_ticks(np.arange(start, end, 12))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xticks(rotation=45)
plt.xlabel('Dates')
plt.ylabel('Confirmed Cases')
plt.title('Dates V/S Confirmed Cases')
# (label, x position, y of arrow tip, y of label text, arrow colour).
# NOTE(review): x positions are date strings; presumably they must match the raw
# 'Date' strings of the plotted column — confirm.
# Every arrow now uses shrink=0.05; 'Lockdown1' previously had 10.05, which looks
# like a typo (shrink is a fraction of the arrow length).
for label, day, tip_y, text_y, colour in [
    ('Lockdown1', '23/03/20', 29, 15000, 'Green'),
    ('Lockdown 2', '15/04/20', 1561, 25000, 'Blue'),
    ('Lockdown 3', '04/05/20', 4549, 35000, 'Yellow'),
    ('Lockdown 4', '18/05/20', 10054, 45000, 'Black'),
    ('Unlock 1.0', '08/06/20', 27654, 5000, 'orange'),
]:
    ax.annotate(label, xy=(day, tip_y), xytext=(day, text_y), ha='center',
                arrowprops=dict(facecolor=colour, shrink=0.05))


# --- Delhi: cumulative samples tested with lockdown milestones ---
df_test = df_top_states_test
df_delhi_test=df_test[(df_test['State']=='Delhi')]
plt.figure(figsize=(8,8))
plt.plot(df_delhi_test['Date'],df_delhi_test['TotalSamples'],'-r')
plt.rc("xtick",labelsize=9)
plt.rc("ytick",labelsize=10)
ax = plt.gca()
start, end = ax.get_xlim()
ax.xaxis.set_ticks(np.arange(start, end, 12))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xticks(rotation=45)
plt.xlabel('Dates')
plt.ylabel('Total Samples')
plt.title('Dates V/S Total Samples')
# Same tuple layout as above; this chart has no 'Lockdown1' marker.
for label, day, tip_y, text_y, colour in [
    ('Lockdown 2', '2020-04-15', 16605, 55000, 'Blue'),
    ('Lockdown 3', '2020-05-04', 64108, 15000, 'Yellow'),
    ('Lockdown 4', '2020-05-18', 139727, 185000, 'Black'),
    ('Unlock 1.0', '2020-06-08', 255615, 200000, 'orange'),
]:
    ax.annotate(label, xy=(day, tip_y), xytext=(day, text_y), ha='center',
                arrowprops=dict(facecolor=colour, shrink=0.05))

* As we can see, the cases have been rising ever since the first lockdown. Even though the lockdowns helped in keeping the rate of increase in check, the graph shows that the rate has increased a lot after the unlock.

* Testing in Delhi has also increased greatly, but compared to the cases it is a bit underwhelming. It is also worth noting that testing started around lockdown 2. Unlike the cases graph, testing shows a constant rate of increase, but after the unlock the rate at which testing is done has increased massively.

# Deaths by states in INDIA

In [None]:
# One line trace per state showing cumulative deaths.
fig = go.Figure()
keys = list(collection.keys())
for column in collection:
    fig.add_trace(
        # go.Scatter(mode='lines') replaces the deprecated go.Line trace alias.
        go.Scatter(
            x = collection[column].Date,
            y = collection[column].Deaths,
            mode = 'lines',
            name = column
        )
    )

# Dropdown to show all states or a single one; button order follows trace order.
fig.update_layout(updatemenus=[go.layout.Updatemenu( active=0,buttons=list(create_buttons(keys,'Death Cases: ')))])


fig.show()

# Cured by states in India

In [None]:
# One line trace per state showing cumulative cured patients.
fig = go.Figure()
keys = list(collection.keys())
for column in collection:
    fig.add_trace(
        # go.Scatter(mode='lines') replaces the deprecated go.Line trace alias.
        go.Scatter(
            x = collection[column].Date,
            y = collection[column].Cured,
            mode = 'lines',
            name = column
        )
    )

# Dropdown to show all states or a single one; button order follows trace order.
fig.update_layout(updatemenus=[go.layout.Updatemenu( active=0,buttons=list(create_buttons(keys,'Cured Cases:')))])


fig.show()

# DAILY New Confirmed cases by states in INDIA

In [None]:
# One line trace per state showing daily new confirmed cases.
fig = go.Figure()
keys = list(collection.keys())
for column in collection:
    fig.add_trace(
        # go.Scatter(mode='lines') replaces the deprecated go.Line trace alias.
        go.Scatter(
            x = collection[column].Date,
            y = collection[column].new_case,
            mode = 'lines',
            name = column
        )
    )

# Dropdown to show all states or a single one; button order follows trace order.
fig.update_layout(updatemenus=[go.layout.Updatemenu( active=0,buttons=list(create_buttons(keys,'New case per Day:')))])


fig.show()

* In Maharashtra the daily increase in the number of patients was 117 till 8 April, but a sudden increase is noted on 9 April and 13 April
* Uttar Pradesh saw a spike on 4 April, and after the 11th we see peaks on 13 April (75) and 14 April (102)
* In Rajasthan we see a spike on 5 April and later a spike on the 9th; after this day we saw a sudden growth of 317 in the number of patients
* Tamil Nadu saw an unprecedented increase on 1 April, with around 100 patients testing +ve in a single day, which explains why the 29 - 5 April period in Tamil Nadu was severe
* Though Delhi has seen ups and downs in the daily rate of +ve cases, 13 April saw 356 patients in a day

# Growth factor by state in INDIA

In [None]:
# One line per state for the new-case growth factor.
fig = px.line(covid_19_state_,
              x='Date', y='growth_factor', color='State/UnionTerritory',
              title=f'Growth factor by state in INDIA')
# Dashed red reference line at 1.0 spanning the full date range.
fig.add_trace(go.Scatter(x=[covid_19_state_['Date'].min(), covid_19_state_['Date'].max()], y=[1., 1.],
                         name='Growth factor=1.', line=dict(dash='dash', color=('rgb(255, 0, 0)'))))
# Clip the y axis to [0, 15]; larger values fall outside the visible range.
fig.update_yaxes(range=[0., 15.])
fig.show()

# Analysis on age groups 

In [None]:
# Age-group breakdown of cases; `df1` is re-bound to a different table later in the notebook.
df1 = pd.read_csv('../input/covid19-in-india/AgeGroupDetails.csv')
df1

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Age-group labels and their case totals, used by both panels.
labels = df1.AgeGroup
values = df1.TotalCases
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'xy'}, {'type':'domain'}]])
# Left panel: bar chart of total cases per age group.
fig.add_trace(go.Bar(x=labels, y=values, name="bar",marker = dict(color = 'rgba(0, 174, 174, 0.5)',
                             line=dict(color='rgb(0,0,0)',width=1.5)),
                text = labels),
              1, 1)
# Right panel: the same totals as a pie chart.
fig.add_trace(go.Pie(labels=labels, values=values, name="patients"),
              1, 2)


fig.update_layout(
    title_text="Covid-19 Age group details ")
fig.show()

* As you can see, most patients are in the 20-29 and 30-39 age groups
* The 20-29 group comprises 24.9% of the infected people while the 30-39 group accounts for 21.1%, which shows that people in these 2 groups are more infected, but not much more can be said as little age-group data is available


# Indian COVID-19 Patients Outcome Age-Wise

In [None]:
# Patient-level records; keep only rows where both outcome and age are known,
# then renumber the rows from 0.
df_indi = (
    pd.read_csv('../input/covid19-in-india/IndividualDetails.csv')
    .dropna(subset=['current_status', 'age'])
    .reset_index(drop=True)
)

In [None]:
# Show the distinct outcome labels and the (rows, columns) shape after filtering.
df_indi['current_status'].unique(), df_indi.shape

In [None]:
# Split the patient records by outcome.
df1_indians = df_indi[df_indi['current_status'].eq('Deceased')]
df2_indians = df_indi[df_indi['current_status'].eq('Hospitalized')]
df3_indians = df_indi[df_indi['current_status'].eq('Recovered')]

# One box per outcome showing the age distribution.
fig = go.Figure()
for outcome_frame, trace_label in (
    (df1_indians, "Deceased Patients"),
    (df2_indians, "Hospitalized Patients"),
    (df3_indians, "Recovered Patients"),
):
    fig.add_trace(go.Box(y=outcome_frame['age'], name=trace_label))
fig.update_layout(title_text='Indian COVID-19 Patients Outcome Age-Wise')
fig.show()

# Testing All-India 

In [None]:
# State-wise testing records; columns used below are Date, State, TotalSamples and Positive.
testingIndia = pd.read_csv('../input/covid19-in-india/StatewiseTestingDetails.csv')
# testingIndia.dtypes

In [None]:
# Convert the date column in place using pandas' default date parsing (no explicit format).
testingIndia['Date'] = pd.to_datetime(testingIndia['Date'])
# testingIndia['Negative'] = pd.to_numeric(testingIndia['Negative'],errors='ignore')

In [None]:
# National totals per date. Only the two numeric columns used downstream are
# summed; a bare `.sum()` would also try to aggregate the text columns (e.g.
# 'State'), which either concatenates strings or fails depending on the pandas version.
testingIndia_ = testingIndia.groupby('Date')[['TotalSamples', 'Positive']].sum().reset_index()
testingIndia_

In [None]:
# Long format (Date, variable, value) so samples and positives are drawn as separate lines.
testingIndia_melt_df = pd.melt(testingIndia_, id_vars=['Date'], value_vars=['TotalSamples','Positive'])

In [None]:
# NOTE: this re-binds the notebook-wide `target_date` to the last date of the
# *testing* data; later cells that read `target_date` see this value.
target_date = testingIndia_melt_df['Date'].max()
fig = px.line(testingIndia_melt_df, x="Date", y="value", color='variable', 
              title=f'All-India testing Over Time {target_date}')
# Logarithmic y axis.
fig.update_layout(yaxis_type="log")
fig.show()

* The total number of samples tested has increased in an exponential way.
* Though the testing rate is increasing, the number of tests is still very low in comparison to 10% of the population of India
* You can see how the increase in TotalSamples leads to a similar increase in total +ve cases, leading us to infer that more and more testing should be done

In [None]:
# Per-(Date, State) totals. Columns are selected with a list (double brackets):
# selecting several columns from a groupby with a bare tuple is no longer supported by pandas.
testingIndia_state = testingIndia.groupby(['Date','State'])[['TotalSamples','Positive']].sum().reset_index()
testingIndia_state.tail()

# Keep 2020 onwards and draw one line of cumulative samples per state.
testingIndia_state_ = testingIndia_state.query('Date > "2020-01-01"')
fig = px.line(testingIndia_state_,
              x='Date', y='TotalSamples', color='State',
              title=f'TotalSamples by state in INDIA, as of {target_date}')
fig.show()

# Medical Facilities

In [None]:
# Hospital / bed counts per state. NOTE: `df` is re-bound here; it previously held case data.
df = pd.read_csv('../input/covid19-in-india/HospitalBedsIndia.csv')
# Missing counts are treated as 0 so the sums below do not become NaN.
df =df.fillna(0)
# Total beds = urban + rural (NHP18 columns) + public (HMIS column).
df['total_Beds'] = df['NumUrbanBeds_NHP18'] + df['NumRuralBeds_NHP18'] + df['NumPublicBeds_HMIS'] 
# Total hospitals = urban + rural (NHP18) + sub-district + district (HMIS).
df['total_Hospitals'] = df['NumUrbanHospitals_NHP18'] + df['NumRuralHospitals_NHP18'] + df['NumSubDistrictHospitals_HMIS'] + df['NumDistrictHospitals_HMIS']
# Index the table by state name, then drop the now-redundant label columns.
df.index=df['State/UT']
df = df.drop(columns=['Sno','State/UT'])

In [None]:
# 2011 census population per state. NOTE: `df1` is re-bound here; it previously held age-group data.
df1 = pd.read_csv('../input/covid19-in-india/population_india_census2011.csv')
# Order alphabetically by state name; later cells rely on row positions in this order.
df1 = df1.sort_values(by='State / Union Territory')
df1 = df1.reset_index()
# Index by state name and drop the helper / label columns.
df1.index = df1['State / Union Territory']
df1 = df1.drop(columns=['index','Sno','State / Union Territory'])
from IPython.display import display, HTML
# display(HTML(df1.to_html()))

In [None]:
# Overwrite three labels in the hospital table's index with labels taken from the census table,
# presumably so the index-aligned column copies in the next cell match those states.
# NOTE(review): relies on hard-coded row positions in both tables and on the index's
# underlying array being writable in place — confirm against the installed pandas version.
df.index.values[0] = df1.index.values[0]
df.index.values[-5] = df1.index.values[-5]
df.index.values[14] = df1.index.values[13]

In [None]:
# Copy the census columns across; assignment aligns rows on the state-name index,
# so states missing from `df1` get NaN.
df['Population'] = df1['Population']
df['Rural population'] = df1['Rural population']
df['Urban population'] = df1['Urban population']
# Remove these two rows by label (raises KeyError if either label is absent).
df = df.drop(['Dadra & Nagar Haveli','Daman & Diu'],axis=0)

In [None]:
# Force the three HMIS facility counts to numeric; unparseable entries become NaN.
for hmis_column in (
    'NumPrimaryHealthCenters_HMIS',
    'NumCommunityHealthCenters_HMIS',
    'TotalPublicHealthFacilities_HMIS',
):
    df[hmis_column] = pd.to_numeric(df[hmis_column], errors='coerce')

In [None]:
# Rural / urban aggregates.
# NOTE(review): sub-district and district hospitals, and public beds, are added to BOTH the
# rural and the urban totals, so the two totals overlap — confirm this is intended.
df['total_Rural_Hospitals'] = df['NumRuralHospitals_NHP18'] + df['NumSubDistrictHospitals_HMIS'] + df['NumDistrictHospitals_HMIS'] 
df['total_Rural_Beds'] = df['NumRuralBeds_NHP18'] + df['NumPublicBeds_HMIS'] 
df['total_Urban_Hospitals'] =  df['NumUrbanHospitals_NHP18'] + df['NumSubDistrictHospitals_HMIS'] + df['NumDistrictHospitals_HMIS']
df['total_Urban_Beds'] = df['NumUrbanBeds_NHP18'] + df['NumPublicBeds_HMIS'] 
# Sum of primary health centres, community health centres and total public health facilities.
df['total_medical_centres'] = df['NumPrimaryHealthCenters_HMIS'] + df['NumCommunityHealthCenters_HMIS'] + df['TotalPublicHealthFacilities_HMIS']

In [None]:
# Facilities per 100000 inhabitants, rounded to 2 decimals:
# (new column, count column, population column used as denominator).
for rate_column, count_column, population_column in (
    ("Hospitals (per 100000)", "total_Hospitals", "Population"),
    ("Beds (per 100000)", "total_Beds", "Population"),
    ("rural Hospitals (per 100000)", "total_Rural_Hospitals", "Rural population"),
    ("rural Beds (per 100000)", "total_Rural_Beds", "Rural population"),
    ("Urban Hospitals (per 100000)", "total_Urban_Hospitals", "Urban population"),
    ("Urban Beds (per 100000)", "total_Urban_Beds", "Urban population"),
):
    df[rate_column] = np.round(100000*df[count_column]/df[population_column], 2)

In [None]:
# Keep only the derived totals and per-100000 rates; all raw input columns are dropped here.
df = df[['total_Rural_Beds','total_Urban_Hospitals','total_Urban_Beds','total_medical_centres','Hospitals (per 100000)','Beds (per 100000)'
    ,'rural Hospitals (per 100000)','rural Beds (per 100000)','Urban Hospitals (per 100000)','Urban Beds (per 100000)']]

In [None]:
# Start from the raw case rows, ordered by their row index.
df_india = covid_19_india.sort_index()
# Displays the whole frame (the notebook truncates the rendering).
df_india

In [None]:
# Per-state totals. 'Confirmed', 'Cured' and 'Deaths' are running totals per date
# (the notebook derives daily new cases by differencing 'Confirmed'), so summing
# them over every date would count each case many times. The maximum per state
# gives the highest total reported so far. Only the three columns used downstream are kept.
df_india = df_india.groupby('State/UnionTerritory')[['Confirmed', 'Cured', 'Deaths']].max()

In [None]:
# Attach the per-state case counts to the facilities table; rows align on the state-name index,
# so a state whose label differs between the two tables gets NaN.
df['confirmed'] = df_india['Confirmed']
df['recovered'] = df_india['Cured']
df['deaths'] = df_india['Deaths']

In [None]:
# States with the most confirmed cases first.
df = df.sort_values('confirmed',ascending=False)

In [None]:
# Colour each metric column with its own gradient. The chain is wrapped in
# parentheses instead of backslash continuations; the original ended with a
# dangling backslash after the last call, which only parses if a line follows it.
(
    df.style
    .background_gradient(cmap='Blues', subset=["Beds (per 100000)"])
    .background_gradient(cmap='Reds', subset=["Urban Hospitals (per 100000)"])
    .background_gradient(cmap='Greens', subset=["rural Hospitals (per 100000)"])
    .background_gradient(cmap='Purples', subset=["rural Beds (per 100000)"])
    .background_gradient(cmap='YlOrBr', subset=["Urban Beds (per 100000)"])
    .background_gradient(cmap='Oranges', subset=["Hospitals (per 100000)"])
    .background_gradient(cmap='Purples', subset=["confirmed"])
    .background_gradient(cmap='Greens', subset=["deaths"])
    .background_gradient(cmap='Oranges', subset=["recovered"])
)

* States like Andaman and Nicobar Islands, Sikkim, Tripura and Himachal Pradesh have very good numbers for Urban beds per 100000 compared to other states, but keep in mind that these states have low populations compared to Uttar Pradesh, Maharashtra etc.
* Chandigarh has around 13000 rural beds for every 100000, which is very good compared to other regions. Delhi also has around 5000 rural beds per 100000, which is also very good; this might be a reason why Delhi has such a low mortality rate
* Rajasthan, Tamil Nadu and Uttar Pradesh have very few beds per person but still have very low mortality rates, which is good and leads us to think that the governments of these states have done some good work on social distancing
* Madhya Pradesh and Maharashtra have high mortality rates, and they also have few beds and hospitals per 100000 in all aspects

In [None]:
import plotly.graph_objects as go

# Compute the correlation matrix once and take the axis labels from the matrix
# itself, so labels and cells stay aligned even if `corr()` leaves a column out.
corr_matrix = df.corr()

fig = go.Figure(data=go.Heatmap(
                   z=corr_matrix.values,
                   x=corr_matrix.columns.values,
                   y=corr_matrix.index.values,
                   hoverongaps = False),
               layout=go.Layout(height=600, width=900))
fig.show()

* Hospitals(per 100000) have a correlation of -.35 with confirmed cases and -.27 with deaths
* Urban_beds(per 100000) have a correlation values of -.33 with active cases
* Beds(per 100000) has a -ve correlation value with Mortality Rate

# **Predictions**

# Logistic Growth Model

In [None]:
# Independent copy of the state frame with the state column renamed for use in query strings.
covid_19_ = covid_19_state.rename(columns={'State/UnionTerritory': 'State_name'})
# First five distinct state names when rows are ordered by confirmed count, largest first.
columns = covid_19_.sort_values('Confirmed', ascending=False)['State_name'].unique()[:5]

In [None]:
def sigmoid(t, M, beta, alpha, offset=0):
    """Logistic curve ``M / (1 + exp(-beta * (t - (alpha + offset))))``.

    Parameters
    ----------
    t : scalar or array-like
        Time value(s) at which to evaluate the curve.
    M : float
        Upper asymptote of the curve.
    beta : float
        Steepness of the curve.
    alpha : float or numpy.ndarray
        Midpoint (the ``t`` at which the curve equals ``M / 2``) before shifting.
    offset : float, optional
        Shift added to ``alpha``.

    Returns
    -------
    Same shape as the broadcast of the inputs.
    """
    # Build the shifted midpoint as a new value; `alpha += offset` would modify a
    # caller-supplied numpy array in place.
    midpoint = alpha + offset
    return M / (1 + np.exp(-beta * (t - midpoint)))

def error(x, y, params):
    """Weighted mean squared error between a sigmoid curve and observations.

    ``params`` must unpack to exactly ``(M, beta, alpha)``. The squared residual
    at position ``k`` is weighted by ``k ** 2``, so later observations count
    more and the first one has weight 0.
    """
    M, beta, alpha = params
    y_pred = sigmoid(x, M, beta, alpha)

    # Quadratically increasing weights: the latest numbers matter more than the past.
    sample_weight = np.arange(len(y_pred)) ** 2
    squared_residual = (y_pred - y) ** 2
    return np.mean(squared_residual * sample_weight)

def gen_random_color(min_value=0, max_value=256) -> str:
    """Return a random ``'rgb(r,g,b)'`` string for plotly.

    Each channel is drawn from numpy's global random state, in
    ``[min_value, max_value)``.
    """
    red, green, blue = np.random.randint(min_value, max_value, size=3)
    return f'rgb({red},{green},{blue})'

In [None]:
def fit_sigmoid(exclude_days=0):
    """Fit a logistic curve to the confirmed counts of each state in ``columns``.

    Reads the module-level ``covid_19_`` frame (columns ``State_name``, ``Date``,
    ``Confirmed``) and the module-level ``columns`` sequence of state names.

    Parameters
    ----------
    exclude_days : int, optional
        Number of most recent days to hold out of the training window.

    Returns
    -------
    (list of DataFrame, list of DataFrame)
        For each state, in the order of ``columns``: the state's rows of
        ``covid_19_``, and a frame with columns ``date``, ``country`` and
        ``confirmed_pred`` running from the state's first date up to (not
        including) 2020-10-01.
    """
    target_country_df_list = []
    pred_df_list = []
    # End of the training window, taken from the case data itself. The notebook-wide
    # `target_date` is re-bound by other cells (e.g. to the last date of the testing
    # data), so reading it here made the window depend on cell execution order.
    train_end_date = pd.to_datetime(covid_19_['Date'].max()) - pd.Timedelta(f'{exclude_days} days')
    pred_end_date = pd.to_datetime('2020-10-01')

    for target_country in columns:
        print('target_country', target_country)
        # --- Train ---
        target_country_df = covid_19_.query('State_name == @target_country')

        # Training starts on the first date with more than one confirmed case.
        train_start_date = target_country_df.query('Confirmed > 1')['Date'].min()
        target_date_df = target_country_df.query('(Date >= @train_start_date) & (Date <= @train_end_date)')
        if len(target_date_df) <= 7:
            print('WARNING: the data is not enough, use 7 more days...')
            train_start_date -= pd.Timedelta('7 days')
            target_date_df = target_country_df.query('(Date >= @train_start_date) & (Date <= @train_end_date)')

        confirmed = target_date_df['Confirmed'].values
        # Training time axis: 0, 1, 2, ... one step per training row.
        train_x = np.arange(len(confirmed))

        # Default arguments bind the current arrays, so the loss cannot pick up
        # variables that are re-bound later in the loop.
        lossfun = lambda params, xs=train_x, ys=confirmed: error(xs, ys, params)
        # Initial guess: plateau at 5x the current maximum, midpoint two thirds into the window.
        res = sp.optimize.minimize(lossfun, x0=[np.max(confirmed) * 5, 0.04, 2 * len(confirmed) / 3.], method='nelder-mead')
        M, beta, alpha = res.x

        # --- Pred ---
        pred_start_date = target_country_df['Date'].min()
        days = int((pred_end_date - pred_start_date) / pd.Timedelta('1 days'))
        print('pred start', pred_start_date, 'end', pred_end_date, 'days', days)

        pred_x = np.arange(days)
        # The fit's t=0 is train_start_date; the prediction's t=0 is pred_start_date.
        # Shifting the midpoint by the gap (in days) puts both on the same time axis.
        offset = (train_start_date - pred_start_date) / pd.Timedelta('1 days')
        print('train_start_date', train_start_date, 'offset', offset, 'params', M, beta, alpha)
        y_pred = sigmoid(pred_x, M, beta, alpha, offset=offset)

        all_dates = [pred_start_date + np.timedelta64(day_index, 'D') for day_index in range(days)]
        pred_df = pd.DataFrame({
            'date': all_dates,
            'country': target_country,
            'confirmed_pred': y_pred,
        })

        target_country_df_list.append(target_country_df)
        pred_df_list.append(pred_df)
    return target_country_df_list, pred_df_list

In [None]:
def plot_sigmoid_fitting(target_country_df_list, pred_df_list, title=''):
    """Plot fitted curves (dashed lines) against the actual counts (markers).

    Both lists are indexed in the order of the module-level ``columns``; each
    state's two traces share one randomly generated colour.
    """
    fig = go.Figure()

    for idx, state_name in enumerate(columns):
        actual_df = target_country_df_list[idx]
        forecast_df = pred_df_list[idx]
        shared_color = gen_random_color(min_value=20)

        # Prediction
        prediction_trace = go.Scatter(
            x=forecast_df['date'],
            y=forecast_df['confirmed_pred'],
            name=f'{state_name}_pred',
            line=dict(color=shared_color, dash='dash'),
        )
        fig.add_trace(prediction_trace)

        # Ground truth
        actual_trace = go.Scatter(
            x=actual_df['Date'],
            y=actual_df['Confirmed'],
            mode='markers',
            name=f'{state_name}_actual',
            line=dict(color=shared_color),
        )
        fig.add_trace(actual_trace)

    fig.update_layout(
        title=title, xaxis_title='Date', yaxis_title='Confirmed cases')
    fig.show()

In [None]:
# Fit using every available day (no hold-out).
target_states_df_list, pred_df_list = fit_sigmoid(exclude_days=0)

In [None]:
# Draw the curves fitted on all data against the actual counts.
plot_sigmoid_fitting(target_states_df_list, pred_df_list, title='Sigmoid fitting with all latest data')

In [None]:
# Refit with the most recent 14 days left out of training; overwrites the previous fit results.
target_states_df_list, pred_df_list = fit_sigmoid(exclude_days=14)

In [None]:
# Draw the hold-out fit against the actual counts (including the 14 days it did not see).
plot_sigmoid_fitting(target_states_df_list, pred_df_list, title='Sigmoid fitting without last 14days data')

We can see that after ignoring the last two weeks of data, our model tends to underfit. 

# COVID-19 Analysis Using SIR MODEL

In [None]:
import pandas as pd
import numpy as np
from scipy.integrate import solve_ivp
from scipy.optimize import minimize
from scipy.integrate import odeint
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

In [None]:
# Fresh read of the case file for the SIR section; re-binds `df_india`, discarding the earlier per-state aggregate.
df_india = pd.read_csv('../input/covid19-in-india/covid_19_india.csv')


In [None]:
# df_IndividualDetails = pd.read_csv('../input/covid19-in-india/IndividualDetails.csv')
# df_IndividualDetails

In [None]:
# Parse the day-first dates, then collapse all states into one national row per date.
df_india['Date'] = pd.to_datetime(df_india['Date'], dayfirst=True)
df_india = (
    df_india
    .groupby('Date')[['Confirmed', 'Cured', 'Deaths']]
    .sum()
    .reset_index()
)
df_india.tail()

# Let's see if we can estimate the Beta & Gamma parameter with splitting the dataset into train and validation considering the lockdown

In [None]:
# Active cases = confirmed minus cured minus deaths.
df_india['Active Cases'] = df_india['Confirmed'] - df_india['Cured'] - df_india['Deaths']
# 'Recovered' here lumps cured and deceased together (everyone no longer active).
df_india['Recovered'] = df_india['Cured'] + df_india['Deaths']
df_india.tail(3)

In [None]:
# Strict inequalities on both sides: keeps 3 March 2020 through 24 March 2020.
dbd_prelockdown = df_india[(df_india['Date']>'2020-03-2') & (df_india['Date']<'2020-03-25')].reset_index(drop=True) # considering pre lockdown period
dbd_prelockdown.head(2)

In [None]:
# Inclusive on both sides: 25 March 2020 through 31 May 2020.
dbd_from_lockdown = df_india[(df_india['Date']>='2020-03-25') & (df_india['Date']<='2020-05-31')].reset_index(drop=True) # considering from lockdown date
# Drops the last selected row.
# NOTE(review): if 31 May is present, it ends up in neither this frame nor the
# post-lockdown frame (which starts after 31 May) — confirm this is intended.
dbd_from_lockdown = dbd_from_lockdown[:-1]
dbd_from_lockdown.head(2)

In [None]:
# Check the last two rows kept for the lockdown period.
dbd_from_lockdown.tail(2)

In [None]:
# Everything strictly after 31 May 2020.
dbd_after_lockdown = df_india[(df_india['Date'] > '2020-05-31')].reset_index(drop=True) # considering after lockdown date
# Drops the most recent row of the data.
dbd_after_lockdown = dbd_after_lockdown[:-1]
dbd_after_lockdown.head(2)

**Beta and Gamma are estimated in the following way:**

* Validation data used is from 3rd Mar to 24th Mar (pre-lockdown period), 25th Mar to 31st May (lockdown period) and from 1st June till date
* A forward prediction of 180 days is made from the current date using the parameter values derived for the lockdown period
* Define y(t) for the SIR model, use a weighted sum of the RMSEs of the infected and recovered curves as the loss function, and minimise it with the L-BFGS-B optimizer

# Pre-Lockdown Period (3rd March-24th March)



Assumptions taken:
* An initial population of 150000 could have been potentially exposed to COVID-19 as of 3rd March

In [None]:
# Date-indexed series for the pre-lockdown window.
# `data` holds the active cases (used as the observed "infected" curve by the fit).
data = dbd_prelockdown.set_index('Date')['Active Cases']
# `infected` holds the cumulative confirmed count.
infected = dbd_prelockdown.set_index('Date')['Confirmed']
# `recovered` holds cured + deaths.
recovered = dbd_prelockdown.set_index('Date')['Recovered']

In [None]:
# Initial conditions for the SIR model.
# s_0: assumed initially susceptible population (the 150000 stated in the assumptions above).
s_0 = 150000
# i_0 / r_0: initial infected and removed counts.
# NOTE(review): presumably the values on the first day of the window — confirm against the data.
i_0 = 6
r_0 = 3

In [None]:
def loss(point, data, recovered, s_0, i_0, r_0):
    """Weighted RMSE between an SIR trajectory and the observed series.

    Args:
        point: ``(beta, gamma)`` pair under evaluation.
        data: observed infected counts, one value per day.
        recovered: observed removed counts, aligned with ``data``.
        s_0, i_0, r_0: initial S, I and R values. ``s_0`` is also the
            denominator of the infection term.

    Returns:
        ``0.1 * RMSE(infected) + 0.9 * RMSE(recovered)`` over ``len(data)`` days.
    """
    size = len(data)
    beta, gamma = point

    def SIR(t, y):
        # R does not appear in any derivative, so only S and I are read.
        S, I = y[0], y[1]
        infection = beta * S * I / s_0
        removal = gamma * I
        return [-infection, infection - removal, removal]

    # `vectorized=True` is deliberately not passed: SIR handles one state vector
    # at a time, and SciPy documents that the flag slows down the default
    # (explicit RK45) solver used here. The computed trajectory is unchanged.
    solution = solve_ivp(SIR, [0, size], [s_0, i_0, r_0], t_eval=np.arange(0, size, 1))
    l1 = np.sqrt(np.mean((solution.y[1] - data)**2))
    l2 = np.sqrt(np.mean((solution.y[2] - recovered)**2))
    alpha = 0.1  # weight of the infected error; the recovered error gets 1 - alpha
    return alpha * l1 + (1 - alpha) * l2

In [None]:
def predict(beta, gamma, data, recovered, s_0, i_0, r_0):
    """Integrate the SIR model over the dates covered by ``data``.

    Args:
        beta, gamma: infection and removal rates.
        data: observed infected counts; its index supplies the output dates.
        recovered: observed removed counts.
        s_0, i_0, r_0: initial S, I and R values.

    Returns:
        ``(dates, observed_infected, observed_recovered, solution)`` where
        ``dates`` is a list of the index values of ``data`` and ``solution`` is
        the ``solve_ivp`` result sampled once per day.
    """
    new_index = list(data.index.values)
    horizon = len(new_index)

    def pad(values):
        # Extend with None up to the horizon; for `data` this adds nothing,
        # because the horizon is the length of its own index.
        return np.concatenate((values, [None] * (horizon - len(values))))

    def SIR(t, y):
        S, I = y[0], y[1]
        infection = beta * S * I / s_0
        removal = gamma * I
        return [-infection, infection - removal, removal]

    solution = solve_ivp(
        SIR,
        [0, horizon],
        [s_0, i_0, r_0],
        t_eval=np.arange(0, horizon, 1),
    )
    return new_index, pad(data.values), pad(recovered.values), solution

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
def train(recovered, infected, data):
    """Fit beta and gamma to the pre-lockdown series, then report and plot the fit.

    Minimises ``loss`` with L-BFGS-B from the start point (0.001, 0.001), prints
    the optimiser result, integrates the SIR model with the fitted rates via
    ``predict`` and plots observed against modelled curves. The initial state
    ``s_0``, ``i_0``, ``r_0`` is read from the notebook's global variables.

    Args:
        recovered: observed removed counts indexed by date.
        infected: accepted for call compatibility; not used in the body.
        data: observed infected counts indexed by date.

    Returns:
        None. The output is the printed text and the matplotlib figure.
    """
    initial_state = (s_0, i_0, r_0)

    optimal = minimize(
        loss,
        [0.001, 0.001],
        args=(data, recovered, *initial_state),
        method='L-BFGS-B',
        bounds=[(1e-8, 2), (1e-8, 0.4)],  # (beta range, gamma range)
    )
    print(optimal)
    beta, gamma = optimal.x

    new_index, extended_actual, extended_recovered, prediction = predict(
        beta, gamma, data, recovered, *initial_state
    )
    df = pd.DataFrame(
        {
            'Actual Infected': extended_actual,
            'Actual Recovered': extended_recovered,
            'Susceptible': prediction.y[0],
            'Predicted Infected': prediction.y[1],
            'Predicted Recovered': prediction.y[2],
        },
        index=new_index,
    )

    # Goodness of fit is reported on the infected curve only.
    observed, modelled = df['Actual Infected'], df['Predicted Infected']
    MAE = mean_absolute_error(observed, modelled)
    MSE = mean_squared_error(observed, modelled)
    R_2 = r2_score(observed, modelled)

    _fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_title('Estimating Beta and Gamma for India during pre-lockdown')
    df.plot(ax=ax)
    print(f"country=India, beta={beta:.8f}, gamma={gamma:.8f}, r_0:{(beta/gamma):.8f},MSE:{MSE:.8f},MAE:{MAE:.8f},R_2:{R_2:.8f}")

In [None]:
# Fit beta and gamma on the pre-lockdown series; prints the optimiser result and fit metrics, and plots the curves.
train(recovered, infected, data)

# Lockdown Period (25th Mar - 31st May)

Assumptions taken:
* An initial population of 750000 could have been potentially exposed to COVID-19 as of 25th March

In [None]:
# Date-indexed series for the lockdown fit, in the order
# active cases -> data, confirmed -> infected, cured + deaths -> recovered.
data, infected, recovered = (
    dbd_from_lockdown.set_index('Date')[column]
    for column in ('Active Cases', 'Confirmed', 'Recovered')
)

In [None]:
# Initial SIR state for the lockdown fit: susceptible, infected, removed (hard-coded).
# NOTE(review): confirm i_0 / r_0 agree with the first rows of `data` / `recovered`.
s_0, i_0, r_0 = 750_000, 606, 43

In [None]:
def loss(point, data, recovered, s_0, i_0, r_0):
    """Score a ``(beta, gamma)`` candidate against the observed series.

    The SIR model is integrated for ``len(data)`` days from ``(s_0, i_0, r_0)``
    and compared with the observations day by day.

    Args:
        point: ``(beta, gamma)`` pair under evaluation.
        data: observed infected counts, one value per day.
        recovered: observed removed counts, aligned with ``data``.
        s_0, i_0, r_0: initial S, I and R values. ``s_0`` is also the
            denominator of the infection term.

    Returns:
        ``0.1 * RMSE(infected) + 0.9 * RMSE(recovered)``.
    """
    size = len(data)
    beta, gamma = point

    def SIR(t, y):
        # Only S and I drive the dynamics; R is never read.
        S, I = y[0], y[1]
        infection = beta * S * I / s_0
        removal = gamma * I
        return [-infection, infection - removal, removal]

    # `vectorized=True` is omitted on purpose: SIR is written for a single state
    # vector, and per the SciPy documentation the flag only slows down the
    # default RK45 method. Results are identical without it.
    solution = solve_ivp(SIR, [0, size], [s_0, i_0, r_0], t_eval=np.arange(0, size, 1))
    l1 = np.sqrt(np.mean((solution.y[1] - data)**2))
    l2 = np.sqrt(np.mean((solution.y[2] - recovered)**2))
    alpha = 0.1  # weight of the infected error; the recovered error gets 1 - alpha
    return alpha * l1 + (1 - alpha) * l2

In [None]:
def predict(beta, gamma, data, recovered, s_0, i_0, r_0):
    """Run the SIR model over the date range of ``data`` with the given rates.

    Args:
        beta, gamma: infection and removal rates.
        data: observed infected counts; its index supplies the output dates.
        recovered: observed removed counts.
        s_0, i_0, r_0: initial S, I and R values.

    Returns:
        A 4-tuple: the index values of ``data`` as a list, the observed infected
        values, the observed removed values, and the ``solve_ivp`` result with
        one sample per day.
    """
    new_index = list(data.index.values)
    n_days = len(new_index)

    def SIR(t, y):
        S, I = y[0], y[1]
        infection = beta * S * I / s_0
        removal = gamma * I
        return [-infection, infection - removal, removal]

    # Each observed series is padded with None up to n_days; since n_days is
    # the length of data's own index, the padding for `data` is empty.
    observed = [
        np.concatenate((series.values, [None] * (n_days - len(series.values))))
        for series in (data, recovered)
    ]
    trajectory = solve_ivp(
        SIR, [0, n_days], [s_0, i_0, r_0], t_eval=np.arange(0, n_days, 1)
    )
    return new_index, observed[0], observed[1], trajectory

In [None]:
def train(recovered, infected, data):
    """Fit beta and gamma to the lockdown series, then report and plot the fit.

    Minimises ``loss`` with L-BFGS-B from the start point (0.001, 0.001), prints
    the optimiser result, integrates the SIR model with the fitted rates via
    ``predict`` and plots observed against modelled curves. The initial state
    ``s_0``, ``i_0``, ``r_0`` is read from the notebook's global variables.

    Args:
        recovered: observed removed counts indexed by date.
        infected: accepted for call compatibility; not used in the body.
        data: observed infected counts indexed by date.

    Returns:
        None. The output is the printed text and the matplotlib figure.
    """
    initial_state = (s_0, i_0, r_0)

    optimal = minimize(
        loss,
        [0.001, 0.001],
        args=(data, recovered, *initial_state),
        method='L-BFGS-B',
        bounds=[(1e-6, 0.5), (1e-8, 0.4)],  # (beta range, gamma range)
    )
    print(optimal)
    beta, gamma = optimal.x

    new_index, extended_actual, extended_recovered, prediction = predict(
        beta, gamma, data, recovered, *initial_state
    )
    df = pd.DataFrame(
        {
            'Actual Infected': extended_actual,
            'Actual Recovered': extended_recovered,
            'Susceptible': prediction.y[0],
            'Predicted Infected': prediction.y[1],
            'Predicted Recovered': prediction.y[2],
        },
        index=new_index,
    )

    # Goodness of fit is reported on the infected curve only.
    observed, modelled = df['Actual Infected'], df['Predicted Infected']
    MAE = mean_absolute_error(observed, modelled)
    MSE = mean_squared_error(observed, modelled)
    R_2 = r2_score(observed, modelled)

    _fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_title('COVID19 India Scenario during lockdown phase')
    df.plot(ax=ax)
    print(f"country=India, beta={beta:.8f}, gamma={gamma:.8f}, r_0:{(beta/gamma):.8f},MSE:{MSE:.8f},MAE:{MAE:.8f},R_2:{R_2:.8f}")

# - Estimating Beta and Gamma during lockdown period

In [None]:
# Fit beta and gamma on the lockdown series; prints the optimiser result and fit metrics, and plots the curves.
train(recovered, infected, data)

# Effect of Lockdown

* Beta reduced from 0.22861345 to 0.14658033 
* R_0 reduced from 11.57259201 to 2.31412054 

# After Lockdown (1st June onwards)


Assumptions taken:

* An initial population of 750000 could have been potentially exposed to COVID-19 as of 1st June

In [None]:
# Date-indexed series for the after-lockdown fit, in the order
# active cases -> data, confirmed -> infected, cured + deaths -> recovered.
data, infected, recovered = (
    dbd_after_lockdown.set_index('Date')[column]
    for column in ('Active Cases', 'Confirmed', 'Recovered')
)

In [None]:
# Initial SIR state for the after-lockdown fit (hard-coded).
# NOTE(review): `data` holds active cases and `recovered` holds cured + deaths;
# confirm i_0 / r_0 are the matching first-day values of those series.
s_0 = 750_000   # assumed susceptible population
i_0 = 190_535   # infected at day 0
r_0 = 91_819    # removed at day 0

In [None]:
def loss(point, data, recovered, s_0, i_0, r_0):
    """Objective for the beta/gamma fit: weighted RMSE of the SIR trajectory.

    Args:
        point: ``(beta, gamma)`` pair under evaluation.
        data: observed infected counts, one value per day.
        recovered: observed removed counts, aligned with ``data``.
        s_0, i_0, r_0: initial S, I and R values. ``s_0`` is also the
            denominator of the infection term.

    Returns:
        ``0.1 * RMSE(infected) + 0.9 * RMSE(recovered)`` over ``len(data)`` days.
    """
    size = len(data)
    beta, gamma = point

    def SIR(t, y):
        # The derivatives depend on S and I only, so R is not unpacked.
        S, I = y[0], y[1]
        infection = beta * S * I / s_0
        removal = gamma * I
        return [-infection, infection - removal, removal]

    # No `vectorized=True`: SIR takes one state vector per call, and SciPy
    # documents that the flag makes the default RK45 solver slower. Dropping it
    # does not change the computed trajectory.
    solution = solve_ivp(SIR, [0, size], [s_0, i_0, r_0], t_eval=np.arange(0, size, 1))
    l1 = np.sqrt(np.mean((solution.y[1] - data)**2))
    l2 = np.sqrt(np.mean((solution.y[2] - recovered)**2))
    alpha = 0.1  # weight of the infected error; the recovered error gets 1 - alpha
    return alpha * l1 + (1 - alpha) * l2

In [None]:
# Combined date axis for the forecast plot: the dates of the observed series
# followed by 180 consecutive dates starting at df_india['Date'].max().
pres_fut = np.array([
    *data.index.values,
    *np.array(pd.date_range(df_india['Date'].max(), periods=180)),
])

In [None]:
def predict(beta, gamma, data, recovered, s_0, i_0, r_0):
    """Integrate the SIR model over the notebook-global ``pres_fut`` date axis.

    The horizon is ``len(pres_fut)``, so the trajectory extends past the
    observed data; the observed series are padded with ``None`` to that length.

    Args:
        beta, gamma: infection and removal rates.
        data: observed infected counts.
        recovered: observed removed counts.
        s_0, i_0, r_0: initial S, I and R values.

    Returns:
        ``(pres_fut, padded_infected, padded_recovered, solution)`` where
        ``solution`` is the ``solve_ivp`` result sampled once per day.
    """
    new_index = pres_fut
    horizon = len(new_index)

    def pad(values):
        # Fill the forecast part of the axis, where nothing was observed, with None.
        return np.concatenate((values, [None] * (horizon - len(values))))

    def SIR(t, y):
        S, I = y[0], y[1]
        infection = beta * S * I / s_0
        removal = gamma * I
        return [-infection, infection - removal, removal]

    solution = solve_ivp(
        SIR,
        [0, horizon],
        [s_0, i_0, r_0],
        t_eval=np.arange(0, horizon, 1),
    )
    return new_index, pad(data.values), pad(recovered.values), solution

In [None]:
def train(recovered, infected, data):
    """Fit beta and gamma to the after-lockdown series and plot the forecast.

    Minimises ``loss`` with L-BFGS-B from the start point (0.001, 0.001), prints
    the optimiser result, then calls ``predict`` with the fitted rates and plots
    the observed and modelled curves on the index that ``predict`` returns. The
    initial state ``s_0``, ``i_0``, ``r_0`` is read from the notebook's global
    variables.

    Args:
        recovered: observed removed counts indexed by date.
        infected: accepted for call compatibility; not used in the body.
        data: observed infected counts indexed by date.

    Returns:
        None. The output is the printed text and the matplotlib figure.
    """
    initial_state = (s_0, i_0, r_0)

    optimal = minimize(
        loss,
        [0.001, 0.001],
        args=(data, recovered, *initial_state),
        method='L-BFGS-B',
        bounds=[(1e-6, 0.5), (1e-8, 0.4)],  # (beta range, gamma range)
    )
    print(optimal)
    beta, gamma = optimal.x

    new_index, extended_actual, extended_recovered, prediction = predict(
        beta, gamma, data, recovered, *initial_state
    )
    df = pd.DataFrame(
        {
            'Actual Infected': extended_actual,
            'Actual Recovered': extended_recovered,
            'Susceptible': prediction.y[0],
            'Predicted Infected': prediction.y[1],
            'Predicted Recovered': prediction.y[2],
        },
        index=new_index,
    )

    _fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_title('Possible COVID19 India Scenario next 6 months after lockdown')
    df.plot(ax=ax)
    print(f"country=India, beta={beta:.8f}, gamma={gamma:.8f}, r_0:{(beta/gamma):.8f}")

In [None]:
# Fit beta and gamma on the after-lockdown series and plot the fitted curves over the extended date axis.
train(recovered, infected, data)