In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Covid-19 Analysis

# Read Data

In [None]:
# Load the three source tables: state-wise testing, country-level case counts, and vaccination.
state_df, country_df, vaccine_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid19-in-india/StatewiseTestingDetails.csv',
        '../input/covid19-in-india/covid_19_india.csv',
        '../input/covid19-in-india/covid_vaccine_statewise.csv',
    )
)

In [None]:
# Preview the first testing records reported for Tamil Nadu.
is_tamil_nadu = state_df['State'] == 'Tamil Nadu'
state_df.loc[is_tamil_nadu].head()

In [None]:
# Look up the Tamil Nadu row for 2020-04-04 in the country-level table.
on_target_date = country_df['Date'] == '2020-04-04'
in_tamil_nadu = country_df['State/UnionTerritory'] == 'Tamil Nadu'
country_df.loc[on_target_date & in_tamil_nadu]

StatewiseTestingDetails.csv and covid_19_india.csv contain similar data; the only difference I could see is that the total positive count appears under the next day's date in covid_19_india.csv.

In [None]:
# Report the first and last dates present in the country-level table.
date_column = country_df['Date']
print("Minimum date :", date_column.min())
print("Maximum date :", date_column.max())

We have records from January 2020 to 8th May 2021.

In [None]:
# Work on an independent copy so the raw country-level frame stays untouched.
df = country_df.copy(deep=True)

# Checking for Null value

In [None]:
# Number of missing values in each column.
df.isna().sum()

There are no null values to handle.

In [None]:
# Check data types of the dataframe: prints a summary of its columns, dtypes and non-null counts.
df.info()

Date, Time, State/UnionTerritory, ConfirmedIndianNational and ConfirmedForeignNational are stored as categorical (object) values.

In [None]:
# Show the first five rows of the working frame.
df.head(n=5)

In [None]:
# Clean-up of the working frame, written as one chain so the cell can be re-run safely:
#   * parse 'Date' into a datetime column,
#   * drop 'Time' (not needed for the analysis); errors='ignore' avoids a KeyError
#     when the column was already removed by a previous run of this cell,
#   * rename 'State/UnionTerritory' to 'States' for easier reference (a no-op once renamed).
df = (
    df.assign(Date=pd.to_datetime(df['Date'], format='%Y-%m-%d'))
      .drop(columns=['Time'], errors='ignore')
      .rename(columns={'State/UnionTerritory': 'States'})
)

In [None]:
# Active cases = confirmed cases that are neither cured nor dead.
df['Active_cases'] = df['Confirmed'] - df['Cured'] - df['Deaths']

In [None]:
# Latest Tamil Nadu rows, to sanity-check the derived Active_cases column.
is_tamil_nadu = df['States'] == 'Tamil Nadu'
df.loc[is_tamil_nadu].tail()

# EDA & Feature Engineering

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mtd
import seaborn as sns
from matplotlib.ticker import ScalarFormatter
# Five-colour palette shared by every chart in the notebook.
colors = [
    '#0C68C7',
    '#3A6794',
    '#00FAF3',
    '#FA643C',
    '#C71D12',
]
sns.set(style='white', palette=colors)

# Render the palette as a swatch strip for reference.
sns.palplot(colors)

In [None]:
# Per state: the highest Active_cases value and the latest Date on record, sorted by
# Active_cases (descending). The two columns are selected *before* aggregating so the
# remaining (partly non-numeric) columns are never reduced just to be thrown away.
# NOTE(review): Active_cases here is each state's all-time peak, not necessarily the
# value on the last date - confirm this matches the "as on 8th May" wording of the title.
top_10 = (
    df.groupby(by='States')[['Active_cases', 'Date']]
      .max()
      .sort_values(by=['Active_cases'], ascending=False)
      .reset_index()
)
with plt.xkcd():
    fig = plt.figure(figsize=(15, 8))
    plt.title("Top 10 highly impacted sates as on 8th May", size=20)
    ax = sns.barplot(data=top_10.iloc[:10], y='Active_cases', x='States', linewidth=2, edgecolor='black')
    # Rotate the state names without re-assigning the tick labels.
    ax.tick_params(axis='x', labelrotation=90)
    # Annotate each bar with its height, formatted as a whole number with thousands separators.
    for bar in ax.patches:
        ax.text(x=bar.get_x(), y=bar.get_height(), s=f"{bar.get_height():,.0f}")

***As of the 8th May data, Maharashtra has the highest number of active cases, followed by Karnataka, Kerala, Uttar Pradesh and Rajasthan.***

In [None]:
# Active-case trend over time for four sample states.
# The figure is created inside the xkcd context (as in the other plotting cells) so the
# sketch style applies to the whole figure, and the axes are passed to seaborn explicitly.
with plt.xkcd():
    fig, ax = plt.subplots(figsize=(15, 8))
    sample_states = df[df['States'].isin(['Kerala', 'Tamil Nadu', 'Delhi', 'Maharashtra'])]
    sns.lineplot(data=sample_states, x='Date', y='Active_cases', hue='States', ax=ax)
    ax.set_title("Active cases by States", size=20)
   
    


***Almost all states share the same trend in Covid cases. As shown in the chart above, Maharashtra has had the highest number of cases since July 2020.***

In [None]:
# Small-multiples grid: one Active_cases line chart per distinct value of 'States'.
# The number of rows is derived from the number of states (3 charts per row) instead of
# being hard-coded, so indexing the axes can never run past the end of the grid.
state_names = df['States'].unique()
n_cols = 3
n_rows = max(1, -(-len(state_names) // n_cols))  # ceiling division, at least one row

with plt.xkcd():
    fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=[15, 20], sharex=True, sharey=True, squeeze=False)
    ax = ax.flatten()

    for i, s in enumerate(state_names):
        data1 = df[df['States'] == s][['Date', 'Active_cases']]
        sns.lineplot(data=data1, x='Date', y='Active_cases', ax=ax[i])
        ax[i].set_title(s)

    # Hide any leftover axes in the last row that have no state to show.
    for spare_axis in ax[len(state_names):]:
        spare_axis.set_visible(False)

In [None]:
# Distribution of Active_cases for the four sample states, drawn as a box plot.
with plt.xkcd():
    fig, ax = plt.subplots(figsize=(15, 8))
    sample_states = df[df['States'].isin(['Kerala', 'Tamil Nadu', 'Delhi', 'Maharashtra'])]
    sns.boxplot(data=sample_states, x='Active_cases', y='States', ax=ax)
    # The title names the actual chart type and metric (it previously read "Bar plot").
    ax.set_title("Box plot of active cases for sample states", size=20)

In [None]:
# Rows for the four sample states; reused by later cells.
median_states = df[df['States'].isin(['Kerala', 'Tamil Nadu', 'Delhi', 'Maharashtra'])]
# Per-state medians with an in-cell bar on Active_cases.
# numeric_only=True restricts the median to numeric columns; without it recent pandas
# versions raise a TypeError on the object-typed columns.
median_states.groupby(by=['States']).median(numeric_only=True).style.bar(['Active_cases'])

***Observation:***
1. Maharashtra - Most of the time active cases stay in the 50000 to 190000 range, and the range went up to 35000+. The median is around 83K cases.
2. Kerala - Active cases ranging from 0 to less than 100000. Median is around 40. median value is around 24K
3. Tamil Nadu - Active cases in Tamil Nadu range below 50,000. The median is around 12K.

Is it because of a high rate of cures/deaths? Let's check.

In [None]:
# Distribution of Cured counts for the four sample states, drawn as a box plot.
with plt.xkcd():
    fig, ax = plt.subplots(figsize=(15, 8))
    sample_states = df[df['States'].isin(['Kerala', 'Tamil Nadu', 'Delhi', 'Maharashtra'])]
    sns.boxplot(data=sample_states, x='Cured', y='States', ax=ax)
    # The title names the actual chart type and metric, so this figure can be told apart
    # from the Active_cases box plot (both previously read "Bar plot for sample states").
    ax.set_title("Box plot of cured cases for sample states", size=20)

In [None]:
# Per-state medians with an in-cell bar on Cured.
# numeric_only=True restricts the median to numeric columns; without it recent pandas
# versions raise a TypeError on the object-typed columns.
median_states.groupby(by=['States']).median(numeric_only=True).style.bar(['Cured'])

***Observation:***
1. Maharashtra - has a high range of cured values as well.
2. Kerala - is in 3rd place for cured count.
3. Tamil Nadu - is in 2nd place in the sample in terms of cures.

But we can't infer the rate from raw counts alone, so we will do some statistical testing.

In [None]:
states = ['Kerala', 'Tamil Nadu', 'Maharashtra']
tn = df.loc[df['States'] == 'Tamil Nadu']
kl = df.loc[df['States'] == 'Kerala']
mh = df.loc[df['States'] == 'Maharashtra']

# One grid column per state (title, rows, line colour); one grid row per metric.
panels = [
    ("Tamil Nadu", tn, colors[1]),
    ("Kerala", kl, colors[2]),
    ("Maharashtra", mh, colors[3]),
]
metrics = ['Active_cases', 'Cured', 'Deaths']

with plt.xkcd():

    fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(15,10), squeeze=False, sharex=True, sharey=False, constrained_layout=True )
    plt.suptitle("Comparison of Active, Cured & Deaths for 3 States")
    for col, (title, frame, shade) in enumerate(panels):
        for row, metric in enumerate(metrics):
            sns.lineplot(data=frame, x='Date', y=metric, ax=ax[row, col], color=shade)
            # Only the top chart of each column carries the state name.
            if row == 0:
                ax[row, col].set_title(title)

In [None]:
states = ['Kerala', 'Tamil Nadu', 'Maharashtra']
tn = df.loc[df['States'] == 'Tamil Nadu']
kl = df.loc[df['States'] == 'Kerala']
mh = df.loc[df['States'] == 'Maharashtra']

# One grid column per state (title, rows, fill colour); one grid row per metric.
panels = [
    ("Tamil Nadu", tn, colors[1]),
    ("Kerala", kl, colors[2]),
    ("Maharashtra", mh, colors[3]),
]
metrics = ['Active_cases', 'Cured', 'Deaths']

with plt.xkcd():

    fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(15,10), squeeze=False, sharex=False, sharey=False, constrained_layout=True )
    plt.suptitle("Distribution of Active, Cured & Deaths for 3 States")
    for col, (title, frame, shade) in enumerate(panels):
        for row, metric in enumerate(metrics):
            sns.kdeplot(data=frame[metric], ax=ax[row, col], color=shade, fill=True)
            # Only the top chart of each column carries the state name.
            if row == 0:
                ax[row, col].set_title(title)

# Statistical testing

## Cure rate Testing

In [None]:
# Samples for the tests below: the 'Cured' column of Tamil Nadu, Maharashtra and Kerala.
tn, mh, kl = (
    df.loc[df['States'] == state_name, 'Cured']
    for state_name in ('Tamil Nadu', 'Maharashtra', 'Kerala')
)

from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest


## Null Hypothesis
H0 - there is no significant difference between the mean cure rates of the states
H1 - there is a significant difference between the mean cure rates of the states

Significance level: 0.05

In [None]:
# Pairwise independent two-sample t-tests on the 'Cured' samples, judged at the 0.05 level.
cured_samples = {'Tamil Nadu': tn, 'Kerala': kl, 'Maharashtra': mh}
state_pairs = [
    ('Tamil Nadu', 'Kerala'),
    ('Tamil Nadu', 'Maharashtra'),
    ('Kerala', 'Maharashtra'),
]

for first, second in state_pairs:
    st, p_value = ttest_ind(cured_samples[first], cured_samples[second])
    if p_value < 0.05:
        message = "Both states {} & {} have significant difference in Cure rate"
    else:
        message = "Both states {} & {} have no significant difference in Cure rate"
    print(message.format(first, second))

### But is the difference because of high active cases? Let us do another test

## Proportion Test

In [None]:
# Largest 'Cured' value on record for each sample state.
tn_cured = df.loc[df['States'] == 'Tamil Nadu', 'Cured'].max()
mh_cured = df.loc[df['States'] == 'Maharashtra', 'Cured'].max()
kl_cured = df.loc[df['States'] == 'Kerala', 'Cured'].max()

# Largest 'Confirmed' value on record for each sample state
# (the *_active names hold confirmed counts, not the Active_cases column).
tn_active = df.loc[df['States'] == 'Tamil Nadu', 'Confirmed'].max()
mh_active = df.loc[df['States'] == 'Maharashtra', 'Confirmed'].max()
kl_active = df.loc[df['States'] == 'Kerala', 'Confirmed'].max()

In [None]:
# Print, for each pair of states, the raw [cured] / [confirmed] counts followed by the
# cured share of confirmed cases. The ':.2%' format multiplies the fraction by 100, so the
# printed number really is a percentage (previously the 0-1 fraction was shown with a '%').
print([tn_cured, mh_cured] , [tn_active, mh_active])
print(f' Proportion of cured cases in Tamil Nadu, Maharastra = {tn_cured/tn_active:.2%}, {mh_cured/mh_active:.2%} respectively \n')

print([tn_cured, kl_cured] , [tn_active, kl_active])
print(f' Proportion of cured cases in Tamil Nadu, Kerala = {tn_cured/tn_active:.2%}, {kl_cured/kl_active:.2%} respectively \n')

# This pair is Maharashtra vs Kerala; the label now matches the values being printed.
print([mh_cured, kl_cured] , [mh_active, kl_active])
print(f' Proportion of cured cases in Maharastra, Kerala = {mh_cured/mh_active:.2%}, {kl_cured/kl_active:.2%} respectively \n')

In [None]:
# Pairwise two-sample proportion z-tests (cured count out of confirmed count), judged at the 0.05 level.
cured_counts = {'Tamil Nadu': tn_cured, 'Maharashtra': mh_cured, 'Kerala': kl_cured}
confirmed_counts = {'Tamil Nadu': tn_active, 'Maharashtra': mh_active, 'Kerala': kl_active}
state_pairs = [
    ('Tamil Nadu', 'Maharashtra'),
    ('Tamil Nadu', 'Kerala'),
    ('Kerala', 'Maharashtra'),
]

for first, second in state_pairs:
    stat, p_value = proportions_ztest(
        [cured_counts[first], cured_counts[second]],
        [confirmed_counts[first], confirmed_counts[second]],
    )
    if p_value < 0.05:
        message = "Both states {} & {} have significant difference in Cure rate"
    else:
        message = "Both states {} & {} have no significant difference in Cure rate"
    print(message.format(first, second))


In [None]:
from scipy.stats import norm
# Density of 'Cured' for the three sample states, with a dashed vertical line at each state's mean.
with plt.xkcd():
    fig = plt.figure(figsize=(15,8))
    plt.title("Distrubuption of Cure rate from ")
    three_states = median_states[median_states['States'].isin(['Tamil Nadu','Maharashtra','Kerala'])]
    ax = sns.kdeplot(data=three_states, x='Cured', hue='States', fill=True)
    # zip stops after the three states, so the first three palette colours are used in order.
    for state_name, line_colour in zip(('Kerala', 'Tamil Nadu', 'Maharashtra'), colors):
        state_mean = median_states[median_states['States'] == state_name]['Cured'].mean()
        ax.axvline(state_mean, ls='--', c=line_colour)
    

In [None]:
# Death proportion per state: largest 'Deaths' value on record over largest 'Confirmed' value.
# Dedicated *_deaths names are used so the *_cured values computed by the cure-rate cells
# are not silently overwritten with death counts.
tn_deaths = df[df['States']=='Tamil Nadu']['Deaths'].max()
mh_deaths = df[df['States']=='Maharashtra']['Deaths'].max()
kl_deaths = df[df['States']=='Kerala']['Deaths'].max()

# The *_active names hold confirmed counts (same values as in the cure-rate cells).
tn_active = df[df['States']=='Tamil Nadu']['Confirmed'].max()
mh_active = df[df['States']=='Maharashtra']['Confirmed'].max()
kl_active = df[df['States']=='Kerala']['Confirmed'].max()

# ':.2%' multiplies the fraction by 100, so the printed number really is a percentage.
print([tn_deaths, mh_deaths] , [tn_active, mh_active])
print(f' Proportion of Death cases in Tamil Nadu, Maharastra = {tn_deaths/tn_active:.2%}, {mh_deaths/mh_active:.2%} respectively \n')

print([tn_deaths, kl_deaths] , [tn_active, kl_active])
print(f' Proportion of Death cases in Tamil Nadu, Kerala = {tn_deaths/tn_active:.2%}, {kl_deaths/kl_active:.2%} respectively \n')

# This pair is Maharashtra vs Kerala; the label matches the values being printed.
print([mh_deaths, kl_deaths] , [mh_active, kl_active])
print(f' Proportion of Death cases in Maharastra, Kerala = {mh_deaths/mh_active:.2%}, {kl_deaths/kl_active:.2%} respectively \n')

# Pairwise two-sample proportion z-tests (deaths out of confirmed), judged at the 0.05 level.
death_counts = {'Tamil Nadu': tn_deaths, 'Maharashtra': mh_deaths, 'Kerala': kl_deaths}
confirmed_counts = {'Tamil Nadu': tn_active, 'Maharashtra': mh_active, 'Kerala': kl_active}

for first, second in [('Tamil Nadu', 'Maharashtra'), ('Tamil Nadu', 'Kerala'), ('Kerala', 'Maharashtra')]:
    stat, p_value = proportions_ztest(
        [death_counts[first], death_counts[second]],
        [confirmed_counts[first], confirmed_counts[second]],
    )
    if p_value < 0.05:
        print("Both states {} & {} have significant difference in Death rate".format(first, second))
    else:
        print("Both states {} & {} have no significant difference in Death rate".format(first, second))


In [None]:
from scipy.stats import norm
# Density of 'Deaths' for the three sample states, with a dashed vertical line at each state's mean.
with plt.xkcd():
    fig = plt.figure(figsize=(15,8))
    three_states = median_states[median_states['States'].isin(['Tamil Nadu','Maharashtra','Kerala'])]
    ax = sns.kdeplot(data=three_states, x='Deaths', hue='States', fill=True)
    # zip stops after the three states, so the first three palette colours are used in order.
    for state_name, line_colour in zip(('Kerala', 'Tamil Nadu', 'Maharashtra'), colors):
        state_mean = median_states[median_states['States'] == state_name]['Deaths'].mean()
        ax.axvline(state_mean, ls='--', c=line_colour)
    

***All 3 states - Kerala, Tamil Nadu & Maharashtra - have a significant difference in cure rate & death rate relative to their overall case counts***

In [None]:
# Share of active cases per state on the most recent date in the data.
# Values and labels come from the SAME filtered rows, so every wedge is paired with the
# right state (previously they were filtered by two different hard-coded dates).
# Only positive counts are kept because a pie chart cannot draw zero or negative wedges.
latest_date = df['Date'].max()
latest_snapshot = df[(df['Date'] == latest_date) & (df['Active_cases'] > 0)]

with plt.xkcd():
    fig = plt.figure(figsize=(7,7))
    # Raw counts are passed and matplotlib normalises them to fractions (the default);
    # normalize=False would raise a ValueError because the counts sum to more than 1.
    plt.pie(x=latest_snapshot['Active_cases'], labels=latest_snapshot['States'], autopct='%.2f', radius=1.5);
    # The date in the title is taken from the data rather than hard-coded.
    plt.suptitle(f"% of active cases in India by States as on {latest_date:%d %b %Y}", size=20)

# Let's take Tamil Nadu as a sample state for prediction

In [None]:
# Daily new cases for Tamil Nadu.
# .copy() gives an independent frame, so adding a column below does not write into a
# slice of df (which triggers pandas' SettingWithCopyWarning).
tn_count = df[df['States']=='Tamil Nadu'].copy()

# Daily = row-over-row change in the cumulative 'Confirmed' count. The first row has no
# predecessor, so its difference is filled with 0; the result is cast back to integers.
# NOTE(review): assumes the rows are already in chronological order - confirm.
tn_count['Daily'] = tn_count['Confirmed'].diff().fillna(0).astype(int)

In [None]:
from fbprophet import Prophet
# Training frame in the two-column layout used for fitting: 'ds' (date) and 'y' (daily cases).
ds = pd.DataFrame({'ds': tn_count['Date'], 'y': tn_count['Daily']})

# Fit the model and predict over the history plus 20 additional periods.
model = Prophet(changepoint_range=1)
model.fit(ds)
future = model.make_future_dataframe(periods=20)
f = model.predict(future)

# Make sure the forecast dates are datetimes for the comparisons and plotting below.
f['ds'] = pd.to_datetime(f['ds'], format='%Y-%m-%d')

In [None]:

# Forecast row for 11 May 2021 (empty if that date is not part of the forecast).
today = f[f['ds']=='2021-05-11 00:00:00']

# Shaded lockdown window, as real timestamps so it is placed correctly on the date axis.
lockdown_start = pd.Timestamp('2021-05-10 00:00:00')
lockdown_end = pd.Timestamp('2021-05-24 00:00:00')

with plt.xkcd():
    fig, ax = plt.subplots(figsize=(15,8))

    # Predicted (red) and actual daily cases on the same axes, shown from January 2021 onwards.
    sns.lineplot(data=f, x='ds', y='yhat', alpha=1, color='red', ax=ax)
    sns.lineplot(data=tn_count, x='Date', y='Daily', ax=ax)
    ax.set_xlim(mtd.datestr2num('2021-01-01'))

    # Label the last forecast point with its rounded value and date.
    ax.annotate(text=str(np.round(f['yhat'].iloc[-1],0))+ "\n"+ str(f['ds'].iloc[-1]), xy=(f['ds'].iloc[-1],f['yhat'].iloc[-1]))
    # Label the 11 May point only when it exists; indexing an empty selection would raise IndexError.
    if not today.empty:
        ax.annotate(text=str(np.round(today['yhat'].iloc[-1],0))+ "\n"+ str(today['ds'].iloc[-1]), xy=(today['ds'].iloc[-1],today['yhat'].iloc[-1]))

    ax.annotate(text="2021 Lock down in TamilNadu", xy=(0.70,0.25), xycoords='figure fraction', xytext=(0.60, 0.15),arrowprops={'arrowstyle':'-|>','color':'black'})
    ax.axvspan(xmin=lockdown_start, xmax=lockdown_end, alpha=0.2, facecolor='y')

    ax.legend(labels=['Predicted','Actual'], loc='upper left')
    
    
    
    



**We expect the lockdown to bring down the rising case rate. Also, the effect of the lockdown can be seen towards the end of the lockdown period.**