## Overview

This notebook performs exploratory data analysis on India's Covid-19 Situation.


## Importing Libraries and Dependencies


In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno

# for visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
from matplotlib import cm
import geopandas as gpd
import plotly.express as px
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)    
import folium as flm
from folium import plugins
import ipywidgets
import geopy
import geoplot as gplt
import geoplot.crs as gcrs
import plotly.graph_objects as go

from IPython.display import HTML,display
import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

## Input Datasets

The input directories and dataset structure are as follows:

1. `covid19-in-india`  directory contains : 
    * `covid_19_india.csv` file contains the COVID-19 cases at daily level
    * `StatewiseTestingDetails.csv` file contains the Statewise testing details
    * `covid_vaccine_statewise.csv` file contains data on the vaccination situation 


2. `geojson` contains `india_states.geojson` json file which contains India state borders along with their non-spatial attributes (id, name, etc) for use in Plotly Choropleth maps

3. `india-states` directory contains files in various format for geometric information of the geospatial representation of the states of India. We will use the `Indian_States.shp` shape file.

## Reading in the Daily Cases Dataset

In [None]:
# Load the main cases dataset and discard its "Sno" column in one step.
cases_csv = '../input/covid19-in-india/covid_19_india.csv'
df1 = pd.read_csv(cases_csv).drop(columns="Sno")

# Remove the rows labelled 15086 through 15113, which are all null.
df1 = df1.drop(index=range(15086, 15114))

df1.tail(10)


In [None]:
# Treat the "-" placeholder as a missing value, then chart where values are absent.
df1 = df1.replace(to_replace="-", value=np.nan)
missingno.matrix(df1)

From the above missing-values matrix we see that most of the values in the columns `ConfirmedIndianNational` and `ConfirmedForeignNational` are missing. Thus, it's better to drop them.

In [None]:
# Restrict the frame to the columns used in the rest of the analysis.
kept_columns = ['Date', 'State/UnionTerritory', 'Cured', 'Deaths', 'Confirmed']
df1 = df1[kept_columns]

df1.tail(10)

In [None]:
# One row per state for the state-level maps.
# df1 holds a row per state per date (the notebook later derives daily figures
# with .diff(), i.e. the counts are running totals), so selecting columns alone
# would hand the maps many rows for each state. Keep only the last row seen for
# each state instead.
# NOTE(review): "last row" is the latest date only if the CSV rows are in
# chronological order — confirm against the data.
df1_1 = (
    df1.drop_duplicates(subset='State/UnionTerritory', keep='last')
       .loc[:, ['State/UnionTerritory', 'Cured', 'Deaths', 'Confirmed']]
)

### Reading in the GIS info shape file


In [None]:
# Shapefile with the state boundary geometries.
shapefile_path = r'../input/india-states/Igismap/Indian_States.shp'
map_df = gpd.read_file(shapefile_path)
map_df.head()

Merging the dataset file with the shape file

In [None]:
# The shapefile spells a few state names differently from the datasets, and a
# name that does not match gets no case data in the join below. Apply the same
# renames the notebook uses for the vaccination join, on a copy so that map_df
# itself is left untouched here.
# NOTE(review): assumes the case data uses the same spellings as the
# vaccination data — verify against df1_1['State/UnionTerritory'].
STATE_NAME_FIXES = {
    'NTC of Delhi': 'Delhi',
    'Jammu & Kashmir': 'Jammu and Kashmir',
    'Arunanchal Pradesh': 'Arunachal Pradesh',
    'Andaman & Nicobar Island': 'Andaman and Nicobar Islands',
}
states_geo = map_df.assign(st_nm=map_df['st_nm'].replace(STATE_NAME_FIXES))

# Left join: every state geometry is kept, with case counts attached by name.
merged = states_geo.set_index('st_nm').join(df1_1.set_index('State/UnionTerritory'))
merged.head()

### Visualizing total cases in India statewise on the map using geoplot

In [None]:
# Shade each state by its confirmed-case count.
confirmed_map_style = dict(edgecolor='black', linewidth=1, cmap='Blues', legend=True)
gplt.choropleth(
    merged,
    hue='Confirmed',
    projection=gcrs.AlbersEqualArea(),
    **confirmed_map_style
)


From the above plot, it is evident that Maharashtra has the highest number of confirmed cases.

The north-eastern states on the other hand have very few cases.

### Visualising total cases in India statewise on the map using folium

In [None]:
import json

# shapefiles can be converted to geojson with QGIS
# Read the file as UTF-8 explicitly instead of relying on the platform's
# default text encoding, which can mis-decode non-ASCII names.
with open(r'../input/geojson/india_states.geojson', encoding='utf-8') as geojson_file:
    geojson_counties = json.load(geojson_file)

In [None]:
# Copy each feature's NAME_1 property to a top-level "id" so the choropleth
# can match it through key_on='feature.id'.
for feature in geojson_counties['features']:
    feature['id'] = feature['properties']['NAME_1']

map1 = flm.Map(location=[20.5937, 78.9629], zoom_start=4)

# see folium.Choropleth? for details on key_on
confirmed_layer = flm.Choropleth(
    geo_data=geojson_counties,
    name='choropleth',
    data=df1_1,
    columns=['State/UnionTerritory', 'Confirmed'],
    key_on='feature.id',
    fill_color='YlGn',
    fill_opacity=0.5,
    line_opacity=0.5,
)
confirmed_layer.add_to(map1)

# layer control to turn choropleth on or off
flm.LayerControl().add_to(map1)

# display map
map1

### Visualising total cured cases in India statewise on the map

In [None]:
# Shade each state by its cured count.
cured_map_style = dict(edgecolor='black', linewidth=1, cmap='Greens', legend=True)
gplt.choropleth(
    merged,
    hue='Cured',
    projection=gcrs.AlbersEqualArea(),
    **cured_map_style
)

### Visualising total deaths in India statewise on the map

In [None]:
# Shade each state by its death count.
deaths_map_style = dict(edgecolor='black', linewidth=1, cmap='Reds', legend=True)
gplt.choropleth(
    merged,
    hue='Deaths',
    projection=gcrs.AlbersEqualArea(),
    **deaths_map_style
)

### Datewise Visualizing total cases, cured and deaths

In [None]:
# Sum the state rows into one national row per date.
# Several columns must be selected with a list: the bare comma-separated form
# (a tuple) was deprecated and is rejected by pandas 2.0 and later.
df1_2 = df1.groupby(['Date'])[['Confirmed', 'Deaths', 'Cured']].sum().reset_index()

In [None]:
# Daily new figures: difference between consecutive dates of the national
# per-date totals. All three columns are derived from df1_2 itself; taking the
# confirmed diff from the per-state frame df1_1 would subtract unrelated rows
# and misalign on the index.
df1_2['new_confirmed'] = df1_2.Confirmed.diff()
df1_2['new_deaths'] = df1_2.Deaths.diff()
df1_2['new_cured'] = df1_2.Cured.diff()

In [None]:
# Stacked bars of the recovered, death and confirmed totals for each date.
# NOTE(review): if Confirmed already includes Cured and Deaths, stacking all
# three double-counts them — consider barmode='group'.
fig = go.Figure(go.Bar(x=df1_2.Date, y=df1_2.Cured, name='Recovered'))
fig.add_trace(go.Bar(x=df1_2.Date, y=df1_2.Deaths, name='Deaths'))
fig.add_trace(go.Bar(x=df1_2.Date, y=df1_2.Confirmed, name='Confirmed'))

# No categoryorder here: this is a time series, and sorting the x axis by bar
# total would break chronological order whenever the dates are treated as
# categories. The title's bold tag is now closed properly (</b>).
fig.update_layout(
    barmode='stack',
    legend_orientation="h",
    legend=dict(x=0.3, y=1.0),
    title_text='<b>Covid-19 Total cases in India (March 2020 - May 2021)</b>',
    title_x=0.5,
    width=1100,
    height=1000,
)
fig.update_xaxes(tickfont=dict(color='black', size=14))
fig.update_yaxes(tickfont=dict(color='black', size=14))
fig.show()

From the above plot, we realise that the deaths are significantly smaller than the recovered cases so let us plot them separately and analyse the data.

In [None]:
# Bar chart of the daily new deaths (day-over-day difference of the death total).
# plotly.graph_objects is already imported as `go` with the other imports at
# the top of the notebook, so it is not imported again here.
fig = go.Figure([go.Bar(x=df1_2.Date, y=df1_2.new_deaths, marker_color='red')])
fig.update_xaxes(tickfont=dict(color='black', size=14))
fig.update_yaxes(tickfont=dict(color='black', size=14))

# The title's bold tag is closed properly (</b>).
fig.update_layout(
    title_text='<b>Covid-19 Total Daily Deaths in India (March 2020 - May 2021)</b>',
    title_x=0.5,
    width=1100,
    height=800,
)

fig.show()

From the above plot, we can conclude that 
* no. of daily death cases gradually increased from March 2020 to mid-September 2020
* there was a sudden spike in the death count on 17 June 2020
* the first wave peaked in September 2020 and the daily death cases started decreasing thereafter until March 2021
* second wave hit around March 2021 and led to exponential increase in deaths up until now

## Testing done in India EDA

### Reading in Statewise Testing Details

In [None]:
# Load the state-wise testing details.
testing_csv = '../input/covid19-in-india/StatewiseTestingDetails.csv'
df2 = pd.read_csv(testing_csv)

df2.head(10)

In [None]:
# Treat the "-" placeholder as a missing value, then chart where values are absent.
df2 = df2.replace(to_replace="-", value=np.nan)
missingno.matrix(df2)

From the above matrix, we see that a lot of data for positive and negative test results is missing, so we drop those columns for now.

In [None]:
# Keep just the date, the state and the sample count.
testing_columns = ['Date', 'State', 'TotalSamples']
df2_1 = df2[testing_columns]

df2_1.tail(10)

In [None]:
# df2_1 has one row per state per date, and px.bar stacks every row that shares
# a state, so plotting it directly adds up all the dated values for each state.
# Reduce to a single figure per state first.
# NOTE(review): max() assumes TotalSamples is a running (cumulative) total —
# confirm against the dataset description.
samples_by_state = df2_1.groupby('State', as_index=False)['TotalSamples'].max()

fig = px.bar(samples_by_state, x="TotalSamples", y="State", orientation='h')
fig.update_xaxes(tickfont=dict(color='black', size=14))
fig.update_yaxes(tickfont=dict(color='black', size=14))
# The title's bold tag is closed properly (</b>).
fig.update_layout(
    title_text='<b>Number of Samples Tested</b>',
    title_x=0.5,
    autosize=False,
    width=1100,
    height=1000,
)
fig.update_traces(marker_color='red')
fig.show()

Thus, states like UP, Bihar, Tamil Nadu and Maharashtra have higher total samples than the rest.

## Vaccination details EDA

In [None]:
# Load the state-wise vaccination dataset and remove the row labelled 4588.
vaccine_csv = '../input/covid19-in-india/covid_vaccine_statewise.csv'
df3 = pd.read_csv(vaccine_csv).drop(index=4588)
df3.tail(10)

In [None]:
# Treat the "-" placeholder as a missing value, then chart where values are absent.
df3 = df3.replace(to_replace="-", value=np.nan)
missingno.matrix(df3)

From the above matrix, we see that a lot of data in the age-group columns is missing, so we drop those columns for now.

In [None]:
# Drop the AEFI column and the four age-group columns.
dropped_columns = [
    "AEFI",
    "18-30 years (Age)",
    "30-45 years (Age)",
    "45-60 years (Age)",
    "60+ years (Age)",
]
df3_1 = df3.drop(columns=dropped_columns)

df3_1.tail()

From the above dataframe we now make a new dataframe `data` and take only the last updated data i.e. on 18/05/2021 for each state.

In [None]:
# Keep only the rows whose "Updated On" value is 18/05/2021.
is_latest_update = df3_1['Updated On'] == '18/05/2021'
data = df3_1.loc[is_latest_update]
data.shape

In [None]:
# Show the final 37 rows of the filtered frame.
rows_to_show = 37
data.tail(rows_to_show)

Replacing a few state and union territories names in the GIS info dataframe to match exactly to the way they have been described in the dataframe `data`

In [None]:
# Map the shapefile's spelling of each name to the spelling used in `data`.
gis_to_data_names = {
    'NTC of Delhi': 'Delhi',
    'Jammu & Kashmir': 'Jammu and Kashmir',
    'Arunanchal Pradesh': 'Arunachal Pradesh',
    'Andaman & Nicobar Island': 'Andaman and Nicobar Islands',
}
map_df['st_nm'] = map_df['st_nm'].replace(gis_to_data_names)

In [None]:
# Attach the vaccination figures to the state geometries by state name.
vaccination_by_state = data.set_index('State')
merged2 = map_df.set_index('st_nm').join(vaccination_by_state)

In [None]:
# Shade each state by its total number of individuals vaccinated.
vaccinated_map_style = dict(edgecolor='black', linewidth=1, cmap='Greens', legend=True)
gplt.choropleth(
    merged2,
    hue='Total Individuals Vaccinated',
    projection=gcrs.AlbersEqualArea(),
    **vaccinated_map_style
)


From the above map, we conclude that the number of people vaccinated is highest in Maharashtra, followed by states like UP and Rajasthan.

### Visualizing Total, First, Second Doses Administered (Statewise)

In [None]:
# Dose totals per state on the latest update date.
# Several columns must be selected with a list: the bare comma-separated form
# (a tuple) was deprecated and is rejected by pandas 2.0 and later.
dose_columns = [
    'Total Doses Administered',
    'First Dose Administered',
    'Second Dose Administered',
]
df3_2 = data.groupby(['State'])[dose_columns].sum().reset_index()

# Position 13 is the nation-wide "India" row (the next cell drops it as such).
# NOTE(review): a positional lookup breaks if the set of states changes.
display(df3_2.iloc[13])


Thus, the total number of doses administered in the entire country as of Mid-May 2021 are about 18.5 crore out of which approximately 14.4 crore are First Doses while 4 crore are Second Doses.

Since India has a population of about 130 crore, 18.5 crore is a small share even though it is a large number in itself. Moreover, only 4 crore, i.e. about 3.1% of the population, has been fully vaccinated (Second Dose Administered).

In [None]:
# Remove the row labelled 13, which holds India rather than a state.
df3_2 = df3_2.drop(index=13)
df3_2

In [None]:
# Short column names so the traces below can use attribute access.
df3_2.columns = ['State', 'TotalDose', 'FirstDose', 'SecondDose']

# Stacked bars of first, second and total doses for each state.
fig = go.Figure(go.Bar(x=df3_2.State, y=df3_2.FirstDose, name='First Dose'))
fig.add_trace(go.Bar(x=df3_2.State, y=df3_2.SecondDose, name='Second Dose'))
fig.add_trace(go.Bar(x=df3_2.State, y=df3_2.TotalDose, name='Total Doses'))

# States are ordered by stacked bar height, tallest first.
# The title's bold tag is closed properly (</b>).
fig.update_layout(
    barmode='stack',
    legend_orientation="h",
    legend=dict(x=0.3, y=1.0),
    xaxis={'categoryorder': 'total descending'},
    title_text='<b>Covid-19 Total Vaccinations in India as of 18 May 2021</b>',
    title_x=0.5,
    width=1100,
    height=1000,
)
fig.update_xaxes(tickfont=dict(color='black', size=14))
fig.update_yaxes(tickfont=dict(color='black', size=14))
fig.show()

### Visualizing Vaccination w.r.t. Sex

In [None]:
# Vaccinated individuals per state, broken down by sex, on the latest update date.
# Several columns must be selected with a list: the bare comma-separated form
# (a tuple) was deprecated and is rejected by pandas 2.0 and later.
sex_columns = [
    'Male(Individuals Vaccinated)',
    'Female(Individuals Vaccinated)',
    'Transgender(Individuals Vaccinated)',
    'Total Individuals Vaccinated',
]
df3_3 = data.groupby(['State'])[sex_columns].sum().reset_index()

# Position 13 is the nation-wide "India" row (the next cell drops it as such).
# NOTE(review): a positional lookup breaks if the set of states changes.
display(df3_3.iloc[13])


In [None]:
# Remove the row labelled 13, which holds India rather than a state.
df3_3 = df3_3.drop(index=13)
df3_3

In [None]:
# Short column names so the traces below can use attribute access.
df3_3.columns = ['State', 'Male', 'Female', 'Transgender', 'Total']

# Stacked bars of male, female, transgender and total individuals for each state.
fig = go.Figure(go.Bar(x=df3_3.State, y=df3_3.Male, name='Male Individuals'))
fig.add_trace(go.Bar(x=df3_3.State, y=df3_3.Female, name='Female Individuals'))
fig.add_trace(go.Bar(x=df3_3.State, y=df3_3.Transgender, name='Transgender Individuals'))
fig.add_trace(go.Bar(x=df3_3.State, y=df3_3.Total, name='Total Individuals'))

# States are ordered by stacked bar height, tallest first.
# The title's bold tag is closed properly (</b>).
fig.update_layout(
    barmode='stack',
    legend_orientation="h",
    legend=dict(x=0.3, y=1.0),
    xaxis={'categoryorder': 'total descending'},
    title_text='<b>Covid-19 Total Vaccinations in India according to Sex (as of 18 May 2021)</b>',
    title_x=0.5,
    width=1100,
    height=1000,
)
fig.update_xaxes(tickfont=dict(color='black', size=14))
fig.update_yaxes(tickfont=dict(color='black', size=14))
fig.show()

### Visualizing Vaccination w.r.t. type of Vaccine

In [None]:
# Doses per state by vaccine type on the latest update date.
# Several columns must be selected with a list: the bare comma-separated form
# (a tuple) was deprecated and is rejected by pandas 2.0 and later.
vaccine_columns = [
    'Total Covaxin Administered',
    'Total CoviShield Administered',
    'Total Doses Administered',
]
df3_4 = data.groupby(['State'])[vaccine_columns].sum().reset_index()

# Position 13 is the nation-wide "India" row (the next cell drops it as such).
# NOTE(review): a positional lookup breaks if the set of states changes.
display(df3_4.iloc[13])


From the above data, we conclude that out of the 18.5 crore doses given, about 16.5 crore (approximately 89%) were CoviShield, which is far more than the roughly 2 crore (about 10%) that were Covaxin.

In [None]:
# Remove the row labelled 13, which holds India rather than a state.
df3_4 = df3_4.drop(index=13)
df3_4

In [None]:
# Short column names so the traces below can use attribute access.
df3_4.columns = ['State', 'Covaxin', 'CoviShield', 'Total']

# Stacked bars of Covaxin, CoviShield and total doses for each state.
fig = go.Figure(go.Bar(x=df3_4.State, y=df3_4.Covaxin, name='Covaxin Administered'))
fig.add_trace(go.Bar(x=df3_4.State, y=df3_4.CoviShield, name='CoviShield Administered'))
fig.add_trace(go.Bar(x=df3_4.State, y=df3_4.Total, name='Total Vaccine Administered'))

# States are ordered by stacked bar height, tallest first.
# The title's bold tag is closed properly (</b>).
fig.update_layout(
    barmode='stack',
    legend_orientation="h",
    legend=dict(x=0.3, y=1.0),
    xaxis={'categoryorder': 'total descending'},
    title_text='<b>Covid-19 Total Vaccinations in India according to type of vaccine(as of 18 May 2021)</b>',
    title_x=0.5,
    width=1100,
    height=1000,
)
fig.update_xaxes(tickfont=dict(color='black', size=14))
fig.update_yaxes(tickfont=dict(color='black', size=14))
fig.show()