In [None]:
!pip install tabula-py

In [None]:
!pip install dash

In [None]:

import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import seaborn as sns
import matplotlib.pyplot as plt

import requests

import ast

import warnings
warnings.filterwarnings('ignore')

import glob
from tabula.io import read_pdf

### Introduction

On 22nd January 2021, PM Modi declared India's victory against the deadly COVID pandemic. The first wave came, the curve flattened, and people started walking out on the streets with all precautions thrown to the wind. Then came March 2021, almost a year after the COVID pandemic started, and with it a rise in COVID cases across the country - this time because of a double/triple mutant. Unlike the first wave, this one was more infectious, infecting the young as well as children. Soon hospital beds were full and there was a shortage of oxygen and ventilators.

This notebook analyses the second wave in India. For this analysis, we will be using the Google Mobility Data. This Mobility Data is anonymised and gives an aggregate measure of how people's movements have changed over time since the beginning of the pandemic.

## Load the Data

1. **Download the State wise data of India from the covid19india api and fill in the missing dates**

2. **Download the Population, GDP and Area Data**

3. **Read the Mobility Data**

Merge all the Datasets 

In [None]:
### Downloading the Statewise COVID Data for India
statewise_data_url="https://api.covid19india.org/csv/latest/states.csv"
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is the default
# behaviour of `parse_dates`), so it is no longer passed.
india_statewise_data=pd.read_csv(statewise_data_url,parse_dates=['Date'])
india_statewise_data.head()

The COVID data contains rows for India as a whole as well as for the individual states in the same table. Let us separate them into two datasets.

In [None]:
def handleMissingDates(data,min_date,max_date,date_col="Date"):
    """
    Put `data` on a complete daily calendar between `min_date` and `max_date`.

    Days absent from `data` take the values of the closest earlier day (forward fill);
    days before the first available record have nothing to copy from and are set to 0.

    Parameters:
        data: frame with one row per date in `date_col`.
        min_date, max_date: inclusive bounds of the daily calendar.
        date_col: name of the date column.

    Returns:
        A new frame with a fresh 0..n-1 index, one row per calendar day, and
        `date_col` as the last column.
    """
    calendar = pd.date_range(min_date,max_date, freq='D')
    # Re-index on the full calendar, then carry the previous day's values forward.
    filled = data.set_index(date_col,drop=True).reindex(calendar).ffill()
    filled[date_col] = filled.index
    filled = filled.reset_index(drop=True)
    # Whatever is still missing precedes the first record for this frame.
    return filled.fillna(0)

    

    
    
    

In [None]:
min_date=india_statewise_data['Date'].min()
max_date=india_statewise_data['Date'].max()

## Group by State and fill the Missing Values.
# Every state is put on the same daily calendar, so each group ends up with one row per day.
india_statewise_data=india_statewise_data.groupby("State").apply(lambda x:handleMissingDates(x,min_date,max_date))
# The 'State' column inside each group holds 0 for days before that state's first record
# (those rows are created by the re-index and then filled with 0), so it is discarded and
# rebuilt from the group key stored in index level 0. errors='ignore' keeps this working on
# pandas versions where groupby.apply does not hand the grouping column to the function.
india_statewise_data=india_statewise_data.drop(columns=['State'],errors='ignore')
india_statewise_data=india_statewise_data.reset_index(level=0).reset_index(drop=True)


## Separate the India overall and state wise data

covid_india_data=india_statewise_data[india_statewise_data['State']=="India"]
india_statewise_data=india_statewise_data[india_statewise_data['State']!="India"]

In [None]:
### Getting Statewise GDP in Billion Dollars
gdp_url="https://statisticstimes.com/m/economy/json/india/gsdp-dollar.json"
response=requests.get(gdp_url,timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
try:
    gdp_records=response.json()
except ValueError:
    # Fall back to the literal parser in case the payload is a Python-style literal
    # rather than strict JSON.
    gdp_records=ast.literal_eval(response.text)
state_gdp=pd.DataFrame(gdp_records)
state_gdp=state_gdp.rename(columns={'name':'State','dollar':'gdp_billion_dollars','rank':'Rank by GDP'})
# Spell out "&" so the state names line up with the other tables when merging on State.
state_gdp['State']=state_gdp['State'].str.replace("&","and",regex=False)
state_gdp.head()

In [None]:
## Getting Statewise Population
# Extract the table on page 1 of the UIDAI Aadhaar saturation PDF; read_pdf returns a list of tables.
table = read_pdf("https://uidai.gov.in/images/state-wise-aadhaar-saturation.pdf",pages=1)
india_state_population=table[0]


# Keep only real state rows: drop rows without a serial number, the "Total" row and rows
# without a saturation value. The '\r' in the column names is part of the extracted headers.
india_state_population=india_state_population[pd.notna(india_state_population['S.No'])]
india_state_population=india_state_population[india_state_population['S.No']!="Total"]
india_state_population=india_state_population[pd.notna(india_state_population['Saturation %\r(LIVE)'])]
# Strip the "*" markers from the state names.
india_state_population['State']=india_state_population['State Name'].apply(lambda x:x.replace("*",""))
india_state_population.drop(['Numbers of  Aadhaar\rassigned (LIVE)','Saturation %\r(LIVE)','S.No','State Name'],axis=1,inplace=True)
india_state_population=india_state_population.rename(columns={'Total Population\r(Projected 2020)':'Population_2020'})
# Normalise state names so they match the other tables. The abbreviated "A & N Islands" has to
# be expanded before the generic "&" -> "and" replacement runs.
india_state_population['State']=india_state_population['State'].apply(lambda x:x.replace("A & N Islands","Andaman and Nicobar Islands"))
india_state_population['State']=india_state_population['State'].apply(lambda x:x.replace("&","and"))
india_state_population['State']=india_state_population['State'].apply(lambda x:x.replace("Jammu Kashmir",'Jammu and Kashmir'))
# Population arrives as text with thousands separators; convert it to int.
india_state_population['Population_2020']=india_state_population['Population_2020'].apply(lambda x:int(x.replace(",","")))
india_state_population.head()

In [None]:
### Getting State wise Area information
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_area"
# read_html returns every table on the page; the second one (index 1) is used here.
# NOTE(review): the positional index depends on the page layout - verify if the page changes.
dfs = pd.read_html(WIKI_URL, header=0)
india_state_area=dfs[1]
india_state_area=india_state_area.rename(columns={'State (S) / Union territory (UT)':"State",'Area (km2)':'Area_sqkm'})
# Keep only the part of the rank before any "(" annotation.
india_state_area['Rank']=india_state_area['Rank'].apply(lambda x:x.split("(")[0].strip())
# Rows without a Region are dropped - presumably total/summary rows; confirm against the page.
india_state_area=india_state_area[pd.notna(india_state_area['Region'])]
# Area arrives as text with thousands separators; convert it to int.
india_state_area['Area_sqkm']=india_state_area['Area_sqkm'].apply(lambda x:int(x.replace(",","")))
india_state_area=india_state_area.drop(['Ref','Political entities of comparable size (land mass)'],axis=1)
# Align the capital territory's name with the other tables.
india_state_area['State']=india_state_area['State'].apply(lambda x:x.replace("NCT Delhi","Delhi"))
india_state_area=india_state_area.rename(columns={'Rank':'Rank in Area'})
india_state_area=india_state_area.rename(columns={'National Share (%)':'Percent_National_Share_By_Area'})

india_state_area

In [None]:
# Outer joins keep every state/UT that appears in any of the three reference tables,
# even when one of the sources has no row for it.
india_area_population_gdp=(
    india_state_population
    .merge(india_state_area,on="State",how="outer")
    .merge(state_gdp,on="State",how="outer")
)
india_area_population_gdp.head()

There are UTs like Daman and Diu and Andaman and Nicobar Islands for which GDP data is not available. Let us map the Population, Area and GDP information to the COVID data for the Indian states.


In [None]:
### Reading the Google Mobility Data for India

def getCountryMobilityReport(country_code,mobility_report_path):
    '''
    Load and combine all Google mobility report CSVs for one country.

    Parameters:
        country_code: country code as it appears in the file names, e.g. "IN".
        mobility_report_path: folder holding the report files named like
            "<something>_<country_code>_<something>.csv". A trailing slash is optional.

    Returns:
        One DataFrame with the rows of every matching file, concatenated in sorted
        file-name order (the original per-file row indexes are kept).

    Raises:
        ValueError: if no file matches the country code in the given folder.
    '''
    pattern=os.path.join(mobility_report_path,"*_"+country_code+"_*.csv")
    # Sorting makes the row order independent of the file-system listing order.
    report_files=sorted(glob.glob(pattern))
    if not report_files:
        raise ValueError("No mobility report files found matching "+pattern)
    return pd.concat([pd.read_csv(report_file) for report_file in report_files])


# Load every India ("IN") mobility CSV from the Kaggle input folder and show the combined (rows, columns).
india_mobility_report=getCountryMobilityReport("IN",mobility_report_path="/kaggle/input/google-mobility-data/")
india_mobility_report.shape

In [None]:
india_mobility_report.head()

1. **For India as a country data, we need to get the data where country_region="India" and sub_region_1 and sub_region_2 are NULL**

2. **For state level, we need to get where country_region= "India" and sub_region_1=state_name.** 
Since we have two dataframes one for complete India COVID Cases and another for state level, let us extract the google mobility data at the same level.

3. **sub_region_2 is for district level and since we are not looking at district level, we can drop this column for this analysis**

In [None]:
# Country-level rows: country is India and no state (sub_region_1) is set.
india_mobility=india_mobility_report[(india_mobility_report['country_region']=="India") & (pd.isna(india_mobility_report['sub_region_1']))]
india_mobility.shape

In [None]:
# State-level rows: a state (sub_region_1) is set but no district (sub_region_2).
india_state_mobility=india_mobility_report[(india_mobility_report['country_region']=="India") & (pd.notna(india_mobility_report['sub_region_1'])) & (pd.isna(india_mobility_report['sub_region_2']))]
india_state_mobility.shape

In [None]:
# Keep the date, the region identifiers and every "percent_change_from_baseline" metric.
cols_to_include=['date','country_region','sub_region_1']+[col for col in india_mobility_report if "percent_change_from_baseline" in col]
india_mobility=india_mobility[cols_to_include]
india_state_mobility=india_state_mobility[cols_to_include]
# Country frame: sub_region_1 is always NaN here, so drop it; country_region becomes 'State' below.
india_mobility=india_mobility.drop(['sub_region_1'],axis=1)
# State frame: the country is constant, so drop it; sub_region_1 becomes 'State' below.
india_state_mobility=india_state_mobility.drop(['country_region'],axis=1)
# Both frames now share a 'State' column, the join key used by the COVID frames.
india_mobility=india_mobility.rename(columns={'country_region':'State'})
india_state_mobility=india_state_mobility.rename(columns={'sub_region_1':'State'})
india_state_mobility.head()


In [None]:
pd.isna(india_state_mobility).sum()

In [None]:
pd.isna(india_mobility).sum()

As per the Google Mobility Reports, there can be days where the percent change from baseline is missing - this may be because the data couldn't be captured without loss of anonymity, indicating a lower activity level. So, for our purposes, we will replace the NaNs in the Google Mobility Report with 0.

Also, there is no missing value at country level but missing data is seen at State Level Report

In [None]:
# Mobility metric columns are recognised by their shared name suffix.
activity_cols=[name for name in india_mobility_report.columns if "percent_change_from_baseline" in name]
# A missing state-level value is treated as "no change from baseline" (0).
for mobility_col in activity_cols:
    india_state_mobility[mobility_col]=india_state_mobility[mobility_col].fillna(0)




In [None]:
# Convert 'date' to datetime in a new 'Date' column, the name used as join key with the COVID frames.
india_mobility['Date']=pd.to_datetime(india_mobility['date'])
india_state_mobility['Date']=pd.to_datetime(india_state_mobility['date'])


### Merging all the Data

In [None]:
## india_statewise_covid with india area, population and gdp information
## india_statewise_covid with india area, population and gdp information
# Left join: every COVID row is kept; states missing from the reference table get NaN attributes.
india_statewise_data=pd.merge(india_statewise_data,india_area_population_gdp,on="State",how="left")
india_statewise_data.shape

In [None]:
set(india_statewise_data['State']) - set(india_state_mobility['State'])

1. **In the mobility data, Dadra and Nagar Haveli and Daman and Diu are captured as two separate regions - Dadra and Nagar Haveli, and Daman and Diu - and Ladakh is not captured separately. As these are smaller UTs and do not have a major impact, for now let us ignore them in our analysis.**

2. **The mobility data starts on 15 Feb 2020, while the COVID data starts on 30 Jan 2020 (first instance seen in Kerala). For our analysis, let us consider the data from 15 Feb 2020, as before this the case numbers were insignificant.**

3. **Also, the mobility data is updated weekly while COVID data is updated daily. So for a few days in the end, the mobility data may be NULL**

In [None]:
## Dropping Dates before Mobility Data was available
# The first date in the country-level mobility frame is the cut-off for both COVID frames.
min_mobility_date=min(india_mobility['Date'])
india_statewise_data =india_statewise_data[(india_statewise_data['Date'] >= min_mobility_date)]
covid_india_data=covid_india_data[(covid_india_data['Date'] >= min_mobility_date)]

In [None]:
## Dropping states for which mobility data was not available
# States present in the COVID data but absent from the state-level mobility frame are removed.
ignore_states=list(set(india_statewise_data['State']) - set(india_state_mobility['State']))
india_statewise_data=india_statewise_data[~india_statewise_data['State'].isin(ignore_states)]
india_statewise_data.shape

In [None]:
# Attach state-level mobility per (Date, State); a left join keeps COVID days without mobility rows (values become NaN).
india_statewise_data=pd.merge(india_statewise_data,india_state_mobility,on=['Date','State'],how="left")
india_statewise_data.shape

In [None]:
# Same join for the country-level frame; its 'State' column is the renamed country_region.
covid_india_data=pd.merge(covid_india_data,india_mobility,on=['Date','State'],how="left")
covid_india_data.shape

In [None]:
# Nationwide figures are hard-coded because the country frame is not merged with the
# per-state reference table. NOTE(review): presumably the projected 2020 population and the
# total area in sq km - verify both values against their sources.
covid_india_data['Population_2020']=1370508600
covid_india_data['Area_sqkm']=3287263

### Preprocessing the Data

1. **In the raw data, the Confirmed, Recovered and Deceased columns are cumulative.We need to get the cases confirmed, recovered or deceased for that particular day** -  To analyse the number of cases each day, we need to subtract total cases of today from total cases of the previous day. 

2. **To smoothen the curve, we need to calculate the rolling 7 day moving average for Confirmed, Recovered and Deceased**

3. **Get Active Number of Cases for Each Day - this is Total_Confirmed_Cases - Total_Recovered_Cases - Total_Deceased**

4. **Create a column Population Density on statewise data- this field can help us understand if spread of the virus is more in denser areas**

5. **To smoothen the mobility curve, we will also do a 7 day moving average**

6. **Create a column that measures the Recovery Rate and Death Rate** 

7. **Create a column for Doubling Rate**

8. **Mobility Data - Average across all outdoor place**




Both the calculations must be done for each state. 


In [None]:
def calculateTodayRate(data,rate_col):
    '''
    Derive per-day counts from cumulative columns.

    For every column in `rate_col` two columns are added to `data` (in place):
    'Prev_Day_<col>' holding the previous row's value and 'Today_<col>' holding the
    difference between the current and the previous row. The first row has no
    predecessor, so both new columns are NaN there.

    Parameters:
        data: frame sorted by date, containing the cumulative columns.
        rate_col: a column name or a list of column names (e.g. Confirmed, Recovered, Deceased).

    Returns:
        The same `data` object with the new columns added.
    '''
    # isinstance is required here: comparing type(...) with the string "str" is never true,
    # which made a single column name get iterated character by character.
    if isinstance(rate_col,str):
        rate_col=[rate_col]
    for col in rate_col:
        data['Prev_Day_'+col]=data[col].shift(1)
        data['Today_'+col]=data[col] - data['Prev_Day_'+col]

    return data

In [None]:
def getDoublingRate(data,window=7,confirmed_cases_col='Confirmed',date_col="Date"):
    '''
    Add the doubling time of cumulative confirmed cases, in days.

    Based on http://njcmindia.org/uploads/11-3_141-143.pdf:

        Doubling_Rate = window * ln(2) / ln((current + 1) / (prior + 1))

    where `prior` is the cumulative count `window` rows earlier. 1 is added to both counts
    so that a prior count of 0 does not cause a division by zero inside the logarithm.

    Parameters:
        data: frame sorted by date with one row per day.
        window: look-back distance in rows (days).
        confirmed_cases_col: name of the cumulative confirmed-cases column.
        date_col: name of the date column.

    Returns:
        A new frame with a fresh 0..n-1 index, `date_col` moved to the end, and the added
        columns '7DaysPrior_Cases' (the name is kept for any `window`),
        'log_Current_by_Prior' and 'Doubling_Rate'. Doubling_Rate is 0 where no prior value
        exists or where both counts are 0, and inf where the count did not change over the window.
    '''
    # Round-trip the date column through the index, as the original did: this drops the
    # incoming index and leaves the date as the last of the input columns.
    data=data.set_index(date_col,drop=True)
    data['7DaysPrior_Cases']=data[confirmed_cases_col].shift(window)
    data[date_col]=data.index
    data=data.reset_index(drop=True)

    current=data[confirmed_cases_col]
    prior=data['7DaysPrior_Cases']
    # Vectorised replacement for the two row-wise apply() passes. Division by a zero log
    # deliberately yields inf (no growth), so the numpy warnings are silenced.
    with np.errstate(divide='ignore',invalid='ignore'):
        log_ratio=np.log((current+1)/(prior+1))
        data['log_Current_by_Prior']=log_ratio.where(current!=0,0)
        data['Doubling_Rate']=window*(np.log(2)/data['log_Current_by_Prior'])
    data.loc[prior.isna(),'Doubling_Rate']=0
    data.loc[(current==0) & (prior==0),'Doubling_Rate']=0
    return data
    
    
    


In [None]:
def getRollingAverage(data,rate_col,window=7):
    '''
    Add trailing rolling-mean columns to `data` (in place).

    Each new column is named '<window>Day_Rolling_Average_<col>', with any "Today_" removed
    from the source column name. The first `window - 1` rows are NaN because the window is
    not yet full.

    Parameters:
        data: frame sorted by date.
        rate_col: a column name or a list of column names to smooth.
        window: rolling window length in rows (default 7).

    Returns:
        The same `data` object with the new columns added.
    '''
    # isinstance is required here: comparing type(...) with the string "str" is never true,
    # which made a single column name get iterated character by character.
    if isinstance(rate_col,str):
        rate_col=[rate_col]
    for col in rate_col:
        data[str(window)+"Day_Rolling_Average_"+col.replace("Today_","")]=data[col].rolling(window=window).mean()
    return data

In [None]:
def getActiveCases(data,confirmed_col="Confirmed",recovered_col="Recovered",deceased_col="Deceased"):
    """
    Add an 'Active_Cases' column (confirmed minus recovered minus deceased) to `data`.

    The frame is modified in place and also returned.
    """
    still_open=data[confirmed_col].sub(data[recovered_col])
    data['Active_Cases']=still_open.sub(data[deceased_col])
    return data

def getRecoveryDeathRate(data,recovered_col="Recovered",confirmed_col="Confirmed",deceased_col="Deceased"):
    """
    Add 'Recovery_Rate' and 'Death_Rate' to `data`, each as a fraction of confirmed cases.

    The frame is modified in place and also returned.
    """
    confirmed=data[confirmed_col]
    for rate_name,source_col in (('Recovery_Rate',recovered_col),('Death_Rate',deceased_col)):
        data[rate_name]=data[source_col].div(confirmed)
    return data

In [None]:
def getPopulationDensity(data,population_col,area_col):
    """Return population divided by area, row by row, as a Series (`data` is not modified)."""
    population=data[population_col]
    return population.div(data[area_col])

In [None]:
def CasesPerMillion(data,population_col,cases_col):
    """
    Add '<cases_col>_Per_Million' to `data`: cases per one million inhabitants.

    The frame is modified in place and also returned.
    """
    share_of_population=data[cases_col].div(data[population_col])
    data[f"{cases_col}_Per_Million"]=share_of_population.mul(1000000)
    return data


In [None]:
def getOutdoorMobility(data,outdoor_mobility_cols):
    """
    Add 'outdoor_percent_change_from_baseline': the row-wise mean of the given columns.

    The frame is modified in place and also returned.
    """
    selected=data.loc[:,outdoor_mobility_cols]
    data['outdoor_percent_change_from_baseline']=selected.mean(axis=1)
    return data
    

In [None]:
# Raw mobility metric columns, captured before any rolling-average columns are added below.
# NOTE(review): this presumably includes the residential metric as well, so the "outdoor"
# average computed from it would cover all mobility categories - confirm that is intended.
mobility_cols=[col for col in india_statewise_data if "percent_change_from_baseline" in col]


def _applyPerState(frame,func):
    '''
    Apply `func` to each State group and stitch the results back together.

    group_keys=False keeps the rows on their original index instead of adding a 'State'
    index level. Without it, pandas >= 2.0 adds that level even when `func` returns a frame
    with the group's own index, and the next groupby("State") then fails because 'State' is
    both a column and an index level.
    '''
    return frame.groupby("State",group_keys=False).apply(func)


## State level features
india_statewise_data=_applyPerState(india_statewise_data,lambda x:calculateTodayRate(x,['Confirmed','Recovered','Deceased']))
india_statewise_data=_applyPerState(india_statewise_data,lambda x:getRollingAverage(x,['Today_Confirmed','Today_Recovered','Today_Deceased']))
india_statewise_data=_applyPerState(india_statewise_data,lambda x:getActiveCases(x,"Confirmed","Recovered","Deceased"))
india_statewise_data=_applyPerState(india_statewise_data,lambda x:getRecoveryDeathRate(x))

india_statewise_data=getOutdoorMobility(india_statewise_data,mobility_cols)
india_statewise_data=_applyPerState(india_statewise_data,lambda x:getRollingAverage(x,mobility_cols+['outdoor_percent_change_from_baseline']))


india_statewise_data=CasesPerMillion(india_statewise_data,"Population_2020","Confirmed")
india_statewise_data=CasesPerMillion(india_statewise_data,"Population_2020","Recovered")
india_statewise_data=CasesPerMillion(india_statewise_data,"Population_2020","Deceased")

india_statewise_data['Population_Density']=getPopulationDensity(india_statewise_data,"Population_2020","Area_sqkm")

# getDoublingRate returns each group with a fresh 0..n-1 index, so the group key is kept here
# (default group_keys) and 'State' is rebuilt from index level 0 afterwards.
india_statewise_data=india_statewise_data.groupby("State").apply(lambda x:getDoublingRate(x))
india_statewise_data=india_statewise_data.drop(columns=['State'],errors='ignore')
india_statewise_data=india_statewise_data.reset_index(level=0).reset_index(drop=True)


## Country level features (a single "India" group)
covid_india_data=_applyPerState(covid_india_data,lambda x:calculateTodayRate(x,['Confirmed','Recovered','Deceased']))
covid_india_data=_applyPerState(covid_india_data,lambda x:getRollingAverage(x,['Today_Confirmed','Today_Recovered','Today_Deceased']))
covid_india_data=_applyPerState(covid_india_data,lambda x:getRecoveryDeathRate(x))


covid_india_data=getOutdoorMobility(covid_india_data,mobility_cols)

covid_india_data=_applyPerState(covid_india_data,lambda x:getRollingAverage(x,mobility_cols+['outdoor_percent_change_from_baseline']))
covid_india_data=getActiveCases(covid_india_data,"Confirmed","Recovered","Deceased")
covid_india_data=CasesPerMillion(covid_india_data,"Population_2020","Confirmed")
covid_india_data=CasesPerMillion(covid_india_data,"Population_2020","Recovered")
covid_india_data=CasesPerMillion(covid_india_data,"Population_2020","Deceased")


covid_india_data=getDoublingRate(covid_india_data)

india_statewise_data.head()

In [None]:

covid_india_data.head(20)

## Exploring the Data

### Rise of COVID in India 

In [None]:
def plotCovidTimeSeries(data,rate_col,date_col,color_pallete,plot_moving_average=True):
    """
    Plot daily counts as stacked bar subplots, one row per column in `rate_col`.

    Parameters:
        data: frame holding `date_col`, the `rate_col` columns and, when
            `plot_moving_average` is set, the matching '7Day_Rolling_Average_*' columns.
        rate_col: a column name or a list of column names (typically 'Today_*').
        date_col: name of the date column used for the x axis.
        color_pallete: one colour sequence per entry in `rate_col`; element 3 colours the
            bar outline and element 8 the rolling-average line.
        plot_moving_average: overlay the 7 day rolling average as a line.

    Returns:
        The plotly figure, with shared x axes and range-selector buttons.
    """
    # isinstance is required here: comparing type(...) with the string "str" is never true,
    # which made a single column name get iterated character by character.
    if isinstance(rate_col,str):
        rate_col=[rate_col]
    num_cols=len(rate_col)

    subplot_title=[col.replace("Today_","")+" Cases" for col in rate_col]
    fig = make_subplots(num_cols,1,shared_xaxes=True,subplot_titles=subplot_title)
    for row_id,col in enumerate(rate_col,start=1):
        palette=color_pallete[row_id-1]
        series_name=col.replace("New_","").replace("Today_","")+" Cases"
        fig.add_trace(go.Bar(x=data[date_col], y=data[col],
                     name=series_name,
                     opacity=0.5,
                     marker_line_color=palette[3],
                     marker_line_width=2),
              row = row_id, col = 1)
        ## Add line chart for the rolling average if plot_moving_average=True
        if plot_moving_average:
            rolling_avg_col=col.replace("Today_","7Day_Rolling_Average_")
            fig.add_trace(go.Scatter(x=data[date_col],y=data[rolling_avg_col],line=dict(color=palette[8]),name=series_name+" 7 Day Rolling Average"),row=row_id,col=1)

    fig.update_layout(height=1500,width=1000)
    fig.update_yaxes(showgrid=False)
    fig.update_xaxes(
        showgrid=False,
        rangeslider_visible=False,
        rangeselector=dict(
            buttons=list([
                dict(count=15,label="15days",step="day",stepmode="backward"),
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=2, label="2m", step="month", stepmode="backward"),
                dict(count=3, label="3m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=9, label="9m", step="month", stepmode="backward"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        )
    )
    return fig
    


In [None]:
# One subplot per 'Today_*' column, coloured blue / green / red in column order.
india_covid_fig=plotCovidTimeSeries(covid_india_data,[col for col in covid_india_data if col.startswith("Today_")],"Date",[px.colors.sequential.Blues,px.colors.sequential.Greens,px.colors.sequential.Reds])
india_covid_fig.update_layout(title="Rise of COVID - INDIA",title_x=0.5)

In [None]:
px.line(covid_india_data,x="Date",y=['Confirmed_Per_Million','Recovered_Per_Million'],color_discrete_sequence=[px.colors.sequential.Blues[6],px.colors.sequential.Greens[6],px.colors.sequential.Reds[6]],title="Confirmed and Recovered Cases Per Million- INDIA")

In [None]:
px.line(covid_india_data,x="Date",y=['Deceased_Per_Million'],color_discrete_sequence=[px.colors.sequential.Reds[6]],title="Deceased Cases Per Million- INDIA")

In [None]:
# The title now matches the plotted column (it was copy-pasted from the deceased-per-million plot).
px.line(covid_india_data,x="Date",y=['Doubling_Rate'],color_discrete_sequence=[px.colors.diverging.PuOr[10]],title="Doubling Rate - INDIA")

1. **As we can see in the above plot, around Sep 2020 India saw a peak in COVID cases and then there was a lull - the time we believed we had defeated COVID. But as March dawned, COVID started rising steeply - catching everyone unawares - leading to oxygen shortages and a lack of beds and life-saving drugs**

2. **In the second wave the number of daily cases is at least 4 times that of the peak during the first wave and is still increasing (as of 8 May 2021)**

3. **In the Second Wave, the Gap between Confirmed Cases Per million and Recovered Cases Per Million has increased.**


### Most Affected State - By Number of Cases


In [None]:

def getTopN(data,sort_col,n_top=10,is_top=True):
    """
    Return the `n_top` rows with the largest (or smallest) values in `sort_col`.

    Parameters:
        data: frame to select from.
        sort_col: column to rank by.
        n_top: number of rows to return (previously ignored: 10 rows were always returned).
        is_top: True selects the largest values, False the smallest.

    Returns:
        The selected rows, ordered ascending for the top rows and descending for the bottom
        rows, i.e. the most extreme value comes last.
    """
    selected=data.sort_values(by=sort_col,ascending=not is_top).head(n_top)
    return selected.sort_values(by=sort_col,ascending=is_top)

def plotTopN(data,sort_col,n_top=10,is_top=True,x_col="State",orientation="h",color_continuous_scale=None,text_font_color="black",text_format="%{text:.2s}"):
    """
    Draw a bar chart of the rows selected by getTopN, coloured and labelled by `sort_col`.

    Parameters:
        data, sort_col, n_top, is_top: forwarded to getTopN.
        x_col: category column (one bar per value).
        orientation: "h" puts the categories on the y axis; any other value puts them on the x axis.
        color_continuous_scale: plotly colour scale for the bars.
        text_font_color, text_format: colour and plotly text template of the bar labels.

    Returns:
        The plotly figure.
    """
    top_n_data=getTopN(data,sort_col,n_top,is_top)
    prefix="Top " if is_top else "Bottom "
    chart_title=prefix+str(n_top)+" "+ x_col+"s by "+sort_col.replace("_"," ")
    # Only the axis assignment depends on the orientation; everything else is shared.
    if orientation=="h":
        axes=dict(y=x_col,x=sort_col)
    else:
        axes=dict(x=x_col,y=sort_col)
    fig=px.bar(top_n_data,orientation=orientation,title=chart_title,color_continuous_scale=color_continuous_scale,color=sort_col,text=sort_col,**axes)
    fig.update_traces(textfont_color=text_font_color,textposition="outside",texttemplate=text_format)
    return fig
        

In [None]:
# Snapshot of every state on the most recent date in the data; cumulative columns on that day
# are the totals to date. Note that this reassigns max_date from the data-loading step.
max_date=max(india_statewise_data['Date'])
case_on_max_date=india_statewise_data[india_statewise_data['Date']==max_date]
#case_on_max_date['Recovery_Rate']=(case_on_max_date['Recovered']/case_on_max_date['Confirmed'])*100
#case_on_max_date['Death_Rate']=(case_on_max_date['Deceased']/case_on_max_date['Confirmed'])*100
plotTopN(case_on_max_date,"Confirmed",n_top=10,color_continuous_scale=px.colors.sequential.Blues)

In [None]:
plotTopN(case_on_max_date,"Active_Cases",n_top=10,color_continuous_scale=px.colors.sequential.Blues)

In [None]:
plotTopN(case_on_max_date,"Doubling_Rate",n_top=10,color_continuous_scale=px.colors.sequential.Aggrnyl)

In [None]:
plotTopN(case_on_max_date,"Doubling_Rate",n_top=10,is_top=False,color_continuous_scale=px.colors.sequential.Aggrnyl)

1. **Maharashtra leads in the total number of cases. The cases in Maharashtra are close to 5 million, almost 3 times that of the second highest state (Karnataka).**

2. **Uttar Pradesh, which is the most populated state in the country, has a much smaller number of cases**

3. **The number of Active Cases in Maharastra and Karnataka are almost the same**

4. **Karnataka has one of the lowest doubling rates at 35 days, while Maharashtra, though it has a lot of cases, takes 87 days to double its cases**

### Most Affected States - By Recovery Rate and Death Rate

In [None]:
# Requests the 20 states with the highest recovery rate; bar labels are formatted as percentages.
plotTopN(case_on_max_date,"Recovery_Rate",n_top=20,color_continuous_scale=px.colors.sequential.Greens,text_format="%{text:.1%}")


In [None]:
plotTopN(case_on_max_date,"Death_Rate",n_top=10,color_continuous_scale=px.colors.sequential.Reds,text_format="%{text:.1%}")


### Overall India what is the contribution of Different States in the Total Number of Deaths and Cases?

In [None]:
px.pie(case_on_max_date, values='Confirmed', names='State', title='Confirmed Cases across States',height=800,color_discrete_sequence=px.colors.sequential.Blues_r)

In [None]:
px.pie(case_on_max_date, values='Deceased', names='State', title='Death Cases across States',height=800,color_discrete_sequence=px.colors.sequential.Reds_r)

1. **Punjab has a very high death rate**
2. **Southern states like Karnataka and Kerala, though they have a very high number of confirmed cases, have a much lower recovery and death rate**
3. **Maharastra contributes to 21% of the total Cases, and 30% of Death Cases.**
In all the above graphs, we looked at the overall number of cases - but how does this change when we look at the number of cases per million people?



### Impact of Area of State on Cases Per Million

In [None]:
px.scatter(case_on_max_date,y="Confirmed_Per_Million",x="Area_sqkm",size="Deceased_Per_Million",color="State",title="Confirmed Cases Per Million vs Area - Size by Deceased Per Million")

1. **Compared to larger states, UT and states like Punjab,Kerala,Uttarakhand, J&K, have higher cases per million people.**

2. **For example, Kerala whose area is less then 50k sq km has most number of confirmed cases per million**

3. **States which have greater than 200k sqkm area except Maharastra - seem to have lower number of Confirmed Cases Per Million People**

4. **Between around 70k sqkm and less than 200k sqkm of area, there is a linear relationship between the number of Confirmed Cases Per Million and the area of the state**

5. **States like GOA,Delhi,Pudhucherry have smaller area and at same time high Death Rate per million**

6. **States that have higher area(beyound 200k sqkm) like UP,Madhya Pradesh, Rajasthan have very low death rate per million**

7. **Kerala is one state which has a high COnfirmed_Per_Million, but its Death rate is low**

8. **In States like Himachal Pradesh, Punjab, Uttarakhand, we can see that Confirmed_Per_Million is less than 30K, but Death Rate is higher**

9. **Kerala's Confirmed Cases Per Million is higher than that of Maharashtra, which has recorded the highest number of COVID cases - but Kerala also has one of the lowest death rates**

### How has Movement Had an impact on the COVID Cases?

#### Impact of Movement on India

One key measure that can help understand how movement impacted the rise of COVID is the doubling rate. 

In [None]:
covid_india_data.columns

In [None]:
def createMobilityPlot(data,date_col,primary_axis_cols,secondary_axis_cols,primary_yaxis_title,secondary_yaxis_title,primary_col_colors,plot_title):
    """
    Plot COVID metrics against mobility metrics on a shared date axis with two y axes.

    Parameters:
        data: frame holding `date_col` and every column named in the two column lists.
        date_col: name of the date column used for the x axis.
        primary_axis_cols: columns drawn against the left y axis.
        secondary_axis_cols: columns drawn against the right y axis; their legend names are
            cleaned up (rolling-average prefix removed, underscores replaced, title-cased).
        primary_yaxis_title, secondary_yaxis_title: axis titles (rendered bold).
        primary_col_colors: one line colour per entry in `primary_axis_cols`.
        plot_title: figure title.

    Returns:
        The plotly figure.
    """
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # go.Scatter replaces go.Line, which is a deprecated alias in plotly.graph_objects.
    for col in secondary_axis_cols:
        trace_name=col.replace("7Day_Rolling_Average_","").replace("percent_change_from_baseline","mobility").replace("_"," ").title()
        fig.add_trace(
            go.Scatter(x=data[date_col], y=data[col], name=trace_name),
            secondary_y=True,
        )

    for idx,col in enumerate(primary_axis_cols):
        fig.add_trace(
            go.Scatter(x=data[date_col], y=data[col], name=col.replace("_"," "),line=dict(color=primary_col_colors[idx])),
            secondary_y=False,
        )

    fig.update_layout(
        title_text=plot_title,
    )
    #Set x-axes title
    fig.update_xaxes(title_text="<b>Date")
    fig.update_yaxes(title_text="<b>"+primary_yaxis_title, secondary_y=False)
    fig.update_yaxes(title_text="<b>"+secondary_yaxis_title, secondary_y=True)
    return fig


    
    

In [None]:
# Doubling rate on the left axis against every 7 day rolling mobility metric on the right axis.
fig=createMobilityPlot(covid_india_data,date_col="Date",primary_axis_cols=['Doubling_Rate'],secondary_axis_cols=[col for col in covid_india_data if "7Day_Rolling_Average" in col and "percent_change_from_baseline" in col],primary_col_colors=[px.colors.diverging.PuOr[10]],primary_yaxis_title="Doubling Rate",secondary_yaxis_title="Mobility Percent Change (7 days rolling average)",plot_title="<b>Impact of Mobility on Doubling Rate Across India")

## Adding Range filter
# Quick-zoom buttons for the x axis, from the last 15 days up to the full range.
fig.update_xaxes(
    
    showgrid=False,
    rangeslider_visible=False,
    rangeselector=dict(
        buttons=list([
            dict(count=15,label="15days",step="day",stepmode="backward"),
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=2, label="2m", step="month", stepmode="backward"),
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=9, label="9m", step="month", stepmode="backward"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            
            dict(step="all")
        ])
    )
)



fig.show()

1. **The doubling rate was highest around Feb 21 - the time when India declared it had won over COVID - and from then on it has been a steep drop. Since May 2nd the doubling rate has begun to increase**
2. **From the graph, we can see that in beginning of April, there is a drop in the retail and recreation mobility - It is time when states like Karnataka imposed a rule of 50% seating only in movie theatres. This is the time where the drop in Doubling Rate slowed down**
3. **By May 8th number of people staying inside homes(Residential Mobility Change  increased by 24% compared to baseline) and the doubling rate started increasing**
4. **Drop in Grocery and Pharmacy Mobility has been the highest since April 6th**




### Impact of Lockdown By States

During the Second Wave States like Maharastra, Delhi, Karnataka, Kerala have imposed strict lockdowns . Let us see if the lockdowns had any impact on the cases

In [None]:
second_wave_start_date=np.datetime64('2021-03-01')
second_wave_start_date

In [None]:
def getStateData(data,state,start_date=None):
    """
    Return the rows of `data` for one state, optionally only from `start_date` onwards.

    Parameters:
        data: frame with 'State' and 'Date' columns.
        state: state name to match exactly against the 'State' column.
        start_date: inclusive lower bound on 'Date'; None keeps all dates.

    Returns:
        The filtered frame (original index values are kept).
    """
    state_data=data[data['State']==state]
    # Identity check: "!= None" would be evaluated by the date object's own comparison operator.
    if start_date is not None:
        state_data=state_data[state_data['Date']>=start_date]
    return state_data

#### Maharastra - Second Wave

Maharastra has been one of the most affected states during the Second Wave. 
This state imposed lockdown since April 14 2021 and the lockdown now has been extended till June 1st 2021. 

In [None]:
mh_data=getStateData(india_statewise_data,"Maharashtra",start_date=second_wave_start_date)
mh_data.head()

In [None]:
# The mobility column names are taken from covid_india_data; the plotted values come from mh_data.
fig=createMobilityPlot(mh_data,date_col="Date",primary_axis_cols=['Doubling_Rate'],secondary_axis_cols=[col for col in covid_india_data if "7Day_Rolling_Average" in col and "percent_change_from_baseline" in col],primary_col_colors=[px.colors.diverging.PuOr[10]],primary_yaxis_title="Doubling Rate",secondary_yaxis_title="Mobility Percent Change (7 days rolling average)",plot_title="<b>Impact of Mobility on Doubling Rate Across Maharastra")
fig.show()

1. **The Doubling Rate in Maharastra started to increase around April 14th - the day lockdown was announced**
2. **Around April 14th, the doubling rate was 40 days, as of May 18th it is 118 days.**
3. **Drop in Outdoor activities like Retail and Recreation, Parks, Workplaces can be observed since March 28th when night curfew was announced**
4. **From april 24th we can see around 70% less mobility than baseline and it is around this time that the Doubling rate started increasing**


In [None]:
# 7 day rolling new cases against the rolling mobility metrics for Maharashtra's second wave.
fig=createMobilityPlot(mh_data,date_col="Date",primary_axis_cols=['7Day_Rolling_Average_Confirmed'],secondary_axis_cols=[col for col in covid_india_data if "7Day_Rolling_Average" in col and "percent_change_from_baseline" in col],primary_col_colors=[px.colors.sequential.Blues_r[0]],primary_yaxis_title="7 Day Rolling New Cases",secondary_yaxis_title="Mobility Percent Change (7 days rolling average)",plot_title="<b>Impact of Mobility on  New Cases Across Maharastra</b> (Dashed line is where lockdown was imposed)")
# Dashed green marker on 14 April 2021, the lockdown date named in the plot title.
fig.add_vline(x="2021-04-14",line_width=3, line_dash="dash", line_color="green")



1. **After the lockdown there has been a significant drop in Outdoor Mobility**
