In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import sklearn as sk
color = sns.color_palette()
%matplotlib inline

import plotly.offline as py
from plotly import tools
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import os
os.getcwd()
from janitor import groupby_agg

# OBJECTIVE:
## Comparitive Analysis  Covid-19  of US and India 
### Through This notebook we will learn and infer how covid-19 pandemic has swept across US and India, the two Most affected Countries in world.
### As I am working on this comparision COVID-19 has taken a deadly 2nd wave in India .
### By analysing this data we will try to figure out with statistical reasoning whether how it might behave in India WRT to the United States by pointing out Both the similarities and differences between the two.

  * This Notebook also works as baseline of me publishing my first kaggle notebook and help me revising after long gap due to health issues. Any constructive criticsims and recommendations on How I can improve my skills is deeply appreciated.

In [2]:
#Read the data
Ind_df=pd.read_csv('https://api.covid19india.org/csv/latest/state_wise_daily.csv')
US_df=pd.read_csv('https://api.covidtracking.com/v1/us/daily.csv')

In [3]:
statelist=list(Ind_df)
del statelist[:4]
statelist
Ind_df['Country Total']=Ind_df[statelist].sum(axis=1)
# Creating a State List

In [4]:
Ind_df['Date']=pd.to_datetime(Ind_df['Date'],infer_datetime_format=True) #converting to datetime

In [5]:
Ind_df_Confirmed=Ind_df.dropna()
Ind_df_Confirmed.drop([x for x in statelist], axis=1, inplace=True)
Ind_df_Confirmed.drop(['TT','Date_YMD'], axis=1, inplace=True) # Keep Only the Total Value of all confirmed cases


In [6]:
Ind_df_Converted = pd.pivot_table(Ind_df_Confirmed, values='Country Total', index=['Date'],
                    columns=['Status'])# pivoting the table to conversion
# Ind_df_Converted['Date'] = Ind_df_Converted.index
Ind_df_Converted=Ind_df_Converted.reset_index()


In [7]:
Ind_df_Converted.head()

Status,Date,Confirmed,Deceased,Recovered
0,2020-03-14,81,2,9
1,2020-03-15,27,0,4
2,2020-03-16,15,0,1
3,2020-03-17,11,1,1
4,2020-03-18,37,0,0


In [8]:
# Creating graphical representation of daily count of covid cases in India
for Cols in Ind_df_Converted.columns:
    if Cols!='Date':
        fig = px.bar(Ind_df_Converted, x="Date", y=Cols)
        layout = go.Layout(
            title=go.layout.Title(
            text="Daily count of COVID-19 "+ Cols + " cases in India",
            x=0.5
            ),
            font=dict(size=14),
            width=1040,
            height=600,
            xaxis_title = "Date of observation",
            yaxis_title = "Number of "+ Cols + " cases"
            )
        fig.update_layout(layout)
        fig.show()

# INFERENCES AND FURTHER OBJECTIVES
WE CAN CLEARLY SEE A SHARP INCREASE IN COVID 19 CASES IN RECENT DAYS. WE WILL INFER MORE AT LATTER STAGE .
-
Now we will reorganise US Covid Data To Properly Compare India's Covid Feature Set 
Source: https://covidtracking.com/data/api
CSV format
/v1/us/daily.csv

Fleids in Dataset Relevant to Us:
## date
Field type:integer
Date

Date on which data was collected by The COVID Tracking Project.

## deathIncrease AKA Daily Deceased as shown in Indian dataset
Field type:integer
New deaths

Daily increase in death, calculated from the previous day’s value.

Returns
null
if no data is available

## positiveIncrease AKA Daily Confirmed as shown in Indian dataset
Field type:integer
New cases

The daily increase in API field positive, which measures Cases (confirmed plus probable) calculated based on the previous day’s value.

Returns
null
if no data is available

## recovered AKA Daily Recovery as Show in Indian dataset
Field type:integer
Recovered

Total number of people that are identified as recovered from COVID-19. States provide very disparate definitions on what constitutes a “recovered” COVID-19 case. Types of “recovered” cases include those who are discharged from hospitals, released from isolation after meeting CDC guidance on symptoms cessation, or those who have not been identified as fatalities after a number of days (30 or more) post disease onset. Specifics vary for each state or territory.

Returns
null
if no data is available

In [9]:
US_df.isna().sum() #Checking NaN values 

date                          0
states                        0
positive                      1
negative                     48
pending                      51
hospitalizedCurrently        64
hospitalizedCumulative       51
inIcuCurrently               73
inIcuCumulative              72
onVentilatorCurrently        72
onVentilatorCumulative       79
dateChecked                   0
death                        28
hospitalized                 51
totalTestResults              0
lastModified                  0
recovered                   420
total                         0
posNeg                        0
deathIncrease                 0
hospitalizedIncrease          0
negativeIncrease              0
positiveIncrease              0
totalTestResultsIncrease      0
hash                          0
dtype: int64

In [10]:
US_df.head()
US_df.columns

Index(['date', 'states', 'positive', 'negative', 'pending',
       'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently',
       'inIcuCumulative', 'onVentilatorCurrently', 'onVentilatorCumulative',
       'dateChecked', 'death', 'hospitalized', 'totalTestResults',
       'lastModified', 'recovered', 'total', 'posNeg', 'deathIncrease',
       'hospitalizedIncrease', 'negativeIncrease', 'positiveIncrease',
       'totalTestResultsIncrease', 'hash'],
      dtype='object')

In [11]:
US_df.drop([ 'states', 'positive', 'negative', 'pending','hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently',
       'inIcuCumulative', 'onVentilatorCurrently', 'onVentilatorCumulative',
       'dateChecked', 'death', 'hospitalized', 'totalTestResults',
       'lastModified', 'total', 'posNeg',
       'hospitalizedIncrease', 'negativeIncrease',
       'totalTestResultsIncrease', 'hash'], axis=1, inplace=True) # Keep Only the Total Value of all confirmed cases

In [12]:
US_df.head()

Unnamed: 0,date,recovered,deathIncrease,positiveIncrease
0,20210307,,842,41835
1,20210306,,1680,60015
2,20210305,,2221,68787
3,20210304,,1743,65487
4,20210303,,2449,66836


Since Recovered has no Values in the dataset we will drop Both in US and India Datasets For Now 

In [13]:
US_df.drop(['recovered'], axis=1, inplace=True)
Ind_df_Converted.drop(['Recovered'], axis=1,inplace=True)

In [14]:
print (Ind_df_Converted.head())
print(US_df.head())

Status       Date  Confirmed  Deceased
0      2020-03-14         81         2
1      2020-03-15         27         0
2      2020-03-16         15         0
3      2020-03-17         11         1
4      2020-03-18         37         0
       date  deathIncrease  positiveIncrease
0  20210307            842             41835
1  20210306           1680             60015
2  20210305           2221             68787
3  20210304           1743             65487
4  20210303           2449             66836


### Now Lets Make US Dataset Similar to India in date order and Column order with same names

In [15]:
US_df_new=US_df.loc[::-1].reset_index(drop = True) # reversing date order
US_df_new.head()

Unnamed: 0,date,deathIncrease,positiveIncrease
0,20200113,0,0
1,20200114,0,0
2,20200115,0,0
3,20200116,0,0
4,20200117,0,0


In [16]:
USD=US_df_new.iloc[:, :0:-1]
USA=US_df_new.iloc[:, :1:1]
US=pd.concat([USA, USD],axis=1) # Rearranging Columns and Assigning new dataset variable name

In [17]:
US.columns # checking Column Names in Both Data Sets 

Index(['date', 'positiveIncrease', 'deathIncrease'], dtype='object')

In [18]:
Ind_df_Converted.columns

Index(['Date', 'Confirmed', 'Deceased'], dtype='object', name='Status')

In [19]:
US.rename(columns={'date': 'Date', 'positiveIncrease': 'Confirmed','deathIncrease': 'Deceased'}, inplace=True)

In [20]:
US['Date']=pd.to_datetime(US['Date'],format=('%Y%m%d')) # Date to date time Conversion  P.S when is Doubt refer to https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

In [21]:
India=Ind_df_Converted # taking new Variable Indian Data Set

# Creating Visualisation For Analysis

In [22]:
# Creating graphical representation of daily count of covid cases in US
for Cols in US.columns:
    if Cols!='Date':
        fig = px.bar(US, x="Date", y=Cols)
        layout = go.Layout(
            title=go.layout.Title(
            text="Daily count of COVID-19 "+ Cols + " cases in US",
            x=0.5
            ),
            font=dict(size=14),
            width=1040,
            height=600,
            xaxis_title = "Date of observation",
            yaxis_title = "Number of "+ Cols + " cases"
            )
        fig.update_layout(layout)
        fig.show()

In [23]:
# Creating graphical representation of daily count of covid cases in India
for Cols in India.columns:
    if Cols!='Date':
        fig = px.bar(India, x="Date", y=Cols)
        layout = go.Layout(
            title=go.layout.Title(
            text="Daily count of COVID-19 "+ Cols + " cases in India",
            x=0.5
            ),
            font=dict(size=14),
            width=1040,
            height=600,
            xaxis_title = "Date of observation",
            yaxis_title = "Number of "+ Cols + " cases"
            )
        fig.update_layout(layout)
        fig.show()

# Inference
- As we can see by the above graphs US had till now Three Distinct Waves in Confirmed Covid 19 cases namely during april-may 2020 , 2nd wave in july-august 2020 , 3rd wave recenty dec20-feb 2021
- We can also note that India had 1st Wave last time during sep 2020.
- We can also Understand while India had low number of covid cases in comparision to US even though US has a Population less than 1/3rd of India.
- Now Lets Understand all the different Waves in each Country.
    * The first wave in US is maybe due to Delayed Lockdown In US. We can see that the Cases reported were nearly 1000+ by march 16th in US when state of Emergency was issued in final states but Full restrictions might have taken upto April 2020 by which reported case had increased to nearly 30K (Source:https://en.wikipedia.org/wiki/U.S._state_and_local_government_responses_to_the_COVID-19_pandemic)
    * The Contrast that with India decisive March 24 th Full Scale Lockdown when reported cases was within hundreds or Less.This slow the spread of the disease.(Source :https://cfo.economictimes.indiatimes.com/news/pm-narendra-modi-announces-lockdown-of-india-for-21-days/74797779)
    * But something is amiss why did it increase during September in India even though there was a Lockdown event .
    * See even community spread was there the hard Lockdown event had slowed the daily infection rate to a few thousands within 10k which is high  considering pure numbers But Low considering total population of India (1.4 bn).
    * Then Started a series of Unlock restrictions . Starting June :Unlock 1.0 (1–30 June),Unlock 2.0 (1–31 July),Unlock 3.0 (1–31 August),Unlock 4.0 (1–30 September),Unlock 5.0 (1–31 October)and finally Unlock 6.0 (1–30 November)(Sources:https://en.wikipedia.org/wiki/COVID-19_lockdown_in_India#Unlock_1.0_(1%E2%80%9330_June))
    * Now as lock down restrictions started the numbers began to rise peaking during September 2020 But than falling steadly to pre Unlock levels by December. This article(https://timesofindia.indiatimes.com/india/why-are-coronavirus-cases-falling-in-india/articleshow/80705782.cms) in Times of India had said :`Scientists around the world have proposed numerous theories to explain this unprecedented fall in Covid-19 cases in India. However, none has been able to give a definite answer. Experts said that without conclusive data, it was impossible to say why India's figures had fallen so dramatically.` 
    * So we can just Speculate why it decreased . But I can say that till then , it was just that in my opinion by just looking at the data and graph and news articles of US and India,that we had temporarily slowed the Covid-19 and the effects of lockdown  created way forward to lower transmission or infection rates because of alert audience till then . 
    * However , it was fun while it lasted. Because all that changed in recent days with relaxed and WHATEVER attitude (https://timesofindia.indiatimes.com/india/half-of-india-isnt-wearing-masks-and-the-other-half-is-wearing-them-wrong/articleshow/81983995.cms#:~:text=But%20that's%20not%20what%20the,aren't%20doing%20it%20properly.) by both the govt and citizens even though health experts and Health COVID Handle repeated stated we wont be able to dip this spread till we are all vaccinated by at 60 % of the population. It started Spiral out of control by April 2021. Today as we speak the daily infection breached 350 K which is still not as high in absolute per population terms in comparision to US or Brazil but is alarming to say the least as India's Per Capita wealth a meagre USD 6,700 (2019 est.) compared to USD 62,530 (2019 est.) in US(https://www.cia.gov/the-world-factbook/field/real-gdp-per-capita/)
    * Now Lets Take a Look back Second Wave and Third wave ,what happened in US Preceding to that:
        -Note that Unlike India in initial lockdown, US had Local State Authority do lockdowns or restrictions as per their discretion . Also Politics in USA played a Significant Part is That. As there was a greater political divide between citizens . This created  chaos.
        -In June before Resurgence Occurred a lot of states had eased some or more restrictions or were going to reopen which further added to the increase quite posibly..(https://www.reuters.com/article/us-health-coronavirus-usa/reopenings-stall-as-u-s-records-nearly-50000-cases-of-covid-19-in-single-day-idUSKBN2426LN) 
         - Also the Second wave could be a By Product of Close quarter Nationwide Protests of Unarmed Black Man : George Floyd Murder that start in May 26 .(https://www.theguardian.com/world/2020/may/31/fears-grow-of-surge-in-us-coronavirus-cases-from-george-floyd-protests). Even Masks were used Protest didnot had social distancing as repeated warned by Dr Fauci:(https://thehill.com/homenews/administration/502001-fauci-underscores-concerns-about-protests-spreading-coronavirus)
         - Election Campaigns and rallys also attributed to this significant resurgence. For example after 30-day period 2020 Trump Tulsa rally ,the rate of new COVID-19 cases in Oklahoma more than tripled, to 513 cases per day.
         -2nd wave decreased some what, however holding steady and increased exponentially by NOV 2020 .This can be attributed to US elections.
    * Finally Lets Talk about the fall in cases in United States :
      -Till Today 140 Million(49%)  were administered at least one dose and Nearly 95+(29.2) million have fully Vaccinated(two Dose or single dose of Johnson and Johnson)(As per Google Data).(https://www.google.com/search?q=vaccine+tracker+covid&rlz=1C1CHBF_enIN856IN856&oq=vaccine&aqs=chrome.2.69i59l5j69i60l3.4471j0j4&sourceid=chrome&ie=UTF-8)
      -That means Vaccine has somewhat decreased the spread although it is far From over.
     * so What about India well India Has Administered 140 MILLION Total doses with 12 million peole getting 1st dose and 2.38 million fully vaccinated. While this is impressive number again by sheer volume it is less than 2% percent by total Population of India which is Alarming. 
     * At Current rate `At current rate of 2.2mn doses per day, India can only cover 30% population by end-2021`
https://www.firstpost.com/india/covid-19-vaccine-data-analysis-at-current-rate-of-2-2mn-doses-per-day-india-can-only-cover-30-population-by-end-2021-9562811.html. 
     * Although the Current rate will increase by June because both Production in The world largest Vaccine Producer(https://www.moneycontrol.com/news/business/indias-reputation-as-the-worlds-vaccine-hub-was-surprisingly-under-the-wraps-6335621.html) which ironically is India , has Vowed to Increased by Multifold as critical raw materials those were with held by US were just released.(https://www.cnbctv18.com/healthcare/us-president-biden-extends-india-help-in-covid-19-fight-but-after-criticism-9069131.htm) and because the government has allowed vaccination for all by registration starting 1st May 2021 it will still be a long time.
     
     
# Now lets Add a New variable CFR=Case Fatality Rate to the Mix.
 As per this artcle: https://science.thewire.in/the-sciences/covid-19-pandemic-case-fatality-rate-calculation/
 The three estimates of CFR 
 i. ‘First’ way to measure: D(May 4)/C(May 4) = 3.42%

ii. The research article’s way: D(May 4)/[D(May 4) + R(May 4)] = 11.01%

iii. The alternative way: D(May 4)/C(April 27) = 5.14%

We will take the first one as it more simpler and total approach for a broader understanding.
Although do note:
`the actual CFR of COVID-19 could be lower. To calculate the CFR, we need to know how many people were infected – and this figure hasn’t been easy to pin down with the novel coronavirus. We already know that there are many people with mild symptoms as well as no symptoms at all.`(https://science.thewire.in/the-sciences/covid-19-pandemic-case-fatality-rate-calculation/)

There are Other Variables that we will study later after adding and analysing the graph

In [24]:
CFR = (India["Deceased"] / India["Confirmed"])*100
India["CFR"] = CFR
India.head()

Status,Date,Confirmed,Deceased,CFR
0,2020-03-14,81,2,2.469136
1,2020-03-15,27,0,0.0
2,2020-03-16,15,0,0.0
3,2020-03-17,11,1,9.090909
4,2020-03-18,37,0,0.0


In [25]:
USCFR = (US["Deceased"] / US["Confirmed"])*100
US["CFR"] = USCFR
US.CFR=US.CFR.fillna(value=0)
US.head()

Unnamed: 0,Date,Confirmed,Deceased,CFR
0,2020-01-13,0,0,0.0
1,2020-01-14,0,0,0.0
2,2020-01-15,0,0,0.0
3,2020-01-16,0,0,0.0
4,2020-01-17,0,0,0.0


In [26]:
# Creating graphical representation of daily count of covid cases vs CFR in India
for Cols in India.columns:
    if Cols!='Date':
        if Cols!='Deceased':
            fig = px.bar(India, x="Date", y=Cols)
            layout = go.Layout(
                title=go.layout.Title(
                text="Daily count of COVID-19 "+ Cols + " cases in India",
                x=0.5
                ),
                font=dict(size=14),
                width=1040,
                height=600,
                xaxis_title = "Date of observation",
                yaxis_title = "Number of "+ Cols + " cases"
                )
            fig.update_layout(layout)
            fig.show()

In [27]:
# Creating graphical representation of daily count of covid cases vs CFR in India
for Cols in US.columns:
    if Cols!='Date':
        if Cols!='Deceased':
            fig = px.bar(US, x="Date", y=Cols)
            layout = go.Layout(
                title=go.layout.Title(
                text="Daily count of COVID-19 "+ Cols + " cases in India",
                x=0.5
                ),
                font=dict(size=14),
                width=1040,
                height=600,
                xaxis_title = "Date of observation",
                yaxis_title = "Number of "+ Cols + " cases"
                )
            fig.update_layout(layout)
            fig.show()