# COVID-19 in India

### Importing the modules

In [1]:
# importing the required modules
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Default figure size as (width, height) for every plot in this notebook
plt.rcParams['figure.figsize'] = (10, 12)

# Silence all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised
# by the libraries used below.
import warnings
warnings.filterwarnings(action='ignore')

## Scraping the dataset from the Ministry of Health and Family Welfare website


In [2]:
from datetime import datetime
import re
import requests 
from bs4 import BeautifulSoup

In [3]:
# Landing page of the Ministry of Health and Family Welfare site; the table
# scraped in the following cells is read from this page
link = 'https://www.mohfw.gov.in/'

# A timeout stops the cell from hanging indefinitely when the site is
# unreachable, and raise_for_status() fails fast on a 4xx/5xx response
# instead of letting an error page be parsed as if it were the data.
req = requests.get(link, timeout=30)
req.raise_for_status()

# Parse the response body with Python's built-in HTML parser
soup = BeautifulSoup(req.content, "html.parser")

In [4]:
# Use the last <thead> and the last <tbody> found on the page
# (presumably the state-wise table — confirm if the page layout changes).
thead = soup.find_all('thead')[-1]
tbody = soup.find_all('tbody')[-1]

# All <tr> rows of the header and of the body, respectively
head = thead.find_all('tr')
body = tbody.find_all('tr')

In [5]:
def _cell_texts(rows):
    """Return one list per <tr> in `rows`, holding the text of its <th>/<td> cells."""
    return [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in rows]


# Plain-text contents of the header rows and of the body rows
head_rows = _cell_texts(head)
body_rows = _cell_texts(body)

In [6]:
# The last 6 rows of the table body are excluded (presumably totals and
# footnotes rather than per-state records — confirm against the live page).
n_trailing_rows = 6
state_rows = body_rows[:len(body_rows) - n_trailing_rows]

# The first header row supplies the column names; the serial-number column
# is dropped straight away.
df_bs = pd.DataFrame(state_rows, columns=head_rows[0]).drop(columns='S. No.')

df_bs

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*
0,Andaman and Nicobar Islands,0,33,0,33
1,Andhra Pradesh,1654,2576,73,4303
2,Arunachal Pradesh,44,1,0,45
3,Assam,1651,498,4,2153
4,Bihar,2342,2225,29,4596
5,Chandigarh,77,222,5,304
6,Chhattisgarh,633,244,2,879
7,Dadar Nagar Haveli,13,1,0,14
8,Delhi,15311,10315,708,26334
9,Goa,131,65,0,196


## Cleaning the Data

In [7]:
# today's date, used to stamp every scraped row
now = datetime.now()
date_format = "%m/%d/%Y"

# Copy of the scraped frame with a 'Date' column holding today's date as a
# month/day/year string (the scraped frame `df_bs` itself is left untouched)
df_India = df_bs.assign(Date=now.strftime(date_format))

# Parse that string column into a proper datetime column
df_India['Date'] = pd.to_datetime(df_India['Date'], format=date_format)

df_India.head()

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*,Date
0,Andaman and Nicobar Islands,0,33,0,33,2020-06-07
1,Andhra Pradesh,1654,2576,73,4303,2020-06-07
2,Arunachal Pradesh,44,1,0,45,2020-06-07
3,Assam,1651,498,4,2153,2020-06-07
4,Bihar,2342,2225,29,4596,2020-06-07


In [8]:
# Remove the '#' characters (presumably footnote markers on the website) from
# the state-name column and from the deaths column.
# regex=False makes the literal match explicit: the default value of `regex`
# in Series.str.replace changed from True to False in pandas 2.0, so spelling
# it out keeps the behaviour identical on every pandas version.
for column in ('Name of State / UT', 'Deaths**'):
    df_India[column] = df_India[column].str.replace('#', '', regex=False)

In [9]:
# (latitude, longitude) pair for each state / UT, keyed by the state name
state_coordinates = {
    'Delhi': (28.7041, 77.1025),
    'Haryana': (29.0588, 76.0856),
    'Kerala': (10.8505, 76.2711),
    'Rajasthan': (27.0238, 74.2179),
    'Telengana': (18.1124, 79.0193),
    'Uttar Pradesh': (26.8467, 80.9462),
    'Ladakh': (34.2996, 78.2932),
    'Tamil Nadu': (11.1271, 78.6569),
    'Jammu and Kashmir': (33.7782, 76.5762),
    'Punjab': (31.1471, 75.3412),
    'Karnataka': (15.3173, 75.7139),
    'Maharashtra': (19.7515, 75.7139),
    'Andhra Pradesh': (15.9129, 79.7400),
    'Odisha': (20.9517, 85.0985),
    'Uttarakhand': (30.0668, 79.0193),
    'West Bengal': (22.9868, 87.8550),
    'Puducherry': (11.9416, 79.8083),
    'Chandigarh': (30.7333, 76.7794),
    'Chhattisgarh': (21.2787, 81.8661),
    'Gujarat': (22.2587, 71.1924),
    'Himachal Pradesh': (31.1048, 77.1734),
    'Madhya Pradesh': (22.9734, 78.6569),
    'Bihar': (25.0961, 85.3131),
    'Manipur': (24.6637, 93.9063),
    'Mizoram': (23.1645, 92.9376),
    'Goa': (15.2993, 74.1240),
    'Andaman and Nicobar Islands': (11.7401, 92.6586),
    'Assam': (26.2006, 92.9376),
    'Jharkhand': (23.6102, 85.2799),
    'Arunachal Pradesh': (28.2180, 94.7278),
    'Tripura': (23.9408, 91.9882),
    'Nagaland': (26.1584, 94.5624),
    'Meghalaya': (25.4670, 91.3662),
    'Dadar Nagar Haveli': (20.1809, 73.0169),
    'Sikkim': (27.5330, 88.5122),
}

# latitude of the states
lat = {state: coords[0] for state, coords in state_coordinates.items()}

# longitude of the states
long = {state: coords[1] for state, coords in state_coordinates.items()}

# Look up each row's coordinates from its state name; a name that is absent
# from a dictionary yields NaN in the corresponding new column.
state_names = df_India['Name of State / UT']
for column, lookup in (('Latitude', lat), ('Longitude', long)):
    df_India[column] = state_names.map(lookup)

df_India.head()

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,0,33,0,33,2020-06-07,11.7401,92.6586
1,Andhra Pradesh,1654,2576,73,4303,2020-06-07,15.9129,79.74
2,Arunachal Pradesh,44,1,0,45,2020-06-07,28.218,94.7278
3,Assam,1651,498,4,2153,2020-06-07,26.2006,92.9376
4,Bihar,2342,2225,29,4596,2020-06-07,25.0961,85.3131


In [10]:
# Replace the website's verbose column headers with short names
short_names = {
    'Name of State / UT': 'State/UT',
    'Active Cases*': 'Active',
    'Cured/Discharged/Migrated*': 'Cured',
    'Deaths**': 'Deaths',
    'Total Confirmed cases*': 'Confirmed',
}
df_India = df_India.rename(columns=short_names)

df_India.head()

Unnamed: 0,State/UT,Active,Cured,Deaths,Confirmed,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,0,33,0,33,2020-06-07,11.7401,92.6586
1,Andhra Pradesh,1654,2576,73,4303,2020-06-07,15.9129,79.74
2,Arunachal Pradesh,44,1,0,45,2020-06-07,28.218,94.7278
3,Assam,1651,498,4,2153,2020-06-07,26.2006,92.9376
4,Bihar,2342,2225,29,4596,2020-06-07,25.0961,85.3131


In [11]:
# Distinct state / UT names in order of first appearance — a quick visual
# check for misspelled or duplicated names
df_India['State/UT'].unique()

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadar Nagar Haveli', 'Delhi', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand',
       'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra',
       'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha',
       'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu',
       'Telengana', 'Tripura', 'Uttarakhand', 'Uttar Pradesh',
       'West Bengal'], dtype=object)

In [12]:
# Number of missing values per column. A non-zero Latitude/Longitude count
# would mean a state name was not found in the `lat` / `long` dictionaries.
df_India.isna().sum()

State/UT     0
Active       0
Cured        0
Deaths       0
Confirmed    0
Date         0
Latitude     0
Longitude    0
dtype: int64

In [13]:
# Number of distinct values per column
df_India.nunique()

State/UT     35
Active       35
Cured        32
Deaths       24
Confirmed    34
Date          1
Latitude     35
Longitude    31
dtype: int64

In [14]:
# Ensure the 'Date' column has datetime dtype.
# NOTE(review): the column is already parsed with pd.to_datetime when it is
# first created, so this looks like a no-op safeguard.
df_India = df_India.assign(Date=pd.to_datetime(df_India['Date']))

df_India.head()

Unnamed: 0,State/UT,Active,Cured,Deaths,Confirmed,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,0,33,0,33,2020-06-07,11.7401,92.6586
1,Andhra Pradesh,1654,2576,73,4303,2020-06-07,15.9129,79.74
2,Arunachal Pradesh,44,1,0,45,2020-06-07,28.218,94.7278
3,Assam,1651,498,4,2153,2020-06-07,26.2006,92.9376
4,Bihar,2342,2225,29,4596,2020-06-07,25.0961,85.3131


In [15]:
# Normalise alternative spellings of two state / UT names.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df_India['State/UT']`: that chained in-place
# call operates on a temporary Series under pandas Copy-on-Write and would
# leave the frame unchanged.
# NOTE(review): this runs after Latitude/Longitude were mapped from the state
# name, so a row corrected here would still carry NaN coordinates.
df_India['State/UT'] = df_India['State/UT'].replace({'Chattisgarh': 'Chhattisgarh',
                                                     'Pondicherry': 'Puducherry'})

In [16]:
# The scraped cell values are text, so convert the count columns to numbers.
# Latitude/Longitude are passed through pd.to_numeric as well.
numeric_columns = ['Active', 'Cured', 'Deaths', 'Confirmed', 'Latitude', 'Longitude']
df_India[numeric_columns] = df_India[numeric_columns].apply(pd.to_numeric)

In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df_India.describe()

Unnamed: 0,Active,Cured,Deaths,Confirmed,Latitude,Longitude
count,35.0,35.0,35.0,35.0,35.0,35.0
mean,3078.571429,3259.228571,189.771429,6527.571429,23.357097,82.070229
std,7571.401826,6747.666416,521.575052,14632.261391,6.463139,7.255625
min,0.0,0.0,0.0,3.0,10.8505,71.1924
25%,84.0,44.5,0.0,115.5,19.9662,76.42365
50%,860.0,498.0,8.0,1699.0,23.9408,79.0193
75%,2424.5,2400.5,93.0,4715.5,27.8755,88.1836
max,42224.0,35156.0,2849.0,80229.0,34.2996,94.7278


### Final dataframe

In [18]:
# Display the cleaned frame
df_India

Unnamed: 0,State/UT,Active,Cured,Deaths,Confirmed,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,0,33,0,33,2020-06-07,11.7401,92.6586
1,Andhra Pradesh,1654,2576,73,4303,2020-06-07,15.9129,79.74
2,Arunachal Pradesh,44,1,0,45,2020-06-07,28.218,94.7278
3,Assam,1651,498,4,2153,2020-06-07,26.2006,92.9376
4,Bihar,2342,2225,29,4596,2020-06-07,25.0961,85.3131
5,Chandigarh,77,222,5,304,2020-06-07,30.7333,76.7794
6,Chhattisgarh,633,244,2,879,2020-06-07,21.2787,81.8661
7,Dadar Nagar Haveli,13,1,0,14,2020-06-07,20.1809,73.0169
8,Delhi,15311,10315,708,26334,2020-06-07,28.7041,77.1025
9,Goa,131,65,0,196,2020-06-07,15.2993,74.124


In [19]:
# Column dtypes and non-null counts of the cleaned frame
df_India.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 8 columns):
State/UT     35 non-null object
Active       35 non-null int64
Cured        35 non-null int64
Deaths       35 non-null int64
Confirmed    35 non-null int64
Date         35 non-null datetime64[ns]
Latitude     35 non-null float64
Longitude    35 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(4), object(1)
memory usage: 2.3+ KB


In [20]:
# Work on a copy for the analysis: columns added to `df` below do not
# affect the cleaned frame `df_India`
df = df_India.copy()

## Analysing COVID-19 Cases in India

In [21]:
# Country-wide total: sum of the per-state confirmed counts
confirmed_by_state = df['Confirmed']
total_cases = confirmed_by_state.sum()
print('Total confirmed cases of COVID 19 across India till date are:', total_cases)

Total confirmed cases of COVID 19 across India till date are: 228465


In [22]:
# Colour-coded view of the case counts: the date and coordinate columns are
# dropped first so that only the state name and the counts remain.
df_temp = df.drop(columns=['Latitude', 'Longitude', 'Date'])
df_temp.style.background_gradient(cmap='Set3')

Unnamed: 0,State/UT,Active,Cured,Deaths,Confirmed
0,Andaman and Nicobar Islands,0,33,0,33
1,Andhra Pradesh,1654,2576,73,4303
2,Arunachal Pradesh,44,1,0,45
3,Assam,1651,498,4,2153
4,Bihar,2342,2225,29,4596
5,Chandigarh,77,222,5,304
6,Chhattisgarh,633,244,2,879
7,Dadar Nagar Haveli,13,1,0,14
8,Delhi,15311,10315,708,26334
9,Goa,131,65,0,196


In [23]:
# Country-wide totals, each the sum of a per-state column
total_cured = df['Cured'].sum()
total_cases = df['Confirmed'].sum()
total_death = df['Deaths'].sum()

# Active = confirmed cases that are neither cured nor dead
total_active = total_cases - (total_cured + total_death)

# Report the four figures in a fixed order
summary_lines = [
    ("Total number of people cured as of today are:", total_cured),
    ("Total number of people found Corona +ve as of today are:", total_cases),
    ("Total deaths due to Corona as of today are:", total_death),
    ("Total active cases as of today are:", total_active),
]
for label, value in summary_lines:
    print(label, value)

Total number of people cured as of today are: 114073
Total number of people found Corona +ve as of today are: 228465
Total deaths due to Corona as of today are: 6642
Total active cases as of today are: 107750


In [24]:
# Active cases per state = confirmed - (deaths + cured)
closed_cases = df['Deaths'] + df['Cured']
df['Total Active'] = df['Confirmed'] - closed_cases

total_active = df['Total Active'].sum()
print('Total number of active cases across India are:', total_active)

# Active cases per state / UT, largest first, as a one-column frame
active_by_state = df.groupby('State/UT')['Total Active'].sum()
Tot_Cases = active_by_state.sort_values(ascending=False).to_frame()

Tot_Cases.style.background_gradient(cmap='coolwarm')

Total number of active cases across India are: 107750


Unnamed: 0_level_0,Total Active
State/UT,Unnamed: 1_level_1
Maharashtra,42224
Delhi,15311
Tamil Nadu,12700
Gujarat,4901
West Bengal,4025
Uttar Pradesh,3828
Karnataka,3090
Madhya Pradesh,2734
Rajasthan,2507
Bihar,2342


In [25]:
import numpy as np

# Per-state maximum of each count column.
# The columns are selected with a list: selecting them with a bare tuple,
# i.e. groupby(...)['Confirmed', 'Deaths', 'Cured'], was deprecated in
# pandas 1.0 and removed in pandas 2.0.
count_columns = ['Confirmed', 'Deaths', 'Cured']
state_cases = df_India.groupby('State/UT')[count_columns].max().reset_index()

# Active = confirmed cases that are neither dead nor cured
state_cases['Active'] = state_cases['Confirmed'] - (state_cases['Deaths'] + state_cases['Cured'])

# Deaths and recoveries as a percentage of confirmed cases, to 2 decimals
state_cases["Death Rate (per 100)"] = np.round(100 * state_cases["Deaths"] / state_cases["Confirmed"], 2)
state_cases["Cure Rate (per 100)"] = np.round(100 * state_cases["Cured"] / state_cases["Confirmed"], 2)

# Columns to shade. A single background_gradient call is enough: with the
# default axis=0 every column in `subset` gets its own colour scale.
shaded_columns = count_columns + ['Active', "Death Rate (per 100)", "Cure Rate (per 100)"]

# fillna(0) replaces NaN rates (0 / 0 when a state has no confirmed cases)
ranked = state_cases.sort_values('Confirmed', ascending=False).fillna(0)
ranked.style.background_gradient(cmap='gist_ncar_r', subset=shaded_columns)

Unnamed: 0,State/UT,Confirmed,Deaths,Cured,Active,Death Rate (per 100),Cure Rate (per 100)
19,Maharashtra,80229,2849,35156,42224,3.55,43.82
29,Tamil Nadu,28694,232,15762,12700,0.81,54.93
8,Delhi,26334,708,10315,15311,2.69,39.17
10,Gujarat,19094,1190,13003,4901,6.23,68.1
27,Rajasthan,10084,218,7359,2507,2.16,72.98
32,Uttar Pradesh,9733,257,5648,3828,2.64,58.03
18,Madhya Pradesh,8996,384,5878,2734,4.27,65.34
34,West Bengal,7303,366,2912,4025,5.01,39.87
15,Karnataka,4835,57,1688,3090,1.18,34.91
4,Bihar,4596,29,2225,2342,0.63,48.41
