In [1]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import pandas as pd
from bs4 import BeautifulSoup

# Web Scraping

In [2]:
# web scraping
# ------------
# Download the page at `link` and turn the first <table> on it into a
# DataFrame named `df_bs`.

link = 'https://www.mohfw.gov.in/'
# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an
# error page as if it were data.
req = requests.get(link, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")

table = soup.find_all('table')[0]
rows = table.find_all('tr')

# one list of cell texts per table row
row_list = [[cell.text for cell in tr.find_all('td')] for tr in rows]

# Build the frame once, after all rows are collected (not once per row).
# The first row supplies the column names; the last row is excluded
# (presumably a totals/footer row -- TODO confirm against the live page).
df_bs = pd.DataFrame(row_list[1:-1], columns=row_list[0])

# the serial-number column carries no information
df_bs = df_bs.drop('S. No.', axis=1)
df_bs.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured,Death
0,Delhi,6,0,0,0
1,Haryana,0,14,0,0
2,Kerala,19,0,3,0
3,Rajasthan,1,2,0,0
4,Telengana,1,0,0,0


# Data Cleaning

In [3]:
# date-time information
# ---------------------
# Stamp every row with the date on which the cell is run.

now = datetime.now()
date_format = "%m/%d/%Y"
today_text = now.strftime(date_format)

# The date goes through a month/day/year string and is parsed back, so the
# resulting datetime column carries no time-of-day component.
df_bs['Date'] = pd.to_datetime(
    pd.Series(today_text, index=df_bs.index),
    format=date_format,
)
df_bs.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured,Death,Date
0,Delhi,6,0,0,0,2020-03-13
1,Haryana,0,14,0,0,2020-03-13
2,Kerala,19,0,3,0,2020-03-13
3,Rajasthan,1,2,0,0,2020-03-13
4,Telengana,1,0,0,0,2020-03-13


In [4]:
# latitude and longitude information
# ----------------------------------
# One (latitude, longitude) pair per state / union territory name, keyed by
# the exact spelling used in the 'Name of State / UT' column.

state_coords = {
    'Delhi': (28.7041, 77.1025),
    'Haryana': (29.0588, 76.0856),
    'Kerala': (10.8505, 76.2711),
    'Rajasthan': (27.0238, 74.2179),
    'Telengana': (18.1124, 79.0193),
    'Uttar Pradesh': (26.8467, 80.9462),
    'Union Territory of Ladakh': (34.2996, 78.2932),
    'Tamil Nadu': (11.1271, 78.6569),
    'Union Territory of Jammu and Kashmir': (33.7782, 76.5762),
    'Punjab': (31.1471, 75.3412),
    'Karnataka': (15.3173, 75.7139),
    'Maharashtra': (19.7515, 75.7139),
    'Andhra Pradesh': (15.9129, 79.7400),
}

# split the pairs into the two lookup tables used for mapping
lat = {state: pair[0] for state, pair in state_coords.items()}
long = {state: pair[1] for state, pair in state_coords.items()}

# names missing from the lookup tables map to NaN
state_names = df_bs['Name of State / UT']
df_bs['Latitude'] = state_names.map(lat)
df_bs['Longitude'] = state_names.map(long)

df_bs.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured,Death,Date,Latitude,Longitude
0,Delhi,6,0,0,0,2020-03-13,28.7041,77.1025
1,Haryana,0,14,0,0,2020-03-13,29.0588,76.0856
2,Kerala,19,0,3,0,2020-03-13,10.8505,76.2711
3,Rajasthan,1,2,0,0,2020-03-13,27.0238,74.2179
4,Telengana,1,0,0,0,2020-03-13,18.1124,79.0193


In [5]:
# number of missing values in each column
missing_mask = df_bs.isna()
missing_mask.sum()

Name of State / UT                            0
Total Confirmed cases (Indian National)       0
Total Confirmed cases ( Foreign National )    0
Cured                                         0
Death                                         0
Date                                          0
Latitude                                      0
Longitude                                     0
dtype: int64

# Saving data

In [6]:
# saving data
# -----------
# Write the current frame to <file_loc>/<YYYY_MM_DD>.csv, named after `now`.

file_name = now.strftime("%Y_%m_%d")+'.csv'
file_loc = r'C:\Users\imdevskp\Desktop\covid_india'

# Join folder and file name with a path separator. Plain string
# concatenation produced '...\Desktop\covid_india2020_03_13.csv', i.e. a
# file next to the folder rather than inside it.
os.makedirs(file_loc, exist_ok=True)
df_bs.to_csv(os.path.join(file_loc, file_name), index=False)

df_bs.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured,Death,Date,Latitude,Longitude
0,Delhi,6,0,0,0,2020-03-13,28.7041,77.1025
1,Haryana,0,14,0,0,2020-03-13,29.0588,76.0856
2,Kerala,19,0,3,0,2020-03-13,10.8505,76.2711
3,Rajasthan,1,2,0,0,2020-03-13,27.0238,74.2179
4,Telengana,1,0,0,0,2020-03-13,18.1124,79.0193


# Combining data

In [7]:
# complete data
# -------------
# Stack every CSV in `loc` whose name starts with '2020' into one frame,
# order it chronologically and write it to 'complete.csv'.

loc = "C:\\Users\\imdevskp\\Desktop\\covid_india\\other_data\\"

# glob returns matches in arbitrary order; sort them so the result is
# reproducible from run to run
files = sorted(glob.glob(loc+'2020*.csv'))
if not files:
    # pd.concat on an empty list fails with an unhelpful message
    raise FileNotFoundError("no files matching '2020*.csv' found in " + loc)

dfs = [pd.read_csv(path) for path in files]

complete_data = pd.concat(dfs, ignore_index=True)
# parse the dates BEFORE sorting so the order is chronological rather than
# a lexicographic ordering of the raw strings
complete_data['Date'] = pd.to_datetime(complete_data['Date'])
# a stable sort keeps rows that share a date in file order
complete_data = (
    complete_data
    .sort_values(['Date'], ascending=True, kind='mergesort')
    .reset_index(drop=True)
)

complete_data.to_csv('complete.csv', index=False)
complete_data.head()

Unnamed: 0,Date,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured,Latitude,Longitude,Death
0,2020-01-30,Kerala,1.0,0.0,0.0,10.8505,76.2711,0
1,2020-01-31,Kerala,1.0,0.0,0.0,10.8505,76.2711,0
2,2020-02-01,Kerala,2.0,0.0,0.0,10.8505,76.2711,0
3,2020-02-02,Kerala,3.0,0.0,0.0,10.8505,76.2711,0
4,2020-02-03,Kerala,3.0,0.0,0.0,10.8505,76.2711,0


In [8]:
# print column names, non-null counts and dtypes of the combined frame
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 8 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Date                                        135 non-null    datetime64[ns]
 1   Name of State / UT                          135 non-null    object        
 2   Total Confirmed cases (Indian National)     135 non-null    float64       
 3   Total Confirmed cases ( Foreign National )  135 non-null    float64       
 4   Cured                                       135 non-null    float64       
 5   Latitude                                    135 non-null    float64       
 6   Longitude                                   135 non-null    float64       
 7   Death                                       135 non-null    int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 8.6+ KB
