In [3]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import pandas as pd
from bs4 import BeautifulSoup

# Web Scrapping

In [5]:
# web scrapping

link = 'https://www.mohfw.gov.in/'
req = requests.get(link)
soup = BeautifulSoup(req.content, "html.parser")

thead = soup.find_all('thead')
print(thead)
head = thead.find_all('tr')

tbody = soup.find_all('tbody')[2]
body = tbody.find_all('tr')

# print(rows)

head_rows = []
body_rows = []

for tr in head:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    head_rows.append(row)
    
for tr in body:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    body_rows.append(row)
    
print(head_rows)
    
df_bs = pd.DataFrame(body_rows[:len(body_rows)-1], columns=head_rows[0])
    
df_bs.drop('S. No.', axis=1, inplace=True)
df_bs.head(36)

[<thead class="thead-dark">
<tr bgcolor="#f8f8f8">
<th align="centre" width="20%"><strong>Date</strong></th>
<th align="centre" width="80%"><strong>TITLE</strong></th>
</tr>
</thead>, <thead class="thead-dark">
<tr bgcolor="#f8f8f8">
<th align="centre" width="20%"><strong>DATE</strong></th>
<th align="centre" width="80%"><strong>TITLE</strong></th>
</tr>
</thead>, <thead class="thead-dark">
<tr bgcolor="#f8f8f8">
<th align="centre" width="20%"><strong>DATE</strong></th>
<th align="centre" width="80%"><strong>TITLE</strong></th>
</tr>
</thead>, <thead class="thead-dark">
<tr bgcolor="#f8f8f8">
<th align="centre" width="20%"><strong>DATE</strong></th>
<th align="centre" width="80%"><strong>TITLE</strong></th>
</tr>
</thead>, <thead class="thead-dark">
<tr bgcolor="#f8f8f8">
<th align="centre" width="20%"><strong>DATE</strong></th>
<th align="centre" width="80%"><strong>TITLE</strong></th>
</tr>
</thead>, <thead class="thead-dark">
<tr bgcolor="#f8f8f8">
<th align="centre" width="80%"><st

AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

# Data Cleaning

In [None]:
# date-time information
# ---------------------

now  = datetime.now()
df_bs['Date'] = now.strftime("%m/%d/%Y") 
df_bs['Date'] = pd.to_datetime(df_bs['Date'], format='%m/%d/%Y')
df_bs.head(36)

In [None]:
df_bs['Name of State / UT'].unique()

In [None]:
# latitude and longitude information
# ----------------------------------

lat = {'Delhi':28.7041,
       'Haryana':29.0588,
       'Kerala':10.8505,
       'Rajasthan':27.0238,
       'Telengana':18.1124,
       'Uttar Pradesh':26.8467,
       'Ladakh':34.2996,
       'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782,
       'Punjab':31.1471,
       'Karnataka':15.3173,
       'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 
       'Odisha':20.9517, 
       'Uttarakhand':30.0668, 
       'West Bengal':22.9868, 
       'Puducherry': 11.9416, 
       'Chandigarh': 30.7333, 
       'Chhattisgarh':21.2787, 
       'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 
       'Madhya Pradesh': 22.9734, 
       'Bihar': 25.0961}

long = {'Delhi':77.1025,
        'Haryana':76.0856,
        'Kerala':76.2711,
        'Rajasthan':74.2179,
        'Telengana':79.0193,
        'Uttar Pradesh':80.9462,
        'Ladakh':78.2932,
        'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762,
        'Punjab':75.3412,
        'Karnataka':75.7139,
        'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 
        'Odisha':85.0985, 
        'Uttarakhand':79.0193, 
        'West Bengal':87.8550, 
        'Puducherry': 79.8083, 
        'Chandigarh': 76.7794, 
        'Chhattisgarh':81.8661, 
        'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 
        'Madhya Pradesh': 78.6569, 
        'Bihar': 85.3131}

df_bs['Latitude'] = df_bs['Name of State / UT'].map(lat)
df_bs['Longitude'] = df_bs['Name of State / UT'].map(long)

df_bs.head()

In [None]:
df_bs.isna().sum()

# Saving data

In [None]:
# saving data
# -----------

file_name = now.strftime("%Y_%m_%d")+'.csv'
file_loc = 'C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\'
df_bs.to_csv(file_loc + file_name, index=False)

df_bs.head(36)

In [None]:
df_bs.columns

# Combining data

In [None]:
! ls C:\Users\imdevskp\Desktop\covid_india\.day_by_day_data

In [None]:
# pd.read_csv?

In [None]:
# complete data

loc = "C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\"

files = glob.glob(loc+'2020*.csv')
dfs = []
for i in files:
    df_temp = pd.read_csv(i)
    df_temp = df_temp.rename(columns={'Cured':'Cured/Discharged'})
    df_temp = df_temp.rename(columns={'Cured/Discharged':'Cured/Discharged/Migrated'})
    dfs.append(df_temp)
    
# print(dfs)

complete_data = pd.concat(dfs, ignore_index=True).sort_values(['Date'], ascending=True).reset_index(drop=True)
complete_data['Date'] = pd.to_datetime(complete_data['Date'])
complete_data = complete_data.sort_values(['Date', 'Name of State / UT']).reset_index(drop=True)

cols = ['Total Confirmed cases (Indian National)', 'Total Confirmed cases ( Foreign National )', 
              'Cured/Discharged/Migrated', 'Death']

complete_data[cols] = complete_data[cols].fillna(0).astype('int')

# complete_data.tail(50)

In [None]:
complete_data.columns

In [None]:
complete_data['Name of State / UT'].replace('Chattisgarh', 'Chhattisgarh', inplace=True)
complete_data['Name of State / UT'].replace('Pondicherry', 'Puducherry', inplace=True) 

In [None]:
complete_data['Name of State / UT'].unique()

In [None]:
# sorted(complete_data['Name of State / UT'].unique())

In [None]:
complete_data.info()

In [None]:
complete_data.to_csv('complete.csv', index=False)