In [1]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Web scraping

In [2]:
# download data
# =============

# Google Sheets `pubhtml` URL whose HTML table is scraped in the cells below.
link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml#'

# A timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() stops the run on an HTTP error instead of letting an
# error page be parsed as if it were the sheet.
req = requests.get(link, timeout=30)
req.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser backend.
soup = BeautifulSoup(req.content, "html.parser")

In [3]:
# Take the first <tbody> element found in the parsed page and collect every
# <tr> inside it; `body` is the list of table rows used by the later cells.
tbodies = soup.find_all('tbody')
tbody = tbodies[0]
body = tbody.find_all('tr')

In [4]:
# Column labels: the text of each <td> in the first table row.
first_row_cells = body[0].find_all('td')
head_row = [cell.text for cell in first_row_cells]
head_row

['Patient Number',
 'State Patient Number',
 'Date Announced',
 'Age Bracket',
 'Gender',
 'Detected City',
 'Detected District',
 'Detected State',
 'Current Status',
 'Notes',
 'Contracted from which Patient (Suspected)',
 'Nationality',
 'Type of transmission',
 'Status Change Date',
 'Source_1',
 'Source_2',
 'Source_3',
 'Backup Notes',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [5]:
# One list of cell texts per table row. Iterating over the rows directly
# avoids the index loop and the reuse of the same name for both the row index
# and the cell variable.
contents = [
    [cell.text for cell in row.find_all('td')]
    for row in body
]

# Saving to DataFrame

In [6]:
# Build the frame from row index 2 onwards. Row 0 supplied the column labels
# (head_row); row 1 is skipped as well -- presumably a non-data row, verify
# against the sheet.
data_rows = contents[2:]
p_df = pd.DataFrame(data_rows, columns=head_row)
p_df.head()

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Age Bracket,Gender,Detected City,Detected District,Detected State,Current Status,Notes,...,Source_2,Source_3,Backup Notes,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,,
1,2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan,,,,,,,
2,3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,Recovered,Travelled from Wuhan,...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,
3,4,DL-P1,02/03/2020,45.0,M,East Delhi (Mayur Vihar),East Delhi,Delhi,Recovered,"Travelled from Austria, Italy",...,https://economictimes.indiatimes.com/news/poli...,,Travel history to Italy and Austria,,,,,,,
4,5,TS-P1,02/03/2020,24.0,M,Hyderabad,Hyderabad,Telangana,Recovered,"Travelled from Dubai to Bangalore on 20th Feb,...",...,https://www.indiatoday.in/india/story/coronavi...,https://www.thehindu.com/news/national/coronav...,"Travel history to Dubai, Singapore contact",,,,,,,


# Data cleaning and transformations

In [7]:
# shape of the dataframe as (rows, columns), before any cleaning
# ==============================================================

p_df.shape

(2071, 25)

In [8]:
# column labels as scraped (includes the empty-string labels taken from head_row)
# ===============================================================================

p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Type of transmission', 'Status Change Date', 'Source_1', 'Source_2',
       'Source_3', 'Backup Notes', '', '', '', '', '', '', ''],
      dtype='object')

In [9]:
# keep only the columns of interest
# =================================

# 'Type of transmission' and the blank-labelled columns are not listed here,
# so they are dropped by this selection.
selected_columns = [
    'Patient Number',
    'State Patient Number',
    'Date Announced',
    'Age Bracket',
    'Gender',
    'Detected City',
    'Detected District',
    'Detected State',
    'Current Status',
    'Notes',
    'Contracted from which Patient (Suspected)',
    'Nationality',
    'Status Change Date',
    'Source_1',
    'Source_2',
    'Source_3',
    'Backup Notes',
]
p_df = p_df[selected_columns]

In [10]:
# looking for missing values
# ==========================
# Every cell was scraped as text, so blank cells are empty strings rather than
# NaN; isna() therefore reports 0 here. Empty strings are counted separately
# in the next cell.

p_df.isna().sum()

Patient Number                               0
State Patient Number                         0
Date Announced                               0
Age Bracket                                  0
Gender                                       0
Detected City                                0
Detected District                            0
Detected State                               0
Current Status                               0
Notes                                        0
Contracted from which Patient (Suspected)    0
Nationality                                  0
Status Change Date                           0
Source_1                                     0
Source_2                                     0
Source_3                                     0
Backup Notes                                 0
dtype: int64

In [11]:
# how many cells in each column hold an empty string
# ==================================================

print(p_df.shape)

for col_name in p_df.columns:
    # Boolean mask of the cells equal to ''; summing it counts the True values.
    is_empty = p_df[col_name] == ''
    print(col_name, '\t', int(is_empty.sum()))

(2071, 17)
Patient Number 	 472
State Patient Number 	 1774
Date Announced 	 715
Age Bracket 	 1522
Gender 	 1480
Detected City 	 1399
Detected District 	 958
Detected State 	 715
Current Status 	 715
Notes 	 722
Contracted from which Patient (Suspected) 	 1795
Nationality 	 1651
Status Change Date 	 718
Source_1 	 721
Source_2 	 1446
Source_3 	 1929
Backup Notes 	 1711


In [12]:
# replacing empty strings with np.nan
# ===================================

print(p_df.shape)

# Literal match on '' only. An empty pattern combined with regex=True reads as
# "match everything" (an empty regex matches any string), so the literal form
# is used to state the intent unambiguously: only cells that are exactly the
# empty string become NaN.
p_df = p_df.replace('', np.nan)

# Per-column NaN counts after the replacement.
p_df.isna().sum()

(2071, 17)


Patient Number                                472
State Patient Number                         1774
Date Announced                                715
Age Bracket                                  1522
Gender                                       1480
Detected City                                1399
Detected District                             958
Detected State                                715
Current Status                                715
Notes                                         722
Contracted from which Patient (Suspected)    1795
Nationality                                  1651
Status Change Date                            718
Source_1                                      721
Source_2                                     1446
Source_3                                     1929
Backup Notes                                 1711
dtype: int64

In [13]:
# dropping empty rows (rows with just a row number but no patient entry)
# ======================================================================

# Shape is printed before and after so the number of removed rows is visible.
print(p_df.shape)
p_df = p_df.dropna(subset=['Detected State'])
print(p_df.shape)

(2071, 17)
(1356, 17)


In [14]:
# column labels at this point, shown for reference before they are renamed
p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes'],
      dtype='object')

In [15]:
# rename dataframe columns
# ========================

# First pass: lower-case every label and join its whitespace-separated words
# with '_'. Second pass: give the long "contracted from" label a shorter name.
p_df = (
    p_df
    .rename(columns=lambda label: '_'.join(label.lower().split()))
    .rename(columns={'contracted_from_which_patient_(suspected)': 'suspected_contacted_patient'})
)
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,suspected_contacted_patient,nationality,status_change_date,source_1,source_2,source_3,backup_notes
1317,1318,KA-P91,30/03/2020,,,Hospet,Ballari,Karnataka,Hospitalized,Related to P1316 and P1317,P1316,,30/03/2020,https://twitter.com/ANI/status/124465346073933...,https://twitter.com/sriramulubjp/status/124465...,,
178,179,,19/03/2020,,,Lucknow,Lucknow,Uttar Pradesh,Hospitalized,Travelled from UK,,India,19/03/2020,https://twitter.com/ANINewsUP/status/124053687...,https://www.livemint.com/news/india/coronaviru...,,Travel history to UK
52,53,KA-P4,10/03/2020,13.0,F,Bengaluru,Bengaluru,Karnataka,Recovered,Daughter of P51 and P52,P51,India,24/03/2020,https://www.indiatoday.in/india/story/coronavi...,,,Daughter of P51 and P52
360,361,,22/03/2020,,,,Shahid Bhagat Singh Nagar,Punjab,Hospitalized,"Close contact of Patient (Patient Unknown, lik...",P182,,22/03/2020,https://twitter.com/ANI/status/124164157216210...,https://www.hindustantimes.com/india-news/coro...,https://punjabupdate.com/8-new-cases-of-covid-...,Details awaited
45,46,PJ-P1,09/03/2020,,M,Amritsar,Amritsar,Punjab,Hospitalized,Travelled from Italy,,Italy,09/03/2020,https://www.indiatoday.in/india/story/coronavi...,https://punjabupdate.com/media-bulletin-21-03-...,,Returned from Milan


# Saving Data

In [16]:
# random 5-row spot check of the frame before it is written to disk
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,suspected_contacted_patient,nationality,status_change_date,source_1,source_2,source_3,backup_notes
30,31,DL-P3,06/03/2020,,,Uttam Nagar,West Delhi,Delhi,Recovered,Travelled from Thailand and Malaysia,,India,15/03/2020,https://indianexpress.com/article/india/corona...,https://www.indiatoday.in/india/story/delhi-po...,http://health.delhigovt.nic.in/wps/wcm/connect...,Travelled to Thailand and Malaysia
986,987,,28/03/2020,,,,Jammu,Jammu and Kashmir,Hospitalized,Details awaited,,,28/03/2020,https://twitter.com/kansalrohit69/status/12439...,,,
1293,1294,,30/03/2020,,,,,Delhi,Hospitalized,Details awaited,,,30/03/2020,https://twitter.com/ANI/status/124464075167946...,,,
637,638,,25/03/2020,55.0,F,Ludhiana,Ludhiana,Punjab,Hospitalized,"No Travel History (Husband, Son and 2 domestic...",,India,25/03/2020,https://twitter.com/PTI_News/status/1242807440...,https://www.outlookindia.com/newsscroll/corona...,https://www.tribuneindia.com/news/ludhiana-pat...,
1324,1325,,30/03/2020,,,,Jaipur,Rajasthan,Hospitalized,"Family member of Patient, ID Unknown",,,30/03/2020,https://twitter.com/pti_news/status/1244677217...,https://twitter.com/ANI/status/124470276227047...,,


In [17]:
# Write the frame to 'patients_data.csv' (path is relative to the current
# working directory); index=False leaves the row index out of the file.
p_df.to_csv('patients_data.csv', index=False)