In [1]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Web scraping

In [2]:
# download the published Google Sheet and parse its HTML
# ======================================================

link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml#'
# Bound the wait so an unresponsive server cannot hang the notebook forever.
req = requests.get(link, timeout=30)
# Fail here on an HTTP error rather than parsing an error page as if it were data.
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")

In [3]:
# Take the first <tbody> of the page and collect its <tr> rows.
tbodies = soup.find_all('tbody')
if not tbodies:
    # Without this guard a page that contains no table fails with a bare IndexError.
    raise ValueError('no <tbody> element found in the downloaded page')
tbody = tbodies[0]
body = tbody.find_all('tr')

In [4]:
# Text of every <td> cell in the first table row; used as the column labels.
header_cells = body[0].find_all('td')
head_row = [cell.text for cell in header_cells]
head_row

['Patient Number',
 'State Patient Number',
 'Date Announced',
 'Age Bracket',
 'Gender',
 'Detected City',
 'Detected District',
 'Detected State',
 'Current Status',
 'Notes',
 'Contracted from which Patient (Suspected)',
 'Nationality',
 'Type of transmission',
 'Status Change Date',
 'Source_1',
 'Source_2',
 'Source_3',
 'Backup Notes',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [5]:
# One list of cell texts per table row, in page order (index 0 is the header row).
# Iterating over the rows directly avoids reusing one name for both the row
# index and the cell variable.
contents = [[cell.text for cell in row.find_all('td')] for row in body]

# Saving to DataFrame

In [6]:
# The first two scraped rows are not loaded as data: row 0 is the header row
# (already held in head_row); row 1 is presumably a non-data row -- TODO confirm.
data_rows = contents[2:]
p_df = pd.DataFrame(data_rows, columns=head_row)
p_df.head()

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Age Bracket,Gender,Detected City,Detected District,Detected State,Current Status,Notes,...,Source_2,Source_3,Backup Notes,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,,
1,2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan,,,,,,,
2,3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,Recovered,Travelled from Wuhan,...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,
3,4,DL-P1,02/03/2020,45.0,M,East Delhi (Mayur Vihar),East Delhi,Delhi,Recovered,"Travelled from Austria, Italy",...,https://economictimes.indiatimes.com/news/poli...,,Travel history to Italy and Austria,,,,,,,
4,5,TS-P1,02/03/2020,24.0,M,Hyderabad,Hyderabad,Telangana,Recovered,"Travelled from Dubai to Bangalore on 20th Feb,...",...,https://www.indiatoday.in/india/story/coronavi...,https://www.thehindu.com/news/national/coronav...,"Travel history to Dubai, Singapore contact",,,,,,,


# Data cleaning and transformations

In [7]:
# shape of dataframe
# ==================
# (number of rows, number of columns) of the freshly scraped table

p_df.shape

(2071, 25)

In [8]:
# columns
# =======
# Labels come from the sheet's header row; the blank labels at the end
# correspond to header cells that contain no text.

p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Type of transmission', 'Status Change Date', 'Source_1', 'Source_2',
       'Source_3', 'Backup Notes', '', '', '', '', '', '', ''],
      dtype='object')

In [9]:
# keep only the columns used in the rest of the notebook
# ======================================================
# 'Type of transmission' and the blank-labelled trailing columns are left out.

selected_columns = [
    'Patient Number',
    'State Patient Number',
    'Date Announced',
    'Age Bracket',
    'Gender',
    'Detected City',
    'Detected District',
    'Detected State',
    'Current Status',
    'Notes',
    'Contracted from which Patient (Suspected)',
    'Nationality',
    'Status Change Date',
    'Source_1',
    'Source_2',
    'Source_3',
    'Backup Notes',
]
p_df = p_df[selected_columns]

In [10]:
# looking for missing values
# ==========================
# Every count is 0 at this point: the scraped cells are text, so a blank
# cell is an empty string rather than NaN.

p_df.isna().sum()

Patient Number                               0
State Patient Number                         0
Date Announced                               0
Age Bracket                                  0
Gender                                       0
Detected City                                0
Detected District                            0
Detected State                               0
Current Status                               0
Notes                                        0
Contracted from which Patient (Suspected)    0
Nationality                                  0
Status Change Date                           0
Source_1                                     0
Source_2                                     0
Source_3                                     0
Backup Notes                                 0
dtype: int64

In [11]:
# number of cells holding an empty string, per column
# ===================================================

print(p_df.shape)

for column_name in p_df.columns:
    # Boolean mask of blank cells; summing it counts the True entries.
    is_blank = p_df[column_name] == ''
    print(column_name, '\t', int(is_blank.sum()))

(2071, 17)
Patient Number 	 409
State Patient Number 	 1660
Date Announced 	 409
Age Bracket 	 1453
Gender 	 1330
Detected City 	 1349
Detected District 	 762
Detected State 	 409
Current Status 	 409
Notes 	 417
Contracted from which Patient (Suspected) 	 1658
Nationality 	 1648
Status Change Date 	 412
Source_1 	 474
Source_2 	 1384
Source_3 	 1925
Backup Notes 	 1711


In [12]:
# replacing empty strings with np.nan
# ===================================

print(p_df.shape)

# Literal (non-regex) match on the empty string. Read as a regular expression,
# '' would match every string, so the replacement is not requested as a regex.
p_df = p_df.replace('', np.nan)
p_df.isna().sum()

(2071, 17)


Patient Number                                409
State Patient Number                         1660
Date Announced                                409
Age Bracket                                  1453
Gender                                       1330
Detected City                                1349
Detected District                             762
Detected State                                409
Current Status                                409
Notes                                         417
Contracted from which Patient (Suspected)    1658
Nationality                                  1648
Status Change Date                            412
Source_1                                      474
Source_2                                     1384
Source_3                                     1925
Backup Notes                                 1711
dtype: int64

In [13]:
# dropping empty rows (rows with just a row number but without a patient entry)
# ==============================================================================

# A row counts as empty when 'Detected State' is missing.
# Reassign rather than mutate in place, so the step is a plain transformation.
p_df = p_df.dropna(subset=['Detected State'])
print(p_df.shape)
p_df.isna().sum()

(1662, 17)


Patient Number                                  0
State Patient Number                         1251
Date Announced                                  0
Age Bracket                                  1044
Gender                                        921
Detected City                                 940
Detected District                             353
Detected State                                  0
Current Status                                  0
Notes                                           8
Contracted from which Patient (Suspected)    1249
Nationality                                  1239
Status Change Date                              3
Source_1                                       65
Source_2                                      975
Source_3                                     1516
Backup Notes                                 1302
dtype: int64

In [14]:
# column labels before they are renamed
p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes'],
      dtype='object')

In [15]:
# rename dataframe columns
# ========================

# Lower-case every label and join its whitespace-separated words with '_'.
snake_case_labels = {col: '_'.join(col.lower().split()) for col in p_df.columns}

# Both renames return new frames, so nothing is mutated in place.
p_df = (
    p_df
    .rename(columns=snake_case_labels)
    # Shorten the one label that still carries parentheses.
    .rename(columns={'contracted_from_which_patient_(suspected)': 'suspected_contacted_patient'})
)
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,suspected_contacted_patient,nationality,status_change_date,source_1,source_2,source_3,backup_notes
1451,1452,,31/03/2020,,,,Kannur,Kerala,Hospitalized,Details awaited,,,31/03/2020,https://twitter.com/ANI/status/124497050308277...,,,
1442,1443,,31/03/2020,,F,,Sirsa,Haryana,Hospitalized,Details awaited,,,31/03/2020,https://twitter.com/ANI/status/124497168118297...,,,
1595,1596,,31/03/2020,,,,,Delhi,Hospitalized,Details Awaited,,,31/03/2020,https://twitter.com/ANI/status/124502337994339...,,,
984,985,,28/03/2020,,,,Jammu,Jammu and Kashmir,Hospitalized,Details awaited,,,28/03/2020,https://twitter.com/kansalrohit69/status/12439...,,,
1120,1121,,29/03/2020,,,,,Delhi,Hospitalized,Details awaited,,,29/03/2020,https://twitter.com/PTI_News/status/1244275406...,,,


In [16]:
# build the patient id column: 'P' followed by the patient number
# ===============================================================

p_df['p_id'] = p_df['patient_number'].map('P{}'.format)
p_df.columns

Index(['patient_number', 'state_patient_number', 'date_announced',
       'age_bracket', 'gender', 'detected_city', 'detected_district',
       'detected_state', 'current_status', 'notes',
       'suspected_contacted_patient', 'nationality', 'status_change_date',
       'source_1', 'source_2', 'source_3', 'backup_notes', 'p_id'],
      dtype='object')

In [17]:
# Reorder the columns so that p_id sits right after patient_number.
ordered_columns = [
    'patient_number',
    'p_id',
    'state_patient_number',
    'date_announced',
    'age_bracket',
    'gender',
    'detected_city',
    'detected_district',
    'detected_state',
    'current_status',
    'notes',
    'suspected_contacted_patient',
    'nationality',
    'status_change_date',
    'source_1',
    'source_2',
    'source_3',
    'backup_notes',
]
p_df = p_df[ordered_columns]

# Saving Data

In [18]:
# five randomly chosen rows of the table (no seed is set, so the rows differ between runs)
p_df.sample(5)

Unnamed: 0,patient_number,p_id,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,suspected_contacted_patient,nationality,status_change_date,source_1,source_2,source_3,backup_notes
1350,1351,P1351,,31/03/2020,,,,Buldana,Maharashtra,Hospitalized,Details awaited,,,31/03/2020,https://twitter.com/ANI/status/124483198605462...,,,
864,865,P865,,27/03/2020,45.0,F,,Nadia,West Bengal,Hospitalized,"Family, History of contact to a positive case ...",,,27/03/2020,https://www.wbhealth.gov.in/uploaded_files/cor...,,,
576,577,P577,,25/03/2020,65.0,F,Ujjain,Ujjain,Madhya Pradesh,Deceased,Details awaited,,,25/03/2020,https://twitter.com/PTI_News/status/1242660625...,https://dbpost.com/mp-coronavirus-cases-update...,https://twitter.com/the_hindu/status/124280258...,
645,646,P646,TS-P40,25/03/2020,43.0,F,,Hyderabad,Telangana,Hospitalized,Contact with TS-P34,P525,,25/03/2020,https://twitter.com/TelanganaHealth/status/124...,,,
1411,1412,P1412,AP-P35,31/03/2020,59.0,M,Chirala,Prakasam,Andhra Pradesh,Hospitalized,Attended Delhi Religious Conference,E0,,31/03/2020,http://hmfw.ap.gov.in/Daily_bullettin/31-03-20...,,,


In [19]:
# write the table to patients_data.csv (relative path) without the index column
p_df.to_csv('patients_data.csv', index=False)