In [1]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Web scraping

In [2]:
# download data
# =============

# Published-to-web HTML view of the patient spreadsheet.
link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml#'

# A timeout keeps the notebook from hanging forever on a stalled connection;
# raise_for_status() stops here on an HTTP error so that an error page is never
# parsed below as if it were the sheet.
req = requests.get(link, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")

In [3]:
# The table rows are taken from the first <tbody> element of the page.
tbody = soup.find('tbody')
if tbody is None:
    # Fail with a readable message instead of an IndexError on an empty list.
    raise ValueError('no <tbody> element found in the downloaded page')

# Every <tr> inside that <tbody>, header row included.
body = tbody.find_all('tr')

In [4]:
# Column names are the cell texts of the first table row.
header_cells = body[0].find_all('td')
head_row = [cell.text for cell in header_cells]
head_row

['Patient Number',
 'State Patient Number',
 'Date Announced',
 'Age Bracket',
 'Gender',
 'Detected City',
 'Detected District',
 'Detected State',
 'State code',
 'Current Status',
 'Notes',
 'Contracted from which Patient (Suspected)',
 'Nationality',
 'Type of transmission',
 'Status Change Date',
 'Source_1',
 'Source_2',
 'Source_3',
 'Backup Notes',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [5]:
# One list of cell texts per table row, in the order the rows appear in `body`.
contents = [[cell.text for cell in row.find_all('td')] for row in body]

# Saving to Dataframe

In [6]:
# The first two scraped rows are skipped: row 0 already supplied the column names.
# NOTE(review): row 1 is presumably a non-data spacer row — confirm against the sheet.
data_rows = contents[2:]
p_df = pd.DataFrame(data_rows, columns=head_row)
p_df.head()

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Age Bracket,Gender,Detected City,Detected District,Detected State,State code,Current Status,...,Source_2,Source_3,Backup Notes,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,KL,Recovered,...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,,
1,2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,KL,Recovered,...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan,,,,,,,
2,3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,KL,Recovered,...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,
3,4,DL-P1,02/03/2020,45.0,M,East Delhi (Mayur Vihar),East Delhi,Delhi,DL,Recovered,...,https://economictimes.indiatimes.com/news/poli...,,Travel history to Italy and Austria,,,,,,,
4,5,TS-P1,02/03/2020,24.0,M,Hyderabad,Hyderabad,Telangana,TG,Recovered,...,https://www.indiatoday.in/india/story/coronavi...,https://www.thehindu.com/news/national/coronav...,"Travel history to Dubai, Singapore contact",,,,,,,


# Data cleaning and transformations

In [7]:
# shape of dataframe: (rows, columns) of the frame as scraped, before any cleaning
# ================================================================================

p_df.shape

(7100, 26)

In [8]:
# column labels of the frame as scraped (the header row's empty cells show up as '' labels)
# =========================================================================================

p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'State code', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Type of transmission', 'Status Change Date', 'Source_1', 'Source_2',
       'Source_3', 'Backup Notes', '', '', '', '', '', '', ''],
      dtype='object')

In [9]:
# selecting important columns only
# ================================

# Label-based slice: keeps every column up to and including 'Backup Notes',
# which discards the unnamed ('') columns that come after it.
p_df = p_df.loc[:, :'Backup Notes']

In [10]:
# looking for missing values
# ==========================

# Counts NaN/None only; cells holding an empty string are not counted as missing here.
p_df.isna().sum()

Patient Number                               0
State Patient Number                         0
Date Announced                               0
Age Bracket                                  0
Gender                                       0
Detected City                                0
Detected District                            0
Detected State                               0
State code                                   0
Current Status                               0
Notes                                        0
Contracted from which Patient (Suspected)    0
Nationality                                  0
Type of transmission                         0
Status Change Date                           0
Source_1                                     0
Source_2                                     0
Source_3                                     0
Backup Notes                                 0
dtype: int64

In [11]:
# number of empty-string cells in each column
# ===========================================

print(p_df.shape)

for column in p_df.columns:
    # A boolean mask sums to the number of cells equal to ''.
    empty_count = int((p_df[column] == '').sum())
    print(column, '\t', empty_count)

(7100, 19)
Patient Number 	 467
State Patient Number 	 5905
Date Announced 	 893
Age Bracket 	 6049
Gender 	 5548
Detected City 	 6203
Detected District 	 2194
Detected State 	 893
State code 	 895
Current Status 	 894
Notes 	 2087
Contracted from which Patient (Suspected) 	 5851
Nationality 	 5937
Type of transmission 	 4307
Status Change Date 	 1003
Source_1 	 1084
Source_2 	 5199
Source_3 	 6804
Backup Notes 	 6740


In [12]:
# replacing empty strings with np.nan
# ===================================

print(p_df.shape)

# Plain literal match on the empty string. A regex is not wanted here: an empty
# pattern matches inside every string, so `regex=True` only obscured the intent.
p_df = p_df.replace('', np.nan)
p_df.isna().sum()

(7100, 19)


Patient Number                                467
State Patient Number                         5905
Date Announced                                893
Age Bracket                                  6049
Gender                                       5548
Detected City                                6203
Detected District                            2194
Detected State                                893
State code                                    895
Current Status                                894
Notes                                        2087
Contracted from which Patient (Suspected)    5851
Nationality                                  5937
Type of transmission                         4307
Status Change Date                           1003
Source_1                                     1084
Source_2                                     5199
Source_3                                     6804
Backup Notes                                 6740
dtype: int64

In [13]:
# dropping empty rows (rows with just a row number but no patient entry)
# ======================================================================

# Rows without a 'Detected State' are discarded. The explicit copy gives an
# independent frame, so the column assignment made later acts on its own data.
p_df = p_df.dropna(subset=['Detected State']).copy()
print(p_df.shape)
p_df.isna().sum()

(6207, 19)


Patient Number                                  0
State Patient Number                         5012
Date Announced                                  0
Age Bracket                                  5156
Gender                                       4655
Detected City                                5310
Detected District                            1301
Detected State                                  0
State code                                      2
Current Status                                  1
Notes                                        1194
Contracted from which Patient (Suspected)    4958
Nationality                                  5044
Type of transmission                         3414
Status Change Date                            110
Source_1                                      191
Source_2                                     4306
Source_3                                     5911
Backup Notes                                 5847
dtype: int64

In [14]:
# column labels after the column selection above; the unnamed ('') columns are gone
p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'State code', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Type of transmission', 'Status Change Date', 'Source_1', 'Source_2',
       'Source_3', 'Backup Notes'],
      dtype='object')

In [15]:
# rename dataframe columns
# ========================

def _to_snake_case(label):
    """Lower-case a column label and join its whitespace-separated words with '_'."""
    return '_'.join(label.lower().split())

p_df = (
    p_df
    .rename(columns=_to_snake_case)
    # the snake-cased label of this column is unwieldy, so it gets a shorter name
    .rename(columns={'contracted_from_which_patient_(suspected)': 'suspected_contacted_patient'})
)
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,current_status,notes,suspected_contacted_patient,nationality,type_of_transmission,status_change_date,source_1,source_2,source_3,backup_notes
5786,5787,,08/04/2020,,,,,Madhya Pradesh,MP,Hospitalized,Details awaited,,,,08/04/2020,https://twitter.com/JansamparkMP/status/124790...,,,
5074,5075,,07/04/2020,,,,Mumbai,Maharashtra,MH,Hospitalized,Details awaited,,,,07/04/2020,https://twitter.com/mybmc/status/1247523153707...,,,
5851,5852,,08/04/2020,,,,,Delhi,DL,Hospitalized,Details awaited,,,,08/04/2020,https://twitter.com/CMODelhi/status/1247929317...,,,
4612,4613,,06/04/2020,,,,Mumbai,Maharashtra,MH,Hospitalized,Details awaited,,,,06/04/2020,https://twitter.com/PTI_News/status/1247164710...,https://arogya.maharashtra.gov.in/pdf/covidmpr...,,
633,634,KA-P47,25/03/2020,63.0,F,Bengaluru,Bengaluru,Karnataka,KA,Hospitalized,Travel History to Athens and London and arrive...,,India,Imported,25/03/2020,https://twitter.com/DHFWKA/status/124280466314...,State Health Bulletin,,


In [16]:
# creating patient id column from patient number
# ===============================================

# Each id is the letter 'P' followed by the string form of the patient number.
p_df['p_id'] = p_df['patient_number'].map('P{}'.format)
p_df.columns

Index(['patient_number', 'state_patient_number', 'date_announced',
       'age_bracket', 'gender', 'detected_city', 'detected_district',
       'detected_state', 'state_code', 'current_status', 'notes',
       'suspected_contacted_patient', 'nationality', 'type_of_transmission',
       'status_change_date', 'source_1', 'source_2', 'source_3',
       'backup_notes', 'p_id'],
      dtype='object')

In [17]:
# Keep the columns up to and including 'backup_notes' (label slices are inclusive).
# NOTE(review): 'p_id' was appended after 'backup_notes' in the previous cell, so this
# slice drops it again — confirm that is intended.
p_df = p_df.loc[:, :'backup_notes']

# Saving Data

In [18]:
# Spot-check a few random rows before saving (no random_state, so the rows differ per run).
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,current_status,notes,suspected_contacted_patient,nationality,type_of_transmission,status_change_date,source_1,source_2,source_3,backup_notes
1188,1189,TN-P66,30/03/2020,50.0,F,Broadway,Chennai,Tamil Nadu,TN,Hospitalized,No Travel History,,,Local,30/03/2020,https://twitter.com/Vijayabaskarofl/status/124...,https://twitter.com/THChennai/status/124451822...,,
458,459,,23/03/2020,34.0,,Kasaragod,Kasaragod,Kerala,KL,Hospitalized,"Travelled from Dubai, UAE",,India,Imported,23/03/2020,https://twitter.com/ANI/status/124206972027616...,https://www.facebook.com/permalink.php?story_f...,,
2277,2278,TN-P306,02/04/2020,,,,Virudhunagar,Tamil Nadu,TN,Hospitalized,Travelled to Delhi,E0,,Local,02/04/2020,https://twitter.com/ANI/status/124569362647211...,,,
4496,4497,,06/04/2020,,,,Kasaragod,Kerala,KL,Hospitalized,Contact Transmission,,,Local,06/04/2020,https://twitter.com/PIBTvpm/status/12471458817...,http://dhs.kerala.gov.in/wp-content/uploads/20...,https://www.facebook.com/permalink.php?story_f...,
5866,5867,,08/04/2020,,,,,Delhi,DL,Hospitalized,Details awaited,,,,08/04/2020,https://twitter.com/CMODelhi/status/1247929317...,,,


In [19]:
# Write the cleaned table to 'patients_data.csv' (relative path), leaving out the index.
p_df.to_csv('patients_data.csv', index=False)