In [71]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Web scrapping

In [18]:
# download data 
# =============

link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml#'
req = requests.get(link)
soup = BeautifulSoup(req.content, "html.parser")

In [29]:
tbody = soup.find_all('tbody')[0]
body = tbody.find_all('tr')

# print(tbody)
# print(body)

In [39]:
head_row = [i.text for i in body[0].find_all('td')]
head_row

['Patient Number',
 'State Patient Number',
 'Date Announced',
 'Age Bracket',
 'Gender',
 'Detected City',
 'Detected District',
 'Detected State',
 'Current Status',
 'Notes',
 'Contracted from which Patient (Suspected)',
 'Nationality',
 'Status Change Date',
 'Source_1',
 'Source_2',
 'Source_3',
 'Backup Notes',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [45]:
contents = []

for i in range(len(body)):
    contents.append([i.text for i in body[i].find_all('td')])

# Saving to Dataframe

In [49]:
p_df = pd.DataFrame(contents[2:len(contents)], columns=head_row)
p_df.head()

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Age Bracket,Gender,Detected City,Detected District,Detected State,Current Status,Notes,...,Source_2,Source_3,Backup Notes,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,,
1,2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan,,,,,,,
2,3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,Recovered,Travelled from Wuhan,...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,
3,4,DL-P1,02/03/2020,45.0,M,East Delhi (Mayur Vihar),East Delhi,Delhi,Recovered,"Travelled from Austria, Italy",...,https://economictimes.indiatimes.com/news/poli...,,Travel history to Italy and Austria,,,,,,,
4,5,TS-P1,02/03/2020,24.0,M,Hyderabad,Hyderabad,Telangana,Recovered,,...,https://www.indiatoday.in/india/story/coronavi...,https://www.thehindu.com/news/national/coronav...,"Travel history to Dubai, Singapore contact",,,,,,,


# Data cleaning and transformations

In [51]:
# shape of dataframe
# ==================

p_df.shape

(2072, 24)

In [57]:
# columns
# =======

p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes', '', '', '', '', '', '', ''],
      dtype='object')

In [58]:
# selecting important columns only
# ================================

p_df = p_df[['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes']]

In [59]:
# looking for missing values 
# ==========================

p_df.isna().sum()

Patient Number                               0
State Patient Number                         0
Date Announced                               0
Age Bracket                                  0
Gender                                       0
Detected City                                0
Detected District                            0
Detected State                               0
Current Status                               0
Notes                                        0
Contracted from which Patient (Suspected)    0
Nationality                                  0
Status Change Date                           0
Source_1                                     0
Source_2                                     0
Source_3                                     0
Backup Notes                                 0
dtype: int64

In [79]:
# no. of empty strings in each column
# ===================================

print(p_df.shape)

for i in p_df.columns:
    print(i, '\t', p_df[p_df[i]==''].shape[0])

(2072, 17)
Patient Number 	 0
State Patient Number 	 0
Date Announced 	 0
Age Bracket 	 0
Gender 	 0
Detected City 	 0
Detected District 	 0
Detected State 	 0
Current Status 	 0
Notes 	 0
Contracted from which Patient (Suspected) 	 0
Nationality 	 0
Status Change Date 	 0
Source_1 	 0
Source_2 	 0
Source_3 	 0
Backup Notes 	 0


In [80]:
# replacing empty strings with np.nan
# ==================================-

print(p_df.shape)

p_df = p_df.replace(r'', np.nan, regex=True)
p_df.isna().sum()

(2072, 17)


Patient Number                                767
State Patient Number                         1783
Date Announced                                876
Age Bracket                                  1536
Gender                                       1494
Detected City                                1402
Detected District                            1034
Detected State                                876
Current Status                                876
Notes                                         884
Contracted from which Patient (Suspected)    1823
Nationality                                  1656
Status Change Date                            879
Source_1                                      899
Source_2                                     1485
Source_3                                     1931
Backup Notes                                 1712
dtype: int64

In [85]:
# droping empty rows (row with just row number but without patient entry
# ======================================================================

print(p_df.shape)
p_df.dropna(subset=['Detected State'], inplace=True)
print(p_df.shape)

(1196, 17)
(1196, 17)


In [87]:
p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes'],
      dtype='object')

In [94]:
# rename dateframe columns 
# ========================

p_df.columns = ['_'.join(col.lower().split()) for col in p_df.columns]
p_df.rename(columns = {'contracted_from_which_patient_(suspected)':'suspected_contacted_patient'})
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,contracted_from_which_patient_(suspected),nationality,status_change_date,source_1,source_2,source_3,backup_notes
315,316,,21/03/2020,,M,,Kasaragod,Kerala,Hospitalized,Travelled from Dubai,,India,21/03/2020,https://twitter.com/ANI/status/124134957636103...,https://www.thehindu.com/news/national/kerala/...,https://www.facebook.com/District-Information-...,Returned from Dubai
293,294,,21/03/2020,67.0,M,Surat,Surat,Gujarat,Deceased,Travelled to Delhi & Jaipur,,India,22/03/2020,https://twitter.com/ANI/status/124129257358444...,https://www.thehindu.com/news/national/coronav...,https://economictimes.indiatimes.com/news/poli...,Details awaited
408,409,,23/03/2020,,,Mumbai,Mumbai,Maharashtra,Hospitalized,Details awaited,,,23/03/2020,https://twitter.com/PTI_News/status/1241931502...,https://www.livemint.com/news/india/coronaviru...,,
26,27,,04/03/2020,,,Agra,Agra,Uttar Pradesh,Hospitalized,Family members of P4,P4,India,04/03/2020,https://www.businesstoday.in/latest/trends/cor...,,,Family members of P4
500,501,TN-P11,23/03/2020,48.0,M,Tiruppur,Tiruppur,Tamil Nadu,Hospitalized,Travelled from London via Coimbatore on 15th M...,,,23/03/2020,https://twitter.com/Vijayabaskarofl/status/124...,,,


# Saving Data

In [89]:
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,contracted_from_which_patient_(suspected),nationality,status_change_date,source_1,source_2,source_3,backup_notes
899,900,,28/03/2020,,,,Indore,Madhya Pradesh,Hospitalized,Details awaited,,,28/03/2020,http://www.ptinews.com/news/11338684_Number-of...,,,
939,940,KL-TN-R8,28/03/2020,,,,Thiruvananthapuram,Kerala,Hospitalized,Details awaited,,,28/03/2020,https://twitter.com/ANI/status/124388112748542...,,,
1060,1061,,29/03/2020,48.0,,,Kannur,Kerala,Hospitalized,Travelled from Dubai Air India AI 938 landed a...,,,29/03/2020,https://twitter.com/ANI/status/124424253213483...,https://t.me/COVID19_Kerala/280,https://www.facebook.com/CollectorKNR/posts/28...,
943,944,,28/03/2020,35.0,M,,Kasaragod,Kerala,Hospitalized,Travelled from Dubai,,,28/03/2020,https://twitter.com/ANI/status/124388112748542...,https://www.facebook.com/permalink.php?story_f...,,
600,601,KA-P51,25/03/2020,34.0,M,Udupi,Udupi,Karnataka,Hospitalized,Travelled from Dubai on 18.03.2020,,India,25/03/2020,https://www.deccanherald.com/state/mangaluru/3...,https://www.thehindu.com/news/national/karnata...,State Health Bulletin,


In [84]:
p_df.to_csv('patients_data.csv', index=False)