In [71]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [72]:
website='https://www.worldometers.info/coronavirus/#countries' # url for the site 
website_url=requests.get(website).text
soup = BeautifulSoup(website_url,'html.parser')

In [73]:
my_table = soup.find('tbody')

In [74]:
table_data = []
for row in my_table.findAll('tr'):
    row_data = []
    for cell in row.findAll('td'):
        row_data.append(cell.text)
    if(len(row_data) > 0):
        data_item = {"Country": row_data[1],
                     "TotalCases": row_data[2],
                     "NewCases": row_data[3],
                     "TotalDeaths": row_data[4],
                     "NewDeaths": row_data[5],
                     "TotalRecovered": row_data[6],
                     "ActiveCases": row_data[8],
                     "SeriousCritical": row_data[9],
                     "TotCases/1Mpop": row_data[10],
                     "Deaths/1Mpop": row_data[11],
                     "TotalTests": row_data[12],
                     "Tests/1Mpop": row_data[13],
                     "Population": row_data[14],
        }
        table_data.append(data_item)

In [75]:
df = pd.DataFrame(table_data)

In [76]:
# df.head(10)
df = df[8:] # Exclude Continents Rows and NAN row

In [77]:
# Example
df.iloc[9]

Country                Germany
TotalCases           2,938,630
NewCases               +11,058
TotalDeaths            78,292 
NewDeaths                 +118
TotalRecovered       2,631,400
ActiveCases            228,938
SeriousCritical          4,439
TotCases/1Mpop          34,988
Deaths/1Mpop               932
TotalTests          51,559,277
Tests/1Mpop            613,878
Population         83,989,395 
Name: 17, dtype: object

## Clean Data

In [78]:
columns = df.columns.tolist()

def remove_punctuation(x):
    y = x.replace(',','').replace('+','')
    return y

def a_function(x):
    return x.replace('\n','')

dfr = pd.DataFrame()

for c in columns:
    dfr[c] = df[c].apply(remove_punctuation).replace(r'^\s*$', np.nan, regex=True)

dfr['Country'] = dfr['Country'].apply(a_function)
dfr.head()

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,SeriousCritical,TotCases/1Mpop,Deaths/1Mpop,TotalTests,Tests/1Mpop,Population
8,USA,31647543,10300.0,572988,139.0,24207043,6867512,9057,95184,1723,413593155,1243928,332489703
9,Brazil,13197031,,341097,,11664158,1191776,8318,61750,1596,28600000,133822,213716460
10,India,12978132,52071.0,167105,213.0,11867486,943541,8944,9334,120,252677379,181732,1390382831
11,France,4841308,,97722,,301299,4442287,5626,74044,1495,67791157,1036811,65384318
12,Russia,4614834,8672.0,101845,365.0,4239038,273951,2300,31612,698,122600000,839827,145982499


In [80]:
columns[1:]

['TotalCases',
 'NewCases',
 'TotalDeaths',
 'NewDeaths',
 'TotalRecovered',
 'ActiveCases',
 'SeriousCritical',
 'TotCases/1Mpop',
 'Deaths/1Mpop',
 'TotalTests',
 'Tests/1Mpop',
 'Population']

In [81]:
for c in columns[1:]:
    dfr[c] = dfr[c].fillna(0).replace('N/A','')
    dfr[c] = dfr[c].fillna({'':0}).replace('',0)
    dfr[c] = dfr[c].astype(float).astype(int)
    
dfr = dfr.reset_index(drop=True)
dfr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 8 to 228
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          221 non-null    object
 1   TotalCases       221 non-null    int32 
 2   NewCases         221 non-null    int32 
 3   TotalDeaths      221 non-null    int32 
 4   NewDeaths        221 non-null    int32 
 5   TotalRecovered   221 non-null    int32 
 6   ActiveCases      221 non-null    int32 
 7   SeriousCritical  221 non-null    int32 
 8   TotCases/1Mpop   221 non-null    int32 
 9   Deaths/1Mpop     221 non-null    int32 
 10  TotalTests       221 non-null    int32 
 11  Tests/1Mpop      221 non-null    int32 
 12  Population       221 non-null    int32 
dtypes: int32(12), object(1)
memory usage: 12.2+ KB


In [89]:
import datetime
dt = datetime.datetime.today()

dfr.to_csv('covid19-word-o-meter-{:02d}-{:02d}-{:4d}'.format(dt.month,dt.day,dt.year) +'.csv', index=True)