In [1]:
# libraries for web scraping
import requests
from bs4 import BeautifulSoup

# library for advanced string manipulation
import string

# library for data manipulation
import pandas as pd

# library for advanced mathematical operations
import numpy as np

# Total Cases

In [2]:
def TotalCases():
    """Scrape the headline worldwide counters from the Worldometers page.

    Downloads https://www.worldometers.info/coronavirus/ and reads the first
    three elements carrying the ``maincounter-number`` class inside the
    ``content-inner`` container.

    Returns:
        dict: the keys ``'Total Cases'``, ``'Total Deaths'`` and
        ``'Total Recovered'`` mapped to the counter text with newlines and
        surrounding whitespace removed. Values are strings and keep their
        thousands separators (for example ``'3,507,442'``).

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server replies with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze the notebook cell.
    page = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
    # Fail loudly on a 4xx/5xx reply instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    all_cases = soup.find(class_="content-inner")

    # The counters are read by position: cases, deaths, recovered.
    main_total_count = all_cases.find_all(class_="maincounter-number")
    total_cases = main_total_count[0].get_text().replace('\n', '').strip()
    total_deaths = main_total_count[1].get_text().replace('\n', '').strip()
    total_recovered = main_total_count[2].get_text().replace('\n', '').strip()

    total_cases_dict = {'Total Cases': total_cases,
                        'Total Deaths': total_deaths,
                        'Total Recovered': total_recovered}

    return total_cases_dict

# run the scraper once; the returned dictionary is displayed as the cell output
TotalCases()

{'Total Cases': '3,507,442',
 'Total Deaths': '245,241',
 'Total Recovered': '1,130,122'}

# Active Cases

In [3]:
def ActiveCases():
    """Scrape the active-case figures from the Worldometers page.

    Downloads https://www.worldometers.info/coronavirus/ and, inside the
    ``content-inner`` container, reads the first ``number-table-main``
    element and the first two ``number-table`` elements.

    Returns:
        dict: the keys ``'Currently Infected People'``,
        ``'Currently Infected People Condition (Mild)'`` and
        ``'Currently Infected People Condition (Severe)'`` mapped to the
        element text with surrounding whitespace removed. Values are strings
        and keep their thousands separators (for example ``'2,132,079'``).

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server replies with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze the notebook cell.
    page = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
    # Fail loudly on a 4xx/5xx reply instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    all_cases = soup.find(class_="content-inner")

    # The figures are read by position; text is stripped so the values are
    # cleaned the same way TotalCases() cleans its counters.
    currently_infected_people = all_cases.find_all(class_="number-table-main")
    CIP = currently_infected_people[0].get_text().strip()

    currently_infected_people_condition = all_cases.find_all(class_="number-table")
    CIPC_Mild = currently_infected_people_condition[0].get_text().strip()
    CIPC_Severe = currently_infected_people_condition[1].get_text().strip()

    active_cases_dict = {'Currently Infected People': CIP,
                         'Currently Infected People Condition (Mild)': CIPC_Mild,
                         'Currently Infected People Condition (Severe)': CIPC_Severe}

    return active_cases_dict

# run the scraper once; the returned dictionary is displayed as the cell output
ActiveCases()

{'Currently Infected People': '2,132,079',
 'Currently Infected People Condition (Mild)': '2,081,472',
 'Currently Infected People Condition (Severe)': '50,607'}

# Scrape Table

In [4]:
# importing the datetime library to obtain the current date and time
from datetime import datetime

# function that returns the date and the time
def get_time_of_parsing():
    """Return the current local date and time as a formatted string.

    The layout is abbreviated month name, day and four-digit year joined by
    dashes, a space, then hour, minute and second joined by dashes, for
    example ``May-03-2020 14-05-09``.
    """
    timestamp_layout = "%b-%d-%Y %H-%M-%S"
    return datetime.now().strftime(timestamp_layout)

In [5]:
# running the get_time_of_parsing function to get the date and time at the time of accessing the webpage details
# (the timestamp is later embedded in the names of the exported csv files)
date_string = get_time_of_parsing()

# a timeout keeps the cell from hanging forever on a stalled connection;
# requests waits indefinitely when no timeout is given
page = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
# stop here on a 4xx/5xx reply instead of parsing an error page further down
page.raise_for_status()

# parsing the page using html parser. There are other parsers like lxml
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
# find the element wrapping the statistics tables, then collect every <table>
# inside it whose class attribute matches the main countries table
table_wrapper = soup.find(class_='main_table_countries_div')
table = table_wrapper.find_all(
    'table',
    class_="table table-bordered table-hover main_table_countries",
)

In [7]:
# keep the first matching table; the following cells read headers and rows from it
stat_table = table[0]

In [8]:
# the column names are present inside the 'th' tags; collect them while
# removing the newline and non-breaking-space characters from each header text
columns = [
    header.text.replace('\n', '').replace('\xa0', '')
    for header in stat_table.find_all('th')
]

# the first header names the country column followed by a comma and more text;
# keep only the part before the first comma ('Country') as the column name
columns[0] = columns[0].partition(',')[0]
columns

['Country',
 'TotalCases',
 'NewCases',
 'TotalDeaths',
 'NewDeaths',
 'TotalRecovered',
 'ActiveCases',
 'Serious,Critical',
 'TotCases/1M pop',
 'Deaths/1M pop',
 'TotalTests',
 'Tests/1M pop',
 'Continent']

In [9]:
# walk the table row by row ('tr') and print the text of every data cell ('td')
for cell in (td for tr in stat_table.find_all('tr') for td in tr.find_all('td')):
    print(cell.text)


North America

1,261,826
+3,224
73,924
+144
215,489
972,413
17,719




North America

Europe

1,438,691
+14,327
139,713
+303
535,829
763,149
17,472




Europe

Asia

553,336
+7,526
19,451
+112
287,048
246,837
5,389




Asia

South America

200,569
+819
10,259
+16
69,068
121,242
9,862




South America

Oceania

8,390
+22
115
+2
7,165
1,110
31




Australia/Oceania

Africa

43,909
+175
1,764
+1
14,878
27,267
130




Africa



721

15

645
61
4





World
3,507,442
+26,093
245,241
+578
1,130,122
2,132,079
50,607
450
31.5


All
USA
1,162,049
+1,275
67,492 
+48
173,910
920,647
16,475
3,511
204
6,937,747
20,960
North America
Spain
245,567

25,100 

146,233
74,234
2,386
5,252
537
1,528,833
32,699
Europe
Italy
209,328

28,710 

79,914
100,704
1,539
3,462
475
2,108,837
34,879
Europe
UK
182,260

28,131 

N/A
153,785
1,559
2,685
414
1,129,907
16,644
Europe
France
168,396

24,760 

50,562
93,074
3,827
2,580
379
1,100,228
16,856
Europe
Germany
165,016
+49
6,812 

130,600
27,604
2,105
1,970
81
2,5

2,369
8,244
North America
Mozambique
79

 

18
61

3

2,337
75
Africa
Sint Maarten
76

13 

44
19
7
1,773
303
329
7,673
North America
Tajikistan
76

2 


74

8
0.2


Asia
Cayman Islands
74

1 

10
63
3
1,126
15
1,927
29,320
North America
CAR
72

 

10
62

15



Africa
Nepal
69
+10
 

16
53

2

63,102
2,166
Asia
Libya
63

3 

22
38

9
0.4
2,155
314
Africa
French Polynesia
58

 

51
7
1
206

2,634
9,377
Australia/Oceania
South Sudan
46
+1
 


46

4

1,247
111
Africa
Macao
45

 

39
6
1
69



Asia
Syria
44

3 

27
14

3
0.2


Asia
Malawi
39
+1
3 

9
27
1
2
0.2
892
47
Africa
Eritrea
39

 

26
13

11



Africa
Mongolia
39

 

10
29

12

8,250
2,517
Asia
Saint Martin
38

3 

27
8
3
983
78


North America
Angola
35

2 

11
22

1
0.06
481
15
Africa
Zimbabwe
34

4 

5
25

2
0.3
7,642
514
Africa
Antigua and Barbuda
25

3 

15
7
1
255
31
151
1,542
North America
Timor-Leste
24

 

16
8

18

322
244
Asia
Botswana
23

1 

8
14

10
0.4
7,675
3,264
Africa
Grenada
21

 

13
8
4
187

1,406
12,495
North 

# Writing the data to a text file 'covid.txt'

In [10]:
# opening the text file in writing mode
# the encoding is stated explicitly so it matches the encoding='latin-1' used
# when covid.txt is read back with pd.read_csv; without it open() uses the
# platform default (often UTF-8) and non-ASCII country names come back garbled.
# NOTE: a cell containing a character outside Latin-1 raises UnicodeEncodeError.
with open('covid.txt', 'w', encoding='latin-1') as r:
    # iterating the list items for 'tr' tag. We are taking the rows from 9 because the rows from 0 to 8 are unnecessary
    for row in stat_table.find_all('tr')[9:]:
        # iterate the cell value of each row. The cell value is present inside the 'td' tag
        for cell in row.find_all('td'):
            # write the cell to the text file
            r.write(cell.text)
            # leave a tab after each cell (so every line ends with a trailing tab)
            r.write('\t')
        # go to the newline after writing a row
        r.write('\n')

In [11]:
# load covid.txt into a dataframe
#   sep='\t'            : cells are separated by a tab
#   header=None         : the first row of the file is data, not column names
#   names=columns       : assign the scraped column names
#   index_col=False     : do not use the first column as the row index
#   encoding='latin-1'  : decode the file as latin-1
covid = pd.read_csv(
    'covid.txt',
    sep='\t',
    header=None,
    names=columns,
    index_col=False,
    encoding='latin-1',
)

In [12]:
# preview the last 10 rows of the dataframe
covid.tail(10)

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",TotCases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Continent
212,Saint Pierre Miquelon,1,,,,,1,,173.0,,,,North America
213,China,82877,2.0,4633.0,,77713.0,531,34.0,58.0,3.0,,,Asia
214,Total:,1261826,3224.0,73924.0,144.0,215489.0,972413,17719.0,,,,,North America
215,Total:,1438691,14327.0,139713.0,303.0,535829.0,763149,17472.0,,,,,Europe
216,Total:,553336,7526.0,19451.0,112.0,287048.0,246837,5389.0,,,,,Asia
217,Total:,200569,819.0,10259.0,16.0,69068.0,121242,9862.0,,,,,South America
218,Total:,8390,22.0,115.0,2.0,7165.0,1110,31.0,,,,,Australia/Oceania
219,Total:,43909,175.0,1764.0,1.0,14878.0,27267,130.0,,,,,Africa
220,Total:,721,,15.0,,645.0,61,4.0,,,,,
221,Total:,3507442,26093.0,245241.0,578.0,1130122.0,2132079,50607.0,450.0,31.5,,,All


In [13]:
# creating a separate dataset that contains the details of total corona cases for each continent
# the continent summary rows are labelled 'Total:' in the Country column; select
# them by that label instead of by fixed row positions, which only hold for the
# number of countries listed on the day of scraping
country_label = covid['Country'].astype(str).str.strip()
continent_label = covid['Continent'].fillna('').astype(str).str.strip()
# skip the 'Total:' row with no continent and the worldwide 'All' row
is_continent_total = country_label.eq('Total:') & ~continent_label.isin(['', 'All'])
# keep the case/death/recovery count columns (positions 1-7) and Continent (position 12)
covid_cases_per_continent = covid[is_continent_total].iloc[:, [1, 2, 3, 4, 5, 6, 7, 12]]
covid_cases_per_continent

Unnamed: 0,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Continent
214,1261826,3224,73924,144.0,215489,972413,17719,North America
215,1438691,14327,139713,303.0,535829,763149,17472,Europe
216,553336,7526,19451,112.0,287048,246837,5389,Asia
217,200569,819,10259,16.0,69068,121242,9862,South America
218,8390,22,115,2.0,7165,1110,31,Australia/Oceania
219,43909,175,1764,1.0,14878,27267,130,Africa


### Writing the dataframe 'covid_cases_per_continent' to a csv file with the timestamp

In [14]:
# build the timestamped file name first, then export without the index column
continent_csv_name = 'covid_per_continent {}.csv'.format(date_string)
covid_cases_per_continent.to_csv(continent_csv_name, index=False)

In [15]:
# swap every null (NaN) value for 0 and rebind the result to the same name
covid = covid.replace(np.nan, 0)
covid

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",TotCases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Continent
0,USA,1162049,+1275,67492,48.0,173910,920647,16475,3511,204,6937747,20960,North America
1,Spain,245567,0,25100,0.0,146233,74234,2386,5252,537,1528833,32699,Europe
2,Italy,209328,0,28710,0.0,79914,100704,1539,3462,475,2108837,34879,Europe
3,UK,182260,0,28131,0.0,0,153785,1559,2685,414,1129907,16644,Europe
4,France,168396,0,24760,0.0,50562,93074,3827,2580,379,1100228,16856,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,Total:,200569,+819,10259,16.0,69068,121242,9862,0,0,0,0,South America
218,Total:,8390,+22,115,2.0,7165,1110,31,0,0,0,0,Australia/Oceania
219,Total:,43909,+175,1764,1.0,14878,27267,130,0,0,0,0,Africa
220,Total:,721,0,15,0.0,645,61,4,0,0,0,0,0


In [16]:
# ignoring the summary rows that contain the totals for each continent and the world
# this will contain the details of total corona cases for each country
# the summary rows are labelled 'Total:' in the Country column; drop them by
# that label instead of by a fixed row count, which only holds for the number
# of countries listed on the day of scraping
is_summary_row = covid['Country'].astype(str).str.strip().eq('Total:')
covid = covid[~is_summary_row]
covid

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",TotCases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Continent
0,USA,1162049,+1275,67492,48.0,173910,920647,16475,3511,204,6937747,20960,North America
1,Spain,245567,0,25100,0.0,146233,74234,2386,5252,537,1528833,32699,Europe
2,Italy,209328,0,28710,0.0,79914,100704,1539,3462,475,2108837,34879,Europe
3,UK,182260,0,28131,0.0,0,153785,1559,2685,414,1129907,16644,Europe
4,France,168396,0,24760,0.0,50562,93074,3827,2580,379,1100228,16856,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,Western Sahara,6,0,,0.0,5,1,0,10,0,0,0,Africa
210,Anguilla,3,0,,0.0,3,0,0,200,0,0,0,North America
211,Comoros,3,0,,0.0,0,3,0,3,0,0,0,Africa
212,Saint Pierre Miquelon,1,0,,0.0,0,1,0,173,0,0,0,North America


# Writing the dataframe to a csv file with the timestamp

In [17]:
# build the timestamped file name first, then export without the index column
country_csv_name = 'covid {}.csv'.format(date_string)
covid.to_csv(country_csv_name, index=False)