  <tr>
        <td width="60%">
            <img src="data_scraping.png">
        </td>
        <td>
            <div align="center">
                <font size=24px>
                    <b> Data Scraping: Corona Cases
                    </b>
                </font>
            </div>
        </td>
    </tr>

In [None]:
# libraries for web scraping
import requests
from bs4 import BeautifulSoup

# library for advanced string manipulation
import string

# library for data manipulation
import pandas as pd

# library for advanced mathematical operations
import numpy as np

# Total Cases

### Get the webpage

In [None]:
# download the Worldometer coronavirus landing page
# timeout: requests waits indefinitely by default, so a stalled connection would hang the notebook
page = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
# stop here on an HTTP error (4xx/5xx) instead of parsing an error page in the cells below
page.raise_for_status()

# parse the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(page.content, 'html.parser')
# soup

### Extract the html code for class : content-inner

In [None]:
# find() yields a single element: the first tag whose class is "content-inner"
all_cases = soup.find(attrs={"class": "content-inner"})
# all_cases

### Extract the html code for class : maincounter-number

In [None]:
# find_all() collects every tag carrying the class "maincounter-number" and returns them as a list
main_total_count = all_cases.find_all(attrs={"class": "maincounter-number"})
main_total_count

In [None]:
# first element of the list above: take its text, drop newlines and trim surrounding whitespace
total_cases = main_total_count[0].text.replace('\n', '').strip()
total_cases

In [None]:
# second element of the list above: take its text, drop newlines and trim surrounding whitespace
total_deaths = main_total_count[1].text.replace('\n', '').strip()
total_deaths

In [None]:
# third element of the list above: take its text, drop newlines and trim surrounding whitespace
total_recovered = main_total_count[2].text.replace('\n', '').strip()
total_recovered

In [None]:
# gather the three extracted headline values into one dictionary
total_cases_dict = dict([
    ('Total Cases', total_cases),
    ('Total Deaths', total_deaths),
    ('Total Recovered', total_recovered),
])

total_cases_dict

# Active Cases

In [None]:
# look up the first tag whose class is "content-inner" again for the active-cases section
all_cases = soup.find(attrs={"class": "content-inner"})
# all_cases

In [None]:
# find_all() collects every tag carrying the class "number-table-main" and returns them as a list
currently_infected_people = all_cases.find_all(attrs={"class": "number-table-main"})
currently_infected_people

In [None]:
# get the text of the first element from the list obtained above and clean it
# (drop newlines, trim surrounding whitespace) the same way the total counters are cleaned
CIP = currently_infected_people[0].get_text().replace('\n', '').strip()
CIP

In [None]:
# find_all() collects every tag carrying the class "number-table" and returns them as a list
currently_infected_people_condition = all_cases.find_all(attrs={"class": "number-table"})
currently_infected_people_condition

In [None]:
# get the text of the first element from the list obtained above and clean it
# (drop newlines, trim surrounding whitespace) the same way the total counters are cleaned
CIPC_Mild = currently_infected_people_condition[0].get_text().replace('\n', '').strip()
CIPC_Mild

In [None]:
# get the text of the second element from the list obtained above and clean it
# (drop newlines, trim surrounding whitespace) the same way the total counters are cleaned
CIPC_Severe = currently_infected_people_condition[1].get_text().replace('\n', '').strip()
CIPC_Severe

In [None]:
# gather the three extracted active-case values into one dictionary
active_cases_dict = dict([
    ('Currently Infected People', CIP),
    ('Currently Infected People Condition (Mild)', CIPC_Mild),
    ('Currently Infected People Condition (Severe)', CIPC_Severe),
])

active_cases_dict

# Scrape Table

In [None]:
# importing the datetime library to obtain the current date and time
from datetime import datetime

# function that returns the date and the time
def get_time_of_parsing(now=None, fmt="%b-%d-%Y %H-%M-%S"):
    """Return a formatted timestamp string.

    Parameters
    ----------
    now : datetime, optional
        The moment to format. When omitted, the current local date and
        time (``datetime.now()``) is used.
    fmt : str, optional
        ``strftime`` pattern. The default renders the abbreviated month
        name, day, 4-digit year, then hour, minute and second, all
        separated by hyphens, e.g. ``'Apr-05-2020 13-07-09'``.

    Returns
    -------
    str
        ``now`` rendered with ``fmt``.
    """
    if now is None:
        # no explicit moment supplied: fall back to the wall clock
        now = datetime.now()
    return now.strftime(fmt)

get_time_of_parsing()

In [None]:
# record the date and time at the moment the page is accessed; it is used in the CSV file names below
date_string = get_time_of_parsing()

# timeout: requests waits indefinitely by default, so a stalled connection would hang the notebook
page = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
# stop here on an HTTP error (4xx/5xx) instead of parsing an error page in the cells below
page.raise_for_status()

# parsing the page using html parser. There are other parsers like lxml
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# find() returns the first tag whose class is "main_table_countries_div"
table_wrapper = soup.find(attrs={"class": 'main_table_countries_div'})
# table_wrapper

In [None]:
# find_all() returns a list of every <table> tag inside the wrapper whose class attribute is
# "table table-bordered table-hover main_table_countries"
table = table_wrapper.find_all(
    'table',
    attrs={"class": "table table-bordered table-hover main_table_countries"},
)
# table

In [None]:
# number of matching <table> tags found inside the wrapper; the next cell keeps the first one (table[0])
len(table)

In [None]:
# keep the first table from the list 'table'; the cells below read its
# header cells ('th') and data cells ('td')
stat_table = table[0]
# stat_table

### Column names before cleaning

In [None]:
# the column names are present inside the 'th' tags; collect their text exactly as scraped
# (no cleaning is applied in this cell)
columns = [header.text for header in stat_table.find_all('th')]

columns

### Column names after cleaning

In [None]:
# the column names are present inside the 'th' tags; strip newlines and
# non-breaking spaces ('\xa0') from each name while collecting them
columns = [
    header.text.replace('\n', '').replace('\xa0', '')
    for header in stat_table.find_all('th')
]

# keep only the part of the first column name that precedes the first comma
# (e.g. 'Country, others' becomes 'Country')
columns[0] = columns[0].partition(',')[0]
columns

### Extract each cell from the table

In [None]:
# print the text of every data cell ('td'), one per line, row by row
for table_row in stat_table.find_all('tr'):
    cell_texts = [data_cell.text for data_cell in table_row.find_all('td')]
    for cell_text in cell_texts:
        print(cell_text)

# Writing the data to a text file 'covid.txt'

#### Write all the data directly to the txt file

In [None]:
# opening the text file in writing mode
# encoding='utf-8': without it open() uses the platform's default encoding, which differs between
# systems and may be unable to represent every character in the scraped text
with open('covid.txt', 'w', encoding='utf-8') as out_file:
    # iterate over every 'tr' row of the table (no rows are skipped in this unfiltered dump)
    for row in stat_table.find_all('tr'):
        # iterate the cell value of each row. The cell value is present inside the 'td' tag
        for cell in row.find_all('td'):
            # write the cell to the text file
            out_file.write(cell.text)
            # leave a tab after each cell (this also leaves a trailing tab at the end of the row)
            out_file.write('\t')
        # go to the newline after writing a row
        out_file.write('\n')

#### Write the relevant data to the txt file

In [None]:
# opening the text file in writing mode
with open('covid.txt', 'w') as r:
    # iterating the list items for 'tr' tag. We are taking the rows from 9 because the rows from 0 to 8 are unnecessary
    for row in stat_table.find_all('tr')[9:]:
        # iterate the cell value of each row. The cell value is present inside the 'td' tag
        for cell in row.find_all('td'):
            # write the cell to the text file
            r.write(cell.text)
            # leave a tab after each cell
            r.write('\t')
        # go to the newline after writing a row
        r.write('\n')

### Read the covid.txt file as a dataframe

In [None]:
# reading the covid.txt file as a dataframe
# header = None: will not consider the first row as the columns
# sep = '\t': parse the cells separated by a tab
# names=columns: assigns the column name
# # index_col=False: will not consider the first column as the row index
covid = pd.read_csv('covid.txt', 
                    encoding='latin-1', 
                    header=None, 
                    sep='\t', 
                    names=columns, 
                    index_col=False
                   )

In [None]:
# show the last 10 rows of the dataframe read from covid.txt
covid.tail(10)

In [None]:
# separate dataset with the total corona cases for each continent:
# rows at positions 214-219, columns at positions 1-7 and 12
# NOTE(review): these positions are hard-coded against the table layout at scrape time - confirm they still match
covid_cases_per_continent = covid.iloc[214:220, [1, 2, 3, 4, 5, 6, 7, 12]]
covid_cases_per_continent

### Writing the dataframe 'covid_cases_per_continent' to a csv file with the timestamp

In [None]:
# save the per-continent frame under a file name that carries the scrape timestamp;
# index=False leaves the row index out of the file
continent_csv_name = f'covid_per_continent {date_string}.csv'
covid_cases_per_continent.to_csv(continent_csv_name, index=False)

In [None]:
# substitute 0 for every null (NaN) value and rebind the result to 'covid'
covid = covid.replace(np.nan, 0)
covid

In [None]:
# keep only the first 214 rows (the per-country totals); the rows after them,
# which hold the per-continent totals, are dropped
covid = covid.iloc[:214]
covid

# Writing the dataframe to a csv file with the timestamp

In [None]:
# save the per-country frame under a file name that carries the scrape timestamp;
# index=False leaves the row index out of the file
country_csv_name = f'covid {date_string}.csv'
covid.to_csv(country_csv_name, index=False)

  <tr>
        <td width="50%">
            <img src="stage 1.png" class="center" width=100%>
        </td>
   
    </tr>