# COVID-19 Web Data Scraper

### Python script for scraping web data and storing it as a table (CSV file format)

In [1]:
import pandas as pd
# suppress warnings
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# US English
LANGUAGE = "en-US,en;q=0.5"



def get_all_tables(soup):
    """Return every ``<table>`` element that *soup* reports via ``find_all``."""
    found_tables = soup.find_all("table")
    return found_tables

def get_table_headers(table):
    """Return the stripped text of each ``<th>`` cell in the table's first row.

    Args:
        table: A table element exposing ``find`` (BeautifulSoup tag).

    Returns:
        A list of header strings in document order. The list is empty when
        the first row has no ``<th>`` cells or the table has no ``<tr>`` at all.
    """
    first_row = table.find("tr")
    if first_row is None:
        # `find` yields None for a table without any row; chaining
        # `.find_all` onto it would raise AttributeError.
        return []
    return [th.text.strip() for th in first_row.find_all("th")]


def get_table_rows(table):
    """Collect the cell texts of every row after the first one.

    The first ``<tr>`` is skipped. Each remaining row contributes one list of
    stripped strings, taken from its ``<td>`` cells; a row without any ``<td>``
    falls back to its ``<th>`` cells (seen e.g. in rows below Wikipedia tables).
    """
    collected = []
    for row in table.find_all("tr")[1:]:
        # an empty <td> result is falsy, so `or` only then looks up <th> cells
        cells = row.find_all("td") or row.find_all("th")
        collected.append([cell.text.strip() for cell in cells])
    return collected

def save_as_csv(table_name, headers, rows):
    """Write *rows* to ``<table_name>.csv`` using *headers* as column names.

    The DataFrame index is written too, as the first (unnamed) column.
    """
    frame = pd.DataFrame(rows, columns=headers)
    destination = f"{table_name}.csv"
    frame.to_csv(destination)
    
    
def main(URL):
    """Download *URL*, report how many HTML tables it has, and save the first.

    The page is requested with the module-level ``USER_AGENT`` and ``LANGUAGE``
    values as headers and parsed with BeautifulSoup's ``html.parser``. Only the
    first table is written, to ``table-1.csv``; any further tables are ignored.
    Nothing is saved when the page contains no table.

    Args:
        URL: Address of the web page to scrape.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    request_headers = {"User-Agent": USER_AGENT, "Accept-Language": LANGUAGE}
    # a timeout keeps the script from hanging forever on an unresponsive server
    page = requests.get(URL, headers=request_headers, timeout=30)
    # fail loudly rather than parse an error page as if it were the data
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    # extract all the tables from the web page
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    if not tables:
        return

    # only the first table is of interest
    first_table = tables[0]
    table_name = "table-1"
    headers = get_table_headers(first_table)
    rows = get_table_rows(first_table)
    print(f"[+] Saving {table_name}")
    save_as_csv(table_name, headers, rows)
        
if __name__ == "__main__":
    # NOTE(review): `sys` is not used in this block; the import is kept because
    # it binds at module scope and code outside this view may rely on it.
    import sys
    # The target page is fixed; no command-line argument is read, so there is
    # nothing here that could raise IndexError.
    URL = 'https://www.worldometers.info/coronavirus/'
    main(URL)

[+] Found a total of 3 tables.
[+] Saving table-1


In [2]:
# Load the CSV written by the scraping cell and keep only the rows at
# positions 8..222. In the output shown below, position 8 is the first ranked
# country (USA); presumably the rows outside this range are aggregate rows —
# TODO confirm against the current page layout.
df = pd.read_csv('table-1.csv').iloc[8:223]
# preview the first five remaining rows
df.head()

Unnamed: 0.1,Unnamed: 0,#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl
8,8,1.0,USA,5415666,,170415,,2843204,,2402047,17239,16350,514,68524602,206876,331235184,North America,61,1944,5
9,9,2.0,Brazil,3229621,,105564,,2356640,,767417,8318,15181,496,13464336,63290,212739546,South America,66,2015,16
10,10,3.0,India,2459613,,48144,,1750636,,660833,8944,1780,35,27694416,20045,1381604279,Asia,562,28697,50
11,11,4.0,Russia,907758,,15384,,716396,,175978,2300,6220,105,31598302,216513,145942116,Europe,161,9487,5
12,12,5.0,South Africa,572865,,11270,,437617,,123978,539,9645,190,3315497,55820,59395867,Africa,104,5270,18
