In [126]:
import datetime
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [18]:
def clean(string):
    """
    Clean the script tag contents for easier retrieval of data.

    Every occurrence of the following characters is deleted: newline, space,
    backslash, single quote, double quote, the brackets { } [ ] ( ), colon,
    forward slash and asterisk. All other characters (letters, digits,
    commas, ...) are kept in their original order.

    parameters:
        string: str.
        The contents of the script tag. Any other object is converted with
        str() before cleaning.

    returns:
        string: str.
        The cleaned contents of the script tag.
    """

    # One raw-string character class replaces the former chain of
    # substitutions: brackets were first turned into spaces and the spaces
    # were then stripped, so the net effect was always plain deletion.
    # The raw string also avoids the invalid "\[" / "\]" escape sequences
    # that non-raw literals trigger warnings for on recent Python versions.
    return re.sub(r"""[\n \\'{}\[\]():"/*]""", '', str(string))

In [19]:
def retrieve_dates(string):
    """
    Retrieve dates from the cleaned script tag contents.

    The dates are the comma-separated text found between the first
    'categories' marker and the first ',yAxis' marker that follows it.

    parameters:
        string: str.
        The cleaned contents of the script tag.

    returns:
        dates: list.
        A list of the dates (as strings).

    raises:
        ValueError: if either marker cannot be found.
    """

    start_string = 'categories'
    end_string = ',yAxis'

    marker_index = string.find(start_string)
    if marker_index == -1:
        raise ValueError("start marker %r not found" % start_string)
    start_index = marker_index + len(start_string)

    # Search for the end marker only after the start marker, so an earlier
    # occurrence of it cannot produce an empty or inverted slice.
    end_index = string.find(end_string, start_index)
    if end_index == -1:
        raise ValueError("end marker %r not found" % end_string)

    dates = string[start_index:end_index].strip().split(",")

    return dates

In [20]:
def retrieve_daily_stats(string):
    """
    Retrieves daily statistics from the cleaned script tag contents.

    The values are the comma-separated text found between the first 'data'
    marker and the first ',name' marker that follows it.

    parameters:
        string: str.
        Cleaned contents of the script tag.

    returns:
        values: list.
        A list of daily statistics (as strings) in the order they appear.

    raises:
        ValueError: if either marker cannot be found.
    """

    start_string = 'data'
    end_string = ',name'

    marker_index = string.find(start_string)
    if marker_index == -1:
        raise ValueError("start marker %r not found" % start_string)
    start_index = marker_index + len(start_string)

    # Search for the end marker only after the start marker, so an earlier
    # occurrence of it cannot produce an empty or inverted slice.
    end_index = string.find(end_string, start_index)
    if end_index == -1:
        raise ValueError("end marker %r not found" % end_string)

    values = string[start_index:end_index].strip().split(",")

    return values

In [21]:
def retrieve_overall_stats(string):
    """
    Retrieves overall statistics from the cleaned script tag contents.

    The values are the comma-separated text found between the first 'data'
    marker and the first ',resp' marker that follows it.

    parameters:
        string: str.
        Cleaned contents of the script tag.

    returns:
        values: list.
        A list of overall statistics (as strings) in the order they appear.

    raises:
        ValueError: if either marker cannot be found.
    """

    start_string = 'data'
    end_string = ',resp'

    marker_index = string.find(start_string)
    if marker_index == -1:
        raise ValueError("start marker %r not found" % start_string)
    start_index = marker_index + len(start_string)

    # Search for the end marker only after the start marker, so an earlier
    # occurrence of it cannot produce an empty or inverted slice.
    end_index = string.find(end_string, start_index)
    if end_index == -1:
        raise ValueError("end marker %r not found" % end_string)

    values = string[start_index:end_index].strip().split(",")

    return values

In [22]:



# Exploratory request for a single country page. `country` is assigned here
# so the cell also runs on a fresh kernel instead of depending on a variable
# left over from an earlier session.
country = "us"
url = "https://www.worldometers.info/coronavirus/country/"+country+"/"
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')

# All <div class="col-md-12"> elements of the page.
result = soup.find_all('div',class_='col-md-12')

In [122]:
# Country identifiers, used as the last path segment of each
# worldometers country URL and in the name of the CSV written per country.
countries = [
    'us', 'brazil', 'russia', 'spain', 'italy', 'france',
    'germany', 'turkey', 'india', 'iran', 'peru', 'canada',
    'chile', 'china', 'mexico', 'saudi-arabia', 'pakistan',
    'belgium', 'qatar', 'bangladesh', 'belarus', 'ecuador',
    'sweden',
]

# Maps each statistic name to the position of its element in the list
# returned by page_contents (that is how script_tag_contents uses it).
data_indexes = {
    stat_name: position
    for position, stat_name in enumerate((
        'total_cases',
        'daily_cases',
        'active_cases',
        'total_deaths',
        'daily_deaths',
    ))
}

In [24]:
def page_contents(url, timeout=30):
    """
    Retrieves the web page at the specified url and returns every div tag
    with the class col-md-12.

    parameters:
        url: str.
        The url of the web page to be scraped.

        timeout: float.
        Seconds to wait for the server before giving up (default 30).

    returns:
        result: list-like of parsed div tags.
        The result of BeautifulSoup's find_all, i.e. the matching tags in
        document order (empty when nothing matches).

    raises:
        requests.exceptions.RequestException: on connection problems,
        timeouts, or an HTTP error status.
    """

    # Without a timeout a stalled connection would block the notebook forever.
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    result = soup.find_all('div', class_= 'col-md-12')

    return result

In [102]:
def script_tag_contents(page_content, stat):
    """
    Pick the script tag contents for one statistic out of the page contents.

    parameters:
        page_content: sequence of parsed tags.
        The elements returned by page_contents.

        stat: str.
        Name of the statistic; must be a key of data_indexes.

    returns:
        script_content:
        The first child of the first script tag found inside the element
        selected for the statistic.
    """

    # data_indexes gives the position of the element holding this statistic.
    position = data_indexes[stat]
    container = page_content[position]

    script_tag = container.find('script')

    return script_tag.contents[0]

In [120]:
def build_dataframe(values,stat_name,dataframe=None,date=None):
    """
    Build a DataFrame from the dates and the values scraped, or add a new
    statistic column to an existing one.

    parameters:
        values: list.
        List of values (data) for the statistic.

        stat_name: str.
        Name of the statistic; used as the column name for the values.

        dataframe: DataFrame, optional.
        An existing DataFrame to extend. When given, the values are added
        to it in place as column stat_name and date is ignored.

        date: list, optional.
        List of dates for the statistics. Only used when dataframe is None,
        in which case a new DataFrame with columns 'date' and stat_name is
        created.

    returns:
        dataframe: DataFrame.
        The new DataFrame, or the same DataFrame object that was passed in,
        now holding the stat_name column.

    raises:
        ValueError: if neither dataframe nor date is provided.
    """

    if dataframe is None:

        if date is None:
            # Previously this case fell through to item assignment on None
            # and failed with an unhelpful TypeError.
            raise ValueError(
                "either an existing dataframe or a list of dates is required"
            )

        return pd.DataFrame({'date':date, stat_name:values})

    # Extends the caller's DataFrame in place.
    dataframe[stat_name] = values

    return dataframe

In [145]:
def clean_date(dataframe, date_col, year=2020):
    """
    Convert the date column of the dataframe from labels such as 'Feb15'
    (three-letter month abbreviation followed by the day) to date objects.

    parameters:
        dataframe: DataFrame.
        DataFrame whose dates are to be cleaned. The column is replaced
        in place.

        date_col: str.
        Name of the date column in the DataFrame.

        year: int.
        Year to attach to every label, since the labels carry none
        (default 2020, the value that used to be hard-coded).

    returns:
        dataframe: DataFrame.
        The same DataFrame object, with date_col holding datetime.date values.
    """

    year_suffix = " " + str(year)

    def to_date(label):
        # 'Feb15' -> 'Feb 15 2020' -> datetime.date(2020, 2, 15)
        spaced = label[:3] + " " + label[3:] + year_suffix
        return datetime.datetime.strptime(spaced, '%b %d %Y').date()

    dataframe[date_col] = dataframe[date_col].apply(to_date)

    return dataframe

In [319]:
def scrape_data():
    """
    Scrape worldometers.info for date, total cases, daily cases, active
    cases, total deaths and daily deaths per country (the statistics listed
    in data_indexes), for every country in countries.

    Writes one csv file per country to ./Data/covid19_<country>_stats.csv,
    creating the ./Data folder if it does not exist yet.

    parameters: None

    returns: bool.
    Always True once every country has been written.
    """

    # to_csv does not create missing directories.
    os.makedirs('./Data', exist_ok=True)

    for country in countries:

        url = "https://www.worldometers.info/coronavirus/country/"+country+"/"
        content = page_contents(url)

        # Reset per country; None means "no statistic collected yet".
        dataframe = None

        for stat in data_indexes:

            script_contents = clean(script_tag_contents(content, stat))

            if 'daily' in stat:
                data = retrieve_daily_stats(script_contents)
            else:
                data = retrieve_overall_stats(script_contents)

            if dataframe is None:
                # The dates are taken from the first statistic's script only;
                # later statistics are appended as extra columns.
                date = retrieve_dates(script_contents)
                dataframe = build_dataframe(data, stat, date= date)
            else:
                dataframe = build_dataframe(data, stat, dataframe= dataframe)

        dataframe = clean_date(dataframe, date_col='date')
        dataframe.to_csv('./Data/covid19_'+country+'_stats.csv',index=False)
        print("Scraped successfully: ",country)

    return True

In [320]:
# Scrape every country in `countries`; one CSV per country is written under ./Data.
scrape_data()

Scraped successfully:  us
Scraped successfully:  brazil
Scraped successfully:  russia
Scraped successfully:  spain
Scraped successfully:  italy
Scraped successfully:  france
Scraped successfully:  germany
Scraped successfully:  turkey
Scraped successfully:  india
Scraped successfully:  iran
Scraped successfully:  peru
Scraped successfully:  canada
Scraped successfully:  chile
Scraped successfully:  china
Scraped successfully:  mexico
Scraped successfully:  saudi-arabia
Scraped successfully:  pakistan
Scraped successfully:  belgium
Scraped successfully:  qatar
Scraped successfully:  bangladesh
Scraped successfully:  belarus
Scraped successfully:  ecuador
Scraped successfully:  sweden


True

In [321]:
# Read back the CSV that scrape_data() writes for 'india' and display it.
df = pd.read_csv('./Data/covid19_india_stats.csv')
df

Unnamed: 0,date,total_cases,daily_cases,active_cases,total_deaths,daily_deaths
0,2020-02-15,3,,0,0,
1,2020-02-16,3,0.0,0,0,
2,2020-02-17,3,0.0,0,0,
3,2020-02-18,3,0.0,0,0,
4,2020-02-19,3,0.0,0,0,
...,...,...,...,...,...,...
100,2020-05-25,144950,6414.0,80072,4172,148.0
101,2020-05-26,150793,5843.0,82172,4344,172.0
102,2020-05-27,158086,7293.0,85803,4534,190.0
103,2020-05-28,165386,7300.0,89755,4711,177.0


In [236]:
def table_contents(url, timeout=30):
    """
    Retrieve the table with id 'thetable' from the page at the url.

    parameters:
        url: str.
        URL of the webpage. The table contents are scraped from this.

        timeout: float.
        Seconds to wait for the server before giving up (default 30).

    returns:
        table_data: parsed table tag, or None.
        The first table element whose id is 'thetable', as returned by
        BeautifulSoup's find; None when the page has no such table.

    raises:
        requests.exceptions.RequestException: on connection problems,
        timeouts, or an HTTP error status.
    """

    # Without a timeout a stalled connection would block the notebook forever.
    page_content = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page_content.raise_for_status()

    soup = BeautifulSoup(page_content.content, 'html.parser')
    table_data = soup.find('table', id='thetable')

    return table_data

In [240]:
def country_names(table_data):
    """
    Collect the country names from the table scraped from the Wikipedia url.

    A name is taken from every th element with scope='row' that contains an
    anchor; header cells without an anchor are skipped.

    parameters:
        table_data: parsed table tag.
        The table scraped from the webpage.

    returns:
        countries: list.
        The first child of each anchor found, in table order.
    """

    header_cells = table_data.find_all('th', scope= 'row')

    # Look up the anchor once per header cell, then drop the cells without one.
    anchors = (cell.find('a') for cell in header_cells)

    return [anchor.contents[0] for anchor in anchors if anchor is not None]

In [242]:
def chunks(lst, size):
    """
    Lazily split the input into consecutive slices of the given size.

    parameters:
        lst: list.
        List (or any sliceable sequence) to be segmented.

        size: int.
        Size of the chunks.

    yields:
        Slices lst[i:i+size] for i = 0, size, 2*size, ...; the final slice
        is shorter when len(lst) is not a multiple of size.
    """

    yield from (
        lst[start:start + size]
        for start in range(0, len(lst), size)
    )
        

In [270]:
def clean_stats(stats):
    """
    Clean the statistics scraped from the webpage: remove every comma and
    any trailing whitespace (such as a newline) from each entry, then drop
    the last entry.

    parameters:
        stats: list.
        A list of statistics as strings.

    returns:
        cleaned_stats: list.
        The cleaned statistics without the final entry. An empty input
        gives an empty list.
    """

    cleaned_stats = [stat.replace(',', '').rstrip() for stat in stats]

    # The final entry is always discarded (presumably a trailing non-data
    # cell of the table -- confirm against the page). Slicing, unlike pop(),
    # also copes with an empty list.
    return cleaned_stats[:-1]

In [279]:
def table_statistics(table_data):
    """
    Collect the per-country statistics (total cases, total deaths and total
    recoveries) from the td cells of the scraped table.

    parameters:
        table_data: parsed table tag.
        The table scraped from the webpage.

    returns:
        stats: list.
        A list of lists, each holding up to three cleaned values in cell
        order.
    """

    collected = []

    for cell in table_data.find_all('td'):

        # Cells containing a <sup> element are left out entirely.
        if cell.find('sup'):
            continue

        # Cells containing a <span> are recorded as the placeholder 'null';
        # every other cell contributes its first child.
        if cell.find('span'):
            collected.append('null')
        else:
            collected.append(cell.contents[0])

    # Clean the raw values, then group them three at a time.
    return list(chunks(clean_stats(collected), 3))

In [306]:
def scrape_overall_data():
    """
    Scrape overall statistics country wise from the Wikipedia page on the
    COVID-19 pandemic into a DataFrame with the columns country, total_cases,
    total_deaths and total_recoveries.

    Writes the DataFrame to ./Data/covid19_overall_stat.csv, creating the
    ./Data folder if it does not exist yet.

    returns: bool.
    Always True once the file has been written.
    """

    url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory'
    table_data = table_contents(url)

    names = country_names(table_data)
    statistics = table_statistics(table_data)

    # Pair each country name with its row of statistics. zip stops at the
    # shorter of the two lists; a repeated name keeps its last row.
    statistics_dict = dict(zip(names, statistics))

    dataframe = (
        pd.DataFrame.from_dict(statistics_dict, orient= 'index', columns= [
            'total_cases',
            'total_deaths',
            'total_recoveries'
        ])
        .reset_index()
        .rename(columns= {'index':'country'})
    )

    # to_csv does not create missing directories.
    os.makedirs('./Data', exist_ok=True)
    dataframe.to_csv('./Data/covid19_overall_stat.csv', index= False)

    print("Successfully scraped table")

    return True

In [322]:
# Scrape the overall per-country table and write ./Data/covid19_overall_stat.csv.
scrape_overall_data()

Successfully scraped table


True

In [323]:
# Read back the CSV written by scrape_overall_data() and display it.
pd.read_csv('./Data/covid19_overall_stat.csv')

Unnamed: 0,country,total_cases,total_deaths,total_recoveries
0,United States,1783132,104166,384821.0
1,Brazil,468338,27944,189476.0
2,Russia,387623,4374,159257.0
3,United Kingdom,271222,38161,
4,Spain,238564,27121,150376.0
...,...,...,...,...
223,Saba,3,0,3.0
224,Bonaire,2,0,2.0
225,Lesotho,2,0,1.0
226,Sint Eustatius,2,0,2.0


In [309]:
def updated_stats(url, timeout=30):
    """
    Gathers the latest update on the number of new cases and new deaths.

    The numbers are read from the first two strong elements inside the
    first li element with class 'news_li', e.g. '25,069 new cases'.

    parameters:
        url: str.
        URL from where the updates are scraped.

        timeout: float.
        Seconds to wait for the server before giving up (default 30).

    returns:
        result: tuple.
        A tuple (new cases, new deaths) of ints.

    raises:
        requests.exceptions.RequestException: on connection problems,
        timeouts, or an HTTP error status.
        ValueError: if the expected elements or numbers are not on the page.
    """

    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    updated_list = soup.find('li', class_= 'news_li')
    if updated_list is None:
        raise ValueError("no <li class='news_li'> element found at " + url)

    updates = updated_list.find_all('strong')
    if len(updates) < 2:
        raise ValueError("expected two <strong> elements in the update entry")

    daily_cases = _first_count(updates[0].contents[0])
    daily_deaths = _first_count(updates[1].contents[0])

    result = (daily_cases, daily_deaths)

    return result


def _first_count(text):
    """Return the first number in text (thousands commas allowed) as an int."""

    # The old pattern '[, new cases]' was a character class that deleted
    # individual letters; matching the number itself states the intent and
    # does not depend on the wording that follows it.
    match = re.search(r'\d[\d,]*', str(text))
    if match is None:
        raise ValueError("no number found in %r" % (text,))

    return int(match.group().replace(',', ''))

In [332]:
# Ad-hoc look at the markup updated_stats reads: fetch the US page and show
# the raw contents of the first <strong> inside the first 'news_li' item.
url = "https://www.worldometers.info/coronavirus/country/us/"
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
updates = soup.find('li',class_='news_li').find_all('strong')
updates[0].contents

['25,069 new cases']

In [333]:
# Parse the same page into a tuple of ints using the `url` set in the previous cell.
updated_stats(url)

(25069, 1212)