In [1]:
from bs4 import BeautifulSoup as soup
from datetime import date, datetime
from urllib.request import Request, urlopen

import pandas as pd
import numpy as np



In [2]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import seaborn as sns
import gc
import warnings
warnings.filterwarnings("ignore")

from pandas_profiling import ProfileReport

**WEB SCRAPPING**

In [3]:
today = datetime.now()
yesterday_str = "%s %d,%d" %(date.today().strftime("%b"), today.day-1, today. year)
yesterday_str


'Nov 20,2022'

In [4]:
url = "https://www.worldometers.info/coronavirus/#countries"
req = Request(url , headers={'user-Agent' : "Mozilla/5.0"})
webpage = urlopen(req)
page_soup = soup(webpage, "html.parser")


In [5]:
table = page_soup.findAll("table",{"id": "main_table_countries_yesterday"})

containers = table[0].findAll("tr",{"style":""})
title = containers[0]

del containers[0]

all_data = []
clean = True

for country in containers:
    country_data = []
    country_container = country.findAll("td")

    if country_container[1].text == "China":
        continue
    for i in range(1, len(country_container)):
        final_feature = country_container[i].text
        if clean:
            if i != 1 and i != len(country_container)-1:
                final_feature = final_feature.replace(",","")

                if final_feature.find('+') != -1:
                    final_feature = final_feature.replace("+","")
                    final_feature = float(final_feature)
                elif final_feature.find("-") != -1:
                    final_feature = final_feature.replace("-","")
                    final_feature = float(final_feature)*-1
        if final_feature == 'N/A':
            final_feature = 0
        elif final_feature == "" or final_feature == " ":
            final_feature = -1
        country_data.append(final_feature)
    all_data.append(country_data)


In [6]:
all_data

[['World',
  '643105643',
  209901.0,
  '6626087',
  396.0,
  '622217754',
  193233.0,
  '14261802',
  '36119',
  '82504',
  '850.1',
  -1,
  -1,
  -1,
  'All',
  '\n',
  -1,
  -1,
  -1,
  -1,
  -1],
 ['USA',
  '100209101',
  4996.0,
  '1102668                                ',
  2.0,
  '97751014',
  18915.0,
  '1355419',
  '2745',
  '299306',
  '3293',
  '1137483003',
  '3397447',
  '334805269 ',
  'North America',
  '3',
  '304',
  '0',
  '15',
  '0.01',
  '4,048'],
 ['India',
  '44670438',
  421.0,
  '530586                                ',
  12.0,
  '44132433',
  481.0,
  '7419',
  '698',
  '31757',
  '377',
  '903809991',
  '642535',
  '1406631776 ',
  'Asia',
  '31',
  '2651',
  '2',
  '0.3',
  '0.01',
  '5'],
 ['France',
  '37348839',
  14204.0,
  '158163                                ',
  -1,
  '36655222',
  3768.0,
  '535454',
  '869',
  '569476',
  '2412',
  '271490188',
  '4139547',
  '65584518 ',
  'Europe',
  '2',
  '415',
  '0',
  '217',
  -1,
  '8,164'],
 ['Germany',
 

In [7]:
df = pd.DataFrame(all_data)
df.drop([15,16,17,18,19,20] , inplace = True,axis = 1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,World,643105643,209901.0,6626087,396.0,622217754,193233.0,14261802,36119,82504,850.1,-1,-1,-1,All
1,USA,100209101,4996.0,1102668,2.0,97751014,18915.0,1355419,2745,299306,3293.0,1137483003,3397447,334805269,North America
2,India,44670438,421.0,530586,12.0,44132433,481.0,7419,698,31757,377.0,903809991,642535,1406631776,Asia
3,France,37348839,14204.0,158163,-1.0,36655222,3768.0,535454,869,569476,2412.0,271490188,4139547,65584518,Europe
4,Germany,36205405,-1.0,156613,-1.0,35359000,11600.0,689792,1406,431615,1867.0,122332384,1458359,83883596,Europe


In [8]:
column_labels = ["Country","Total Cases","New Cases","Total Deaths","New Deaths","Total Recovered","New Recovered","Active Cases","Serious/Critical","Total Cases/1M","Deaths/1M","Total Tests","Test/1M","Population","Continent"]
df.columns = column_labels

In [9]:
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,Continent
0,World,643105643,209901.0,6626087,396.0,622217754,193233.0,14261802,36119,82504,850.1,-1,-1,-1,All
1,USA,100209101,4996.0,1102668,2.0,97751014,18915.0,1355419,2745,299306,3293.0,1137483003,3397447,334805269,North America
2,India,44670438,421.0,530586,12.0,44132433,481.0,7419,698,31757,377.0,903809991,642535,1406631776,Asia
3,France,37348839,14204.0,158163,-1.0,36655222,3768.0,535454,869,569476,2412.0,271490188,4139547,65584518,Europe
4,Germany,36205405,-1.0,156613,-1.0,35359000,11600.0,689792,1406,431615,1867.0,122332384,1458359,83883596,Europe


In [10]:
for label in df.columns:
  if label != 'Country' and label != "Continent":
      df[label] = pd.to_numeric(df[label])

In [11]:
df["%Inc Cases"] = df["New Cases"]/df["Total Cases"]*100
df["%Inc Deaths"] = df["New Deaths"]/df["Total Deaths"]*100
df["%Inc Recovered"] = df["New Recovered"]/df["Total Recovered"]*100


In [12]:
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,Continent,%Inc Cases,%Inc Deaths,%Inc Recovered
0,World,643105643,209901.0,6626087,396.0,622217754,193233.0,14261802,36119,82504.0,850.1,-1,-1,-1,All,0.032639,0.005976,0.031056
1,USA,100209101,4996.0,1102668,2.0,97751014,18915.0,1355419,2745,299306.0,3293.0,1137483003,3397447,334805269,North America,0.004986,0.000181,0.01935
2,India,44670438,421.0,530586,12.0,44132433,481.0,7419,698,31757.0,377.0,903809991,642535,1406631776,Asia,0.000942,0.002262,0.00109
3,France,37348839,14204.0,158163,-1.0,36655222,3768.0,535454,869,569476.0,2412.0,271490188,4139547,65584518,Europe,0.038031,-0.000632,0.01028
4,Germany,36205405,-1.0,156613,-1.0,35359000,11600.0,689792,1406,431615.0,1867.0,122332384,1458359,83883596,Europe,-3e-06,-0.000639,0.032806


**EXPLORATORY DATA ANALYSIS**



In [13]:
cases = df[["Total Recovered", "Active Cases", "Total Deaths"]].loc[0]
cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type" , "Total"]

cases_df["Percentage"] = np.round(100*cases_df['Total']/np.sum(cases_df["Total"]),2)
cases_df["Virus"] = ["COVID-19" for i in range(len(cases_df))]

fig = px.bar(cases_df, x = "Virus", y = "Percentage", color = "Type", hover_data=["Total"])
fig.show()


In [14]:
cases = df[["New Cases", "New Recovered", "New Deaths"]].loc[0]
cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type" , "Total"]

cases_df["Percentage"] = np.round(100*cases_df['Total']/np.sum(cases_df["Total"]),2)
cases_df["Virus"] = ["COVID-19" for i in range(len(cases_df))]

fig = px.bar(cases_df, x = "Virus", y = "Percentage", color = "Type", hover_data=["Total"])
fig.show()


In [15]:
per = np.round(df[["%Inc Cases", "%Inc Deaths", "%Inc Recovered"]].loc[0], 2)

per_df = pd.DataFrame(per)
per_df.columns = ["Percentage"]

fig = go.Figure()

fig.add_trace(go.Bar(x = per_df.index , y = per_df["Percentage"], marker_color = ["red","blue","green"]))
fig.show()

**CONTINENT**

In [16]:
continent_df = df.groupby("Continent").sum().drop("All")
continent_df = continent_df.reset_index()
continent_df

Unnamed: 0,Continent,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,%Inc Cases,%Inc Deaths,%Inc Recovered
0,Africa,12679748,368.0,257812,-55.0,10877760,411.0,339119,814,2360160.0,18045.0,109188842,10730181,1402212660,0.212612,86.136432,-49.865554
1,Asia,175158848,159268.0,1388159,248.0,167335878,103805.0,6219103,9372,6828977.0,32048.0,1998016161,88765856,3151332728,1.343188,-23.504679,1.201522
2,Australia/Oceania,12864664,6175.0,22029,-13.0,12518029,3062.0,226277,113,3599683.0,8207.0,88293817,20348041,43469030,16.580254,-64.130085,100.91021
3,Europe,236828403,24536.0,1955924,29.0,231035330,54319.0,3557843,7624,18020573.0,121136.0,2791657942,209901879,747543038,0.3015,-10.902569,0.884146
4,North America,118728245,5144.0,1560777,-34.0,114070483,22210.0,2219017,7890,8600341.0,56988.0,1271831670,99144087,598140916,-0.274337,-163.385584,-0.286476
5,South America,64790842,11990.0,1334559,27.0,62991252,8184.0,385195,10129,1951171.0,32587.0,238797449,11276984,437690904,0.161565,-0.3959,0.197124


In [17]:
def continent_visualization(vis_list):
    for label in vis_list:
        c_df = continent_df[["Continent", label]]
        c_df["Percentage"] = np.round(100*c_df[label]/np.sum(c_df[label]),2)
        c_df["Virus"] = ["Covid-19" for i in range(len(c_df))]

        fig = px.bar(c_df, x = "Virus", y = "Percentage", color = "Continent", hover_data=[label])
        fig.update_layout(title = {"text": f"{label}"})
        fig.show()
        gc.collect()

In [18]:
cases_list = ["Total Cases", "Active Cases", "New Cases", "Serious/Critical", "Total Cases/1M"]

deaths_list = ["Total Deaths","New Deaths","Deaths/1M"]

recovered_list = ["Total Recovered","New Recovered","%Inc Recovered"]

In [19]:
continent_visualization(cases_list)

**COUNTRIES**

In [20]:
df = df.drop([len(df)-1])
country_df = df.drop([0])

country_df

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,Continent,%Inc Cases,%Inc Deaths,%Inc Recovered
1,USA,100209101,4996.0,1102668,2.0,97751014,18915.0,1355419,2745,299306.0,3293.0,1137483003,3397447,334805269,North America,0.004986,0.000181,0.019350
2,India,44670438,421.0,530586,12.0,44132433,481.0,7419,698,31757.0,377.0,903809991,642535,1406631776,Asia,0.000942,0.002262,0.001090
3,France,37348839,14204.0,158163,-1.0,36655222,3768.0,535454,869,569476.0,2412.0,271490188,4139547,65584518,Europe,0.038031,-0.000632,0.010280
4,Germany,36205405,-1.0,156613,-1.0,35359000,11600.0,689792,1406,431615.0,1867.0,122332384,1458359,83883596,Europe,-0.000003,-0.000639,0.032806
5,Brazil,35064320,4300.0,689003,9.0,34167667,-1.0,207650,8318,162822.0,3199.0,63776166,296146,215353593,South America,0.012263,0.001306,-0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,Saint Helena,1806,-1.0,-1,-1.0,2,-1.0,1804,-1,295339.0,-1.0,-1,-1,6115,Africa,-0.055371,100.000000,-50.000000
217,Montserrat,1403,-1.0,8,-1.0,1376,-1.0,19,-1,282578.0,1611.0,17762,3577442,4965,North America,-0.071276,-12.500000,-0.072674
218,Macao,796,-1.0,6,-1.0,789,-1.0,1,-1,1193.0,9.0,7850,11760,667490,Asia,-0.125628,-16.666667,-0.126743
219,Wallis and Futuna,761,-1.0,7,-1.0,438,-1.0,316,-1,69295.0,637.0,20508,1867419,10982,Australia/Oceania,-0.131406,-14.285714,-0.228311


In [21]:
LOOK_AT = 5
country = country_df.columns[1:14]

fig = go.Figure()
c = 0
for i in country_df.index:
    if c < LOOK_AT:
        fig.add_trace(go.Bar(name = country_df['Country'][i], x = country, y = country_df.loc[i][1:14])) 
    else:
        break
    c += 1

fig.update_layout(title = {"text":f'top {LOOK_AT} countries affected'}, yaxis_type = "log")
fig.show()