<a href="https://colab.research.google.com/github/saikumarreddykorsapati-source/Covid-19_Recent-Trends_Data-Analysis_Python/blob/main/Recent_COVID_19_Trends_Analysis_(Python).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


#                 **Covid-19_Recent-Trends_Data-Analysis_Python**





---








# * Steps Performed :

Step-1 : Data Collection |
Step-2 : Data Pre Processing |
Step-3 : EDA |
Step-4 : Conclusions

# Step-1 : 
**Performing Data Collection by using Web Scraping**

`BeautifulSoup` – Python library for getting data out of HTML, XML, and other markup languages. <br>

`Command to install beautifulsoup4!` : ! pip install beautifulsoup4

In [2]:
from bs4 import BeautifulSoup as soup
from datetime import date, datetime
from urllib.request import Request, urlopen

import pandas as pd # Pandas – Python library for data manipulation and analysis.
import numpy as np  # NumPy is a Python library used for working with arrays. It also has functions for working in domain of linear algebra, fourier transform, and matrices.

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import seaborn as sns
import gc
import warnings
warnings.filterwarnings("ignore")

# ! pip install pandas_profiling

from pandas_profiling import ProfileReport

In [3]:
today = datetime.now()

yesterday_str = "%s %d,%d" %(date.today().strftime("%b"), today.day-1, today.year)

`Web Scraping :`

In [4]:
url = "https://www.worldometers.info/coronavirus/#countries"
req = Request(url , headers={'User-Agent':"Mozilla/5.0"})

webpage = urlopen(req)
page_soup = soup(webpage, "html.parser")

In [5]:
table = page_soup.findAll("table",{"id":"main_table_countries_yesterday"})

containers = table[0].findAll("tr",{"style":""})
title = containers[0]

del containers[0]

all_data = []
clean = True

for country in containers:
    country_data = []
    country_container = country.findAll("td")
    
    if country_container[1].text == "China":
        continue
    for i in range(1, len(country_container)):
        final_feature = country_container[i].text
        if clean :
            if i !=1 and i != len(country_container)-1:
                final_feature = final_feature.replace(",","")
                
                if final_feature.find('+') != -1:
                    final_feature = final_feature.replace("+","")
                    final_feature = float(final_feature)
                    
                elif final_feature.find('-') != -1:
                    final_feature = final_feature.replace("-","")
                    final_feature = float(final_feature)
            
            if final_feature == 'N/A':
                final_feature = 0
            elif final_feature == "" or final_feature == " ":
                final_feature = -1
                
            country_data.append(final_feature)
            
    all_data.append(country_data)

# Step-2 : 
**Performing Data Pre Processing by using Pandas**

In [23]:
df = pd.DataFrame(all_data)
df.drop([15,16,17,18,19,20], inplace = True, axis = 1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,World,314023118,2846498.0,5521223,8265.0,261713993,975709.0,46787902,95534,40286,708.3,-1,-1,-1,All
1,USA,63390876,672872.0,863896,2173.0,42641852,136478.0,19885128,24368,189812,2587.0,844714275,2529338,333966543,North America
2,India,36070510,194720.0,484655,442.0,34630536,60405.0,955319,8944,25751,346.0,693155280,494857,1400717076,Asia
3,Brazil,22630142,71447.0,620281,139.0,21626836,-1.0,383025,8318,105322,2887.0,63776166,296818,214866497,South America
4,UK,14732551,120806.0,150609,379.0,10945874,87362.0,3636068,820,215292,2201.0,425464553,6217464,68430567,Europe


In [7]:
column_labels = ["Country", "Total Cases", "New Cases", "Total Deaths", "New Deaths", "Total Recovered", "New Recovered", 
                 "Active Cases","Serious/Critical", "Total Cases/1M", "Deaths/1M", "Total Tests", "Test/1M", "Population", 
                 "Continent"]
df.columns = column_labels

In [8]:
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,Continent
0,World,314023118,2846498.0,5521223,8265.0,261713993,975709.0,46787902,95534,40286,708.3,-1,-1,-1,All
1,USA,63390876,672872.0,863896,2173.0,42641852,136478.0,19885128,24368,189812,2587.0,844714275,2529338,333966543,North America
2,India,36070510,194720.0,484655,442.0,34630536,60405.0,955319,8944,25751,346.0,693155280,494857,1400717076,Asia
3,Brazil,22630142,71447.0,620281,139.0,21626836,-1.0,383025,8318,105322,2887.0,63776166,296818,214866497,South America
4,UK,14732551,120806.0,150609,379.0,10945874,87362.0,3636068,820,215292,2201.0,425464553,6217464,68430567,Europe


In [9]:
for label in df.columns:
    if label != 'Country' and label != 'Continent':
        df[label] = pd.to_numeric(df[label])

In [10]:
df["%Increase Cases"] = df["New Cases"]/df["Total Cases"]*100
df["%Increase Deaths"] = df["New Deaths"]/df["Total Deaths"]*100
df["%Increase Recovered"] = df["New Recovered"]/df["Total Recovered"]*100

In [11]:
df.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,Continent,%Increase Cases,%Increase Deaths,%Increase Recovered
0,World,314023118,2846498.0,5521223,8265.0,261713993,975709.0,46787902,95534,40286.0,708.3,-1,-1,-1,All,0.906461,0.149695,0.372815
1,USA,63390876,672872.0,863896,2173.0,42641852,136478.0,19885128,24368,189812.0,2587.0,844714275,2529338,333966543,North America,1.061465,0.251535,0.320056
2,India,36070510,194720.0,484655,442.0,34630536,60405.0,955319,8944,25751.0,346.0,693155280,494857,1400717076,Asia,0.539832,0.091199,0.174427
3,Brazil,22630142,71447.0,620281,139.0,21626836,-1.0,383025,8318,105322.0,2887.0,63776166,296818,214866497,South America,0.315716,0.022409,-5e-06
4,UK,14732551,120806.0,150609,379.0,10945874,87362.0,3636068,820,215292.0,2201.0,425464553,6217464,68430567,Europe,0.819994,0.251645,0.798127


# Step-3 :
**Performing EDA - Exploratory Data Analysis**

In [12]:
cases = df[["Total Recovered", "Active Cases", "Total Deaths"]].loc[0]

cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type", "Total"]

cases_df['Percentage'] = np.round(100*cases_df['Total']/np.sum(cases_df["Total"]),2)
cases_df["virus"] = ["COVID-19" for i in range(len(cases_df))]

fig = px.bar(cases_df, x = "virus", y = "Percentage", color = "Type", hover_data=["Total"])
fig.show()

In [13]:
cases = df[["New Cases", "New Recovered", "New Deaths"]].loc[0]

cases_df = pd.DataFrame(cases).reset_index()
cases_df.columns = ["Type", "Total"]

cases_df['Percentage'] = np.round(100*cases_df['Total']/np.sum(cases_df["Total"]),2)
cases_df["virus"] = ["COVID-19" for i in range(len(cases_df))]

fig = px.bar(cases_df, x = "virus", y = "Percentage", color = "Type", hover_data=["Total"])
fig.show()

In [14]:
per = np.round(df[["%Increase Cases","%Increase Deaths","%Increase Recovered"]].loc[0], 2)

per_df = pd.DataFrame(per)
per_df.columns = ["Percentage"]

fig = go.Figure()

fig.add_trace(go.Bar(x = per_df.index, y = per_df['Percentage'], marker_color = ["yellow","blue","red"]))
fig.show()

#  Continent Level Visualization

In [15]:
continent_df = df.groupby("Continent").sum().drop("All")
continent_df = continent_df.reset_index()
continent_df

Unnamed: 0,Continent,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,%Increase Cases,%Increase Deaths,%Increase Recovered
0,Africa,10297066,55765.0,232666,311.0,9069295,37252.0,957952,2619,1274247.0,16115.0,89721251,9204171,1389530855,41.944479,-89.50257,18.496748
1,Asia,87584406,420665.0,1261504,1321.0,83054584,136997.0,3268317,26819,2661373.0,26348.0,1577782939,68135869,3221832309,18.360696,69.758727,3.767739
2,Australia/Oceania,1312852,93605.0,4697,23.0,541659,436.0,720416,372,369237.0,4807.0,64142257,7172573,42440133,4.092049,284.890159,-37.259427
3,Europe,98616120,1254716.0,1557956,3805.0,78825759,534420.0,18232405,21172,7662960.0,95453.0,1780702856,152667527,748318872,55.400244,6.392763,42.240972
4,North America,74497342,742329.0,1264377,2373.0,51861783,197533.0,21364100,30516,3878852.0,42524.0,946509258,63987294,596156946,61.320722,-108.649198,58.165758
5,South America,41610602,279190.0,1195365,348.0,36501280,68878.0,1704451,13956,1201344.0,28323.0,190670472,10649198,436186051,11.437395,100.310135,3.054274


In [16]:
def continent_visualization(v_list):
    for label in v_list:
        c_df = continent_df[['Continent', label]]
        c_df['Percentage'] = np.round(100*c_df[label]/np.sum(c_df[label]), 2)
        c_df['Virus'] = ['COVID-19' for i in range(len(c_df))]
        
        fig = px.bar(c_df, x = "Virus", y = "Percentage", color = "Continent", hover_data=[label])
        fig.update_layout(title = {'text' : label})
        fig.show()
        gc.collect()

In [17]:
cases_list = ["Total Cases", "Active Cases", "New Cases", "Serious/Critical", "Total Cases/1M"]

deaths_list = ["Total Deaths","New Deaths","Deaths/1M"]

recovered_list = ["Total Recovered", "New Recovered", "%Increase Recovered"]

In [18]:
continent_visualization(cases_list)

In [19]:
continent_visualization(deaths_list)

In [20]:
continent_visualization(recovered_list)

# Country Level Visualization

In [21]:
df = df.drop([len(df)-1])
country_df = df.drop([0])
country_df

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Serious/Critical,Total Cases/1M,Deaths/1M,Total Tests,Test/1M,Population,Continent,%Increase Cases,%Increase Deaths,%Increase Recovered
1,USA,63390876,672872.0,863896,2173.0,42641852,136478.0,19885128,24368,189812.0,2587.0,844714275,2529338,333966543,North America,1.061465,0.251535,0.320056
2,India,36070510,194720.0,484655,442.0,34630536,60405.0,955319,8944,25751.0,346.0,693155280,494857,1400717076,Asia,0.539832,0.091199,0.174427
3,Brazil,22630142,71447.0,620281,139.0,21626836,-1.0,383025,8318,105322.0,2887.0,63776166,296818,214866497,South America,0.315716,0.022409,-0.000005
4,UK,14732551,120806.0,150609,379.0,10945874,87362.0,3636068,820,215292.0,2201.0,425464553,6217464,68430567,Europe,0.819994,0.251645,0.798127
5,France,12573263,368149.0,126059,341.0,8672310,83992.0,3774894,3333,191975.0,1925.0,188795159,2882628,65494103,Europe,2.928031,0.270508,0.968508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,Macao,79,-1.0,-1,-1.0,77,-1.0,2,-1,119.0,-1.0,5075,7656,662844,Asia,-1.265823,100.000000,-1.298701
212,Palau,32,4.0,-1,-1.0,12,-1.0,20,-1,1756.0,-1.0,18788,1030835,18226,Australia/Oceania,12.500000,100.000000,-8.333333
213,Solomon Islands,25,-1.0,-1,-1.0,20,-1.0,5,-1,35.0,-1.0,4500,6315,712566,Australia/Oceania,-4.000000,100.000000,-5.000000
214,Western Sahara,10,-1.0,1,-1.0,8,-1.0,1,-1,16.0,2.0,-1,-1,619731,Africa,-10.000000,-100.000000,-12.500000


# Step-4 : 
**Conclusions :** Top 5 Countries Covid-19 trends 

In [22]:
LOOK_AT = 5
country = country_df.columns[1:14]

fig = go.Figure()
c=0
for i in country_df.index:
    if c < LOOK_AT:
        fig.add_trace(go.Bar(name = country_df['Country'][i], x = country, y = country_df.loc[i][1:14]))
    else :
        break
    c += 1
    
fig.update_layout(title = {"text":f'top {LOOK_AT} countries affected '}, yaxis_type = "log")
fig.show()