In [9]:
# importing required libraries
import numpy as np  # For array handling and numerical operations
import pandas as pd  # For creating DataFrames and data manipulation
import re  # For cleaning raw or scraped data
from bs4 import BeautifulSoup  # For web scraping HTML content
import requests  # For making HTTP requests

# Data Collection By Web Scraping

In [10]:

# URL of the data source
url = "https://www.worldometers.info/coronavirus/countries-where-coronavirus-has-spread/"

# Send an HTTP GET request to fetch data from the URL.
# A timeout is passed explicitly because requests waits indefinitely by default,
# which would hang the notebook if the server never answers.
r = requests.get(url, timeout=30)

# Print the HTTP status code to check if the request was successful
print(f"Status code: {r.status_code}")

# Fail here on a 4xx/5xx response instead of letting later cells parse an error page
r.raise_for_status()

Status code: 200


In [11]:
# Parse the HTML content using BeautifulSoup for further extraction
soup = BeautifulSoup(r.content, features="html.parser")

# Collect every table data (<td>) element as its raw HTML string.
# find_all is the current bs4 method name; the camelCase findAll alias is deprecated.
arr = [str(cell) for cell in soup.find_all("td")]

# Display a sample of the raw data to check the structure
print(arr[:5])

['<td style="font-weight: bold; font-size:16px; text-align:left; padding-left:5px; padding-top:10px; padding-bottom:10px">United States</td>', '<td style="font-weight: bold; text-align:right">111,820,082</td>', '<td style="font-weight: bold; text-align:right">1,219,487</td>', '<td style="font-size:14px; color:#aaa; text-align:right">North America</td>', '<td style="font-weight: bold; font-size:16px; text-align:left; padding-left:5px; padding-top:10px; padding-bottom:10px">India</td>']


In [12]:
# Testing the cleaning of raw data using regex
print(arr[0])  # Print the raw data from the first element

# "<[^>]+>" matches exactly one tag at a time (it cannot run past the tag's own ">"),
# so it still works for a cell without attributes ("<td>x</td>") or with nested tags.
# Each tag is replaced by a single space, so the text comes back padded with spaces.
f = re.sub(r"<[^>]+>", " ", arr[0])
print(f)  # Print the cleaned data

<td style="font-weight: bold; font-size:16px; text-align:left; padding-left:5px; padding-top:10px; padding-bottom:10px">United States</td>
 United States 


In [13]:
# Remove the HTML markup from every scraped cell.
# "<[^>]+>" matches one tag at a time, so cells without attributes ("<td>x</td>")
# or with nested tags keep their text; a greedy ".*" between "<" and ">" would
# swallow the text in those cases.
# .strip() drops the padding spaces left behind where the tags were, so values
# such as "United States" can be matched exactly later on.
cleaned = [re.sub(r"<[^>]+>", " ", cell).strip() for cell in arr]

# The cells arrive row by row, four per table row, in the order
# country, cases, death, region - so every 4th item belongs to the same column.
country = cleaned[0::4]
cases = cleaned[1::4]
death = cleaned[2::4]
region = cleaned[3::4]

# Create a DataFrame from the cleaned data with columns: country, cases, death, and region.
# Wrapping each list in pd.Series lets pandas align columns of unequal length
# (filling with NaN) if the number of scraped cells is not a multiple of four.
df = pd.DataFrame(
    {
        "country": pd.Series(country),
        "cases": pd.Series(cases),
        "death": pd.Series(death),
        "region": pd.Series(region),
    }
)

# Display the resulting DataFrame to check the final data
print(df)


              country          cases        death               region
0      United States    111,820,082    1,219,487        North America 
1              India     45,035,393      533,570                 Asia 
2             France     40,138,560      167,642               Europe 
3            Germany     38,828,995      183,027               Europe 
4             Brazil     38,743,918      711,380        South America 
..                ...            ...          ...                  ...
225             Niue          1,059            0    Australia/Oceania 
226          Tokelau             80            0    Australia/Oceania 
227         Holy See             29            0               Europe 
228   Western Sahara             10            1               Africa 
229       MS Zaandam              9            2                      

[230 rows x 4 columns]


In [14]:
# Save the collected data as a CSV file (tabular format) so it can be loaded
# from other files or folders of the portfolio.
# index=False keeps the DataFrame's row index out of the file.
df.to_csv(path_or_buf="scraped_data.csv", index=False)