## Imports

In [132]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re  # Import the regular expression module
from tqdm import tqdm

## Setups

In [169]:
output_path = "../output/"

## 3.1 Web-crawling Demographics Data

In [133]:
SCRAPE_SOURCE = "https://www.worldometers.info/demographics/"
SCRAPE_ROOT = "https://www.worldometers.info"
response = requests.get(SCRAPE_SOURCE)
response.raise_for_status()  # Raise an exception for bad status codes
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Initialize a list to collect demographic data
demographics_data = []

df_demographics = pd.DataFrame(columns=["Country",
                                        "LifeExpectancy_Both",  # (Both Sexes, in years)
                                        "LifeExpectancy_Female",  # (Females, in years)
                                        "LifeExpectancy_Male",  # (Males, in years)
                                        "UrbanPopulation_Percentage",  # (percentage without commas)
                                        "UrbanPopulation_Absolute",  # (if available)
                                        "PopulationDensity"])
progress_bar = tqdm(soup.find_all(attrs={'data-country': True}))
for country_link in progress_bar:
    progress_bar.set_description(f"Scraping data for {country_link.text.strip()}")

    country_url = SCRAPE_ROOT + soup.find_all(attrs={'data-country': True})[0]['href']
    country_url = SCRAPE_ROOT + country_link['href']  # Fixed to use current country_link

    response = requests.get(country_url)
    country_soup = BeautifulSoup(response.content, 'html.parser')

    # Get life expectancies
    expectancies = map( lambda x: float(x.text.strip()), country_soup.find_all(attrs={'class': 'grid grid-col-1 lg:grid-cols-3 gap-4'})[0].find_all(
                          attrs={'class': 'text-2xl font-bold mb-1.5'}))

    expectancies = list(expectancies)  # Convert map object to list

    # Get urban population data
    populations= re.findall('\d+\.?\d+',
                        country_soup.find(lambda tag: tag.name == 'p' and 'Currently' in tag.text).text.replace(',', ''))[0:2]
    # Dealing with lack of absolute number
    if len(populations) == 1:
        populations.append('None')

    # Get population density
    density = re.findall('\d+\.?\d+',
                        country_soup.find(lambda tag: tag.name == 'p' and 'population density in ' in tag.text).text.replace(',', ''))[0:2]
    populations.append(density[1])
    # Populations = [percentage urban, total population, population density per sqkm]

    # Append data as a list to demographics_data
    demographics_data.append([
        country_link.text.strip(),  # Country name
        *expectancies,              # Life expectancy (Both, Female, Male)
        *populations               # Urban population percentage, absolute, and density
    ])


In [151]:
# Create DataFrame from collected data
df_demographics = pd.DataFrame(
    demographics_data,
    columns=["Country", "LifeExpectancy_Both", "LifeExpectancy_Female", "LifeExpectancy_Male",
             "UrbanPopulation_Percentage", "UrbanPopulation_Absolute", "PopulationDensity"]
).apply(pd.to_numeric, errors='ignore')

  df_demographics = pd.DataFrame(


In [160]:
df_demographics.to_csv(output_path + 'demographics_data.csv', index=False)
head = df_demographics.head(10)
head.to_csv(output_path + 'demographics_before_sort.csv', index=False)
head.sort_values('Country', inplace=False).to_csv(output_path + 'demographics_after_sort.csv', index=False)

### Testing for country 0

In [None]:
(country_url := SCRAPE_ROOT + soup.find_all(attrs={'data-country': True})[0]['href'])

In [None]:
response = requests.get(country_url)

#### Getting Life Expectancy

In [None]:
country_soup = BeautifulSoup(response.content, 'html.parser')
expectancies = map(lambda x: float(x.text.strip()),
                   country_soup.find_all(attrs={'class': 'grid grid-col-1 lg:grid-cols-3 gap-4'})[0].find_all(
                       attrs={'class': 'text-2xl font-bold mb-1.5'}))
list(expectancies)

#### Getting Population

In [None]:
populations = re.findall('\d+\.?\d+',
                         country_soup.find(lambda tag: tag.name == 'p' and 'Currently' in tag.text).text.replace(',',
                                                                                                                 ''))[
              0:2]
# Dealing with lack of absolute number
if len(populations) == 1:
    populations.append('None')
populations

#### Getting Population Density

In [None]:
density = re.findall('\d+\.?\d+',
                     country_soup.find(
                         lambda tag: tag.name == 'p' and 'population density in ' in tag.text).text.replace(',', ''))[
          0:2]
populations.append(density[1])
populations

## 3.2 Load the 2021 CSVs

### a) Reading CSVs

In [164]:
df_gdp = pd.read_csv('../gdp_per_capita_2021.csv', na_values=['None'])
df_pop = pd.read_csv('../population_2021.csv', na_values=['None'])

### b) verifying contents

In [166]:
df_gdp.head(1)

Unnamed: 0,Country,GDP_per_capita_PPP
0,Afghanistan,2144.1665


In [167]:
df_pop.head(1)

Unnamed: 0,Country,Population
0,Afghanistan,40000360


### c) ensuring numeric

In [168]:
df_gdp['GDP_per_capita_PPP'] = pd.to_numeric(df_gdp['GDP_per_capita_PPP'])
df_pop['Population'] = pd.to_numeric(df_pop['Population'])

### d) print thing

In [170]:
# Process GDP DataFrame
gdp_before = df_gdp.head(5)
gdp_before.to_csv(output_path + "gdp_before_sort.csv", index=False)

gdp_sorted = df_gdp.sort_values(by="Country", ascending=True)
gdp_after = gdp_sorted.head(5)
gdp_after.to_csv(output_path + "gdp_after_sort.csv", index=False)

# Process Population DataFrame
pop_before = df_pop.head(5)
pop_before.to_csv(output_path + "pop_before_sort.csv", index=False)

pop_sorted = df_pop.sort_values(by="Country", ascending=True)
pop_after = pop_sorted.head(5)
pop_after.to_csv(output_path + "pop_after_sort.csv", index=False)

# Print outputs (optional for console/logging purposes)
print("GDP Before Sort:\n", gdp_before)
print("GDP After Sort:\n", gdp_after)
print("Population Before Sort:\n", pop_before)
print("Population After Sort:\n", pop_after)


GDP Before Sort:
        Country  GDP_per_capita_PPP
0  Afghanistan           2144.1665
1      Albania          16353.8090
2      Algeria          14496.8650
3      Andorra          59332.2030
4       Angola           7408.1265
GDP After Sort:
        Country  GDP_per_capita_PPP
0  Afghanistan           2144.1665
1      Albania          16353.8090
2      Algeria          14496.8650
3      Andorra          59332.2030
4       Angola           7408.1265
Population Before Sort:
        Country  Population
0  Afghanistan    40000360
1       Africa  1413750475
2  Africa (UN)  1413753005
3      Albania     2849591
4      Algeria    44761051
Population After Sort:
        Country  Population
0  Afghanistan    40000360
1       Africa  1413750475
2  Africa (UN)  1413753005
3      Albania     2849591
4      Algeria    44761051


### e) describe

In [172]:
df_pop.describe().to_csv(output_path + "pop_describe.csv")
df_gdp.describe().to_csv(output_path + "gdp_describe.csv")

## For Part 1 review

In [175]:
print(f'df_demographics shape: {df_demographics.shape}')
print(f'df_gdp shape: {df_gdp.shape}')
print(f'df_pop shape: {df_pop.shape}')

df_demographics shape: (201, 7)
df_gdp shape: (213, 2)
df_pop shape: (260, 2)


In [186]:
print(f'Pearson correlation between LifeExpectancy Both and PopulationDensity is {df_demographics["LifeExpectancy_Both"].corr(df_demographics["PopulationDensity"])}')

Pearson correlation between LifeExpectancy Both and PopulationDensity is 0.09810860899481587
