### Instalação do ambiente para o Google Colab
- Java 17
- Apache Spark 3.5.5 (hadoop 3.3)
- Python 3.11.11

In [1]:
!apt-get install openjdk-17-jdk-headless -qq > /dev/null
!wget -O spark-3.5.5-bin-hadoop3.tgz http://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
!tar xf spark-3.5.5-bin-hadoop3.tgz
!python --version

--2025-03-05 14:38:57--  http://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400724056 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.5-bin-hadoop3.tgz’


2025-03-05 15:19:30 (161 KB/s) - ‘spark-3.5.5-bin-hadoop3.tgz’ saved [400724056/400724056]

Python 3.11.11


In [2]:
import os

# Export the Java and Spark install locations as environment variables
# (paths match the Colab layout used by the setup cell).
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-17-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.5-bin-hadoop3",
    }
)

### Instalação das libs do projeto

In [3]:
# pyspark
!pip install -q findspark pyspark

# raspagem
!pip install selenium bs4 lxml pandas

Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m21.9 MB

### Extração de dados

In [4]:
import time
import pandas as pd # manipulação de dados
from bs4 import BeautifulSoup # Parseador de HTML
from selenium import webdriver
from selenium.webdriver.common.by import By

from io import StringIO  # lets pandas read the parsed HTML as a file-like object

# Source page and the XPath of the population table inside it.
URL = "https://www.worldometers.info/world-population/population-by-country/"
XPATH = '//*[@id="example2"]'

options = webdriver.FirefoxOptions()
options.add_argument("-headless")

driver = webdriver.Firefox(options=options)
try:
    driver.get(URL)
    # Fixed pause before reading the table (presumably to let the page finish rendering).
    time.sleep(5)
    table_html = driver.find_element(By.XPATH, XPATH).get_attribute('outerHTML')
finally:
    # Always shut the browser down, even when loading or locating the table fails.
    driver.quit()

table = BeautifulSoup(table_html, 'lxml').find(name='table')

# Passing literal HTML to read_html is deprecated; wrap the markup in StringIO.
df = pd.read_html(StringIO(str(table)))[0]

# Keep only the columns used by the Spark processing step and persist them as CSV.
df_needed_columns = df[['Country (or dependency)', 'Population  (2024)', 'Yearly  Change']]

df_needed_columns.to_csv("populacao_paises.csv", index=False)

  df = pd.read_html(str(table))[0]


### Processamento com PySpark

In [5]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one.
session = (
    SparkSession.builder
    .getOrCreate()
)

In [6]:
from pyspark.sql.functions import col, split, cast, pow
from pyspark.sql.types import DoubleType

# Tunable parameters for this step.
MIN_POPULATION = 1_000_000  # countries below this 2024 population are dropped
PROJECTION_YEARS = 2030 - 2024  # number of yearly growth steps to compound

# Load the scraped CSV, normalise the column names and turn the percentage text
# (e.g. "0.89 %") into a fraction (0.0089). Built as one chain so that re-running
# the cell always starts again from the file instead of mutating `df` step by step.
df = (
    session.read.csv(
        'populacao_paises.csv',
        sep=',',
        header=True,
        inferSchema=True,
    )
    .withColumnsRenamed({
        'Country (or dependency)': 'country',
        'Population  (2024)': 'population_2024',
        'Yearly  Change': 'yearly_change',
    })
    # Drop the trailing " %" and keep the numeric part as a string ...
    .withColumn('yearly_change', split(col('yearly_change'), ' %')[0])
    # ... then convert it to a double and scale from percent to fraction.
    .withColumn('yearly_change', col('yearly_change').cast(DoubleType()) / 100)
)

df.printSchema()
df.show(5)

# Keep only countries with at least MIN_POPULATION inhabitants; preview the smallest ones.
pop_filter = df.filter(col('population_2024') >= MIN_POPULATION)
pop_filter.sort(pop_filter.population_2024, ascending=True).show(5)

# Compound growth, assuming the yearly change stays constant:
#
# pop2025 = pop2024 x (1 + yearly_change)
#
# pop2026 = pop2025 x (1 + yearly_change)
# pop2026 = pop2024 x (1 + yearly_change) x (1 + yearly_change)
# pop2026 = pop2024 x (1 + yearly_change) ^ 2
#
# pop2027 = pop2024 x (1 + yearly_change) ^ 3
# ...
# pop2030 = pop2024 x (1 + yearly_change) ^ 6
pop_2030 = pop_filter.withColumn(
    'population_2030',
    col('population_2024') * pow(1 + col('yearly_change'), PROJECTION_YEARS),
)
pop_2030.show(5)

# 'overwrite' replaces the output of a previous run; with 'error' this cell raised
# as soon as the parquet directory already existed, so it could only run once.
pop_2030.write.mode('overwrite').parquet('populacao_paises_processado.parquet')

root
 |-- country: string (nullable = true)
 |-- population_2024: integer (nullable = true)
 |-- yearly_change: double (nullable = true)

+-------------+---------------+--------------------+
|      country|population_2024|       yearly_change|
+-------------+---------------+--------------------+
|        India|     1450935791|              0.0089|
|        China|     1419321278|             -0.0023|
|United States|      345426571|0.005699999999999999|
|    Indonesia|      283487931|0.008199999999999999|
|     Pakistan|      251269164|              0.0152|
+-------------+---------------+--------------------+
only showing top 5 rows

+---------+---------------+--------------------+
|  country|population_2024|       yearly_change|
+---------+---------------+--------------------+
| Djibouti|        1168722|              0.0137|
| Eswatini|        1242822|                0.01|
|Mauritius|        1271169|             -0.0019|
|   Cyprus|        1358282|0.009899999999999999|
|  Estonia|      