# ETL

## Definição da Constelação de Fatos

**Tabelas de dimensão**

- dim_data (data_pk, data_completa, data_dia, data_mes, data_semestre, data_ano)
- dim_localidade (localidade_pk, latitude, longitude, cidade, estado, regiao, pais)
- dim_tipo_cancer (tipo_cancer_pk, tipo_cancer, mortalidade, taxa_incidencia_total)
- dim_faixa_etaria (faixa_pk, faixa_idade, idade_min, idade_max, id_idade)
- dim_metrica (metrica_pk, tipo_metrica)
- dim_sexo (sexo_pk, sexo)

**Tabelas de fatos**

- fato_cancer (ano_pk, estado_pk, tipo_cancer_pk, sexo_pk, faixa_pk, metrica_pk, obitos_cancer, incidencia_cancer, prevalencia_cancer)
- fato_clima (data_pk, localidade_pk, temperatura_media, temperatura_max, temperatura_min, radiacao_uv, radiacao_uva, radiacao_uvb, precipitacao)

**Views**

- vw_cidade (localidade_pk, latitude, longitude, clima_cidade, clima_estado, clima_regiao, clima_pais)
- vw_estado (estado_pk, cancer_regiao, cancer_pais)
- vw_dia (data_pk, clima_data_completa, clima_dia, clima_mes, clima_semestre, clima_ano)
- vw_ano (ano_pk, cancer_ano, cancer_decada)

In [1]:
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from dotenv import load_dotenv
import psycopg2
import os

In [2]:
# Python 3.11
# Java 11
# PySpark == 3.4

spark = SparkSession.builder \
    .appName("OLAP - P2") \
    .config("spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.4,"
            "com.amazonaws:aws-java-sdk-bundle:1.11.1026") \
    .config("spark.jars", "/home/rodrigo/jars/postgresql-42.7.3.jar") \
    .config("spark.jars.ivyLogLevel", "ERROR") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .getOrCreate()

print(spark.version)

:: loading settings :: url = jar:file:/home/rodrigo/.local/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/rodrigo/.ivy2/cache
The jars for the packages stored in: /home/rodrigo/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9dc84400-7eb6-4561-b034-755dc34c16b6;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 736ms :: artifacts dl 10ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	:: evicted modules:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 by [com.amazonaws#aws-java-sdk-bundle;1.12.262] in [default]
	------------------------------------------------------------

3.4.0


## Extract

In [3]:
# Extração dos dados no datalake dentro do Minio com a Tipagem

schema_cancer = StructType([
    StructField("measure_id", IntegerType(), True),
    StructField("measure_name", StringType(), True),
    StructField("location_id", IntegerType(), True),
    StructField("location_name", StringType(), True),
    StructField("sex_id", IntegerType(), True),
    StructField("sex_name", StringType(), True),
    StructField("age_id", IntegerType(), True),
    StructField("age_name", StringType(), True),
    StructField("cause_id", IntegerType(), True),
    StructField("cause_name", StringType(), True),
    StructField("metric_id", IntegerType(), True),
    StructField("metric_name", StringType(), True),
    StructField("year", IntegerType(), True),
    StructField("val", DoubleType(), True),
    StructField("upper", DoubleType(), True),
    StructField("lower", DoubleType(), True),
])

df_cancer1 = spark.read.csv("s3a://datalake/cancer-1.csv", schema=schema_cancer, header=True, inferSchema=True)
df_cancer2 = spark.read.csv("s3a://datalake/cancer-2.csv", schema=schema_cancer, header=True, inferSchema=True)
df_cancer = df_cancer1.union(df_cancer2)

print("Numero de Tuplas: ", df_cancer.count())
df_cancer.show()

25/06/27 09:16:26 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/06/27 09:16:26 INFO SharedState: Warehouse path is 'file:/home/rodrigo/Projects/cancer-climate-dw/spark-warehouse'.
25/06/27 09:16:29 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/06/27 09:16:29 INFO MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
25/06/27 09:16:29 INFO MetricsSystemImpl: s3a-file-system metrics system started
25/06/27 09:16:31 INFO InMemoryFileIndex: It took 187 ms to list leaf files for 1 paths.
25/06/27 09:16:35 INFO InMemoryFileIndex: It took 19 ms to list leaf files for 1 paths.
25/06/27 09:16:36 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:16:36 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:16:36 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:16:36 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:16:37 I

Numero de Tuplas:  520506


25/06/27 09:16:42 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:16:42 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:16:42 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:16:42 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:16:42 INFO CodeGenerator: Code generated in 54.690023 ms
25/06/27 09:16:42 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 203.2 KiB, free 433.7 MiB)
25/06/27 09:16:42 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 35.4 KiB, free 433.7 MiB)
25/06/27 09:16:42 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on hal9000:34557 (size: 35.4 KiB, free: 434.3 MiB)
25/06/27 09:16:42 INFO SparkContext: Created broadcast 4 from showString at NativeMethodAccessorImpl.java:0
25/06/27 09:16:43 INFO FileSourceScanExec: Planning scan with bin packing, max size: 11024413 bytes, open cost is considered as scanning 4194304 bytes.
25/06/27 09:16:43 INFO BlockManagerInfo: Remo

+----------+------------+-----------+------------------+------+---------+------+-----------+--------+--------------------+---------+-----------+----+--------------------+--------------------+--------------------+
|measure_id|measure_name|location_id|     location_name|sex_id| sex_name|age_id|   age_name|cause_id|          cause_name|metric_id|metric_name|year|                 val|               upper|               lower|
+----------+------------+-----------+------------------+------+---------+------+-----------+--------+--------------------+---------+-----------+----+--------------------+--------------------+--------------------+
|         1|      Óbitos|       4761|Mato Grosso do Sul|     1|Masculino|     8|15 -19 anos|     459|Melanoma maligno ...|        1|     Número|2001| 0.03482425671072571| 0.04761777852821798|0.025552037408807626|
|         1|      Óbitos|       4761|Mato Grosso do Sul|     2| Feminino|     8|15 -19 anos|     459|Melanoma maligno ...|        1|     Número|2001

25/06/27 09:16:43 INFO CodeGenerator: Code generated in 94.458516 ms


In [4]:
# Extração dos dados no datalake dentro do Minio com a Tipagem

schema_clima = StructType([
    StructField("cidade", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("YEAR", IntegerType(), True),
    StructField("DOY", IntegerType(), True),
    StructField("T2M", DoubleType(), True),
    StructField("T2M_MAX", DoubleType(), True),
    StructField("T2M_MIN", DoubleType(), True),
    StructField("ALLSKY_SFC_UV_INDEX", DoubleType(), True),
    StructField("ALLSKY_SFC_UVA", DoubleType(), True),
    StructField("ALLSKY_SFC_UVB", DoubleType(), True),
    StructField("PRECTOTCORR", DoubleType(), True),
    StructField("codigo_ibge", IntegerType(), True),
    StructField("capital", StringType(), True),
    StructField("estado", StringType(), True),
])

df_clima = spark.read.csv("s3a://datalake/climate.csv", schema=schema_clima, header=True, inferSchema=True)

print("Numero de Tuplas: ", df_clima.count())
df_clima.show()

25/06/27 09:16:43 INFO InMemoryFileIndex: It took 19 ms to list leaf files for 1 paths.
25/06/27 09:16:44 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:16:44 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:16:44 INFO MemoryStore: Block broadcast_7 stored as values in memory (estimated size 203.2 KiB, free 433.2 MiB)
25/06/27 09:16:44 INFO BlockManagerInfo: Removed broadcast_6_piece0 on hal9000:34557 in memory (size: 11.2 KiB, free: 434.3 MiB)
25/06/27 09:16:44 INFO MemoryStore: Block broadcast_7_piece0 stored as bytes in memory (estimated size 35.4 KiB, free 433.2 MiB)
25/06/27 09:16:44 INFO BlockManagerInfo: Added broadcast_7_piece0 in memory on hal9000:34557 (size: 35.4 KiB, free: 434.2 MiB)
25/06/27 09:16:44 INFO BlockManagerInfo: Removed broadcast_4_piece0 on hal9000:34557 in memory (size: 35.4 KiB, free: 434.3 MiB)
25/06/27 09:16:44 INFO SparkContext: Created broadcast 7 from count at NativeMethodAccessorImpl.java:0
25/06/27 09:16:44 INFO FileSourceScanExec: Plann

Numero de Tuplas:  42721900


25/06/27 09:17:16 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:17:16 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:17:16 INFO CodeGenerator: Code generated in 69.269784 ms
25/06/27 09:17:17 INFO MemoryStore: Block broadcast_10 stored as values in memory (estimated size 203.2 KiB, free 433.5 MiB)
25/06/27 09:17:17 INFO MemoryStore: Block broadcast_10_piece0 stored as bytes in memory (estimated size 35.4 KiB, free 433.4 MiB)
25/06/27 09:17:17 INFO BlockManagerInfo: Added broadcast_10_piece0 in memory on hal9000:34557 (size: 35.4 KiB, free: 434.2 MiB)
25/06/27 09:17:17 INFO SparkContext: Created broadcast 10 from showString at NativeMethodAccessorImpl.java:0
25/06/27 09:17:17 INFO FileSourceScanExec: Planning scan with bin packing, max size: 134217728 bytes, open cost is considered as scanning 4194304 bytes.
25/06/27 09:17:17 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/06/27 09:17:17 INFO DAGScheduler: Got job 5 (showString at Nati

+---------------+--------+---------+----+---+-----+-------+-------+-------------------+--------------+--------------+-----------+-----------+-------+------+
|         cidade|latitude|longitude|YEAR|DOY|  T2M|T2M_MAX|T2M_MIN|ALLSKY_SFC_UV_INDEX|ALLSKY_SFC_UVA|ALLSKY_SFC_UVB|PRECTOTCORR|codigo_ibge|capital|estado|
+---------------+--------+---------+----+---+-----+-------+-------+-------------------+--------------+--------------+-----------+-----------+-------+------+
|Abadia de Goiás|-16.7573| -49.4412|2001|  1|23.12|  26.25|  20.85|               2.29|           1.2|          0.04|      10.58|    5200050|  False| Goiás|
|Abadia de Goiás|-16.7573| -49.4412|2001|  2|21.86|  24.46|  19.76|               1.95|          1.05|          0.03|        4.7|    5200050|  False| Goiás|
|Abadia de Goiás|-16.7573| -49.4412|2001|  3|21.72|  25.39|  18.91|               2.27|           1.2|          0.04|        4.3|    5200050|  False| Goiás|
|Abadia de Goiás|-16.7573| -49.4412|2001|  4|23.14|  27.84

25/06/27 09:17:17 INFO TaskSetManager: Finished task 0.0 in stage 7.0 (TID 45) in 159 ms on hal9000 (executor driver) (1/1)
25/06/27 09:17:17 INFO TaskSchedulerImpl: Removed TaskSet 7.0, whose tasks have all completed, from pool 
25/06/27 09:17:17 INFO DAGScheduler: ResultStage 7 (showString at NativeMethodAccessorImpl.java:0) finished in 0.194 s
25/06/27 09:17:17 INFO DAGScheduler: Job 5 is finished. Cancelling potential speculative or zombie tasks for this job
25/06/27 09:17:17 INFO TaskSchedulerImpl: Killing all running tasks in stage 7: Stage finished
25/06/27 09:17:17 INFO DAGScheduler: Job 5 finished: showString at NativeMethodAccessorImpl.java:0, took 0.212519 s
25/06/27 09:17:17 INFO CodeGenerator: Code generated in 28.39563 ms


## Transform

In [5]:
# Transformção da data, os campos 'data_completa', 'mes', 'semestre' são derivados do DOY por funções do Spark

df_data = df_clima.withColumn(
    "data_completa",
    F.expr("date_add(to_date(concat(YEAR, '-01-01')), DOY - 1)")
).select(
    "data_completa",
    F.dayofmonth("data_completa").alias("data_dia"),
    F.month("data_completa").alias("data_mes"),
    ((F.month("data_completa")-1)/6).cast("int").alias("data_semestre"),
    F.year("data_completa").alias("data_ano")
).distinct()

windowSpec = Window.orderBy("data_completa")

df_data = df_data.withColumn(
    "data_pk",
    F.row_number().over(windowSpec)
)

df_data = df_data.withColumn(
    "data_decada",
    (F.floor(F.col("data_ano") / 10) * 10).cast("int")
)

df_data = df_data.select("data_pk", "data_completa", "data_dia", "data_mes", "data_semestre", "data_ano", "data_decada")

print("Numero de Tuplas: ", df_data.count())
df_data.show()

25/06/27 09:17:17 INFO BlockManagerInfo: Removed broadcast_11_piece0 on hal9000:34557 in memory (size: 9.0 KiB, free: 434.2 MiB)
25/06/27 09:17:17 INFO BlockManagerInfo: Removed broadcast_9_piece0 on hal9000:34557 in memory (size: 5.8 KiB, free: 434.3 MiB)
25/06/27 09:17:19 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:17:19 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:17:19 INFO HashAggregateExec: spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
25/06/27 09:17:19 INFO CodeGenerator: Code generated in 151.304182 ms
25/06/27 09:17:19 INFO MemoryStore: Block broadcast_12 stored as values in memory (estimated size 203.2 KiB, free 433.2 MiB)
25/06/27 09:17:19 INFO MemoryStore: Block broadcast_12_piece0 stored as bytes in memory (estimated size 35.4 KiB, free 433.2 MiB)
25/06/27 09:17:19 INFO BlockManagerInfo: Added broadcast_12_piece0 in memory on hal9000:34557 (size: 35.4 KiB

Numero de Tuplas:  7670


25/06/27 09:18:31 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:18:31 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:18:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:18:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:18:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:18:31 INFO HashAggregateExec: spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
25/06/27 09:18:31 INFO MemoryStore: Block broadcast_16 stored as values in memory (estimated size 203.2 KiB, free 433.1 MiB)
25/06/27 09:18:31 INFO MemoryStore: Block broadcast_16_piece0 stored as

+-------+-------------+--------+--------+-------------+--------+-----------+
|data_pk|data_completa|data_dia|data_mes|data_semestre|data_ano|data_decada|
+-------+-------------+--------+--------+-------------+--------+-----------+
|      1|   2001-01-01|       1|       1|            0|    2001|       2000|
|      2|   2001-01-02|       2|       1|            0|    2001|       2000|
|      3|   2001-01-03|       3|       1|            0|    2001|       2000|
|      4|   2001-01-04|       4|       1|            0|    2001|       2000|
|      5|   2001-01-05|       5|       1|            0|    2001|       2000|
|      6|   2001-01-06|       6|       1|            0|    2001|       2000|
|      7|   2001-01-07|       7|       1|            0|    2001|       2000|
|      8|   2001-01-08|       8|       1|            0|    2001|       2000|
|      9|   2001-01-09|       9|       1|            0|    2001|       2000|
|     10|   2001-01-10|      10|       1|            0|    2001|       2000|

25/06/27 09:19:48 INFO CodeGenerator: Code generated in 20.404103 ms
25/06/27 09:19:48 INFO CodeGenerator: Code generated in 15.753689 ms
25/06/27 09:19:48 INFO CodeGenerator: Code generated in 17.31529 ms
25/06/27 09:19:48 INFO CodeGenerator: Code generated in 15.335579 ms
25/06/27 09:19:48 INFO Executor: Finished task 0.0 in stage 17.0 (TID 121). 6836 bytes result sent to driver
25/06/27 09:19:48 INFO TaskSetManager: Finished task 0.0 in stage 17.0 (TID 121) in 224 ms on hal9000 (executor driver) (1/1)
25/06/27 09:19:48 INFO TaskSchedulerImpl: Removed TaskSet 17.0, whose tasks have all completed, from pool 
25/06/27 09:19:48 INFO DAGScheduler: ResultStage 17 (showString at NativeMethodAccessorImpl.java:0) finished in 0.257 s
25/06/27 09:19:48 INFO DAGScheduler: Job 10 is finished. Cancelling potential speculative or zombie tasks for this job
25/06/27 09:19:48 INFO TaskSchedulerImpl: Killing all running tasks in stage 17: Stage finished
25/06/27 09:19:48 INFO DAGScheduler: Job 10 fini

In [6]:
# Transformção da Localidade, os campos 'região', 'pais' são derivados do 'estado' pelo mapeamento a seguir

estado_para_regiao = {
    "Acre": "Norte",
    "Alagoas": "Nordeste",
    "Amapá": "Norte",
    "Amazonas": "Norte",
    "Bahia": "Nordeste",
    "Ceará": "Nordeste",
    "Distrito Federal": "Centro-Oeste",
    "Espírito Santo": "Sudeste",
    "Goiás": "Centro-Oeste",
    "Maranhão": "Nordeste",
    "Mato Grosso": "Centro-Oeste",
    "Mato Grosso do Sul": "Centro-Oeste",
    "Minas Gerais": "Sudeste",
    "Pará": "Norte",
    "Paraíba": "Nordeste",
    "Paraná": "Sul",
    "Pernambuco": "Nordeste",
    "Piauí": "Nordeste",
    "Rio de Janeiro": "Sudeste",
    "Rio Grande do Norte": "Nordeste",
    "Rio Grande do Sul": "Sul",
    "Rondônia": "Norte",
    "Roraima": "Norte",
    "Santa Catarina": "Sul",
    "São Paulo": "Sudeste",
    "Sergipe": "Nordeste",
    "Tocantins": "Norte"
}

# Cria um mapa de estado para região
estado_regiao_map = F.create_map([F.lit(x) for kv in estado_para_regiao.items() for x in kv])

df_localidade = (
    df_clima.select("cidade", "estado", "latitude", "longitude")
            .distinct()
            .withColumn("pais", F.lit("Brasil"))
            .withColumn(
                "regiao",
                F.when(F.col("estado").isNotNull(), estado_regiao_map.getItem(F.col("estado")))
            )
)

windowSpec = Window.orderBy("cidade", "estado", "latitude", "longitude")
df_localidade = (
    df_localidade.withColumn("localidade_pk", F.row_number().over(windowSpec))
                 .select("localidade_pk", "latitude", "longitude", "cidade", "estado", "regiao", "pais")
)

df_localidade.show(truncate=False)


25/06/27 09:19:49 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:19:49 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:19:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:19:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:19:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:19:49 INFO HashAggregateExec: spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
25/06/27 09:19:49 INFO CodeGenerator: Code generated in 54.180044 ms
25/06/27 09:19:49 INFO MemoryStore: Block broadcast_20 stored as values in memory (estimated size 203.2 KiB, free 433.8 MiB)
25

+-------------+--------+---------+-------------------+-------------------+------------+------+
|localidade_pk|latitude|longitude|cidade             |estado             |regiao      |pais  |
+-------------+--------+---------+-------------------+-------------------+------------+------+
|1            |-16.7573|-49.4412 |Abadia de Goiás    |Goiás              |Centro-Oeste|Brasil|
|2            |-18.4831|-47.3916 |Abadia dos Dourados|Minas Gerais       |Sudeste     |Brasil|
|3            |-16.197 |-48.7057 |Abadiânia          |Goiás              |Centro-Oeste|Brasil|
|4            |-1.7218 |-48.8788 |Abaetetuba         |Pará               |Norte       |Brasil|
|5            |-19.1551|-45.4444 |Abaeté             |Minas Gerais       |Sudeste     |Brasil|
|6            |-7.3459 |-39.0416 |Abaiara            |Ceará              |Nordeste    |Brasil|
|7            |-8.7207 |-39.1162 |Abaré              |Bahia              |Nordeste    |Brasil|
|8            |-23.3049|-50.3133 |Abatiá          

25/06/27 09:21:14 INFO CodeGenerator: Code generated in 26.878986 ms
25/06/27 09:21:14 INFO Executor: Finished task 0.0 in stage 20.0 (TID 155). 7737 bytes result sent to driver
25/06/27 09:21:14 INFO TaskSetManager: Finished task 0.0 in stage 20.0 (TID 155) in 182 ms on hal9000 (executor driver) (1/1)
25/06/27 09:21:14 INFO TaskSchedulerImpl: Removed TaskSet 20.0, whose tasks have all completed, from pool 
25/06/27 09:21:14 INFO DAGScheduler: ResultStage 20 (showString at NativeMethodAccessorImpl.java:0) finished in 0.197 s
25/06/27 09:21:14 INFO DAGScheduler: Job 12 is finished. Cancelling potential speculative or zombie tasks for this job
25/06/27 09:21:14 INFO TaskSchedulerImpl: Killing all running tasks in stage 20: Stage finished
25/06/27 09:21:14 INFO DAGScheduler: Job 12 finished: showString at NativeMethodAccessorImpl.java:0, took 0.219012 s
25/06/27 09:21:14 INFO CodeGenerator: Code generated in 19.394214 ms            


In [7]:
# Tranformação da Dimensão Sexo

df_sexo = df_cancer.select(
    F.col("sex_name").alias("sexo")
).distinct()

df_sexo = df_sexo.withColumn("sexo_pk", F.monotonically_increasing_id())

df_sexo.show()

25/06/27 09:21:14 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:14 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:14 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:14 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:14 INFO HashAggregateExec: spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
25/06/27 09:21:14 INFO CodeGenerator: Code generated in 28.54032 ms
25/06/27 09:21:14 INFO CodeGenerator: Code generated in 11.558611 ms
25/06/27 09:21:14 INFO MemoryStore: Block broadcast_23 stored as values in memory (estimated size 203.2 KiB, free 433.5 MiB)
25/06/27 09:21:14 INFO MemoryStore: Block broadcast_23_piece0 stored as bytes in memory (estimated size 35.4 KiB, free 433.5 MiB)
25/06/27 09:21:14 INFO BlockManagerInfo: Added broadcast_23_piece0 in memory on hal9000:34557 (size: 35.4 KiB, free: 434.2 MiB)
25/06/27 09:21:14 INFO SparkContext: Created bro

+---------+-------+
|     sexo|sexo_pk|
+---------+-------+
| Feminino|      0|
|Masculino|      1|
+---------+-------+



25/06/27 09:21:17 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/06/27 09:21:17 INFO DAGScheduler: Got job 14 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/06/27 09:21:17 INFO DAGScheduler: Final stage: ResultStage 23 (showString at NativeMethodAccessorImpl.java:0)
25/06/27 09:21:17 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 22)
25/06/27 09:21:17 INFO DAGScheduler: Missing parents: List()
25/06/27 09:21:17 INFO DAGScheduler: Submitting ResultStage 23 (MapPartitionsRDD[73] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/06/27 09:21:17 INFO MemoryStore: Block broadcast_26 stored as values in memory (estimated size 41.5 KiB, free 433.6 MiB)
25/06/27 09:21:17 INFO MemoryStore: Block broadcast_26_piece0 stored as bytes in memory (estimated size 19.7 KiB, free 433.6 MiB)
25/06/27 09:21:17 INFO BlockManagerInfo: Added broadcast_26_piece0 in memory on hal9000:34557 (size: 19

In [8]:
# Transformção da Dimensão Tipo de Cancer

# Os campos 'mortalidade' e 'taxa_incidencia_total' são derivados dos proprios dados 

total_casos = df_cancer.filter(F.col("metric_name") == "Número") \
    .agg(F.sum("val").alias("total_casos_geral")) \
    .collect()[0]["total_casos_geral"]

df_tipo_cancer = (
    df_cancer.filter(F.col("metric_name") == "Número") 
            .groupBy("cause_name")
            .agg(
                F.sum(F.when(F.col("measure_name") == "Óbitos", F.col("val")).otherwise(0)).alias("total_obitos"),
                F.sum(F.when(F.col("measure_name") != "Óbitos", F.col("val")).otherwise(0)).alias("total_casos")
            ) 
            .withColumn(
                "mortalidade",
                F.round(F.when(F.col("total_casos") > 0, (F.col("total_obitos") / F.col("total_casos")) * 100).otherwise(0),2)
            ) 
            .withColumn(
                "taxa_incidencia_total",
                F.round((F.col("total_casos") / F.lit(total_casos)) * 100, 2)
            ) 
            .withColumn(
                "tipo_cancer_pk",
                F.monotonically_increasing_id()
            ) 
            .select(
                "tipo_cancer_pk",
                F.col("cause_name").alias("tipo_cancer"),
                "mortalidade",
                "taxa_incidencia_total"
            )
)

df_tipo_cancer.show(truncate=False)

25/06/27 09:21:17 INFO FileSourceStrategy: Pushed Filters: IsNotNull(metric_name),EqualTo(metric_name,Número)
25/06/27 09:21:17 INFO FileSourceStrategy: Post-Scan Filters: isnotnull(metric_name#11),(metric_name#11 = Número)
25/06/27 09:21:17 INFO FileSourceStrategy: Pushed Filters: IsNotNull(metric_name),EqualTo(metric_name,Número)
25/06/27 09:21:17 INFO FileSourceStrategy: Post-Scan Filters: isnotnull(metric_name#43),(metric_name#43 = Número)
25/06/27 09:21:17 INFO CodeGenerator: Code generated in 15.772801 ms
25/06/27 09:21:17 INFO CodeGenerator: Code generated in 12.455727 ms
25/06/27 09:21:17 INFO MemoryStore: Block broadcast_27 stored as values in memory (estimated size 203.2 KiB, free 433.4 MiB)
25/06/27 09:21:17 INFO MemoryStore: Block broadcast_27_piece0 stored as bytes in memory (estimated size 35.4 KiB, free 433.4 MiB)
25/06/27 09:21:17 INFO BlockManagerInfo: Added broadcast_27_piece0 in memory on hal9000:34557 (size: 35.4 KiB, free: 434.2 MiB)
25/06/27 09:21:17 INFO SparkCon

+--------------+------------------------------------------------------------+-----------+---------------------+
|tipo_cancer_pk|tipo_cancer                                                 |mortalidade|taxa_incidencia_total|
+--------------+------------------------------------------------------------+-----------+---------------------+
|0             |Melanoma maligno da pele                                    |10.89      |9.49                 |
|1             |Câncer de pele não melanoma (carcinoma de células escamosas)|14.89      |7.76                 |
|2             |Câncer de pele não melanoma (carcinoma basocelular)         |0.0        |80.57                |
+--------------+------------------------------------------------------------+-----------+---------------------+



25/06/27 09:21:22 INFO CodeGenerator: Code generated in 69.01351 ms
25/06/27 09:21:22 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/06/27 09:21:22 INFO DAGScheduler: Got job 18 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/06/27 09:21:22 INFO DAGScheduler: Final stage: ResultStage 29 (showString at NativeMethodAccessorImpl.java:0)
25/06/27 09:21:22 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 28)
25/06/27 09:21:22 INFO DAGScheduler: Missing parents: List()
25/06/27 09:21:22 INFO DAGScheduler: Submitting ResultStage 29 (MapPartitionsRDD[97] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/06/27 09:21:22 INFO MemoryStore: Block broadcast_34 stored as values in memory (estimated size 59.8 KiB, free 433.0 MiB)
25/06/27 09:21:22 INFO MemoryStore: Block broadcast_34_piece0 stored as bytes in memory (estimated size 26.1 KiB, free 433.0 MiB)
25/06/27 09:21:22 INFO BlockManager

In [9]:
# Transformações da Dimensão Faixa Etaria

# Aqui há o tratamento das formatações desse campo, lidando com casos como:
# '20-25' | '20 - 25' | '20-25 anos' | -> Formatações Heterogeneas


df_faixa_etaria = (
    df_cancer.select(
        F.col("age_id").alias("id_idade"),
        F.col("age_name").alias("faixa_etaria")
    )
    .distinct()
    .withColumn(
        "faixa_etaria",
        F.regexp_replace(F.regexp_replace(F.col("faixa_etaria"), "anos", ""), " ", "")
    )
    .withColumn(
        "idade_min",
        F.when(
            F.col("faixa_etaria").contains("+"),
            F.regexp_replace(F.col("faixa_etaria"), "[^0-9]", "").cast("int")
        ).otherwise(
            F.split(F.col("faixa_etaria"), "-")[0].cast("int")
        )
    )
    .withColumn(
        "idade_max",
        F.when(
            F.col("faixa_etaria").contains("+"),
            F.lit(130)
        ).otherwise(
            F.split(F.col("faixa_etaria"), "-")[1].cast("int")
        )
    )
    .withColumn("faixa_pk", F.monotonically_increasing_id())
    .select("faixa_pk", "faixa_etaria", "idade_min", "idade_max", "id_idade")
)


df_faixa_etaria.show()

25/06/27 09:21:22 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:22 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:22 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:22 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:23 INFO HashAggregateExec: spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
25/06/27 09:21:23 INFO CodeGenerator: Code generated in 14.387454 ms
25/06/27 09:21:23 INFO CodeGenerator: Code generated in 24.999153 ms
25/06/27 09:21:23 INFO MemoryStore: Block broadcast_35 stored as values in memory (estimated size 203.2 KiB, free 432.8 MiB)
25/06/27 09:21:23 INFO MemoryStore: Block broadcast_35_piece0 stored as bytes in memory (estimated size 35.4 KiB, free 432.8 MiB)
25/06/27 09:21:23 INFO BlockManagerInfo: Added broadcast_35_piece0 in memory on hal9000:34557 (size: 35.4 KiB, free: 434.1 MiB)
25/06/27 09:21:23 INFO SparkContext: Created br

+--------+------------+---------+---------+--------+
|faixa_pk|faixa_etaria|idade_min|idade_max|id_idade|
+--------+------------+---------+---------+--------+
|       0|       35-39|       35|       39|      12|
|       1|       40-44|       40|       44|      13|
|       2|       25-29|       25|       29|      10|
|       3|       70-74|       70|       74|      19|
|       4|       20-24|       20|       24|       9|
|       5|       65-69|       65|       69|      18|
|       6|       50-54|       50|       54|      15|
|       7|       80-84|       80|       84|      30|
|       8|       90-94|       90|       94|      32|
|       9|       15-19|       15|       19|       8|
|      10|         95+|       95|      130|     235|
|      11|       75-79|       75|       79|      20|
|      12|       30-34|       30|       34|      11|
|      13|       60-64|       60|       64|      17|
|      14|       85-89|       85|       89|      31|
|      15|       55-59|       55|       59|   

25/06/27 09:21:25 INFO Executor: Finished task 0.0 in stage 32.0 (TID 195). 6092 bytes result sent to driver
25/06/27 09:21:25 INFO TaskSetManager: Finished task 0.0 in stage 32.0 (TID 195) in 67 ms on hal9000 (executor driver) (1/1)
25/06/27 09:21:25 INFO TaskSchedulerImpl: Removed TaskSet 32.0, whose tasks have all completed, from pool 
25/06/27 09:21:25 INFO DAGScheduler: ResultStage 32 (showString at NativeMethodAccessorImpl.java:0) finished in 0.084 s
25/06/27 09:21:25 INFO DAGScheduler: Job 20 is finished. Cancelling potential speculative or zombie tasks for this job
25/06/27 09:21:25 INFO TaskSchedulerImpl: Killing all running tasks in stage 32: Stage finished
25/06/27 09:21:25 INFO DAGScheduler: Job 20 finished: showString at NativeMethodAccessorImpl.java:0, took 0.094944 s
25/06/27 09:21:25 INFO CodeGenerator: Code generated in 16.091953 ms            


In [10]:
# Transfomações da Dimensão Metrica

df_metrica = (
    df_cancer
    .select(F.col("metric_name").alias("tipo_metrica"))
    .distinct()
    .withColumn("metrica_pk", F.row_number().over(Window.orderBy("tipo_metrica")))
    .select("metrica_pk", "tipo_metrica")
)

df_metrica.show()

25/06/27 09:21:25 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:25 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:25 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:25 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:21:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:21:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:21:25 INFO HashAggregateExec: spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
25/06/27 09:21:25 INFO MemoryStore: Block broadcast_39 stored as values i

+----------+------------+
|metrica_pk|tipo_metrica|
+----------+------------+
|         1|      Número|
|         2|  Percentual|
|         3|        Taxa|
+----------+------------+



25/06/27 09:21:27 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/06/27 09:21:27 INFO DAGScheduler: Got job 22 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/06/27 09:21:27 INFO DAGScheduler: Final stage: ResultStage 35 (showString at NativeMethodAccessorImpl.java:0)
25/06/27 09:21:27 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 34)
25/06/27 09:21:27 INFO DAGScheduler: Missing parents: List()
25/06/27 09:21:27 INFO DAGScheduler: Submitting ResultStage 35 (MapPartitionsRDD[124] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/06/27 09:21:27 INFO MemoryStore: Block broadcast_42 stored as values in memory (estimated size 49.8 KiB, free 433.1 MiB)
25/06/27 09:21:27 INFO MemoryStore: Block broadcast_42_piece0 stored as bytes in memory (estimated size 22.8 KiB, free 433.1 MiB)
25/06/27 09:21:27 INFO BlockManagerInfo: Added broadcast_42_piece0 in memory on hal9000:34557 (size: 2

In [None]:
# Transformação da Dimensão Clima

df_fato_clima = (
    df_clima.withColumn(
            "data_completa",
            F.expr("date_add(to_date(concat(YEAR, '-01-01')), DOY - 1)")
        ) 
        .join(df_data, ["data_completa"], "left") 
        .join(df_localidade, ["cidade", "latitude", "longitude"], "left")
        .filter(F.col("localidade_pk").isNotNull() & F.col("data_pk").isNotNull()) # Elimina campos Nulos
        .select(
            F.col("data_pk"),
            F.col("localidade_pk"),
            F.round(F.col("T2M"), 2).cast(DoubleType()).alias("temperatura_media"),
            F.round(F.col("T2M_MAX"), 2).cast(DoubleType()).alias("temperatura_max"),
            F.round(F.col("T2M_MIN"), 2).cast(DoubleType()).alias("temperatura_min"),
            F.round(F.col("ALLSKY_SFC_UV_INDEX"), 2).cast(DoubleType()).alias("radiacao_uv"),
            F.round(F.col("ALLSKY_SFC_UVA"), 2).cast(DoubleType()).alias("radiacao_uva"),
            F.round(F.col("ALLSKY_SFC_UVB"), 2).cast(DoubleType()).alias("radiacao_uvb"),
            F.round(F.col("PRECTOTCORR"), 2).cast(DoubleType()).alias("precipitacao")
        )
)

df_fato_clima.show()

25/06/27 09:21:27 INFO FileSourceStrategy: Pushed Filters: IsNotNull(cidade),IsNotNull(latitude),IsNotNull(longitude)
25/06/27 09:21:27 INFO FileSourceStrategy: Post-Scan Filters: isnotnull(date_add(cast(concat(cast(YEAR#217 as string), -01-01) as date), (DOY#218 - 1))),isnotnull(cidade#214),isnotnull(latitude#215),isnotnull(longitude#216)
25/06/27 09:21:27 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:27 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:27 INFO FileSourceStrategy: Pushed Filters: 
25/06/27 09:21:27 INFO FileSourceStrategy: Post-Scan Filters: 
25/06/27 09:21:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:21:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/27 09:21:28 WARN WindowExec: No Partition Defined for Window operation! Movi

In [None]:
# Criação temporaria dos dados do Estado e do Ano para Tabela de Fatos Cancer

df_estado = (
    df_localidade
    .filter(F.col("estado").isNotNull())
    .select(F.col("estado"))
    .distinct()
)

window_estado = Window.orderBy("estado")
df_estado = (
    df_estado
    .withColumn("estado_pk", F.row_number().over(window_estado))
    

    .select("estado_pk", "estado")
)

df_ano = (
    df_data
    .select(F.col("data_ano").alias("ano"))
    .distinct()
)

window_ano = Window.orderBy("ano")
df_ano = (
    df_ano
    .withColumn("ano_pk", F.row_number().over(window_ano))
    .select("ano_pk", "ano")
)

df_estado.show()
df_ano.show()

25/06/26 22:07:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:07:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:07:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:08:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:08:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:08:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 2

+---------+-------------------+
|estado_pk|             estado|
+---------+-------------------+
|        1|               Acre|
|        2|            Alagoas|
|        3|              Amapá|
|        4|           Amazonas|
|        5|              Bahia|
|        6|              Ceará|
|        7|   Distrito Federal|
|        8|     Espírito Santo|
|        9|              Goiás|
|       10|           Maranhão|
|       11|        Mato Grosso|
|       12| Mato Grosso do Sul|
|       13|       Minas Gerais|
|       14|             Paraná|
|       15|            Paraíba|
|       16|               Pará|
|       17|         Pernambuco|
|       18|              Piauí|
|       19|Rio Grande do Norte|
|       20|  Rio Grande do Sul|
+---------+-------------------+
only showing top 20 rows





+------+----+
|ano_pk| ano|
+------+----+
|     1|2001|
|     2|2002|
|     3|2003|
|     4|2004|
|     5|2005|
|     6|2006|
|     7|2007|
|     8|2008|
|     9|2009|
|    10|2010|
|    11|2011|
|    12|2012|
|    13|2013|
|    14|2014|
|    15|2015|
|    16|2016|
|    17|2017|
|    18|2018|
|    19|2019|
|    20|2020|
+------+----+
only showing top 20 rows



25/06/26 22:09:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:09:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:09:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [None]:
# Transformação da Tabela de Fatos Cancer

df_fato_cancer = (
    df_cancer
    .join(df_estado, df_cancer.location_name == df_estado.estado, "left")
    .join(df_sexo, df_cancer.sex_name == df_sexo.sexo, "left")
    .join(df_faixa_etaria, df_cancer.age_id == df_faixa_etaria.id_idade, "left") 
    .join(df_tipo_cancer, df_cancer.cause_name == df_tipo_cancer.tipo_cancer, "left")
    .join(df_ano, df_cancer.year == df_ano.ano, "left")
    .join(df_metrica, df_cancer.metric_name == df_metrica.tipo_metrica, "left")
    .groupBy("ano_pk", "estado_pk", "sexo_pk", "faixa_pk", "tipo_cancer_pk", "metrica_pk")
    .pivot("measure_name")
    .agg(F.sum("val"))
)

group_cols = ["ano_pk", "estado_pk", "sexo_pk", "faixa_pk", "tipo_cancer_pk", "metrica_pk"]
pivot_cols = [c for c in df_fato_cancer.columns if c not in group_cols]

for col_name in pivot_cols:
    df_fato_cancer = df_fato_cancer.withColumn(
        col_name,
        F.round(F.coalesce(F.col(col_name), F.lit(0.0)), 3).cast(DoubleType())
    )

df_fato_cancer = df_fato_cancer \
    .withColumnRenamed("Incidência", "incidencia_cancer") \
    .withColumnRenamed("Prevalência", "prevalencia_cancer") \
    .withColumnRenamed("Óbitos", "obitos_cancer")

df_fato_cancer.show()

25/06/26 22:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 2

Numero de Tuplas:  204120


25/06/26 22:10:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:10:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:10:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:10:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:10:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:10:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 2

+------+---------+-------+--------+--------------+----------+-----------------+------------------+-------------+
|ano_pk|estado_pk|sexo_pk|faixa_pk|tipo_cancer_pk|metrica_pk|incidencia_cancer|prevalencia_cancer|obitos_cancer|
+------+---------+-------+--------+--------------+----------+-----------------+------------------+-------------+
|    21|       11|      1|      14|             1|         1|            2.861|             8.207|         1.68|
|     4|        8|      1|       3|             2|         1|            1.607|             2.491|        1.675|
|    16|        2|      0|      16|             1|         1|            0.557|             0.327|        0.577|
|    10|       16|      1|       4|             1|         1|            0.787|             4.473|        0.351|
|    18|       14|      1|       5|             1|         3|            11.45|            29.483|        7.128|
|    14|       27|      0|      13|             1|         3|            0.557|             4.39

                                                                                

## Load

In [None]:
load_dotenv() 

DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

In [None]:
# Função para salvar os dados em um base Postgresql

def salvar(df, tabela):
    url = f"jdbc:postgresql://{DB_HOST}:{DB_PORT}/{DB_NAME}"
    df.write \
      .format("jdbc") \
      .option("url", url) \
      .option("dbtable", tabela) \
      .option("user", DB_USER) \
      .option("password", DB_PASSWORD) \
      .option("driver", "org.postgresql.Driver") \
      .mode('overwrite') \
      .save()

salvar(df_data, "dim_data")
salvar(df_localidade, "dim_localidade")
salvar(df_sexo, "dim_sexo")
salvar(df_faixa_etaria, "dim_faixa_etaria")
salvar(df_tipo_cancer, "dim_tipo_cancer")
salvar(df_metrica, "dim_metrica")
salvar(df_fato_clima, "fato_clima")
salvar(df_fato_cancer, "fato_cancer")

25/06/26 22:12:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:12:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:12:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:12:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:12:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 22:12:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/26 2

In [None]:
def executa_sql_postgres(sql_texto):
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        port=DB_PORT
    )
    try:
        with conn:
            with conn.cursor() as cur:
                cur.execute(sql_texto)
    finally:
        conn.close()

# Criação das Views que serão utilizadas nas consultas

sql_dim_ano = """
CREATE OR REPLACE VIEW vw_ano AS
SELECT
    ROW_NUMBER() OVER (ORDER BY data_ano) AS ano_pk,
    data_ano AS cancer_ano,
    data_decada AS cancer_decada
FROM (
    SELECT DISTINCT
        data_ano,
        data_decada
    FROM
        dim_data
) AS sub;
"""

sql_dim_dia = """
CREATE OR REPLACE VIEW vw_dia AS
SELECT
    data_pk,
    data_completa AS clima_data_completa,
    data_dia AS clima_dia,
    data_mes AS clima_mes,
    data_ano AS clima_ano,
    data_decada AS clima_decada
FROM dim_data;
"""

sql_dim_estado = """
CREATE OR REPLACE VIEW vw_estado AS
SELECT
    ROW_NUMBER() OVER (ORDER BY estado) AS estado_pk,
    estado AS cancer_estado,
    regiao AS cancer_regiao,
    pais AS cancer_pais
FROM (
    SELECT DISTINCT
        estado,
        regiao,
        pais
    FROM
        dim_localidade
) AS sub;
"""

sql_dim_cidade = """
CREATE OR REPLACE VIEW vw_cidade AS
SELECT
    localidade_pk,
    latitude,
    longitude,
    cidade AS clima_cidade,
    estado AS clima_estado,
    regiao AS clima_regiao,
    pais AS clima_pais
FROM dim_localidade;
"""

executa_sql_postgres(sql_dim_ano)
executa_sql_postgres(sql_dim_dia)
executa_sql_postgres(sql_dim_cidade)
executa_sql_postgres(sql_dim_estado)