In [0]:

from pyspark.sql.functions import lit
import json
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType
from delta import DeltaTable
import sys

# Make the project's shared helpers importable. The library is looked up in the
# workspace folder of the user named by the "account" widget.
# (Alternative kept from the original for relative runs: sys.path.append("../lib/"))
lib_account = dbutils.widgets.get("account")
sys.path.append(f'/Workspace/Users/{lib_account}/music_data_lake/src/lib/')
from utils import get_schema, table_exists


In [0]:
# Target-table coordinates and notebook parameters.
catalog = "bronze"
schema = "music_data"
tablename = dbutils.widgets.get("tablename")
table_schema = tablename  # the schema entry is looked up under the table's name
primary_key = dbutils.widgets.get("primary_key")
timestamp_field = "ts_ms"

# The full load runs only once: when the bronze table does not exist yet.
bronze_exists = table_exists(spark, catalog=catalog, schema=schema, table=tablename)

if bronze_exists:
    print(f"Tabela {tablename} já existente, ignorando full-load.")
else:
    print("Tabela não existente, criando...")

    full_load_path = f'/Volumes/raw/music_data/full_load/{tablename}/'
    df_full = spark.read.format("parquet").load(full_load_path)

    # coalesce(1) collapses the data to a single partition before writing.
    full_load_writer = df_full.coalesce(1).write.format("delta").mode("overwrite")
    full_load_writer.saveAsTable(f'{catalog}.{schema}.{tablename}')

In [0]:
# Load the expected schema for the CDC files.
#
# The workspace user comes from the "account" widget (the same widget used to
# locate the shared lib), so the notebook is not tied to one person's folder
# and no e-mail address is hard-coded in the source.
schema_account = dbutils.widgets.get("account")
schemas_path = (
    f"/Workspace/Users/{schema_account}/music_data_lake/src/{catalog}/music_data_schemas.json"
)

with open(schemas_path, "r", encoding="utf-8") as file:
    schema_data = json.load(file)

try:
    cdc_schema = get_schema(schema_name=table_schema, schema_json=schema_data)
except ValueError as e:
    # Best-effort lookup: report the problem but keep the notebook running.
    # The name is always (re)bound so a failed lookup can never leave a stale
    # schema from a previous run of this cell in the kernel.
    cdc_schema = None
    print(e)

In [0]:
# Location of the CDC Parquet files for this table.
cdc_path = f"/Volumes/raw/music_data/cdc/postgres.public.{tablename}/"

df_cdc = spark.read.format("parquet").load(cdc_path)


In [0]:
# Keep only the most recent CDC event per primary key.
if df_cdc.isEmpty():
    # Nothing is transformed here; df_cdc stays as the (empty) raw read.
    print("Nenhum dado encontrado para processar. Finalizando o fluxo.")
else:
    # A session-scoped temp view is used on purpose: a *global* temp view is
    # registered under the `global_temp` database, so the unqualified name in
    # the query below would not resolve to it.
    cdc_view = f"view_{tablename}"
    df_cdc.createOrReplaceTempView(cdc_view)

    # Rows without a key are dropped; for each key the row with the highest
    # timestamp_field value wins.
    query = f"""
        SELECT *
        FROM {cdc_view}
        WHERE {primary_key} IS NOT NULL
        QUALIFY ROW_NUMBER() OVER (PARTITION BY {primary_key} ORDER BY {timestamp_field} DESC) = 1

    """

    df_cdc = spark.sql(query)

In [0]:
# Apply the CDC events to the bronze Delta table in a single MERGE
# (delete + upsert), joining on the primary key.
bronze = DeltaTable.forName(spark, f"{catalog}.{schema}.{tablename}")

(bronze.alias("target")
        .merge(df_cdc.alias("source"), f"target.{primary_key} = source.{primary_key}")
        # Matched clauses are evaluated in order, so deletes are handled first.
        .whenMatchedDelete(condition = "source.operation = 'd'")
        # A matched 'c' happens when a key is deleted and re-created between two
        # runs (the latest event is the create): the existing row must be
        # refreshed too, otherwise the target keeps the pre-delete values.
        .whenMatchedUpdateAll(condition = "source.operation IN ('c', 'u')")
        # Keys not present yet are inserted for creates and updates; a delete
        # for an unknown key is ignored.
        .whenNotMatchedInsertAll(condition = "source.operation IN ('c', 'u')")
        .execute()
)