In [0]:

from pyspark.sql.functions import lit
import json
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType
from delta import DeltaTable
import sys

sys.path.insert(0, "../lib/")
import utils


In [0]:
# Read the full-load snapshot of the tracks table from the raw volume.
df_full = (
    spark.read
    .format("parquet")
    .load("/Volumes/raw/music_data/full_load/tracks/")
)

# Overwrite the bronze Delta table with the snapshot. coalesce(1) reduces the
# DataFrame to a single partition before the write.
full_load_writer = df_full.coalesce(1).write.format("delta").mode("overwrite")
full_load_writer.saveAsTable("bronze.music_data.tracks")

df_full.display()

In [0]:
# Define the expected schema for the CDC files, looked up by name in the
# project's JSON schema file.
schema_name = "tracks"
schema_file_path = "/Workspace/Users/mydatabrickstestacc@gmail.com/music_data_lake/src/bronze/music_data_schemas.json"
with open(schema_file_path, "r") as schema_file:
    schema_data = json.load(schema_file)

# NOTE(review): a ValueError from utils.get_schema is only printed, so on
# failure `cdc_schema` stays undefined while the notebook keeps running.
# NOTE(review): `cdc_schema` is not referenced by the later cells visible in
# this notebook - confirm whether the CDC read is supposed to apply it.
try:
    cdc_schema = utils.get_schema(schema_name=schema_name, schema_json=schema_data)
except ValueError as schema_error:
    print(schema_error)

In [0]:
# Location of the CDC Parquet files for the tracks table.
cdc_source_path = "/Volumes/raw/music_data/cdc/postgres.public.tracks/"

df_cdc = spark.read.format("parquet").load(cdc_source_path)
df_cdc.display()

In [0]:
# Imports are hoisted out of the conditional branch so they are always executed
# regardless of which path the cell takes.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc, col

# Reduce the CDC batch to the most recent change record per track_id.
#
# `cdc_df_recents` is bound on BOTH branches: the display() call below and the
# downstream MERGE cell reference it unconditionally, so leaving it undefined
# when the batch is empty would raise NameError.
if df_cdc.isEmpty():
    print("Nenhum dado encontrado para processar. Finalizando o fluxo.")
    # An empty batch has nothing to deduplicate; pass it through unchanged so
    # later cells operate on an empty source instead of an undefined name.
    cdc_df_recents = df_cdc
else:
    # Deduplication window: one partition per track_id, newest ts_ms first.
    # NOTE(review): rows sharing the same track_id and ts_ms are ordered
    # arbitrarily by row_number(); add a tie-breaker column if one exists.
    window_spec = Window.partitionBy("track_id").orderBy(desc("ts_ms"))

    cdc_df_recents = (
        df_cdc
        # Rows without a key can never match the target table; dropping them
        # before the window yields the same result as dropping them after.
        .filter(col("track_id").isNotNull())
        .withColumn("row_number", row_number().over(window_spec))
        # Keep only the latest record of each track_id.
        .filter("row_number = 1")
        .drop("row_number")
    )

cdc_df_recents.display()

In [0]:
# Handle to the bronze Delta table that receives the CDC changes.
bronze = DeltaTable.forName(spark, "bronze.music_data.tracks")

# Build one MERGE keyed on track_id that combines delete, update and insert.
merge_builder = bronze.alias("target").merge(
    cdc_df_recents.alias("source"),
    "target.track_id = source.track_id",
)

# Matched clauses are kept in their original order: delete first, then update.
# NOTE(review): a matched row whose source operation is 'c' satisfies neither
# matched clause and is left unchanged in the target - confirm this is intended.
merge_builder = merge_builder.whenMatchedDelete(condition="source.operation = 'd'")
merge_builder = merge_builder.whenMatchedUpdateAll(condition="source.operation = 'u'")

# Keys missing from the target are inserted for both creates and updates.
merge_builder = merge_builder.whenNotMatchedInsertAll(
    condition="source.operation = 'c' OR source.operation = 'u'"
)

merge_builder.execute()