In [0]:
from pyspark.sql.functions import lit
import json
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType
from delta import DeltaTable
import sys

sys.path.insert(0, "../lib/")

import utils

In [0]:
# Bootstrap the bronze table from the full-load snapshot: read the Parquet
# data under the full-load volume and overwrite the Delta table with it.
full_load_path = "/Volumes/raw/music_data/full_load/tracks/"
df_full = spark.read.parquet(full_load_path)

# coalesce(1) collapses the frame to a single partition before the write.
bronze_writer = df_full.coalesce(1).write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("bronze.music_data.tracks")

# Show the snapshot that was just loaded.
df_full.display()

In [0]:
# Look up the expected schema for the CDC files in the shared JSON schema catalog.
schema_name = "tracks"
schema_catalog_path = "/Workspace/Users/mydatabrickstestacc@gmail.com/music_data_lake/src/bronze/music_data_schemas.json"

# Decode the catalog as UTF-8 explicitly instead of relying on the platform
# default encoding.
with open(schema_catalog_path, "r", encoding="utf-8") as schema_file:
    schema_data = json.load(schema_file)

# Bind cdc_schema before the lookup so the name always exists: previously a
# ValueError left it undefined and any later reference raised NameError.
cdc_schema = None
try:
    cdc_schema = utils.get_schema(schema_name=schema_name, schema_json=schema_data)
except ValueError as e:
    # Best-effort lookup: report the problem and continue with cdc_schema = None.
    # NOTE(review): presumably get_schema raises ValueError for an unknown
    # schema name -- confirm against lib/utils.
    print(e)

In [0]:
# Locations of the CDC Parquet files, one folder per change operation.
inserts_path = "/Volumes/raw/music_data/cdc/postgres.public.tracks/inserts/"
updates_path = "/Volumes/raw/music_data/cdc/postgres.public.tracks/updates/"
deletes_path = "/Volumes/raw/music_data/cdc/postgres.public.tracks/deletes/"


def _read_cdc_changes(path, operation):
    """Read the Parquet data under `path` and tag every row with `operation`.

    Args:
        path: Folder containing the Parquet files for one change type.
        operation: Literal stored in the added `operation` column.

    Returns:
        The loaded DataFrame with the extra constant `operation` column.
    """
    changes = spark.read.parquet(path)
    return changes.withColumn("operation", lit(operation))


df_cdc_inserts = _read_cdc_changes(inserts_path, "insert")
df_cdc_updates = _read_cdc_changes(updates_path, "update")
df_cdc_deletes = _read_cdc_changes(deletes_path, "delete")

In [0]:
# Preview the insert change records, including their `operation` tag column.
df_cdc_inserts.display()

In [0]:
# Preview the update change records, including their `operation` tag column.
df_cdc_updates.display()

In [0]:
# Preview the delete change records, including their `operation` tag column.
df_cdc_deletes.display()

In [0]:
# Stack the insert, update and delete change sets into one frame.
# unionByName aligns columns by name; plain union() aligns them by position,
# which would silently mix up columns if the three Parquet folders were
# written with different column orders.
df_cdc_union = df_cdc_inserts.unionByName(df_cdc_updates).unionByName(df_cdc_deletes)
df_cdc_union.display()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

if df_cdc_union.isEmpty():
    print("Nenhum dado encontrado para processar. Finalizando o fluxo.")

# Keep only the most recent change (highest ts_ms) per track_id.
# The window is partitioned by track_id alone because the downstream MERGE
# joins on track_id: partitioning by operation (or artist_id) as well would
# leave several source rows for one target row when a track is, for example,
# updated and then deleted in the same batch, which makes the MERGE fail or
# apply a stale change.
window_spec = Window.partitionBy("track_id").orderBy(desc("ts_ms"))

# Built unconditionally so the name is always bound: on an empty input this is
# simply an empty frame, instead of an undefined variable that breaks the
# cells that display and merge it.
deduplicated_cdc_df = (
    df_cdc_union
    .withColumn("row_number", row_number().over(window_spec))
    .filter("row_number = 1")
    .drop("row_number")
)

In [0]:
# Preview the deduplicated change set that is used as the MERGE source.
deduplicated_cdc_df.display()

In [0]:
# Handle to the bronze Delta table that receives the CDC changes.
bronze = DeltaTable.forName(spark, "bronze.music_data.tracks")

# Source and target rows are paired on track_id; the `operation` tag on each
# source row decides which MERGE clause applies.
match_on_track = "target.track_id = source.track_id"
not_a_delete = "source.operation != 'delete'"
is_a_delete = "source.operation = 'delete'"

# Single MERGE that performs upserts and deletes together.
cdc_merge = bronze.alias("target").merge(deduplicated_cdc_df.alias("source"), match_on_track)
# Matched and not a delete: update the target row from the source row.
cdc_merge = cdc_merge.whenMatchedUpdateAll(condition=not_a_delete)
# Unmatched and not a delete: insert the source row.
cdc_merge = cdc_merge.whenNotMatchedInsertAll(condition=not_a_delete)
# Matched delete: remove the target row.
cdc_merge = cdc_merge.whenMatchedDelete(condition=is_a_delete)
cdc_merge.execute()

In [0]:
%sql
-- Spot-check three specific track ids in the current state of the bronze table.
SELECT *
FROM bronze.music_data.tracks
WHERE track_id IN (3234505031, 3234505033, 3234505034)

In [0]:
%sql
-- Same spot-check against table version 2 (Delta time travel).
-- NOTE(review): the version number is hard-coded; confirm it still points at
-- the intended snapshot after the table has been rewritten.
SELECT *
FROM bronze.music_data.tracks VERSION AS OF 2
WHERE track_id IN (3234505031, 3234505033, 3234505034)