In [0]:
# Camada Silver - CardPrices - Magic: The Gathering
# Pipeline 100% PySpark DataFrame API
# Modularizado seguindo padrão Bronze/Silver

# =============================================================================
# BIBLIOTECAS UTILIZADAS
# =============================================================================
import logging
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException





In [0]:
# =============================================================================
# FUNÇÕES UTILITÁRIAS
# =============================================================================
def get_secret(secret_name, default_value=None):
    """Read a secret from the ``mtg-pipeline`` Databricks secret scope.

    Args:
        secret_name: Key of the secret inside the scope.
        default_value: Value returned when the lookup fails. Leaving it as
            ``None`` marks the secret as required.

    Returns:
        The secret value, or ``default_value`` when the lookup fails and a
        default was supplied.

    Raises:
        Exception: When the lookup fails and no default was supplied. The
            underlying error is attached as ``__cause__``.
    """
    try:
        return dbutils.secrets.get(scope="mtg-pipeline", key=secret_name)
    except Exception as exc:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit are not mistaken for a missing secret.
        if default_value is not None:
            print(f"Secret '{secret_name}' não encontrado, usando valor padrão")
            return default_value
        print(f"Secret obrigatório '{secret_name}' não encontrado")
        # Chain the original error so the real reason stays in the traceback.
        raise Exception(f"Required secret '{secret_name}' not configured") from exc

def setup_unity_catalog(catalog, schema):
    """Create (if missing) and select the given catalog and schema.

    Issues four SQL statements through the global ``spark`` session, in order:
    create catalog, use catalog, create schema, use schema.

    Args:
        catalog: Catalog name interpolated into the SQL statements.
        schema: Schema name interpolated into the SQL statements.

    Returns:
        ``True`` when every statement ran, ``False`` if any step raised (the
        error is printed, not re-raised).
    """
    try:
        statements = (
            f"CREATE CATALOG IF NOT EXISTS {catalog}",
            f"USE CATALOG {catalog}",
            f"CREATE SCHEMA IF NOT EXISTS {schema}",
            f"USE SCHEMA {schema}",
        )
        for statement in statements:
            spark.sql(statement)
        print(f"Schema {catalog}.{schema} criado ou já existente.")
        return True
    except Exception as error:
        print(f"Erro ao configurar Unity Catalog: {error}")
        return False

# =============================================================================
# EXTRAÇÃO
# =============================================================================
def extract_from_bronze(catalog, table_name):
    """Load a table from the ``bronze`` schema of ``catalog``.

    Args:
        catalog: Catalog that holds the ``bronze`` schema.
        table_name: Name of the bronze table. An empty/``None`` value falls
            back to ``TB_BRONZE_CARDPRICES``, the name this function used to
            hard-code.

    Returns:
        Whatever ``spark.table`` returns for ``<catalog>.bronze.<table_name>``.
    """
    # The parameter used to be ignored in favour of a hard-coded literal.
    resolved_name = table_name or "TB_BRONZE_CARDPRICES"
    bronze_table = f"{catalog}.bronze.{resolved_name}"
    return spark.table(bronze_table)

# =============================================================================
# TRANSFORMAÇÃO
# =============================================================================
def transform_cardprices_silver(df):
    """Apply the Silver-layer cleaning rules to the bronze card-prices frame.

    Steps, in order:
      1. keep rows whose ``DT_INGESTION`` is not older than 60 months;
      2. title-case ``NME_CARD`` / ``NME_RARITY`` and upper-case ``COD_SET``;
      3. for each ``VLR_*`` price column that exists: cast to float, replace
         nulls with 0.0 and round to 2 decimals;
      4. cast ``DT_INGESTION`` to date;
      5. copy ``RELEASE_YEAR`` / ``RELEASE_MONTH`` into ``ANO_PART`` /
         ``MES_PART``;
      6. project onto the final column list, skipping absent columns.

    Args:
        df: Bronze DataFrame. Steps 1, 2, 4 and 5 reference their columns
            unconditionally, so those columns must be present.

    Returns:
        The transformed DataFrame.
    """
    # Time window: last 5 years.
    cutoff = add_months(current_date(), -12*5)
    recent = df.filter(col("DT_INGESTION") >= cutoff)

    # Name standardisation.
    cleaned = (
        recent
        .withColumn("NME_CARD", initcap(col("NME_CARD")))
        .withColumn("NME_RARITY", initcap(col("NME_RARITY")))
        .withColumn("COD_SET", upper(col("COD_SET")))
    )

    # Price columns: float, nulls -> 0.0, 2 decimals. `round` is the Spark
    # column function brought in by the notebook's wildcard import of
    # pyspark.sql.functions, not the Python builtin.
    price_columns = ("VLR_USD", "VLR_EUR", "VLR_TIX", "VLR_MARKET", "VLR_LOW", "VLR_HIGH")
    for price_column in price_columns:
        if price_column not in cleaned.columns:
            continue
        as_float = col(price_column).cast("float")
        cleaned = cleaned.withColumn(price_column, round(coalesce(as_float, lit(0.0)), 2))

    # Date cast plus partition columns.
    enriched = (
        cleaned
        .withColumn("DT_INGESTION", col("DT_INGESTION").cast("date"))
        .withColumn("ANO_PART", col("RELEASE_YEAR"))
        .withColumn("MES_PART", col("RELEASE_MONTH"))
    )

    # Final projection; columns missing from the frame are silently skipped.
    wanted_columns = [
        "ID_CARD", "NME_CARD", "VLR_USD", "VLR_EUR", "VLR_TIX",
        "DESC_PRICES", "VLR_MARKET", "VLR_LOW",
        "VLR_HIGH", "NME_PRICE_STATUS", "NME_RARITY", "COD_SET",
        "DT_INGESTION", "ANO_PART", "MES_PART"
    ]
    available = enriched.columns
    return enriched.select(*[name for name in wanted_columns if name in available])

# =============================================================================
# CARGA (WRITE/MERGE)
# =============================================================================
def delta_table_exists_and_schema_ok(spark, delta_path, df_final):
    """Check that a Delta table at ``delta_path`` has the same column names as ``df_final``.

    Only column *names* are compared (as sets); types and order are ignored.

    Args:
        spark: Session passed to ``DeltaTable.forPath``.
        delta_path: Storage path of the Delta table.
        df_final: Frame whose column names are compared with the table's.

    Returns:
        ``(True, delta_table)`` when the table loads and the name sets match;
        ``(False, None)`` when they differ or when anything raises while
        loading/inspecting the table.
    """
    try:
        delta_table = DeltaTable.forPath(spark, delta_path)
        existing_names = {field.name for field in delta_table.toDF().schema.fields}
        incoming_names = {field.name for field in df_final.schema.fields}
    except Exception:
        # Any failure is reported as "not usable", same as a schema mismatch.
        return False, None
    if existing_names == incoming_names:
        return True, delta_table
    return False, None

def _print_last_merge_metrics(delta_table):
    """Print the operation metrics of the table's most recent commit (best effort)."""
    try:
        latest = delta_table.history(1).select("operation", "operationMetrics").collect()
    except Exception as e:
        # Metrics are informational only; never fail the load because of them.
        print(f"Não foi possível obter as métricas do merge: {e}")
        return
    if not latest:
        return
    metrics = latest[0]["operationMetrics"] or {}
    for metric_name in sorted(metrics):
        print(f"{metric_name}: {metrics[metric_name]}")


def _schema_signature(schema):
    """Return the set of (lower-cased name, lower-cased type) pairs of a schema."""
    return set((f.name.lower(), str(f.dataType).lower()) for f in schema.fields)


def _register_external_delta_table(full_table_name, delta_path):
    """Register the Delta data at ``delta_path`` under ``full_table_name``."""
    # No column list and no PARTITIONED BY: both are taken from the existing
    # Delta data at the location. A hand-written column list that differs
    # from what was written there makes the statement fail.
    spark.sql(
        f"CREATE TABLE IF NOT EXISTS {full_table_name} "
        f"USING DELTA LOCATION '{delta_path}'"
    )
    print(f"Tabela Unity Catalog criada com particionamento: {full_table_name}")


def load_to_silver_unity_incremental(df_final, catalog, schema, table_name, s3_silver_path,
                                     merge_key="NME_CARD"):
    """Write ``df_final`` to the Silver Delta location and register it in the catalog.

    When no usable Delta table exists at the target path (missing, or column
    names differ) the data is written with ``overwrite`` +
    ``overwriteSchema``, partitioned by ``ANO_PART``/``MES_PART``. Otherwise
    the frame is de-duplicated on ``merge_key`` and merged into the table
    (update on match, insert otherwise). Afterwards the schema and table are
    created/re-created in the catalog if needed.

    Args:
        df_final: Transformed DataFrame to persist.
        catalog: Target catalog name.
        schema: Target schema name.
        table_name: Target table name (also the last path segment).
        s3_silver_path: ``bucket/prefix`` part of the ``s3://`` target path.
        merge_key: Column used to de-duplicate and to match rows in the merge.
            Defaults to ``"NME_CARD"``, the key this function always used.

    Raises:
        Exception: Errors from the initial overwrite are printed and re-raised.
            Errors from the merge propagate. Catalog errors are printed only.
    """
    delta_path = f"s3://{s3_silver_path}/{table_name}"
    full_table_name = f"{catalog}.{schema}.{table_name}"
    print(f"Salvando dados em: {delta_path}")
    print("Qtd linhas df_final:", df_final.count())
    print("Colunas df_final:", df_final.columns)
    print("delta_path:", delta_path)

    exists, delta_table = delta_table_exists_and_schema_ok(spark, delta_path, df_final)
    if not exists:
        print("Tabela Delta não existe ou schema mudou. Salvando com overwrite e overwriteSchema=True.")
        try:
            df_final.write.format("delta") \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .partitionBy("ANO_PART", "MES_PART") \
                .save(delta_path)
            print("Write Delta concluído com sucesso!")
        except Exception as e:
            print("Erro no write Delta:", e)
            raise
    else:
        print(f"Tabela Delta já existe e schema é igual. Executando merge incremental por {merge_key}.")
        count_antes = delta_table.toDF().count()
        # NOTE(review): dropDuplicates keeps an arbitrary row per key; if the
        # key is not unique per business entity, confirm this is intended.
        df_final = df_final.dropDuplicates([merge_key])
        update_cols = [c for c in df_final.columns if c != merge_key]
        set_expr = {name: f"novo.{name}" for name in update_cols}
        delta_table.alias("silver").merge(
            df_final.alias("novo"),
            f"silver.{merge_key} = novo.{merge_key}"
        ).whenMatchedUpdate(set=set_expr) \
         .whenNotMatchedInsertAll() \
         .execute()
        count_depois = delta_table.toDF().count()
        print(f"Linhas antes do merge: {count_antes}")
        print(f"Linhas depois do merge: {count_depois}")
        print(f"Linhas adicionadas (diferença): {count_depois - count_antes}")
        print("Estatísticas do merge incremental:")
        # Metrics come from the table history: the value returned by
        # execute() was previously probed for attributes it does not expose,
        # so nothing was ever printed.
        _print_last_merge_metrics(delta_table)
    print("Dados salvos com sucesso na camada Silver!")

    # Create/refresh the catalog objects (best effort: errors are printed).
    try:
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
        print(f"Schema {catalog}.{schema} criado ou já existente.")
    except Exception as e:
        print(f"Erro ao criar schema: {e}")
    try:
        if spark.catalog.tableExists(full_table_name):
            existing_schema = spark.table(full_table_name).schema
            # Compare name and type only (order and nullability are ignored).
            if _schema_signature(existing_schema) == _schema_signature(df_final.schema):
                print(f"Tabela {full_table_name} já existe e schema é igual. Não será recriada.")
            else:
                print(f"Tabela {full_table_name} existe mas schema é diferente. Será recriada.")
                spark.sql(f"DROP TABLE IF EXISTS {full_table_name}")
                _register_external_delta_table(full_table_name, delta_path)
        else:
            print(f"Tabela {full_table_name} não existe. Será criada.")
            _register_external_delta_table(full_table_name, delta_path)
    except AnalysisException as e:
        print(f"Erro ao criar tabela no Unity Catalog: {e}")

In [0]:
# =============================================================================
# PIPELINE PRINCIPAL
# =============================================================================

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration: required secrets raise when missing; the silver prefix has a default.
CATALOG_NAME = get_secret("catalog_name")
SCHEMA_NAME = "silver"
TABLE_NAME_BRONZE = "TB_BRONZE_CARDPRICES"
TABLE_NAME = "TB_FATO_SILVER_CARDPRICES"
S3_BUCKET = get_secret("s3_bucket")
S3_SILVER_PREFIX = get_secret("s3_silver_prefix", "magic_the_gathering/silver")
S3_SILVER_PATH = f"{S3_BUCKET}/{S3_SILVER_PREFIX}"

# setup_unity_catalog reports failure through its return value instead of
# raising; surface it rather than discarding it. The run still continues,
# as before, because the later steps use fully qualified table names.
if not setup_unity_catalog(CATALOG_NAME, SCHEMA_NAME):
    logger.warning(
        "setup_unity_catalog falhou para %s.%s; o pipeline continuará mesmo assim",
        CATALOG_NAME,
        SCHEMA_NAME,
    )

# Extract -> transform -> load.
df_bronze = extract_from_bronze(CATALOG_NAME, TABLE_NAME_BRONZE)
df_final = transform_cardprices_silver(df_bronze)
load_to_silver_unity_incremental(df_final, CATALOG_NAME, SCHEMA_NAME, TABLE_NAME, S3_SILVER_PATH)