In [0]:
# Camada Silver - Cards - Magic: The Gathering
# Pipeline 100% PySpark DataFrame API
# Modularizado seguindo padrão Bronze

# =============================================================================
# BIBLIOTECAS UTILIZADAS
# =============================================================================
import logging
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import udf, year, month, initcap, upper, coalesce, when, lit, split, size, regexp_replace, trim, translate, md5, lower



In [0]:
# =============================================================================
# FUNÇÕES UTILITÁRIAS
# =============================================================================
def get_secret(secret_name, default_value=None):
    """Fetch a secret from the ``mtg-pipeline`` Databricks secret scope.

    Args:
        secret_name: Key of the secret inside the ``mtg-pipeline`` scope.
        default_value: Fallback returned when the lookup fails. ``None``
            (the default) marks the secret as mandatory.

    Returns:
        The secret value, or ``default_value`` when the lookup fails and a
        fallback was supplied.

    Raises:
        Exception: When the lookup fails and no ``default_value`` was given.
            The original lookup error is chained as ``__cause__``.
    """
    try:
        return dbutils.secrets.get(scope="mtg-pipeline", key=secret_name)
    except Exception as lookup_error:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the notebook instead of
        # being silently converted into "use the default value".
        if default_value is not None:
            print(f"Secret '{secret_name}' não encontrado, usando valor padrão")
            return default_value
        print(f"Secret obrigatório '{secret_name}' não encontrado")
        raise Exception(f"Required secret '{secret_name}' not configured") from lookup_error

def setup_unity_catalog(catalog, schema):
    """Create (if needed) and select the given catalog and schema.

    Runs, in order, ``CREATE CATALOG IF NOT EXISTS``, ``USE CATALOG``,
    ``CREATE SCHEMA IF NOT EXISTS`` and ``USE SCHEMA`` through ``spark.sql``.

    Args:
        catalog: Catalog name interpolated as-is into the SQL statements.
        schema: Schema name interpolated as-is into the SQL statements.

    Returns:
        ``True`` when every statement ran, ``False`` when any step raised
        (the error is printed, not re-raised).
    """
    try:
        for statement in (
            f"CREATE CATALOG IF NOT EXISTS {catalog}",
            f"USE CATALOG {catalog}",
            f"CREATE SCHEMA IF NOT EXISTS {schema}",
            f"USE SCHEMA {schema}",
        ):
            spark.sql(statement)
        print(f"Schema {catalog}.{schema} criado ou já existente.")
    except Exception as setup_error:
        print(f"Erro ao configurar Unity Catalog: {setup_error}")
        return False
    return True

# =============================================================================
# EXTRAÇÃO
# =============================================================================
def extract_from_bronze(catalog, table_name):
    """Load the Bronze cards table from the given catalog.

    Args:
        catalog: Catalog name used to qualify the source table.
        table_name: Not used by this function; the source is always
            ``<catalog>.bronze.TB_BRONZE_CARDS``.

    Returns:
        The result of ``spark.table`` for ``<catalog>.bronze.TB_BRONZE_CARDS``.
    """
    return spark.table(f"{catalog}.bronze.TB_BRONZE_CARDS")

# =============================================================================
# TRANSFORMAÇÃO
# =============================================================================
def _split_card_type(card_type):
    """Split a raw card type line into ``(main type, detail)``.

    * ``None`` -> ``(None, None)``
    * any text containing "planeswalker" (case-insensitive) ->
      ``("Planeswalker", <original text>)``
    * text containing an em dash -> the stripped parts before / after the
      first em dash
    * anything else -> ``(<stripped text>, "NA")``
    """
    if card_type is None:
        return (None, None)
    if "planeswalker" in card_type.lower():
        return ("Planeswalker", card_type)
    if "—" in card_type:
        main_type, detail = card_type.split("—", 1)
        return (main_type.strip(), detail.strip())
    return (card_type.strip(), "NA")


def transform_cards_silver(df):
    """Apply the Silver-layer cleaning rules to the Bronze cards DataFrame.

    Steps, in order: keep rows whose ``DT_INGESTION`` falls in the last five
    years, normalise name casing, fill nulls, split the card type, flatten
    array-like string columns, derive ``NME_COLOR_CATEGORY`` / ``QTY_COLORS``,
    rewrite mana symbols and strip accents in ``DESC_CARD``, add the
    ``ANO_PART`` / ``MES_PART`` partition columns and project the final column
    list.

    ``QTY_COLORS`` is the number of *distinct* colors (W, U, B, R, G) that
    appear in ``DESC_MANA_COST``; a repeated symbol such as ``{W}{W}`` counts
    once.

    Args:
        df: DataFrame exposing the Bronze card columns referenced below.

    Returns:
        A new DataFrame containing only the Silver columns.
    """
    # Temporal filter: last 5 years of ingestion.
    df = df.filter(col("DT_INGESTION") >= add_months(current_date(), -12*5))

    # Name standardisation (Title Case) and upper-cased set code.
    for name_column in ("NME_CARD", "NME_ARTIST", "NME_RARITY", "NME_SET"):
        df = df.withColumn(name_column, initcap(col(name_column)))
    df = df.withColumn("COD_SET", upper(col("COD_SET")))

    # Null handling: missing mana cost text becomes "0", numeric fields 0.
    df = df.withColumn(
        "DESC_MANA_COST",
        when(
            (col("DESC_MANA_COST").isNull()) | (col("DESC_MANA_COST") == "") | (col("DESC_MANA_COST") == "null"),
            lit("0"),
        ).otherwise(trim(col("DESC_MANA_COST"))),
    )
    df = df.withColumn("MANA_COST", coalesce(col("MANA_COST"), lit(0.0)))
    # Values that cannot be cast to int become null and then 0.
    df = df.withColumn("NME_POWER", coalesce(col("NME_POWER").cast("int"), lit(0)))
    df = df.withColumn("NME_TOUGHNESS", coalesce(col("NME_TOUGHNESS").cast("int"), lit(0)))

    # Derive NME_CARD_TYPE and DESC_CARD_TYPE from the raw type line.
    split_card_type_udf = udf(_split_card_type, StructType([
        StructField("NME_CARD_TYPE", StringType()),
        StructField("DESC_CARD_TYPE", StringType())
    ]))
    df = df.withColumn("_CARD_TYPE_STRUCT", split_card_type_udf(col("NME_CARD_TYPE")))
    df = df.withColumn("NME_CARD_TYPE", col("_CARD_TYPE_STRUCT.NME_CARD_TYPE"))
    # DESC_CARD_TYPE is derived here but is not part of the final projection.
    df = df.withColumn("DESC_CARD_TYPE", col("_CARD_TYPE_STRUCT.DESC_CARD_TYPE"))
    df = df.drop("_CARD_TYPE_STRUCT")

    # Flatten array/JSON-looking strings by removing brackets and double quotes.
    for array_column in ("NME_PRINTINGS", "COD_COLORS", "COD_COLOR_IDENTITY", "DESC_SUBTYPES"):
        df = df.withColumn(array_column, regexp_replace(col(array_column), r'\[|\]|"', ""))
    df = df.withColumn("NME_ORIGINAL_TYPE", coalesce(col("NME_ORIGINAL_TYPE"), lit("NA")))

    # COD_COLORS: empty / null means colorless. Brackets and quotes were
    # already stripped above, so an empty JSON array is just "" at this point.
    df = df.withColumn(
        "COD_COLORS",
        when(col("COD_COLORS").isNull() | (col("COD_COLORS") == ""), lit("Colorless")).otherwise(col("COD_COLORS"))
    )

    # NME_COLOR_CATEGORY from the number of comma-separated color codes.
    color_count = size(split(col("COD_COLORS"), ","))
    df = df.withColumn(
        "NME_COLOR_CATEGORY",
        when(col("COD_COLORS") == "Colorless", "Colorless")
        .when(color_count == 1, "Mono")
        .when(color_count == 2, "Dual Color")
        .when(color_count >= 3, "Multicolor")
        .otherwise("Mono")
    )

    # QTY_COLORS: number of DISTINCT colors in the mana cost. Each color adds
    # 1 when its letter occurs at least once, so "{W}{W}" yields 1 and costs
    # without any of W/U/B/R/G (e.g. "0", "{3}", "{X}") yield 0.
    mana_cost_upper = upper(col("DESC_MANA_COST"))
    distinct_colors = lit(0)
    for color_code in "WUBRG":
        distinct_colors = distinct_colors + mana_cost_upper.contains(color_code).cast("int")
    df = df.withColumn("QTY_COLORS", coalesce(distinct_colors, lit(0)))

    # DESC_TYPES / DESC_SUBTYPES: empty or null becomes "NA".
    # DESC_TYPES is normalised here but is not part of the final projection.
    df = df.withColumn("DESC_TYPES", when(col("DESC_TYPES").isNull() | (col("DESC_TYPES") == ""), lit("NA")).otherwise(col("DESC_TYPES")))
    df = df.withColumn(
        "DESC_SUBTYPES",
        when(col("DESC_SUBTYPES").isNull() | (col("DESC_SUBTYPES") == ""), lit("NA")).otherwise(col("DESC_SUBTYPES"))
    )

    # DESC_CARD: replace mana/action symbols with readable tags, strip accents.
    symbol_replacements = [
        (r"\{W\}", "[White]"),
        (r"\{U\}", "[Blue]"),
        (r"\{B\}", "[Black]"),
        (r"\{R\}", "[Red]"),
        (r"\{G\}", "[Green]"),
        (r"\{C\}", "[Colorless]"),
        (r"\{X\}", "[X]"),
        (r"\{T\}", "[Tap]"),
        (r"\{Q\}", "[Untap]"),
        (r"\{S\}", "[Snow]"),
        (r"\{E\}", "[Energy]"),
        (r"\{0\}", "[0]"),
        (r"\{1\}", "[1]"),
    ]
    desc_card_clean = col("DESC_CARD")
    for pattern, replacement in symbol_replacements:
        desc_card_clean = regexp_replace(desc_card_clean, pattern, replacement)
    desc_card_clean = translate(
        desc_card_clean,
        "áàãâäéèêëíìîïóòõôöúùûüçÁÀÃÂÄÉÈÊËÍÌÎÏÓÒÕÔÖÚÙÛÜÇ",
        "aaaaaeeeeiiiiooooouuuucAAAAAEEEEIIIIOOOOOUUUUC"
    )
    desc_card_clean = trim(desc_card_clean)
    df = df.withColumn("DESC_CARD", desc_card_clean)
    df = df.withColumn(
        "DESC_CARD",
        when(col("DESC_CARD").isNull() | (col("DESC_CARD") == "null"), lit("NA")).otherwise(col("DESC_CARD"))
    )

    # Partition columns, taken from DT_INGESTION before it is cast to date.
    df = df.withColumn("ANO_PART", year(col("DT_INGESTION")))
    df = df.withColumn("MES_PART", month(col("DT_INGESTION")))

    # Explicit type conversions.
    df = df.withColumn("DT_INGESTION", col("DT_INGESTION").cast("date"))
    df = df.withColumn("MANA_COST", col("MANA_COST").cast("int"))

    # Final column selection.
    colunas_finais = [
        "NME_PRINTINGS", "NME_ORIGINAL_TYPE", "ID_CARD", "DT_INGESTION", "NME_SOURCE", "NME_ENDPOINT",
        "RELEASE_YEAR", "RELEASE_MONTH", "NME_CARD", "DESC_MANA_COST", "MANA_COST", "COD_COLORS",
        "COD_COLOR_IDENTITY", "NME_CARD_TYPE", "DESC_SUBTYPES", "NME_RARITY", "COD_SET", "NME_SET",
        "DESC_CARD", "NME_ARTIST", "COD_NUMBER", "NME_POWER", "NME_TOUGHNESS", "NME_LAYOUT",
        "URL_IMAGE", "NME_COLOR_CATEGORY", "QTY_COLORS", "ANO_PART", "MES_PART"
    ]
    return df.select(*colunas_finais)

# =============================================================================
# CARGA (WRITE/MERGE)
# =============================================================================
def delta_table_exists_and_schema_ok(spark, delta_path, df_final):
    """Check whether a Delta table at ``delta_path`` can receive ``df_final``.

    The check compares column *names* only (order and data types are not
    compared).

    Args:
        spark: Spark session used to open the table.
        delta_path: Storage path of the Delta table.
        df_final: DataFrame that is about to be written.

    Returns:
        ``(True, delta_table)`` when a Delta table exists at ``delta_path`` and
        has exactly the same column names as ``df_final``; ``(False, None)``
        when there is no Delta table at the path or the column names differ.

    Raises:
        Any error raised while opening or reading the existing table. These
        are deliberately not swallowed: the caller reacts to ``(False, None)``
        by overwriting the table, so an unrelated failure must not be reported
        as "table does not exist".
    """
    if not DeltaTable.isDeltaTable(spark, delta_path):
        return False, None

    delta_table = DeltaTable.forPath(spark, delta_path)
    current_columns = {field.name for field in delta_table.toDF().schema.fields}
    incoming_columns = {field.name for field in df_final.schema.fields}
    if current_columns != incoming_columns:
        return False, None
    return True, delta_table

def _silver_cards_create_table_sql(full_table_name, delta_path):
    """Build the CREATE TABLE statement that registers the Silver cards table."""
    return f'''
        CREATE TABLE {full_table_name} (
            NME_PRINTINGS STRING,
            NME_ORIGINAL_TYPE STRING,
            ID_CARD STRING,
            DT_INGESTION DATE,
            NME_SOURCE STRING,
            NME_ENDPOINT STRING,
            RELEASE_YEAR INT,
            RELEASE_MONTH INT,
            NME_CARD STRING,
            DESC_MANA_COST STRING,
            MANA_COST INT,
            COD_COLORS STRING,
            COD_COLOR_IDENTITY STRING,
            NME_CARD_TYPE STRING,
            DESC_SUBTYPES STRING,
            NME_RARITY STRING,
            COD_SET STRING,
            NME_SET STRING,
            DESC_CARD STRING,
            NME_ARTIST STRING,
            COD_NUMBER STRING,
            NME_POWER INT,
            NME_TOUGHNESS INT,
            NME_LAYOUT STRING,
            URL_IMAGE STRING,
            NME_COLOR_CATEGORY STRING,
            QTY_COLORS INT,
            ANO_PART INT,
            MES_PART INT
        )
        USING DELTA
        PARTITIONED BY (ANO_PART, MES_PART)
        LOCATION '{delta_path}'
    '''


def _print_last_merge_metrics(delta_table):
    """Print the metrics Delta recorded for the latest commit when it is a MERGE."""
    try:
        latest = delta_table.history(1).select("version", "operation", "operationMetrics").collect()
    except Exception as history_error:
        # Metrics are informational only; never fail a finished load over them.
        print(f"Não foi possível ler as métricas do merge: {history_error}")
        return
    if not latest or latest[0]["operation"] != "MERGE":
        return
    metrics = latest[0]["operationMetrics"] or {}
    print(f"version: {latest[0]['version']}")
    for metric_name in (
        "numTargetRowsInserted", "numTargetRowsUpdated", "numTargetRowsDeleted",
        "numTargetRowsCopied", "numSourceRows", "numOutputRows"
    ):
        if metric_name in metrics:
            print(f"{metric_name}: {metrics[metric_name]}")


def load_to_silver_unity_incremental(df_final, catalog, schema, table_name, s3_silver_path):
    """Write ``df_final`` to the Silver Delta location and register it in Unity Catalog.

    * When no Delta table exists at the target path, or its column names
      differ from ``df_final``, the data is written with ``overwrite`` and
      ``overwriteSchema``, partitioned by ``ANO_PART`` / ``MES_PART``.
    * Otherwise the data is merged into the existing table on ``ID_CARD``
      (matched rows are updated, unmatched rows are inserted).

    ``df_final`` is de-duplicated on ``ID_CARD`` before either path so that the
    merge key is unique in the target regardless of which path runs. Which of
    several duplicate rows is kept is not specified.

    Afterwards the schema ``<catalog>.<schema>`` is created if missing and the
    table ``<catalog>.<schema>.<table_name>`` is created over the Delta path,
    or dropped and re-created when its registered columns (name + type) differ
    from ``df_final``. Failures in this registration step are printed and not
    re-raised.

    Args:
        df_final: DataFrame to persist; must contain ``ID_CARD``, ``ANO_PART``
            and ``MES_PART``.
        catalog: Unity Catalog catalog name.
        schema: Schema name inside the catalog.
        table_name: Table name; also the last segment of the storage path.
        s3_silver_path: Bucket and prefix (without the ``s3://`` scheme).

    Raises:
        Exception: Any error raised by the Delta write or merge.
    """
    delta_path = f"s3://{s3_silver_path}/{table_name}"
    full_table_name = f"{catalog}.{schema}.{table_name}"

    # The merge below is keyed on ID_CARD, so keep one row per key for the
    # initial/overwrite load as well, not only for the merge.
    df_final = df_final.dropDuplicates(["ID_CARD"])

    print(f"Salvando dados em: {delta_path}")
    print("Qtd linhas df_final:", df_final.count())
    print("Colunas df_final:", df_final.columns)
    print("delta_path:", delta_path)

    exists, delta_table = delta_table_exists_and_schema_ok(spark, delta_path, df_final)
    if not exists:
        print("Tabela Delta não existe ou schema mudou. Salvando com overwrite e overwriteSchema=True.")
        try:
            df_final.write.format("delta") \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .partitionBy("ANO_PART", "MES_PART") \
                .save(delta_path)
            print("Write Delta concluído com sucesso!")
        except Exception as e:
            print("Erro no write Delta:", e)
            raise
    else:
        print("Tabela Delta já existe e schema é igual. Executando merge incremental por ID_CARD.")
        count_antes = delta_table.toDF().count()
        update_cols = [c for c in df_final.columns if c != "ID_CARD"]
        # Loop variable is not called ``col`` to avoid confusion with the
        # PySpark ``col`` function.
        set_expr = {column_name: f"novo.{column_name}" for column_name in update_cols}
        delta_table.alias("silver").merge(
            df_final.alias("novo"),
            "silver.ID_CARD = novo.ID_CARD"
        ).whenMatchedUpdate(set=set_expr) \
         .whenNotMatchedInsertAll() \
         .execute()
        count_depois = delta_table.toDF().count()
        print(f"Linhas antes do merge: {count_antes}")
        print(f"Linhas depois do merge: {count_depois}")
        print(f"Linhas adicionadas (diferença): {count_depois - count_antes}")
        # Merge statistics come from the Delta transaction log: the previous
        # code looked for attributes on the value returned by ``execute()``.
        print("Estatísticas do merge incremental:")
        _print_last_merge_metrics(delta_table)
    print("Dados salvos com sucesso na camada Silver!")

    # Create / refresh the Unity Catalog registration.
    try:
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
        print(f"Schema {catalog}.{schema} criado ou já existente.")
    except Exception as e:
        print(f"Erro ao criar schema: {e}")
    try:
        create_table_sql = _silver_cards_create_table_sql(full_table_name, delta_path)

        def schema_to_set(table_schema):
            # Compare column name and type only (ignores order and nullability).
            return set((f.name.lower(), str(f.dataType).lower()) for f in table_schema.fields)

        if spark.catalog.tableExists(full_table_name):
            existing_schema = spark.table(full_table_name).schema
            if schema_to_set(existing_schema) == schema_to_set(df_final.schema):
                print(f"Tabela {full_table_name} já existe e schema é igual. Não será recriada.")
            else:
                print(f"Tabela {full_table_name} existe mas schema é diferente. Será recriada.")
                spark.sql(f"DROP TABLE IF EXISTS {full_table_name}")
                spark.sql(create_table_sql)
                print(f"Tabela Unity Catalog criada com particionamento explícito: {full_table_name}")
        else:
            print(f"Tabela {full_table_name} não existe. Será criada.")
            spark.sql(create_table_sql)
            print(f"Tabela Unity Catalog criada com particionamento explícito: {full_table_name}")
    except AnalysisException as e:
        print(f"Erro ao criar tabela no Unity Catalog: {e}")




In [0]:
# =============================================================================
# PIPELINE PRINCIPAL
# =============================================================================

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pipeline configuration. Catalog and bucket are mandatory secrets; the Silver
# prefix falls back to a default when the secret is not set.
CATALOG_NAME = get_secret("catalog_name")
SCHEMA_NAME = "silver"
TABLE_NAME = "TB_FATO_SILVER_CARDS"
S3_BUCKET = get_secret("s3_bucket")
S3_SILVER_PREFIX = get_secret("s3_silver_prefix", "magic_the_gathering/silver")
S3_SILVER_PATH = f"{S3_BUCKET}/{S3_SILVER_PREFIX}"

# setup_unity_catalog reports failure through its return value; surface it in
# the log instead of discarding it. The run continues because the steps below
# address tables by their fully qualified names.
if not setup_unity_catalog(CATALOG_NAME, SCHEMA_NAME):
    logger.warning(
        "Falha ao configurar o Unity Catalog para %s.%s; prosseguindo com nomes totalmente qualificados.",
        CATALOG_NAME,
        SCHEMA_NAME,
    )

# Extract -> transform -> load. extract_from_bronze ignores its table_name
# argument and always reads <catalog>.bronze.TB_BRONZE_CARDS.
df_bronze = extract_from_bronze(CATALOG_NAME, TABLE_NAME)
df_final = transform_cards_silver(df_bronze)
load_to_silver_unity_incremental(df_final, CATALOG_NAME, SCHEMA_NAME, TABLE_NAME, S3_SILVER_PATH)