In [0]:
# Camada Silver - Types - Magic: The Gathering
# Objetivo: Transformação e limpeza de dados da Bronze para Silver
# Características: Extract da Bronze, Transform com limpeza, Load na Silver com merge incremental

# =============================================================================
# BIBLIOTECAS UTILIZADAS
# =============================================================================
import logging
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *

# =============================================================================
# CONFIGURAÇÃO DE SEGREDOS
# =============================================================================
def get_secret(secret_name, default_value=None):
    """Read a secret from the ``mtg-pipeline`` Databricks secret scope.

    Args:
        secret_name: Key of the secret inside the ``mtg-pipeline`` scope.
        default_value: Value returned when the lookup fails. ``None`` (the
            default) marks the secret as mandatory.

    Returns:
        The value returned by ``dbutils.secrets.get``, or ``default_value``
        when the lookup fails and a non-``None`` default was supplied.

    Raises:
        Exception: When the lookup fails and no default was supplied. The
            underlying lookup error is attached as ``__cause__``.
    """
    try:
        return dbutils.secrets.get(scope="mtg-pipeline", key=secret_name)
    except Exception as lookup_error:
        # Only ``Exception`` subclasses are treated as "secret not found".
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit
        # and silently turn a cancelled run into "use the default value".
        if default_value is not None:
            print(f"Secret '{secret_name}' não encontrado, usando valor padrão")
            return default_value

        print(f"Secret obrigatório '{secret_name}' não encontrado")
        # Chain the original error so the real cause stays in the traceback.
        raise Exception(f"Secret '{secret_name}' não configurado") from lookup_error


 

In [0]:
# =============================================================================
# FUNÇÕES UTILITÁRIAS
# =============================================================================
def setup_unity_catalog(catalog_name, schema_name):
    """Create the catalog and schema if they are missing and make them current.

    Args:
        catalog_name: Name of the catalog to create/select.
        schema_name: Name of the schema to create/select inside that catalog.

    Returns:
        ``True`` when every statement ran, ``False`` if any step raised
        (the error is printed, not re-raised).
    """
    try:
        # Statements run strictly in this order: the schema statements rely
        # on the catalog having been selected first.
        ddl_statements = (
            f"CREATE CATALOG IF NOT EXISTS {catalog_name}",
            f"USE CATALOG {catalog_name}",
            f"CREATE SCHEMA IF NOT EXISTS {schema_name}",
            f"USE SCHEMA {schema_name}",
        )
        for statement in ddl_statements:
            spark.sql(statement)

        print(f"Schema {catalog_name}.{schema_name} criado ou já existente.")
        succeeded = True
    except Exception as e:
        print(f"Erro ao configurar Unity Catalog: {e}")
        succeeded = False
    return succeeded

def extract_from_bronze(catalog_name, table_name_bronze):
    """EXTRACT: read a table from the ``bronze`` schema of the given catalog.

    Args:
        catalog_name: Catalog that holds the ``bronze`` schema.
        table_name_bronze: Table name inside ``<catalog>.bronze``.

    Returns:
        The object returned by ``spark.table`` for
        ``<catalog>.bronze.<table>``, or ``None`` if reading or counting
        raised (the error is printed, not re-raised).
    """
    try:
        bronze_df = spark.table(f"{catalog_name}.bronze.{table_name_bronze}")
        # The count is only used for the progress message below.
        row_count = bronze_df.count()
        print(f"Extraídos {row_count} registros da Bronze")
    except Exception as e:
        print(f"Erro no EXTRACT da Bronze: {e}")
        return None
    return bronze_df

def transform_types_silver(df):
    """TRANSFORM: clean and normalise the Bronze "types" data for Silver.

    Steps, in order:
      1. Trim and title-case ``NME_TYPE`` and ``NME_SOURCE``.
      2. Replace null/empty ``NME_TYPE`` / ``NME_SOURCE`` with ``"NA"``.
      3. Replace null ``INGESTION_YEAR`` / ``INGESTION_MONTH`` with ``0``.
      4. Convert ``DT_INGESTION`` with ``to_timestamp``.
      5. Drop duplicate rows keyed on ``NME_TYPE``.
      6. Keep only the final Silver columns that are present.

    Args:
        df: Input DataFrame; a falsy value short-circuits to ``None``.

    Returns:
        The transformed DataFrame, or ``None`` when ``df`` is falsy or any
        step raised (the error is printed, not re-raised).
    """
    if not df:
        return None

    try:
        print("Iniciando transformações...")

        text_columns = ("NME_TYPE", "NME_SOURCE")
        numeric_columns = ("INGESTION_YEAR", "INGESTION_MONTH")

        # Trim surrounding whitespace and apply title case.
        for name in text_columns:
            df = df.withColumn(name, initcap(trim(col(name))))

        # Text columns: null or empty becomes the "NA" placeholder.
        for name in text_columns:
            is_blank = col(name).isNull() | (col(name) == "")
            df = df.withColumn(name, when(is_blank, lit("NA")).otherwise(trim(col(name))))

        # Numeric columns: null becomes 0.
        for name in numeric_columns:
            df = df.withColumn(name, coalesce(col(name), lit(0)))

        # Date conversion.
        df = df.withColumn("DT_INGESTION", to_timestamp(col("DT_INGESTION")))

        # De-duplicate on NME_TYPE; both counts feed the message below.
        rows_before = df.count()
        df = df.dropDuplicates(["NME_TYPE"])
        rows_after = df.count()
        print(f"Removidas {rows_before - rows_after} duplicatas baseadas em NME_TYPE")

        # Final projection, restricted to columns that exist in the frame.
        silver_columns = (
            "NME_TYPE",
            "NME_SOURCE",
            "DT_INGESTION",
            "INGESTION_YEAR",
            "INGESTION_MONTH",
        )
        present_columns = [name for name in silver_columns if name in df.columns]
        return df.select(*present_columns)

    except Exception as e:
        print(f"Erro na transformação: {e}")
        return None

def check_unity_table_exists(full_table_name):
    """Probe a table by running a one-row ``SELECT`` against it.

    Args:
        full_table_name: Table identifier interpolated into the probe query.

    Returns:
        ``True`` when the probe query runs, ``False`` when it raises for
        any reason (every failure is reported as "does not exist").
    """
    try:
        probe_query = f"SELECT 1 FROM {full_table_name} LIMIT 1"
        spark.sql(probe_query)
        print(f"Tabela Unity Catalog '{full_table_name}' existe")
        found = True
    except Exception:
        print(f"Tabela Unity Catalog '{full_table_name}' não existe")
        found = False
    return found

def _overwrite_silver_delta(df, delta_path):
    # Replace the Delta data at ``delta_path`` with ``df`` (schema included),
    # partitioned by ingestion year and month.
    (
        df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("INGESTION_YEAR", "INGESTION_MONTH")
        .save(delta_path)
    )


def load_to_silver_unity_incremental(df, catalog_name, schema_name, table_name, s3_silver_path):
    """LOAD: write ``df`` to the Silver Delta path and register the table.

    Write strategy:
      * No Delta table at the target path: full overwrite (initial load).
      * Delta table exists but its column set differs from ``df``: full
        overwrite with ``overwriteSchema``.
      * Delta table exists with the same column set: incremental MERGE keyed
        on ``NME_TYPE`` (update all on match, insert all otherwise).

    A failing MERGE or write is NOT retried as an overwrite. Falling back to
    an overwrite would replace the whole Silver table with only the incoming
    batch. Such failures reach the outer handler and the function returns
    ``None``.

    After the write, the table ``<catalog>.<schema>.<table>`` is created over
    the Delta location when ``check_unity_table_exists`` reports it missing.

    Args:
        df: DataFrame to load; ``None`` short-circuits to ``None``.
        catalog_name: Catalog of the target table.
        schema_name: Schema of the target table.
        table_name: Target table name, also the last segment of the path.
        s3_silver_path: ``<bucket>/<prefix>`` without the ``s3://`` scheme.

    Returns:
        ``df`` on success, ``None`` when ``df`` is ``None`` or any step
        raised (the error is printed, not re-raised).
    """
    if df is None:
        return None

    try:
        # Local import, as in the original code path that used DeltaTable.
        from delta.tables import DeltaTable

        delta_path = f"s3://{s3_silver_path}/{table_name}"
        full_table_name = f"{catalog_name}.{schema_name}.{table_name}"

        print(f"Salvando dados em: {delta_path}")
        print(f"Qtd linhas df_final: {df.count()}")
        print(f"Colunas df_final: {df.columns}")
        print(f"delta_path: {delta_path}")

        # Decide explicitly whether a Delta table already exists instead of
        # inferring it from "something in the read/merge block raised".
        if DeltaTable.isDeltaTable(spark, delta_path):
            existing_df = spark.read.format("delta").load(delta_path)
            existing_count = existing_df.count()
            print(f"Tabela Delta existe com {existing_count} registros")

            # Schema comparison is by column names only, order-insensitive.
            if set(existing_df.columns) != set(df.columns):
                print("Schema mudou. Salvando com overwrite e overwriteSchema=True.")
                _overwrite_silver_delta(df, delta_path)
            else:
                # Incremental merge keyed on NME_TYPE.
                delta_table = DeltaTable.forPath(spark, delta_path)
                delta_table.alias("silver").merge(
                    df.alias("novo"),
                    "silver.NME_TYPE = novo.NME_TYPE"
                ).whenMatchedUpdateAll() \
                 .whenNotMatchedInsertAll() \
                 .execute()
                print("Merge incremental executado com sucesso")
        else:
            print("Tabela Delta não existe. Salvando com overwrite e overwriteSchema=True.")
            _overwrite_silver_delta(df, delta_path)

        print("Write Delta concluído com sucesso!")

        # Create the Unity Catalog table over the Delta location if missing.
        table_exists = check_unity_table_exists(full_table_name)

        if not table_exists:
            print(f"Tabela {full_table_name} não existe. Será criada.")
            # Explicit schema so the table can declare its partitioning.
            spark.sql(f'''\
                CREATE TABLE {full_table_name} (
                    NME_TYPE STRING,
                    NME_SOURCE STRING,
                    DT_INGESTION TIMESTAMP,
                    INGESTION_YEAR INT,
                    INGESTION_MONTH INT
                )
                USING DELTA
                PARTITIONED BY (INGESTION_YEAR, INGESTION_MONTH)
                LOCATION '{delta_path}'
            ''')
        else:
            print(f"Tabela {full_table_name} já existe.")

        print("Dados salvos com sucesso na camada Silver!")
        return df

    except Exception as e:
        print(f"Erro no LOAD para Silver: {e}")
        return None



In [0]:
# =============================================================================
# PIPELINE PRINCIPAL
# =============================================================================

# Logging setup for this notebook run.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pipeline configuration: secrets come from the secret scope, the rest is
# fixed for this table. Only the S3 prefix has a fallback value.
CATALOG_NAME = get_secret(secret_name="catalog_name")
SCHEMA_NAME = "silver"
TABLE_NAME_BRONZE = "TB_BRONZE_TYPES"
TABLE_NAME = "TB_REF_SILVER_TYPES"
S3_BUCKET = get_secret(secret_name="s3_bucket")
S3_SILVER_PREFIX = get_secret(
    secret_name="s3_silver_prefix",
    default_value="magic_the_gathering/silver",
)
S3_SILVER_PATH = f"{S3_BUCKET}/{S3_SILVER_PREFIX}"

# Extract -> Transform -> Load. Each stage returns None on failure and the
# following stage passes that None through, so no stage raises here.
setup_unity_catalog(catalog_name=CATALOG_NAME, schema_name=SCHEMA_NAME)
df_bronze = extract_from_bronze(
    catalog_name=CATALOG_NAME,
    table_name_bronze=TABLE_NAME_BRONZE,
)
df_final = transform_types_silver(df=df_bronze)
load_to_silver_unity_incremental(
    df=df_final,
    catalog_name=CATALOG_NAME,
    schema_name=SCHEMA_NAME,
    table_name=TABLE_NAME,
    s3_silver_path=S3_SILVER_PATH,
)
