In [0]:
# Bronze layer - Sets - Magic: The Gathering (Unity Catalog + incremental merge)
# Purpose: EL (Extract & Load) process - extract data from staging and load it into bronze
# Flow: extract from staging (S3/Parquet) -> load into bronze (Unity Catalog/Delta) - INCREMENTAL
# Improvements: configurable time window (default 5 years), partitioning by ingestion year and month

# =============================================================================
# BIBLIOTECAS UTILIZADAS
# =============================================================================
import logging
from datetime import datetime, timedelta
from pyspark.sql.functions import *

# =============================================================================
# CONFIGURAÇÃO DE SEGREDOS
# =============================================================================
def get_secret(secret_name, default_value=None):
    """Fetch a secret from the ``mtg-pipeline`` Databricks secret scope.

    Args:
        secret_name: Key of the secret inside the scope.
        default_value: Fallback returned when the lookup fails. ``None``
            (the default) marks the secret as mandatory.

    Returns:
        The secret value, or ``default_value`` when the lookup fails and a
        fallback was supplied.

    Raises:
        Exception: If the lookup fails and no fallback was supplied. The
            original lookup error is chained as ``__cause__``.
    """
    try:
        return dbutils.secrets.get(scope="mtg-pipeline", key=secret_name)
    except Exception as exc:
        # Catch ``Exception`` only: a bare ``except`` would also trap
        # KeyboardInterrupt / SystemExit and turn a cancelled run into a
        # silently applied default value.
        if default_value is not None:
            print(f"Segredo '{secret_name}' não encontrado, usando valor padrão")
            return default_value
        print(f"Segredo obrigatório '{secret_name}' não encontrado")
        raise Exception(f"Segredo '{secret_name}' não configurado") from exc

# =============================================================================
# CONFIGURAÇÕES GLOBAIS
# =============================================================================
# Configure root logging at INFO level and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Unity Catalog target. The catalog name is a mandatory secret (no default).
CATALOG_NAME = get_secret("catalog_name")
SCHEMA_NAME = "bronze"
TABLE_NAME = "sets"

# S3 locations. The bucket is a mandatory secret; the prefixes fall back to
# the defaults given here when their secrets are missing.
S3_BUCKET = get_secret("s3_bucket")
S3_STAGE_PREFIX = get_secret("s3_stage_prefix", "magic_the_gathering/stage")
S3_BRONZE_PREFIX = get_secret("s3_bronze_prefix", "magic_the_gathering/bronze")
S3_STAGE_PATH = f"s3://{S3_BUCKET}/{S3_STAGE_PREFIX}"
# Unlike S3_STAGE_PATH, this value carries no "s3://" scheme; the functions in
# this notebook that use it prepend the scheme themselves.
S3_BRONZE_PATH = f"{S3_BUCKET}/{S3_BRONZE_PREFIX}"

# Temporal window settings: the cutoff is January 1st of
# (current year - YEARS_BACK). YEARS_BACK comes from the "years_back" secret
# and defaults to 5.
YEARS_BACK = int(get_secret("years_back", "5"))
current_year = datetime.now().year
cutoff_year = current_year - YEARS_BACK
CUTOFF_DATE = datetime(cutoff_year, 1, 1)
# ISO-formatted (YYYY-MM-DD) string form of the cutoff date.
CUTOFF_DATE_STR = CUTOFF_DATE.strftime("%Y-%m-%d")

print("Iniciando pipeline EL Unity + Incremental (MERGE) - SETS")
print(f"Filtro temporal: últimos {YEARS_BACK} anos (a partir de {CUTOFF_DATE_STR})")



In [0]:
# =============================================================================
# FUNÇÕES UTILITÁRIAS
# =============================================================================
def setup_unity_catalog():
    """Create (if needed) and select the target catalog and schema.

    Runs four SQL statements in order: create the catalog, switch to it,
    create the schema, switch to it.

    Returns:
        True when every statement succeeded; False when any of them raised
        (the error is printed).
    """
    try:
        bootstrap_statements = (
            f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}",
            f"USE CATALOG {CATALOG_NAME}",
            f"CREATE SCHEMA IF NOT EXISTS {SCHEMA_NAME}",
            f"USE SCHEMA {SCHEMA_NAME}",
        )
        for statement in bootstrap_statements:
            spark.sql(statement)
    except Exception as err:
        print(f"Erro ao configurar Unity Catalog: {err}")
        return False
    return True

def extract_from_staging(table_name):
    """EXTRACT step: read the staging Parquet files for ``table_name``.

    The files are matched with the glob ``<S3_STAGE_PATH>/*_<table_name>.parquet``.

    Returns:
        The DataFrame returned by ``spark.read.parquet``, or None when the
        read raised (the error is printed).
    """
    try:
        return spark.read.parquet(f"{S3_STAGE_PATH}/*_{table_name}.parquet")
    except Exception as err:
        print(f"Erro no EXTRACT de staging: {err}")
    return None

def apply_temporal_filter(df, table_name):
    """Restrict ``sets`` data to rows released on or after the cutoff date.

    For ``table_name == "sets"`` the frame is filtered with
    ``releaseDate >= CUTOFF_DATE_STR``; any other table name is returned
    untouched.

    Returns:
        The filtered DataFrame for ``sets``, otherwise ``df`` itself.
    """
    if table_name != "sets":
        return df

    # Keep only sets whose release date falls inside the configured window.
    release_cutoff = lit(CUTOFF_DATE_STR)
    recent_sets = df.filter(col("releaseDate") >= release_cutoff)
    print(f"Filtro temporal aplicado: sets dos últimos {YEARS_BACK} anos")
    return recent_sets

def _delta_log_has_files(delta_path):
    """Return True when ``<delta_path>/_delta_log`` can be listed and is non-empty."""
    try:
        return bool(dbutils.fs.ls(f"{delta_path}/_delta_log"))
    except Exception:
        # A failed listing is treated as "table not created yet".
        # NOTE(review): a transient or permission error also lands here and
        # sends the caller down the overwrite path - confirm this is acceptable.
        return False


def _register_unity_table(full_table_name, delta_path):
    """Register the Delta location in Unity Catalog and set its table properties.

    Errors from the CREATE statement propagate to the caller; a failure while
    setting the table properties is only reported.
    """
    # IF NOT EXISTS instead of DROP + CREATE: dropping the table on every run
    # discards its catalog-level grants and leaves a window in which readers
    # cannot find the table.
    spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {full_table_name}
    USING DELTA
    LOCATION '{delta_path}'
    COMMENT 'Tabela bronze de sets do Magic: The Gathering'
    """)
    print(f"Tabela Unity Catalog criada: {full_table_name}")
    try:
        # 'partitioning' reflects the partition columns actually written,
        # which are derived from ingestion_timestamp (not the release date).
        spark.sql(f"""
        ALTER TABLE {full_table_name} SET TBLPROPERTIES (
            'bronze_layer' = 'true',
            'data_source' = 'mtg_api',
            'processing_date' = '{datetime.now().strftime("%Y-%m-%d")}',
            'table_type' = 'bronze',
            'load_mode' = 'merge_incremental',
            'temporal_window_years' = '{YEARS_BACK}',
            'partitioning' = 'ingestion_year_month'
        )
        """)
    except Exception as e:
        # Properties are informational; report the failure instead of hiding it.
        print(f"Aviso: não foi possível definir TBLPROPERTIES em {full_table_name}: {e}")


def load_to_bronze_unity_merge(df, table_name):
    """LOAD step: write ``df`` to the bronze Delta location and register it.

    The frame is de-duplicated on ``code``, passed through
    ``apply_temporal_filter`` and, for ``sets``, given ``partition_year`` /
    ``partition_month`` columns derived from ``ingestion_timestamp``. If the
    Delta log at the target path has no files the data is written with
    ``overwrite``; otherwise it is merged on ``code`` (update all on match,
    insert all otherwise). The location is then registered as a Unity Catalog
    table.

    Args:
        df: Source DataFrame, or None when the extract step failed.
        table_name: Logical table name, used for the path and table identifier.

    Returns:
        The transformed DataFrame that was written, or None when ``df`` is
        None or the write/merge raised (the error is printed). A failure to
        register the catalog table is printed but does not change the result.
    """
    # Explicit None check: DataFrame-like objects should not be truth-tested.
    if df is None:
        return None
    try:
        # NOTE(review): dropDuplicates keeps an arbitrary row per code.
        df = df.dropDuplicates(['code'])
        df = apply_temporal_filter(df, table_name)

        is_sets = table_name == "sets"
        if is_sets:
            df = df.withColumn("partition_year", year(col("ingestion_timestamp"))) \
                   .withColumn("partition_month", month(col("ingestion_timestamp")))

        delta_path = f"s3://{S3_BRONZE_PATH}/{table_name}"
        full_table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.{table_name}"

        if not _delta_log_has_files(delta_path):
            # First load: create the Delta table (partitioned for sets).
            writer = df.write.format("delta") \
                             .mode("overwrite") \
                             .option("overwriteSchema", "true")
            if is_sets:
                writer = writer.partitionBy("partition_year", "partition_month")
            writer.save(delta_path)
        else:
            # Incremental load: upsert on the set code.
            from delta.tables import DeltaTable

            DeltaTable.forPath(spark, delta_path).alias("bronze").merge(
                df.alias("novo"),
                "bronze.code = novo.code"
            ).whenMatchedUpdateAll() \
             .whenNotMatchedInsertAll() \
             .execute()

        # Catalog registration is best-effort: the data is already written.
        try:
            _register_unity_table(full_table_name, delta_path)
        except Exception as e:
            print(f"Erro ao criar tabela Unity Catalog: {e}")
        return df
    except Exception as e:
        print(f"Erro no LOAD para bronze {table_name}: {e}")
        return None

def process_el_unity_merge(table_name):
    """Run the EL pipeline for ``table_name``: extract from staging, load to bronze.

    Args:
        table_name: Logical table name passed to both steps.

    Returns:
        Whatever ``load_to_bronze_unity_merge`` returns, or None when the
        extract step returned None (a failure message is printed).
    """
    stage_df = extract_from_staging(table_name)
    # Compare against None explicitly: the extract step signals failure with
    # None, and a DataFrame-like object should not be truth-tested.
    if stage_df is None:
        print(f"Falha no EXTRACT de staging para {table_name}")
        return None

    return load_to_bronze_unity_merge(stage_df, table_name)

def query_bronze_unity(table_name):
    """Show the total row count of the bronze Unity Catalog table.

    Runs ``SELECT COUNT(*)`` against ``<catalog>.<schema>.<table_name>`` and
    displays the result with ``show()``. Any error is printed, not raised.
    """
    try:
        qualified_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.{table_name}"
        spark.sql(f"SELECT COUNT(*) as total FROM {qualified_name}").show()
    except Exception as err:
        print(f"Erro ao consultar tabela bronze: {err}")

def show_delta_info(table_name):
    """Print how many entries the Delta history of the bronze table holds.

    Opens the Delta table at ``s3://<S3_BRONZE_PATH>/<table_name>`` and prints
    the row count of its ``history()``. Any error is printed, not raised.
    """
    try:
        from delta.tables import DeltaTable

        table_location = f"s3://{S3_BRONZE_PATH}/{table_name}"
        version_count = DeltaTable.forPath(spark, table_location).history().count()
        print(f"Versões Delta: {version_count}")
    except Exception as err:
        print(f"Erro ao mostrar informações Delta: {err}")



In [0]:
# =============================================================================
# EXECUÇÃO PRINCIPAL
# =============================================================================

# Reuse the notebook-provided session when there is one; otherwise build it.
try:
    spark
except NameError:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()

# Abort early when the catalog/schema cannot be prepared.
setup_success = setup_unity_catalog()
if not setup_success:
    raise Exception("Falha ao configurar Unity Catalog")

sets_bronze_df = process_el_unity_merge("sets")

# The pipeline signals failure with None; compare explicitly instead of
# truth-testing a DataFrame.
if sets_bronze_df is not None:
    print("Pipeline executado com sucesso")
    query_bronze_unity("sets")
    show_delta_info("sets")
else:
    # NOTE(review): a failed load only prints here, so a scheduled run would
    # still finish without error - consider raising, as done for setup failure.
    print("Falha no pipeline")
