In [0]:
# Ingestão de Cards - Magic: The Gathering (Versão Corrigida)
# Objetivo: Ingerir dados de cards da API do Magic: The Gathering para staging em Parquet no S3
# Características: Dados brutos, formato Parquet, filtro temporal, particionamento, incremental, paginado

# =============================================================================
# BIBLIOTECAS UTILIZADAS
# =============================================================================
import requests
import json
import time
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp, year, month
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType

# =============================================================================
# CONFIGURAÇÃO DE SEGREDOS
# =============================================================================
def get_secret(secret_name, default_value=None):
    """
    Read a secret from the Databricks secret scope "mtg-pipeline".

    Args:
        secret_name: Key of the secret inside the scope.
        default_value: Value returned when the secret cannot be read.
            ``None`` marks the secret as mandatory.

    Returns:
        The secret value, or ``default_value`` when the lookup fails and a
        default was supplied.

    Raises:
        Exception: When the lookup fails and no default was supplied. The
            original lookup error is attached as ``__cause__``.
    """
    try:
        return dbutils.secrets.get(scope="mtg-pipeline", key=secret_name)
    except Exception as lookup_error:
        # Only ordinary errors are handled here (this includes the NameError
        # raised when `dbutils` is not defined); KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        if default_value is not None:
            print(f"Segredo '{secret_name}' n√£o encontrado, usando valor padr√£o")
            return default_value
        print(f"Segredo obrigat√≥rio '{secret_name}' n√£o encontrado")
        raise Exception(f"Segredo '{secret_name}' n√£o configurado") from lookup_error

# =============================================================================
# VARIÁVEIS DE CONFIGURAÇÃO
# =============================================================================
# API settings.
# api_base_url is mandatory (no default). A trailing "/" is removed because
# request URLs are later built as f"{API_BASE_URL}/{endpoint}", which would
# otherwise contain "//".
API_BASE_URL = get_secret("api_base_url").rstrip("/")
BATCH_SIZE = int(get_secret("batch_size", "100"))
MAX_RETRIES = int(get_secret("max_retries", "3"))

# S3 settings.
# s3_bucket is mandatory. Leading/trailing slashes are stripped from both parts
# so that S3_BASE_PATH never contains an empty path segment.
S3_BUCKET = get_secret("s3_bucket").strip("/")
S3_STAGE_PREFIX = get_secret("s3_stage_prefix", "magic_the_gathering/stage").strip("/")
S3_BASE_PATH = f"s3://{S3_BUCKET}/{S3_STAGE_PREFIX}"

# Time window settings (defaults to 5 years back).
# The cutoff is January 1st of (current year - YEARS_BACK).
YEARS_BACK = int(get_secret("years_back", "5"))
current_year = datetime.now().year
cutoff_year = current_year - YEARS_BACK
CUTOFF_DATE = datetime(cutoff_year, 1, 1)
CUTOFF_DATE_STR = CUTOFF_DATE.strftime("%Y-%m-%d")

In [0]:
# =============================================================================
# FUNÇÕES UTILITÁRIAS
# =============================================================================
def setup_s3_storage():
    """
    Make sure the staging location S3_BASE_PATH exists.

    The path is listed with ``dbutils.fs.ls``; if the listing fails, the path
    is created with ``dbutils.fs.mkdirs``.

    Returns:
        True when the path exists or was created, False when creating it
        failed (the error is printed).
    """
    try:
        try:
            dbutils.fs.ls(S3_BASE_PATH)
            print("Diret√≥rio do S3 j√° existe")
        except Exception:
            # A failed listing is treated as "path missing". Only ordinary
            # errors are handled, so an interrupt is not turned into a mkdirs.
            dbutils.fs.mkdirs(S3_BASE_PATH)
            print("Diret√≥rio do S3 criado com sucesso")

        return True

    except Exception as e:
        print(f"Erro ao configurar S3 storage: {e}")
        return False

def make_api_request(endpoint, params=None, retries=MAX_RETRIES):
    """
    GET ``{API_BASE_URL}/{endpoint}`` and return the decoded JSON body.

    Args:
        endpoint: Path appended to API_BASE_URL.
        params: Optional query-string parameters passed to ``requests.get``.
        retries: Maximum number of attempts.

    Returns:
        The decoded JSON payload on an HTTP 200 response, or None when every
        attempt failed (non-200 status, timeout, request error or invalid JSON).

    Between attempts the function waits: a linearly growing delay for 429
    (capped at 60s) and 503 (capped at 120s), 5s for other statuses, 10s after
    a timeout and 1s after other request/JSON errors. No wait happens after the
    final attempt, since nothing would be retried afterwards.
    """
    url = f"{API_BASE_URL}/{endpoint}"
    print(f"Fazendo requisi√ß√£o para: {url}")

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = requests.get(url, params=params, timeout=30)
            status = response.status_code
            print(f"Status Code: {status}")

            if status == 200:
                data = response.json()
                print(f"Dados recebidos: {type(data)}")
                if isinstance(data, dict):
                    print(f"Chaves dispon√≠veis: {list(data.keys())}")
                return data

            if status == 429 and not is_last_attempt:  # Rate limit
                wait_time = min((attempt + 1) * 5, 60)
                print(f"Rate limit atingido. Aguardando {wait_time}s...")
                time.sleep(wait_time)
            elif status == 503 and not is_last_attempt:  # Service unavailable
                wait_time = min((attempt + 1) * 10, 120)
                print(f"Servi√ßo indispon√≠vel. Aguardando {wait_time}s...")
                time.sleep(wait_time)
            else:
                # Any other status, or a 429/503 on the final attempt: report
                # it and only pause if another attempt will follow.
                print(f"Erro {status} na API: {response.text[:200]}")
                if not is_last_attempt:
                    time.sleep(5)

        except requests.exceptions.Timeout:
            print(f"Timeout na tentativa {attempt + 1}")
            if not is_last_attempt:
                time.sleep(10)
        except json.JSONDecodeError as e:
            # Handled before RequestException: in recent versions of requests
            # the error raised by response.json() derives from both classes,
            # so the generic handler would otherwise always win.
            print(f"Erro ao decodificar JSON na tentativa {attempt + 1}: {e}")
            if is_last_attempt:
                return None
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                print(f"Erro na requisi√ß√£o para endpoint ap√≥s {retries} tentativas: {e}")
                return None
            print(f"Tentativa {attempt + 1} falhou, tentando novamente...")
            time.sleep(1)

    return None

def clean_cards_data(data):
    """
    Normalize raw card dicts into flat records matching the cards schema.

    Every output record contains the same 26 keys:
      * text fields are converted with ``str``;
      * ``cmc`` is converted to ``float`` and ``multiverseid`` to ``int``
        (previously these two keys were never emitted, so the corresponding
        schema columns were always null);
      * list-like fields are serialized to a JSON string.
    Missing or ``None`` values become ``None``. A numeric value that cannot be
    converted also becomes ``None`` so the record stays compatible with the
    numeric column types.

    Args:
        data: Iterable of card dicts; ``None`` is treated as empty.

    Returns:
        A list with one cleaned dict per input item.
    """
    text_fields = ('name', 'manaCost', 'type', 'rarity', 'set', 'setName', 'text',
                   'artist', 'number', 'power', 'toughness', 'layout', 'imageUrl',
                   'originalText', 'originalType', 'id')
    numeric_casts = {'cmc': float, 'multiverseid': int}
    json_fields = ('colors', 'colorIdentity', 'types', 'subtypes', 'variations',
                   'foreignNames', 'printings', 'legalities')

    cleaned_data = []

    for item in data or []:
        cleaned_item = {}

        # Plain text fields
        for field in text_fields:
            value = item.get(field)
            cleaned_item[field] = str(value) if value is not None else None

        # Numeric fields
        for field, cast in numeric_casts.items():
            value = item.get(field)
            try:
                cleaned_item[field] = cast(value) if value is not None else None
            except (ValueError, TypeError):
                cleaned_item[field] = None

        # List-like fields are stored as JSON text. Dicts are JSON-encoded too,
        # so the stored string is parseable rather than a Python repr.
        for field in json_fields:
            value = item.get(field)
            if value is None:
                cleaned_item[field] = None
            elif isinstance(value, (list, dict)):
                cleaned_item[field] = json.dumps(value)
            else:
                cleaned_item[field] = str(value)

        cleaned_data.append(cleaned_item)

    return cleaned_data

def save_to_parquet(data, table_name):
    """
    Build a Spark DataFrame from ``data`` and write it as Parquet under S3_BASE_PATH.

    Ingestion metadata columns (``ingestion_timestamp``, ``source``,
    ``endpoint``) are added, and one output per (year, month) of the ingestion
    timestamp is written to ``{S3_BASE_PATH}/{year}_{month:02d}_{table_name}.parquet``.
    An output whose path already lists at least one entry is skipped.

    Args:
        data: List of records. For ``table_name == 'cards'`` an explicit schema
            is applied; otherwise Spark infers the schema.
        table_name: Logical table name, used in the output file name and in
            the ``endpoint`` column.

    Returns:
        The DataFrame (including the metadata and partition columns), or None
        when ``data`` is empty or an error occurred (the error is printed).
    """
    if not data:
        print(f"Nenhum dado para salvar na tabela {table_name}")
        return None
    try:
        if table_name == 'cards':
            # Explicit schema for cards: every column is nullable, and all of
            # them are strings except cmc and multiverseid.
            schema = StructType([
                StructField("name", StringType(), True),
                StructField("manaCost", StringType(), True),
                StructField("cmc", FloatType(), True),
                StructField("colors", StringType(), True),
                StructField("colorIdentity", StringType(), True),
                StructField("type", StringType(), True),
                StructField("types", StringType(), True),
                StructField("subtypes", StringType(), True),
                StructField("rarity", StringType(), True),
                StructField("set", StringType(), True),
                StructField("setName", StringType(), True),
                StructField("text", StringType(), True),
                StructField("artist", StringType(), True),
                StructField("number", StringType(), True),
                StructField("power", StringType(), True),
                StructField("toughness", StringType(), True),
                StructField("layout", StringType(), True),
                StructField("multiverseid", IntegerType(), True),
                StructField("imageUrl", StringType(), True),
                StructField("variations", StringType(), True),
                StructField("foreignNames", StringType(), True),
                StructField("printings", StringType(), True),
                StructField("originalText", StringType(), True),
                StructField("originalType", StringType(), True),
                StructField("legalities", StringType(), True),
                StructField("id", StringType(), True)
            ])
            df = spark.createDataFrame(data, schema)
        else:
            df = spark.createDataFrame(data)

        # Ingestion metadata
        df = df.withColumn("ingestion_timestamp", current_timestamp()) \
               .withColumn("source", lit("mtg_api")) \
               .withColumn("endpoint", lit(table_name))

        # Partition keys derived from the ingestion date.
        # NOTE(review): current_timestamp() is presumably re-evaluated for each
        # Spark action, so the collect() below and the later write may see
        # slightly different timestamps - confirm this is acceptable around a
        # month boundary.
        df = df.withColumn("partition_year", year(col("ingestion_timestamp"))) \
               .withColumn("partition_month", month(col("ingestion_timestamp")))

        partition_combinations = df.select("partition_year", "partition_month").distinct().collect()

        written_files = 0
        for partition_row in partition_combinations:
            partition_year = partition_row["partition_year"]
            partition_month = partition_row["partition_month"]

            # Rows belonging to this partition
            partition_df = df.filter((col("partition_year") == partition_year) &
                                   (col("partition_month") == partition_month))

            file_name = f"{partition_year}_{partition_month:02d}_{table_name}.parquet"
            file_path = f"{S3_BASE_PATH}/{file_name}"

            # Skip outputs that already exist. A failing listing is treated as
            # "not there yet"; only ordinary errors are handled, so an
            # interrupt is not silently swallowed.
            try:
                already_exists = len(dbutils.fs.ls(file_path)) > 0
            except Exception:
                already_exists = False

            if already_exists:
                print(f"Arquivo {file_name} j√° existe - pulando")
                continue

            partition_df.drop("partition_year", "partition_month").write.mode("overwrite").format("parquet").save(file_path)
            print(f"Arquivo {file_name} criado com sucesso")
            written_files += 1

        # Report success only when something was actually written.
        if written_files > 0:
            print(f"Registros salvos como Parquet para {table_name}")
        else:
            print(f"Nenhum arquivo novo foi gravado para {table_name}")
        return df

    except Exception as e:
        print(f"Erro ao salvar dados em {table_name}: {e}")
        return None

def ingest_paginated_data(endpoint, table_name, data_key=None, max_pages=100):
    """
    Fetch a paginated endpoint page by page and persist the result as Parquet.

    Pages are requested with ``page``/``pageSize`` parameters starting at 1.
    Fetching stops at the first empty page, at the first failed or malformed
    response, or after ``max_pages`` pages. Whatever was collected up to that
    point is passed to ``save_to_parquet``.

    Args:
        endpoint: API endpoint passed to ``make_api_request``.
        table_name: Logical table name; 'cards' pages go through
            ``clean_cards_data`` before being accumulated.
        data_key: Key of the payload holding the records; defaults to
            ``table_name``.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        The DataFrame returned by ``save_to_parquet``, or None when no data was
        collected or saving/counting failed.
    """
    print(f"Iniciando ingest√£o paginada: {table_name}")

    if data_key is None:
        data_key = table_name

    all_data = []
    page = 1
    stopped_early = False

    while page <= max_pages:
        print(f"Processando p√°gina {page}...")

        params = {"page": page, "pageSize": BATCH_SIZE}
        data = make_api_request(endpoint, params)

        # The payload must be a dict holding the expected key.
        if not (isinstance(data, dict) and data_key in data):
            print("Falha ao obter dados da p√°gina")
            stopped_early = True
            break

        page_data = data[data_key]
        if not page_data:
            print("P√°gina vazia - fim da pagina√ß√£o")
            stopped_early = True
            break

        if table_name == 'cards':
            print("Limpando dados de cards...")
            page_data = clean_cards_data(page_data)
            print(f"Dados limpos: {len(page_data)} registros")

        all_data.extend(page_data)
        page += 1

        # Short pause between requests
        time.sleep(0.5)

    if not stopped_early and page > max_pages:
        # The loop ended because of the page cap, not because the data ran out.
        print(f"Limite max_pages={max_pages} atingido - a coleta pode estar incompleta")

    if not all_data:
        print(f"Nenhum dado v√°lido para {table_name}")
        return None

    try:
        df = save_to_parquet(all_data, table_name)

        # Explicit None check: do not rely on the truthiness of a DataFrame.
        if df is not None:
            count = df.count()
            print(f"{table_name}: {count} registros processados")
            return df
        return None
    except Exception as e:
        print(f"Erro ao processar dados de {table_name}: {e}")
        return None



In [0]:
# =============================================================================
# EXECUÇÃO PRINCIPAL
# =============================================================================

# Make sure a Spark session is available. If the name `spark` is not defined,
# create or reuse a session via SparkSession (imported at the top of the file).
try:
    spark
    print("Spark dispon√≠vel")
except NameError:
    print("Spark n√£o est√° dispon√≠vel - tentando obter do contexto")
    try:
        spark = SparkSession.builder.getOrCreate()
        print("Spark criado com sucesso")
    except Exception as e:
        print(f"Erro ao criar Spark: {e}")
        # Keep the original error attached so the root cause stays visible.
        raise Exception("Spark n√£o est√° dispon√≠vel") from e

# Prepare the S3 staging location; abort when it cannot be set up.
setup_success = setup_s3_storage()
if not setup_success:
    raise Exception("Falha ao configurar S3 storage")

print("Setup conclu√≠do com sucesso")

# Start the cards ingestion
print("Iniciando ingest√£o de cards...")

cards_df = ingest_paginated_data(
    endpoint="cards",
    table_name="cards",
    max_pages=100  # Cap of 100 pages for demonstration purposes
)

# Ingestion report
print("=" * 50)
print("RELAT√ìRIO DE INGEST√ÉO DE CARDS (CORRIGIDO)")
print("=" * 50)

# ingest_paginated_data returns None on failure; check that explicitly instead
# of relying on the truthiness of a DataFrame.
if cards_df is not None:
    total_records = cards_df.count()
    print("‚úÖ Arquivos salvos com sucesso")
    print(f"üìä Total de registros: {total_records}")
    print("üéØ Particionamento: por ingestion_timestamp (ano/m√™s)")
else:
    print("‚ùå Falha na ingest√£o de cards")

print("=" * 50)