# Capitulo 07 - Scope e Named Secrets

Como mapear URLs para secrets no DuckDB usando scopes, diagnosticar qual credencial sera usada e organizar nomenclaturas e documentacao. Tudo roda apenas em memoria (sem acesso a buckets reais).

In [32]:
import duckdb
import pandas as pd
from pathlib import Path
from collections.abc import Iterable

OUTPUT_DIR = Path("data_output")
OUTPUT_DIR.mkdir(exist_ok=True)

def fresh_con(extensions=("httpfs",)):
    """Create an in-memory connection and load required extensions."""
    con = duckdb.connect(database=":memory:")
    for ext in extensions:
        try:
            con.execute(f"INSTALL {ext};")
        except Exception:
            pass
        try:
            con.execute(f"LOAD {ext};")
        except Exception as exc:
            print(f"Aviso: nao foi possivel carregar a extensao {ext}: {exc}")
    return con

def try_load_extension(con, name):
    """Best-effort load for optional extensions (e.g., azure)."""
    try:
        con.execute(f"INSTALL {name};")
    except Exception:
        pass
    try:
        con.execute(f"LOAD {name};")
        return True
    except Exception as exc:
        print(f"Aviso: nao foi possivel carregar a extensao {name}: {exc}")
        return False

def fmt_scope(scope_val):
    """Normalize scope value to a printable string or None."""
    if scope_val is None:
        return None
    if isinstance(scope_val, str):
        return scope_val
    if isinstance(scope_val, Iterable):
        return ", ".join(str(s) for s in scope_val)
    return str(scope_val)

def fetch_which_secret(con, url, secret_type="s3"):
    """Return which_secret result as DataFrame; adds missing columns when absent."""
    try:
        df = con.execute("SELECT * FROM which_secret(?, ?)", [url, secret_type]).df()
    except Exception as exc:
        print(f"which_secret falhou para {url}: {exc}")
        return None
    if "scope" not in df.columns:
        df["scope"] = None
    if "name" not in df.columns:
        df["name"] = None
    return df

def show_matching(con, urls, secret_type="s3"):
    """Print which_secret() results for a list of URLs."""
    for url in urls:
        df = fetch_which_secret(con, url, secret_type)
        if df is None or df.empty:
            print(f"{url:60} -> Nenhum secret encontrado")
            continue
        row = df.iloc[0]
        name_val = row['name'] if pd.notna(row['name']) else '(nome indisponivel)'
        scope_raw = fmt_scope(row.scope)
        scope_display = scope_raw if scope_raw not in (None, "") else "(escopo indisponivel)"
        print(f"{url:60} -> {name_val} [{scope_display}]")

## 1. SCOPE basico (fallback e prefixo)
Cria secrets globais e com SCOPE especifico e mostra qual credencial sera usada para cada URL.

In [33]:
con = fresh_con()
con.execute("""CREATE SECRET s3_default (TYPE s3, KEY_ID 'default_key', SECRET 'default_secret')""")
con.execute("""CREATE SECRET s3_bucket1 (TYPE s3, KEY_ID 'bucket1_key', SECRET 'bucket1_secret', SCOPE 's3://bucket1/')""")
con.execute("""CREATE SECRET s3_bucket2 (TYPE s3, KEY_ID 'bucket2_key', SECRET 'bucket2_secret', SCOPE 's3://bucket2/')""")

secrets = con.execute("""SELECT name, scope FROM duckdb_secrets() WHERE type = 's3' ORDER BY name""").df()
print("Secrets criados:\n", secrets)

urls = [
    's3://bucket1/file.parquet',
    's3://bucket2/data.csv',
    's3://bucket3/other.parquet',
    's3://bucket1/deep/nested'
]
print("\nwhich_secret() para cada URL:")
show_matching(con, urls)

con.close()

Secrets criados:
          name                    scope
0  s3_bucket1          [s3://bucket1/]
1  s3_bucket2          [s3://bucket2/]
2  s3_default  [s3://, s3n://, s3a://]
3     secret1  [s3://, s3n://, s3a://]

which_secret() para cada URL:
s3://bucket1/file.parquet                                    -> s3_bucket1 [(escopo indisponivel)]
s3://bucket2/data.csv                                        -> s3_bucket2 [(escopo indisponivel)]
s3://bucket3/other.parquet                                   -> s3_default [(escopo indisponivel)]
s3://bucket1/deep/nested                                     -> s3_bucket1 [(escopo indisponivel)]


## 2. Hierarquia de SCOPE (longest prefix wins)
Demonstra o match mais especifico entre bucket e subpasta.

In [34]:
con = fresh_con()
con.execute("""CREATE SECRET s3_all (TYPE s3, KEY_ID 'all', SECRET 'all')""")
con.execute("""CREATE SECRET s3_prod_bucket (TYPE s3, KEY_ID 'prod', SECRET 'prod', SCOPE 's3://production-bucket/')""")
con.execute("""CREATE SECRET s3_prod_subfolder (TYPE s3, KEY_ID 'sensitive', SECRET 'sensitive', SCOPE 's3://production-bucket/sensitive-data/')""")

urls = [
    's3://other-bucket/file.parquet',
    's3://production-bucket/public/data.csv',
    's3://production-bucket/sensitive-data/pii.parquet',
    's3://production-bucket/sensitive-data/sub/file'
]
print("which_secret() aplicando longest prefix match:")
show_matching(con, urls)

con.close()

which_secret() aplicando longest prefix match:
s3://other-bucket/file.parquet                               -> s3_all [(escopo indisponivel)]
s3://production-bucket/public/data.csv                       -> s3_prod_bucket [(escopo indisponivel)]
s3://production-bucket/sensitive-data/pii.parquet            -> s3_prod_subfolder [(escopo indisponivel)]
s3://production-bucket/sensitive-data/sub/file               -> s3_prod_subfolder [(escopo indisponivel)]


## 3. which_secret() detalhado
Mostra colunas retornadas pela funcao para debug e documentacao.

In [35]:
con = fresh_con()
con.execute("""CREATE SECRET s3_bucket_a (TYPE s3, KEY_ID 'key_a', SECRET 'secret_a', SCOPE 's3://bucket-a/')""")
con.execute("""CREATE SECRET s3_bucket_b (TYPE s3, KEY_ID 'key_b', SECRET 'secret_b', SCOPE 's3://bucket-b/')""")

df = fetch_which_secret(con, 's3://bucket-a/file.parquet', 's3')
print("Resultado de which_secret():")
if df is not None:
    cols_to_show = [c for c in ['name', 'type', 'provider', 'scope', 'storage', 'persistent'] if c in df.columns]
    print(df[cols_to_show])
else:
    print("Nenhum resultado retornado.")

con.close()

Resultado de which_secret():
          name scope storage persistent
0  s3_bucket_a  None  memory  TEMPORARY


## 4. Verificando varias URLs
Aplica a hierarquia completa (global -> bucket -> subpasta) e imprime o secret usado por URL.

In [36]:
con = fresh_con()
con.execute("CREATE SECRET s3_default (TYPE s3, KEY_ID 'default', SECRET 'default')")
con.execute("CREATE SECRET s3_prod (TYPE s3, KEY_ID 'prod', SECRET 'prod', SCOPE 's3://prod/')")
con.execute("CREATE SECRET s3_analytics (TYPE s3, KEY_ID 'analytics', SECRET 'analytics', SCOPE 's3://prod/analytics/')")

test_urls = [
    's3://dev-bucket/data.parquet',
    's3://prod/logs/2024/01/data.parquet',
    's3://prod/analytics/reports/summary.parquet',
    's3://prod/analytics/raw/events.parquet'
]
show_matching(con, test_urls)

con.close()

s3://dev-bucket/data.parquet                                 -> s3_default [(escopo indisponivel)]
s3://prod/logs/2024/01/data.parquet                          -> s3_prod [(escopo indisponivel)]
s3://prod/analytics/reports/summary.parquet                  -> s3_analytics [(escopo indisponivel)]
s3://prod/analytics/raw/events.parquet                       -> s3_analytics [(escopo indisponivel)]


## 5. Funcao de diagnostico de SCOPEs
Lista secrets ordenados por especificidade e testa varias URLs de uma vez.

In [37]:
def diagnose_secret_configuration(con, urls, secret_type="s3"):
    """Print current secrets (ordered by scope length) and which_secret for URLs."""
    all_secrets = con.execute(f"""SELECT name, scope FROM duckdb_secrets() WHERE type = '{secret_type}' ORDER BY LENGTH(COALESCE(scope, '')) DESC, name""").df()
    print(f"Secrets configurados ({len(all_secrets)}):")
    for _, row in all_secrets.iterrows():
        scope_clean = fmt_scope(row['scope'])
        scope_val = scope_clean if scope_clean not in (None, "") else '(global)'
        print(f"  - {row['name']:20} -> {scope_val}")

    print("\nTeste de URLs:")
    for url in urls:
        df = fetch_which_secret(con, url, secret_type)
        if df is None or df.empty:
            print(f"  ✗ {url:50} -> Nenhum secret")
            continue
        row = df.iloc[0]
        name_val = row['name'] if pd.notna(row['name']) else '(nome indisponivel)'
        scope_raw = fmt_scope(row.scope)
        scope_display = scope_raw if scope_raw not in (None, "") else '(escopo indisponivel)'
        print(f"  ✓ {url:50} -> {name_val} [{scope_display}]")

con = fresh_con()
con.execute("CREATE SECRET s3_public (TYPE s3, KEY_ID 'pub', SECRET 'pub', SCOPE 's3://public-data/')")
con.execute("CREATE SECRET s3_private (TYPE s3, KEY_ID 'priv', SECRET 'priv', SCOPE 's3://private-data/')")
con.execute("CREATE SECRET s3_backup (TYPE s3, KEY_ID 'backup', SECRET 'backup')")

urls_to_test = [
    's3://public-data/dataset.parquet',
    's3://private-data/sensitive.parquet',
    's3://unknown-bucket/file.parquet',
    's3://public-data/subfolder/nested.parquet'
]

diagnose_secret_configuration(con, urls_to_test, 's3')

con.close()

Secrets configurados (4):
  - s3_backup            -> s3://, s3n://, s3a://
  - secret1              -> s3://, s3n://, s3a://
  - s3_private           -> s3://private-data/
  - s3_public            -> s3://public-data/

Teste de URLs:
  ✓ s3://public-data/dataset.parquet                   -> s3_public [(escopo indisponivel)]
  ✓ s3://private-data/sensitive.parquet                -> s3_private [(escopo indisponivel)]
  ✓ s3://unknown-bucket/file.parquet                   -> s3_backup [(escopo indisponivel)]
  ✓ s3://public-data/subfolder/nested.parquet          -> s3_public [(escopo indisponivel)]


## 6. Multiplos buckets S3 com credenciais diferentes
Exemplo de isolamento por bucket (publico, analytics, ML, logs).

In [38]:
con = fresh_con()
con.execute("""CREATE SECRET s3_public_datasets (TYPE s3, KEY_ID 'AKIAPUBLIC', SECRET 'public_secret', REGION 'us-east-1', SCOPE 's3://company-public-datasets/')""")
con.execute("""CREATE SECRET s3_analytics (TYPE s3, KEY_ID 'AKIAANALYTICS', SECRET 'analytics_secret', REGION 'us-west-2', SCOPE 's3://company-analytics/')""")
con.execute("""CREATE SECRET s3_ml_models (TYPE s3, KEY_ID 'AKIAML', SECRET 'ml_secret', REGION 'eu-west-1', SCOPE 's3://company-ml-models/')""")
con.execute("""CREATE SECRET s3_logs (TYPE s3, KEY_ID 'AKIALOGS', SECRET 'logs_secret', REGION 'us-east-1', SCOPE 's3://company-logs/')""")

secrets = con.execute("""SELECT name, scope FROM duckdb_secrets() WHERE type = 's3' ORDER BY name""").df()
print("Secrets configurados:")
print(secrets)

urls = [
    's3://company-public-datasets/census/2020.parquet',
    's3://company-analytics/reports/monthly.parquet',
    's3://company-ml-models/prod/model-v2.parquet',
    's3://company-logs/2024/01/events.parquet'
]
print("\nwhich_secret() para cada bucket:")
show_matching(con, urls)

con.close()

Secrets configurados:
                 name                            scope
0        s3_analytics        [s3://company-analytics/]
1             s3_logs             [s3://company-logs/]
2        s3_ml_models        [s3://company-ml-models/]
3  s3_public_datasets  [s3://company-public-datasets/]
4             secret1          [s3://, s3n://, s3a://]

which_secret() para cada bucket:
s3://company-public-datasets/census/2020.parquet             -> s3_public_datasets [(escopo indisponivel)]
s3://company-analytics/reports/monthly.parquet               -> s3_analytics [(escopo indisponivel)]
s3://company-ml-models/prod/model-v2.parquet                 -> s3_ml_models [(escopo indisponivel)]
s3://company-logs/2024/01/events.parquet                     -> s3_logs [(escopo indisponivel)]


## 7. Cross-account com credential_chain (STS)
Simula contas diferentes usando scopes distintos e assume role via provider `credential_chain`.

In [39]:
con = fresh_con()
con.execute("""CREATE SECRET s3_account_a (TYPE s3, KEY_ID 'AKIAACCOUNTA', SECRET 'account_a_secret', REGION 'us-east-1', SCOPE 's3://prod-account-a/')""")
# Simpler placeholder for analytics account (credential_chain variants may depend on build); using static key/secret for demo.
con.execute("""CREATE SECRET s3_account_b (TYPE s3, KEY_ID 'AKIAACCOUNTB', SECRET 'account_b_secret', REGION 'us-west-2', SCOPE 's3://analytics-account-b/')""")
con.execute("""CREATE SECRET s3_account_c (TYPE s3, KEY_ID 'AKIAACCOUNTC', SECRET 'account_c_secret', REGION 'eu-west-1', SCOPE 's3://partner-account-c/')""")

urls = [
    's3://prod-account-a/data.parquet',
    's3://analytics-account-b/reports.parquet',
    's3://partner-account-c/shared.parquet'
]
print("Cross-account which_secret():")
show_matching(con, urls)

con.close()

Cross-account which_secret():
s3://prod-account-a/data.parquet                             -> s3_account_a [(escopo indisponivel)]
s3://analytics-account-b/reports.parquet                     -> s3_account_b [(escopo indisponivel)]
s3://partner-account-c/shared.parquet                        -> s3_account_c [(escopo indisponivel)]


## 8. Multi-cloud com SCOPE
Inclui S3, Cloudflare R2, GCS e tentativa opcional de Azure (pode falhar se o provider nao existir na versao atual da extensao).

In [40]:
con = fresh_con()
azure_loaded = try_load_extension(con, "azure")

con.execute("""CREATE SECRET aws_main (TYPE s3, KEY_ID 'AKIAAWS', SECRET 'aws_secret', REGION 'us-east-1', SCOPE 's3://company-data-aws/')""")
con.execute("""CREATE SECRET cloudflare_cdn (TYPE r2, KEY_ID 'r2_key', SECRET 'r2_secret', ACCOUNT_ID 'cf_account_id', SCOPE 'r2://company-cdn/')""")
con.execute("""CREATE SECRET gcs_archive (TYPE gcs, PROVIDER credential_chain, SCOPE 'gs://company-archive-gcs/')""")

if azure_loaded:
    try:
        con.execute("""CREATE SECRET azure_backup (TYPE azure, PROVIDER managed_identity, ACCOUNT_NAME 'companybackup', SCOPE 'azure://backups/')""")
    except Exception as exc:
        print(f"Aviso: nao foi possivel criar azure_backup (provider pode nao existir nesta versao): {exc}")
else:
    print("Extensao azure nao carregada; secret azure_backup ignorado.")

secrets = con.execute("""SELECT name, type, scope FROM duckdb_secrets() ORDER BY type, name""").df()
print("Config multi-cloud:")
print(secrets)

con.close()

Config multi-cloud:
             name   type                        scope
0    azure_backup  azure           [azure://backups/]
1   __default_gcs    gcs              [gcs://, gs://]
2     gcs_archive    gcs  [gs://company-archive-gcs/]
3         secret6    gcs              [gcs://, gs://]
4  cloudflare_cdn     r2          [r2://company-cdn/]
5        aws_main     s3     [s3://company-data-aws/]
6         secret1     s3      [s3://, s3n://, s3a://]


## 9. Estrategias de naming para secrets
Sugestoes de convencao:
- Por ambiente: dev_s3_analytics, staging_s3_analytics, prod_s3_analytics
- Por projeto/time: s3_marketing_data, s3_engineering_logs, s3_finance_reports
- Por regiao: s3_us_east_1, s3_eu_west_1
- Por funcao: s3_readonly_public, s3_readwrite_analytics, s3_writeonly_logs
- Hierarquico: company_aws_s3_prod_analytics, company_gcp_gcs_dev_ml, company_azure_blob_staging_backup
Recomendacoes: snake_case, incluir provider no nome, ser descritivo sem ser longo, manter consistente e documentado.

In [41]:
con = fresh_con()
secrets_config = [
    ("prod_s3_analytics_us", "s3://prod-analytics-us/", "us-east-1"),
    ("prod_s3_analytics_eu", "s3://prod-analytics-eu/", "eu-west-1"),
    ("staging_s3_analytics", "s3://staging-analytics/", "us-west-2"),
    ("dev_s3_analytics", "s3://dev-analytics/", "us-west-2"),
]

for name, scope, region in secrets_config:
    con.execute(f"""CREATE SECRET {name} (TYPE s3, KEY_ID 'key', SECRET 'secret', REGION '{region}', SCOPE '{scope}')""")
    print(f"Criado: {name:25} -> {scope}")

con.close()

Criado: prod_s3_analytics_us      -> s3://prod-analytics-us/
Criado: prod_s3_analytics_eu      -> s3://prod-analytics-eu/
Criado: staging_s3_analytics      -> s3://staging-analytics/
Criado: dev_s3_analytics          -> s3://dev-analytics/


## 10. Inventario de secrets + exportacao para Markdown
Gera tabela de secrets atuais e exporta para um arquivo em `data_output/`.

In [42]:
def create_secret_inventory(con):
    return con.execute("""SELECT name, type, provider, scope, persistent, storage FROM duckdb_secrets() ORDER BY type, name""").df()

def export_secret_documentation(secrets_df, filename=OUTPUT_DIR / "secrets_inventory.md"):
    filename.parent.mkdir(exist_ok=True)
    with open(filename, 'w') as f:
        f.write("# DuckDB Secrets Inventory\n\n")
        f.write(f"Total de secrets: {len(secrets_df)}\n\n")
        for secret_type in secrets_df['type'].unique():
            subset = secrets_df[secrets_df['type'] == secret_type]
            f.write(f"## {secret_type.upper()}\n\n")
            for _, secret in subset.iterrows():
                scope_value = fmt_scope(secret['scope'])
                scope_value = scope_value if scope_value not in (None, "") else '(global)'
                f.write(f"### {secret['name']}\n\n")
                f.write(f"- Type: {secret['type']}\n")
                f.write(f"- Provider: {secret['provider']}\n")
                f.write(f"- Scope: {scope_value}\n")
                f.write(f"- Persistent: {secret['persistent']}\n")
                f.write(f"- Storage: {secret['storage']}\n\n")
    print(f"Inventario exportado para {filename}")

con = fresh_con()
con.execute("CREATE SECRET prod_s3_data (TYPE s3, KEY_ID 'k', SECRET 's', SCOPE 's3://prod-data/')")
con.execute("CREATE SECRET dev_s3_data (TYPE s3, KEY_ID 'k', SECRET 's', SCOPE 's3://dev-data/')")
con.execute("CREATE SECRET api_auth (TYPE http, BEARER_TOKEN 'token')")

inventory = create_secret_inventory(con)
print("Inventario gerado:")
print(inventory[['name', 'type', 'scope']])

export_secret_documentation(inventory)

con.close()

Inventario gerado:
            name  type                    scope
0  __default_gcs   gcs          [gcs://, gs://]
1        secret6   gcs          [gcs://, gs://]
2       api_auth  http                       []
3    dev_s3_data    s3         [s3://dev-data/]
4   prod_s3_data    s3        [s3://prod-data/]
5        secret1    s3  [s3://, s3n://, s3a://]
Inventario exportado para data_output\secrets_inventory.md


### Boas praticas de secret management
- Manter inventario atualizado com owner/time
- Escopar por ambiente e principio do menor privilegio
- Rotacionar/remover secrets nao usados e versionar configuracao
- Validar SCOPEs com which_secret() em staging antes de prod
- Monitorar expiracao/uso e auditar acessos

## Exercicios sugeridos
1. Crie 5 secrets S3 com hierarquia de SCOPEs (global, s3://data/, s3://data/prod/, s3://data/prod/sensitive/, s3://logs/) e teste 10 URLs com which_secret(), documentando o matching.
2. Modele secrets para 3 ambientes (dev/staging/prod) incluindo S3, PostgreSQL e HTTP API; defina naming consistente e funcao de switch.
3. Configure secrets multi-cloud (2 buckets S3, 1 container Azure, 1 bucket GCS), teste which_secret() para cada provider e escreva query de uniao de dados.
4. Gere 10+ secrets variados, crie inventario em Markdown com validacoes (SCOPE duplicado, etc.) e inclua recommendations/warnings.