In [0]:
from pyspark.sql.functions import col
from envlp_encryp_data_loader import EnvelopeEncryptionDataLoader
import uuid

Run the register-widgets notebook to create all the widgets required for the notebook

In [0]:
%run ./register_widgets

Define the widgets for all the variables that can be parameterized

Retrieve the values of all the defined widgets as a dictionary

Calling register_widgets() creates all the widgets in the notebook. Subsequently, get_parameters() retrieves all the widget values as a dictionary.

In [0]:
# Create the notebook widgets (helper presumably defined by the ./register_widgets notebook run above).
register_widgets()
# Widget values as a dictionary; later cells index it by parameter name.
params = get_parameters()

In [0]:
# Pull the catalog names and the results-volume location out of the widget
# parameters in a single unpacking assignment (same keys, same lookup order).
(
    source_catalog,
    target_catalog,
    envlp_encryp_core_catalog_name,
    dataframe_results_volume_location,
) = (
    params[key]
    for key in (
        "envlp_encryp_vault_catalog_name",
        "envlp_encryp_anonymized_catalog_name",
        "envlp_encryp_core_catalog_name",
        "dataframe_results_volume_location",
    )
)

Initialize an object of the class

In [0]:
# Loader used by the rest of the notebook; it is given the Spark session and the widget parameters.
envlp_encryp_data_loader = EnvelopeEncryptionDataLoader(spark, params)

The below CTE will:
- Identify all the managed tables existing in vault catalog
- Identify all the managed tables existing in anonymized catalog
- Identify new tables that exist in vault catalog but not in anonymized catalog
- Identify the list of columns in all the new tables in vault catalog (this is to get the data type and ordinal position of the columns to generate the create table statement)
- Note: the columns tagged as PCI / PII are not identified by this CTE; the column tags are fetched separately further below (get_column_tags) so that they can be propagated to the tables in anonymized catalog

In [0]:
# Metadata query that lists every column of the managed tables that exist in
# the vault catalog but not yet in the anonymized catalog.
#
# The two catalog names come from notebook widgets, so they are bound as named
# parameter markers (:source_catalog / :target_catalog) instead of being
# spliced into the SQL text; a quote character in a widget value can therefore
# no longer alter the statement.
vault_tables_columns = """
--Identify all the managed tables existing in vault catalog
with tables_in_vault as (
    select table_catalog as catalog_name, table_schema as schema_name, table_name from system.information_schema.tables where table_catalog = :source_catalog and table_type = 'MANAGED'
),
--Identify all the managed tables existing in anonymized catalog
tables_in_anonymized as (
    select table_catalog as catalog_name, table_schema as schema_name, table_name from system.information_schema.tables where table_catalog = :target_catalog and table_type = 'MANAGED'
),
--Identify new tables that exists in vault catalog but not in anonymized catalog
new_tables_in_vault as (
    select 
    tv.catalog_name 
    ,tv.schema_name
    ,tv.table_name 
    from tables_in_vault tv
    left join tables_in_anonymized tannoy
    on tv.schema_name = tannoy.schema_name
    and tv.table_name = tannoy.table_name
    where tannoy.schema_name is null and tannoy.table_name is null
),
--Identify the list of columns in all the tables in vault catalog
vault_tables_and_columns as (
    select cols.table_catalog as catalog_name, cols.table_schema as schema_name, cols.table_name, cols.column_name, cols.full_data_type, cols.is_nullable, cols.ordinal_position
    from system.information_schema.columns cols
    inner join new_tables_in_vault vlttbls
    on cols.table_catalog = vlttbls.catalog_name
    and cols.table_schema = vlttbls.schema_name
    and cols.table_name = vlttbls.table_name
)
select * from vault_tables_and_columns 
order by catalog_name, schema_name, table_name, ordinal_position """

# Python string values in `args` are converted to SQL string literals by Spark.
df_vault_tables_columns = spark.sql(
    vault_tables_columns,
    args={"source_catalog": source_catalog, "target_catalog": target_catalog},
)
display(df_vault_tables_columns)

catalog_name,schema_name,table_name,column_name,full_data_type,is_nullable,ordinal_position
envlp_encryp_vault,pci_pii_dataset,customer_records_1,full_name,string,YES,0
envlp_encryp_vault,pci_pii_dataset,customer_records_1,full_name_hash,string,NO,1
envlp_encryp_vault,pci_pii_dataset,customer_records_1,email,string,YES,2
envlp_encryp_vault,pci_pii_dataset,customer_records_1,email_hash,string,NO,3
envlp_encryp_vault,pci_pii_dataset,customer_records_1,phone_number,string,YES,4
envlp_encryp_vault,pci_pii_dataset,customer_records_1,phone_number_hash,string,NO,5
envlp_encryp_vault,pci_pii_dataset,customer_records_1,dob,string,YES,6
envlp_encryp_vault,pci_pii_dataset,customer_records_1,dob_hash,string,NO,7
envlp_encryp_vault,pci_pii_dataset,customer_records_1,ssn,string,YES,8
envlp_encryp_vault,pci_pii_dataset,customer_records_1,ssn_hash,string,NO,9


In [0]:
#Save the dataframe results to volume to persist the results
# A fresh UUID per run gives every execution its own sub-folder under the results volume.
unique_id = str(uuid.uuid4())
path = f"{dataframe_results_volume_location}/{unique_id}"
# NOTE(review): nothing visible in this notebook removes `path` afterwards, so each run
# leaves a new folder behind — confirm that cleanup happens elsewhere.
df_vault_tables_columns.write.mode("overwrite").parquet(path)

#Read the dataframe results from the persisted location
# The name is rebound here: from this point df_vault_tables_columns is read from the
# parquet files written above rather than from the metadata query.
df_vault_tables_columns = spark.read.parquet(path)

- df_tables_to_load contains the list of tables in the vault catalog that need to be loaded into the anonymized catalog

In [0]:
# Derived from the column-level metadata; each collected row is later read for its
# catalog_name / schema_name / table_name values (presumably one row per table — confirm in the loader).
df_tables_to_load = envlp_encryp_data_loader.get_distinct_tables_to_load(df_vault_tables_columns)
# Uncomment to inspect the tables selected for loading:
#display(df_tables_to_load)


- Fetch the column tags from system.information_schema.column_tags

In [0]:
# Column tags for the source (vault) catalog; passed to process_single_table, which hands
# them to the loader's collect_column_tags for each table.
df_col_tags = envlp_encryp_data_loader.get_column_tags(source_catalog)
# Uncomment to inspect the fetched tags:
#display(df_col_tags)

In [0]:
def process_single_table(row, df_vault_tables_columns, df_col_tags, target_catalog, envlp_encryp_data_loader):
    """Create one source table in the target catalog, load it, and apply its column tags.

    Args:
        row: Record indexable by ``"catalog_name"``, ``"schema_name"`` and
            ``"table_name"``; identifies the source table.
        df_vault_tables_columns: Column metadata handed to the loader to
            resolve the table's columns and data types.
        df_col_tags: Column-tag metadata handed to the loader.
        target_catalog: Catalog in which the target table is created.
        envlp_encryp_data_loader: Loader that creates the schema and builds
            the CREATE TABLE, INSERT and tag statements.

    Returns:
        None. The work is done through ``spark.sql`` and the loader calls.

    Note:
        Uses the notebook-global ``spark`` session rather than a parameter.
    """
    loader = envlp_encryp_data_loader
    src_catalog, schema_name, table_name = row["catalog_name"], row["schema_name"], row["table_name"]

    # One dict per table, handed to every statement generator below. It starts
    # empty here; presumably the loader fills it in — confirm in the loader class.
    hash_statements = {}

    # Column names / data types of the source table, as resolved by the loader.
    columns_datatype = loader.collect_source_table_columns_datatype(
        src_catalog, schema_name, table_name, df_vault_tables_columns
    )

    # Back-tick quoted three-part name of the table to create.
    qualified_target = f"`{target_catalog}`.`{schema_name}`.`{table_name}`"

    # Schema first, then the table itself.
    loader.create_schema(target_catalog, schema_name)
    spark.sql(
        loader.generate_create_table_statement(
            src_catalog, schema_name, table_name, columns_datatype, hash_statements, qualified_target
        )
    )

    # Build the INSERT and the tag statements before executing either of them.
    insert_sql = loader.generate_insert_statement(
        src_catalog, schema_name, table_name, target_catalog, columns_datatype, hash_statements
    )
    column_tags = loader.collect_column_tags(src_catalog, schema_name, table_name, df_col_tags)
    tag_statements = list(
        loader.propagate_tags_statement(schema_name, table_name, target_catalog, column_tags, hash_statements)
    )

    # Load the data into the target table, then apply the tag statements one by one.
    spark.sql(insert_sql)
    for tag_sql in tag_statements:
        spark.sql(tag_sql)

Process the tables to the anonymized catalog

In [0]:
# Bring the table list to the driver, then load the tables one after another.
rows = df_tables_to_load.collect()
for row in rows:
    process_single_table(
        row,
        df_vault_tables_columns,
        df_col_tags,
        target_catalog,
        envlp_encryp_data_loader,
    )

In [0]:
pip install azure-identity

Collecting azure-identity
  Downloading azure_identity-1.23.0-py3-none-any.whl.metadata (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting msal>=1.30.0 (from azure-identity)
  Downloading msal-1.32.3-py3-none-any.whl.metadata (11 kB)
Collecting msal-extensions>=1.2.0 (from azure-identity)
  Downloading msal_extensions-1.3.1-py3-none-any.whl.metadata (7.8 kB)
Downloading azure_identity-1.23.0-py3-none-any.whl (186 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/186.1 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m186.1/186.1 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading msal-1.32.3-py3-none-any.whl (115 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.4 kB[0m [31m?

In [0]:
%restart_python

In [0]:
# azure-identity is installed by the pip install cell earlier in the notebook, so this
# import has to run after that install and the Python restart.
from azure.identity import DefaultAzureCredential


# No arguments are passed, so the credential is built with its default configuration.
cred = DefaultAzureCredential()
print(cred)

<azure.identity._credentials.default.DefaultAzureCredential object at 0x7e4f3010c6b0>


In [0]:
# Debug aid: dump the credential object's instance attributes.
print(vars(cred))

{'_successful_credential': None, 'credentials': (<dbruntime.servicecredentials.servicecredentials.getAzureDBServiceCredentialsProvider.<locals>.ServiceCredentialTokenProvider object at 0x7e4f3010dd00>, <azure.identity._credentials.environment.EnvironmentCredential object at 0x7e4f300a4ce0>, <azure.identity._credentials.managed_identity.ManagedIdentityCredential object at 0x7e4f680af680>, <azure.identity._credentials.shared_cache.SharedTokenCacheCredential object at 0x7e4f2773e000>, <azure.identity._credentials.azure_cli.AzureCliCredential object at 0x7e4f3018f230>, <azure.identity._credentials.azure_powershell.AzurePowerShellCredential object at 0x7e4f277846b0>, <azure.identity._credentials.azd_cli.AzureDeveloperCliCredential object at 0x7e4f27784710>)}
