In [0]:
from pyspark.sql.functions import col
from secure_core_data_loader import SecureCoreDataLoader

Run the register-widgets notebook to create all the widgets required for the notebook

In [0]:
%run ./register_widgets

Calling register_widgets() creates all the widgets in the notebook. get_parameters() is then used to retrieve all the widget values as a dictionary.

In [0]:
# Create the notebook widgets, then read their current values into `params`.
# Both functions are expected to be defined by the `%run ./register_widgets` cell above.
register_widgets()
params = get_parameters()

In [0]:
# Catalogs: tables are read from the vault catalog and created/loaded in the non-PCI/PII catalog.
source_catalog = params["secure_core_vault_catalog_name"]
target_catalog = params["secure_core_non_pci_pii_catalog_name"]

# Location of the load-tracking table: catalog name and table name.
secure_core_catalog_name = params["secure_core_catalog_name"]
vault_to_anonymized_load_tracking_table = params["vault_to_anonymized_load_tracking_table"]

# Schema of the vault catalog that the table-discovery query excludes.
individual_attributes_schema_name = params["individual_attributes_schema_name"]

Initialize a `SecureCoreDataLoader` object

In [0]:
# Loader object used by the cells below; built from the notebook's `spark` session and the widget parameters.
secure_core_data_loader = SecureCoreDataLoader(spark, params)

- Get the last loaded version of each vault table from the vault-to-anonymized load tracking table
- Using table history, identify whether there are any new versions with WRITE operations for the tables in the vault catalog
- If there is a WRITE operation with a version greater than the last loaded version, new data has been loaded to the table in the vault catalog and it is considered for incremental data load
- Register the result set as a temp table (`tables_with_new_versions`) that is used in the CTE below

In [0]:
# Look for source-table versions newer than the ones recorded in the tracking table.
# The SQL in the next cell joins against `tables_with_new_versions`, which this call is
# expected to register as a temp view (TODO confirm in SecureCoreDataLoader).
secure_core_data_loader.get_tables_with_new_versions(vault_to_anonymized_load_tracking_table)

The below CTE will:
- Identify all the managed tables existing in the vault catalog (excluding the individual attributes schema)
- Identify all the managed tables existing in the non-PCI/PII catalog
- Identify new tables that exist in the vault catalog but not in the non-PCI/PII catalog and derive column target_table_status as "new" (otherwise "existing")
- Identify tables in the vault catalog for which new data has been loaded. Derive column new_data_available as "yes" (otherwise "no")
- Identify the list of columns in all of these tables in the vault catalog (this is to get the data type and ordinal position of the columns to generate the CREATE TABLE statement)
- Column tags (PCI / PII) are not fetched by this CTE; they are fetched separately further below and are used to propagate the tags to the tables in the non-PCI/PII catalog

In [0]:
# Metadata query (Spark SQL) listing every column of every managed vault table,
# annotated with two derived flags per table:
#   - target_table_status: 'new' when no table with the same schema/name exists in
#     the target catalog, otherwise 'existing'
#   - new_data_available: 'yes' when the table appears in `tables_with_new_versions`,
#     otherwise 'no'
# Interpolated values: source_catalog, target_catalog, individual_attributes_schema_name.
# `tables_with_new_versions` must already exist as a queryable relation when this runs.
vault_tables_columns = f"""
--Identify all the managed tables existing in vault catalog
with tables_in_vault as (
    select table_catalog as catalog_name, table_schema as schema_name, table_name from system.information_schema.tables where table_catalog = '{source_catalog}' and table_schema not in ('{individual_attributes_schema_name}') and table_type = 'MANAGED'
),
--Identify all the managed tables existing in non_pci_pii catalog
tables_in_non_pci_pii as (
    select table_catalog as catalog_name, table_schema as schema_name, table_name from system.information_schema.tables where table_catalog = '{target_catalog}' and table_type = 'MANAGED'
),
--Identify new tables that exists in vault catalog but not in non_pci_pii catalog and tables where new data is available in vault catalog
new_tables_new_versions_in_vault as (
    select 
    tv.catalog_name 
    ,tv.schema_name
    ,tv.table_name 
    ,case when tnpp.table_name is null then 'new' else 'existing' end as target_table_status
    ,case when twnv.table_name is not null then 'yes' else 'no' end as new_data_available
    from tables_in_vault tv
    left join tables_in_non_pci_pii tnpp
    on tv.schema_name = tnpp.schema_name
    and tv.table_name = tnpp.table_name
    left join tables_with_new_versions twnv
    on tv.catalog_name = twnv.catalog_name
    and tv.schema_name = twnv.schema_name
    and tv.table_name = twnv.table_name
),
--Identify the list of columns in all the tables in vault catalog
vault_tables_and_columns as (
    select cols.table_catalog as catalog_name, cols.table_schema as schema_name, cols.table_name, cols.column_name, cols.full_data_type, cols.is_nullable, cols.ordinal_position, vlttbls.target_table_status, vlttbls.new_data_available
    from system.information_schema.columns cols
    inner join new_tables_new_versions_in_vault vlttbls
    on cols.table_catalog = vlttbls.catalog_name
    and cols.table_schema = vlttbls.schema_name
    and cols.table_name = vlttbls.table_name
)
select * from vault_tables_and_columns 
order by catalog_name, schema_name, table_name, ordinal_position """

# Keep only the tables that need work: targets that do not exist yet, or
# existing targets whose source table has new data available.
target_is_new = col("target_table_status") == "new"
target_exists = col("target_table_status") == "existing"
source_has_new_data = col("new_data_available") == "yes"

df_vault_tables_columns = spark.sql(vault_tables_columns).filter(
    target_is_new | (target_exists & source_has_new_data)
)

- df_tables_to_load contains the list of new tables and of existing tables with new data in the vault catalog that need to be loaded into the non-PCI/PII catalog

In [0]:
# Derive the set of tables to process from the column-level metadata.
# Each row is later handed to process_single_table, which reads catalog_name, schema_name,
# table_name, target_table_status and new_data_available from it.
df_tables_to_load = secure_core_data_loader.get_distinct_tables_to_load(df_vault_tables_columns)
#display(df_tables_to_load)


- Fetch the column tags from system.information_schema.column_tags

In [0]:
# Column tags for the source (vault) catalog; used later when generating the
# tag-propagation statements for each table.
df_col_tags = secure_core_data_loader.get_column_tags(source_catalog)
#display(df_col_tags)

The cell below defines `process_single_table`, which creates (if needed) and loads a single table. The list below describes the related helper functions; note that it refers to the landing and vault catalogs and some names differ from the `SecureCoreDataLoader` methods called in the cell, so it may be out of date for this notebook (to be confirmed):
- **get_cdf_enabled_status** - For any table created in the landing catalog, check if CDF is enabled. CDF will be used for incrementally loading new data to tables in vault catalog.
- **enable_cdf** - Enable CDF for tables in landing catalog and add an entry to the landing_tables_version_tracking_table.
- **get_source_table_columns_datatype** - This function is used for fetching all the column data types to generate the CREATE TABLE statement in the vault catalog.
- **get_column_tags** - This function is for fetching the column tags in the tables in the landing catalog to propagate the same to the tables in vault catalog.
- **get_pci_pii_columns** - Get the list of PCI/PII columns based on tags for tables in landing catalog. This is used for generating the hash columns for the respective PCI/PII columns.
- **generate_hash_statements** - Generate the hash statement for the PCI/PII attributes. The hash is generated using SHA512 algorithm by concatenating the decrypted DSK and the PCI/PII attribute.
- **generate_create_table_statement** - Generate the CREATE TABLE statement for creating new tables in vault catalog and also enable CDF.
- **generate_insert_statement** - Generate the INSERT statement to insert data from tables in landing catalog to tables in vault catalog. If CDF is enabled, the SELECT clause uses the CDF table syntax (table_changes('table_name', version_no) WHERE _change_type = 'insert'). If CDF is not enabled, it will be SELECT * FROM table_name.
- **update_landing_tracking_table_statement** - Function to update the tracking table maintained for the tables in the landing catalog. Update the last read version from the table in the landing catalog. This is used to identify if there is a new version of data available.
- **add_entry_to_vault_tracking_table** - Function to add an entry in the tracking table maintained for the tables in the vault catalog. This will be called after running the CREATE TABLE statement. Hence, the last loaded version will be defaulted to 0.
- **apply_masking_function_statement** - Function to apply the masking function to the PCI/PII attributes in the tables in vault catalog.
- **propagate_tags_statement** - Generate ALTER statement to propagate the tags from tables in landing catalog to tables in vault catalog.

In [0]:
def process_single_table(row, df_vault_tables_columns, df_col_tags, target_catalog, secure_core_data_loader, vault_to_anonymized_load_tracking_table):
    """Create (when needed) and load one target table from its source table.

    Steps, in order:
      1. If the target table is 'new', generate and run its CREATE TABLE statement.
      2. If the target is 'new', or 'existing' with new data available:
         a. read ``last_loaded_version`` for the source table from the tracking table,
         b. run the generated INSERT statement,
         c. run the generated tag-propagation statements,
         d. run the generated tracking-table UPDATE statement.

    Relies on two notebook-level globals: ``spark`` and ``secure_core_catalog_name``.

    Args:
        row: Mapping-like row with keys ``catalog_name``, ``schema_name``,
            ``table_name``, ``target_table_status`` and ``new_data_available``.
        df_vault_tables_columns: Column metadata, passed through to
            ``collect_source_table_columns_datatype``.
        df_col_tags: Column tags, passed through to ``collect_column_tags``.
        target_catalog: Catalog in which the table is created and loaded.
        secure_core_data_loader: Object providing the statement generators used below.
        vault_to_anonymized_load_tracking_table: Name of the tracking table, looked up
            as ``<secure_core_catalog_name>.default.<name>``.

    Raises:
        IndexError: If the tracking table holds no row for the source table. The
            message names the table so the failure can be traced when this function
            runs on a worker thread.
    """
    # Table identity and load flags for this unit of work.
    source_catalog = row["catalog_name"]
    schema = row["schema_name"]
    table = row["table_name"]
    target_table_status = row["target_table_status"]
    new_data_available = row["new_data_available"]

    # Passed unchanged to the statement generators: no hash statements, CDF flag on.
    hash_statements = {}
    cdf_enabled = True

    # Column names / data types of the source table.
    columns_datatype = secure_core_data_loader.collect_source_table_columns_datatype(
        source_catalog, schema, table, df_vault_tables_columns
    )

    is_new_target = target_table_status == "new"

    # A target that does not exist yet is created first.
    if is_new_target:
        target_table = f"`{target_catalog}`.`{schema}`.`{table}`"
        create_target_table_statement = secure_core_data_loader.generate_create_table_statement(
            source_catalog, schema, table, columns_datatype, hash_statements, target_table
        )
        spark.sql(create_target_table_statement)

    # Load only new targets, or existing targets whose source has new data.
    needs_load = is_new_target or (target_table_status == "existing" and new_data_available == "yes")
    if not needs_load:
        return

    # Last version of the source table that was already loaded.
    last_loaded_version_query = f"""
            SELECT last_loaded_version FROM `{secure_core_catalog_name}`.default.`{vault_to_anonymized_load_tracking_table}`
            WHERE catalog_name = '{source_catalog}' AND schema_name = '{schema}' AND table_name = '{table}'
        """
    version_rows = spark.sql(last_loaded_version_query).collect()
    if not version_rows:
        # Same exception type as the bare [0][0] lookup would raise, but with
        # enough context to identify the table and the tracking table involved.
        raise IndexError(
            f"No last_loaded_version row in `{secure_core_catalog_name}`.default."
            f"`{vault_to_anonymized_load_tracking_table}` for source table "
            f"`{source_catalog}`.`{schema}`.`{table}`"
        )
    last_loaded_version = version_rows[0][0]

    # Generate every statement up front, then execute them.
    insert_statements = secure_core_data_loader.generate_insert_statement(
        source_catalog, schema, table, target_catalog, columns_datatype, hash_statements, last_loaded_version, cdf_enabled
    )
    update_tracking_table_statements = secure_core_data_loader.update_tracking_table_statement(
        source_catalog, schema, table, vault_to_anonymized_load_tracking_table
    )
    column_tags = secure_core_data_loader.collect_column_tags(source_catalog, schema, table, df_col_tags)
    propagate_tags_statements = list(
        secure_core_data_loader.propagate_tags_statement(schema, table, target_catalog, column_tags, hash_statements)
    )

    # Insert the data into the target table.
    spark.sql(insert_statements)

    # Apply the propagated tags.
    for apply_tags in propagate_tags_statements:
        spark.sql(apply_tags)

    # Record the new version only after the insert and the tagging have succeeded.
    spark.sql(update_tracking_table_statements)

In [0]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_tables_parallel(df_tables_to_load, df_vault_tables_columns, df_col_tags, target_catalog, secure_core_data_loader, vault_to_anonymized_load_tracking_table, max_workers=10):
    """Run ``process_single_table`` for every row of ``df_tables_to_load`` on a thread pool.

    Every table is attempted. Each failure is printed together with the table it
    belongs to, and once all tasks have finished the first failure observed (in
    completion order) is re-raised so the notebook run still fails.

    Args:
        df_tables_to_load: DataFrame whose rows each describe one table to process;
            rows must expose ``catalog_name``, ``schema_name`` and ``table_name``.
        df_vault_tables_columns: Passed through to ``process_single_table``.
        df_col_tags: Passed through to ``process_single_table``.
        target_catalog: Passed through to ``process_single_table``.
        secure_core_data_loader: Passed through to ``process_single_table``.
        vault_to_anonymized_load_tracking_table: Passed through to ``process_single_table``.
        max_workers: Size of the thread pool.

    Raises:
        Exception: Whatever the first failing ``process_single_table`` call raised.
    """
    # Bring the work list to the driver; one row per table.
    rows = df_tables_to_load.collect()

    failures = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which row each future belongs to so failures can be attributed.
        future_to_row = {
            executor.submit(
                process_single_table, row, df_vault_tables_columns, df_col_tags,
                target_catalog, secure_core_data_loader, vault_to_anonymized_load_tracking_table
            ): row
            for row in rows
        }

        for future in as_completed(future_to_row):
            error = future.exception()
            if error is None:
                continue
            failed_row = future_to_row[future]
            table_label = f"{failed_row['catalog_name']}.{failed_row['schema_name']}.{failed_row['table_name']}"
            print(f"Error processing table {table_label}: {error!r}")
            failures.append(error)

    if failures:
        print(f"{len(failures)} of {len(rows)} table(s) failed.")
        # Fail the run with the first observed error, as a bare future.result() would.
        raise failures[0]

In [0]:
# Run the load for every selected table, using up to 10 worker threads.
process_tables_parallel(
    df_tables_to_load=df_tables_to_load,
    df_vault_tables_columns=df_vault_tables_columns,
    df_col_tags=df_col_tags,
    target_catalog=target_catalog,
    secure_core_data_loader=secure_core_data_loader,
    vault_to_anonymized_load_tracking_table=vault_to_anonymized_load_tracking_table,
    max_workers=10,
)