Run the `register_widgets` notebook so that the functions used to create this notebook's widgets are available.

In [0]:
%run ./register_widgets

Define the widgets for all the variables that can be parameterized

Retrieve the values of all the defined widgets as a dictionary.

Calling `register_widgets()` creates all the widgets in the notebook. `get_parameters()` is then used to retrieve all the widget values as a dictionary.

In [0]:
# Create the notebook widgets. register_widgets() is presumably defined by the
# register_widgets notebook loaded with %run in the first cell - confirm there.
register_widgets()
# Capture the current widget values; later cells look values up in `params` by widget name.
params = get_parameters()

In [0]:
# Unpack the two widget values this notebook works with: the volume folder that
# holds the classification manifest files, and the name of the landing catalog.
(
    envlp_encryp_classifier_volume_location,
    envlp_encryp_landing_catalog_name,
) = (
    params['envlp_encryp_classifier_volume_location'],
    params['envlp_encryp_landing_catalog_name'],
)

The cells below read the manifest file(s) (JSON) that define the schemas, tables and columns registered in Unity Catalog, and check against the system tables whether the manifest matches the actual schema and table definitions. If there is a mismatch, the subsequent steps will not be executed.

In [0]:
from pyspark.sql.functions import explode, col, create_map, struct, lit, concat
from pyspark.sql.types import StructType

# Step 1: Read the attribute classification JSON files from the volume (multi-line JSON enabled).
# There can be either one file for all schemas and tables, or multiple files (e.g. one per schema).
# The nested manifest is flattened schemas -> tables -> attributes, and attributes
# that carry no tags are dropped.
df_attribute_classifications = (
    spark.read
    .option("multiLine", True)
    .json(f"{envlp_encryp_classifier_volume_location}/*.json")
    .select(explode("schemas").alias("schemas"))
    .select(
        col("schemas.schema_name"),
        explode("schemas.tables").alias("tables"),
    )
    .select(
        col("schema_name"),
        col("tables.table_name"),
        explode("tables.attributes").alias("attributes"),
    )
    .select(
        col("schema_name"),
        col("table_name"),
        col("attributes.attribute_name").alias("column_name"),
        col("attributes.tags").alias("tags"),
    )
    .filter(col("tags").isNotNull())
)

# Step 2: Convert the 'tags' struct into a map of tag name -> tag value so that it can be
# exploded into one row per tag.
# Every value is cast to string: create_map needs a single value type, and the tag values
# are later compared with (and written back as) string tag values. Without the cast, a
# manifest that mixes value types across tags (e.g. a string and a boolean) can fail here.
tags = df_attribute_classifications.schema["tags"].dataType
if isinstance(tags, StructType):
  kv_pairs = []
  for field in tags.fields:
    # getField addresses the struct member by its exact name, so no column-path parsing
    # is applied to the tag name.
    kv_pairs.extend([lit(field.name), col("tags").getField(field.name).cast("string")])
  df_attribute_classifications = df_attribute_classifications.withColumn("tags", create_map(*kv_pairs))
# When 'tags' is not a struct the column is passed through unchanged; the explode below
# then requires it to already be a map.

# Step 3: Explode the map of tags into individual rows (key = tag_name, value = tag_value),
# dropping tags that have no value for the attribute. The result is aliased so that later
# joins can qualify its columns as "attribute_classifications.<column>".
df_attribute_classifications = (
    df_attribute_classifications
    .select(
        "schema_name",
        "table_name",
        "column_name",
        explode("tags").alias("tag_name", "tag_value"),
    )
    .filter(col("tag_value").isNotNull())
    .alias("attribute_classifications")
)

Get the schema, table and column definitions from the system tables to evaluate whether the definitions in the manifest file match the definitions in the landing catalog.

In [0]:
# Validation: get every column of every schema/table in the landing catalog from the system
# tables, to compare with the manifest file.
# The catalog name is applied as a DataFrame filter rather than being pasted into a SQL
# string, so quotes or other special characters in the widget value cannot alter the query.
df_landing_catalog_table_columns = (
    spark.table("system.information_schema.columns")
    .filter(col("table_catalog") == lit(envlp_encryp_landing_catalog_name))
    .select(
        col("table_schema").alias("schema_name"),
        col("table_name"),
        col("column_name"),
    )
    .alias("landing_catalog_table_columns")
)

join_keys = ["schema_name", "table_name", "column_name"]

# Left join all the attributes from the manifest files with the table columns from the
# landing catalog; manifest rows without a catalog counterpart get NULLs on the right side.
df_tags = df_attribute_classifications.join(
    df_landing_catalog_table_columns,
    on=[
        col(f"attribute_classifications.{k}") == col(f"landing_catalog_table_columns.{k}")
        for k in join_keys
    ],
    how="left",
)

# Check for any mismatch between the schema, table and column definitions in the manifest
# file and the landing catalog.
# The NULL check on the right-hand side must run BEFORE the projection: once the manifest
# columns are selected and de-duplicated, the landing catalog columns are no longer part of
# the DataFrame and cannot be referenced.
df_mismatch_classification = (
    df_tags
    .filter(col("landing_catalog_table_columns.schema_name").isNull())
    .select(
        "attribute_classifications.schema_name",
        "attribute_classifications.table_name",
        "attribute_classifications.column_name",
    )
    .distinct()
    .withColumn("reason", lit("The combination of schema_name, table_name, column_name does not exist in landing catalog"))
)
display(df_mismatch_classification)

schema_name,table_name,column_name,reason


If no mismatch is identified, retrieve the existing column tags of all the tables in the landing catalog and compare them with the column tags defined in the manifest file. If new tag definitions are identified in the manifest file, create and execute ALTER statements that set those tags on the corresponding table columns in Unity Catalog.

In [0]:
def _quote_identifier(name):
  """Return `name` wrapped in backticks, with any embedded backtick doubled."""
  return "`" + str(name).replace("`", "``") + "`"


def _sql_string_literal(value):
  """Return `value` as a single-quoted SQL string literal.

  Backslashes and single quotes are backslash-escaped so that the value cannot terminate
  the literal early. NOTE(review): this assumes the default SQL parser setting in which
  backslash is the escape character inside string literals - confirm for this workspace.
  """
  escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
  return f"'{escaped}'"


# Tag sync: only when the manifest matched the landing catalog, compare the tags in the
# manifest file with the tags already present on the landing catalog columns (read from the
# system tables) and add the ones that are missing.
if df_mismatch_classification.count() == 0:

  # The catalog name is applied as a DataFrame filter instead of being interpolated into a
  # SQL string.
  df_landing_catalog_tags = (
      spark.table("system.information_schema.column_tags")
      .filter(col("catalog_name") == lit(envlp_encryp_landing_catalog_name))
      .select("schema_name", "table_name", "column_name", "tag_name", "tag_value")
      .alias("landing_catalog_tags")
  )

  join_keys = ["schema_name", "table_name", "column_name", "tag_name", "tag_value"]

  # Left join the manifest tags with the landing catalog tags on the full key, including the
  # tag value: a tag that exists with a different value is therefore treated as missing and
  # gets set again with the manifest value.
  df_tags = df_attribute_classifications.join(
      df_landing_catalog_tags,
      on=[
          col(f"attribute_classifications.{k}") == col(f"landing_catalog_tags.{k}")
          for k in join_keys
      ],
      how="left",
  )

  # Keep only manifest rows with no counterpart in the landing catalog. The filter runs
  # before the projection because it references a right-hand-side column.
  df_missing_tags_in_landing_catalog = (
      df_tags
      .filter(col("landing_catalog_tags.tag_value").isNull())
      .select("attribute_classifications.*")
  )

  # For new tags identified in the manifest file, create and execute ALTER statements to add
  # the tags to the table columns in the landing catalog.
  rows = df_missing_tags_in_landing_catalog.select(
      "schema_name", "table_name", "column_name", "tag_name", "tag_value"
  ).collect()

  # Every identifier (including the column name) is backtick-quoted and every tag
  # name/value is escaped, so special characters cannot break or alter the statement.
  alter_statements = [
    f"ALTER TABLE {_quote_identifier(envlp_encryp_landing_catalog_name)}"
    f".{_quote_identifier(row['schema_name'])}.{_quote_identifier(row['table_name'])} "
    f"ALTER COLUMN {_quote_identifier(row['column_name'])} "
    f"SET TAGS ({_sql_string_literal(row['tag_name'])} = {_sql_string_literal(row['tag_value'])})"
    for row in rows
  ]

  # Best effort: a failing statement is reported and the remaining statements still run.
  for stmt in alter_statements:
    try:
      spark.sql(stmt)
    except Exception as e:
      print(f"Failed Executing Statement: {stmt}\n Error: {e}")
else:
  print("There are configuration mismatch between attributes in manifest file and landing catalog ")