# Bulk AI-generate descriptions for all tables in a specified catalog and list of schemas

In [0]:
# Catalog whose information_schema is queried and whose tables receive comments.
catalog = "sandbox"
# Schemas inside that catalog to scan, in the order they are processed.
schemas = [
    "bronze",
    "silver",
    "gold",
]

def sanitize_comment(text):
    """Prepare generated text for embedding in a single-quoted SQL string literal.

    Args:
        text: The raw description, or a falsy value (``None`` / empty string).

    Returns:
        ``"No description available."`` when ``text`` is falsy; otherwise the
        text flattened to a single line, with backslashes and single quotes
        escaped and backticks removed.
    """
    if not text:
        return "No description available."
    # Flatten newlines, carriage returns and tabs so the comment is one line.
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    # Spark SQL treats a backslash inside a string literal as an escape
    # character: a raw backslash would be swallowed, and a trailing one would
    # escape the closing quote. Double it so it survives as a literal.
    text = text.replace("\\", "\\\\")
    text = text.replace("'", "''")
    text = text.replace("`", "")
    return text

def is_valid_identifier(name):
    """Return True when ``name`` can be used as an unquoted SQL identifier.

    A valid name consists only of ASCII letters, digits and underscores and
    does not start with a digit. Anything that is not a string is rejected.

    Args:
        name: Candidate identifier.

    Returns:
        bool: ``True`` if the whole of ``name`` matches the pattern.
    """
    import re  # kept local, as in the original definition

    if not isinstance(name, str):
        return False
    # fullmatch rather than match + '$': '$' also matches just before a
    # trailing newline, which would wrongly accept "name\n".
    return re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', name) is not None

# Every (schema, table) pair the loop visits is recorded here, including
# tables that end up skipped or whose update fails.
processed_tables = []

print(f"Found schemas: {schemas}")

for schema in schemas:
    # List the schema's tables, leaving out names that start with
    # information_schema, sys or system (case-insensitive).
    tables_df = spark.sql(f"""
        SELECT DISTINCT table_name
        FROM {catalog}.information_schema.tables
        WHERE table_catalog = '{catalog}'
          AND table_schema = '{schema}'
          AND lower(table_name) NOT LIKE 'information_schema%'
          AND lower(table_name) NOT LIKE 'sys%'
          AND lower(table_name) NOT LIKE 'system%'
    """)
    tables = [row['table_name'] for row in tables_df.collect()]
    print(f"Schema {schema} has tables: {tables}")

    for table in tables:
        # One "name: type" line per column; this is the only table metadata
        # placed in the prompt.
        col_meta_df = spark.sql(f"""
            SELECT CONCAT(column_name, ': ', data_type) AS col_desc
            FROM {catalog}.information_schema.columns
            WHERE table_catalog = '{catalog}'
              AND table_schema = '{schema}'
              AND table_name = '{table}'
        """)
        columns_description = "\n".join(row['col_desc'] for row in col_meta_df.collect())

        prompt = (
            f"You are a data documentation assistant. Given the following columns, infer the purpose and typical use cases of the table. "
            f"Write a concise, natural-language description of the table's contents and intent, not just a list of fields. "
            f"Start your description with 'This table...'. "
            f"For example: 'This table contains records of client activities related to debt collection. It includes details such as the client ID, debtor number, activity date, and notes made by collectors. This data can be used to track collection efforts, analyze collector performance, and understand client interactions over time.'\n"
            # Terminate the instruction so it does not run into the
            # "Columns:" header that follows.
            f"Do not use any special characters.\n"
            f"Columns:\n{columns_description}"
        )
        # The prompt is embedded in a single-quoted SQL literal below.
        escaped_prompt = prompt.replace("'", "''")

        ai_result = spark.sql(f"""
            SELECT ai_query(
              'databricks-llama-4-maverick',
              '{escaped_prompt}'
            ) AS comment
        """).collect()[0]['comment']

        escaped_ai_result = sanitize_comment(ai_result)

        table_info = spark.sql(f"DESCRIBE TABLE EXTENDED `{catalog}`.`{schema}`.`{table}`")
        # A table column that is itself named "Type" also produces a row with
        # col_name == 'Type', so stopping at the first such row can pick up a
        # column's data type instead of the object type. Prefer the "Type" row
        # that follows the "# Detailed Table Information" marker; a match seen
        # before the marker is only kept when the marker never appears.
        # NOTE(review): the marker text is assumed from Spark's DESCRIBE
        # EXTENDED layout -- confirm on the target runtime.
        obj_type = None
        in_details = False
        for row in table_info.collect():
            if row['col_name'] == '# Detailed Table Information':
                in_details = True
                obj_type = None
            elif row['col_name'] == 'Type':
                obj_type = str(row['data_type']).upper()
                if in_details:
                    break
        is_streaming = obj_type is not None and 'STREAMING' in obj_type

        # Try to update comment, and if a parse error occurs, try quoting the table name differently
        updated = False
        if is_streaming:
            print(f"[SKIPPED] {schema}.{table} is a streaming table. Cannot update comment via SQL.\n"
                  f"Suggested comment:\n{ai_result}\n")
        elif obj_type and 'VIEW' in obj_type:
            print(f"[SKIPPED] {schema}.{table} is a VIEW. COMMENT ON VIEW not supported.\n"
                  f"Suggested comment:\n{ai_result}\n")
        elif obj_type:
            # Streaming tables and views were handled above, so any other
            # known type is treated as a regular table.
            try:
                # Try with fully quoted identifiers
                spark.sql(f"COMMENT ON TABLE `{catalog}`.`{schema}`.`{table}` IS '{escaped_ai_result}'")
                print(f"Updated comment for table {schema}.{table}")
                updated = True
            except Exception as e:
                error_msg = str(e)
                # If parse error, try with unquoted table name if it is a valid identifier
                if "[PARSE_SYNTAX_ERROR]" in error_msg and is_valid_identifier(table):
                    try:
                        spark.sql(f"COMMENT ON TABLE {catalog}.{schema}.{table} IS '{escaped_ai_result}'")
                        print(f"Updated comment for table {schema}.{table} (unquoted fallback)")
                        updated = True
                    except Exception as e2:
                        print(f"[ERROR] Failed to update comment for {schema}.{table} (unquoted fallback): {e2}")
                else:
                    print(f"[ERROR] Failed to update comment for {schema}.{table}: {e}")
            if not updated:
                print(f"[SKIPPED] {schema}.{table} could not be updated due to syntax issues. Suggested comment:\n{ai_result}\n")
        else:
            # No "Type" row was found in the DESCRIBE output.
            print(f"[SKIPPED] {schema}.{table} has unsupported type {obj_type}. Skipping.")

        processed_tables.append((schema, table))

# Show the comment currently stored for every table visited above.
if not processed_tables:
    print('No tables were processed.')
else:
    # One "(schema AND table)" predicate per visited table, with single
    # quotes in the names doubled for the SQL literal.
    where_clause = " OR ".join(
        "(table_schema = '{}' AND table_name = '{}')".format(
            schema_name.replace("'", "''"),
            table_name.replace("'", "''"),
        )
        for schema_name, table_name in processed_tables
    )
    info_schema_comments = spark.sql(f"""
        SELECT table_schema, table_name, comment
        FROM {catalog}.information_schema.tables
        WHERE table_catalog = '{catalog}'
          AND ({where_clause})
    """)
    display(info_schema_comments)

# Bulk AI-generate column-level descriptions for all tables in a specified catalog and list of schemas

In [0]:
import re

# Catalog whose tables get column comments.
catalog = "sandbox"
# Schemas inside that catalog to walk, in processing order.
schemas = [
    "bronze",
    "silver",
    "gold",
]

def escape_sql_string(s):
    """Escape ``s`` for use inside a single-quoted SQL string literal.

    Args:
        s: Text to escape, or ``None``.

    Returns:
        ``None`` when ``s`` is ``None``; otherwise the text with backslashes
        and single quotes doubled, newlines and carriage returns replaced by
        spaces, and surrounding whitespace stripped.
    """
    if s is None:
        return None
    # Spark SQL treats a backslash in a string literal as an escape character;
    # double it so it is kept literally and cannot escape the closing quote.
    s = s.replace("\\", "\\\\")
    s = s.replace("'", "''")
    # An apostrophe directly between two word characters (as in "don't") is
    # dropped altogether rather than kept as an escaped quote.
    # NOTE(review): this also alters names such as O'Brien -- confirm intent.
    s = re.sub(r"(\w)''(\w)", r"\1\2", s)
    s = s.replace("\n", " ").replace("\r", " ")
    return s.strip()

def sanitize_for_sql_comment(s):
    """Remove every single quote from ``s`` so it cannot end a SQL string literal.

    Args:
        s: Text to clean, or ``None``.

    Returns:
        ``None`` when ``s`` is ``None``; otherwise ``s`` without any ``'``
        characters. All other characters are left as they are.
    """
    if s is None:
        return None
    # Quoted upper-case tokens such as ''YES'' need no separate pass:
    # stripping all single quotes yields the same text.
    return s.replace("'", "")

def is_valid_identifier(name):
    """Return True when ``name`` can be used as an unquoted SQL identifier.

    Only ASCII letters, digits and underscores are allowed, and the first
    character must not be a digit. Non-string values are rejected.
    """
    if not isinstance(name, str):
        return False
    # fullmatch, because '^...$' with match() also accepts a trailing newline.
    return re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', name) is not None

# (schema, table, column) triples whose comment was written successfully.
processed_columns = []

for schema in schemas:
    # List the schema's tables, leaving out names that start with
    # information_schema, sys or system (case-insensitive).
    tables_df = spark.sql(f"""
    SELECT DISTINCT table_name
    FROM {catalog}.information_schema.tables
    WHERE table_catalog = '{catalog}'
      AND table_schema = '{schema}'
      AND lower(table_name) NOT LIKE 'information_schema%'
      AND lower(table_name) NOT LIKE 'sys%'
      AND lower(table_name) NOT LIKE 'system%'
    """)
    tables = [row['table_name'] for row in tables_df.collect()]
    print(f"Found tables in {schema}: {tables}")

    for table in tables:
        # A table counts as streaming when a DESCRIBE row named "Type" has a
        # value containing STREAMING.
        table_info = spark.sql(f"""
            DESCRIBE TABLE EXTENDED `{catalog}`.`{schema}`.`{table}`
        """)
        is_streaming = False
        for row in table_info.collect():
            if row['col_name'] == 'Type' and 'STREAMING' in str(row['data_type']).upper():
                is_streaming = True
                break

        # Existing table comment, added to the prompt as extra context.
        # Collect once instead of count() followed by collect(), which ran
        # the same query twice.
        table_comment_rows = spark.sql(f"""
            SELECT comment FROM {catalog}.information_schema.tables
            WHERE table_catalog = '{catalog}'
              AND table_schema = '{schema}'
              AND table_name = '{table}'
        """).collect()
        table_comment = table_comment_rows[0]['comment'] if table_comment_rows else None

        columns_df = spark.sql(f"""
            SELECT column_name, data_type, is_nullable
            FROM {catalog}.information_schema.columns
            WHERE table_catalog = '{catalog}'
              AND table_schema = '{schema}'
              AND table_name = '{table}'
        """)
        columns = columns_df.collect()

        for col in columns:
            col_name = col['column_name']
            col_type = col['data_type']
            col_nullable = col['is_nullable']

            prompt = (
                f"You are a data documentation assistant. Given the following table and column metadata, "
                f"infer the meaning and typical use of the column. Write a concise, natural-language "
                f"description of the column's contents and intent, not just a restatement of the name or type. "
                f"Start your description with 'This column...'. "
                # Terminate the instruction so it does not run into the
                # "Table:" line that follows.
                f"Do not use any special characters.\n"
                f"Table: {table}\n"
                + (f"Table description: {table_comment}\n" if table_comment else "")
                + f"Column: {col_name} ({col_type}, {'nullable' if col_nullable == 'YES' else 'not nullable'})"
            )

            escaped_prompt = escape_sql_string(prompt)
            ai_result_df = spark.sql(f"""
                SELECT ai_query(
                    'databricks-llama-4-maverick',
                    '{escaped_prompt}'
                ) AS comment
            """)
            ai_result = ai_result_df.collect()[0]['comment']
            # Sanitize the AI result to avoid SQL parse errors due to problematic quoting
            sanitized_ai_result = sanitize_for_sql_comment(ai_result)
            escaped_ai_result = escape_sql_string(sanitized_ai_result)

            if is_streaming:
                print(f"[SKIPPED] {schema}.{table}.{col_name} is a streaming table column. You cannot update its comment via SQL.\n"
                      f"To update the comment, edit the column definition in your Lakeflow Declarative Pipeline.\n"
                      f"Suggested comment:\n{ai_result}\n")
            elif escaped_ai_result and col_name:
                updated = False
                try:
                    spark.sql(f"ALTER TABLE `{catalog}`.`{schema}`.`{table}` ALTER COLUMN `{col_name}` COMMENT '{escaped_ai_result}'")
                    print(f"Updated comment for {schema}.{table}.{col_name}")
                    updated = True
                except Exception as e:
                    error_msg = str(e)
                    # Try unquoted fallback if parse error and identifiers are valid
                    if "[PARSE_SYNTAX_ERROR]" in error_msg and is_valid_identifier(table) and is_valid_identifier(col_name):
                        try:
                            spark.sql(f"ALTER TABLE {catalog}.{schema}.{table} ALTER COLUMN {col_name} COMMENT '{escaped_ai_result}'")
                            print(f"Updated comment for {schema}.{table}.{col_name} (unquoted fallback)")
                            updated = True
                        except Exception as e2:
                            print(f"[ERROR] Failed to update comment for {schema}.{table}.{col_name} (unquoted fallback): {e2}")
                    else:
                        print(f"[ERROR] Failed to update comment for {schema}.{table}.{col_name}: {e}")
                # Only successfully updated columns appear in the final report.
                if updated:
                    processed_columns.append((schema, table, col_name))
            else:
                print(f"[SKIPPED] {schema}.{table}.{col_name} - No comment generated or invalid column name.")

# Report the stored comment for each column recorded in processed_columns.
if processed_columns:
    column_predicates = []
    for schema_name, table_name, column_name in processed_columns:
        # Each triple becomes one parenthesised predicate with escaped values.
        column_predicates.append(
            f"(table_schema = '{escape_sql_string(schema_name)}' "
            f"AND table_name = '{escape_sql_string(table_name)}' "
            f"AND column_name = '{escape_sql_string(column_name)}')"
        )
    where_clause = " OR ".join(column_predicates)
    info_schema_col_comments = spark.sql(f"""
        SELECT table_schema, table_name, column_name, comment
        FROM {catalog}.information_schema.columns
        WHERE table_catalog = '{catalog}'
          AND ({where_clause})
    """)
    display(info_schema_col_comments)
else:
    print('No columns were processed.')

# Bulk remove all table descriptions (including any not generated by the cells above) for a specified catalog and list of schemas

In [0]:
# Catalog whose table comments are cleared.
catalog = "sandbox"
# Schemas inside that catalog to walk, in processing order.
schemas = [
    "bronze",
    "silver",
    "gold",
]

# Every (schema, table) pair visited, including streaming tables that were
# skipped.
processed_tables = []

for schema in schemas:
    # Get all tables (every table name that has at least one row in
    # information_schema.columns for this schema).
    tables_df = spark.sql(f"""
    SELECT DISTINCT table_name
    FROM {catalog}.information_schema.columns
    WHERE table_catalog = '{catalog}'
    AND table_schema = '{schema}'
    """)
    tables = [row['table_name'] for row in tables_df.collect()]

    for table in tables:
        # Detect if the table is a streaming table.
        # Identifiers are backtick-quoted, as in the other cells, so names
        # that need quoting are handled too.
        table_info = spark.sql(f"""
            DESCRIBE TABLE EXTENDED `{catalog}`.`{schema}`.`{table}`
        """)
        is_streaming = False
        for row in table_info.collect():
            if row['col_name'] == 'Type' and 'STREAMING' in str(row['data_type']).upper():
                is_streaming = True
                break

        if is_streaming:
            print(f"[SKIPPED] {schema}.{table} is a streaming table. You cannot update its comment via SQL.\n"
                  f"To remove the comment, edit the table definition in your Lakeflow Declarative Pipeline.\n")
        else:
            # Remove comment for regular tables
            spark.sql(f"COMMENT ON TABLE `{catalog}`.`{schema}`.`{table}` IS NULL")
            print(f"Removed comment for table {schema}.{table}")
        processed_tables.append((schema, table))

# Finally, list each processed table with its current comment from the
# information schema.
if processed_tables:
    # Build one predicate per table; single quotes in names are doubled so
    # they stay inside the SQL literal.
    table_predicates = [
        "(table_schema = '" + schema_name.replace("'", "''")
        + "' AND table_name = '" + table_name.replace("'", "''") + "')"
        for schema_name, table_name in processed_tables
    ]
    where_clause = " OR ".join(table_predicates)
    info_schema_comments = spark.sql(f"""
        SELECT table_schema, table_name, comment
        FROM {catalog}.information_schema.tables
        WHERE table_catalog = '{catalog}'
        AND ({where_clause})
    """)
    display(info_schema_comments)
else:
    print('No tables were processed.')

# Bulk remove all column-level descriptions for all tables in a specified catalog and list of schemas

In [0]:
import re

# Catalog whose column comments are cleared.
catalog = "sandbox"
# Schemas inside that catalog to walk, in processing order.
schemas = [
    "bronze",
    "silver",
    "gold",
]

def escape_sql_string(s):
    """Escape ``s`` so it can sit inside a single-quoted SQL string literal.

    Args:
        s: Text to escape, or ``None``.

    Returns:
        ``None`` for ``None`` input; otherwise the escaped text with
        backslashes and single quotes doubled, line breaks turned into
        spaces, and leading/trailing whitespace removed.
    """
    if s is None:
        return None
    # A backslash is an escape character in Spark SQL string literals, so it
    # must be doubled; otherwise it is lost, or escapes the closing quote.
    s = s.replace("\\", "\\\\")
    s = s.replace("'", "''")
    # Apostrophes directly between two word characters are removed entirely
    # instead of being kept as an escaped quote.
    # NOTE(review): confirm this is intended for identifier lookups.
    s = re.sub(r"(\w)''(\w)", r"\1\2", s)
    s = s.replace("\n", " ").replace("\r", " ")
    return s.strip()

# (schema, table, column) triples whose comment was cleared.
processed_columns = []

for schema in schemas:
    # Tables of this schema whose lower-cased name does not start with
    # information_schema, sys or system.
    tables_df = spark.sql(f"""
    SELECT DISTINCT table_name
    FROM {catalog}.information_schema.tables
    WHERE table_catalog = '{catalog}'
      AND table_schema = '{schema}'
      AND lower(table_name) NOT LIKE 'information_schema%'
      AND lower(table_name) NOT LIKE 'sys%'
      AND lower(table_name) NOT LIKE 'system%'
    """)
    tables = [record['table_name'] for record in tables_df.collect()]
    print(f"Found tables in {schema}: {tables}")

    for table in tables:
        # Streaming when some DESCRIBE row named "Type" mentions STREAMING.
        table_info = spark.sql(f"""
            DESCRIBE TABLE EXTENDED `{catalog}`.`{schema}`.`{table}`
        """)
        is_streaming = any(
            detail['col_name'] == 'Type' and 'STREAMING' in str(detail['data_type']).upper()
            for detail in table_info.collect()
        )

        # Names of all columns of the table.
        column_rows = spark.sql(f"""
            SELECT column_name
            FROM {catalog}.information_schema.columns
            WHERE table_catalog = '{catalog}'
              AND table_schema = '{schema}'
              AND table_name = '{table}'
        """).collect()

        for column_row in column_rows:
            col_name = column_row['column_name']

            if is_streaming:
                print(f"[SKIPPED] {schema}.{table}.{col_name} is a streaming table column. You cannot update its comment via SQL.\n"
                      f"To remove the comment, edit the column definition in your Lakeflow Declarative Pipeline.\n")
                continue

            if not col_name:
                print(f"[SKIPPED] {schema}.{table}.{col_name} - Invalid column name.")
                continue

            # The comment is cleared by overwriting it with an empty string.
            spark.sql(f"ALTER TABLE `{catalog}`.`{schema}`.`{table}` ALTER COLUMN `{col_name}` COMMENT ''")
            print(f"Removed comment for {schema}.{table}.{col_name}")
            processed_columns.append((schema, table, col_name))

# Show the current comment of every column that was processed.
if not processed_columns:
    print('No columns were processed.')
else:
    # Each (schema, table, column) triple is escaped and formatted into one
    # parenthesised predicate; the predicates are OR-ed together.
    predicate_template = "(table_schema = '{}' AND table_name = '{}' AND column_name = '{}')"
    where_clause = " OR ".join(
        predicate_template.format(*map(escape_sql_string, entry))
        for entry in processed_columns
    )
    info_schema_col_comments = spark.sql(f"""
        SELECT table_schema, table_name, column_name, comment
        FROM {catalog}.information_schema.columns
        WHERE table_catalog = '{catalog}'
          AND ({where_clause})
    """)
    display(info_schema_col_comments)