In [0]:
def _text_widget(widget_name: str) -> str:
    """Declare a text widget with an empty default and return its current value."""
    dbutils.widgets.text(widget_name, "")
    return dbutils.widgets.get(widget_name)


# Notebook parameters: each one is declared as a text widget and read back
# immediately, in the same order as the widgets are listed here.
environment = _text_widget("environment")
catalog_name = _text_widget("catalog_name")
schema_name = _text_widget("schema_name")
table_name = _text_widget("table_name")

In [0]:
from typing import List, Optional

from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import col, when, count, approx_count_distinct, countDistinct, min, max, length, sum as spark_sum
from pyspark.sql.types import MapType, ArrayType, StructType, IntegerType, LongType, StringType

def _quoted_col(name: str):
    """
    Return a Column reference for the top-level column `name`.

    The name is wrapped in backticks (embedded backticks are doubled) so that
    a name containing dots or other special characters is resolved as a single
    column instead of being parsed as a nested-field path.
    """
    return col("`" + name.replace("`", "``") + "`")


# Function to compute column-level metrics for a given DataFrame
def compute_column_metrics(
    df: DataFrame,
    table_name: str,
    layer: str,
    timestamp,
    proc_date,
    operation,
    deleted,
    inserted,
    updated,
) -> Optional[DataFrame]:
    """
    Computes column-level metrics for a given Spark DataFrame.

    All metrics are gathered with a single aggregation over `df`, then packed
    into one output row together with the table-level metadata passed in.

    Supported Metrics:
    ------------------
    - Null count (all columns)
    - Exact distinct count (all columns)
    - Min and Max values (IntegerType, LongType)
    - Min, Max, and Total length (StringType)

    Parameters:
    -----------
    df : DataFrame
        Input DataFrame to be profiled.
    table_name : str
        Name of the table; copied into the output row.
    layer : str
        Data layer label (e.g., bronze, silver, gold); copied into the output row.
    timestamp : timestamp
        Value stored in the `timestamp` column of the output row.
    proc_date : date
        Value stored in the `proc_date` column of the output row.
    operation : str
        Type of operation (WRITE, MERGE, DELETE, etc.).
    deleted : int
        Number of records deleted. Falsy values (None, 0, "") are stored as 0;
        anything else is converted with int().
    inserted : int
        Number of records inserted (same conversion as `deleted`).
    updated : int
        Number of records updated (same conversion as `deleted`).

    Returns:
    --------
    DataFrame or None
        A single-row DataFrame containing:
        - table metadata
        - array of column-level metric structs (one per column of `df`)
        None is returned when `df` has no columns.
    """
    fields = list(df.schema)
    # Skip if no columns found
    if not fields:
        return None

    # NOTE(review): every column is profiled, including map/array/struct
    # columns. Whether a distinct count over such types is accepted depends on
    # the Spark version in use - confirm against the target runtime.
    integral_types = (IntegerType, LongType)

    # Build the aggregation expressions. Each expression gets a unique alias of
    # the form "<column>__<metric>" so the results can be looked up by name.
    # `min` / `max` here are the pyspark.sql.functions aggregates imported at
    # the top of the file, not the Python builtins.
    agg_exprs = []
    for field in fields:
        name = field.name
        column = _quoted_col(name)

        # Null count and distinct count apply to every column
        agg_exprs.append(count(when(column.isNull(), 1)).alias(f"{name}__nulls"))
        agg_exprs.append(countDistinct(column).alias(f"{name}__distinct"))

        if isinstance(field.dataType, integral_types):
            # Min and max values for integer/long columns
            agg_exprs.append(min(column).alias(f"{name}__min"))
            agg_exprs.append(max(column).alias(f"{name}__max"))
        elif isinstance(field.dataType, StringType):
            # Min, max and total string length for string columns
            value_length = length(column)
            agg_exprs.append(min(value_length).alias(f"{name}__min_length"))
            agg_exprs.append(max(value_length).alias(f"{name}__max_length"))
            agg_exprs.append(spark_sum(value_length).alias(f"{name}__total_length"))

    # A global aggregation always yields exactly one row
    agg_result = df.agg(*agg_exprs).collect()[0]

    # Build one metrics Row per column. Metrics that do not apply to the
    # column's type stay None. The keyword order matches the field order of the
    # column_metrics struct in `result_schema` below.
    column_metrics = []
    for field in fields:
        name = field.name
        is_integral = isinstance(field.dataType, integral_types)
        is_string = isinstance(field.dataType, StringType)
        column_metrics.append(
            Row(
                column_name=name,
                null_count=agg_result[f"{name}__nulls"],
                distinct_count=agg_result[f"{name}__distinct"],
                min_value=agg_result[f"{name}__min"] if is_integral else None,
                max_value=agg_result[f"{name}__max"] if is_integral else None,
                min_length=agg_result[f"{name}__min_length"] if is_string else None,
                max_length=agg_result[f"{name}__max_length"] if is_string else None,
                total_length=agg_result[f"{name}__total_length"] if is_string else None,
            )
        )

    # Schema of the final output DataFrame: table-level metadata plus the
    # array of per-column metric structs
    result_schema = """
        table_name string,
        layer string,
        timestamp timestamp,
        proc_date date,
        operation string,
        deleted_records long,
        inserted_records long,
        updated_records long,
        column_metrics array<struct<
            column_name: string,
            null_count: long,
            distinct_count: long,
            min_value: long,
            max_value: long,
            min_length: int,
            max_length: int,
            total_length: long
        >>
    """
    # Create a single row with results for the table
    output_row = Row(
        table_name=table_name,
        layer=layer,
        timestamp=timestamp,
        proc_date=proc_date,
        operation=operation,
        deleted_records=int(deleted or 0),
        inserted_records=int(inserted or 0),
        updated_records=int(updated or 0),
        column_metrics=column_metrics
    )
    # Return a DataFrame with the specified result schema
    return spark.createDataFrame([output_row], result_schema)


# Main execution
# Holds the single-row profile DataFrame; stays None when nothing was collected
final_df: Optional[DataFrame] = None

try:
    print(f" Processing table: {table_name}")
    full_table = f"{catalog_name}.{schema_name}.{table_name}"
    # Extract 'layer' (bronze/silver/gold) based on schema naming convention:
    # the third "_"-separated token of the schema name, or "unknown" when the
    # schema name has fewer than three tokens
    schema_parts = schema_name.split("_")
    layer = schema_parts[2] if len(schema_parts) > 2 else "unknown"

    # Step 1: Get the latest Delta version history metadata for the table.
    # - Reads DESCRIBE HISTORY, ignoring VACUUM / OPTIMIZE / RESTORE entries.
    # - Extracts version, timestamp, date and operation type, plus the
    #   numDeletedRows, numOutputRows and numTargetRowsUpdated metrics.
    # - Keeps only the most recent remaining entry via ROW_NUMBER().
    # NOTE(review): the widget values are interpolated directly into the SQL
    # text; this assumes they come from a trusted job configuration - confirm.
    hist_query = f"""
        SELECT
          '{table_name}' AS table_name,
          '{layer}' AS layer,
          *
        except
          (rn)
        from
          (
            SELECT
              ROW_NUMBER() OVER (
                ORDER BY
                  version DESC
              ) AS rn,
              version,
              timestamp,
              date(timestamp) as proc_date,
              operation,
              operationMetrics.numDeletedRows as deleted_records,
              operationMetrics.numOutputRows as inserted_records,
              operationMetrics.numTargetRowsUpdated as updated_records
            FROM(
              (DESCRIBE HISTORY {catalog_name}.{schema_name}.{table_name})
          )
          WHERE operation NOT IN ('VACUUM END', 'VACUUM START', 'OPTIMIZE', 'RESTORE')
          )
        WHERE
          rn = 1
        """
    hist_row = spark.sql(hist_query).first()

    if hist_row is None:
        # No qualifying history entry: there is no version to profile, so skip
        # the table instead of failing on the missing row
        print(f" No version history found for {full_table}")
    else:
        # Step 2: Read the table at the specific Delta version
        df_table = (
            spark.read.format("delta")
            .option("versionAsOf", hist_row["version"])
            .table(full_table)
            .cache()
        )
        try:
            # Step 3: Compute the column-level metrics for that version
            df_profile = compute_column_metrics(
                df_table,
                table_name=table_name,
                layer=layer,
                timestamp=hist_row["timestamp"],
                proc_date=hist_row["proc_date"],
                operation=hist_row["operation"],
                deleted=hist_row["deleted_records"],
                inserted=hist_row["inserted_records"],
                updated=hist_row["updated_records"]
            )
        finally:
            # Release the cached data even when profiling fails
            df_table.unpersist()

        # Step 4: Keep the metrics result (None when the table has no columns)
        if df_profile is not None:
            final_df = df_profile
except Exception as e:
    # Best-effort notebook: report the failure and fall through to the
    # "no metrics" message below instead of aborting the run
    print(f" Error processing {schema_name}.{table_name}: {e}")

# Step 5: Write final metrics result to monitoring_logs table in append mode
if final_df is not None:
    final_df.write.mode("append").saveAsTable(f"{catalog_name}.default.monitoring_logs")
    print("\n Monitoring logs written successfully.")
else:
    print("\n No metrics data collected.")

In [0]:
# Query the full monitoring log table and render it in the notebook output
monitoring_logs_df = spark.sql(f"""select * from {catalog_name}.default.monitoring_logs""")
monitoring_logs_df.display()