In [0]:
from pyspark.sql.functions import lit
from pyspark.sql.utils import AnalysisException


# Store metadata of data within tables
This includes table versions, the operations/transformations performed on each table, etc.

In [0]:
# Collect the output of DESCRIBE HISTORY for every table in the catalog into a
# single DataFrame and persist it as <catalog_name>.silver.dataset_metadata.

# Define your catalog name here
catalog_name = "online_retail"  # Change this to your actual catalog name

# Build the target from catalog_name so that changing the catalog above is
# enough; the target is no longer pinned to a hard-coded catalog.
target_table = f"{catalog_name}.silver.dataset_metadata"

# Columns identifying which table each history row belongs to.
id_columns = ["catalog_name", "schema_name", "table_name"]

metadata_df = None  # Stays None until the first table is described successfully

# Get all schemas in the catalog, skipping information_schema
schema_list = [
    row.databaseName
    for row in spark.sql(f"SHOW SCHEMAS IN {catalog_name}").collect()
    if row.databaseName != 'information_schema'
]

# Loop through schemas
for schema in schema_list:
    # Get all tables in the schema. `_sqldf` is skipped, as is anything with
    # "metadata" in its name (which covers the table written by this cell).
    table_list = [
        row.tableName
        for row in spark.sql(f"SHOW TABLES IN {catalog_name}.{schema}").collect()
        if row.tableName != '_sqldf' and "metadata" not in row.tableName
    ]

    for table in table_list:
        try:
            # Run DESCRIBE HISTORY on each table
            history_df = spark.sql(f"DESCRIBE HISTORY {catalog_name}.{schema}.{table}")

            # Add catalog, schema, and table name columns
            history_df = (
                history_df
                .withColumn("catalog_name", lit(catalog_name))
                .withColumn("schema_name", lit(schema))
                .withColumn("table_name", lit(table))
            )

            # Reorder columns: catalog_name, schema_name, table_name first
            other_columns = [name for name in history_df.columns if name not in id_columns]
            history_df = history_df.select(*id_columns, *other_columns)

            # Append to the final DataFrame. unionByName matches columns by name,
            # so a different column order in one table's output cannot silently
            # put values under the wrong header (plain union is positional).
            metadata_df = history_df if metadata_df is None else metadata_df.unionByName(history_df)

        except AnalysisException as e:
            # Best effort: objects the command cannot describe are reported and skipped.
            print(f"Skipping {catalog_name}.{schema}.{table} due to error: {e}")

# Guard the write: with no eligible tables (or all of them skipped) metadata_df
# is still None and calling .write on it would raise AttributeError.
if metadata_df is None:
    print(f"No table history collected from catalog '{catalog_name}'; {target_table} was not written.")
else:
    metadata_df.write.mode("overwrite").saveAsTable(target_table)


# Store metadata of tables
This gives details of each table such as its format, storage location, size, partition columns, etc.

In [0]:
# Collect the output of DESCRIBE DETAIL for every table in the catalog into a
# single DataFrame and persist it as <catalog_name>.silver.table_detail_metadata.

# Define your catalog name here
catalog_name = "online_retail"  # Change this to your actual catalog name

# Build the target from catalog_name so that changing the catalog above is
# enough; the target is no longer pinned to a hard-coded catalog.
target_table = f"{catalog_name}.silver.table_detail_metadata"

# Columns identifying which table each detail row belongs to.
id_columns = ["catalog_name", "schema_name", "table_name"]

table_detail_metadata_df = None  # Stays None until the first table is described successfully

# Get all schemas in the catalog, skipping information_schema
schema_list = [
    row.databaseName
    for row in spark.sql(f"SHOW SCHEMAS IN {catalog_name}").collect()
    if row.databaseName != 'information_schema'
]

# Loop through schemas
for schema in schema_list:
    # Get all tables in the schema. `_sqldf` is skipped, as is anything with
    # "metadata" in its name (which covers the table written by this cell).
    table_list = [
        row.tableName
        for row in spark.sql(f"SHOW TABLES IN {catalog_name}.{schema}").collect()
        if row.tableName != '_sqldf' and "metadata" not in row.tableName
    ]

    for table in table_list:
        try:
            # Run DESCRIBE DETAIL on each table
            detail_df = spark.sql(f"DESCRIBE DETAIL {catalog_name}.{schema}.{table}")

            # Add catalog, schema, and table name columns
            detail_df = (
                detail_df
                .withColumn("catalog_name", lit(catalog_name))
                .withColumn("schema_name", lit(schema))
                .withColumn("table_name", lit(table))
            )

            # Reorder columns: catalog_name, schema_name, table_name first
            other_columns = [name for name in detail_df.columns if name not in id_columns]
            detail_df = detail_df.select(*id_columns, *other_columns)

            # Append to the final DataFrame. unionByName matches columns by name,
            # so a different column order in one table's output cannot silently
            # put values under the wrong header (plain union is positional).
            table_detail_metadata_df = (
                detail_df
                if table_detail_metadata_df is None
                else table_detail_metadata_df.unionByName(detail_df)
            )

        except AnalysisException as e:
            # Best effort: objects the command cannot describe are reported and skipped.
            print(f"Skipping {catalog_name}.{schema}.{table} due to error: {e}")

# Guard the write: with no eligible tables (or all of them skipped) the
# accumulator is still None and calling .write on it would raise AttributeError.
if table_detail_metadata_df is None:
    print(f"No table details collected from catalog '{catalog_name}'; {target_table} was not written.")
else:
    table_detail_metadata_df.write.mode("overwrite").saveAsTable(target_table)


# Store metadata of columns within tables
This gives the data types of the columns within each table, their comments, etc.

In [0]:
# Collect the output of DESCRIBE EXTENDED for every table in the catalog into a
# single DataFrame and persist it as <catalog_name>.silver.table_extended_metadata.

# Define your catalog name here
catalog_name = "online_retail"  # Change this to your actual catalog name

# Build the target from catalog_name so that changing the catalog above is
# enough; the target is no longer pinned to a hard-coded catalog.
target_table = f"{catalog_name}.silver.table_extended_metadata"

# Columns identifying which table each description row belongs to.
id_columns = ["catalog_name", "schema_name", "table_name"]

table_extended_metadata_df = None  # Stays None until the first table is described successfully

# Get all schemas in the catalog, skipping information_schema
schema_list = [
    row.databaseName
    for row in spark.sql(f"SHOW SCHEMAS IN {catalog_name}").collect()
    if row.databaseName != 'information_schema'
]

# Loop through schemas
for schema in schema_list:
    # Get all tables in the schema. `_sqldf` is skipped, as is anything with
    # "metadata" in its name (which covers the table written by this cell).
    table_list = [
        row.tableName
        for row in spark.sql(f"SHOW TABLES IN {catalog_name}.{schema}").collect()
        if row.tableName != '_sqldf' and "metadata" not in row.tableName
    ]

    for table in table_list:
        try:
            # Run DESCRIBE EXTENDED on each table
            extended_df = spark.sql(f"DESCRIBE EXTENDED {catalog_name}.{schema}.{table}")

            # Add catalog, schema, and table name columns
            extended_df = (
                extended_df
                .withColumn("catalog_name", lit(catalog_name))
                .withColumn("schema_name", lit(schema))
                .withColumn("table_name", lit(table))
            )

            # Reorder columns: catalog_name, schema_name, table_name first
            other_columns = [name for name in extended_df.columns if name not in id_columns]
            extended_df = extended_df.select(*id_columns, *other_columns)

            # Append to the final DataFrame. unionByName matches columns by name,
            # so a different column order in one table's output cannot silently
            # put values under the wrong header (plain union is positional).
            table_extended_metadata_df = (
                extended_df
                if table_extended_metadata_df is None
                else table_extended_metadata_df.unionByName(extended_df)
            )

        except AnalysisException as e:
            # Best effort: objects the command cannot describe are reported and skipped.
            print(f"Skipping {catalog_name}.{schema}.{table} due to error: {e}")

# Guard the write: with no eligible tables (or all of them skipped) the
# accumulator is still None and calling .write on it would raise AttributeError.
if table_extended_metadata_df is None:
    print(f"No column metadata collected from catalog '{catalog_name}'; {target_table} was not written.")
else:
    table_extended_metadata_df.write.mode("overwrite").saveAsTable(target_table)