# Data Processing and Unity Catalog Writing Utilities

This notebook contains utilities for processing data and writing to Unity Catalog.

In [0]:
import re
import pandas as pd
from typing import Dict, Any, List
from datetime import datetime, timezone
from pyspark.sql import functions as F

In [0]:
def banner(txt: str):
    """Print ``txt`` framed by two runs of 22 '=' characters, with a blank line before and after."""
    rule = "=" * 22
    print(f"\n{rule} {txt} {rule}\n")

def _flatten_obj(x: Any, prefix="") -> Dict[str, Any]:
    """Collapse a nested dict/list structure into one flat dict.

    Dict keys are joined to their parent path with ``.`` and list positions are
    appended as ``[i]``. A bare scalar with no path is stored under ``"_value"``.
    Empty dicts and empty lists contribute no entries.
    """
    def _walk(node, path):
        # Depth-first traversal yielding (flattened_key, leaf_value) pairs.
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}.{name}" if path else f"{path}{name}"
                yield from _walk(child, child_path)
        elif isinstance(node, list):
            for pos, child in enumerate(node):
                yield from _walk(child, f"{path}[{pos}]")
        else:
            yield (path or "_value"), node

    flat = {}
    for flat_key, leaf in _walk(x, prefix):
        # A repeated key keeps its first position but takes the latest value.
        flat[flat_key] = leaf
    return flat

def _dedup_columns(cols: List[str]) -> List[str]:
    """Sanitize and de-duplicate column names for Spark / Unity Catalog / Delta.

    Each name is lower-cased, a fixed set of risky characters is replaced by
    ``_``, runs of underscores are collapsed, and leading/trailing underscores
    are stripped. A name that ends up empty (for example ``"."`` or ``"_"``)
    becomes ``"unnamed"`` so that no empty column name is ever produced.

    Names that collide after normalization get a ``__<n>`` suffix (n = 1, 2, ...)
    in order of appearance. Normalized names never contain ``__`` themselves, so
    a suffixed name cannot clash with another normalized name.

    Args:
        cols: Original column names, in order.

    Returns:
        A list of the same length with safe, unique names.
    """
    risky_chars = ".[] :-={}();,\"'\n\t"
    # One translation pass instead of a long chain of str.replace calls.
    to_underscore = str.maketrans(risky_chars, "_" * len(risky_chars))
    collapse_underscores = re.compile(r"_+").sub

    seen = {}
    deduped = []
    for orig in cols:
        name = str(orig).lower().translate(to_underscore)
        name = collapse_underscores("_", name).strip("_")
        if not name:
            # Everything was stripped away; an empty column name is invalid.
            name = "unnamed"
        if name in seen:
            seen[name] += 1
            name = f"{name}__{seen[name]}"
        else:
            seen[name] = 0
        deduped.append(name)
    return deduped

def _convert_complex_to_json(obj: Any, fields_to_preserve: List[str] = None) -> Any:
    """Replace selected nested fields of a record with JSON strings.

    This keeps later flattening from exploding those fields into many columns.

    Args:
        obj: A record. Anything that is not a dict is returned unchanged.
        fields_to_preserve: Top-level keys whose dict/list values should be
            serialized. When ``None``, a built-in list of field names is used.

    Returns:
        A new dict. For every listed key holding a dict or list, the value is
        stored as a JSON string under ``"<key>_json"``. If serialization fails,
        ``str(value)`` is stored under the original key instead. All other
        entries are copied as-is.
    """
    import json

    if not isinstance(obj, dict):
        return obj

    # Default fields that should be kept as JSON strings
    if fields_to_preserve is None:
        fields_to_preserve = [
            'columns', 'properties', 'storage_descriptor', 'partition_keys',
            'view_definition', 'table_constraints', 'row_filter', 'column_mask',
            'data_source_format', 'table_properties', 'schema_properties',
            'catalog_properties', 'storage_location', 'encryption_details',
            'delta_runtime_properties_kvpairs', 'effective_predictive_optimization_flag',
            'pipelines', 'metastore_id'
        ]

    result = {}
    for key, value in obj.items():
        if key in fields_to_preserve and isinstance(value, (dict, list)):
            try:
                result[f"{key}_json"] = json.dumps(value)
            except Exception:
                # Best-effort fallback for values json cannot encode. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                result[key] = str(value)
        else:
            result[key] = value

    return result

def _preprocess_uc_records(records: List[Dict[str, Any]], table_type: str) -> List[Dict[str, Any]]:
    """Turn nested fields of Unity Catalog records into JSON strings before flattening.

    ``'databricks_table'`` records use the default field list of
    ``_convert_complex_to_json``; ``'databricks_schema'`` records only convert
    ``properties`` and ``schema_properties``. Any other ``table_type`` returns
    the input list itself, untouched.
    """
    if table_type == 'databricks_table':
        # None selects the converter's built-in field list.
        preserve = None
    elif table_type == 'databricks_schema':
        preserve = ['properties', 'schema_properties']
    else:
        return records

    converted = []
    for rec in records:
        converted.append(_convert_complex_to_json(rec, preserve))
    return converted

def normalize_records(records: List[Dict[str, Any]]) -> pd.DataFrame:
    """Build a pandas DataFrame from JSON-like rows.

    Every row is flattened to a single level, then the resulting column names
    are sanitized and made unique. An empty or missing input yields an empty
    DataFrame.
    """
    if not records:
        return pd.DataFrame([])

    frame = pd.DataFrame(list(map(_flatten_obj, records)))
    safe_names = _dedup_columns(list(frame.columns))
    frame.columns = safe_names
    return frame

def to_spark_df(records: List[Dict[str, Any]], spark, table_type: str = None):
    """Convert records to a Spark DataFrame by way of pandas.

    Going through pandas avoids sparkContext usage on shared clusters. When
    ``table_type`` is given, the records are first passed through
    ``_preprocess_uc_records``. If there are no records, or normalization
    produces an empty frame, a placeholder DataFrame with the single string
    column ``_empty`` is returned.
    """
    empty_schema = "struct<_empty:string>"

    if not records:
        return spark.createDataFrame([], empty_schema)

    prepared = _preprocess_uc_records(records, table_type) if table_type else records
    pdf = normalize_records(prepared)

    if pdf.empty:
        # Normalization left nothing to infer a schema from.
        return spark.createDataFrame([], empty_schema)
    return spark.createDataFrame(pdf)

In [0]:
def ensure_uc_sink(catalog: str, schema: str, spark):
    """Create the target catalog, then the schema inside it, if they do not already exist."""
    ddl_statements = (
        f"CREATE CATALOG IF NOT EXISTS `{catalog}`",
        f"CREATE SCHEMA  IF NOT EXISTS `{catalog}`.`{schema}`",
    )
    # Order matters: the catalog must exist before its schema.
    for ddl in ddl_statements:
        spark.sql(ddl)

In [0]:
def _print_raw_write_error_detail(label: str) -> None:
    """Print the last ten lines of the traceback of the exception being handled.

    Must be called from inside an ``except`` block.
    """
    import traceback
    print(f"[ERROR-DETAIL] {label}:")
    print("  " + "\n  ".join(traceback.format_exc().split("\n")[-10:]))


def _write_raw_in_batches(
    key: str,
    records: List[Dict[str, Any]],
    tbl: str,
    workspace_url: str,
    start_ts: str,
    spark
) -> None:
    """Write ``records`` to ``tbl`` in slices of ``WRITE_BATCH_SIZE`` rows.

    The first batch uses ``WRITE_RAW_MODE``; later batches append with
    ``mergeSchema`` enabled. If the first batch fails nothing more is written.
    A failure in a later batch is reported and the remaining batches are still
    attempted.
    """
    total_records = len(records)
    print(f"[BATCH-WRITE] raw_{key}: Writing {total_records} rows in batches of {WRITE_BATCH_SIZE}...")

    num_batches = (total_records + WRITE_BATCH_SIZE - 1) // WRITE_BATCH_SIZE
    rows_written = 0
    failed_batches = 0

    for batch_idx in range(num_batches):
        start_idx = batch_idx * WRITE_BATCH_SIZE
        end_idx = min(start_idx + WRITE_BATCH_SIZE, total_records)
        batch_records = records[start_idx:end_idx]
        is_first = batch_idx == 0

        try:
            sdf = (to_spark_df(batch_records, spark, table_type=key)
                   .withColumn("_collected_at", F.lit(start_ts))
                   .withColumn("_workspace", F.lit(workspace_url)))

            # First batch: use configured write mode, subsequent batches: append
            writer = sdf.write.mode(WRITE_RAW_MODE if is_first else "append")

            # Later batches always merge schema; each batch is inferred on its own.
            if ENABLE_MERGE_SCHEMA or not is_first:
                writer = writer.option("mergeSchema", "true")

            if ENABLE_OVERWRITE_SCHEMA and is_first:
                writer = writer.option("overwriteSchema", "true")

            writer.saveAsTable(tbl)
            rows_written += len(batch_records)
            print(f"[BATCH {batch_idx+1}/{num_batches}] raw_{key} → {tbl} ({len(batch_records)} rows)")

        except Exception as e:
            print(f"[WRITE-FAIL] raw_{key} (batch {batch_idx+1}/{num_batches}): {e}")
            if VERBOSE_SCHEMA_ERRORS:
                _print_raw_write_error_detail(f"raw_{key} (batch {batch_idx+1})")
            # If first batch fails, stop; otherwise continue with remaining batches
            if is_first:
                return
            failed_batches += 1

    if failed_batches:
        # Do not claim a complete write when some batches were lost.
        print(f"[WRITE-PARTIAL] raw_{key} → {tbl} "
              f"({rows_written}/{total_records} rows written, "
              f"{failed_batches}/{num_batches} batches failed)")
    else:
        print(f"[WRITE-COMPLETE] raw_{key} → {tbl} ({total_records} rows total)")


def write_single_raw_table(
    key: str,
    records: List[Dict[str, Any]],
    catalog: str,
    schema: str,
    workspace_url: str,
    start_ts: str,
    spark
):
    """Write one raw dataset to ``catalog.schema.raw_<key>`` in Unity Catalog.

    Behaviour is driven by module-level settings that this function reads but
    does not define: ``SKIP_EMPTY_DATASETS``, ``WRITE_RAW_MODE``,
    ``WRITE_BATCH_THRESHOLD``, ``WRITE_BATCH_SIZE``, ``ENABLE_MERGE_SCHEMA``,
    ``ENABLE_OVERWRITE_SCHEMA``, ``FALLBACK_TO_OVERWRITE`` and
    ``VERBOSE_SCHEMA_ERRORS``.

    Flow:
      * Empty ``records``: skip, or create a placeholder table.
      * More than ``WRITE_BATCH_THRESHOLD`` rows: write in batches.
      * Otherwise a single write. A protobuf/"negative size" error falls back
        to batch writing. A recognised schema error is retried with
        ``overwriteSchema`` and, in overwrite mode, by dropping and recreating
        the table.

    Args:
        key: Dataset name; also passed to ``to_spark_df`` as ``table_type``.
        records: Rows to write.
        catalog: Target catalog name.
        schema: Target schema name.
        workspace_url: Stored in the ``_workspace`` column of every row.
        start_ts: Stored in the ``_collected_at`` column of every row.
        spark: Spark session used for DataFrame creation and SQL.

    Returns:
        None. Write failures are reported with ``print`` rather than raised.
    """
    tbl = f"`{catalog}`.`{schema}`.`raw_{key}`"

    # Check for empty datasets
    if not records:
        if SKIP_EMPTY_DATASETS:
            print(f"[SKIP] raw_{key} → {tbl} (empty dataset)")
            return
        # Create a minimal empty DataFrame to avoid schema inference errors
        empty_df = (spark.createDataFrame([], "struct<_empty:string>")
                    .withColumn("_collected_at", F.lit(start_ts))
                    .withColumn("_workspace", F.lit(workspace_url)))
        try:
            empty_df.write.mode(WRITE_RAW_MODE).saveAsTable(tbl)
            print(f"[WRITE] raw_{key} → {tbl} (empty table created)")
        except Exception as e:
            print(f"[WRITE-FAIL] raw_{key}: Failed to create empty table: {e}")
        return

    # Very large datasets are written in batches to avoid protobuf size limits.
    if len(records) > WRITE_BATCH_THRESHOLD:
        _write_raw_in_batches(key, records, tbl, workspace_url, start_ts, spark)
        return

    # Standard single-write path (for smaller datasets)
    try:
        sdf = (to_spark_df(records, spark, table_type=key)
               .withColumn("_collected_at", F.lit(start_ts))
               .withColumn("_workspace", F.lit(workspace_url)))

        writer = sdf.write.mode(WRITE_RAW_MODE)
        if ENABLE_MERGE_SCHEMA:
            writer = writer.option("mergeSchema", "true")
        if ENABLE_OVERWRITE_SCHEMA:
            writer = writer.option("overwriteSchema", "true")

        # First attempt: try with configured options
        try:
            writer.saveAsTable(tbl)
            print(f"[WRITE] raw_{key} → {tbl} ({len(records)} rows)")
            return

        except Exception as e1:
            error_msg = str(e1).lower()

            # Protobuf size error: retry in batches. The batch writer is called
            # directly, so no global setting is mutated and the retry cannot
            # recurse back into this single-write path.
            if "protobuf" in error_msg or "negative size" in error_msg:
                print(f"[PROTOBUF-ERROR] raw_{key}: Message too large, retrying with batch writing...")
                _write_raw_in_batches(key, records, tbl, workspace_url, start_ts, spark)
                return

            # Handle specific schema mismatch errors
            schema_markers = (
                "schema mismatch", "failed to merge", "_legacy_error_temp_delta",
                "cannot_infer_empty_schema", "delta_failed_to_merge_fields"
            )
            if any(phrase in error_msg for phrase in schema_markers):
                if VERBOSE_SCHEMA_ERRORS:
                    print(f"[SCHEMA-ISSUE] raw_{key}: Schema evolution error detected")
                    print(f"  Error: {error_msg[:200]}...")

                # Fallback 1: same write mode, but allow the schema to be replaced.
                if FALLBACK_TO_OVERWRITE and not ENABLE_OVERWRITE_SCHEMA:
                    try:
                        print(f"[RETRY] raw_{key} → Attempting with overwriteSchema=true")
                        fallback_writer = sdf.write.mode(WRITE_RAW_MODE).option("overwriteSchema", "true")
                        fallback_writer.saveAsTable(tbl)
                        print(f"[WRITE] raw_{key} → {tbl} ({len(records)} rows) [schema overwritten]")
                        return
                    except Exception as e2:
                        if VERBOSE_SCHEMA_ERRORS:
                            print(f"[SCHEMA-FAIL] raw_{key}: Overwrite schema also failed: {str(e2)[:200]}...")

                # Fallback 2: in overwrite mode the old table is disposable,
                # so drop it and recreate it from scratch.
                if WRITE_RAW_MODE == "overwrite":
                    try:
                        print(f"[RETRY] raw_{key} → Attempting to drop and recreate table")
                        spark.sql(f"DROP TABLE IF EXISTS {tbl}")
                        sdf.write.mode("overwrite").saveAsTable(tbl)
                        print(f"[WRITE] raw_{key} → {tbl} ({len(records)} rows) [table recreated]")
                        return
                    except Exception as e3:
                        print(f"[WRITE-FAIL] raw_{key}: All retry attempts failed: {str(e3)[:200]}...")
                        return

            # Not a schema issue, or no fallback applied: hand the original
            # error to the outer handler below.
            raise

    except Exception as e:
        print(f"[WRITE-FAIL] raw_{key}: {e}")
        if VERBOSE_SCHEMA_ERRORS:
            _print_raw_write_error_detail(f"raw_{key}")


In [0]:
def write_raw_tables(
    raw_dict: Dict[str, List[Dict[str, Any]]],
    catalog: str,
    schema: str,
    workspace_url: str,
    start_ts: str,
    spark
):
    """Print the step banner, then write each dataset in ``raw_dict`` to its own raw table.

    Datasets are written one at a time, in the dict's iteration order, by
    ``write_single_raw_table``.
    """
    banner("2/4 Write RAW to UC")
    for dataset_key in raw_dict:
        write_single_raw_table(
            dataset_key,
            raw_dict[dataset_key],
            catalog,
            schema,
            workspace_url,
            start_ts,
            spark,
        )

In [0]:
def build_and_write_summary(
    counts: Dict[str, int],
    catalog: str,
    schema: str,
    spark
):
    """Build the categorized count summary and save it as ``workspace_scan_summary``.

    Each key in ``counts`` is mapped to a (Category, Object) pair: first via a
    fixed table of Unity Catalog / DBFS names, then via a title-cased name
    derived from the keys of ``API_ENDPOINTS``, and otherwise as
    ``("Other", key)``.

    The table is written with ``WRITE_SUMMARY_MODE``, adding ``mergeSchema``
    when ``ENABLE_MERGE_SCHEMA`` is set. If the write fails with a message
    containing "schema" and ``FALLBACK_TO_OVERWRITE`` is set, it is retried once
    with ``overwriteSchema``.

    Args:
        counts: Object counts keyed by dataset name.
        catalog: Target catalog name.
        schema: Target schema name.
        spark: Spark session.

    Returns:
        The Spark DataFrame that was written.

    Raises:
        Exception: Whatever the final failed write attempt raised.
    """
    banner("4/4 Write Summary")

    # UC pieces get human-friendly names
    mapping_fixed = {
        "uc_catalogs": ("Metastore", "UC Catalogs"),
        "uc_schemas": ("Metastore", "UC Schemas"),
        "uc_tables": ("Metastore", "UC Tables"),
        "managed_tables": ("Metastore", "UC Tables (Managed)"),
        "external_tables": ("Metastore", "UC Tables (External)"),
        "dbfs_mount_points": ("DBFS/Mounts", "Mount Points"),
    }

    # Auto-map for API endpoints
    mapping_auto = {
        k: ("Auto", k.replace("databricks_", "").replace("_", " ").title())
        for k in API_ENDPOINTS
    }

    rows = []
    for k, v in counts.items():
        cat, obj = mapping_fixed.get(k, mapping_auto.get(k, ("Other", k)))
        rows.append({
            "Category": cat,
            "Object": obj,
            "Count": int(v),
            # NOTE(review): no category assigned above is "Workspace" or
            # "MLflow", so this currently always yields "Y" - confirm intent.
            "To_be_Migrated": "Y" if cat not in ["Workspace", "MLflow"] else "N"
        })

    if rows:
        sdf = spark.createDataFrame(pd.DataFrame(rows))
    else:
        # Spark cannot infer a schema from an empty pandas frame, so state it
        # explicitly for the no-counts case.
        sdf = spark.createDataFrame(
            [], "struct<Category:string,Object:string,Count:bigint,To_be_Migrated:string>"
        )
    tbl = f"`{catalog}`.`{schema}`.`workspace_scan_summary`"

    # Use robust writing for summary table too
    try:
        writer = sdf.write.mode(WRITE_SUMMARY_MODE)
        if ENABLE_MERGE_SCHEMA:
            writer = writer.option("mergeSchema", "true")
        writer.saveAsTable(tbl)
        print(f"[WRITE] summary → {tbl}")
    except Exception as e:
        error_msg = str(e).lower()
        if "schema" in error_msg and FALLBACK_TO_OVERWRITE:
            try:
                print("[RETRY] summary → Attempting with overwriteSchema=true")
                sdf.write.mode(WRITE_SUMMARY_MODE).option("overwriteSchema", "true").saveAsTable(tbl)
                print(f"[WRITE] summary → {tbl} [schema overwritten]")
            except Exception as e2:
                print(f"[WRITE-FAIL] summary: {e2}")
                raise
        else:
            print(f"[WRITE-FAIL] summary: {e}")
            raise

    return sdf

In [0]:
class DataProcessor:
    """Bundles the Spark session and run metadata used when writing assessment results."""

    def __init__(self, spark, workspace_url: str, start_ts: str, catalog: str = None, schema: str = None):
        # Session and run metadata shared by every write.
        self.spark, self.workspace_url, self.start_ts = spark, workspace_url, start_ts
        # Optional default destination; required by write_single_raw_table.
        self.catalog, self.schema = catalog, schema

    def write_single_raw_table(self, key: str, records: List[Dict[str, Any]]):
        """Write one raw dataset right away to the catalog/schema set on this instance.

        Raises:
            ValueError: If the instance has no catalog or no schema configured.
        """
        destination_configured = bool(self.catalog) and bool(self.schema)
        if not destination_configured:
            raise ValueError("Catalog and schema must be set for streaming writes")

        write_single_raw_table(
            key,
            records,
            self.catalog,
            self.schema,
            self.workspace_url,
            self.start_ts,
            self.spark,
        )

    def process_and_write_all(
        self,
        raw_data: Dict[str, List[Dict[str, Any]]],
        uc_counts: Dict[str, int],
        catalog: str,
        schema: str
    ):
        """Write every raw table, then build, write and return the summary DataFrame.

        The destination is taken from the ``catalog`` and ``schema`` arguments,
        not from the instance attributes.
        """
        # The catalog and schema are expected to exist already; the
        # ensure_uc_sink(catalog, schema, self.spark) call is left disabled here.

        write_raw_tables(raw_data, catalog, schema, self.workspace_url, self.start_ts, self.spark)

        # Row counts per raw dataset first; UC counts then add to or override them.
        combined_counts = {}
        for dataset_key, dataset_rows in raw_data.items():
            combined_counts[dataset_key] = len(dataset_rows)
        combined_counts.update(uc_counts)

        return build_and_write_summary(combined_counts, catalog, schema, self.spark)