In [0]:
%pip install pandas openpyxl

In [0]:
# Librerías necesarias
import pandas as pd
import json
import uuid
from datetime import datetime
from pyspark.sql.functions import col, lit, current_timestamp, udf, split
from pyspark.sql.types import StringType, BooleanType, ArrayType
from delta.tables import DeltaTable

# 1. Widgets and constants
# Each entry is (widget name, default value, widget label).
_WIDGET_DEFINITIONS = [
    ("catalog_name", "workspace", "Catálogo de UC donde residen las tablas"),
    ("schema_name", "framework_dq", "Esquema de UC donde residen las tablas"),
    ("tables_sheet", "Tablas", "Hoja Excel: Tablas"),
    ("rules_sheet", "Reglas", "Hoja Excel: Reglas"),
    ("validations_sheet", "Validaciones", "Hoja Excel: Validaciones"),
    ("excel_file_path", "/Volumes/workspace/framework_dq/configexcel/configValidaciones.xlsx", "Ruta del Excel"),
]
for _widget_name, _widget_default, _widget_label in _WIDGET_DEFINITIONS:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

# Current widget values, read once into module-level constants.
CATALOG = dbutils.widgets.get("catalog_name")
SCHEMA = dbutils.widgets.get("schema_name")
CONFIG_SHEET = dbutils.widgets.get("tables_sheet")
LIBRARY_SHEET = dbutils.widgets.get("rules_sheet")
VALIDATIONS_SHEET = dbutils.widgets.get("validations_sheet")
EXCEL_PATH = dbutils.widgets.get("excel_file_path")

# Three-part names (catalog.schema.table) of the tables targeted by the sync.
_UC_NAMESPACE = f"{CATALOG}.{SCHEMA}"
TABLE_CONFIG = f"{_UC_NAMESPACE}.dq_tables_config"
RULE_LIB_TABLE = f"{_UC_NAMESPACE}.dq_rules_library"
CATALOG_TABLE = f"{_UC_NAMESPACE}.dq_validations_catalog"

# 2. Read one Excel sheet and convert it to a Spark DataFrame
def load_excel_sheet(excel_path, sheet_name):
    """Load a sheet of the Excel workbook as a Spark DataFrame of strings.

    Every cell is read as text (``dtype=str``). Rows in which all cells are
    blank are discarded and the remaining missing cells become ``''``.

    Args:
        excel_path: Path of the Excel workbook.
        sheet_name: Name of the sheet to read.

    Returns:
        A ``(dataframe, errors_list)`` tuple. ``dataframe`` is ``None`` when
        the sheet has no data rows or when a non-critical error occurred; in
        the latter case ``errors_list`` holds one dict with the keys
        ``sheet``, ``fila`` and ``error``.

    Raises:
        FileNotFoundError: Re-raised when the workbook cannot be found.
        ValueError: Re-raised when raised while reading or converting
            the sheet.
    """
    errors_list = []
    try:
        df_pd = pd.read_excel(excel_path, sheet_name=sheet_name, dtype=str)
        # Blank rows must be dropped BEFORE the NaN cells are filled: once the
        # NaN values have been replaced by '' no row counts as "all missing"
        # any more and dropna(how='all') would never remove anything.
        df_pd = df_pd.dropna(how='all').fillna('')
        if df_pd.empty:
            print(f"Hoja '{sheet_name}' vacía. No se sincroniza.")
            return None, errors_list
        return spark.createDataFrame(df_pd), errors_list

    except FileNotFoundError:
        print(f"Error Crítico: No se encontró el fichero Excel en {excel_path}")
        raise
    except ValueError as ve:
        print(f"Error crítico de Validación: {ve}")
        raise
    except Exception as e:
        # Any other failure is reported to the caller instead of aborting.
        errors_list.append({"sheet": sheet_name, "fila": None, "error": str(e)})
        return None, errors_list

# 3. Build the validation JSON from the rule name and the parameters received
def build_json_definition(technical_name, rule_type, param_columnas_str, param_valor, param_min, param_max, param_conjunto_str, param_sql, param_tipo, param_query, param_merge):
    """Build the JSON string with the arguments of one validation rule.

    The keys written depend on ``rule_type`` and ``technical_name``; a
    parameter that is empty (``''`` or ``None``) simply produces no key.

    Args:
        technical_name: Technical name of the rule (e.g. ``"is_unique"``).
        rule_type: ``'BUILT-IN'`` or ``'CUSTOM'``; any other value yields ``{}``.
        param_columnas_str: Comma-separated column names (or the expected
            schema for ``has_valid_schema``).
        param_valor: Generic value: regex, comparison value, date/timestamp
            format, aggregate limit or reference table, depending on the rule.
        param_min: Lower limit for range / ``is_not_less_than`` rules.
        param_max: Upper limit for range / ``is_not_greater_than`` rules.
        param_conjunto_str: Comma-separated allowed values for list rules.
        param_sql: SQL expression, aggregate type or row filter, depending on
            the rule.
        param_tipo: Not used; kept so the signature stays unchanged.
        param_query: SQL query for ``sql_query``.
        param_merge: Comma-separated merge / reference columns.

    Returns:
        The parameters serialized with ``json.dumps``. If anything raises
        while building them, ``{"error": "<message>"}`` is returned instead,
        so a bad row never raises out of this function.
    """

    # The helpers are nested so that this function does not depend on any
    # other module-level definition.
    def _split_csv(raw):
        # Comma-separated text -> list of non-empty, stripped items.
        # None and '' both mean "no values".
        if not raw:
            return []
        return [item.strip() for item in raw.split(',') if item and item.strip()]

    def _single_column(raw):
        # The whole text, stripped, is taken as one column name.
        return raw.strip() if raw else None

    def _number_or_raw(raw):
        # Numeric limits are stored as floats; anything else is kept verbatim.
        try:
            return float(raw)
        except (ValueError, TypeError):
            return raw

    params_dict = {}
    try:
        # --- BUILT-IN rules ---
        if rule_type == 'BUILT-IN':

            # Rules that only use the first column of 'Param_Columnas'
            if technical_name in ("is_not_null", "is_not_empty", "is_not_null_and_not_empty", "is_not_in_future"):
                columns = _split_csv(param_columnas_str)
                if columns:
                    params_dict["column"] = columns[0]

            elif technical_name == "is_unique":
                columns = _split_csv(param_columnas_str)
                if columns:
                    params_dict["columns"] = columns

            elif technical_name in ("is_in_list", "is_not_null_and_is_in_list"):
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                allowed_values = _split_csv(param_conjunto_str)
                if allowed_values:
                    params_dict["allowed"] = allowed_values

            elif technical_name == "regex_match":
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                if param_valor:
                    # Resolve backslash escapes. Encoding as latin-1 with
                    # 'backslashreplace' keeps non-ASCII characters intact;
                    # a plain UTF-8 encode would have its bytes re-read as
                    # latin-1 by 'unicode_escape' (e.g. 'ñ' -> 'Ã±').
                    # NOTE(review): 'unicode_escape' also turns sequences such
                    # as \b into control characters, so patterns presumably
                    # need doubled backslashes in the sheet - confirm.
                    params_dict["regex"] = param_valor.encode("latin-1", "backslashreplace").decode("unicode_escape")

            elif technical_name in ("is_equal_to", "is_not_equal_to"):
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                if param_valor:
                    params_dict["value"] = param_valor

            elif technical_name == "is_valid_date":
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                if param_valor:
                    params_dict["date_format"] = param_valor

            elif technical_name == "is_valid_timestamp":
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                if param_valor:
                    params_dict["timestamp_format"] = param_valor

            elif technical_name in ("is_in_range", "is_not_in_range"):
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                if param_min:
                    params_dict["min_limit"] = _number_or_raw(param_min)
                if param_max:
                    params_dict["max_limit"] = _number_or_raw(param_max)

            elif technical_name == "is_not_less_than":
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                if param_min:
                    params_dict["limit"] = _number_or_raw(param_min)

            elif technical_name == "is_not_greater_than":
                col_name = _single_column(param_columnas_str)
                if col_name:
                    params_dict["column"] = col_name
                if param_max:
                    params_dict["limit"] = _number_or_raw(param_max)

            elif technical_name == "sql_expression":
                if param_sql:
                    params_dict["expression"] = param_sql

            elif technical_name == "sql_query":
                if param_query:
                    params_dict["query"] = param_query
                merge_cols = _split_csv(param_merge)
                if merge_cols:
                    params_dict["merge_columns"] = merge_cols

            # Aggregate rules: the aggregate type travels in 'param_sql'
            elif technical_name in ("is_aggr_not_greater_than", "is_aggr_not_less_than", "is_aggr_equal", "is_aggr_not_equal"):
                columns = _split_csv(param_columnas_str)
                if columns:
                    params_dict["column"] = columns[0]
                if param_sql:
                    params_dict["aggr_type"] = param_sql
                if param_valor:
                    params_dict["limit"] = _number_or_raw(param_valor)

            # 'foreign_key' and 'compare_datasets' take the same parameters
            elif technical_name in ("foreign_key", "compare_datasets"):
                src_cols = _split_csv(param_columnas_str)
                if src_cols:
                    params_dict["columns"] = src_cols
                ref_cols = _split_csv(param_merge)
                if ref_cols:
                    params_dict["ref_columns"] = ref_cols
                if param_valor:
                    params_dict["ref_table"] = param_valor
                if param_sql:
                    params_dict["row_filter"] = param_sql

            elif technical_name == "has_valid_schema":
                if param_columnas_str:
                    params_dict["expected_schema"] = param_columnas_str

        # --- CUSTOM rules ---
        elif rule_type in ['CUSTOM']:
            columns = _split_csv(param_columnas_str)
            if columns:
                params_dict["columns"] = columns

        # Serialize the final parameter dictionary as a JSON string
        return json.dumps(params_dict)

    except Exception as e:
        # Report the problem inside the JSON instead of raising.
        return json.dumps({"error": str(e)})

# Wrap the Python function as a Spark UDF (string result) so that it can be applied to DataFrame columns
build_json_udf = udf(f=build_json_definition, returnType=StringType())

# 4. Synchronize one Delta table with the rows coming from an Excel sheet
def sync_delta(df_spark, target_table_name, merge_keys, column_mapping, sheet_name):
    """Merge ``df_spark`` into the Delta table ``target_table_name``.

    Rows are matched on equality of every column in ``merge_keys``. Matched
    rows are updated and unmatched rows inserted, in both cases assigning
    target column ``k`` from source column ``v`` for each ``k: v`` pair of
    ``column_mapping``.

    Returns:
        ``(row_count, errors_list)``. ``row_count`` is ``df_spark.count()``
        after a successful merge and ``0`` otherwise. Any exception is caught
        and returned as a single dict (keys ``sheet``, ``fila``, ``error``)
        inside ``errors_list``. A ``None`` DataFrame returns ``(0, [])``.
    """
    if df_spark is None:
        return 0, []

    errors_list = []

    def _source_assignments():
        # target column -> expression reading the mapped source column
        return {
            target_col: col(f"source.{source_col}")
            for target_col, source_col in column_mapping.items()
        }

    try:
        target = DeltaTable.forName(spark, target_table_name).alias("target")
        source = df_spark.alias("source")
        join_condition = " AND ".join(f"target.{key} = source.{key}" for key in merge_keys)

        merge_builder = target.merge(source, join_condition)
        merge_builder = merge_builder.whenMatchedUpdate(set=_source_assignments())
        merge_builder = merge_builder.whenNotMatchedInsert(values=_source_assignments())
        merge_builder.execute()

        return df_spark.count(), errors_list
    except Exception as exc:
        errors_list.append({"sheet": sheet_name, "fila": None, "error": str(exc)})
        return 0, errors_list

# 5. Synchronization of the Tables sheet
def sync_tables(df_excel):
    """Project the Tables sheet onto the target schema and merge it.

    Every required Excel column is renamed to its target column, rows with
    an empty ``Id_tabla`` are discarded and the result is merged into
    ``TABLE_CONFIG`` on ``table_id`` through ``sync_delta``.

    Raises:
        ValueError: If any required column is missing from ``df_excel``.
    """
    # Excel column -> target column (insertion order is the output order)
    rename_map = {
        "Id_tabla": "table_id",
        "Nombre_tabla": "table_name",
        "Nombre_técnico": "table_name_tech",
        "Dominio": "domain",
        "Clave_primaria": "primary_key",
        "Tabla_staging_evidencias": "staging_evidences_table",
    }

    absent = [name for name in rename_map if name not in df_excel.columns]
    if absent:
        raise ValueError(f"Faltan columnas obligatorias en el Excel: {absent}")

    projection = [col(source).alias(target) for source, target in rename_map.items()]
    df_valid = df_excel.select(*projection).filter(col("Id_tabla") != "")

    # Source and target column names are identical after the projection.
    identity_mapping = dict(zip(df_valid.columns, df_valid.columns))
    return sync_delta(df_valid, TABLE_CONFIG, ["table_id"], identity_mapping, "Tablas")

# 6. Synchronization of the Rules sheet
def sync_rules(df_excel):
    """Project the Rules sheet onto the target schema and merge it.

    Columns are renamed, ``Etiquetas`` is split on commas into the ``tags``
    array, ``updated_at`` is set to the current timestamp, rows with an empty
    ``Id_regla`` are discarded and the result is merged into
    ``RULE_LIB_TABLE`` on ``rule_id`` through ``sync_delta``.

    Raises:
        ValueError: If any required column is missing from ``df_excel``.
    """
    required_columns = (
        "Id_regla", "Nombre_técnico", "Nombre_funcional", "Descripción",
        "Nivel_regla", "Dimensión_reglas", "Tipo_implementación",
        "Requiere_conversión_literal", "Etiquetas", "Clase", "Propietario",
        "Actualizado_por",
    )
    absent = [name for name in required_columns if name not in df_excel.columns]
    if absent:
        raise ValueError(f"Faltan columnas obligatorias en el Excel: {absent}")

    def renamed(source, target):
        # Plain rename of an Excel column to its target name.
        return col(source).alias(target)

    projection = [
        renamed("Id_regla", "rule_id"),
        renamed("Nombre_técnico", "technical_rule_name"),
        renamed("Nombre_funcional", "functional_name"),
        renamed("Descripción", "description"),
        renamed("Nivel_regla", "rule_level"),
        renamed("Dimensión_reglas", "dimension_dq"),
        renamed("Tipo_implementación", "implementation_type"),
        renamed("Requiere_conversión_literal", "requires_literal_conversion"),
        split(col("Etiquetas"), ",\\s*").alias("tags"),
        renamed("Clase", "type_rule"),
        renamed("Propietario", "owner"),
        current_timestamp().alias("updated_at"),
        renamed("Actualizado_por", "updated_by"),
    ]
    df_valid = df_excel.select(*projection).filter(col("Id_regla") != "")

    # Source and target column names are identical after the projection.
    identity_mapping = dict(zip(df_valid.columns, df_valid.columns))
    return sync_delta(df_valid, RULE_LIB_TABLE, ["rule_id"], identity_mapping, "Reglas")

# 7. Synchronization of the Validations sheet
def sync_validations(df_excel):
    """Project the Validations sheet onto the target schema and merge it.

    Besides plain renames, the projection casts ``Validación_activa`` to
    boolean, builds ``validation_definition`` with ``build_json_udf``, splits
    ``Etiquetas`` and ``Param_columnas`` on commas into arrays and sets
    ``updated_at`` to the current timestamp. Rows with an empty
    ``Id_validación`` are discarded and the result is merged into
    ``CATALOG_TABLE`` on ``validation_id`` through ``sync_delta``.

    Raises:
        ValueError: If any required column is missing from ``df_excel``.
    """
    required_columns = (
        "Id_validación", "Id_regla", "Id_tabla", "Definición_perimetro",
        "Validación_activa", "Severidad", "Nombre_técnico", "Tipo_regla",
        "Param_columnas", "Param_valor", "Param_valor_min", "Param_valor_max",
        "Param_conjunto", "Param_expresion", "Param_tipo_dato",
        "Param_query_SQL", "Param_merge_columnas", "Owner", "Etiquetas",
        "Actualizado_por", "Dominio_tabla",
    )
    absent = [name for name in required_columns if name not in df_excel.columns]
    if absent:
        raise ValueError(f"Faltan columnas obligatorias en el Excel: {absent}")

    def renamed(source, target):
        # Plain rename of an Excel column to its target name.
        return col(source).alias(target)

    # Columns handed to the UDF, in the positional order of its parameters.
    udf_inputs = (
        "Nombre_técnico", "Tipo_regla",
        "Param_columnas", "Param_valor",
        "Param_valor_min", "Param_valor_max",
        "Param_conjunto", "Param_expresion",
        "Param_tipo_dato", "Param_query_SQL",
        "Param_merge_columnas",
    )
    definition_json = build_json_udf(*[col(name) for name in udf_inputs]).alias("validation_definition")

    projection = [
        renamed("Id_validación", "validation_id"),
        renamed("Id_regla", "rule_id"),
        renamed("Id_tabla", "table_id"),
        renamed("Definición_perimetro", "perimeter_definition"),
        col("Validación_activa").cast(BooleanType()).alias("is_active"),
        renamed("Severidad", "severity"),
        definition_json,
        renamed("Owner", "owner"),
        split(col("Etiquetas"), ",\\s*").alias("tags"),
        current_timestamp().alias("updated_at"),
        renamed("Actualizado_por", "updated_by"),
        renamed("Dominio_tabla", "domain"),
        # The raw parameters are stored next to the JSON definition.
        split(col("Param_columnas"), ",\\s*").alias("Param_columns"),
        renamed("Param_valor", "Param_value"),
        renamed("Param_valor_min", "Param_value_min"),
        renamed("Param_valor_max", "Param_value_max"),
        renamed("Param_conjunto", "Param_range"),
        renamed("Param_tipo_dato", "Param_data_type"),
        renamed("Param_expresion", "Param_expression"),
        renamed("Param_query_SQL", "Param_query_SQL"),
        renamed("Param_merge_columnas", "Param_merge_columns"),
    ]
    df_valid = df_excel.select(*projection).filter(col("Id_validación") != "")

    # Source and target column names are identical after the projection.
    identity_mapping = dict(zip(df_valid.columns, df_valid.columns))
    return sync_delta(df_valid, CATALOG_TABLE, ["validation_id"], identity_mapping, "Validaciones")

# 8. Orchestrator
def main():
    """Load and synchronize the three Excel sheets, in order.

    For each of the Tables, Rules and Validations sheets the sheet is loaded
    with ``load_excel_sheet`` and, when it has data, merged with the matching
    ``sync_*`` function. Non-critical errors reported by BOTH the load and
    the sync step are collected, printed and counted in the summary.

    Returns:
        A summary string with the number of rows synchronized per sheet and
        the number of errors collected.

    Raises:
        RuntimeError: Wraps any exception raised during the process (the
            original exception is kept as ``__cause__``).
    """
    total_errors = []
    try:
        # (sheet to read, function that syncs it), in execution order
        steps = (
            (CONFIG_SHEET, sync_tables),
            (LIBRARY_SHEET, sync_rules),
            (VALIDATIONS_SHEET, sync_validations),
        )

        synced_counts = []
        for sheet_name, sync_fn in steps:
            df_sheet, load_errors = load_excel_sheet(EXCEL_PATH, sheet_name)
            total_errors.extend(load_errors)

            synced = 0
            if df_sheet is not None:
                synced, sync_errors = sync_fn(df_sheet)
                # Merge failures are caught inside sync_delta and only come
                # back through this list; dropping it would hide them.
                total_errors.extend(sync_errors)
            synced_counts.append(synced)

        synced_tablas, synced_rules, synced_valids = synced_counts

        if total_errors:
            print(f"Se detectaron {len(total_errors)} errores durante la sincronización:")
            for e in total_errors:
                print(e)

        return f"Éxito: Tablas {synced_tablas}, Reglas {synced_rules}, Validaciones {synced_valids}, Errores {len(total_errors)}"

    except Exception as e:
        # Propagate critical errors
        raise RuntimeError(f"Fallo crítico durante la ejecución de main(): {e}") from e

# 9. Entry point
if __name__ == "__main__":
    main()