In [0]:

# MAGIC %md
# MAGIC # Calcular persistencia de incidencias
# MAGIC 
# MAGIC 
# MAGIC **Lógica:**
# MAGIC 1.  `main()` orquesta todo el proceso.
# MAGIC 2.  Busca ejecuciones en `dq_validations_traceability` que necesiten cálculo de persistencia.
# MAGIC 3.  Encuentra los pares de ejecución (N vs N-1) usando `dq_execution_traceability`.
# MAGIC 4.  Carga las evidencias (`dq_evidences`), filtrando por las particiones de fecha (N y N-1).
# MAGIC 5.  Calcula las métricas de fallos nuevos, persistentes y resueltos.
# MAGIC 6.  Actualiza (`MERGE`) la tabla `dq_validations_traceability` con estas métricas.
# MAGIC 7.  Limpia los nulos restantes (ej: primeras ejecuciones) y los establece en 0.

# COMMAND ----------

# DBTITLE 1, 1. Imports y constantes
from pyspark.sql.functions import col, lag, desc, row_number, count, when, lit, coalesce
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

# --- Widgets ---
# Notebook parameters as (name, default value, label) triples, registered in
# declaration order. The "table_id" widget is currently disabled:
# dbutils.widgets.text("table_id", "", "Id de la tabla a validar")
_WIDGET_SPECS = (
    ("table_name", "maestro_demo", "Id de la tabla a validar"),
    ("catalog_name", "workspace", "Catálogo de UC donde residen las tablas"),
    ("schema_name", "dq_framework", "Esquema de UC donde residen las tablas"),
)
for _widget_name, _widget_default, _widget_label in _WIDGET_SPECS:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)
# COMMAND ----------

# DBTITLE 2, 2. Carga de librerías y definición de constantes/mapas

# Catalog and schema are read from the notebook widgets.
CATALOG = dbutils.widgets.get("catalog_name")
SCHEMA = dbutils.widgets.get("schema_name")

# Three-part (catalog.schema.table) names of the tables this notebook uses.
EXECUTION_TABLE = ".".join((CATALOG, SCHEMA, "dq_execution_traceability"))
VALIDATIONS_TABLE = ".".join((CATALOG, SCHEMA, "dq_validations_traceability"))
EVIDENCES_TABLE = ".".join((CATALOG, SCHEMA, "dq_evidences"))


# COMMAND ----------

# DBTITLE 2, 2. Función Principal (main)
def _find_execution_pairs(df_executions_to_process):
    """Pair each execution to process (N) with the previous successful one (N-1).

    Reads EXECUTION_TABLE, keeps only rows with status "SUCCESS" and, per
    ``table_id`` ordered by ``execution_timestamp``, looks up the preceding
    execution. The lag is computed before joining with
    ``df_executions_to_process`` so that N-1 may be any successful execution,
    not only one that is itself pending.

    Returns a DataFrame with columns ``current_execution_id``,
    ``current_execution_date``, ``prev_execution_id``, ``prev_execution_date``
    and ``table_id``; executions without a predecessor are dropped.
    """
    window_prev_exec = Window.partitionBy("table_id").orderBy(col("execution_timestamp"))

    return (
        spark.table(EXECUTION_TABLE)
        .filter(col("status") == "SUCCESS")
        .withColumn("prev_execution_id", lag("execution_id", 1).over(window_prev_exec))
        # Must come from execution_date (not execution_timestamp): the value is
        # later matched against dq_evidences.execution_date, exactly like
        # current_execution_date.
        .withColumn("prev_execution_date", lag("execution_date", 1).over(window_prev_exec))
        .join(df_executions_to_process, "execution_id")
        .select(
            col("execution_id").alias("current_execution_id"),
            col("execution_date").alias("current_execution_date"),
            col("prev_execution_id"),
            col("prev_execution_date"),
            "table_id",
        )
        .filter(col("prev_execution_id").isNotNull())
    )


def _load_evidences(df_execution_pairs):
    """Load the distinct failing keys from EVIDENCES_TABLE for the paired dates.

    The N and N-1 execution dates are collected to the driver and used as a
    literal ``isin`` filter on ``execution_date`` (described in the notebook
    header as the date partition of the evidences table).

    Returns a DataFrame with distinct ``execution_id``, ``validation_id`` and
    ``table_pk`` rows; it is empty when no usable date is available.
    """
    dates_df = (
        df_execution_pairs.select(col("current_execution_date").alias("execution_date"))
        .union(df_execution_pairs.select(col("prev_execution_date").alias("execution_date")))
        .distinct()
    )
    relevant_dates = [
        row.execution_date for row in dates_df.collect() if row.execution_date is not None
    ]

    evidences = spark.table(EVIDENCES_TABLE)
    if relevant_dates:
        print(f"Cargando evidencias para {len(relevant_dates)} particiones de fecha...")
        evidences = evidences.filter(col("execution_date").isin(relevant_dates))
    else:
        print("No hay pares de ejecución para cargar evidencias")
        evidences = evidences.limit(0)

    return evidences.select("execution_id", "validation_id", "table_pk").distinct()


def _compute_persistence_metrics(df_validations_to_process, df_execution_pairs, df_evidences):
    """Compute new / persistent / resolved failure counts per validation.

    For every (N, N-1) pair the failing keys of N-1 are re-labelled with the
    execution id of N, so both sets can be compared on
    (execution_id, validation_id, table_pk):

    * new        = keys failing in N but not in N-1
    * persistent = keys failing in both
    * resolved   = keys failing in N-1 but not in N

    NOTE(review): this assumes ``validation_id`` identifies the same validation
    across executions - confirm against the table definitions.

    Returns one row per (execution_id, validation_id) of the paired executions
    with the three counters, 0 where nothing was found.
    """
    failure_key = ["execution_id", "validation_id", "table_pk"]
    metric_key = ["execution_id", "validation_id"]

    pair_ids = df_execution_pairs.select("current_execution_id", "prev_execution_id")
    current_ids = pair_ids.select(col("current_execution_id").alias("execution_id")).distinct()

    # Failures observed in the executions being processed (N).
    df_current_failures = df_evidences.join(current_ids, "execution_id", "left_semi")

    # Failures observed in N-1, keyed by the execution id of N.
    df_previous_failures = (
        df_evidences.withColumnRenamed("execution_id", "prev_execution_id")
        .join(pair_ids, "prev_execution_id")
        .select(col("current_execution_id").alias("execution_id"), "validation_id", "table_pk")
    )

    df_new = (
        df_current_failures.join(df_previous_failures, failure_key, "left_anti")
        .groupBy(*metric_key)
        .agg(count("*").alias("new_failures"))
    )
    df_persistent = (
        df_current_failures.join(df_previous_failures, failure_key, "inner")
        .groupBy(*metric_key)
        .agg(count("*").alias("persistent_failures"))
    )
    df_resolved = (
        df_previous_failures.join(df_current_failures, failure_key, "left_anti")
        .groupBy(*metric_key)
        .agg(count("*").alias("resolved_failures"))
    )

    # One source row per (execution_id, validation_id): duplicated source keys
    # would make the MERGE ambiguous. Validations of unpaired executions are
    # left out here and handled by the final zero-fill.
    df_base = (
        df_validations_to_process.select(*metric_key)
        .distinct()
        .join(current_ids, "execution_id", "left_semi")
    )

    return (
        df_base.join(df_new, metric_key, "left")
        .join(df_persistent, metric_key, "left")
        .join(df_resolved, metric_key, "left")
        .select(
            "execution_id",
            "validation_id",
            coalesce(col("new_failures"), lit(0)).alias("new_failures"),
            coalesce(col("persistent_failures"), lit(0)).alias("persistent_failures"),
            coalesce(col("resolved_failures"), lit(0)).alias("resolved_failures"),
        )
    )


def _merge_metrics(df_metrics):
    """MERGE the computed counters into VALIDATIONS_TABLE.

    Rows are matched on (execution_id, validation_id); only the three metric
    columns are updated and no rows are inserted.
    """
    (
        DeltaTable.forName(spark, VALIDATIONS_TABLE)
        .alias("target")
        .merge(
            df_metrics.alias("source"),
            (col("target.execution_id") == col("source.execution_id"))
            & (col("target.validation_id") == col("source.validation_id")),
        )
        .whenMatchedUpdate(
            set={
                "persistent_failures": "source.persistent_failures",
                "new_failures": "source.new_failures",
                "resolved_failures": "source.resolved_failures",
            }
        )
        .execute()
    )


def _zero_fill_remaining_nulls():
    """Set still-null metric columns to 0 for PASSED/FAILED validations.

    Each column is updated with ``coalesce(column, 0)`` so that a value that
    was already computed is never overwritten. The values are Columns because
    ``DeltaTable.update`` accepts only SQL strings or Columns in ``set``.
    """
    has_null_metric = (
        col("persistent_failures").isNull()
        | col("new_failures").isNull()
        | col("resolved_failures").isNull()
    )

    DeltaTable.forName(spark, VALIDATIONS_TABLE).update(
        condition=col("status").isin("PASSED", "FAILED") & has_null_metric,
        set={
            "persistent_failures": coalesce(col("persistent_failures"), lit(0)),
            "new_failures": coalesce(col("new_failures"), lit(0)),
            "resolved_failures": coalesce(col("resolved_failures"), lit(0)),
        },
    )


def main():
    """Orchestrate the persistence calculation for pending validations.

    Steps:
      1. Select PASSED/FAILED rows of VALIDATIONS_TABLE whose
         ``persistent_failures`` is still null.
      2. Pair each of their executions (N) with the previous successful
         execution (N-1) of the same table.
      3. Load the evidences of both executions and count new, persistent and
         resolved failures per validation.
      4. MERGE the counters into VALIDATIONS_TABLE.
      5. Zero-fill the metrics that are still null (e.g. executions without a
         previous one to compare against).

    Returns:
        A status message describing the outcome.

    Raises:
        Exception: any error raised along the way is logged and re-raised.
    """
    try:
        # --- 1. Find the validations / executions that still need metrics ---
        df_validations_to_process = (
            spark.table(VALIDATIONS_TABLE)
            .filter(col("status").isin("PASSED", "FAILED") & col("persistent_failures").isNull())
            .select("execution_id", "validation_id", "rule_id")
            .distinct()
        )
        df_executions_to_process = df_validations_to_process.select("execution_id").distinct()

        if df_executions_to_process.isEmpty():
            print("No se encontraron ejecuciones nuevas para procesar.")
            return "Éxito: No hay nuevas ejecuciones para procesar."

        print(f"Se encontraron {df_executions_to_process.count()} ejecuciones para procesar.")

        # --- 2. Find the execution pairs (N vs N-1) ---
        df_execution_pairs = _find_execution_pairs(df_executions_to_process)

        if df_execution_pairs.isEmpty():
            # Nothing to compare against: the zero-fill below covers these rows.
            print("Ejecuciones nuevas encontradas, pero no tienen una ejecución anterior válida para comparar.")
        else:
            print("Pares de ejecución N y N-1 encontrados:")

            # --- 3. Load the relevant evidences ---
            df_evidences = _load_evidences(df_execution_pairs)

            # --- 4. Compute the persistence metrics ---
            print("Calculando métricas de persistencia (New, Persistent, Resolved)...")
            df_metrics = _compute_persistence_metrics(
                df_validations_to_process, df_execution_pairs, df_evidences
            )
            print("Cálculo de métricas finalizado.")

            # --- 5. Update the traceability table (single MERGE) ---
            print(f"Actualizando (MERGE) registros en {VALIDATIONS_TABLE}...")
            _merge_metrics(df_metrics)
            print("Actualización de métricas de persistencia completada")

        # --- 6. Zero-fill metrics of validations without a pair ---
        print(f"Limpiando nulos restantes en {VALIDATIONS_TABLE}...")
        _zero_fill_remaining_nulls()
        print("Limpieza de nulos completada.")

        return "Cálculo de persistencia finalizado."

    except Exception as e:
        print(f"Error fatal durante el cálculo de persistencia: {e}")
        # Bare raise keeps the original traceback intact.
        raise

# COMMAND ----------

# DBTITLE 3, 3. Punto de entrada de ejecución
if __name__ == "__main__":
    # Run the persistence calculation and hand its status message back as the
    # notebook exit value.
    try:
        dbutils.notebook.exit(main())
    except Exception as err:
        # A failure is also reported through the exit value instead of being
        # re-raised.
        dbutils.notebook.exit(f"Fallo en el c��lculo de persistencia: {err}")