In [0]:

# MAGIC %md
# MAGIC # Calcular persistencia de incidencias
# MAGIC 
# MAGIC 
# MAGIC **Lógica:**
# MAGIC 1.  `main()` orquesta todo el proceso.
# MAGIC 2.  Busca ejecuciones en `dq_validations_traceability` que necesiten cálculo de persistencia.
# MAGIC 3.  Encuentra los pares de ejecución (N vs N-1) usando `dq_execution_traceability`.
# MAGIC 4.  Carga las evidencias (`dq_evidences`), filtrando por las particiones de fecha (N y N-1).
# MAGIC 5.  Calcula las métricas de fallos nuevos, persistentes y resueltos.
# MAGIC 6.  Actualiza (`MERGE`) la tabla `dq_validations_traceability` con estas métricas.
# MAGIC 7.  Limpia los nulos restantes (ej: primeras ejecuciones) y los establece en 0.

# COMMAND ----------

# DBTITLE 1, 1. Imports y constantes
from pyspark.sql.functions import col, lag, desc, row_number, count, when, lit, coalesce
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

# --- Notebook widgets (input parameters) ---
# The "table_id" widget is currently disabled:
#dbutils.widgets.text("table_id", "", "Id de la tabla a validar")
# NOTE(review): the "table_name" label below still reads "Id de la tabla a
# validar" -- confirm whether it should describe a table name instead.
dbutils.widgets.text(
    name="table_name",
    defaultValue="maestro_demo",
    label="Id de la tabla a validar",
)
# Unity Catalog location of the data-quality framework tables.
dbutils.widgets.text(
    name="catalog_name",
    defaultValue="workspace",
    label="Catálogo de UC donde residen las tablas",
)
dbutils.widgets.text(
    name="schema_name",
    defaultValue="dq_framework",
    label="Esquema de UC donde residen las tablas",
)
# COMMAND ----------

# DBTITLE 2, 2. Carga de librerías y definición de constantes/mapas

# Catalog and schema come from the notebook widgets.
CATALOG = dbutils.widgets.get("catalog_name")
SCHEMA = dbutils.widgets.get("schema_name")

# Fully qualified names (<catalog>.<schema>.<table>) of the framework tables.
_DQ_NAMESPACE = f"{CATALOG}.{SCHEMA}"
EXECUTION_TABLE = f"{_DQ_NAMESPACE}.dq_execution_traceability"
VALIDATIONS_TABLE = f"{_DQ_NAMESPACE}.dq_validations_traceability"
EVIDENCES_TABLE = f"{_DQ_NAMESPACE}.dq_evidences"

def _compute_persistence_metrics(df_pairs, df_evidence_keys):
    """
    Compare the failing keys of each execution N against its predecessor N-1.

    Args:
        df_pairs: DataFrame with columns ``current_execution_id`` and
            ``prev_execution_id`` (one row per execution pair).
        df_evidence_keys: distinct ``(execution_id, validation_id, table_pk)``
            rows taken from the evidences table.

    Returns:
        DataFrame with one row per ``(execution_id, validation_id)`` of the
        *current* executions and the integer columns ``new_failures``,
        ``persistent_failures`` and ``resolved_failures``.
    """
    # Spark's aggregate `sum` is not in the file-level import list; the Python
    # builtin `sum` cannot aggregate a column, so import the module locally.
    from pyspark.sql import functions as F

    pair_keys = ["current_execution_id", "validation_id", "table_pk"]

    # Keys failing in execution N, tagged with the pair they belong to.
    current_keys = (
        df_evidence_keys
        .join(df_pairs, col("execution_id") == col("current_execution_id"), "inner")
        .select(*pair_keys, lit(True).alias("in_current"))
    )
    # Keys failing in execution N-1, tagged with the same pair identifier.
    previous_keys = (
        df_evidence_keys
        .join(df_pairs, col("execution_id") == col("prev_execution_id"), "inner")
        .select(*pair_keys, lit(True).alias("in_previous"))
    )

    # A full outer join keeps keys present on only one side, which is what
    # distinguishes new (N only) and resolved (N-1 only) failures.
    # NOTE(review): assumes validation_id identifies the same rule in both
    # executions of a pair (the previous window logic relied on it too) and
    # that table_pk is not null -- confirm against the evidences schema.
    compared = current_keys.join(previous_keys, pair_keys, "full_outer")

    in_current = col("in_current").isNotNull()
    in_previous = col("in_previous").isNotNull()

    return (
        compared
        .withColumnRenamed("current_execution_id", "execution_id")
        .groupBy("execution_id", "validation_id")
        .agg(
            F.sum(when(in_current & ~in_previous, 1).otherwise(0)).alias("new_failures"),
            F.sum(when(in_current & in_previous, 1).otherwise(0)).alias("persistent_failures"),
            F.sum(when(~in_current & in_previous, 1).otherwise(0)).alias("resolved_failures"),
        )
    )


def main():
    """
    Compute failure-persistence metrics and store them in the validations table.

    Steps:
        1. Find executions in ``VALIDATIONS_TABLE`` with status PASSED/FAILED
           whose ``persistent_failures`` is still null.
        2. Pair each of them with the previous SUCCESS execution of the same
           ``table_id`` (ordered by ``execution_timestamp``) from
           ``EXECUTION_TABLE``.
        3. Compare the evidence keys of both executions and count new,
           persistent and resolved failures per validation.
        4. MERGE those three counters into ``VALIDATIONS_TABLE``.
        5. Replace any counter that is still null on PASSED/FAILED rows with 0
           (e.g. executions without a previous execution to compare with).

    Returns:
        str: a status message describing the outcome.

    Raises:
        Exception: any error raised by Spark/Delta is logged and re-raised.
    """
    try:
        # --- 1. Pending validations ---
        df_executions_to_process = (
            spark.table(VALIDATIONS_TABLE)
            .filter(col("status").isin("PASSED", "FAILED") & col("persistent_failures").isNull())
            .select("execution_id")
            .distinct()
        )

        if df_executions_to_process.isEmpty():
            print("No se encontraron ejecuciones nuevas para procesar.")
            return "Éxito: No hay nuevas ejecuciones para procesar."

        print(f"Ejecuciones a procesar: {df_executions_to_process.count()}")

        # --- 2. Execution pairs (N vs N-1) ---
        # The lag is evaluated over every SUCCESS execution *before* narrowing
        # to the pending ones, so N-1 may be an already processed execution.
        window_prev_exec = Window.partitionBy("table_id").orderBy("execution_timestamp")
        df_execution_pairs = (
            spark.table(EXECUTION_TABLE)
            .filter(col("status") == "SUCCESS")
            .withColumn("prev_execution_id", lag("execution_id").over(window_prev_exec))
            .join(df_executions_to_process, "execution_id")
            .filter(col("prev_execution_id").isNotNull())
            .select(
                col("execution_id").alias("current_execution_id"),
                "prev_execution_id",
            )
        )

        delta_validations_table = DeltaTable.forName(spark, VALIDATIONS_TABLE)

        if df_execution_pairs.isEmpty():
            print("Ejecuciones nuevas encontradas, pero no tienen una ejecución anterior válida para comparar")
        else:
            # --- 3. Evidence keys and metrics ---
            df_evidence_keys = (
                spark.table(EVIDENCES_TABLE)
                .select("execution_id", "validation_id", "table_pk")
                .distinct()
            )
            df_metrics = _compute_persistence_metrics(df_execution_pairs, df_evidence_keys)

            # --- 4. MERGE the counters ---
            # Only the three metric columns are written: the source does not
            # carry the remaining target columns, so an "update all" clause
            # is not applicable here.
            delta_validations_table.alias("target").merge(
                df_metrics.alias("source"),
                (col("target.execution_id") == col("source.execution_id"))
                & (col("target.validation_id") == col("source.validation_id")),
            ).whenMatchedUpdate(
                set={
                    "new_failures": col("source.new_failures"),
                    "persistent_failures": col("source.persistent_failures"),
                    "resolved_failures": col("source.resolved_failures"),
                }
            ).execute()
            print("Métricas de persistencia actualizadas")

        # --- 5. Fill the remaining nulls ---
        # coalesce keeps any counter that already has a value; only the null
        # ones become 0. Values are Columns because DeltaTable.update expects
        # SQL-expression strings or Columns in `set`.
        delta_validations_table.update(
            condition=(
                col("status").isin("PASSED", "FAILED")
                & (
                    col("persistent_failures").isNull()
                    | col("new_failures").isNull()
                    | col("resolved_failures").isNull()
                )
            ),
            set={
                "persistent_failures": coalesce(col("persistent_failures"), lit(0)),
                "new_failures": coalesce(col("new_failures"), lit(0)),
                "resolved_failures": coalesce(col("resolved_failures"), lit(0)),
            },
        )
        print("Nulos restantes rellenados.")

        return "Cálculo de persistencia finalizado."

    except Exception as e:
        print(f"Error fatal durante el cálculo de persistencia: {e}")
        # Bare raise keeps the original traceback intact.
        raise
