# In [0]:

'''
Compute failure persistence

Logic
1. 'main()' orchestrates the process.
2. Fetches the SUCCESS or FAILED executions used for the calculation.
3. Fetches the evidences of the latest execution.
4. Fetches the evidences of the previous execution.
5. Classifies the failures as new, persistent or resolved.
6. Updates the validations and evidences tables.
'''

# 1. Imports
from pyspark.sql.functions import col, lag, desc, row_number, count, when, lit, coalesce
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import sys

# 2. Widgets
# Notebook parameters as (name, default value, label) triples.
_WIDGET_SPECS = (
    ("table_name", "maestro_demo", "Id de la tabla a validar"),
    ("catalog_name", "workspace", "Catálogo de UC donde residen las tablas"),
    ("schema_name", "framework_dq", "Esquema de UC donde residen las tablas"),
)
for _widget_name, _widget_default, _widget_label in _WIDGET_SPECS:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

CATALOG = dbutils.widgets.get("catalog_name")
SCHEMA = dbutils.widgets.get("schema_name")

# Fully qualified (catalog.schema.table) names of the tables this notebook uses.
_NAMESPACE = f"{CATALOG}.{SCHEMA}"
EXECUTION_TABLE = f"{_NAMESPACE}.dq_execution_traceability"
VALIDATIONS_TABLE = f"{_NAMESPACE}.dq_validations_traceability"
EVIDENCES_TABLE = f"{_NAMESPACE}.dq_evidences"

# 3. Función principal
def _latest_execution_ids(max_ids=2):
    """Return the ids of the most recent finished executions, newest first.

    Only executions whose status is SUCCESS or FAILED are considered. At most
    ``max_ids`` ids are returned; the list is empty when none qualifies.
    """
    rows = (
        spark.table(EXECUTION_TABLE)
        .filter(col("status").isin("SUCCESS", "FAILED"))
        .orderBy(col("execution_timestamp").desc())
        .select("execution_id")
        .limit(max_ids)
        .collect()
    )
    return [row.execution_id for row in rows]


def _evidence_keys(execution_id):
    """Return the (execution_id, validation_id, table_pk) rows of one execution."""
    return (
        spark.table(EVIDENCES_TABLE)
        .filter(col("execution_id") == execution_id)
        .select("execution_id", "validation_id", "table_pk")
    )


def _count_failures(df_evidences):
    """Aggregate flagged evidences into new/persistent counters per validation.

    ``is_new_failure`` is always 0 or 1 (never null), so the rows have to be
    counted conditionally: a plain ``count("is_new_failure")`` would count
    every row, because ``count`` on a column only skips nulls.
    """
    return df_evidences.groupBy("execution_id", "validation_id").agg(
        # `when` without `otherwise` yields null, which `count` ignores.
        count(when(col("is_new_failure") == 1, lit(1))).alias("new_failures"),
        count(when(col("is_new_failure") == 0, lit(1))).alias("persistent_failures"),
    )


def _merge_validation_counters(delta_validations, df_failures):
    """Write the three failure counters onto the matching validation rows."""
    (
        delta_validations.alias("target")
        .merge(
            df_failures.alias("source"),
            (col("target.execution_id") == col("source.execution_id"))
            & (col("target.validation_id") == col("source.validation_id")),
        )
        .whenMatchedUpdate(
            set={
                "new_failures": col("source.new_failures"),
                "persistent_failures": col("source.persistent_failures"),
                "resolved_failures": col("source.resolved_failures"),
            }
        )
        .execute()
    )


def _merge_evidence_flags(delta_evidences, df_evidences):
    """Write ``is_new_failure`` onto the matching evidence rows."""
    (
        delta_evidences.alias("target")
        .merge(
            df_evidences.alias("source"),
            (col("target.execution_id") == col("source.execution_id"))
            & (col("target.validation_id") == col("source.validation_id"))
            & (col("target.table_pk") == col("source.table_pk")),
        )
        .whenMatchedUpdate(set={"is_new_failure": col("source.is_new_failure")})
        .execute()
    )


def main():
    """Classify the failures of the latest execution and persist the result.

    The latest SUCCESS/FAILED execution is compared with the one right before
    it, matching evidences on (validation_id, table_pk):

    * new        - present in the latest execution only
    * persistent - present in both executions
    * resolved   - present in the previous execution only

    Evidence rows of the latest execution get ``is_new_failure`` set, and the
    validation rows of the latest execution get the three counters. Both
    writes are MERGE updates of existing rows; nothing is inserted.

    NOTE(review): executions are not filtered by the ``table_name`` widget;
    confirm that the execution table only holds runs that should be compared.

    Returns:
        A short status message (kept in Spanish, as emitted by the notebook).

    Raises:
        Any Spark/Delta error is propagated unchanged to the caller.
    """
    # 1. Latest finished execution and its predecessor (one ordered query).
    execution_ids = _latest_execution_ids(max_ids=2)
    if not execution_ids:
        return "No existen ejecuciones"

    latest_execution_id = execution_ids[0]
    prev_execution_id = execution_ids[1] if len(execution_ids) > 1 else None

    # Resolve both tables up front so a missing table fails before any write.
    delta_validations = DeltaTable.forName(spark, VALIDATIONS_TABLE)
    delta_evidences = DeltaTable.forName(spark, EVIDENCES_TABLE)

    df_current = _evidence_keys(latest_execution_id)

    if prev_execution_id is None:
        # 2. First execution: nothing to compare against, everything is new.
        df_evidences = df_current.withColumn("is_new_failure", lit(1))
        df_failures = _count_failures(df_evidences).withColumn(
            "resolved_failures", lit(0)
        )
        message = "Primera ejecución procesada correctamente"
    else:
        # 3. Later executions: compare with the previous execution.
        df_curr = df_current.alias("curr")
        df_prev = _evidence_keys(prev_execution_id).select(
            col("validation_id").alias("prev_validation_id"),
            col("table_pk").alias("prev_table_pk"),
        )

        same_failure = (
            col("curr.validation_id") == col("prev.prev_validation_id")
        ) & (col("curr.table_pk") == col("prev.prev_table_pk"))

        # 3.1 is_new_failure: current evidences without a match in prev.
        # Distinct prev keys keep the left join from multiplying current rows,
        # which would hand MERGE several source rows for one target row.
        df_evidences = (
            df_curr.join(df_prev.distinct().alias("prev"), same_failure, "left_outer")
            .withColumn(
                "is_new_failure",
                when(col("prev.prev_validation_id").isNull(), lit(1)).otherwise(lit(0)),
            )
            .select(
                col("curr.execution_id"),
                col("curr.validation_id"),
                col("curr.table_pk"),
                col("is_new_failure"),
            )
        )

        # 3.2 resolved_failures: prev evidences without a match in current.
        # Keyed by validation only: the counter belongs to the LATEST
        # execution, so the previous execution id must not take part in the
        # join below (it can never equal the latest one).
        df_resolved = (
            df_prev.alias("prev")
            .join(df_curr, same_failure, "left_anti")
            .groupBy("prev_validation_id")
            .agg(count("*").alias("resolved_failures"))
            .withColumnRenamed("prev_validation_id", "validation_id")
        )

        # 3.3 Combine the counters. The full outer join keeps validations whose
        # failures were all resolved (no current evidences at all).
        df_failures = (
            _count_failures(df_evidences)
            .join(df_resolved, ["validation_id"], "full_outer")
            .withColumn("execution_id", lit(latest_execution_id))
            .fillna(
                {
                    "new_failures": 0,
                    "persistent_failures": 0,
                    "resolved_failures": 0,
                }
            )
        )
        message = "Ejecución procesada correctamente."

    # 4. Persist: validations first, then evidences.
    _merge_validation_counters(delta_validations, df_failures)
    _merge_evidence_flags(delta_evidences, df_evidences)

    return message

if __name__ == "__main__":
    # Run the persistence calculation and report the outcome as the notebook's
    # exit value. The exit call sits outside the try block, so anything raised
    # by dbutils.notebook.exit itself is never intercepted here.
    try:
        main()
    except Exception as e:
        exit_message = f"Fallo en la persistencia de evidencias: {e}"
    else:
        exit_message = "Éxito: evidencias procesadas"
    dbutils.notebook.exit(exit_message)