In [0]:

'''
Calcular persistencia

Lógica
1. 'main()' orquesta el proceso.
2. Recupera las ejecuciones SUCCESS o FAILED para el cálculo
3. Recupera las validaciones de la última ejecución
4. Recupera las validaciones de la ejecución anterior
5. Realiza los cálculos para incidencias nuevas, persistentes o resueltas
6. Actualiza las tablas de validaciones y evidencias
'''

# 1. Imports
from pyspark.sql.functions import col, lag, desc, row_number, count, when, lit, coalesce
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from pyspark.sql.types import IntegerType
import sys

# 2. Widgets
def get_mandatory_widget(name, label=None):
    """Return the value of a mandatory notebook text widget.

    The widget is (re)declared on a best-effort basis and then read back
    through ``dbutils.widgets``.

    Args:
        name: Widget name, also used in the error message.
        label: Label passed to ``dbutils.widgets.text``; defaults to ``name``.

    Returns:
        The widget value exactly as returned by ``dbutils.widgets.get``
        (it is not stripped).

    Raises:
        ValueError: If the value is empty or contains only whitespace.
    """
    if label is None:
        label = name
    try:
        # Best-effort creation of the widget with an empty default.
        dbutils.widgets.text(name, "", label)
    except Exception:
        # Creation errors are ignored on purpose (e.g. the widget already
        # exists). Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit are never swallowed.
        pass
    value = dbutils.widgets.get(name)
    if not value.strip():
        raise ValueError(f"Parámetro obligatorio: {name}")
    return value

# Unity Catalog location of the data-quality tables, read from mandatory widgets.
CATALOG = get_mandatory_widget(
    "catalog_name",
    "Catálogo de UC donde residen las tablas",
)
SCHEMA = get_mandatory_widget(
    "schema_name",
    "Esquema de UC donde residen las tablas",
)

# Fully qualified (catalog.schema.table) names of the tables used below.
EXECUTION_TABLE = ".".join((CATALOG, SCHEMA, "dq_execution_traceability"))
VALIDATIONS_TABLE = ".".join((CATALOG, SCHEMA, "dq_validations_traceability"))
EVIDENCES_TABLE = ".".join((CATALOG, SCHEMA, "dq_evidences"))

# 3. Función principal
def _merge_validation_counts(delta_validations, df_counts):
    """Upsert failure counters into the validations table.

    Rows are matched on (execution_id, validation_id). Matched rows get their
    three counters overwritten; unmatched rows are inserted with the keys and
    the three counters only.
    """
    keys = ("execution_id", "validation_id")
    counters = ("new_failures", "persistent_failures", "resolved_failures")
    (
        delta_validations.alias("target")
        .merge(
            df_counts.alias("source"),
            (col("target.execution_id") == col("source.execution_id"))
            & (col("target.validation_id") == col("source.validation_id")),
        )
        .whenMatchedUpdate(
            set={name: col(f"source.{name}") for name in counters}
        )
        .whenNotMatchedInsert(
            values={name: col(f"source.{name}") for name in keys + counters}
        )
        .execute()
    )


def main():
    """Classify the latest execution's evidences as new, persistent or resolved.

    Steps:
      1. Read the two most recent executions with status SUCCESS or FAILED.
      2. If there is no previous execution, every evidence of the latest
         execution is counted and flagged as new.
      3. Otherwise compare the evidences of both executions on
         (validation_id, table_pk, failed_value):
           - only in the latest execution   -> new
           - in both executions             -> persistent
           - only in the previous execution -> resolved
      4. Upsert the counters into the validations table and set
         ``is_new_failure`` on the evidences of the latest execution.

    Returns:
        A status message (str) describing which path was taken.

    Raises:
        Any Spark / Delta exception raised while reading or writing the tables
        propagates unchanged.
    """
    # 1. The two most recent finished executions, fetched in one Spark job.
    recent = (
        spark.table(EXECUTION_TABLE)
        .filter(col("status").isin("SUCCESS", "FAILED"))
        .select("execution_id", "execution_timestamp")
        .orderBy(col("execution_timestamp").desc())
        .limit(2)
        .collect()
    )
    if not recent:
        return "No existen ejecuciones"

    latest_execution_id = recent[0].execution_id
    prev_execution_id = recent[1].execution_id if len(recent) > 1 else None

    delta_validations = DeltaTable.forName(spark, VALIDATIONS_TABLE)
    delta_evidences = DeltaTable.forName(spark, EVIDENCES_TABLE)

    is_latest = col("execution_id") == latest_execution_id

    # 2. First execution: everything found in it is a new failure.
    if prev_execution_id is None:
        # Only the latest execution is aggregated, so evidences belonging to
        # executions in any other status never reach the validations table.
        df_counts = (
            spark.table(EVIDENCES_TABLE)
            .filter(is_latest)
            .groupBy("execution_id", "validation_id")
            .agg(F.count("*").alias("new_failures"))
            .withColumn("persistent_failures", lit(0).cast(IntegerType()))
            .withColumn("resolved_failures", lit(0).cast(IntegerType()))
        )
        _merge_validation_counts(delta_validations, df_counts)

        # A conditional update is enough here and, unlike a self-merge, it
        # cannot abort when several evidences share the same key.
        delta_evidences.update(
            condition=is_latest,
            set={"is_new_failure": lit(True)},
        )
        return "Primera ejecución procesada correctamente"

    # 3. Subsequent executions: compare the latest evidences with the previous ones.
    df_current = (
        spark.table(EVIDENCES_TABLE)
        .filter(is_latest)
        .select("execution_id", "validation_id", "table_pk", "failed_value")
    )
    # Columns are prefixed so that name-based references in the joins below
    # are unambiguous.
    df_prev = (
        spark.table(EVIDENCES_TABLE)
        .filter(col("execution_id") == prev_execution_id)
        .select(
            col("validation_id").alias("prev_validation_id"),
            col("table_pk").alias("prev_table_pk"),
            col("failed_value").alias("prev_failed_value"),
        )
    )

    # Null-safe equality on failed_value: with a plain ``==`` a NULL value
    # never matches, so such an evidence would be counted as both new and
    # resolved on every run.
    same_evidence = (
        (col("validation_id") == col("prev_validation_id"))
        & (col("table_pk") == col("prev_table_pk"))
        & col("failed_value").eqNullSafe(col("prev_failed_value"))
    )
    keys = ["execution_id", "validation_id"]

    # 3.1 New failures: present now, absent before.
    df_new_fails = df_current.join(df_prev, same_evidence, "left_anti")
    df_new_count = df_new_fails.groupBy(*keys).agg(
        F.count("*").alias("new_failures")
    )

    # 3.2 Resolved failures: present before, absent now.
    # NOTE(review): the count is attributed to the latest execution. Keying
    # it by the previous execution id made the merge below overwrite the
    # previous execution's own counters - confirm this is the intended
    # reporting semantics.
    df_resolved_count = (
        df_prev.join(df_current, same_evidence, "left_anti")
        .groupBy(col("prev_validation_id").alias("validation_id"))
        .agg(F.count("*").alias("resolved_failures"))
        .withColumn("execution_id", lit(latest_execution_id))
    )

    # 3.3 Persistent failures: present in both executions.
    df_persistent_count = (
        df_current.join(df_prev, same_evidence, "inner")
        .groupBy(*keys)
        .agg(F.count("*").alias("persistent_failures"))
    )

    # 3.4 Combine the three counters. Nulls are filled only after both outer
    # joins so that validations appearing in just one of the inputs still get
    # 0 (not NULL) for the other counters.
    df_failures = (
        df_new_count.join(df_persistent_count, keys, "outer")
        .join(df_resolved_count, keys, "outer")
        .fillna(
            {
                "new_failures": 0,
                "persistent_failures": 0,
                "resolved_failures": 0,
            }
        )
    )

    # 3.5 Upsert the counters into the validations table.
    _merge_validation_counts(delta_validations, df_failures)

    # 3.6 Reset the flag for the latest execution, then raise it for the
    # evidences classified as new.
    delta_evidences.update(
        condition=is_latest,
        set={"is_new_failure": lit(False)},
    )
    (
        delta_evidences.alias("target")
        .merge(
            # De-duplicated so that two identical source rows cannot match
            # the same target row, which would make the merge fail.
            df_new_fails.dropDuplicates().alias("source"),
            (col("target.execution_id") == col("source.execution_id"))
            & (col("target.validation_id") == col("source.validation_id"))
            & (col("target.table_pk") == col("source.table_pk"))
            & col("target.failed_value").eqNullSafe(col("source.failed_value")),
        )
        .whenMatchedUpdate(set={"is_new_failure": lit(True)})
        .execute()
    )

    return "Ejecución procesada correctamente."

if __name__ == "__main__":
    # Entry point: run the persistence calculation. Any failure is re-raised
    # as RuntimeError, chained to the original exception so its traceback is
    # kept as the explicit cause; on success the notebook exits with a fixed
    # message.
    try:
        main()
    except Exception as e:
        raise RuntimeError(f"Fallo en la persistencia de evidencias: {e}") from e
    else:
        dbutils.notebook.exit("Éxito: evidencias procesadas")