# 1. Setting up the environment and loading the parameters.

### a. Importing the libraries and configurations

In [0]:
# Importing required files & libraries
import time
import json
from datetime import datetime, timedelta, timezone
import logging
from pyspark.sql.types import *
from pyspark.sql.functions import *
from typing import *

In [0]:
# Configure logging
# Log timestamps are rendered in Vietnam time (UTC+7) regardless of the host's local zone.
vietnam_tz = timezone(timedelta(hours=7))
# Convert the record's own creation time rather than sampling the clock when the
# message is formatted. The epoch seconds are always the last positional argument,
# whether or not the converter ends up being invoked as a bound method.
logging.Formatter.converter = lambda *args: datetime.fromtimestamp(args[-1], vietnam_tz).timetuple()
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

### b. Defining widgets and parameters & loading configs

In [0]:
# Register the notebook's text widgets (empty default value, empty label)
_WIDGET_NAMES = ("environment", "job_name")
for _widget_name in _WIDGET_NAMES:
    dbutils.widgets.text(_widget_name, "", "")
# Read back the value supplied for each widget, in the same order
environment, job_name = map(dbutils.widgets.get, _WIDGET_NAMES)

In [0]:
# Load the per-environment settings. The context manager guarantees the file
# handle is closed even if JSON parsing fails; JSON is read as UTF-8 explicitly
# so the result does not depend on the platform's default encoding.
with open("../configs/config.json", encoding="utf-8") as config:
    settings = json.load(config)

In [0]:
# Pick the values this notebook needs out of the section for the selected environment
_env_settings = settings[environment]
catalog_name = _env_settings['catalogName']
bronze_schema = _env_settings['bronzeSchema']
eventhub_invalid_records_path = _env_settings["eventhub_invalid_records_path"]

In [0]:
# Emit the resolved configuration as one multi-line log message
_config_report_lines = (
    "Configuration Loaded:",
    f"  catalog_name                        : {catalog_name}",
    f"  bronze_schema                       : {bronze_schema}",
    f"  eventhub_invalid_records_path       : {eventhub_invalid_records_path}",
)
logger.info("\n".join(_config_report_lines))

# 2. Reading batch-wise row counts from the Event Hub audit log table, aggregating them, and writing them to the consolidated ingestion audit log table with schema validation results

In [0]:
def get_eventhub_list(catalog_name: str) -> List[str]:
    """
    Return the distinct raw table names registered for the EVENTHUB source.

    The names are read from ``{catalog_name}.default.lookup_table_source2raw``.
    Any failure while querying is logged and then re-raised to the caller.
    """
    lookup_sql = f"""
        SELECT distinct table_name
        FROM {catalog_name}.default.lookup_table_source2raw
        WHERE source_name = 'EVENTHUB'
    """
    try:
        lookup_rows = spark.sql(lookup_sql).select("table_name").collect()
        table_names = [record["table_name"] for record in lookup_rows]
        logger.info(f"Fetched Event Hub tables: {table_names}")
    except Exception as e:
        # Log for the job output, then let the caller see the original exception
        logger.error(f"Failed to fetch Event Hub list: {e}")
        raise
    return table_names


def get_invalid_record_summary(
    hubs: List[str],
    invalid_path: str,
    proc_date: str
) -> DataFrame:
    """
    Build a per-hub schema validation summary for one processing date.

    For every hub, the Delta data under ``{invalid_path}/{hub}/`` is read, the
    rows whose ``ingestion_time`` falls on ``proc_date`` are counted, and their
    distinct ``error`` values are joined into a single comma-separated string.

    Args:
        hubs: Event Hub names; each one is a sub-directory of ``invalid_path``.
        invalid_path: Root location of the invalid-record Delta data.
        proc_date: Processing date as a ``yyyyMMdd`` string.

    Returns:
        A DataFrame with one row per hub and the columns ``hub``,
        ``audit_time_utc``, ``invalid_record_count``, ``schema_match_status``
        ("Matched", "Mismatched" or "ERROR") and ``schema_mismatches``.
        A hub whose data cannot be read is reported with a count of -1, status
        "ERROR" and the exception text instead of aborting the whole run.
    """
    # Timezone-aware audit timestamps (UTC+7), so the stored instant is unambiguous.
    audit_tz = timezone(timedelta(hours=7))
    # Parse proc_date with the same pattern used for ingestion_time so two DATE
    # values are compared. Comparing the DATE column against the raw 'yyyyMMdd'
    # string depends on Spark's implicit string-to-date cast, which is not
    # expected to accept that compact layout and would leave the filter empty.
    target_date = to_date(lit(proc_date), "yyyyMMdd")

    audit_rows = []
    for hub in hubs:
        logger.info(f"Checking hub: {hub}")
        try:
            df = spark.read.format("delta").load(f"{invalid_path}/{hub}/")
            df = df.withColumn("ingestion_time", to_date(col("ingestion_time"), "yyyyMMdd"))
            filtered_df = df.filter(col("ingestion_time") == target_date)
            invalid_count = filtered_df.count()
            status = "Mismatched" if invalid_count > 0 else "Matched"
            # Only collect the error messages when there is something to report
            error_str = (
                filtered_df.agg(concat_ws(", ", collect_set("error")).alias("error_string")).collect()[0]["error_string"]
                if invalid_count > 0 else None
            )
            audit_rows.append((hub, datetime.now(audit_tz), invalid_count, status, error_str))
        except Exception as e:
            # Best effort: record the failure for this hub and keep auditing the others
            logger.error(f"Error reading data for hub '{hub}': {str(e)}")
            audit_rows.append((hub, datetime.now(audit_tz), -1, "ERROR", str(e)))

    audit_schema = StructType([
        StructField("hub", StringType(), True),
        StructField("audit_time_utc", TimestampType(), True),
        StructField("invalid_record_count", IntegerType(), True),
        StructField("schema_match_status", StringType(), True),
        StructField("schema_mismatches", StringType(), True)
    ])
    return spark.createDataFrame(audit_rows, audit_schema)


def get_bronze_summary(proc_date: str, catalog_name: str, hubs: List[str]) -> DataFrame:
    """
    Summarise the streamed row counts recorded in the Event Hub audit log.

    Rows of ``{catalog_name}.default.eventhub_audit_log`` for ``proc_date`` are
    summed per hub into ``streaming_df_count``. Every hub in ``hubs`` that has
    no audit rows for that date is added with a count of 0.
    """
    audit_log = spark.table(f"{catalog_name}.default.eventhub_audit_log")
    rows_for_date = audit_log.filter(col("proc_date") == proc_date).select(
        "proc_date", "row_count", "hub"
    )
    # NOTE(review): `sum` is presumably the Spark aggregate brought in by the
    # file's wildcard import of pyspark.sql.functions, not the Python builtin.
    per_hub_totals = rows_for_date.groupBy("hub", "proc_date").agg(
        sum("row_count").alias("streaming_df_count")
    )

    # Hubs that logged nothing for the date still get a row, with a zero count
    hubs_with_rows = {
        record["hub"] for record in per_hub_totals.select("hub").distinct().collect()
    }
    zero_count_rows = [(hub, proc_date, 0) for hub in set(hubs) - hubs_with_rows]
    zero_count_df = spark.createDataFrame(zero_count_rows, per_hub_totals.schema)
    return per_hub_totals.unionByName(zero_count_df)


def get_delta_summary(hubs: List[str], proc_date: str, catalog_name: str, bronze_schema: str) -> DataFrame:
    """
    Count the rows present in each hub's Delta table for one processing date.

    For each hub, ``{catalog_name}.{bronze_schema}.{hub}`` is read, a
    ``proc_date`` column is derived from the first 8 characters of ``ProcDate``,
    and the rows matching ``proc_date`` are counted into ``delta_row_count``.
    A hub that fails is logged and skipped.

    Raises:
        ValueError: if no hub produced a summary.
    """
    combined = None
    for hub in hubs:
        try:
            logger.info(f"Processing delta table for hub: {hub}")
            table_df = spark.table(f"{catalog_name}.{bronze_schema}.{hub}")
            dated_df = table_df.withColumn("proc_date", col("ProcDate").substr(1, 8))
            hub_summary = (
                dated_df.filter(col("proc_date") == proc_date)
                .groupBy("proc_date")
                .agg(count("*").alias("delta_row_count"))
                .withColumn("hub", lit(hub))
            )
        except Exception as e:
            logger.error(f"Error processing hub {hub}: {str(e)}")
            continue
        # Fold each successful summary into the running union, in hub order
        combined = hub_summary if combined is None else combined.unionByName(hub_summary)

    if combined is None:
        raise ValueError("No delta summaries were generated.")
    return combined


def build_final_audit_df(
    bronze_df: DataFrame,
    delta_df: DataFrame,
    audit_df: DataFrame
) -> DataFrame:
    """
    Combine the Bronze, Delta and invalid-record summaries into one audit DataFrame.

    Args:
        bronze_df: Per-hub streaming counts (``hub``, ``proc_date``, ``streaming_df_count``).
        delta_df: Per-hub Delta counts (``hub``, ``proc_date``, ``delta_row_count``).
        audit_df: Per-hub schema validation results (``hub``, ``invalid_record_count``,
            ``schema_match_status``, ``schema_mismatches``).

    Returns:
        A DataFrame with the columns ``hub``, ``proc_date``, ``streaming_df_count``,
        ``delta_row_count``, ``invalid_record_count``, ``schema_match_status`` and
        ``schema_mismatches``. Bronze and Delta are full-outer joined on
        (``hub``, ``proc_date``); the validation results are left-joined on ``hub``.
    """
    return (
        bronze_df.alias("bronze")
        .join(delta_df.alias("delta"), ["hub", "proc_date"], "outer")
        .join(audit_df.alias("audit"), "hub", "left")
        .select(
            # Select the merged join keys rather than the Bronze-side ones, so a
            # row that exists only on the Delta side keeps its hub and proc_date
            # instead of getting NULL keys.
            "hub", "proc_date",
            "bronze.streaming_df_count", "delta.delta_row_count",
            col("audit.invalid_record_count"),
            col("audit.schema_match_status"),
            col("audit.schema_mismatches")
        )
    )


def write_to_audit_table(
    final_df: DataFrame,
    catalog_name: str,
    proc_date_col: str = "proc_date",
    environment: str = "dev",
    pipeline_name: str = "eventhub_source_to_bronze_workflow",
) -> DataFrame:
    """
    Append the final audit DataFrame to ``{catalog_name}.default.consolidate_audit_result``.

    The input is enriched with metadata columns (load date, source name and
    description, table name, pipeline name, environment, source/target counts)
    and then projected onto the target table's schema: columns present in the
    target but missing from the input are filled with NULL, and every column is
    cast to the target's data type.

    Args:
        final_df: Audit DataFrame containing at least ``hub``, ``streaming_df_count``,
            ``delta_row_count`` and the column named by ``proc_date_col``.
        catalog_name: Catalog that holds the target table.
        proc_date_col: Name of the ``yyyyMMdd`` column used to derive ``load_date``.
        environment: Value written to the ``environment`` column. Defaults to
            "dev"; callers can pass the actual deployment environment.
        pipeline_name: Value written to the ``pipeline_name`` column.

    Returns:
        The DataFrame that was appended to the target table.
    """
    target_table = f"{catalog_name}.default.consolidate_audit_result"
    target_schema = spark.table(target_table).schema

    enriched_df = (
        final_df
        .withColumn("load_date", to_date(col(proc_date_col), "yyyyMMdd"))
        .withColumn("source_name", lit("EventHub"))
        .withColumn("source_description", lit("EventHub"))
        .withColumn("table_name", col("hub"))
        .withColumn("pipeline_name", lit(pipeline_name))
        .withColumn("environment", lit(environment))
        .withColumn("source_object_count", col("streaming_df_count").cast("string"))
        .withColumn("target_object_count", col("delta_row_count").cast("string"))
    )

    # Align to the target schema in a single projection: calling withColumn once
    # per target field adds one projection each time and bloats the query plan.
    available_columns = set(enriched_df.columns)
    aligned_columns = [
        (col(field.name) if field.name in available_columns else lit(None))
        .cast(field.dataType)
        .alias(field.name)
        for field in target_schema
    ]

    final_insert_df = enriched_df.select(*aligned_columns)
    final_insert_df.write.mode("append").format("delta").saveAsTable(target_table)
    return final_insert_df


In [0]:
def main(
    hubs: List[str],
    invalid_path: str,
    proc_date: str,
    catalog_name: str,
    bronze_schema: str
):
    """
    Run the Event Hub audit for one processing date.

    Builds the invalid-record, Bronze and Delta summaries, joins them into a
    single audit DataFrame and appends the result to the audit table, logging
    progress after each step.
    """
    logger.info(f"Starting EventHub audit process for proc_date: {proc_date}")

    # 1. Invalid-record counts and schema-match status per hub
    invalid_summary = get_invalid_record_summary(
        hubs=hubs, invalid_path=invalid_path, proc_date=proc_date
    )
    logger.info("Completed schema validation audit summary.")

    # 2. Streaming row counts from the audit log
    bronze_summary = get_bronze_summary(
        proc_date=proc_date, catalog_name=catalog_name, hubs=hubs
    )
    logger.info("Fetched Bronze layer summary.")

    # 3. Row counts from the Delta tables
    delta_summary = get_delta_summary(
        hubs=hubs, proc_date=proc_date, catalog_name=catalog_name, bronze_schema=bronze_schema
    )
    logger.info("Fetched Delta layer summary.")

    # 4. Combine the three summaries
    combined_audit = build_final_audit_df(
        bronze_df=bronze_summary, delta_df=delta_summary, audit_df=invalid_summary
    )
    logger.info("Built final audit DataFrame.")

    # 5. Persist the combined result
    write_to_audit_table(final_df=combined_audit, catalog_name=catalog_name)
    logger.info("Final audit data written to target table successfully.")

# Resolve the Event Hub tables to audit; the run cannot proceed without any
event_hubs_list = get_eventhub_list(catalog_name)
if not event_hubs_list:
    raise ValueError("No Event Hub tables found")

# The audit covers the previous calendar day in UTC+7, formatted as yyyyMMdd
_now_utc7 = datetime.now(timezone(timedelta(hours=7)))
yesterday = (_now_utc7 - timedelta(days=1)).strftime("%Y%m%d")

main(
    event_hubs_list,
    eventhub_invalid_records_path,
    yesterday,
    catalog_name,
    bronze_schema,
)

