#### 1. Setup Env Parameters

In [0]:
// Declare the notebook widgets (name -> default value). Declaration order is
// kept: source settings, target settings, pipeline name, metadata table prefix.
Seq(
  "source_catalog"        -> "ag_content_ims_acs",
  "source_environment"    -> "prod",
  "source_version"        -> "",
  "target_catalog"        -> "ag_ra_search_analytics_data",
  "target_environment"    -> "dev",
  "target_version"        -> "v1_0",
  "pipeline_name"         -> "",
  "metadata_table_prefix" -> ""
).foreach { case (widgetName, defaultValue) =>
  dbutils.widgets.text(widgetName, defaultValue)
}

// Dynamic parameters: read the current value of every widget declared above.
val source_catalog: String        = dbutils.widgets.get("source_catalog")
val source_environment: String    = dbutils.widgets.get("source_environment")
val source_version: String        = dbutils.widgets.get("source_version")
val target_catalog: String        = dbutils.widgets.get("target_catalog")
val target_environment: String    = dbutils.widgets.get("target_environment")
val target_version: String        = dbutils.widgets.get("target_version")
val pipeline_name: String         = dbutils.widgets.get("pipeline_name")
val metadata_table_prefix: String = dbutils.widgets.get("metadata_table_prefix")



source_catalog: String = "ag_content_ims_acs"
source_environment: String = "prod"
source_version: String = ""
target_catalog: String = "ag_ra_search_analytics_data"
target_environment: String = "dev"
target_version: String = "v1_0"
pipeline_name: String = "agra-sa-doc-wos-pipeline"

#### 2. DapOps

In [0]:
import org.apache.spark.sql.{DataFrame, SparkSession, Column}
import org.apache.spark.sql.functions._
import io.delta.tables.DeltaTable
import scala.jdk.CollectionConverters._
import java.sql.Timestamp
import spark.implicits._
import java.time.Instant

/**
 * Operations ("Ops") table management for the DAP pipelines.
 *
 * Holds the fully qualified names of the Ops tables (watermarks, checkpoints,
 * task runs, registry, task upstream) and the helpers that create, drop and
 * seed them through `spark.sql`.
 */
object DapOps {

  // NonFatal lets fatal JVM errors and thread interruption propagate instead of
  // being swallowed by the best-effort mode of executeSqlStatements.
  import scala.util.control.NonFatal

  // ---------------------------------------------------------
  // 1.  define the immutable variables: schemas, watermark tables, pipeline, etc.
  // ---------------------------------------------------------
  private val tablePrefix = SchemaResolver.OPS_TABLE_PREFIX

  val DAP_SCHEMA_OPS = SchemaResolver.OPS_SCHEMA
  //val DAP_SCHEMA_OPS = "ag_ra_search_analytics_data_dev.sandbox_v1_0" // for test only

  // NOTE(review): `tablePrefix` is concatenated in front of the schema name (not
  // the table name) and only for WATERMARK, CHECKPOINT and PIPELINE_TASK_RUN; the
  // history and metadata tables do not use it. Left unchanged because
  // SchemaResolver is not visible here -- confirm this is intended.
  val WATERMARK: String = s"${tablePrefix}${DAP_SCHEMA_OPS}.dap_watermarks"
  val WATERMARK_HISTORY: String = s"${DAP_SCHEMA_OPS}.dap_watermark_history"
  val CHECKPOINT: String = s"${tablePrefix}${DAP_SCHEMA_OPS}.dap_checkpoints"
  val CHECKPOINT_HISTORY: String = s"${DAP_SCHEMA_OPS}.dap_checkpoint_history"
  val PIPELINE_TASK_RUN: String = s"${tablePrefix}${DAP_SCHEMA_OPS}.dap_pipeline_task_runs"

  val REGISTRY: String = s"${DAP_SCHEMA_OPS}.dap_pipeline_registry"
  val PIPELINE_TASK_UPSTREAM: String = s"${DAP_SCHEMA_OPS}.dap_pipeline_task_upstream"

  val PIPELINE_META_INSERT_SQL_FILE: String = "insert_dap_pipeline_registry.sql"
  val TASK_META_INSERT_SQL_FILE: String = "insert_dap_task_upstream.sql"


  // ---------------------------------------------------------
  // 2  define the functions for Ops tables
  // ---------------------------------------------------------

  /**
   * Drops every Ops table. Destructive: all Ops data is removed.
   *
   * @param dryRun when `true` (the default) nothing is executed; pass `false`
   *               to actually issue the `DROP TABLE IF EXISTS` statements.
   */
  def dropOpsTables(
    dryRun: Boolean = true
    ): Unit = {
    if (!dryRun) {
      // Same drop order as before: control tables, history tables, then metadata.
      Seq(
        WATERMARK,
        CHECKPOINT,
        WATERMARK_HISTORY,
        CHECKPOINT_HISTORY,
        REGISTRY,
        PIPELINE_TASK_RUN,
        PIPELINE_TASK_UPSTREAM
      ).foreach(table => spark.sql(s"DROP TABLE IF EXISTS $table"))
    }
  }

  /**
   * Creates every Ops table that does not exist yet.
   *
   * @param dryRun when `true` nothing is executed; defaults to `false`.
   */
  def createOpsTables(
    dryRun: Boolean = false
    ): Unit = {
    if (!dryRun) {
      // data for control & tracking
      createCheckpointTable(CHECKPOINT)
      createCheckpointTable(CHECKPOINT_HISTORY)
      createWatermarkTable(WATERMARK)
      createWatermarkTable(WATERMARK_HISTORY)
      createPipelineTaskRunTable(PIPELINE_TASK_RUN)
      // data for meta
      createRegistryTable(REGISTRY)
      createPipelineUpstreamTable(PIPELINE_TASK_UPSTREAM)
    }
  }

  /**
   * Runs the SQL statements read from the pipeline-registry insert file and then
   * from the task-upstream insert file. Execution stops at the first failing
   * statement and the error is rethrown.
   */
  def insertPipelineMeta(): Unit = {
    Seq(PIPELINE_META_INSERT_SQL_FILE, TASK_META_INSERT_SQL_FILE).foreach { sqlFile =>
      executeSqlStatements(
        sqlStatements = DapIO.readSQL(sqlFile),
        stopOnError = true
      )
    }
  }

  /** Misspelled legacy name kept so existing callers keep working. */
  @deprecated("Misspelled name; use insertPipelineMeta instead")
  def InsertPieplieMeta(): Unit = insertPipelineMeta()

  /**
   * Creates the watermark Delta table `tableName` if it does not exist.
   *
   * @param tableName fully qualified name of the table to create
   */
  def createWatermarkTable(tableName: String): Unit = {
    spark.sql(
      s"""
         |CREATE TABLE IF NOT EXISTS ${tableName} (
         |  batch_id LONG,
         |  table_name STRING,
         |  start_version BIGINT,
         |  end_version BIGINT,
         |  last_processed_version BIGINT,
         |  latest_available_version BIGINT,
         |  start_ts TIMESTAMP,
         |  end_ts TIMESTAMP,
         |  cdf_enabled BOOLEAN,
         |  status STRING,
         |  error_message STRING,
         |  update_ts TIMESTAMP,
         |  updated_by STRING
         |) USING DELTA
       """.stripMargin)
  }

  /**
   * Misspelled legacy name kept so existing callers keep working.
   *
   * @param tableName     fully qualified name of the table to create
   * @param pipeline_name unused; retained only for source compatibility
   */
  @deprecated("Misspelled name; use createWatermarkTable instead")
  def createMatermarkTable(tableName: String, pipeline_name: String = ""): Unit =
    createWatermarkTable(tableName)

  /**
   * Creates the checkpoint Delta table `tableName` (partitioned by
   * `pipeline_name`) if it does not exist.
   *
   * @param tableName fully qualified name of the table to create
   */
  def createCheckpointTable(tableName: String): Unit = {
    spark.sql(
      s"""
         |CREATE TABLE IF NOT EXISTS  ${tableName} (
         |  pipeline_name STRING,
         |  batch_id LONG,
         |  start_ts TIMESTAMP,
         |  end_ts TIMESTAMP,
         |  processed_ts LONG,
         |  status STRING,
         |  rows_read BIGINT,
         |  rows_written BIGINT,
         |  retry LONG,
         |  error_message STRING,
         |  update_ts TIMESTAMP,
         |  updated_by STRING
         |) USING DELTA
         |PARTITIONED BY (pipeline_name);
       """.stripMargin)
  }

  /**
   * Creates the pipeline registry Delta table `tableName` if it does not exist.
   *
   * @param tableName fully qualified name of the table to create
   */
  def createRegistryTable(tableName: String): Unit = {
   spark.sql(
      s"""
         |CREATE TABLE IF NOT EXISTS ${tableName} (
         |  pipeline_name STRING,
         |  type STRING,
         |  product STRING,
         |  owner STRING,
         |  email STRING,
         |  description STRING,
         |  upstream_tables ARRAY<STRING>,
         |  update_ts TIMESTAMP,
         |  updated_by STRING
         |) USING DELTA
       """.stripMargin)
  }

  /**
   * Creates the pipeline task-run Delta table `tableName` (partitioned by
   * `pipeline_name` and `task_name`) if it does not exist.
   *
   * @param tableName fully qualified name of the table to create
   */
  def createPipelineTaskRunTable(tableName: String): Unit = {
    spark.sql(
      s"""
        |CREATE TABLE IF NOT EXISTS ${tableName} (
         |  pipeline_name        STRING,
         |  run_id               STRING,
         |  task_name            STRING,
         |  status               STRING,   
         |  attempt              INT,
         |  started_at           TIMESTAMP,
         |  completed_at         TIMESTAMP,
         |  error_message        STRING,
         |  created_at           TIMESTAMP,
         |  updated_at           TIMESTAMP
         |) USING DELTA
         |PARTITIONED BY (pipeline_name, task_name);
        """.stripMargin)
  }

  /** Misspelled legacy name kept so existing callers keep working. */
  @deprecated("Misspelled name; use createPipelineTaskRunTable instead")
  def createPipelineTaskRubTable(tableName: String): Unit =
    createPipelineTaskRunTable(tableName)

  /**
   * Creates the pipeline task-upstream Delta table `tableName` (partitioned by
   * `pipeline_name` and `task_name`) if it does not exist.
   *
   * @param tableName fully qualified name of the table to create
   */
  def createPipelineUpstreamTable(tableName: String): Unit = {
    spark.sql(
      s"""
        |CREATE TABLE IF NOT EXISTS ${tableName} (
         |  pipeline_name        STRING,
         |  task_name            STRING,
         |  upstream_tables      ARRAY<STRING>,  
         |  updated_by           STRING,
         |  updated_at           TIMESTAMP
         |) USING DELTA
         |PARTITIONED BY (pipeline_name, task_name);
        """.stripMargin)
  }

  /**
   * Executes the given SQL statements one by one through `spark.sql`.
   *
   * Statements are trimmed and blank ones are skipped before numbering, so the
   * printed statement numbers refer to the non-empty statements only.
   *
   * @param sqlStatements statements to run, in order
   * @param stopOnError   when `true` (the default) the first failure is rethrown
   *                      and later statements are not run; when `false` the
   *                      failure is printed and execution continues
   * @throws Throwable the non-fatal error raised by a failing statement, when
   *                   `stopOnError` is `true`
   */
  def executeSqlStatements(
      sqlStatements: List[String],
      stopOnError: Boolean = true
  ): Unit = {

    // Filter out empty statements (trim whitespace)
    val statements = sqlStatements.map(_.trim).filter(_.nonEmpty)

    statements.zipWithIndex.foreach { case (stmt, idx) =>
      val ordinal = idx + 1
      try {
        println(s"Executing SQL statement #$ordinal")
        spark.sql(stmt)
      } catch {
        // NonFatal (rather than Exception) so that InterruptedException and fatal
        // JVM errors are never swallowed when stopOnError is false.
        case NonFatal(e) =>
          println(s"Failed SQL statement #$ordinal:\n$stmt")
          // Report the cause too; otherwise it is lost when execution continues.
          println(s"Cause: ${e.getMessage}")
          if (stopOnError) throw e
      }
    }
  }

}


[32mimport [39m[36morg.apache.spark.sql.{DataFrame, SparkSession, Column}
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36mio.delta.tables.DeltaTable
[39m
[32mimport [39m[36mscala.jdk.CollectionConverters._
[39m
[32mimport [39m[36mjava.sql.Timestamp
[39m
[32mimport [39m[36mspark.implicits._
[39m
[32mimport [39m[36mjava.time.Instant

[39m
defined [32mobject[39m [36mDapOps[39m

#### 3. Testing DapOps

In [0]:
 // Dry run: with dryRun = true, dropOpsTables issues no DROP statements.
 DapOps.dropOpsTables(dryRun = true)

In [0]:
// Create any missing Ops tables (dryRun = false is the method's default).
DapOps.createOpsTables(dryRun = false)