####  Description of WatermarkManager


##### Checkpoint Management



- PPL automation job: Create processing plan (start_version / end_version) for ACS tables.  This can be run in the first step in the PPL automation job

```scala
  WatermarkManager.prepareProcessingPlanForSchemas() 
```


- Data pipeline jobs: Get processing plan (start_version / end_version) in pipeline.  This can be run in every pipleine, passing the data as parameter to pipleine jobs

```scala

  // Option-1: get VerionRange for all ACS tables
  val dfPlanned = WatermarkManager.fetchCurrentProcessingPlan()


  // Option-2: get VerionRange for the specified ACS tables
  val tableNames: Seq[String] = Seq(
      "ag_content_ims_acs_prod.gold_entity.d_spmaster",
      "ag_content_ims_acs_prod.gold_entity.d_orgmaster",
      "ag_content_ims_acs_prod.gold_wos.d_organization",
      "ag_content_ims_acs_prod.gold_wos.f_publication"
    )

  val dfVersionRange = WatermarkManager.loadVersionRanges(tableNames)
```



- PPL automation job: Complete processing plan. This can be run in the last  step in the PPL automation job

```scala
  val WatermarkManager.markCompleted()
```



---


List of Functions

| Function Name | Short Description |
|---------------|-----------------|
| `fetchLatestVersionsForSchema(schemaName: String)` | Get the latest Delta version for all tables under a given schema. |
| `loadCheckpoints(tableNames: Seq[String]): DataFrame` | Read checkpoints for the given upstream tables (by `table_name`). |
| `loadVersionRanges(tableNames: Seq[String]): DataFrame` | Read version ranges (start/end versions) for the given upstream tables. |
| `fetchLatestVersionsForSchemas(schemas: Seq[String]): DataFrame` | Returns the latest version per table for one or more schemas. |
| `prepareVersionRange(latestDf: DataFrame, dryRun: Boolean = false): DataFrame` | Build a plan (start/end versions) for upstream tables and write to watermark table with status = `PLANNED`. |
| `prepareProcessingPlanForSchemas(schemaName: Seq[String], dryRun: Boolean = false): DataFrame` | Combined method to build the plan (start/end versions) for a list of upstream tables. |
| `fetchCurrentProcessingPlan()` | Get the current available plan (start/end versions) for upstream tables and write to watermark table (status = `PLANNED`). |
| `markCompleted(dryRun: Boolean = false): DataFrame` | After pipeline execution completes, mark the run as `COMPLETED` and checkpoint the `end_version`. |






#### Setup Env Parameters

In [0]:
//  pass the parameters
dbutils.widgets.text("source_catalog", "ag_content_ims_acs")
dbutils.widgets.text("source_environment", "prod")
dbutils.widgets.text("source_version", "")

dbutils.widgets.text("target_catalog", "ag_ra_search_analytics_data")
dbutils.widgets.text("target_environment", "dev")
dbutils.widgets.text("target_version", "v1_0")

// dynamic paramters
val source_catalog = dbutils.widgets.get("source_catalog")
val source_environment = dbutils.widgets.get("source_environment")
val source_version = dbutils.widgets.get("source_version")
val target_catalog = dbutils.widgets.get("target_catalog")
val target_environment = dbutils.widgets.get("target_environment")
val target_version = dbutils.widgets.get("target_version")


[36msource_catalog[39m: [32mString[39m = [32m"ag_content_ims_acs"[39m
[36msource_environment[39m: [32mString[39m = [32m"prod"[39m
[36msource_version[39m: [32mString[39m = [32m""[39m
[36mtarget_catalog[39m: [32mString[39m = [32m"ag_ra_search_analytics_data"[39m
[36mtarget_environment[39m: [32mString[39m = [32m"dev"[39m
[36mtarget_version[39m: [32mString[39m = [32m"v1_0"[39m

#### 1. WatermarkManager

In [0]:

object SchemaResolver {

  private def getWidget(name: String, default: String): String = {
    try {
      val value = dbutils.widgets.get(name)
      if (value == null || value.isEmpty) default else value
    } catch {
      case _: Throwable => default
    }
  }

  private val source_catalog = getWidget("source_catalog", "ag_content_ims_acs")
  private val source_environment = getWidget("source_environment", "prod")
  private val source_version = getWidget("source_version", "")

  private val target_catalog = getWidget("target_catalog", "ag_ra_search_analytics_data")
  private val target_environment = getWidget("target_environment", "dev")
  private val target_version = getWidget("target_version", "v1_0")

  
  def getAcsSchemaMap():  Map[String, String]  = {
     val version = if(source_version.isEmpty) "" else  s"_${source_version}"

     Map(
      "entity" -> s"${source_catalog}_${source_environment}.gold_entity${version}",
      "wos" -> s"${source_catalog}_${source_environment}.gold_wos${version}",
      "pprn" -> s"${source_catalog}_${source_environment}.gold_pprn${version}",
      "dap" -> s"${target_catalog}_${target_environment}.sandbox_${target_version}",
    )
  }

   
  def getAcsSchemas():  Seq[String]  = {
     val version = if(source_version.isEmpty) "" else  s"_${source_version}"

     Seq(
       s"${source_catalog}_${source_environment}.gold_entity${version}",
       s"${source_catalog}_${source_environment}.gold_wos${version}",
       s"${source_catalog}_${source_environment}.gold_pprn${version}"
    )
  }

    def getDapWaterMarkTable(): String  = {
      s"${target_catalog}_${target_version}.dap_ops_${target_version}.pipeline_watermark"
    }
}


defined [32mobject[39m [36mSchemaResolver[39m

In [0]:
import org.apache.spark.sql.{SparkSession, DataFrame, Row}
import org.apache.spark.sql.functions._
import io.delta.tables._
import org.apache.spark.sql.types.StringType
import io.delta.tables.DeltaTable

case class WatermarkInfo(
  runId: String,
  tableName: String,
  lastProcessedVersion: Option[Long],
  latestAvailableVersion: Long,
  startVersion: Long,
  endVersion: Long,
  cdfEnabled: Boolean = true,
  status: String
)

object WatermarkManager {

  private val catalogACS = SchemaResolver.getAcsSchemas()
  // private val watermarkTableName = SchemaResolver.getDapWaterMarkTable()
  private val watermarkTableName = "ag_ra_search_analytics_data_dev.sandbox_v1_0.dap_watermark" // for test


  def getcatalogACS(): Seq[String] = {
        catalogACS
  }
  // Get latest Delta version for all tables under a schema
  def fetchLatestVersionsForSchema(schemaName: String): DataFrame = {

    // List all tables under the schema
    val tablesDF = spark.sql(s"SHOW TABLES IN $schemaName")
      .filter("isTemporary = false") // exclude temp views

    // For each table, get latest version from Delta history
    val results = tablesDF.collect().map { row =>
      val tableName = row.getAs[String]("tableName")
      val fullTablePath = s"$schemaName.$tableName"

      try {
        val historyDF = spark.sql(s"DESCRIBE HISTORY $fullTablePath")
        val latestVersion = historyDF.select(max("version")).as[Long].first()

        (tableName, latestVersion.toString)
      } catch {
        case e: Exception =>
          // Not a Delta table or error reading history
          (tableName, "N/A")
      }
    }

    // Convert results to a DataFrame
    spark.createDataFrame(results).toDF("table_name", "latest_version")
  }

  // Read checkpoints for given upstream tables (by table_name) 
  def loadCheckpoints(tableNames: Seq[String]): DataFrame = {
    if (!spark.catalog.tableExists(watermarkTableName)) {
      // empty DF with expected columns
      spark.emptyDataFrame
        .withColumn("table_name", lit("").cast("string"))
        .withColumn("last_processed_version", lit(null).cast("long"))
        .filter(lit(false))
    } else {
      spark.table(watermarkTableName)
        .select($"table_name", $"last_processed_version".cast("long"))
        .where($"table_name".isin(tableNames: _*))
    }
  }

  // Read version range for given upstream tables (by table_name)
  def loadVersionRanges(tableNames: Seq[String]): DataFrame = {
    if (!spark.catalog.tableExists(watermarkTableName)) {
      // empty DF with expected columns
      spark.emptyDataFrame
        .withColumn("table_name", lit("").cast("string"))
        .withColumn("start_version", lit(0).cast("long"))
        .withColumn("end_version", lit(0).cast("long"))
        .filter(lit(false))
    } else { 

      spark.table(watermarkTableName)
        .select($"table_name", $"start_version".cast("long"),  $"end_version".cast("long"))
        .where($"table_name".isin(tableNames: _*))
    }
  }

  // Returns latestâ€‘version per table for a list of schema, and a list of schemas
  def fetchLatestVersionsForSchemas(schemas: Seq[String]): DataFrame = {

    // For each schema, get its table version DF; add a column for schema
    val dfs: Seq[DataFrame] = schemas.map { schemaName =>
      fetchLatestVersionsForSchema(schemaName)
        .select($"table_name", $"latest_version".cast("long"))
        // Optionally, prefix table_name with schema to avoid ambiguity if same table exists in multiple schemas
        .withColumn("table_name", concat(lit(s"$schemaName."), $"table_name"))
    }

    // Combine all schema results into one DataFrame
    dfs.reduce((df1, df2) => df1.unionByName(df2, allowMissingColumns = true))
  }

  // prepare VersionRange (start/end versions) for upstream tables and write to watermark table (status = PLANNED)
  def prepareVersionRange(latestDf: DataFrame, dryRun:Boolean = false): DataFrame = {
    val existing = loadCheckpoints(latestDf.select("table_name").as[String].collect())

    val planDf = latestDf
      .join(existing, Seq("table_name"), "left_outer")
      .withColumn("start_version",
        when($"last_processed_version".isNull, lit(0L))
          .otherwise($"last_processed_version" + 1L)
      )
      .withColumn("end_version", $"latest_version")
      .filter($"end_version" >= $"start_version")

    val runId = java.util.UUID.randomUUID().toString
    val planMeta = planDf
      .withColumn("run_id", lit(runId))
      .withColumn("last_processed_version", lit(null: java.lang.Long))
      .withColumn("latest_available_version", $"latest_version")
      .withColumn("cdf_enabled", lit(true))
      .withColumn("status", lit("PLANNED"))
      .withColumn("update_ts", current_timestamp())
      .withColumn("updated_by", lit("WatermarkManager"))
      .withColumn("error_message" , lit("").cast(StringType))
  
    if(!dryRun)
      upsertWatermark(planMeta)
    planMeta
  }

  // Combined method: for a list of upstream tables, build plan -> start/end versions 
  def prepareProcessingPlanForSchemas( schemaName: Seq[String], dryRun: Boolean = false): DataFrame = {
     val latestDf = fetchLatestVersionsForSchemas(schemaName)
      //.filter($"table_name".isin(tableNames: _*))
     prepareVersionRange(latestDf, dryRun)
  }

  // After pipelines complete: mark as COMPLETED and checkpoint the end_version */
  def markCompleted( dryRun: Boolean = false): DataFrame = {

    val updates = spark.table(watermarkTableName)
      .filter($"status" === "PLANNED")
      .withColumn("last_processed_version", $"end_version")
      .withColumn("latest_available_version",  lit(null).cast("bigint"))
      .withColumn("status", lit("COMPLETED"))
      .withColumn("update_ts", current_timestamp())
      .withColumn("updated_by", lit("WatermarkManager"))
      // .withColumn("run_id", lit(java.util.UUID.randomUUID().toString))

    if(!dryRun) upsertWatermark(updates)
    
    updates
    
  }

// Get current available plan (start/end versions) for upstream tables and write to watermark table (status = PLANNED)
  def  fetchCurrentProcessingPlan(): DataFrame = {
      spark.table(watermarkTableName).filter($"status" === "PLANNED")
  }

  // internal upsert (merge) into watermark table using catalog table name 
  private def upsertWatermark(df: DataFrame): Unit = {
    // Use DeltaTable.forName on the watermark table
    val wmTable = DeltaTable.forName(spark, watermarkTableName)
    
    wmTable.as("w")
      .merge(df.as("u"), "w.table_name = u.table_name AND w.run_id = u.run_id")
      //.merge(df.as("u"), "w.run_id = u.run_id")
      .whenMatched()
        .updateExpr(
          Map(
            "run_id"                    -> "u.run_id",
            "last_processed_version"    -> "u.last_processed_version",
            "latest_available_version"  -> "u.latest_available_version",
            "start_version"             -> "u.start_version",
            "end_version"               -> "u.end_version",
            "cdf_enabled"               -> "u.cdf_enabled",
            "update_ts"                 -> "u.update_ts",
            "updated_by"                -> "u.updated_by",
            "status"                    -> "u.status",
            "error_message"             -> "u.error_message" //"u.error_message"
          )
        )
      .whenNotMatched()
        .insertExpr(
          Map(

            "run_id"                   -> "u.run_id",
            "table_name"               -> "u.table_name",
            "last_processed_version"   -> "u.last_processed_version",
            "latest_available_version" -> "u.latest_available_version",
            "start_version"            -> "u.start_version",
            "end_version"              -> "u.end_version",
            "cdf_enabled"              -> "u.cdf_enabled",
            "update_ts"                -> "u.update_ts",
            "updated_by"               -> "u.updated_by",
            "status"                   -> "u.status",
            "error_message"            ->  "u.error_message",
          )
        )
      .execute()
  }
}





[32mimport [39m[36morg.apache.spark.sql.{SparkSession, DataFrame, Row}
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36mio.delta.tables._
[39m
[32mimport [39m[36morg.apache.spark.sql.types.StringType
[39m
[32mimport [39m[36mio.delta.tables.DeltaTable

[39m
defined [32mclass[39m [36mWatermarkInfo[39m
defined [32mobject[39m [36mWatermarkManager[39m

In [0]:

val sql_wm = """
CREATE TABLE IF NOT EXISTS ag_ra_search_analytics_data_dev.sandbox_v1_0.dap_watermark (
  run_id STRING PRIMARY KEY,               -- unique identifier for the run (e.g. UUID)
  table_name STRING ,                     -- upstream table identifier, e.g. "schema.table"
  last_processed_version LONG,            -- last version successfully processed (checkpoint)
  latest_available_version LONG,          -- latest version detected (from upstream)
  start_version LONG,                     -- planned start version for current run
  end_version LONG,                       -- planned end (latest) version for current run
  cdf_enabled BOOLEAN,                    -- whether this table is CDF-enabled (optional/useful)
  update_ts TIMESTAMP,                    -- timestamp of last update to watermark
  updated_by STRING,                      -- job or user who updated watermark
  status STRING,                          -- "PLANNED", "COMPLETED", "FAILED" etc.
  error_message STRING                    -- in case of failure (nullable)
)
USING DELTA;
"""



[36msql_wm[39m: [32mString[39m = [32m"""
CREATE TABLE IF NOT EXISTS ag_ra_search_analytics_data_dev.sandbox_v1_0.dap_watermark (
  run_id STRING PRIMARY KEY,               -- unique identifier for the run (e.g. UUID)
  table_name STRING ,                     -- upstream table identifier, e.g. "schema.table"
  last_processed_version LONG,            -- last version successfully processed (checkpoint)
  latest_available_version LONG,          -- latest version detected (from upstream)
  start_version LONG,                     -- planned start version for current run
  end_version LONG,                       -- planned end (latest) version for current run
  cdf_enabled BOOLEAN,                    -- whether this table is CDF-enabled (optional/useful)
  update_ts TIMESTAMP,                    -- timestamp of last update to watermark
  updated_by STRING,                      -- job or user who updated watermark
  status STRING,                          -- "PLANNED", "COMPLETED", "FAILE

#### 2. Create & Delete Watermark Table

In [0]:
spark.sql("DROP TABLE IF EXISTS ag_ra_search_analytics_data_dev.sandbox_v1_0.dap_watermark ")

[36mres79[39m: [32mDataFrame[39m = []

In [0]:
spark.sql(sql_wm)

[36mres82[39m: [32mDataFrame[39m = []

#### 3. Test

In [0]:
val catalogACS = SchemaResolver.getAcsSchemas()

println(catalogACS)

println(SchemaResolver.getDapWaterMarkTable)

List(ag_content_ims_acs_prod.gold_entity, ag_content_ims_acs_prod.gold_wos, ag_content_ims_acs_prod.gold_pprn)
ag_ra_search_analytics_data_v1_0.dap_ops_v1_0.pipeline_watermark


[36mcatalogACS[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"ag_content_ims_acs_prod.gold_entity"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos"[39m,
  [32m"ag_content_ims_acs_prod.gold_pprn"[39m
)

In [0]:

  // This can be run in the first step - PPL automation job
  //  DryRun if true - Generate processing plan (start_version / end_version) for those tables

  val planDF = WatermarkManager.prepareProcessingPlanForSchemas(catalogACS)

  display(planDF)

25/12/06 00:18:58 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 7in the parser. Driver memory: 49325015040.
25/12/06 00:20:47 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 7in the parser. Driver memory: 49325015040.
25/12/06 00:21:58 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 7in the parser. Driver memory: 49325015040.
25/12/06 00:21:59 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 7in the parser. Driver memory: 49325015040.


table_name,latest_version,last_processed_version,start_version,end_version,run_id,latest_available_version,cdf_enabled,status,update_ts,updated_by,error_message
ag_content_ims_acs_prod.gold_entity.d_alma_openaccess,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_esi_article,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_esi_papers,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_funding_organization,7,,0,7,650443eb-9e6c-4db3-906f-f323d71403bd,7,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,
ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem,2,,0,2,650443eb-9e6c-4db3-906f-f323d71403bd,2,True,PLANNED,2025-12-06T00:22:04.728822Z,WatermarkManager,


[36mplanDF[39m: [32mDataFrame[39m = [table_name: string, latest_version: bigint ... 10 more fields]

In [0]:
val tableNames: Seq[String] = Seq(
"ag_content_ims_acs_prod.gold_entity.d_spmaster",
"ag_content_ims_acs_prod.gold_entity.d_orgmaster",
"ag_content_ims_acs_prod.gold_wos.d_organization",
"ag_content_ims_acs_prod.gold_wos.f_publication",
)
val dfCheckpoint = WatermarkManager.loadCheckpoints(tableNames)
display(dfCheckpoint)


25/12/06 00:43:47 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 11in the parser. Driver memory: 49325015040.


table_name,last_processed_version
ag_content_ims_acs_prod.gold_wos.f_publication,38
ag_content_ims_acs_prod.gold_wos.d_organization,22
ag_content_ims_acs_prod.gold_entity.d_orgmaster,2
ag_content_ims_acs_prod.gold_entity.d_spmaster,2


[36mtableNames[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"ag_content_ims_acs_prod.gold_entity.d_spmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_entity.d_orgmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.d_organization"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.f_publication"[39m
)
[36mdfCheckpoint[39m: [32mDataFrame[39m = [table_name: string, last_processed_version: bigint]

In [0]:
val tableNames: Seq[String] = Seq(
"ag_content_ims_acs_prod.gold_entity.d_spmaster",
"ag_content_ims_acs_prod.gold_entity.d_orgmaster",
"ag_content_ims_acs_prod.gold_wos.d_organization",
"ag_content_ims_acs_prod.gold_wos.f_publication",
)

val dfVersionRange = WatermarkManager.loadVersionRanges(tableNames)
display(dfVersionRange)


25/12/06 00:42:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 11in the parser. Driver memory: 49325015040.
25/12/06 00:42:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 11in the parser. Driver memory: 49325015040.


table_name,start_version,end_version
ag_content_ims_acs_prod.gold_wos.f_publication,37,38
ag_content_ims_acs_prod.gold_wos.d_organization,21,22
ag_content_ims_acs_prod.gold_entity.d_orgmaster,1,2
ag_content_ims_acs_prod.gold_entity.d_spmaster,1,2


[36mtableNames[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"ag_content_ims_acs_prod.gold_entity.d_spmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_entity.d_orgmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.d_organization"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.f_publication"[39m
)
[36mdfVersionRange[39m: [32mDataFrame[39m = [table_name: string, start_version: bigint ... 1 more field]

In [0]:
val dfPlanned = WatermarkManager.fetchCurrentProcessingPlan()
display(dfPlanned)

run_id,table_name,last_processed_version,latest_available_version,start_version,end_version,cdf_enabled,update_ts,updated_by,status,error_message
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_publication_source_citation_flag,,47,46,47,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_publisher,,2,1,2,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_researcher_citation_count,,10,9,10,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_researcher_citation_count_woscore,,10,9,10,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_researcher_citations_flag,,10,9,10,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_state,,18,17,18,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_territory,,7,6,7,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_yearwise_citation,,2,1,2,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_yearwise_citation_count,,41,40,41,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,
650443eb-9e6c-4db3-906f-f323d71403bd,ag_content_ims_acs_prod.gold_wos.d_yearwise_citation_count_woscore,,43,42,43,True,2025-12-06T00:22:00.411756Z,WatermarkManager,PLANNED,


[36mdfPlanned[39m: [32mDataFrame[39m = [run_id: string, table_name: string ... 9 more fields]

In [0]:

// This can be run in the last step - PPL automation job to create a new checkpoint
//dryRun

val dfCompleted = WatermarkManager.markCompleted()
display(dfCompleted)


25/12/06 00:42:43 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 11in the parser. Driver memory: 49325015040.


run_id,table_name,last_processed_version,latest_available_version,start_version,end_version,cdf_enabled,update_ts,updated_by,status,error_message


[36mdfCompleted[39m: [32mDataFrame[39m = [run_id: string, table_name: string ... 9 more fields]