#### 1. Setup Env Parameters

In [0]:
//  pass the parameters
dbutils.widgets.text("source_catalog", "ag_content_ims_acs")
dbutils.widgets.text("source_environment", "prod")
dbutils.widgets.text("source_version", "")

dbutils.widgets.text("target_catalog", "ag_ra_search_analytics_data")
dbutils.widgets.text("target_environment", "dev")
dbutils.widgets.text("target_version", "v1_0")
dbutils.widgets.text("pipeline_name", "")


dbutils.widgets.text("metadata_table_prefix", "")

// dynamic paramters
val source_catalog = dbutils.widgets.get("source_catalog")
val source_environment = dbutils.widgets.get("source_environment")
val source_version = dbutils.widgets.get("source_version")
val target_catalog = dbutils.widgets.get("target_catalog")
val target_environment = dbutils.widgets.get("target_environment")
val target_version = dbutils.widgets.get("target_version")
val pipeline_name = dbutils.widgets.get("pipeline_name")
val metadata_table_prefix = dbutils.widgets.get("metadata_table_prefix")



[36msource_catalog[39m: [32mString[39m = [32m"ag_content_ims_acs"[39m
[36msource_environment[39m: [32mString[39m = [32m"prod"[39m
[36msource_version[39m: [32mString[39m = [32m""[39m
[36mtarget_catalog[39m: [32mString[39m = [32m"ag_ra_search_analytics_data"[39m
[36mtarget_environment[39m: [32mString[39m = [32m"dev"[39m
[36mtarget_version[39m: [32mString[39m = [32m"v1_0"[39m
[36mpipeline_name[39m: [32mString[39m = [32m"agra-sa-doc-wos-pipeline"[39m

#### 2. WatermarkManager

In [0]:
import org.apache.spark.sql.{DataFrame, SparkSession, Column}

// ---------------------------------------------------------
// Watermark: CheckpointStatus
// ---------------------------------------------------------
object WatermarkStatus {
  val Ready    = "Ready"
  val Success  = "Success"
}

// ---------------------------------------------------------
// Checkpoint: CheckpointStatus 
// ---------------------------------------------------------

object CheckpointStatus {
  val Ready    = "Ready"
  val Running  = "Running"
  val Skipped  = "Skipped"
  val Success  = "Success"
  val Failed   = "Failed"
}
// ---------------------------------------------------------
// ProductType
// ---------------------------------------------------------
object ProductType {
  val WOSRI    = "WOSRI"
  val INCITES  = "Incites"
  val WOS      = "WoS"
}

// ---------------------------------------------------------
// PipelineType
// ---------------------------------------------------------
object PipelineType {
  val Data      = "Data"
  val Workflow  = "Workflow"
  val Loader    = "Loader"
  val Assembler = "Assembler"
  val Others    = "Others"
}


// ---------------------------------------------------------
// CheckpointParams
// ---------------------------------------------------------
case class CheckpointParams(
  rowsRead: Long = 0L,
  rowsWritten: Long = 0L,
  errorMessage: String = ""
)


// ---------------------------------------------------------
//  DataFrameOps helper - withColumns uing Map
// ---------------------------------------------------------

implicit class DataFrameOps(df: DataFrame) {
  def withColumns(cols: Map[String, Column]): DataFrame =
    cols.foldLeft(df) { case (acc, (name, col)) =>
      acc.withColumn(name, col)
    }
}


[32mimport [39m[36morg.apache.spark.sql.{DataFrame, SparkSession, Column}

// ---------------------------------------------------------
// 1.  CheckpointStatus
// ---------------------------------------------------------
[39m
defined [32mobject[39m [36mWatermarkStatus[39m
defined [32mobject[39m [36mProductType[39m
defined [32mobject[39m [36mPipelineType[39m
defined [32mobject[39m [36mCheckpointStatus[39m
defined [32mclass[39m [36mDataFrameOps[39m

In [0]:
object Watermarks {

  private val PIPELINE = SchemaResolver.PIPELINE


  // ---------------------------------------------------------
  //  define private fuctions for watermark table 
  // ---------------------------------------------------------

  // Get latest version for tables for one Schema from catalog (ACS or DAP)
  private def fetchLatestVersionsForOneSchema(
      schemaName: String
  ): DataFrame = {

    // List all tables under the upstream schema
    val tablesDF = spark.sql(s"SHOW TABLES IN $schemaName")
      .filter("isTemporary = false") // exclude temp views

    // For each table, get latest version from Delta history
    val results = tablesDF.collect().map { row =>
      val tableName = row.getAs[String]("tableName")
      val fullTablePath = s"${schemaName}.$tableName"
      println(fullTablePath)
      try {
        val historyDF = spark.sql(s"DESCRIBE HISTORY ${fullTablePath}")
        val latestVersion = historyDF.select(max("version")).as[Long].first()
        val latestTimestamp = historyDF.select(max("timestamp")).as[java.sql.Timestamp].first()

        (fullTablePath, latestVersion.toString, latestTimestamp.toString )
      } catch {
        case e: Exception =>
          // Not a Delta table or error reading history
          (fullTablePath, "", "") // use None for missing timestamp
      }
    }

    // Convert to Rows
    spark.createDataFrame(results).toDF("table_name", "latest_version", "latest_timestamp")
  }

  // Fetch latest version for tables for Schema List from catalog
  private def fetchLatestVersionsForSchemas(
    schemas: Seq[String]
  ): DataFrame = {

    // For each schema, get its table version DF; add a column for schema
    val dfs: Seq[DataFrame] = schemas.map { schemaName =>
      fetchLatestVersionsForOneSchema(schemaName)
        .select($"table_name", $"latest_version".cast("long"), $"latest_timestamp".cast("timestamp"))
        // Optionally, prefix table_name with schema to avoid ambiguity if same table exists in multiple schemas
        .withColumn("table_name", concat(lit(s"$schemaName."), $"table_name"))
    }

    // Combine all schema results into one DataFrame
    dfs.reduce((df1, df2) => df1.unionByName(df2, allowMissingColumns = true))
  }

  // Read current checkpoint from Watermark (by table_name) 
  private def loadCheckpointsFromWatermark(
    tableNames: Seq[String] = Seq.empty
  ): DataFrame = {

    // If table does not exist, return empty DF
    if (!spark.catalog.tableExists(DapOps.WATERMARK)) {
      return spark.emptyDataFrame
        .withColumn("table_name", lit("").cast("string"))
        .withColumn("last_processed_version", lit(null).cast("long"))
        .withColumn("end_ts", lit(null).cast("timestamp"))
        .filter(lit(false))
    }

    val curBatchId = latestBatchId()

    // Base dataframe
    val baseDf = spark.table(DapOps.WATERMARK)
      .filter(
        $"batch_id" === curBatchId &&
        $"status" === CheckpointStatus.Success
      )
      .select(
        $"table_name",
        $"last_processed_version".cast("long"),
        $"end_ts".cast("timestamp")
      )

    // Apply optional filter
    val filteredDf =
      if (tableNames.nonEmpty) baseDf.filter($"table_name".isin(tableNames: _*))
      else baseDf

    filteredDf
  }

  // Read current version range for given tables from Watermark (by table_name) as Dataframe
  private def loadVersionRanges(
    tableNames: Seq[String] = Seq.empty
  ): DataFrame = {

    if (!spark.catalog.tableExists(DapOps.WATERMARK)) {
      return spark.createDataFrame(Seq.empty[(String, Long, Long)])
        .toDF("table_name", "start_version", "end_version")
    } 

    val curBatchId = latestBatchId()
    // Base DF: only rows for current batch_id and PLANNED status
    val baseDf = spark.table(DapOps.WATERMARK)
     // .filter($"batch_id" === lit(curBatchId)) //  && $"status" === lit(CheckpointStatus.Ready)
      .select(
        $"table_name",
        $"start_version".cast("long"),
        $"end_version".cast("long")
      )
    
    // Apply optional filter
    val filteredDf =
      if (tableNames.nonEmpty) baseDf.filter($"table_name".isin(tableNames: _*))
      else baseDf

    filteredDf
  }

  private def hasVersionChanged(
    tableName: String
  ): Boolean = {

    if (!spark.catalog.tableExists(DapOps.WATERMARK)) return true
    val df = spark.table(DapOps.WATERMARK)
      .filter(
        $"table_name" === tableName // && $"status" === lit(CheckpointStatus.Ready)
      ) 
      .select(
        $"last_processed_version".cast("long"),
        $"start_version".cast("long"),
        $"end_version".cast("long")
      )
      .as[(Long, Long, Long)] // (last_processed_version, start_version, end_version)
      .collect()

    if (df.isEmpty) {
      false // no watermark row → consider version changed
    } else {
      val (lastProcessed, startV, endV) = df.head
      lastProcessed < startV // true if new versions exist
    }
  }

  // collect and save data for batch run: start/end versions for upstream tables 
  // and write to Watermark  (status = CheckpointStatus.Ready)
  private def collectAndSaveDeltaWatermark(
    latestDf: DataFrame, 
    dryRun:Boolean = false
  ): Long = {

    val tableNames = latestDf.select("table_name").as[String].collect()
    val currentDf = loadCheckpointsFromWatermark(tableNames)

    // Compute start/end version
    val nextDf = latestDf
      .join(currentDf, Seq("table_name"), "left_outer")
      .withColumn("is_same_version", $"latest_version" === $"last_processed_version")
      .withColumn(
        "start_version",
        when($"is_same_version", -1L)
          .otherwise(coalesce($"last_processed_version", lit(-1L)) + 1L)
      )
      .withColumn("end_version", $"latest_version".cast("long") )
      .withColumn("start_ts", $"end_ts".cast("timestamp"))
      .withColumn("end_ts", $"latest_timestamp".cast("timestamp"))
      .withColumn(
        "status",
        when($"is_same_version", lit(CheckpointStatus.Success))
          .otherwise(lit(CheckpointStatus.Ready))
      )
      .filter($"end_version" >= $"start_version")
      .drop("latest_version")                     
      .drop("latest_timestamp")  
      .drop("is_same_version")                     

    //val runId = java.util.UUID.randomUUID().toString
    val batchId = latestBatchId() + 1L;

    val upadteDf = nextDf.withColumns(
        Map(
          "batch_id"                  -> lit(batchId).cast("long"),
          "latest_available_version"  -> $"end_version".cast("bigint"),
          "updated_by"                -> lit(PIPELINE),
          "update_ts"                 -> current_timestamp(),
          "error_message"             -> lit("")
        )
      )
      
    if(!dryRun){
      saveWatermarkHistory(upadteDf)
      updateWatermark(upadteDf)
    }
    
    batchId
  }

  private def collectAndSaveBaselineWatermark(
    latestDf: DataFrame, 
    dryRun: Boolean = false
  ): Long = {

    val now = Instant.now()
    val batchId = latestBatchId() + 1L; 

    // get latest  version for all ACS tables
    val upadteDf = latestDf.withColumns(
            Map(
              "batch_id"                  -> lit(batchId),
              "start_version"             -> $"latest_version",
              "end_version"               -> $"latest_version",
              "start_ts"                  -> $"latest_timestamp",
              "end_ts"                    -> $"latest_timestamp",
              "end_version"               -> $"latest_timestamp",
              "last_processed_version"    -> $"latest_version",
              "latest_available_version"  -> $"latest_version",
              "status"                    -> lit(CheckpointStatus.Ready),
              "updated_by"                -> lit(PIPELINE),
              "update_ts"                 -> lit(now),
              "error_message"             -> lit("")
            )
          )
        .drop("latest_version")                     
        .drop("latest_timestamp")  
      
    if(!dryRun){
      saveWatermarkHistory(upadteDf)
      updateWatermark(upadteDf)
    }
    
    batchId 
  }

  // Read checkpoint rows by batch_id or max batch - Checkpoints table
  private def readCheckpointsByBatchId(
    batchIdOpt: String = ""
    ): DataFrame = {
    val df = spark.table(DapOps.CHECKPOINT)

    // If table is empty → return empty DataFrame directly
    if (df.isEmpty) return df

    // Determine target batch_id
    val batchId = Option(batchIdOpt).map(_.trim).filter(_.nonEmpty).getOrElse(
      df.select(max(col("batch_id"))).as[String].head()
    )

    df.filter(col("batch_id") === batchId)
  }

  // Check if all pipelines successful - Checkpoints table
  private def allPipelinesSuccessful(): Boolean = {

    val checkpointsDf  = readCheckpointsByBatchId()
    val totalCount = checkpointsDf.count()
    val successCount = checkpointsDf
      .filter(lower(col("status")).isin(
        CheckpointStatus.Success.toLowerCase,
        CheckpointStatus.Skipped.toLowerCase
      ))
      .count()

    totalCount > 0 && totalCount == successCount
  }

  // Internal function -  upsert (merge) into watermark table using catalog table name 
  private def updateWatermark(
    df: DataFrame
  ): Unit = {

    // Use DeltaTable.forName on the watermark table
    val wmTable = DeltaTable.forName(spark, DapOps.WATERMARK)

    // Define ONE reusable column mapping
    val columnMap: Map[String, String] = Map(
      "batch_id"                 -> "u.batch_id",
      "table_name"               -> "u.table_name",
      "last_processed_version"   -> "u.last_processed_version",
      "latest_available_version" -> "u.latest_available_version",
      "start_version"            -> "u.start_version",
      "end_version"              -> "u.end_version",
      "start_ts"                 -> "u.start_ts",
      "end_ts"                   -> "u.end_ts",
      "update_ts"                -> "u.update_ts",
      "updated_by"               -> "u.updated_by",
      "status"                   -> "u.status",
      "error_message"            -> "u.error_message"
    )
    // UPDATE: cannot update the PK "table_name"
    // val updateMap = columnMap - "table_name"

    wmTable.as("w")
      .merge(df.as("u"), "w.table_name = u.table_name")
      .whenMatched()
        .updateExpr(columnMap)
      .whenNotMatched()
        .insertExpr(columnMap )
      .execute()
  }

  private def computePipelineStatus(
      pipelinesDf: DataFrame
  ): DataFrame = {

    val explodedDf = pipelinesDf.select($"pipeline_name", explode($"upstream_tables").as("table_name"))
    val watermarkDf = spark.table(DapOps.WATERMARK).select($"table_name", $"start_version")
    
    val joinedDf = explodedDf.join(
          watermarkDf,
          Seq("table_name"),
          "left_outer"
        )

    val statusDf = joinedDf
        .groupBy($"pipeline_name")
        .agg(
          bool_and($"start_version" === -1).as("all_skipped")
        )
        .withColumn(
          "status",
          when($"all_skipped", lit(CheckpointStatus.Skipped))
            .otherwise(lit(CheckpointStatus.Ready))
        )
        .drop("all_skipped")

    pipelinesDf
      .join(statusDf, Seq("pipeline_name"), "left_outer")
      .select(
        $"pipeline_name",
        coalesce($"status", lit(CheckpointStatus.Ready)).as("status")
      )
  }

  // Function to create new checkpoint records for multiple pipelines
  private def createCheckpointforPipelines( 
    batchId: Long, 
    dryRun:Boolean = false
  ): DataFrame = {

    val now = Instant.now()
    val pipelinesDf = Registry.getPipelinesWithTables()
    val statusDf =   computePipelineStatus(pipelinesDf)

    val checkpointDF= statusDf
      .distinct()
      .withColumns(
          Map(
            "batch_id"      -> lit(batchId),
            "processed_ts"  -> lit(0L),
            // "status"        -> lit(CheckpointStatus.Ready),
            "rows_read"     -> lit(0L),
            "rows_written"  -> lit(0L),
            "retry"         -> lit(0L),
            "start_ts"      -> lit(null).cast("timestamp"),
            "end_ts"        -> lit(null).cast("timestamp"),
            "updated_by"    -> lit(PIPELINE),
            "update_ts"     -> current_timestamp(),  
            "error_message" -> lit("")
          )
        )
    // Append to Delta table
    if(!dryRun) {
      checkpointDF.write
        .format("delta")
        .mode("append")
        .saveAsTable(DapOps.CHECKPOINT)
    }

    checkpointDF
  }

  private def saveWatermarkHistory(
    df: DataFrame
  ): Unit = {

    df.write.format("delta").mode("append").saveAsTable(DapOps.WATERMARK_HISTORY)
  }

  // ---------------------------------------------------------
  //   define public fuctions for watermark table 
  // ---------------------------------------------------------

  // Function for pipeline to read current version range for given upstream tables (by table_name)  asMap
  def getWatermarkForTable( 
    tableNames: Seq[String] = Seq.empty
  ): Map[String, (Long, Long)] = {

    if (!spark.catalog.tableExists(DapOps.WATERMARK)) {
      // Return empty map if table missing
      return Map.empty[String, (Long, Long)]
    }

    val tablesToUse: Seq[String] =
        if (tableNames.nonEmpty) tableNames
        else Registry.getMasterTables(SchemaResolver.PIPELINE)   

    val baseDf = loadVersionRanges(tablesToUse)

    // Convert to Map[String, (Long, Long)]
    baseDf
      .as[(String, Long, Long)]
      .collect()
      .map { case (table, startV, endV) =>
        table -> (startV, endV)
      }
      .toMap
  }

  // Function for pipeline to get the latest batch number (max value) from Watermark
  def latestBatchId(): Long = {

    val df = spark.table(DapOps.WATERMARK)
    if (df.isEmpty) {
      0L  // default starting batch_id
    } else {
      df.agg(max("batch_id").cast("long"))
        .as[Long]
        .collect()
        .headOption.getOrElse(0L)
    }
  }

  // Function for PPL to create Watermark for Batch Run: collect and save start/end versions for upstream tables to Watermark
  def initializeWatermark(
    baselineRun: Boolean = false,
    dryRun: Boolean = false
  ): Long = {

      // get latest version for all ACS tables from upstream catalog
      val latestDf = fetchLatestVersionsForSchemas(SchemaResolver.ACS_SCHEMAS)

      // create Watermarksfor ACS with the new batchid
      val batchId = 
        if(baselineRun) collectAndSaveBaselineWatermark(latestDf, dryRun) 
        else collectAndSaveDeltaWatermark(latestDf, dryRun)

      // create the checkpoint with the same batch_id for all registered pipleines
      val checkpointDF = createCheckpointforPipelines(batchId, dryRun)

      batchId
  }

  // Function for PPL to close Watermark and set last_processed_version = end_version after all pipelines complete
  def completeWatermark( 
    dryRun: Boolean = false
  ): Unit = {

    if(allPipelinesSuccessful()) {
      // Leave the existing watermark  for pickup
      // Nothing to do, just exit
      return 
    }

    val now = Instant.now()
    val updates = spark.table(DapOps.WATERMARK)
      .filter($"status" === lit(CheckpointStatus.Ready) )
      .withColumns(
          Map(
            "last_processed_version"    -> $"end_version",
            "latest_available_version"  -> lit(null).cast("bigint"),
            "latest_available_version"  -> $"end_version",
            "status"                    -> lit(CheckpointStatus.Success),
            "updated_by"                -> lit(PIPELINE),
            "update_ts"                 -> lit(now)
          )
      )

    if(!dryRun) {
      saveWatermarkHistory(updates)
      updateWatermark(updates)
    }
  }

  // Function for PPL to create first Watermark first time 
  def insertFirstWatermark(
    dryRun: Boolean = false
  ): Unit = {

      val now = Instant.now()
      // get latest  version for all ACS tables
      val latestDf = fetchLatestVersionsForSchemas(SchemaResolver.ACS_SCHEMAS)

      val firstWatermarkDf = latestDf.withColumns(
            Map(
              "batch_id"                  -> lit(0L),
              "start_version"             -> lit(-1L),
              "end_version"               -> $"latest_version",
              "start_ts"                  -> $"latest_timestamp",
              "end_ts"                    -> $"latest_timestamp",
              "end_version"               -> $"latest_version",
              "last_processed_version"    -> $"latest_version",
              "latest_available_version"  -> $"latest_version",
              "status"                    -> lit(CheckpointStatus.Success),
              "updated_by"                -> lit(PIPELINE),
              "update_ts"                 -> lit(now),
              "error_message"             -> lit("")
            )
          )
        .drop("latest_version")                     
        .drop("latest_timestamp")  
      if(!dryRun) {
        updateWatermark(firstWatermarkDf)
      }
  }

}



defined [32mobject[39m [36mWatermarks[39m

#### 3. Testing

##### Test Watermark - private functions

In [0]:


val df = Watermarks.fetchLatestVersionsForOneSchema("ag_content_ims_acs_prod.gold_entity")
display(df)



ag_content_ims_acs_prod.gold_pprn.d_article_metrics
ag_content_ims_acs_prod.gold_entity.d_alma_openaccess
ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions
ag_content_ims_acs_prod.gold_entity.d_esi_article
ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_papers
ag_content_ims_acs_prod.gold_entity.d_funding_organization
ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem
ag_content_ims_acs_prod.gold_entity.d_grantmaster_grantid
ag_content_ims_acs_prod.gold_entity.d_ip_organisation
ag_content_ims_acs_prod.gold_entity.d_ip_subject
ag_content_ims_acs_prod.gold_entity.d_ip_variable_data
ag_content_ims_acs_prod.gold_entity.d_ip_variables
ag_content_ims_acs_prod.gold_entity.d_jcr_journals
ag_content_ims_acs_prod.gold_entity.d_nuts
ag_content_ims_a

table_name,latest_version,latest_timestamp
ag_content_ims_acs_prod.gold_entity.d_alma_openaccess,2,2025-11-27 08:34:35.0
ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions,2,2025-11-27 08:35:26.0
ag_content_ims_acs_prod.gold_entity.d_esi_article,2,2025-11-27 08:15:30.0
ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator,2,2025-11-27 08:57:27.0
ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator,2,2025-11-27 08:55:57.0
ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator,2,2025-11-27 08:55:59.0
ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator,2,2025-11-27 08:55:58.0
ag_content_ims_acs_prod.gold_entity.d_esi_papers,2,2025-11-27 08:15:31.0
ag_content_ims_acs_prod.gold_entity.d_funding_organization,7,2025-12-05 14:47:06.001
ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem,2,2025-11-27 09:10:38.0


[36mdf[39m: [32mDataFrame[39m = [table_name: string, latest_version: string ... 1 more field]

In [0]:
// get latest  version for all ACS tables
val dfLatestVersion = Watermarks.fetchLatestVersionsForSchemas(SchemaResolver.ACS_SCHEMAS)
display(dfLatestVersion)


ag_content_ims_acs_prod.gold_entity.d_alma_openaccess
ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions
ag_content_ims_acs_prod.gold_entity.d_esi_article
ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_papers
ag_content_ims_acs_prod.gold_entity.d_funding_organization
ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem
ag_content_ims_acs_prod.gold_entity.d_grantmaster_grantid
ag_content_ims_acs_prod.gold_entity.d_ip_organisation
ag_content_ims_acs_prod.gold_entity.d_ip_subject
ag_content_ims_acs_prod.gold_entity.d_ip_variable_data
ag_content_ims_acs_prod.gold_entity.d_ip_variables
ag_content_ims_acs_prod.gold_entity.d_jcr_journals
ag_content_ims_acs_prod.gold_entity.d_nuts
ag_content_ims_acs_prod.gold_entity.d_nuts_code
ag_content_ims_acs_p

25/12/12 17:35:02 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 17:35:02 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.


ag_content_ims_acs_prod.gold_wos.address_publication_link
ag_content_ims_acs_prod.gold_wos.author_publication_link
ag_content_ims_acs_prod.gold_wos.category_publication_link
ag_content_ims_acs_prod.gold_wos.country_publication_link
ag_content_ims_acs_prod.gold_wos.country_territory_link
ag_content_ims_acs_prod.gold_wos.d_alc_complete_labels
ag_content_ims_acs_prod.gold_wos.d_article_flag
ag_content_ims_acs_prod.gold_wos.d_article_flag_woscore
ag_content_ims_acs_prod.gold_wos.d_article_identifiers
ag_content_ims_acs_prod.gold_wos.d_article_metrics
ag_content_ims_acs_prod.gold_wos.d_article_metrics_woscore
ag_content_ims_acs_prod.gold_wos.d_article_total_cites
ag_content_ims_acs_prod.gold_wos.d_article_total_cites_woscore
ag_content_ims_acs_prod.gold_wos.d_article_type
ag_content_ims_acs_prod.gold_wos.d_article_type_precedence
ag_content_ims_acs_prod.gold_wos.d_author
ag_content_ims_acs_prod.gold_wos.d_category
ag_content_ims_acs_prod.gold_wos.d_citation
ag_content_ims_acs_prod.gold_wos.

25/12/12 17:37:52 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 17:37:52 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.


ag_content_ims_acs_prod.gold_pprn.address_publication_link
ag_content_ims_acs_prod.gold_pprn.author_publication_link
ag_content_ims_acs_prod.gold_pprn.category_publication_link
ag_content_ims_acs_prod.gold_pprn.country_publication_link
ag_content_ims_acs_prod.gold_pprn.country_territory_link
ag_content_ims_acs_prod.gold_pprn.d_article_flag
ag_content_ims_acs_prod.gold_pprn.d_article_identifiers
ag_content_ims_acs_prod.gold_pprn.d_article_metrics
ag_content_ims_acs_prod.gold_pprn.d_article_total_cites
ag_content_ims_acs_prod.gold_pprn.d_article_type
ag_content_ims_acs_prod.gold_pprn.d_article_type_precedence
ag_content_ims_acs_prod.gold_pprn.d_author
ag_content_ims_acs_prod.gold_pprn.d_category
ag_content_ims_acs_prod.gold_pprn.d_citation
ag_content_ims_acs_prod.gold_pprn.d_citation_flag
ag_content_ims_acs_prod.gold_pprn.d_citation_patent
ag_content_ims_acs_prod.gold_pprn.d_citation_patents_flag
ag_content_ims_acs_prod.gold_pprn.d_citations
ag_content_ims_acs_prod.gold_pprn.d_country
ag

25/12/12 17:39:43 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 17:39:43 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.


table_name,latest_version,latest_timestamp
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_alma_openaccess,2,2025-11-27T08:34:35Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions,2,2025-11-27T08:35:26Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_article,2,2025-11-27T08:15:30Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator,2,2025-11-27T08:57:27Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator,2,2025-11-27T08:55:57Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator,2,2025-11-27T08:55:59Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator,2,2025-11-27T08:55:58Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_papers,2,2025-11-27T08:15:31Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_funding_organization,7,2025-12-05T14:47:06.001Z
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem,2,2025-11-27T09:10:38Z


[36mdfLatestVersion[39m: [32mDataFrame[39m = [table_name: string, latest_version: bigint ... 1 more field]

In [0]:
 // create watermarks with start & end version for all ACS tables with the new batchid
 // dryRun  = true

  val batchId = Watermarks.collectAndSaveDeltaWatermark(df, true)

25/12/12 18:24:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:24:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:24:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:24:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:24:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:24:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached 

table_name,last_processed_version,end_ts,start_version,end_version,start_ts,status,updated_by,latest_available_version,update_ts,error_message,batch_id
ag_content_ims_acs_prod.gold_entity.d_alma_openaccess,2,2025-11-27T08:34:35Z,3,5,2025-11-27T08:34:35Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions,2,2025-11-27T08:35:26Z,3,5,2025-11-27T08:35:26Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_esi_article,2,2025-11-27T08:15:30Z,3,5,2025-11-27T08:15:30Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator,2,2025-11-27T08:57:27Z,3,5,2025-11-27T08:57:27Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator,2,2025-11-27T08:55:57Z,3,5,2025-11-27T08:55:57Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator,2,2025-11-27T08:55:59Z,3,5,2025-11-27T08:55:59Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator,2,2025-11-27T08:55:58Z,3,5,2025-11-27T08:55:58Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_esi_papers,2,2025-11-27T08:15:31Z,3,5,2025-11-27T08:15:31Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_funding_organization,7,2025-12-05T14:47:06.001Z,8,10,2025-12-05T14:47:06.001Z,Ready,agra-sa-authorprofile-pipeline,10,2025-12-12T18:24:19.060262Z,,1
ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem,2,2025-11-27T09:10:38Z,3,5,2025-11-27T09:10:38Z,Ready,agra-sa-authorprofile-pipeline,5,2025-12-12T18:24:19.060262Z,,1


[36mbatchId[39m: [32mLong[39m = [32m1L[39m

In [0]:
// create checkpoints for pipeline with the new batchid
 // dryRun  = true
val checkpointDf  =  Watermarks.createCheckpointforPipelines(0)
display(checkpointDf)


25/12/12 20:01:32 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:01:32 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.


pipeline_name,status,updated_by,end_ts,start_ts,update_ts,error_message,processed_ts,rows_written,rows_read,batch_id,retry
agra-sa-region-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-researchtopics-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-normalized-metrics-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-organization-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-societal-impact-metrics-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-metrics-meta-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-grants-ri-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-jcr-metrics-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-doc-patent-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0
agra-sa-category-metrics-pipeline,Ready,agra-sa-authorprofile-pipeline,,,2025-12-12T20:01:41.875175Z,,0,0,0,0,0


[36mcheckpointDf[39m: [32mDataFrame[39m = [pipeline_name: string, status: string ... 10 more fields]

In [0]:
// get checkpoint for all table if no param
val verionsRangeDF = Watermarks.loadVersionRanges(tableNames)
display(verionsRangeDF)

25/12/12 18:30:28 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:30:28 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:30:28 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.


table_name,start_version,end_version
ag_content_ims_acs_prod.gold_entity.d_orgmaster,2,2
ag_content_ims_acs_prod.gold_entity.d_spmaster,2,2


[36mverionsRangeDF[39m: [32mDataFrame[39m = [table_name: string, start_version: bigint ... 1 more field]

In [0]:
// get checkpoint for all table if no param

val tableNames: Seq[String] = Seq(
"ag_content_ims_acs_prod.gold_entity.d_spmaster", // ag_content_ims_acs_prod.gold_entity.
"ag_content_ims_acs_prod.gold_entity.d_orgmaster",
"ag_content_ims_acs_prod.gold_wos.d_organization",
"ag_content_ims_acs_prod.gold_wos.f_publication",
) 
val checkpointDF = Watermarks.loadCheckpointsFromWatermark(tableNames)
display(checkpointDF)

25/12/12 18:29:07 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:29:08 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.
25/12/12 18:29:08 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 8in the parser. Driver memory: 49325015040.


table_name,last_processed_version,end_ts
ag_content_ims_acs_prod.gold_entity.d_orgmaster,2,2025-11-27T08:19:25Z
ag_content_ims_acs_prod.gold_entity.d_spmaster,2,2025-11-27T08:56:47Z


[36mtableNames[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"ag_content_ims_acs_prod.gold_entity.d_spmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_entity.d_orgmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.d_organization"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.f_publication"[39m
)
[36mcheckpointDF[39m: [32mDataFrame[39m = [table_name: string, last_processed_version: bigint ... 1 more field]

In [0]:

val tableName ="ag_content_ims_acs_prod.gold_entity.d_spmaster" // ag_content_ims_acs_prod.gold_entity.

val hasVersionChanged = Watermarks.hasVersionChanged(tableName)
println( s"hasVersionChanged: {hasVersionChanged}")

25/12/12 20:12:03 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:12:03 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:12:03 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.


hasVersionChanged: {hasVersionChanged}


[36mtableName[39m: [32mString[39m = [32m"ag_content_ims_acs_prod.gold_entity.d_spmaster"[39m
[36mhasVersionChanged[39m: [32mBoolean[39m = [32mfalse[39m

In [0]:

//  Check pipeline status in Chekpoint table
val uiSucessfull = Watermarks.allPipelinesSuccessful()
println(s"Sucessfull; $uiSucessfull")


Sucessfull; false


[36muiSucessfull[39m: [32mBoolean[39m = [32mfalse[39m

In [0]:

val allForLatestBatch = Watermarks.readCheckpointsByBatchId()
display(allForLatestBatch)


pipeline_name,batch_id,processed_ts,start_ts,end_ts,status,rows_read,rows_written,retry,error_message,update_ts,updated_by


[36mallForLatestBatch[39m: [32mDataFrame[39m = [pipeline_name: string, batch_id: bigint ... 10 more fields]

##### Test Watermark - publica functions

In [0]:

println(s"Latest Batch Id: ${Watermarks.latestBatchId()}")


25/12/12 20:12:21 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.


Latest Batch Id: 0


In [0]:
val tableNames: Seq[String] = Seq(
"ag_content_ims_acs_prod.gold_entity.d_spmaster",
"ag_content_ims_acs_prod.gold_entity.d_orgmaster",
"ag_content_ims_acs_prod.gold_wos.d_organization",
"ag_content_ims_acs_prod.gold_wos.f_publication",
)

Watermarks.getWatermarkForTable(tableNames)
.foreach { case (key, (v1, v2)) =>
  println(s"$key -> ($v1, $v2)")
} 

println("--------------------------------")

Watermarks.getWatermarkForTable().foreach { case (key, (v1, v2)) =>
  println(s"$key -> ($v1, $v2)")
}


25/12/13 02:42:27 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 6in the parser. Driver memory: 49325015040.
25/12/13 02:42:28 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 6in the parser. Driver memory: 49325015040.
25/12/13 02:42:28 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 6in the parser. Driver memory: 49325015040.


ag_content_ims_acs_prod.gold_entity.d_orgmaster -> (2, 2)
ag_content_ims_acs_prod.gold_entity.d_spmaster -> (2, 2)
--------------------------------


25/12/13 02:42:30 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 6in the parser. Driver memory: 49325015040.
25/12/13 02:42:30 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 6in the parser. Driver memory: 49325015040.
25/12/13 02:42:30 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 6in the parser. Driver memory: 49325015040.


ag_content_ims_acs_prod.gold_entity.d_spmaster -> (2, 2)
ag_content_ims_acs_prod.gold_entity.grantmaster_orgmaster_link -> (2, 2)
ag_content_ims_acs_prod.gold_entity.funding_org_publication_link -> (7, 7)
ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions -> (2, 2)
ag_content_ims_acs_prod.gold_entity.d_esi_article -> (2, 2)
ag_content_ims_acs_prod.gold_entity.d_ip_organisation -> (2, 2)
ag_content_ims_acs_prod.gold_entity.d_funding_organization -> (7, 7)
ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator -> (2, 2)
ag_content_ims_acs_prod.gold_entity.orcid_rid_publication_link -> (2, 2)
ag_content_ims_acs_prod.gold_entity.d_ip_variables -> (2, 2)
ag_content_ims_acs_prod.gold_entity.d_nuts -> (8, 8)
ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator -> (2, 2)
ag_content_ims_acs_prod.gold_entity.f_jcr_journal_jci -> (27, 27)
ag_content_ims_acs_prod.gold_entity.f_ip_institution -> (5, 5)
ag_content_ims_acs_prod.gold_entity.d_alma_openaccess -> (2, 2)
ag_content_

[36mtableNames[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"ag_content_ims_acs_prod.gold_entity.d_spmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_entity.d_orgmaster"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.d_organization"[39m,
  [32m"ag_content_ims_acs_prod.gold_wos.f_publication"[39m
)

In [0]:

// This can be run in the first step - PPL automation job
// baselineRun = true ->  Baseline, false -> Delta 
// DryRun if true - Generate batch data(start_version / end_version) for those tables

val startTime = System.nanoTime()

val batchDF = Watermarks.initializeWatermark(true)
println(s"batch_id: ${batchDF}")

val endTime = System.nanoTime()
val durationSec = (endTime - startTime) / 1e9  // duration in seconds

println(s"Duration: $durationSec seconds")
    

ag_content_ims_acs_prod.gold_entity.d_alma_openaccess
ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions
ag_content_ims_acs_prod.gold_entity.d_esi_article
ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator
ag_content_ims_acs_prod.gold_entity.d_esi_papers
ag_content_ims_acs_prod.gold_entity.d_funding_organization
ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem
ag_content_ims_acs_prod.gold_entity.d_grantmaster_grantid
ag_content_ims_acs_prod.gold_entity.d_ip_organisation
ag_content_ims_acs_prod.gold_entity.d_ip_subject
ag_content_ims_acs_prod.gold_entity.d_ip_variable_data
ag_content_ims_acs_prod.gold_entity.d_ip_variables
ag_content_ims_acs_prod.gold_entity.d_jcr_journals
ag_content_ims_acs_prod.gold_entity.d_nuts
ag_content_ims_acs_prod.gold_entity.d_nuts_code
ag_content_ims_acs_p

25/12/12 20:15:01 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:15:01 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.


ag_content_ims_acs_prod.gold_wos.address_publication_link
ag_content_ims_acs_prod.gold_wos.author_publication_link
ag_content_ims_acs_prod.gold_wos.category_publication_link
ag_content_ims_acs_prod.gold_wos.country_publication_link
ag_content_ims_acs_prod.gold_wos.country_territory_link
ag_content_ims_acs_prod.gold_wos.d_alc_complete_labels
ag_content_ims_acs_prod.gold_wos.d_article_flag
ag_content_ims_acs_prod.gold_wos.d_article_flag_woscore
ag_content_ims_acs_prod.gold_wos.d_article_identifiers
ag_content_ims_acs_prod.gold_wos.d_article_metrics
ag_content_ims_acs_prod.gold_wos.d_article_metrics_woscore
ag_content_ims_acs_prod.gold_wos.d_article_total_cites
ag_content_ims_acs_prod.gold_wos.d_article_total_cites_woscore
ag_content_ims_acs_prod.gold_wos.d_article_type
ag_content_ims_acs_prod.gold_wos.d_article_type_precedence
ag_content_ims_acs_prod.gold_wos.d_author
ag_content_ims_acs_prod.gold_wos.d_category
ag_content_ims_acs_prod.gold_wos.d_citation
ag_content_ims_acs_prod.gold_wos.

25/12/12 20:18:12 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:18:12 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.


ag_content_ims_acs_prod.gold_pprn.address_publication_link
ag_content_ims_acs_prod.gold_pprn.author_publication_link
ag_content_ims_acs_prod.gold_pprn.category_publication_link
ag_content_ims_acs_prod.gold_pprn.country_publication_link
ag_content_ims_acs_prod.gold_pprn.country_territory_link
ag_content_ims_acs_prod.gold_pprn.d_article_flag
ag_content_ims_acs_prod.gold_pprn.d_article_identifiers
ag_content_ims_acs_prod.gold_pprn.d_article_metrics
ag_content_ims_acs_prod.gold_pprn.d_article_total_cites
ag_content_ims_acs_prod.gold_pprn.d_article_type
ag_content_ims_acs_prod.gold_pprn.d_article_type_precedence
ag_content_ims_acs_prod.gold_pprn.d_author
ag_content_ims_acs_prod.gold_pprn.d_category
ag_content_ims_acs_prod.gold_pprn.d_citation
ag_content_ims_acs_prod.gold_pprn.d_citation_flag
ag_content_ims_acs_prod.gold_pprn.d_citation_patent
ag_content_ims_acs_prod.gold_pprn.d_citation_patents_flag
ag_content_ims_acs_prod.gold_pprn.d_citations
ag_content_ims_acs_prod.gold_pprn.d_country
ag

25/12/12 20:20:16 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:20:16 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:20:17 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:20:18 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:20:18 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:20:18 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached 

table_name,last_processed_version,end_ts,start_version,end_version,start_ts,status,updated_by,latest_available_version,update_ts,error_message,batch_id
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_alma_openaccess,,2025-11-27T08:34:35Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_alma_subscriptions,,2025-11-27T08:35:26Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_article,,2025-11-27T08:15:30Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_author_indicator,,2025-11-27T08:57:27Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_country_indicator,,2025-11-27T08:55:57Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_institution_indicator,,2025-11-27T08:55:59Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_journal_indicator,,2025-11-27T08:55:58Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_esi_papers,,2025-11-27T08:15:31Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_funding_organization,,2025-12-05T14:47:06.001Z,0,7,,Ready,agra-sa-authorprofile-pipeline,7,2025-12-12T20:20:19.972192Z,,1
ag_content_ims_acs_prod.gold_entity.ag_content_ims_acs_prod.gold_entity.d_grantmaster_dataitem,,2025-11-27T09:10:38Z,0,2,,Ready,agra-sa-authorprofile-pipeline,2,2025-12-12T20:20:19.972192Z,,1


25/12/12 20:20:20 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.
25/12/12 20:20:20 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 0 DFA states in the parser. Total cached DFA states: 9in the parser. Driver memory: 49325015040.


batch_id: 1
Duration: 394.469414082 seconds


[36mstartTime[39m: [32mLong[39m = [32m22033868745475L[39m
[36mbatchDF[39m: [32mLong[39m = [32m1L[39m
[36mendTime[39m: [32mLong[39m = [32m22428338159557L[39m
[36mdurationSec[39m: [32mDouble[39m = [32m394.469414082[39m

In [0]:
// This can be run in the last step - PPL automation job to create a new checkpoint
//dryRun if true 

val completedDF = Watermarks.completeWatermark(true)
display(completedDF)

25/12/10 19:22:14 INFO AbstractParser$ParserCaches: EXPERIMENTAL: Query cached 1 DFA states in the parser. Total cached DFA states: 7in the parser. Driver memory: 49325015040.


batch_id,table_name,start_version,end_version,last_processed_version,latest_available_version,start_ts,end_ts,cdf_enabled,status,error_message,update_ts,updated_by


[36mcompletedDF[39m: [32mDataFrame[39m = [batch_id: bigint, table_name: string ... 11 more fields]

In [0]:
Watermarks.insertFirstWatermark()