In [0]:

// The parameters below are supplied by the bundle's Databricks YAML config file.
// Each widget is registered with the default shown here; a value passed in by the
// job/bundle overrides that default.
Seq(
  "source_catalog"     -> "ag_content_ims_acs",
  "source_environment" -> "prod",
  "source_version"     -> "v1_0_0",
  "target_catalog"     -> "ag_ra_search_analytics_data",
  "target_environment" -> "dev",
  "target_version"     -> "v1_0",
  "pipeline_name"      -> "agra-sa-authorprofile-pipeline"
).foreach { case (widgetName, defaultValue) =>
  dbutils.widgets.text(widgetName, defaultValue)
}

// Dynamic parameters: read back the effective value of every widget.
val source_catalog     = dbutils.widgets.get("source_catalog")
val source_environment = dbutils.widgets.get("source_environment")
val source_version     = dbutils.widgets.get("source_version")
val target_catalog     = dbutils.widgets.get("target_catalog")
val target_environment = dbutils.widgets.get("target_environment")
val target_version     = dbutils.widgets.get("target_version")
val pipeline_name      = dbutils.widgets.get("pipeline_name")





In [0]:
%run "/Workspace/Shared/wosri/dap_shared/table_resolver" 

#### Define CDC Reader - Type 1 & 2

In [0]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import io.delta.tables.DeltaTable

/**
 * Reader for CDC "Type 2" tables, i.e. tables that carry an `__END_AT` column.
 *
 * Throughout this object a row whose `__END_AT` is NULL is treated as the current
 * version of its key and a row whose `__END_AT` is set is treated as closed.
 *
 * Relies on the notebook-provided `spark` session.
 */
object CdcType2Reader {

  import scala.util.control.NonFatal

  /** Columns added by the Delta Change Data Feed that the incremental logic relies on. */
  private val CdfSystemCols =
    Seq("_commit_version", "_change_type")

  /** Key used by [[read]] when the caller does not supply `primaryKeyCols`. */
  private val DefaultPrimaryKeyCols = Seq("uid")

  /** Name of the validity-end column used to tell current rows from closed ones. */
  private val EndAtCol = "__END_AT"

  /**
   * Unified entry point.
   *
   * When `startVersion == endVersion` a baseline snapshot as of `endVersion` is returned
   * together with an empty DataFrame; otherwise the incremental change set is returned.
   *
   * @param tableName      fully qualified table name
   * @param startVersion   first Delta version to read (inclusive)
   * @param endVersion     last Delta version to read (inclusive)
   * @param selectedCols   columns to keep; empty means all columns
   * @param primaryKeyCols business key columns; empty falls back to `uid`
   * @return `(upsertsOrBaseline, deletes)`
   */
  def read(
      tableName: String,
      startVersion: Long,
      endVersion: Long,
      selectedCols: Seq[String] = Seq.empty,
      primaryKeyCols: Seq[String] = Seq.empty
  ): (DataFrame, DataFrame) = {

    // Previously a hard-coded Seq("uid") was passed positionally and landed in the
    // `selectedCols` slot, so both parameters of this method were ignored and the
    // incremental path never de-duplicated. Honour the caller's arguments and keep
    // `uid` only as the default key.
    val keyCols = if (primaryKeyCols.isEmpty) DefaultPrimaryKeyCols else primaryKeyCols

    if (startVersion == endVersion) {
      val baseline = readBaseline(
        tableName,
        endVersion     = Some(endVersion),
        selectedCols   = selectedCols,
        primaryKeyCols = keyCols
      )
      (baseline, spark.emptyDataFrame)
    } else {
      readIncremental(
        tableName,
        startVersion   = startVersion,
        endVersion     = endVersion,
        selectedCols   = selectedCols,
        primaryKeyCols = keyCols
      )
    }
  }

  /**
   * Baseline snapshot as of a given Delta version, restricted to current rows
   * (`__END_AT IS NULL`) and tagged with `_op = "BASELINE"`.
   *
   * If the versioned Delta read fails, the method falls back to `spark.table(tableName)`
   * (e.g. for a non-Delta table or a view) and omits the `_baseline_version` column.
   * The fallback is reported on stdout so that it is not mistaken for a versioned read.
   *
   * @param tableName      fully qualified table name
   * @param endVersion     Delta version to read; `None` means the latest version
   * @param selectedCols   columns to keep; empty means all columns
   * @param primaryKeyCols key columns that are always kept when `selectedCols` is non-empty
   * @return the snapshot with `_op` and, for versioned reads, `_baseline_version`
   */
  def readBaseline(
      tableName: String,
      endVersion: Option[Long] = None,
      selectedCols: Seq[String] = Seq.empty,
      primaryKeyCols: Seq[String] = Seq.empty
  ): DataFrame = {

    val (snapshot, baselineVersionOpt): (DataFrame, Option[Long]) =
      try {
        // forName is resolved even when endVersion is given: it doubles as the
        // "is this a Delta table?" probe that triggers the fallback below.
        val deltaTable = DeltaTable.forName(spark, tableName)

        val version = endVersion.getOrElse {
          // Row access instead of `.as[Long]` so no implicit encoder is required.
          deltaTable
            .history(1)
            .select(max(col("version")))
            .head()
            .getLong(0)
        }

        val versioned = spark.read
          .format("delta")
          .option("versionAsOf", version)
          .table(tableName)

        (versioned, Some(version))
      } catch {
        case NonFatal(e) =>
          println(
            s"[CdcType2Reader] versioned read of $tableName failed (${e.getMessage}); " +
              "falling back to spark.table without a baseline version"
          )
          (spark.table(tableName), None)
      }

    // Filter before projecting so the filter never depends on __END_AT being part
    // of the caller's column selection. The output schema is unchanged.
    val currentRows = snapshot.filter(col(EndAtCol).isNull)

    val projected =
      if (selectedCols.isEmpty) currentRows
      else currentRows.select((primaryKeyCols ++ selectedCols).distinct.map(col): _*)

    val labelled = projected.withColumn("_op", lit("BASELINE"))

    baselineVersionOpt.fold(labelled) { v =>
      labelled.withColumn("_baseline_version", lit(v))
    }
  }

  /**
   * Incremental read using the Delta Change Data Feed.
   *
   * Only `insert`, `update_postimage` and `delete` events are considered. When
   * `primaryKeyCols` is non-empty the newest event per key is kept. The remaining
   * rows are split into:
   *  - upserts: `__END_AT IS NULL` and the event is not a `delete`
   *  - deletes: `__END_AT IS NOT NULL`, or the event is a `delete`
   *
   * @param tableName      fully qualified table name
   * @param startVersion   first commit version (inclusive)
   * @param endVersion     last commit version (inclusive)
   * @param selectedCols   columns to keep; empty means all columns
   * @param primaryKeyCols business key used for de-duplication; empty disables it
   * @return `(upserts, deletes)`, each tagged with `_op`; two empty DataFrames when
   *         `startVersion > endVersion`
   */
  def readIncremental(
      tableName: String,
      startVersion: Long,
      endVersion: Long,
      selectedCols: Seq[String] = Seq.empty,
      primaryKeyCols: Seq[String] = Seq.empty
  ): (DataFrame, DataFrame) =
    if (startVersion > endVersion) {
      (spark.emptyDataFrame, spark.emptyDataFrame)
    } else {
      val cdf = spark.read
        .format("delta")
        .option("readChangeFeed", "true")
        .option("startingVersion", startVersion)
        .option("endingVersion", endVersion)
        .table(tableName)
        .filter(col("_change_type").isin("insert", "update_postimage", "delete"))

      // De-duplicate and classify on the full change feed, project afterwards:
      // the ordering and the filters need _change_type and __END_AT even when the
      // caller did not select them.
      val latestPerKey = dedupeLatest(cdf, primaryKeyCols)

      // A CDF `delete` event describes a row that no longer exists, so it is a
      // delete even if its last image still had __END_AT = NULL.
      val isDelete = col(EndAtCol).isNotNull || col("_change_type") === "delete"

      def finish(rows: DataFrame, op: String): DataFrame =
        projectCols(rows, primaryKeyCols, selectedCols, includeCdfCols = true)
          .withColumn("_op", lit(op))

      val upserts = finish(latestPerKey.filter(!isDelete), "UPSERT")
      val deletes = finish(latestPerKey.filter(isDelete), "DELETE")

      (upserts, deletes)
    }

  /**
   * Keep one row per business key: the one from the highest `_commit_version`.
   *
   * Several events for the same key can share a commit version (for example a closed
   * row and its replacement). Ties are broken deterministically by preferring
   * non-`delete` events, then rows whose `__END_AT` is NULL. Rows that still tie after
   * that are picked arbitrarily.
   *
   * Returns `df` unchanged when `primaryKeyCols` is empty.
   */
  private def dedupeLatest(
      df: DataFrame,
      primaryKeyCols: Seq[String] = Seq.empty
  ): DataFrame =
    if (primaryKeyCols.isEmpty) {
      df
    } else {
      val newestFirst = Window
        .partitionBy(primaryKeyCols.map(col): _*)
        .orderBy(
          col("_commit_version").desc_nulls_last,
          (col("_change_type") === "delete").asc, // false sorts first: non-delete wins
          col(EndAtCol).isNull.desc               // true sorts first: open row wins
        )

      df.withColumn("_rn", row_number().over(newestFirst))
        .filter(col("_rn") === 1)
        .drop("_rn")
    }

  /**
   * Project columns safely.
   *
   *  - If `selectedCols` is empty the DataFrame is returned as-is (all columns).
   *  - Otherwise the result contains primary keys, selected columns and system columns,
   *    in that order, without duplicates. Names that do not exist in `df` are skipped
   *    rather than raising an error.
   *
   * @param includeCdfCols when true keep both CDF system columns, otherwise only
   *                       `_commit_version`
   */
  private def projectCols(
      df: DataFrame,
      primaryKeyCols: Seq[String] = Seq.empty,
      selectedCols: Seq[String] = Seq.empty,
      includeCdfCols: Boolean = false
  ): DataFrame =
    if (selectedCols.isEmpty) {
      df
    } else {
      val systemCols = if (includeCdfCols) CdfSystemCols else Seq("_commit_version")
      val available  = df.columns.toSet
      val finalCols  =
        (primaryKeyCols ++ selectedCols ++ systemCols).distinct.filter(available.contains)

      df.select(finalCols.map(col): _*)
    }
}


In [0]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import scala.util.chaining._


/**
 * Reader for CDC "Type 1" tables: change events are classified purely by the
 * Change Data Feed `_change_type` column.
 *
 * Relies on the notebook-provided `spark` session.
 */
object CdcType1Reader {

  import scala.util.control.NonFatal

  /** Columns added by the Delta Change Data Feed that the incremental logic relies on. */
  private val CdfSystemCols = Seq("_commit_version", "_change_type")

  /**
   * Unified entry point.
   *
   * When `startVersion == endVersion` a baseline snapshot as of `endVersion` is returned
   * together with an empty DataFrame; otherwise the incremental change set is returned.
   *
   * @param tableName      fully qualified table name
   * @param startVersion   first Delta version to read (inclusive)
   * @param endVersion     last Delta version to read (inclusive)
   * @param selectedCols   columns to keep; empty means all columns
   * @param primaryKeyCols optional business key; when given, the incremental path keeps
   *                       only the newest event per key (the baseline path ignores it)
   * @return `(upsertsOrBaseline, deletes)`
   */
  def read(
      tableName: String,
      startVersion: Long,
      endVersion: Long,
      selectedCols: Seq[String] = Seq.empty,
      primaryKeyCols: Seq[String] = Seq.empty
  ): (DataFrame, DataFrame) =
    if (startVersion == endVersion) {
      val baseline = readBaseline(tableName, Some(endVersion), selectedCols)
      (baseline, spark.emptyDataFrame)
    } else {
      readIncremental(
        tableName,
        startVersion   = startVersion,
        endVersion     = endVersion,
        selectedCols   = selectedCols,
        primaryKeyCols = primaryKeyCols
      )
    }

  /**
   * Baseline snapshot for Type 1.
   *
   * If the versioned Delta read fails, the method falls back to `spark.table(tableName)`
   * (e.g. for a view). The fallback is reported on stdout so that it is not mistaken
   * for a versioned read.
   *
   * @param tableName    fully qualified table name
   * @param endVersion   Delta version to read; `None` means the latest version
   * @param selectedCols columns to keep; empty means all columns
   */
  def readBaseline(
      tableName: String,
      endVersion: Option[Long] = None,
      selectedCols: Seq[String] = Seq.empty
  ): DataFrame = {

    val snapshot =
      try {
        val version = endVersion.getOrElse {
          // Row access instead of `.as[Long]` so no implicit encoder is required.
          io.delta.tables.DeltaTable
            .forName(spark, tableName)
            .history(1)
            .select(max("version"))
            .head()
            .getLong(0)
        }
        spark.read.format("delta").option("versionAsOf", version).table(tableName)
      } catch {
        case NonFatal(e) =>
          println(
            s"[CdcType1Reader] versioned read of $tableName failed (${e.getMessage}); " +
              "falling back to spark.table"
          )
          spark.table(tableName)
      }

    if (selectedCols.isEmpty) snapshot else snapshot.select(selectedCols.map(col): _*)
  }

  /**
   * Incremental read for Type 1 via the Delta Change Data Feed.
   *
   * Only `insert`, `update_postimage` and `delete` events are considered.
   *  - upserts: `insert` and `update_postimage` events, tagged `_op = "UPSERT"`
   *  - deletes: `delete` events, tagged `_op = "DELETE"`
   *
   * Without `primaryKeyCols` every event in the version range is returned, so a key
   * that changed several times appears several times and may appear in both outputs.
   * With `primaryKeyCols` only the newest event per key is kept.
   *
   * @param tableName      fully qualified table name
   * @param startVersion   first commit version (inclusive)
   * @param endVersion     last commit version (inclusive)
   * @param selectedCols   columns to keep; empty means all columns. The CDF system
   *                       columns are always kept; names missing from the feed are skipped
   * @param primaryKeyCols optional business key used for de-duplication
   * @return `(upserts, deletes)`; two empty DataFrames when `startVersion > endVersion`
   */
  def readIncremental(
      tableName: String,
      startVersion: Long,
      endVersion: Long,
      selectedCols: Seq[String] = Seq.empty,
      primaryKeyCols: Seq[String] = Seq.empty
  ): (DataFrame, DataFrame) =
    if (startVersion > endVersion) {
      (spark.emptyDataFrame, spark.emptyDataFrame)
    } else {
      val changes = spark.read.format("delta")
        .option("readChangeFeed", "true")
        .option("startingVersion", startVersion)
        .option("endingVersion", endVersion)
        .table(tableName)
        .filter(col("_change_type").isin("insert", "update_postimage", "delete"))

      // De-duplicate on the full feed so key columns need not be part of selectedCols.
      val latest = latestPerKey(changes, primaryKeyCols)

      val projected =
        if (selectedCols.isEmpty) latest
        else {
          val available = latest.columns.toSet
          val keep =
            (primaryKeyCols ++ selectedCols ++ CdfSystemCols).distinct.filter(available.contains)
          latest.select(keep.map(col): _*)
        }

      val upserts = projected
        .filter(col("_change_type").isin("insert", "update_postimage"))
        .withColumn("_op", lit("UPSERT"))

      val deletes = projected
        .filter(col("_change_type") === "delete")
        .withColumn("_op", lit("DELETE"))

      (upserts, deletes)
    }

  /**
   * Keep the event with the highest `_commit_version` per key. When a key has both a
   * `delete` and a non-`delete` event in the same commit, the non-`delete` event wins.
   * Returns `df` unchanged when `primaryKeyCols` is empty.
   */
  private def latestPerKey(df: DataFrame, primaryKeyCols: Seq[String]): DataFrame =
    if (primaryKeyCols.isEmpty) {
      df
    } else {
      val newestFirst = org.apache.spark.sql.expressions.Window
        .partitionBy(primaryKeyCols.map(col): _*)
        .orderBy(
          col("_commit_version").desc_nulls_last,
          (col("_change_type") === "delete").asc // false sorts first: non-delete wins
        )

      df.withColumn("_rn", row_number().over(newestFirst))
        .filter(col("_rn") === 1)
        .drop("_rn")
    }
}


### Test

##### TableResolver

In [0]:
// Datasets to run the resolver smoke test against.
val datasetList: Seq[Dataset] = Seq(Dataset.WosCore, Dataset.Pprn, Dataset.WosEsci)

// For each dataset: switch the resolver to it, then print a few ACS table handles.
// NOTE(review): presumably forDataset changes what the ACS.* handles resolve to - confirm
// against table_resolver.
for (dataset <- datasetList) {
  TableResolver.forDataset(dataset)

  println("-------------------------")
  println("Dataset: " + dataset)

  println(ACS.DArticleTotalCites)
  println(ACS.DAlmaOpenaccess)
  println(ACS.AuthorPublicationLink)
}

In [0]:
// Print the resolved full names of a few DAP tables.
println("---------DAP----------------")
Seq(
  DAP.Alma.fullName,
  DAP.ApArticle.fullName,
  DAP.IncitesRiOrgGrants.fullName,
  DAP.Authorprofile.fullName
).foreach(println)

// Print the resolved full names of a few UDM tables.
println("---------UDM----------------")
Seq(
  UDM.GrantsTopic.fullName,
  UDM.ProfileGrantRelation.fullName,
  UDM.ItemTopic.fullName
).foreach(println)


##### CdcType2Reader

In [0]:
// Optional arguments of readBaseline: endVersion, selectedCols, primaryKeyCols

TableResolver.forDataset(Dataset.WosCore)
println(ACS.FPublication)

// The Seq("uid") argument binds to `selectedCols` (the third parameter), not to
// `primaryKeyCols`; it is named here to make that explicit.
// Alternative selection: selectedCols = Seq("uid", "pub_year", "__END_AT")
val df = CdcType2Reader.readBaseline(
  tableName    = ACS.FPublication.fullName,
  endVersion   = Some(50L),
  selectedCols = Seq("uid")
)

// display(df)
println(df.count)

In [0]:

// Incremental read of versions 57..61.
// The Seq("uid") argument binds to `selectedCols`; no `primaryKeyCols` is passed,
// so the reader does not de-duplicate per key in this call.
val (upserts, deletes) = CdcType2Reader.readIncremental(
  tableName    = ACS.FPublication.fullName,
  startVersion = 57L,
  endVersion   = 61L,
  selectedCols = Seq("uid")
)

println("upserts: " + upserts.count())
println("deletes  " + deletes.count())
// display(upserts)

In [0]:
// Unified read: baseline if startVersion == endVersion, otherwise incremental.

val (upserts, deletes) = CdcType2Reader.read(
  tableName    = ACS.FPublication.fullName,
  startVersion = 57L,
  endVersion   = 61L
)

println("upserts: " + upserts.count())
println("deletes  " + deletes.count())

##### CdcType1Reader

In [0]:
// endVersion is optional; when omitted the reader uses the latest version.

println(DAP.JcrMetrics.fullName)

val df = CdcType1Reader.readBaseline(
  tableName  = DAP.JcrMetrics.fullName,
  endVersion = Some(5L)
)

println("df  " + df.count())
// display(df)


In [0]:

// Incremental Type 1 read of versions 4..6.
val (upserts, deletes) = CdcType1Reader.readIncremental(
  tableName    = DAP.Authorprofile.fullName,
  startVersion = 4,
  endVersion   = 6
)

println("upserts: " + upserts.count())
println("deletes  " + deletes.count())
// display(upserts)


In [0]:
// Unified read: baseline if startVersion == endVersion, otherwise incremental.
// Here both versions are 3, so the baseline path is taken and `deletes` is empty.

val (upserts, deletes) = CdcType1Reader.read(
  tableName    = DAP.Authorprofile.fullName,
  startVersion = 3,
  endVersion   = 3
)

println("upserts: " + upserts.count())
println("deletes  " + deletes.count())