#### 1. DeltaLinkGenerator.scala

In [None]:
// DeltaLinkGenerator.scala
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import io.delta.tables._
import java.sql.Timestamp

object DeltaLinkGenerator {

  // -------------------------
  // Config / Case classes
  // -------------------------
  case class Upstream(alias: String, tablePath: String, keyCol: String)
  case class PipelineConfig(
    pipelineName: String,
    upstreams: Seq[Upstream],
    // Optional join SQL: must produce columns: entity_key, last_commit_version, last_change_ts, upstream_tables (array<string>)
    // The SQL may reference temp views with names equal to the upstream.alias (e.g., SELECT ... FROM a JOIN b ON ...)
    // If joinSql is None, the engine will UNION distinct keys from each upstream (simple case).
    joinSql: Option[String] = None,
    // path where per-pipeline link table will be stored (delta)
    linkTablePath: String,
    // optional checkpoint path for streaming
    checkpointPath: Option[String] = None
  )

  // Tracker table name (metastore) or path; here we use a Delta path for portability
  val trackerPathDefault = "/mnt/delta/delta_link_tracker"

  // -------------------------
  // Utilities
  // -------------------------
  private def spark: SparkSession = SparkSession.builder().getOrCreate()

  import spark.implicits._

  // Initialize tracker table if not exists
  def initTracker(trackerPath: String = trackerPathDefault): Unit = {
    val spark = this.spark
    import spark.implicits._

    if (!io.delta.tables.DeltaTable.isDeltaTable(spark, trackerPath)) {
      val empty = Seq.empty[(String, Long, Timestamp)].toDF("upstream_table", "last_processed_version", "last_processed_ts")
      empty.write.format("delta").mode("overwrite").save(trackerPath)
      println(s"Created tracker table at $trackerPath")
    } else {
      println(s"Tracker table already exists at $trackerPath")
    }
  }

  // read last processed version for a table (0 if not present)
  def getLastProcessedVersion(upstreamTable: String, trackerPath: String = trackerPathDefault): Long = {
    val spark = this.spark
    import spark.implicits._

    if (!io.delta.tables.DeltaTable.isDeltaTable(spark, trackerPath)) {
      return 0L
    }
    val df = spark.read.format("delta").load(trackerPath)
    val row = df.filter($"upstream_table" === upstreamTable).select("last_processed_version").as[Long].collect()
    if (row.isEmpty) 0L else row.head
  }

  // update tracker entries for multiple upstreams
  def updateTracker(updates: Seq[(String, Long, java.sql.Timestamp)], trackerPath: String = trackerPathDefault): Unit = {
    val spark = this.spark
    import spark.implicits._

    // ensure exists
    initTracker(trackerPath)

    val temp = updates.toDF("upstream_table", "last_processed_version", "last_processed_ts")
    val dt = DeltaTable.forPath(spark, trackerPath)

    // Merge: update existing rows, insert new ones
    dt.as("t")
      .merge(
        temp.as("s"),
        "t.upstream_table = s.upstream_table"
      )
      .whenMatched()
      .updateExpr(Map(
        "last_processed_version" -> "s.last_processed_version",
        "last_processed_ts" -> "s.last_processed_ts"
      ))
      .whenNotMatched()
      .insertExpr(Map(
        "upstream_table" -> "s.upstream_table",
        "last_processed_version" -> "s.last_processed_version",
        "last_processed_ts" -> "s.last_processed_ts"
      ))
      .execute()
  }

  // -------------------------
  // Batch: read change rows for an upstream from lastProcessedVersion+1 to latest
  // returns DataFrame with its columns plus _commit_version and _commit_timestamp
  // and renames the key column to the user-specified name for convenience (kept as original)
  // -------------------------
  def readChangesBatch(up: Upstream, fromVersionExclusive: Long): DataFrame = {
    // readChangeData true supports .option("startingVersion", v) or we can filter by _commit_version > v
    // We'll read the change feed from beginning (or use starting version) and filter by > fromVersionExclusive
    val dfRaw = spark.read
      .format("delta")
      .option("readChangeData", "true")
      .load(up.tablePath)

    // Some environments might require explicitly filtering _commit_version > fromVersionExclusive
    val df = dfRaw
      .filter(col("_change_type").isin("insert", "update", "delete"))
      .filter(col("_commit_version") > lit(fromVersionExclusive))
      .withColumn("upstream_table", lit(up.tablePath))
      .withColumnRenamed(up.keyCol, up.keyCol) // keep original name but it's available in the view

    df
  }

  // -------------------------
  // Aggregation & merge helper for link table
  // Input: DataFrame expected columns -> entity_key, last_commit_version, last_change_ts, upstream_tables (array<string>)
  // -------------------------
  def writeLinkTableWithMerge(pipeline: PipelineConfig, df: DataFrame): Unit = {
    val spark = this.spark
    import spark.implicits._

    // Ensure the output table exists; if not create with df.schema
    if (!io.delta.tables.DeltaTable.isDeltaTable(spark, pipeline.linkTablePath)) {
      df.write.format("delta").mode("overwrite").save(pipeline.linkTablePath)
      println(s"Created link table at ${pipeline.linkTablePath}")
      return
    }

    val deltaT = DeltaTable.forPath(spark, pipeline.linkTablePath)
    // Merge by pipeline + entity_key; pipelineName stored in df or we add it
    val src = df.withColumn("pipeline_name", lit(pipeline.pipelineName)).alias("s")
    deltaT.as("t")
      .merge(
        src,
        "t.pipeline_name = s.pipeline_name AND t.entity_key = s.entity_key"
      )
      .whenMatched("s.last_commit_version > t.last_commit_version")
      .updateExpr(Map(
        "last_commit_version" -> "s.last_commit_version",
        "last_change_ts" -> "s.last_change_ts",
        "upstream_tables" -> "s.upstream_tables",
        "updated_at" -> "current_timestamp()"
      ))
      .whenNotMatched()
      .insertExpr(Map(
        "pipeline_name" -> "s.pipeline_name",
        "entity_key" -> "s.entity_key",
        "last_commit_version" -> "s.last_commit_version",
        "last_change_ts" -> "s.last_change_ts",
        "upstream_tables" -> "s.upstream_tables",
        "created_at" -> "current_timestamp()",
        "updated_at" -> "current_timestamp()"
      ))
      .execute()
    println(s"MERGE completed for pipeline ${pipeline.pipelineName} into ${pipeline.linkTablePath}")
  }

  // -------------------------
  // Run batch processing for one pipeline (supports joinSql or union)
  // -------------------------
  def runPipelineBatch(pipeline: PipelineConfig, trackerPath: String = trackerPathDefault): Unit = {
    val spark = this.spark
    import spark.implicits._

    // 1) Read last processed versions for each upstream
    val lastVersions: Map[String, Long] = pipeline.upstreams.map { up =>
      up.alias -> getLastProcessedVersion(up.tablePath, trackerPath)
    }.toMap

    // 2) Find latest versions for upstream tables (from table history) to decide upper bound
    // We can find latest version via DeltaTable.forPath(...).history(1) or using snapshot metadata
    val latestVersions: Map[String, Long] = pipeline.upstreams.map { up =>
      val dt = io.delta.tables.DeltaTable.forPath(spark, up.tablePath)
      val latest = dt.history(1).select("version").as[Long].collect().headOption.getOrElse(0L)
      up.alias -> latest
    }.toMap

    // 3) For each upstream, read changes > lastProcessedVersion
    // Register as temp views with their alias name so joinSql can reference them.
    pipeline.upstreams.foreach { up =>
      val fromV = lastVersions(up.alias)
      val df = readChangesBatch(up, fromV)
      // Register temp view for SQL joins. The view keeps the original key column name.
      df.createOrReplaceTempView(up.alias)
    }

    // 4) Build final DF
    val finalDF: DataFrame = pipeline.joinSql match {
      case Some(sql) =>
        // User-provided SQL must return columns: entity_key, last_commit_version, last_change_ts, upstream_tables (array<string>)
        // We run it directly.
        val res = spark.sql(sql)
        // Validation for expected columns (light)
        val cols = res.columns.toSet
        require(cols.contains("entity_key") && cols.contains("last_commit_version") && cols.contains("last_change_ts"),
          s"joinSql must return entity_key, last_commit_version, last_change_ts. got: ${cols.mkString(",")}")
        // if upstream_tables is missing, try to create it from constants in SQL or fallback
        val withUpstreams = if (cols.contains("upstream_tables")) res
                            else res.withColumn("upstream_tables", array(lit(pipeline.upstreams.map(_.tablePath): _*)))
        withUpstreams
      case None =>
        // Simple union of distinct keys: we expect each upstream temp view to expose the key column provided in Upstream.keyCol.
        // We'll select each upstream's key as entity_key, commit_version, commit_ts, upstream_table then union.
        val perUp = pipeline.upstreams.map { up =>
          // select key as entity_key
          val colExpr = col(up.keyCol).as("entity_key")
          spark.table(up.alias)
            .select(colExpr, col("_commit_version"), col("_commit_timestamp"))
            .withColumn("upstream_tables", array(lit(up.tablePath)))
        }
        val unioned = perUp.reduce(_.unionByName(_, allowMissingColumns = true))
        // collapse to latest commit per entity_key
        unioned.groupBy("entity_key")
          .agg(
            max("_commit_version").as("last_commit_version"),
            max("_commit_timestamp").as("last_change_ts"),
            flatten(collect_set("upstream_tables")).as("upstream_tables")
          )
    }

    // Add pipeline_name if not present
    val finalWithPipeline = if (finalDF.columns.contains("pipeline_name")) finalDF else finalDF.withColumn("pipeline_name", lit(pipeline.pipelineName))

    // Normalize columns
    val normalized = finalWithPipeline
      .selectExpr("pipeline_name", "entity_key", "cast(last_commit_version as long) as last_commit_version", "cast(last_change_ts as timestamp) as last_change_ts", "upstream_tables")
      .withColumn("processed_at", current_timestamp())

    // 5) Write/MERGE into link table
    writeLinkTableWithMerge(pipeline, normalized)

    // 6) Update tracker: set last_processed_version to latestVersions per upstream
    val updates = pipeline.upstreams.map { up =>
      val latestV = latestVersions(up.alias)
      (up.tablePath, latestV, new java.sql.Timestamp(System.currentTimeMillis()))
    }
    updateTracker(updates, trackerPath)
  }

  // -------------------------
  // Run batch for many pipelines
  // -------------------------
  def runAllPipelinesBatch(pipelines: Seq[PipelineConfig], trackerPath: String = trackerPathDefault): Unit = {
    // ensure tracker exists
    initTracker(trackerPath)
    pipelines.foreach { p =>
      println(s"===== Running batch for pipeline: ${p.pipelineName} =====")
      try {
        runPipelineBatch(p, trackerPath)
      } catch {
        case e: Throwable =>
          println(s"ERROR running pipeline ${p.pipelineName}: ${e.getMessage}")
          e.printStackTrace()
      }
    }
  }

  // -------------------------
  // Streaming (continuous) - union-only mode
  // Note: join-based streaming is non-trivial (stateful multi-stream join) and is not implemented here.
  // If you need streaming joins, consider using Structured Streaming with event-time watermarks and careful
  // windowing, or use DLT which simplifies streaming joins. For now streaming supports UNION mode for each upstream.
  // -------------------------
  def runPipelineStreamingUnion(pipeline: PipelineConfig): Unit = {
    require(pipeline.joinSql.isEmpty, "Streaming union runner supports only pipelines with no joinSql (simple union).")

    val spark = this.spark
    import spark.implicits._

    // Build streaming union of upstream change streams
    val changeStreams = pipeline.upstreams.map { up =>
      spark.readStream
        .format("delta")
        .option("readChangeData", "true")
        .load(up.tablePath)
        .filter(col("_change_type").isin("insert", "update", "delete"))
        .select(col(up.keyCol).as("entity_key"), col("_commit_version"), col("_commit_timestamp"))
        .withColumn("upstream_tables", array(lit(up.tablePath)))
    }

    val unioned = changeStreams.reduce(_.unionByName(_, allowMissingColumns = true))

    val deduped = unioned
      .withWatermark("_commit_timestamp", "1 day")
      .groupBy("entity_key")
      .agg(
        max("_commit_version").as("last_commit_version"),
        max("_commit_timestamp").as("last_change_ts"),
        flatten(collect_set("upstream_tables")).as("upstream_tables")
      )
      .withColumn("pipeline_name", lit(pipeline.pipelineName))
      .withColumn("processed_at", current_timestamp())

    val checkpoint = pipeline.checkpointPath.getOrElse(s"/mnt/checkpoints/delta_link_${pipeline.pipelineName}")

    val query = deduped.writeStream
      .format("delta")
      .outputMode("update")
      .option("checkpointLocation", checkpoint)
      .option("mergeSchema", "true")
      .trigger(Trigger.ProcessingTime("5 minutes"))
      .start(pipeline.linkTablePath)

    println(s"Started streaming query for pipeline ${pipeline.pipelineName}; writing to ${pipeline.linkTablePath} with checkpoint $checkpoint")
    query.awaitTermination()
  }

  // -------------------------
  // Example: helper that constructs a simple JOIN SQL for two upstreams.
  // If you prefer to supply custom SQL, skip this and provide joinSql in config.
  // Example usage shown below in mainExample.
  // -------------------------
  def buildTwoTableJoinSql(aAlias: String, aKey: String, bAlias: String, bKey: String, downstreamKeyExpr: String): String = {
    // downstreamKeyExpr is an expression using aliases a and b (e.g. "coalesce(a.some_fk, b.other_fk) as entity_key")
    s"""
      SELECT
        ${downstreamKeyExpr},
        greatest(a._commit_version, b._commit_version) as last_commit_version,
        greatest(a._commit_timestamp, b._commit_timestamp) as last_change_ts,
        array('${aAlias}', '${bAlias}') as upstream_tables
      FROM $aAlias a
      FULL OUTER JOIN $bAlias b
        ON <PUT_JOIN_CONDITION_HERE>
      -- NOTE: Replace <PUT_JOIN_CONDITION_HERE> with actual join condition, e.g., a.x = b.x
    """
  }

  // -------------------------
  // Example main showing usage
  // -------------------------
  def mainExample(): Unit = {
    // Example pipelines
    val pipeline1 = PipelineConfig(
      pipelineName = "pipelineX",
      upstreams = Seq(
        Upstream("a", "/mnt/delta/upstream_A", "a_id"),
        Upstream("b", "/mnt/delta/upstream_B", "b_id")
      ),
      // Provide a joinSql that references temp views 'a' and 'b'
      // The minimal required output columns are: entity_key, last_commit_version, last_change_ts, upstream_tables
      joinSql = Some(
        """
        -- Example: join where downstream entity_key is coalesced foreign key
        SELECT
          coalesce(a.some_fk, b.some_fk) as entity_key,
          greatest(a._commit_version, b._commit_version) as last_commit_version,
          greatest(a._commit_timestamp, b._commit_timestamp) as last_change_ts,
          array(a.upstream_table, b.upstream_table) as upstream_tables
        FROM a
        FULL OUTER JOIN b
          ON a.some_fk = b.some_fk
        """
      ),
      linkTablePath = "/mnt/delta/delta_link_pipelineX",
      checkpointPath = Some("/mnt/checkpoints/delta_link_pipelineX")
    )

    val pipeline2 = PipelineConfig(
      pipelineName = "pipelineY",
      upstreams = Seq(
        Upstream("c", "/mnt/delta/upstream_C", "c_id")
      ),
      joinSql = None,
      linkTablePath = "/mnt/delta/delta_link_pipelineY",
      checkpointPath = Some("/mnt/checkpoints/delta_link_pipelineY")
    )

    // initialize tracker
    initTracker()

    // run batch for all pipelines
    runAllPipelinesBatch(Seq(pipeline1, pipeline2))
  }

}

#### 2.  Configuration

#### 3. DLT  Implemnetation

##### 3.1 readStream  - outer join

In [None]:

# outer join doesn't work in streaming

from pyspark import pipelines as dp
#from databricks import pipelines
from pyspark.sql.functions import col, lit

# ------------ CONFIG ------------------

#DB = "ag_ra_search_analytics_data_dev.sandbox_v1_0"
DB = "ag_content_ims_dev.acs_entity"
DB2= "ag_content_ims_dev.acs_wos"

SPMASTER = f"{DB}.d_spmaster"
AFFILIATION = f"{DB2}.d_daisng_ranked_affiliation"

START_VERSION = "2"   # Or load from pipeline checkpoint / control table


# =======================================================================
# 1. READ CDF CHANGES AS DP VIEWS
# =======================================================================

@dp.view
def spmaster_changes():
    return (
        spark.readStream.format("delta")
            .table(SPMASTER)
            #.filter(col("_change_type").isin("insert", "update_postimage", "delete"))
            #.selectExpr("diasng_id", "_change_type")
    )



@dp.view
def affiliation_changes():
    return (
        spark.readStream.format("delta")
            .table(AFFILIATION)
            #.filter(col("_change_type").isin("insert", "update_postimage", "delete"))
            #.selectExpr("sp_id", "institution_key", "_change_type")
    )


# =======================================================================
# 2. GENERATE DOWNSTREAM AFFECTED KEY PAIRS
# =======================================================================

@dp.table(
    name="link_sp_affiliation",
    comment="Affected (diasng_id, institution_key) derived from CDF changes from d_spmaster and d_daisng_ranked_affiliation."
)
def link_sp_affiliation():
    
    # Extract distinct changed keys from each upstream table
    sp = (
        dp.readStream("spmaster_changes")
                 .select("diasng_id")
                 .distinct()
    )

    aff = (
        dp.readStream("affiliation_changes")
                 .select("sp_id", "institution_key")
                 .distinct()
    )

    # FULL OUTER JOIN guarantees:
    #  - If spmaster changes → produce pairs
    #  - If affiliation changes → produce ALL diasng_id linked
    #  - Handles deletes, inserts, updates
    joined = (
        sp.join(aff, sp.diasng_id == aff.sp_id, "full_outer")
          .select(
              col("diasng_id"),
              col("institution_key")
          )
    )

    # Affected downstream keys
    return joined.filter(col("diasng_id").isNotNull())

##### 3.2 readStream - Union

In [None]:

# workable streaming pattern: union of both streams

from pyspark import pipelines as dp
from pyspark.sql.functions import col, lit

# ------------ CONFIG ------------------

DB = "ag_content_ims_dev.acs_entity"
DB2 = "ag_content_ims_dev.acs_wos"

SPMASTER = f"{DB}.d_spmaster"
AFFILIATION = f"{DB2}.d_daisng_ranked_affiliation"


# =======================================================================
# 1. READ CDF CHANGES AS DP VIEWS
# =======================================================================

@dp.view
def spmaster_changes():
    return (
        spark.readStream.format("delta")
             .table(SPMASTER)
             .select("diasng_id")
             .distinct()
    )


@dp.view
def affiliation_changes():
    return (
        spark.readStream.format("delta")
             .table(AFFILIATION)
             .select("sp_id", "institution_key")
             .distinct()
    )


# =======================================================================
# 2. GENERATE AFFECTED KEY PAIRS
# =======================================================================

@dp.table(
    name="link_sp_affiliation",
    comment="Affected (diasng_id, institution_key) derived from both CDF streams."
)
def link_sp_affiliation():

    # 1. SP master change → produce (diasng_id, null)
    sp_keys = (
        dp.readStream("spmaster_changes")
          .select(
              col("diasng_id"),
              lit(None).cast("string").alias("institution_key"),
              lit("spmaster").alias("source")
          )
    )

    # 2. Affiliation change → produce (diasng_id, institution_key)
    aff_keys = (
        dp.readStream("affiliation_changes")
          .select(
              col("sp_id").alias("diasng_id"),
              col("institution_key"),
              lit("affiliation").alias("source")
          )
    )

    # Union both streams — VALID streaming pattern
    unified = sp_keys.unionByName(aff_keys)

    # Deduplicate keys across microbatches
    return unified.dropDuplicates(["diasng_id", "institution_key"])

##### 3.2 read CDF

In [None]:


from pyspark import pipelines as dp
#from databricks import pipelines
from pyspark.sql.functions import col, lit

# ------------ CONFIG ------------------

DB = "ag_ra_search_analytics_data_dev.sandbox_v1_0"
DB1 = "ag_content_ims_dev.acs_entity"
DB2= "ag_content_ims_dev.acs_wos"

SPMASTER = f"{DB}.d_spmaster"
AFFILIATION = f"{DB}.d_daisng_ranked_affiliation"

START_VERSION = "2"   # Or load from pipeline checkpoint / control table


# =======================================================================
# 1. READ CDF CHANGES AS DP VIEWS
# =======================================================================

@dp.view
def spmaster_changes():
    return (
         spark.read.format("delta")
            .option("readChangeData", "true")
            .option("startingVersion", START_VERSION)
            .table(SPMASTER)
            .filter(col("_change_type").isin("insert", "update_postimage", "delete"))
            .selectExpr("diasng_id", "_change_type")
    )



@dp.view
def affiliation_changes():
    return (
        spark.read.format("delta")
            .option("readChangeData", "true")
            .option("startingVersion", START_VERSION)
            .table(AFFILIATION)
            .filter(col("_change_type").isin("insert", "update_postimage", "delete"))
            .selectExpr("sp_id", "institution_key", "_change_type")
    )


# =======================================================================
# 2. GENERATE DOWNSTREAM AFFECTED KEY PAIRS
# =======================================================================

@dp.table(
    name="link_sp_affiliation",
    comment="Affected (diasng_id, institution_key) derived from CDF changes from d_spmaster and d_daisng_ranked_affiliation."
)
def link_sp_affiliation():
    
    # Extract distinct changed keys from each upstream table
    sp = (
        dp.read("spmaster_changes")
                 .select("diasng_id")
                 .distinct()
    )

    aff = (
        dp.read("affiliation_changes")
                 .select("sp_id", "institution_key")
                 .distinct()
    )

    # FULL OUTER JOIN guarantees:
    #  - If spmaster changes → produce pairs
    #  - If affiliation changes → produce ALL diasng_id linked
    #  - Handles deletes, inserts, updates
    joined = (
        sp.join(aff, sp.diasng_id == aff.sp_id, "full_outer")
          .select(
              col("diasng_id"),
              col("institution_key")
          )
    )

    # Affected downstream keys
    return joined.filter(col("diasng_id").isNotNull())

In [4]:

  
  val target_catalog = "ag_ra_search_analytics_data"
  val target_environment = "dev"
  val target_version = "v1_0"


  def getDapSchemas(): Seq[String] = {
    val base = s"${target_catalog}_$target_environment"
    val suffixes = Seq(
      "dap_entity_wos",
      "dap_metrics_wos",
      "dap_entity_pprn",
      "dap_metrics_pprn",
      "dap_docs",
      "dap_reference",
      "dap_sort_ref",
      "dap_entity_enrich",
      "dap_grant",
      "dap_prod_core",
      "dap_ops",
      "dap_work"
    )
    suffixes.map { suffix =>
      s"$base.${suffix}_$target_version"
    }
  }



  getDapSchemas().foreach(println)  

target_catalog = ag_ra_search_analytics_data
target_environment = dev
target_version = v1_0


getDapSchemas: ()Seq[String]


ag_ra_search_analytics_data_dev.dap_entity_wos_v1_0
ag_ra_search_analytics_data_dev.dap_metrics_wos_v1_0
ag_ra_search_analytics_data_dev.dap_entity_pprn_v1_0
ag_ra_search_analytics_data_dev.dap_metrics_pprn_v1_0
ag_ra_search_analytics_data_dev.dap_docs_v1_0
ag_ra_search_analytics_data_dev.dap_reference_v1_0
ag_ra_search_analytics_data_dev.dap_sort_ref_v1_0
ag_ra_search_analytics_data_dev.dap_entity_enrich_v1_0
ag_ra_search_analytics_data_dev.dap_grant_v1_0
ag_ra_search_analytics_data_dev.dap_prod_core_v1_0
ag_ra_search_analytics_data_dev.dap_ops_v1_0
ag_ra_search_analytics_data_dev.dap_work_v1_0


v1_0

In [7]:

def getSchemaMap(): Map[String, String] = {
  val srcBase = "ag_content_ims_dev"
  val tgtBase = "ag_ra_search_analytics_data_dev"
  val versionSuffix = "v1_0_)"
  val tgtVerSuffix = "v1_0"

  // ACS schemas: key → base name (without version)
  val acsSchemas = Seq(
    "entity" -> s"$srcBase.gold_entity$versionSuffix",
    "wos"    -> s"$srcBase.gold_wos$versionSuffix",
    "pprn"   -> s"$srcBase.gold_pprn$versionSuffix"
  )

  // DAP schemas: key → base name (without target version suffix)
  val dapKeys = Seq(
    "dap_entity_wos",
    "dap_metrics_wos",
    "dap_entity_pprn",
    "dap_metrics_pprn",
    "dap_docs",
    "dap_reference",
    "dap_sort_ref",
    "dap_entity_enrich",
    "dap_grant",
    "dap_prod_core",
    "dap_ops",
    "dap_work"
  )

  val dapSchemas: Seq[(String, String)] =
    dapKeys.map { key =>
      key -> s"$tgtBase.$key$tgtVerSuffix"
    }

  // Optional sandbox entry
  val sandboxEntry = Seq(
    "dap" -> s"$tgtBase.sandbox$tgtVerSuffix"
  )

  // Combine all
  (acsSchemas ++ dapSchemas ++ sandboxEntry).toMap
}

  getSchemaMap().foreach{ case (k,v) => println(s"$k -> $v") }  


getSchemaMap: ()Map[String,String]


dap_docs -> ag_ra_search_analytics_data_dev.dap_docsv1_0
dap_entity_pprn -> ag_ra_search_analytics_data_dev.dap_entity_pprnv1_0
pprn -> ag_content_ims_dev.gold_pprnv1_0_)
dap_metrics_pprn -> ag_ra_search_analytics_data_dev.dap_metrics_pprnv1_0
dap_entity_wos -> ag_ra_search_analytics_data_dev.dap_entity_wosv1_0
dap_metrics_wos -> ag_ra_search_analytics_data_dev.dap_metrics_wosv1_0
dap_entity_enrich -> ag_ra_search_analytics_data_dev.dap_entity_enrichv1_0
wos -> ag_content_ims_dev.gold_wosv1_0_)
dap -> ag_ra_search_analytics_data_dev.sandboxv1_0
dap_reference -> ag_ra_search_analytics_data_dev.dap_referencev1_0
entity -> ag_content_ims_dev.gold_entityv1_0_)
dap_prod_core -> ag_ra_search_analytics_data_dev.dap_prod_corev1_0
dap_ops -> ag_ra_search_analytics_data_dev.dap_opsv1_0
dap_grant -> ag_ra_search_analytics_data_dev.dap_grantv1_0
dap_work -> ag_ra_search_analytics_data_dev.dap_workv1_0
dap_sort_ref -> ag_ra_search_analytics_data_dev.dap_sort_refv1_0


In [None]:

object SchemaResolver {

  private def getWidget(name: String, default: String): String = {
    try {
      val value = "" // dbutils.widgets.get(name)
      if (value == null || value.isEmpty) default else value
    } catch {
      case _: Throwable => default
    }
  }

  private val source_catalog = getWidget("source_catalog", "ag_content_ims_acs")
  private val source_environment = getWidget("source_environment", "prod")
  private val source_version = getWidget("source_version", "")

  private val target_catalog = getWidget("target_catalog", "ag_ra_search_analytics_data")
  private val target_environment = getWidget("target_environment", "dev")
  private val target_version = getWidget("target_version", "v1_0")

val dapSchemaBases = Seq(
      "dap_entity_wos",
      "dap_metrics_wos",
      "dap_entity_pprn",
      "dap_metrics_pprn",
      "dap_docs",
      "dap_reference",
      "dap_sort_ref",
      "dap_entity_enrich",
      "dap_grant",
      "dap_prod_core",
      "dap_ops",
      "dap_work"
  )

  private val acsSchemaBases= Seq(
      "gold_entity",
      "gold_wos",
      "gold_pprn"
  )

  val SCHEMA_MAP: Map[String, String] = {
    val srcBase = s"${source_catalog}_${source_environment}"
    val tgtBase = s"${target_catalog}_${target_environment}"
    val versionSuffix = if (source_version.isEmpty) "" else s"_${source_version}"
    val tgtVerSuffix = s"_${target_version}"

    // ACS schemas: key → base name (without version)
    val acsSchemas = Seq(
      "entity" -> s"$srcBase.gold_entity$versionSuffix",
      "wos"    -> s"$srcBase.gold_wos$versionSuffix",
      "pprn"   -> s"$srcBase.gold_pprn$versionSuffix"
    )

    // DAP schemas: key → base name (without target version suffix)
    val dapSchemas: Seq[(String, String)] =
      dapSchemaBases.map { key =>
        key -> s"$tgtBase.$key$tgtVerSuffix"
      }

    // Optional sandbox entry
    val sandboxEntry = Seq(
      "dap" -> s"$tgtBase.sandbox$tgtVerSuffix"
    )
    // Combine all
    (acsSchemas ++ dapSchemas ++ sandboxEntry).toMap
  }

  val SCHEMAS: Seq[String] = {
    val srcBase = s"${source_catalog}_${source_environment}"
    val tgtBase = s"${target_catalog}_${target_environment}"
    val versionSuffix = if (source_version.isEmpty) "" else s"_${source_version}"
    val tgtVerSuffix  = s"_${target_version}"

    // ACS schema names
    val acsSchemas = acsSchemaBases.map(name => s"$srcBase.$name$versionSuffix")

    // DAP schema names
    val dapSchemas = dapSchemaBases.map(baseName => s"$tgtBase.$baseName$tgtVerSuffix")

    acsSchemas ++ dapSchemas
  }

}


SchemaResolver.SCHEMA_MAP.foreach{ case (k,v) => println(s"$k -> $v") }
println("-----")
SchemaResolver.SCHEMAS.foreach{ schema => println(schema) }  




defined object SchemaResolver


dap_docs -> ag_ra_search_analytics_data_dev.dap_docs_v1_0
dap_entity_pprn -> ag_ra_search_analytics_data_dev.dap_entity_pprn_v1_0
pprn -> ag_content_ims_acs_prod.gold_pprn
dap_metrics_pprn -> ag_ra_search_analytics_data_dev.dap_metrics_pprn_v1_0
dap_entity_wos -> ag_ra_search_analytics_data_dev.dap_entity_wos_v1_0
dap_metrics_wos -> ag_ra_search_analytics_data_dev.dap_metrics_wos_v1_0
dap_entity_enrich -> ag_ra_search_analytics_data_dev.dap_entity_enrich_v1_0
wos -> ag_content_ims_acs_prod.gold_wos
dap -> ag_ra_search_analytics_data_dev.sandbox_v1_0
dap_reference -> ag_ra_search_analytics_data_dev.dap_reference_v1_0
entity -> ag_content_ims_acs_prod.gold_entity
dap_prod_core -> ag_ra_search_analytics_data_dev.dap_prod_core_v1_0
dap_ops -> ag_ra_search_analytics_data_dev.dap_ops_v1_0
dap_grant -> ag_ra_search_analytics_data_dev.dap_grant_v1_0
dap_work -> ag_ra_search_analytics_data_dev.dap_work_v1_0
dap_sort_ref -> ag_ra_search_analytics_data_dev.dap_sort_ref_v1_0
-----
ag_content_ims_a

In [1]:


val pipelinesDF = SchemaResolver.dapSchemaBases.toDF()





Unknown Error: <console>:23: error: not found: value SchemaResolver
       val pipelinesDF = SchemaResolver.dapSchemaBases.toDF()
                         ^
