#### 1. simulateDeltaChanges

In [None]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import scala.util.Random

/**
 * Simulate CRUD operations on a Delta table (generic for any schema) and
 * overwrite the table with the result.
 *
 * Up to `2 * count` rows with a non-null primary key are sampled ONCE and
 * collected to the driver, then split into disjoint update and delete sets, so
 * a row is never both updated and deleted. Keep `count` small: the sampled
 * rows are held in driver memory. Primary key values are assumed to be unique.
 *
 * Values are generated per column type (string, short, int, long, float,
 * double, boolean), so the table schema is preserved. Columns of any other
 * type are left unchanged on update and copied from a sampled row on insert.
 *
 * New primary keys are random UUIDs for string keys and `max(key) + 1..count`
 * for int/long keys.
 *
 * @param tablePath        Path or table name of the Delta table. If the catalog
 *                         reports a table with this name it is read/written as
 *                         a table; otherwise it is treated as a storage path.
 * @param primaryKey       Primary key column for identifying rows
 * @param count            Number of rows to ADD, UPDATE, DELETE. Fewer rows are
 *                         updated/deleted when the table has less than
 *                         `2 * count` rows with a non-null key.
 * @param spark            Active Spark session
 * @throws IllegalArgumentException if `count` is negative, the table is empty,
 *                                  the key column is missing, no row has a
 *                                  non-null key, or the key type is not
 *                                  string/int/long
 */
def simulateDeltaChanges(
    tablePath: String,
    primaryKey: String,
    count: Int = 10
)(implicit spark: SparkSession): Unit = {
  // Local imports keep this cell self-contained.
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.types._

  require(count >= 0, s"count must be non-negative, got $count")

  // -------------------------------------------
  // 1. Read current table (catalog table or path)
  // -------------------------------------------
  // tableExists may throw when given a storage path; treat that as "not a table".
  val isCatalogTable =
    scala.util.Try(spark.catalog.tableExists(tablePath)).getOrElse(false)

  val df: DataFrame =
    if (isCatalogTable) spark.table(tablePath)
    else spark.read.format("delta").load(tablePath)

  // head(1) avoids a full count just to test for emptiness.
  require(df.head(1).nonEmpty, "Table must not be empty")
  require(df.columns.contains(primaryKey), s"Primary key $primaryKey not found")

  val schema = df.schema
  val pkIndex = schema.fieldIndex(primaryKey)
  val pkType = schema(pkIndex).dataType
  val keyCol = col(primaryKey)

  // Random replacement value for a column type; None means "leave as is".
  def randomValue(dataType: DataType, prefix: String): Option[Any] = dataType match {
    case StringType  => Some(s"${prefix}_${Random.nextInt(10000)}")
    case ShortType   => Some(Random.nextInt(10000).toShort)
    case IntegerType => Some(Random.nextInt(10000))
    case LongType    => Some(Random.nextInt(10000).toLong)
    case FloatType   => Some(Random.nextInt(10000).toFloat)
    case DoubleType  => Some(Random.nextInt(10000).toDouble)
    case BooleanType => Some(Random.nextBoolean())
    case _           => None
  }

  // Copy of `row` with randomised non-key columns and, optionally, a new key.
  def mutate(row: Row, prefix: String, newKey: Option[Any]): Row = {
    val values = schema.fields.zipWithIndex.map { case (field, i) =>
      if (i == pkIndex) newKey.getOrElse(row.get(i))
      else randomValue(field.dataType, prefix).getOrElse(row.get(i))
    }
    Row.fromSeq(values.toSeq)
  }

  // Builds a DataFrame from driver-side rows using the table's own schema.
  def toDf(rows: Seq[Row]): DataFrame =
    spark.createDataFrame(java.util.Arrays.asList(rows: _*), schema)

  // -------------------------------------------
  // 2. Pick rows for UPDATE + DELETE (one sample, disjoint halves)
  // -------------------------------------------
  // Collecting once pins the random selection; re-evaluating a rand()-ordered
  // plan is not guaranteed to return the same rows.
  val sampled: Array[Row] =
    if (count == 0) Array.empty[Row]
    else df.filter(keyCol.isNotNull).orderBy(rand()).limit(2 * count).collect()

  require(
    count == 0 || sampled.nonEmpty,
    s"No rows with a non-null $primaryKey to sample"
  )

  // When the table is short, split what we have roughly evenly.
  val updateQuota = math.min(count, (sampled.length + 1) / 2)
  val (rowsForUpdate, remainder) = sampled.splitAt(updateQuota)
  val rowsForDelete = remainder.take(count)

  // -------------------------------------------
  // 3. UPDATE simulation: modify values, keep the key
  // -------------------------------------------
  val updatedRows = rowsForUpdate.map(mutate(_, "updated", None))
  val updateKeys = rowsForUpdate.map(_.get(pkIndex)).toSet

  // -------------------------------------------
  // 4. DELETE simulation: just collect keys
  // -------------------------------------------
  val deleteKeys = rowsForDelete.map(_.get(pkIndex)).toSet

  // -------------------------------------------
  // 5. ADD simulation: create new records
  // -------------------------------------------
  def currentMaxKey: Long =
    df.agg(max(keyCol.cast(LongType))).head().getLong(0)

  val newKeys: Seq[Any] =
    if (count == 0) Seq.empty
    else pkType match {
      case StringType =>
        Seq.fill(count)(java.util.UUID.randomUUID().toString)
      case IntegerType =>
        val base = currentMaxKey
        // toIntExact/addExact fail loudly instead of wrapping on overflow.
        (1 to count).map(i => Math.toIntExact(Math.addExact(base, i.toLong)))
      case LongType =>
        val base = currentMaxKey
        (1 to count).map(i => Math.addExact(base, i.toLong))
      case other =>
        throw new IllegalArgumentException(
          s"Unsupported primary key type $other for column $primaryKey " +
            "(supported: string, int, long)"
        )
    }

  // Sampled rows act as templates so columns of unsupported types still hold
  // valid values; templates are reused cyclically if there are fewer than count.
  val insertedRows = newKeys.zipWithIndex.map { case (key, i) =>
    mutate(sampled(i % sampled.length), "new", Some(key))
  }

  // -------------------------------------------
  // 6. Build final dataframe for write-back
  // -------------------------------------------
  val touchedKeys = (updateKeys ++ deleteKeys).toSeq

  // Keep null-key rows explicitly: `!isin(...)` evaluates to NULL for them and
  // would otherwise drop them from the table.
  val untouched =
    if (touchedKeys.isEmpty) df
    else df.filter(keyCol.isNull || !keyCol.isin(touchedKeys: _*))

  val finalDf = untouched
    .union(toDf(updatedRows.toSeq))
    .union(toDf(insertedRows))

  // -------------------------------------------
  // 7. Overwrite table with new data
  // -------------------------------------------
  val writer = finalDf.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")

  if (isCatalogTable) writer.saveAsTable(tablePath) else writer.save(tablePath)

  println(s"✔ Simulation complete on $tablePath")
  println(s"Updated: ${updateKeys.size}, Deleted: ${deleteKeys.size}, Inserted: ${insertedRows.size}")
}


#### 2. simulateDeltaChangesSchemas

In [None]:

/**
 * Run `simulateDeltaChanges` against every table listed for a schema in the
 * catalog's `information_schema.tables`.
 *
 * The first failing table aborts the run: exceptions from
 * `simulateDeltaChanges` are not caught here.
 *
 * NOTE(review): the listing is not filtered by table type, so views in the
 * schema would be included as well — confirm whether that is intended.
 *
 * @param catalog     Catalog that owns the schema
 * @param schema      Schema whose tables are processed
 * @param count       Number of rows to add, update and delete per table
 * @param primaryKey  Primary key column assumed for every table (default "id")
 */
def simulateDeltaChangesSchemas(
    catalog: String,
    schema: String,
    count: Int = 10,
    primaryKey: String = "id"
): Unit = {
  // 1. Get list of tables from source schema
  val tables = spark.sql(s"""
    SELECT table_name
    FROM ${catalog}.information_schema.tables
    WHERE table_schema = '${schema}'
  """).collect().map(_.getString(0))

  // 2. Loop and simulate changes on each table
  tables.foreach { tableName =>
    val tb = s"${catalog}.${schema}.${tableName}"

    // The session is passed explicitly so this does not depend on an implicit
    // SparkSession being defined elsewhere in the notebook.
    simulateDeltaChanges(
      tablePath = tb,
      primaryKey = primaryKey,
      count = count
    )(spark)

    println(s"simulateDeltaChanges table: $tb")
  }

  println("simulateDeltaChanges completed for all tables.")
}


#### 3. ACS Schema

In [None]:
// Base names of the ACS schemas to process.
val acsSchemas: List[String] =
  "gold_entity" :: "gold_wos" :: "gold_pprn" :: Nil

#### 4. Run simulateDeltaChangesSchemas for ACS

In [None]:


// Target environment and schema version; together they select the catalog
// and the versioned schema names.
val env = "dev"
val version = "v1_0"
val catalog = s"ag_ra_search_analytics_data_${env}"

// Run the simulation for every ACS schema and report wall-clock time for each.
for (baseSchema <- acsSchemas) {
  val sourceSchema = s"${baseSchema}_${version}"
  println(s"Processing: source = $sourceSchema")

  val startedAt = System.nanoTime()
  simulateDeltaChangesSchemas(catalog, sourceSchema, count = 10)
  val elapsedSeconds = (System.nanoTime() - startedAt) / 1e9

  println(s"Time taken: ${elapsedSeconds} seconds")
}


#### 5. Test simulateDeltaChanges

In [None]:

// Manual smoke test against a single table: requests 10 inserts, 10 updates
// and 10 deletes. Replace "schema.table" with the table to exercise.
simulateDeltaChanges(tablePath = "schema.table", primaryKey = "id", count = 10)
