#### 1. deepCloneSchemas

In [0]:

/**
  * Deep-clones every table listed for a source schema into a target schema.
  *
  * Table names are read from `<srcCatalog>.information_schema.tables`, then one
  * `CREATE OR REPLACE TABLE ... DEEP CLONE ...` statement is issued per table, so an
  * existing target table with the same name is replaced.
  *
  * NOTE(review): `information_schema.tables` presumably lists views as well; CLONE is
  * expected to fail on those and abort the loop -- confirm and filter on `table_type` if so.
  *
  * @param srcCatalog catalog that holds the source schema
  * @param srcSchema  schema whose tables are cloned
  * @param tgtCatalog catalog that receives the clones
  * @param tgtSchema  schema that receives the clones
  */
def deepCloneSchemas(srcCatalog: String, srcSchema: String, tgtCatalog: String, tgtSchema: String): Unit = {
  // Backtick-quote identifiers so names containing hyphens, spaces or reserved words still parse.
  def quoted(identifier: String): String = "`" + identifier.replace("`", "``") + "`"

  // 1. Get list of tables from source schema
  val tables = spark.sql(s"""
    SELECT table_name
    FROM ${quoted(srcCatalog)}.information_schema.tables
    WHERE table_schema = '${srcSchema}'
  """).collect().map(_.getString(0))

  // 2. Loop and deep clone each table
  tables.foreach { tableName =>
    val cloneSql =
      s"""
        |CREATE OR REPLACE TABLE ${quoted(tgtCatalog)}.${quoted(tgtSchema)}.${quoted(tableName)}
        |DEEP CLONE ${quoted(srcCatalog)}.${quoted(srcSchema)}.${quoted(tableName)}
      """.stripMargin

    println(s"Cloning table: $tableName")
    spark.sql(cloneSql)
  }

  println("Deep clone completed for all tables.")
}


defined [32mfunction[39m [36mdeep_clone[39m

#### 2. shallowCloneSchemas

In [0]:

/**
  * Shallow-clones every table listed for a source schema into a target schema.
  *
  * Table names are read from `<srcCatalog>.information_schema.tables`, then one
  * `CREATE OR REPLACE TABLE ... SHALLOW CLONE ...` statement is issued per table, so an
  * existing target table with the same name is replaced.
  *
  * NOTE(review): `information_schema.tables` presumably lists views as well; CLONE is
  * expected to fail on those and abort the loop -- confirm and filter on `table_type` if so.
  *
  * @param srcCatalog catalog that holds the source schema
  * @param srcSchema  schema whose tables are cloned
  * @param tgtCatalog catalog that receives the clones
  * @param tgtSchema  schema that receives the clones
  */
def shallowCloneSchemas(srcCatalog: String, srcSchema: String, tgtCatalog: String, tgtSchema: String): Unit = {
  // Backtick-quote identifiers so names containing hyphens, spaces or reserved words still parse.
  def quoted(identifier: String): String = "`" + identifier.replace("`", "``") + "`"

  // 1. Get list of tables in the source schema
  val tables = spark.sql(s"""
    SELECT table_name
    FROM ${quoted(srcCatalog)}.information_schema.tables
    WHERE table_schema = '${srcSchema}'
  """).collect().map(_.getString(0))

  // 2. Loop through tables and perform SHALLOW CLONE
  tables.foreach { tableName =>
    val cloneSql =
      s"""
        |CREATE OR REPLACE TABLE ${quoted(tgtCatalog)}.${quoted(tgtSchema)}.${quoted(tableName)}
        |SHALLOW CLONE ${quoted(srcCatalog)}.${quoted(srcSchema)}.${quoted(tableName)}
      """.stripMargin

    println(s"Shallow cloning table: $tableName")
    spark.sql(cloneSql)
  }

  println("Shallow clone completed for all tables.")
}


#### 3. createVersionedSchemas

In [0]:

/**
  * Creates one versioned schema per entry of `schemaNames`, if it does not exist yet.
  *
  * Each schema is named `<schema>_<version>` and is created in the catalog
  * `ag_ra_search_analytics_data_<env>`.
  *
  * @param schemaNames base schema names (without the version suffix)
  * @param version     version suffix appended after an underscore, e.g. "v1_0"
  * @param env         environment suffix of the catalog name, e.g. "dev"
  * @param spark       session used to run the DDL statements
  */
def createVersionedSchemas(
    schemaNames: List[String],
    version: String,
    env: String
)(implicit spark: SparkSession): Unit = {

  // Backtick-quote identifiers so names containing hyphens, spaces or reserved words still parse.
  def quoted(identifier: String): String = "`" + identifier.replace("`", "``") + "`"

  // Build catalog name based on env
  val catalog = s"ag_ra_search_analytics_data_${env}"

  // Loop through schema list and create each versioned schema
  schemaNames.foreach { schema =>
    val versionedSchema = s"${schema}_${version}"
    val fullSchemaName = s"${catalog}.${versionedSchema}"

    // No trailing ';': spark.sql takes a single statement and the terminator is
    // not needed (parsers before Spark 3.0 reject it).
    val sql = s"CREATE SCHEMA IF NOT EXISTS ${quoted(catalog)}.${quoted(versionedSchema)}"

    println(s"Creating schema: $fullSchemaName")
    spark.sql(sql)
  }

  println(s"✅ Completed creating versioned schemas for environment: $env")
}


#### 4. dropVersionedSchemas

In [0]:
/**
  * Drops the versioned schemas `<schema>_<version>` from the catalog
  * `ag_ra_search_analytics_data_<env>`.
  *
  * When `dropTable` is true, every relation listed for the schema in
  * `information_schema.tables` is dropped individually first. The schema itself is
  * always dropped with `CASCADE`, whether or not `dropTable` is set.
  *
  * @param schemaNames base schema names (without the version suffix)
  * @param version     version suffix appended after an underscore, e.g. "v1_0"
  * @param env         environment suffix of the catalog name, e.g. "dev"
  * @param dropTable   drop each listed table/view explicitly before dropping the schema
  * @param spark       session used to run the statements
  */
def dropVersionedSchemas(
    schemaNames: List[String],
    version: String,
    env: String,
    dropTable: Boolean = false
)(implicit spark: SparkSession): Unit = {

  // Backtick-quote identifiers so names containing hyphens, spaces or reserved words still parse.
  def quoted(identifier: String): String = "`" + identifier.replace("`", "``") + "`"

  val catalog = s"ag_ra_search_analytics_data_${env}"

  schemaNames.foreach { schema =>
    val versionedSchema = s"${schema}_${version}"
    val fullSchema = s"${catalog}.${versionedSchema}"
    val quotedSchema = s"${quoted(catalog)}.${quoted(versionedSchema)}"

    println(s"Processing drop for schema: $fullSchema")

    // 1. Fetch all tables (and their type) in this schema
    if (dropTable) {
      val relations = spark.sql(
        s"""
          |SELECT table_name, table_type
          |FROM ${quoted(catalog)}.information_schema.tables
          |WHERE table_schema = '${versionedSchema}'
        """.stripMargin
      ).collect().map(row => (row.getString(0), row.getString(1)))

      // 2. Drop each relation; a view must be dropped with DROP VIEW because
      //    DROP TABLE raises an error when the target is a view.
      relations.foreach { case (table, tableType) =>
        val fullTableName = s"$fullSchema.$table"
        val objectKind = if ("VIEW".equalsIgnoreCase(tableType)) "VIEW" else "TABLE"
        println(s"Dropping table: $fullTableName")
        spark.sql(s"DROP $objectKind IF EXISTS $quotedSchema.${quoted(table)}")
      }
    }

    // 3. Drop the schema itself
    println(s"Dropping schema: $fullSchema")
    spark.sql(s"DROP SCHEMA IF EXISTS $quotedSchema CASCADE")
  }

  println(s"✅ Completed dropping versioned schemas for env: $env")
}


#### 5. Define Lists of Schemas (DAP and ACS)

In [None]:
// Base names (no version suffix) of the DAP schemas.
val schemas: List[String] =
  List(
    "dap_entity_wos",
    "dap_metrics_wos",
    "dap_entity_pprn",
    "dap_metrics_pprn",
    "dap_docs",
    "dap_reference",
    "dap_sort_ref",
    "dap_entity_enrich",
    "dap_prod_core",
    "dap_ops",
    "dap_work"
  )

// Base names of the ACS schemas.
val acsSchemas: List[String] =
  List("gold_entity", "gold_wos", "gold_pprn")

#### 6. Run createVersionedSchemas for DAP

In [0]:


// Create the v1_0 copy of every DAP schema in the dev catalog.
createVersionedSchemas(schemaNames = schemas, version = "v1_0", env = "dev")

#### 7. Run dropVersionedSchemas for DAP

In [0]:

// Drop the v1_0 copy of every DAP schema from the dev catalog
// (dropTable keeps its default value).
dropVersionedSchemas(schemaNames = schemas, version = "v1_0", env = "dev")


#### 8. Run deep_clone for DAP

In [None]:

// Init: wall-clock timer for the whole run.
val start = System.nanoTime()


val src_env = "dev"
val tgt_env = "dev"
val src_version = "v1_0"
val tgt_version = "v1_1"

// The `s` interpolator is required here; without it the catalog names would
// contain the literal text "${src_env}" / "${tgt_env}".
val src_catalog = s"ag_ra_search_analytics_data_${src_env}"
val tgt_catalog = s"ag_ra_search_analytics_data_${tgt_env}"


// Deep-clones every table of schema `src` (in src_catalog) into schema `tgt` (in tgt_catalog).
def processSchemas(src: String, tgt: String): Unit = {
  println(s"Processing: source = $src, target = $tgt")
  deepCloneSchemas(src_catalog, src, tgt_catalog, tgt)
}


schemas.foreach { schema =>
  // Per-schema timer, named differently so it does not shadow the overall `start`.
  val schemaStart = System.nanoTime()
  processSchemas(s"${schema}_${src_version}", s"${schema}_${tgt_version}")
  println(s"Time taken: ${(System.nanoTime() - schemaStart) / 1e9} seconds")
}

println("All schemas processed.")
println(s"Time taken: ${(System.nanoTime() - start) / 1e9} seconds")




#### 9. Run deep clone for a List of Specified Schemas

In [0]:

// Init: wall-clock timer for the whole run.
val start = System.nanoTime()


// (source schema, target schema) pairs; both sides live in `catalog` below.
val schemaPairs: List[(String, String)] = List(
  ("dap_entity_wos",    "dap_entity_wos_v1_0"),
  ("dap_metrics_wos",   "dap_metrics_wos_v1_0"),
  ("dap_entity_pprn",   "dap_entity_pprn_v1_0"),
  ("dap_metrics_pprn",  "dap_metrics_pprn_v1_0"),
  ("reference_data_v1_0", "dap_reference_v1_0"),
  ("dap_work",          "dap_work_v1_0")
)

val catalog = "ag_ra_search_analytics_data_dev"

// Deep-clones every table of schema `src` into schema `tgt`, both inside `catalog`.
def processSchemas(src: String, tgt: String): Unit = {
  println(s"Processing: source = $src, target = $tgt")
  // The clone helper defined in section 1 of this notebook is named deepCloneSchemas.
  deepCloneSchemas(catalog, src, catalog, tgt)
}

schemaPairs.foreach { case (srcSchema, tgtSchema) =>
  processSchemas(srcSchema, tgtSchema)
}

println(s"Time taken: ${(System.nanoTime() - start) / 1e9} seconds")


Processing: source = dap_entity_wos, target = dap_entity_wos_v1_0
Cloning table: authorprofile
Cloning table: metadata
Cloning table: dept_article
Cloning table: funding_agency
Cloning table: jcr_metrics_more
Cloning table: jcr_metrics
Cloning table: incites_assemble
Cloning table: category
Cloning table: category_article_metrics
Cloning table: org_article
Cloning table: wos
Cloning table: en_grants
Cloning table: funding_agency_article_metrics
Cloning table: ap_article
Cloning table: authorprofile_assemble
Cloning table: region_test
Cloning table: patents
Cloning table: region
Cloning table: region_article_metrics
Cloning table: organization
Cloning table: journal
Deep clone completed for all tables.
Processing: source = dap_metrics_wos, target = dap_metrics_wos_v1_0
Cloning table: funding_agency_article_metrics
Cloning table: en_research_topics
Cloning table: en_societal_facet
Cloning table: en_societal_impact
Cloning table: article_normalized_metrics
Cloning table: category_metrics


[36mstart[39m: [32mLong[39m = [32m576002314729L[39m
[36mschemaPairs[39m: [32mList[39m[([32mString[39m, [32mString[39m)] = [33mList[39m(
  ([32m"dap_entity_wos"[39m, [32m"dap_entity_wos_v1_0"[39m),
  ([32m"dap_metrics_wos"[39m, [32m"dap_metrics_wos_v1_0"[39m),
  ([32m"dap_entity_pprn"[39m, [32m"dap_entity_pprn_v1_0"[39m),
  ([32m"dap_metrics_pprn"[39m, [32m"dap_metrics_pprn_v1_0"[39m),
  ([32m"reference_data_v1_0"[39m, [32m"dap_reference_v1_0"[39m),
  ([32m"dap_work"[39m, [32m"dap_work_v1_0"[39m)
)
defined [32mfunction[39m [36mprocessSchemas[39m

#### 10. Run deep_clone for Env

In [0]:

// (source schema, target schema) pairs to copy from dev into the other environments.
val schemaPairs: List[(String, String)] = List(
  ("reference_data_v1_0",     "dap_reference_v1_0")
)

// Deep-clones schema `src` of the dev catalog into schema `tgt` of both the uat and
// the preprod catalogs.
// NOTE(review): this cell only defines the helper; nothing here invokes it for
// `schemaPairs` -- confirm whether that is intentional before adding the loop.
def processSchemas(src: String, tgt: String): Unit = {
  println(s"Processing: source = $src, target = $tgt")
  // The clone helper defined in section 1 of this notebook is named deepCloneSchemas.
  deepCloneSchemas("ag_ra_search_analytics_data_dev", src, "ag_ra_search_analytics_data_uat", tgt)
  deepCloneSchemas("ag_ra_search_analytics_data_dev", src, "ag_ra_search_analytics_data_preprod", tgt)
}



#### 11.  listFirstTwoColumns 

In [None]:
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
  * Lists the first two column names of every table in a schema.
  *
  * @param schemaName schema to inspect, as accepted by `SHOW TABLES IN`
  *                   (it is also used as the prefix when each table is loaded)
  * @param spark      session used to run the queries
  * @return a DataFrame with the columns `schema`, `table_name`, `first_column` and
  *         `second_column`; a column entry is null when the table has fewer columns
  */
def listFirstTwoColumns(schemaName: String)(implicit spark: SparkSession): DataFrame = {

  // Get all table names in the schema.
  // - Temporary views are excluded: SHOW TABLES lists them too, but they cannot be
  //   resolved as `<schemaName>.<name>` below.
  // - Values are read straight from the rows, so no implicit Encoder
  //   (spark.implicits._) is needed.
  val tables = spark.sql(s"SHOW TABLES IN $schemaName")
    .filter("isTemporary = false")
    .select("tableName")
    .collect()
    .map(_.getString(0))

  // Extract first and second column for each table; Option becomes a nullable column.
  val results = tables.map { tableName =>
    val cols = spark.table(s"$schemaName.$tableName").columns
    (schemaName, tableName, cols.headOption, cols.lift(1))
  }

  // Convert to DataFrame
  spark.createDataFrame(results.toSeq)
    .toDF("schema", "table_name", "first_column", "second_column")
}


#### 12.  Run listFirstTwoColumns for ACS

In [None]:


val env = "dev"
val version = "v1_0"
val catalog = s"ag_content_ims_${env}"


acsSchemas.foreach { schema =>
  val schemaName = s"${catalog}.${schema}"
 // val schemaName = s"${catalog}.${schema}_${version}"

  // Print the result: the returned DataFrame was previously discarded, so the
  // cell produced no visible output.
  listFirstTwoColumns(schemaName).show(1000, truncate = false)
}