In [0]:

# Select the catalog and database for this Spark session.
# NOTE(review): presumably this is what lets unqualified table names later in
# the notebook resolve against spark_catalog.default — confirm.
spark.sql("USE CATALOG spark_catalog")
spark.sql("USE DATABASE default")



+------------+--------+--------------------+-------------------+-------------+--------------------+-------------------+--------------------+-------------------+
|pub_year_for|    pmid|            fullname|               fore|         last|         affiliation|        orcid_final|        autor_concat|          author_id|
+------------+--------+--------------------+-------------------+-------------+--------------------+-------------------+--------------------+-------------------+
|        2020|29072512|    babak daneshfard|              babak|   daneshfard|student research ...|0000-0001-6729-9113|babak daneshfard|...|0000-0001-6729-9113|
|        2020|29294762|   martha k fahlgren|           martha k|     fahlgren|temple university...|0000-0001-9683-2079|martha k fahlgren...|0000-0001-9683-2079|
|        2020|29313423|javier robles-valero|             javier|robles-valero|centro de investi...|0000-0001-5218-0187|javier robles-val...|0000-0001-5218-0187|
|        2020|29334772|      ashwi

In [0]:
############# This is the final improved version ############

# ---------------------------------------------------------
# ✅ 1️⃣ Widgets and variables
# ---------------------------------------------------------
# Each widget declares a notebook parameter: (name, default value, label).
dbutils.widgets.text("source_table", "pubmed_affiliaciones", "Tabla RAW")
dbutils.widgets.text("ror_table", "unirdat.pubmed_db.n_affiliaciones", "Tabla ROR Delta")
dbutils.widgets.text("output_table", "unirdat.pubmed_db.m_autor_afiliacion_ror_normalized_final_1", "Tabla final")
dbutils.widgets.text("chunk_size", "50", "Autores por chunk")

source_table = dbutils.widgets.get("source_table")
ror_table = dbutils.widgets.get("ror_table")
output_table = dbutils.widgets.get("output_table")

# Widget values are always strings; convert and validate up front so that a
# bad value fails here with a clear message instead of deep inside the loop.
chunk_size_raw = dbutils.widgets.get("chunk_size")
try:
    chunk_size = int(chunk_size_raw)
except ValueError as exc:
    raise ValueError(f"chunk_size must be an integer, got {chunk_size_raw!r}") from exc
if chunk_size <= 0:
    # 0 would make range() raise, and a negative step would silently skip
    # every author while the notebook still reports success.
    raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

print(f"Source: {source_table}")
print(f"ROR: {ror_table}")
print(f"Output: {output_table}")
print(f"Chunk size: {chunk_size}")

# ---------------------------------------------------------
# ✅ 2️⃣ Librerías
# ---------------------------------------------------------
from pyspark.sql.functions import col, lower, trim, regexp_replace, monotonically_increasing_id
from pyspark.ml.feature import Tokenizer, HashingTF, MinHashLSH
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# ---------------------------------------------------------
# ✅ 3️⃣ Load base tables
# ---------------------------------------------------------
# NOTE(review): the source is restricted to one hard-coded author, which looks
# like a leftover debugging filter — confirm before running on the full table.
df_all = spark.table(source_table).filter(col("nombre_completo") =="eric rouchka" )
df_ror = spark.table(ror_table)

# Clean the ROR table only once: lowercase, strip every character that is not
# an ASCII letter, digit or space, then trim.
# Aliases made up solely of stripped characters collapse to "" rather than
# NULL, so empty strings are discarded as well; otherwise they would
# "exactly" match any affiliation that also cleans down to "".
df_ror_clean = df_ror.withColumn(
    "alias_clean",
    trim(regexp_replace(lower(col("aliases")), "[^a-zA-Z0-9 ]", ""))
).filter(col("alias_clean").isNotNull() & (col("alias_clean") != ""))

# ---------------------------------------------------------
# ✅ 4️⃣ Collect the unique authors
# ---------------------------------------------------------
# The distinct author names are brought to the driver so they can be sliced
# into chunks with plain Python list indexing.
autores = [row["nombre_completo"] for row in df_all.select("nombre_completo").distinct().collect()]
print(f"Total autores únicos: {len(autores)}")

# ---------------------------------------------------------
# ✅ 5️⃣ Process the authors chunk by chunk
# ---------------------------------------------------------
# Fuzzy-matching parameters (same values as the former inline literals).
HASHING_NUM_FEATURES = 2000
LSH_NUM_HASH_TABLES = 10
# approxSimilarityJoin keeps pairs whose Jaccard *distance* is below this
# threshold (distance = 1 - similarity).
FUZZY_MAX_JACCARD_DISTANCE = 0.8

# HashingTF is a stateless transformer, so one instance serves both sides.
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=HASHING_NUM_FEATURES)

# The ROR side of the fuzzy match does not depend on the chunk. It is built
# lazily (only when some chunk actually has unmatched affiliations) and then
# reused for every following chunk instead of being recomputed each time.
lsh_model = None
df_ror_hashed = None

try:
    for i in range(0, len(autores), chunk_size):
        batch = autores[i:i+chunk_size]
        print(f"\n=== Procesando autores {i} → {i+len(batch)-1}: {batch} ===")

        # ------------------------------------------
        # 🔹 5.1 Filter the chunk
        # ------------------------------------------
        df_affil_ini = df_all.filter(col("nombre_completo").isin(batch)) \
                             .select("nombre_completo", "affiliation").distinct()

        # ------------------------------------------
        # 🔹 5.2 Clean the text
        # ------------------------------------------
        # Same normalisation as the ROR aliases. Affiliations that clean down
        # to "" carry no information and are dropped together with NULLs.
        df_affil_clean = df_affil_ini.withColumn(
            "affiliation_clean",
            trim(regexp_replace(lower(col("affiliation")), "[^a-zA-Z0-9 ]", ""))
        ).filter(col("affiliation_clean").isNotNull() & (col("affiliation_clean") != ""))

        # Chunk-local identifier used to pick one best fuzzy match per row.
        df_affil_clean = df_affil_clean.withColumn("id_aff", monotonically_increasing_id())

        # ------------------------------------------
        # 🔹 5.3 Exact match
        # ------------------------------------------
        # Left join so that affiliations without an exact alias survive with
        # NULL ROR columns and can be routed to the fuzzy step.
        df_exact = df_affil_clean.join(
            df_ror_clean,
            df_affil_clean.affiliation_clean == df_ror_clean.alias_clean,
            "left"
        ).withColumn(
            "affiliation_normalized",
            col("name")
        ).withColumn(
            "ror_id",
            col("id")
        )

        df_exact_ok = df_exact.filter(col("ror_id").isNotNull()) \
            .select("nombre_completo", "id_aff", "affiliation", "ror_id", "affiliation_normalized")

        # Persisted because it is both counted and fed to the fuzzy join.
        df_no_match = df_exact.filter(col("ror_id").isNull()) \
            .select("id_aff", "nombre_completo", "affiliation", "affiliation_clean") \
            .persist()

        try:
            # ------------------------------------------
            # 🔹 5.4 Fuzzy match only when something is unmatched
            # ------------------------------------------
            if df_no_match.count() > 0:
                if lsh_model is None:
                    tokenizer_ror = Tokenizer(inputCol="alias_clean", outputCol="words")
                    df_ror_tokens = tokenizer_ror.transform(df_ror_clean)
                    df_ror_features = hashingTF.transform(df_ror_tokens)

                    lsh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=LSH_NUM_HASH_TABLES)
                    lsh_model = lsh.fit(df_ror_features)
                    # Pre-compute and cache the "hashes" column: the join
                    # reuses an existing output column instead of hashing
                    # the whole ROR table again for every chunk.
                    df_ror_hashed = lsh_model.transform(df_ror_features).persist()

                tokenizer = Tokenizer(inputCol="affiliation_clean", outputCol="words")
                df_affil_tokens = tokenizer.transform(df_no_match)
                df_affil_features = hashingTF.transform(df_affil_tokens)

                df_matches = lsh_model.approxSimilarityJoin(
                    df_affil_features,
                    df_ror_hashed,
                    FUZZY_MAX_JACCARD_DISTANCE,
                    distCol="JaccardDistance"
                ).select(
                    col("datasetA.id_aff"),
                    col("datasetA.nombre_completo"),
                    col("datasetA.affiliation"),
                    col("datasetB.id").alias("ror_id_fuzzy"),
                    col("datasetB.name").alias("name_fuzzy"),
                    col("JaccardDistance")
                )

                # Keep the closest ROR candidate per affiliation; the ROR id
                # is a tie-breaker so equal distances resolve the same way on
                # every run.
                w = Window.partitionBy("id_aff").orderBy(
                    col("JaccardDistance").asc(),
                    col("ror_id_fuzzy").asc()
                )
                df_best_matches = df_matches.withColumn(
                    "rn",
                    row_number().over(w)
                ).filter(col("rn") == 1).drop("rn")

                df_fuzzy_ok = df_best_matches.withColumnRenamed("ror_id_fuzzy", "ror_id") \
                    .withColumnRenamed("name_fuzzy", "affiliation_normalized") \
                    .select("nombre_completo", "id_aff", "affiliation", "ror_id", "affiliation_normalized")

            else:
                # Nothing left unmatched: build an empty, schema-compatible DF
                df_fuzzy_ok = spark.createDataFrame([], df_exact_ok.schema)

            # ------------------------------------------
            # 🔹 5.5 Combine exact + fuzzy
            # ------------------------------------------
            df_final = df_exact_ok.unionByName(df_fuzzy_ok)

            # ------------------------------------------
            # 🔹 5.6 Incremental save
            # ------------------------------------------
            # NOTE: append mode means re-running the notebook adds the same
            # rows again to the output table.
            df_final.write.mode("append").format("delta").option("mergeSchema", "true").saveAsTable(output_table)
        finally:
            # Release this chunk's cached rows even when the write fails, so
            # cached data does not pile up across iterations.
            df_no_match.unpersist()

        print(f"✅ Chunk {i} → {i+len(batch)-1} guardado OK")
finally:
    # Free the cached ROR hashes once all chunks are done (or on failure).
    if df_ror_hashed is not None:
        df_ror_hashed.unpersist()

print("\n🎉 Todos los chunks completados y guardados.")



Source: pubmed_affiliaciones
ROR: unirdat.pubmed_db.n_affiliaciones
Output: unirdat.pubmed_db.m_autor_afiliacion_ror_normalized_fina_2
Chunk size: 50
Total autores únicos: 1

=== Procesando autores 0 → 0: ['eric rouchka'] ===


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
--%sql DESCRIBE DETAIL unirdat.pubmed_db.m_autor_afiliacion_ror_normalized_final



format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics,clusterByAuto
delta,9281131e-9f42-4798-9007-da2619fd6d8e,unirdat.pubmed_db.m_autor_afiliacion_ror_normalized_final,,gs://databricks-4292703444164643-unitycatalog/4292703444164643/unirdat/pubmed_db/__unitystorage/schemas/65acd996-a3c6-48a3-a6b5-9a6cfac9ca2d/tables/da10c357-6864-4849-83f5-54f06d1878cf,2025-07-11T09:14:34.484Z,2025-07-11T11:27:53.308Z,List(),List(),2,38993,Map(delta.enableDeletionVectors -> true),3,7,"List(appendOnly, deletionVectors, invariants)","Map(numRowsDeletedByDeletionVectors -> 0, numDeletionVectors -> 0)",False


In [0]:
#df = spark.read.format("delta").option("versionAsOf", 7).table("unirdat.pubmed_db.m_autor_afiliacion_ror_normalized_final")
#print(df.count())



15877


In [0]:
# Restaurar tabla Delta a versión anterior
#spark.sql("RESTORE TABLE unirdat.pubmed_db.m_autor_afiliacion_ror_normalized_final TO VERSION AS OF 7")

DataFrame[table_size_after_restore: bigint, num_of_files_after_restore: bigint, num_removed_files: bigint, num_restored_files: bigint, removed_files_size: bigint, restored_files_size: bigint]