In [0]:

# Select the catalog and database for this Spark session before any table is read or written.
spark.sql("USE CATALOG spark_catalog")
spark.sql("USE DATABASE default")



In [0]:
from pyspark.sql.functions import col, lower, trim, regexp_replace, udf, sha2
import unicodedata

# ----------------------------------------------------------
# Extended dictionary of common PubMed affiliation aliases.
# Every key is a lowercase, accent-free string. apply_pubmed_aliases looks a
# normalized affiliation up here by exact match and returns the mapped value.

# International universities and their variants
_UNIVERSITY_ALIASES = {
    "harvard university": "harvard",
    "harvard medical school": "harvard",
    "massachusetts institute of technology": "mit",
    "university college london": "ucl",
    "university of malaga": "uma",
    "university of california san francisco": "ucsf",
    "university of california berkeley": "uc berkeley",
    "university of oxford": "oxford",
    "university of cambridge": "cambridge",
    "stanford university": "stanford",
    "johns hopkins university": "johns hopkins",
    "university of toronto": "toronto",
    "university of washington": "uw",
    "university of chicago": "uchicago",
    "university of pennsylvania": "upenn",
    "university of british columbia": "ubc",
    "university of melbourne": "melbourne",
    "university of sydney": "sydney",
}

# Well-known institutes and hospitals
_INSTITUTE_AND_HOSPITAL_ALIASES = {
    "national institutes of health": "nih",
    "national cancer institute": "nci",
    "memorial sloan kettering cancer center": "mskcc",
    "massachusetts general hospital": "mgh",
    "brigham and women's hospital": "bwh",
    "mayo clinic": "mayo",
    "cleveland clinic": "cleveland clinic",
    "johns hopkins hospital": "johns hopkins",
}

# Common abbreviation variants
_ABBREVIATION_EXPANSIONS = {
    "univ": "university",
    "dept": "department",
    "inst": "institute",
    "med sch": "medical school",
    "sch med": "school of medicine",
    "sch": "school",
    "hospital": "hospital",
    "hosp": "hospital",
}

# Specific examples for Spain and Latin America
_SPAIN_LATAM_ALIASES = {
    "universidad complutense de madrid": "ucm",
    "universidad autonoma de barcelona": "uab",
    "universidad de barcelona": "ub",
    "universidad de valencia": "uv",
    "universidad de sevilla": "us",
    "universidad de granada": "ugr",
    "universidad de buenos aires": "uba",
    "universidad nacional autonoma de mexico": "unam",
}

# Others
_OTHER_ALIASES = {
    "max planck institute": "max planck",
    "fraunhofer institute": "fraunhofer",
    "cnrs": "cnrs",
    "cancer research uk": "cruk",
}

# Merged in the order the groups are listed above; no key appears in more than one group.
PUBMED_REPLACEMENTS = {
    **_UNIVERSITY_ALIASES,
    **_INSTITUTE_AND_HOSPITAL_ALIASES,
    **_ABBREVIATION_EXPANSIONS,
    **_SPAIN_LATAM_ALIASES,
    **_OTHER_ALIASES,
}

# ----------------------------------------------------------
# 1️⃣ Accent-stripping helper (wrapped as a UDF below)
def remove_accents(text):
    """Return `text` with diacritics removed.

    The string is decomposed with Unicode NFKD normalization and every
    combining character is dropped; all other characters are kept in order.

    Args:
        text: Input string, or None.

    Returns:
        The stripped string, or None when `text` is None.
    """
    if text is None:
        return None
    kept = []
    for ch in unicodedata.normalize('NFKD', text):
        # combining() is non-zero for combining marks (the detached accents).
        if unicodedata.combining(ch):
            continue
        kept.append(ch)
    return ''.join(kept)

# Spark UDF wrapper around remove_accents so it can be applied to DataFrame columns.
remove_accents_udf = udf(remove_accents)

# ----------------------------------------------------------
# 2️⃣ Exact-match alias replacement (wrapped as a UDF below)
def apply_pubmed_aliases(text):
    """Map an affiliation string to its alias in PUBMED_REPLACEMENTS.

    The input is trimmed, lowercased and stripped of diacritics (NFKD
    decomposition with combining characters dropped). The resulting string is
    looked up in PUBMED_REPLACEMENTS by exact match on the whole string.

    Args:
        text: Affiliation string, or None.

    Returns:
        The mapped alias when the normalized string is a key of
        PUBMED_REPLACEMENTS, otherwise the normalized string itself.
        None when `text` is None.
    """
    if text is None:
        return None
    decomposed = unicodedata.normalize('NFKD', text.strip().lower())
    text_norm = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Fall back to the normalized text when no alias is registered for it.
    return PUBMED_REPLACEMENTS.get(text_norm, text_norm)

# Spark UDF wrapper around apply_pubmed_aliases so it can be applied to DataFrame columns.
alias_dict_udf = udf(apply_pubmed_aliases)

# ----------------------------------------------------------
# 3️⃣ Full normalization and hashing pipeline
#
# Step order matters: the alias dictionary is applied BEFORE stopword removal.
# Most alias keys contain words such as "university", "hospital" or "research";
# if those words are stripped first, the exact-match lookup can never hit them
# (e.g. "university of oxford" would already be "of oxford").
df_aff = spark.table("default.pubmed_art_autor_aff")

# Work on the distinct raw affiliation strings only, so each one is normalized once.
df_aff_unique = df_aff.select("affiliation").distinct()


def _collapse_whitespace(column):
    """Collapse every whitespace run in `column` to one space and trim both ends."""
    return trim(regexp_replace(column, r"\s+", " "))


# Strip accents (Python UDF) and lowercase.
df_norm = df_aff_unique.withColumn(
    "affiliation_normalizada",
    lower(remove_accents_udf(col("affiliation")))
)

# Replace punctuation with spaces.
df_norm = df_norm.withColumn(
    "affiliation_normalizada",
    regexp_replace(col("affiliation_normalizada"), r"[.,()\-\/]", " ")
)

# Drop digits.
df_norm = df_norm.withColumn(
    "affiliation_normalizada",
    regexp_replace(col("affiliation_normalizada"), r"\d+", "")
)

# Collapse whitespace so the cleaned text can match the single-spaced alias keys.
df_norm = df_norm.withColumn(
    "affiliation_normalizada",
    _collapse_whitespace(col("affiliation_normalizada"))
)

# Apply the alias dictionary (exact match on the whole cleaned string).
df_norm = df_norm.withColumn(
    "affiliation_normalizada",
    alias_dict_udf(col("affiliation_normalizada"))
)

# Remove common stopwords.
df_norm = df_norm.withColumn(
    "affiliation_normalizada",
    regexp_replace(
        col("affiliation_normalizada"),
        r"\b(department|dept|faculty|school|institute|inst|university|univ|universidad|college|center|centre|laboratory|lab|research|hospital|clinic|group|unit|division|section|chair|facultad)\b",
        ""
    )
)

# Stopword removal leaves gaps behind; collapse and trim again.
# NOTE(review): an affiliation made only of stopwords/punctuation/digits ends up
# as "" here, so all such rows share one institution_id — confirm this is acceptable.
df_norm = df_norm.withColumn(
    "affiliation_normalizada",
    _collapse_whitespace(col("affiliation_normalizada"))
)

# SHA-256 of the normalized text is used as institution_id.
df_norm = df_norm.withColumn(
    "institution_id",
    sha2(col("affiliation_normalizada"), 256)
)


df_norm.select(
    "affiliation",
    "affiliation_normalizada",
    "institution_id"
).show(truncate=False)


# Raw affiliation -> normalized text -> institution_id mapping.
df_norm.write \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable("default.pubmed_aff_normalizadas")


# One row per distinct (normalized affiliation, institution_id) pair.
df_unique = df_norm.select(
    "affiliation_normalizada",
    "institution_id"
).distinct()

# Save as a Delta table.
df_unique.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("unirdat.pubmed_db.m_afiliacion")



+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+
|affiliation                                                                                                                                                                                                                                                                                                                                |af

In [0]:
%sql 
-- Sanity check: list normalized affiliations that appear more than once in m_afiliacion.
SELECT affiliation_normalizada, count(1)
FROM unirdat.pubmed_db.m_afiliacion
GROUP BY affiliation_normalizada
HAVING count(1) > 1

affiliation_normalizada,count(1)


In [0]:



# Author/article rows, aliased so their columns can be selected as "aut.*" after the join.
df_autores = spark.table("default.pubmed_art_autor_aff").alias("aut")

# Raw affiliation -> institution_id mapping, aliased as "aff".
df_afiliaciones = spark.table("default.pubmed_aff_normalizadas").alias("aff")

# Left join on the raw affiliation string, so author rows without a match are kept;
# distinct() then drops rows that are identical across every joined column.
same_affiliation = df_autores["affiliation"] == df_afiliaciones["affiliation"]
df_autores_enriquecido = df_autores.join(
    df_afiliaciones, on=same_affiliation, how="left"
).distinct()

df_autores_enriquecido.show()

# Keep every original author column and add only institution_id from the mapping.
df_art_aut_aff = df_autores_enriquecido.select(
    col("aut.*"),
    col("aff.institution_id"),
)

# Persist the enriched table as Delta, replacing any previous contents.
(
    df_art_aut_aff.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("unirdat.pubmed_db.m_articulo_autor_affiliation")
)

+------------+--------+--------------------+------------+---------------+--------------------+-------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+
|pub_year_for|    pmid|            fullname|        fore|           last|         affiliation|        orcid_final|        autor_concat|           author_id|         affiliation|affiliation_normalizada|      institution_id|
+------------+--------+--------------------+------------+---------------+--------------------+-------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+
|        2020|27820722| jonathan d burlison|  jonathan d|       burlison|from the departme...|               NULL|jonathan d burlis...|ed63c66f3d5cf22b5...|from the departme...|   from the of pharm...|f9b8f89d45f4ac043...|
|        2020|27381879| jérôme rene lechien| jérôme rene|        lechien|epicura hospital,...|              