In [0]:
from pyspark.sql.functions import col, when, count, length, avg, substring
# 1️⃣ Read the source Parquet dataset.
df = spark.read.parquet("/FileStore/pubmed_filtrado_2020_2025")

# 👉 Normalize pub_year: keep only its first four characters.
df_clean = df.withColumn("pub_year_clean", substring(col("pub_year"), 1, 4))

# 2️⃣ Save as a table partitioned by the normalized year.
# saveAsTable() returns None, so its result is intentionally not assigned
# (the previous version bound it to an unused, always-None `df1`).
(
    df_clean.write
    .mode("overwrite")
    .partitionBy("pub_year_clean")
    .saveAsTable("default.pubmed_filtrado_part")
)


In [0]:
%sql
-- Number of distinct PubMed ids stored in the partitioned table.
SELECT count(DISTINCT pmid)
FROM default.pubmed_filtrado_part

count(DISTINCT pmid)
4912100


In [0]:
import os
# Local-filesystem path of the DBFS folder that receives the exported Parquet tables.
dbfs_target_dir = "/dbfs/FileStore/pubmed_tables_parquet"

# Create the destination folder; this is a no-op when it already exists.
os.makedirs(name=dbfs_target_dir, exist_ok=True)

In [0]:
from pyspark.sql.functions import col, trim, lower, substring

# Load the partitioned source table.
df = spark.table("default.pubmed_filtrado_part")

# Text columns that are trimmed and lower-cased; each keeps its original name.
text_columns = ["journal", "country", "language", "title"]

# Normalize the columns and drop rows that are exact duplicates across all
# selected columns.
df_clean = df.select(
    substring("pub_year_clean", 1, 4).alias("pub_year_for"),
    "pmid",
    *[lower(trim(col(name))).alias(name) for name in text_columns],
).dropDuplicates()

# Write the result as Parquet files partitioned by publication year.
(
    df_clean.write
    .mode("overwrite")
    .partitionBy("pub_year_for")
    .format("parquet")
    .save("dbfs:/FileStore/pubmed_tables_parquet/pubmed_articulos/")
)

    

In [0]:
# Read the exported Parquet files back and preview the first five rows.
df = spark.read.format("parquet").load("dbfs:/FileStore/pubmed_tables_parquet/pubmed_articulos/")
df.show(n=5)

+--------+--------------------+-------------+--------+--------------------+------------+
|    pmid|             journal|      country|language|               title|pub_year_for|
+--------+--------------------+-------------+--------+--------------------+------------+
|35102770|pediatric and dev...|united states|     eng|the extrauterine ...|        2022|
|35102873|studies in health...|  netherlands|     eng|dr. lindberg: an ...|        2022|
|35103377|angewandte chemie...|      germany|     eng|leveraging electr...|        2022|
|35103591|               elife|      england|     eng|exsted microscopy...|        2022|
|35705268|internal medicine...|        japan|     eng|multisystem infla...|        2022|
+--------+--------------------+-------------+--------+--------------------+------------+
only showing top 5 rows


In [0]:
%sql
-- Run the following statements against the hive_metastore catalog.
USE CATALOG hive_metastore;

-- Register a Parquet table over the exported files (no-op if it already exists).
CREATE TABLE IF NOT EXISTS default.pubmed_articulos
USING PARQUET
LOCATION 'dbfs:/FileStore/pubmed_tables_parquet/pubmed_articulos/';

-- Repair the table's partitions.
MSCK REPAIR TABLE default.pubmed_articulos;

In [0]:
# -------------------------------------------
# 1️⃣ Imports
# -------------------------------------------
from pyspark.sql.functions import broadcast, col, lower, trim

# -------------------------------------------
# 2️⃣ Base PubMed table
# -------------------------------------------
df_pubmed = spark.table("default.pubmed_articulos")

# -------------------------------------------
# 3️⃣ Language ISO lookup table
# -------------------------------------------
df_lang_iso = spark.table("unirdat.pubmed_db.n_language")
df_lang_iso_sel = df_lang_iso.select("PubMedCode", "ISOCode")

# -------------------------------------------
# 4️⃣ Country ISO lookup table
# -------------------------------------------
df_country_iso = spark.table("unirdat.pubmed_db.n_pais")
df_country_iso_sel = df_country_iso.select("name", "Code")

# -------------------------------------------
# 5️⃣ JOIN languages (both sides trimmed and lower-cased)
# -------------------------------------------
language_matches = (
    lower(trim(df_pubmed["language"])) == lower(trim(df_lang_iso_sel["PubMedCode"]))
)
df_joined = (
    df_pubmed
    .join(broadcast(df_lang_iso_sel), on=language_matches, how="left")
    .drop("PubMedCode")
    .withColumnRenamed("ISOCode", "language_iso")
)

# -------------------------------------------
# 6️⃣ JOIN main country (both sides trimmed and lower-cased)
# -------------------------------------------
country_matches = (
    lower(trim(df_joined["country"])) == lower(trim(df_country_iso_sel["name"]))
)
df_joined = (
    df_joined
    .join(broadcast(df_country_iso_sel), on=country_matches, how="left")
    .drop("name")
    .withColumnRenamed("Code", "country_iso")
)

# -------------------------------------------
# 7️⃣ JOIN affiliation country (normalized)
# -------------------------------------------
# TODO: not implemented — this step has no code in the notebook.

# -------------------------------------------
# 8️⃣ Add validation columns (True/False)
# -------------------------------------------
# A row counts as ISO-valid when the corresponding left join found a match,
# i.e. the looked-up code is not null.
# The chain is parenthesized: the previous version ended with a stray trailing
# backslash, which would silently swallow any statement placed right after it.
df_validated = (
    df_joined
    .withColumn("language_is_iso", col("language_iso").isNotNull())
    .withColumn("country_is_iso", col("country_iso").isNotNull())
)

# -------------------------------------------
# 9️⃣ Validation metrics
# -------------------------------------------
# Name each failing subset once so the counts and the samples below are
# guaranteed to use the same filter.
df_invalid_lang = df_validated.filter(~col("language_is_iso"))
df_invalid_country = df_validated.filter(~col("country_is_iso"))

num_invalid_lang = df_invalid_lang.count()
num_invalid_country = df_invalid_country.count()

print(f"🔍 Registros con language NO ISO: {num_invalid_lang}")
print(f"🔍 Registros con country NO ISO: {num_invalid_country}")

# -------------------------------------------
# 🔟 Samples of problematic records (up to 100 distinct raw values each)
# -------------------------------------------
print("\n📌 Ejemplos de language NO ISO:")
df_invalid_lang.select("language").distinct().show(100, truncate=False)

print("\n📌 Ejemplos de country NO ISO:")
df_invalid_country.select("country").distinct().show(100, truncate=False)

# Total row count after both joins, displayed as the cell result.
df_validated.count()




🔍 Registros con language NO ISO: 0
🔍 Registros con country NO ISO: 1464353

📌 Ejemplos de language NO ISO:
+--------+
|language|
+--------+
+--------+


📌 Ejemplos de country NO ISO:
+-------------------------+
|country                  |
+-------------------------+
|north macedonia          |
|northern ireland         |
|iran                     |
|georgia (republic)       |
|russia (federation)      |
|scotland                 |
|korea (south)            |
|china (republic : 1949- )|
|england                  |
|wales                    |
+-------------------------+



4912248

In [0]:
## Correction para paises

from pyspark.sql.functions import when

# Normalize `country`.
#
# Manual fixes for PubMed country labels. Keys are trimmed, lower-cased labels;
# values are ISO 3166-1 alpha-2 codes, the same format as the `Code` column of
# the country lookup table.
country_fix_map = {
    "wales": "GB",
    "england": "GB",
    # In the MARC/NLM country list "China (Republic : 1949- )" denotes Taiwan,
    # not mainland China (which is plain "China"); it was mapped to "CN" before.
    "china (republic : 1949- )": "TW",
    "korea (south)": "KR",
    "scotland": "GB",
    "russia (federation)": "RU",
    "north macedonia": "MK",
    "northern ireland": "GB",
    "iran": "IR",
    # Was "GEO" (alpha-3); every other code in this map is alpha-2.
    "georgia (republic)": "GE",
    "iran, islamic republic of": "IR",
}

# Trimmed, lower-cased country label that the map keys are compared against.
country_key = lower(trim(col("country")))

# Build the WHEN chain from the map so the dict is the single source of truth
# (the previous version repeated every entry by hand in a separate chain).
country_fix_expr = None
for fix_label, fix_code in country_fix_map.items():
    is_label = country_key == fix_label.lower()
    if country_fix_expr is None:
        country_fix_expr = when(is_label, fix_code)
    else:
        country_fix_expr = country_fix_expr.when(is_label, fix_code)

# Labels covered by the map get the mapped code; every other row keeps the
# code found by the name join (`country_iso`, NULL when that join had no match).
df_corrected = df_validated.withColumn(
    "country_normalized",
    country_fix_expr.otherwise(col("country_iso")),
)

# Drop the output column if it is already present, so it is not duplicated below.
if "country_iso_final2" in df_corrected.columns:
    df_corrected = df_corrected.drop("country_iso_final2")

# ----------------------------
# 2️⃣ Lookup of known ISO codes
# ----------------------------
# One row per code: the join below is keyed on the code, so duplicate codes in
# the lookup table would otherwise multiply article rows.
df_iso = df_country_iso_sel.select(
    lower(trim(col("Name"))).alias("name_lower"),
    col("Code").alias("iso_code"),
).dropDuplicates(["iso_code"])

# Match `country_normalized` against the known codes. Both sides are trimmed
# and lower-cased: the previous version lower-cased only the left side, so it
# could never match an upper-case code such as "IR".
df_joined = df_corrected.join(
    broadcast(df_iso),
    lower(trim(df_corrected["country_normalized"])) == lower(trim(col("iso_code"))),
    how="left",
)

# ----------------------------
# 3️⃣ Pick the final code: a value that comes from the manual fix map is used
# as is; otherwise take the code confirmed by the join.
# ----------------------------
# The first branch previously returned `country_iso`, which is NULL for exactly
# the rows the fix map repaired, so their corrected code was discarded.
df_with_iso = df_joined.withColumn(
    "country_iso_final2",
    when(
        col("country_normalized").isin(list(country_fix_map.values())),
        col("country_normalized"),
    ).otherwise(col("iso_code")),
)




# Count rows whose normalized country code is missing: NULL, or blank after trimming.
country_code = col("country_normalized")
is_missing_country = country_code.isNull() | (trim(country_code) == "")
num_nulls_or_empty = df_with_iso.filter(is_missing_country).count()
print(f"🔍 Registros sin country_normalized: {num_nulls_or_empty}")




# -------------------------------------------
# 🔟 Save final table
# -------------------------------------------
# `df_final` was never assigned anywhere in this notebook, so a fresh
# "Restart & Run All" failed here with NameError. Bind it explicitly to the
# DataFrame produced by the country-normalization step above.
df_final = df_with_iso

# Persist only the article attributes plus the normalized country and language
# codes as a Delta table, replacing any previous contents.
(
    df_final.select(
        "pmid",
        "title",
        "pub_year_for",
        "journal",
        "country_normalized",
        "language_iso",
    )
    .write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("unirdat.pubmed_db.m_articulo")
)

print("\n✅ Tabla final 'm_articulo' creada con columnas ISO + validación + ejemplos.")

🔍 Registros sin country_normalized: 0

✅ Tabla final 'm_articulo' creada con columnas ISO + validación + ejemplos.


In [0]:
%sql
-- Check how Iran is named in the country lookup table.
SELECT *
FROM unirdat.pubmed_db.n_pais
WHERE Name LIKE '%Iran%'

/*
Working notes: PubMed country label -> country name, ISO code
"wales": "United Kingdom",
    "england": "United Kingdom",  GB
    "china (republic : 1949- )": "CN",
    "korea (south)": "South Korea", KR
    "scotland": "United Kingdom",
    "russia (federation)": "Russia", RU
    "north macedonia": "North Macedonia", MK
    "northern ireland": "United Kingdom",
    "iran": "Iran", IR
    */

Name,Code
"Iran, Islamic Republic of",IR


In [0]:
%sql
-- Rows of the final table that have a normalized country code.
-- Reference value noted by the author: 4912248
SELECT count(1)
FROM unirdat.pubmed_db.m_articulo
WHERE country_normalized IS NOT NULL
-- Scratch fragments kept for reference:
--   where country_is_iso2 = false
--   minus
--   select code from unirdat.pubmed_db.n_pais
--   select count(distinct pmid) from pubmed_articulos --4912100

count(1)
4912248


In [0]:
%sql
-- Rows for which both country validation flags are false.
-- Reference values noted by the author: 1464353, 33852
-- NOTE(review): neither write to m_articulo visible in this notebook includes
-- the columns country_is_iso / country_is_iso2 — confirm which version of the
-- table this query is meant to run against.
SELECT count(1)
FROM unirdat.pubmed_db.m_articulo
WHERE country_is_iso = false
  AND country_is_iso2 = false

count(1)
33852


In [0]:
%sql
-- Inspect the language lookup table (PubMed code, ISO code, language name).
SELECT *
FROM unirdat.pubmed_db.n_language

PubMedCode,ISOCode,LanguageName
fre,fr,French
slv,sl,Slovenian
rus,ru,Russian
afr,af,Afrikaans
nor,no,Norwegian
pol,pl,Polish
cat,ca,Catalan
hrv,hr,Croatian
por,pt,Portuguese
ukr,uk,Ukrainian


In [0]:
from pyspark.sql.functions import broadcast

# Load the tables as DataFrames.
df_pubmed = spark.table("default.pubmed_articulos")
df_lang_iso = spark.table("unirdat.pubmed_db.n_language")  # adjust to the real table name

df_lang_iso_sel = df_lang_iso.select("PubMedCode", "ISOCode", "LanguageName")

# Left-join the articles with the language lookup on the raw PubMed language code.
same_language = df_pubmed["language"] == df_lang_iso_sel["PubMedCode"]
df_joined = (
    df_pubmed
    .join(broadcast(df_lang_iso_sel), on=same_language, how="left")
    .drop("PubMedCode")  # lookup key is redundant after the join
)

# Overwrite the Delta table unirdat.pubmed_db.m_articulo with the joined result.
# NOTE(review): this is the same table the country-normalization cell writes,
# and this DataFrame has a different set of columns (no country_normalized /
# language_iso) — confirm that replacing it here is intended.
(
    df_joined.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("unirdat.pubmed_db.m_articulo")
)