In [0]:
%pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
dbutils.library.restartPython()

## Base de datos **PubMED**

In [0]:
# 1️⃣ Librerías necesarias


from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pandas as pd
import os
import shutil

# 2️⃣ Define esquema/base de datos a auditar
DATABASE = "pubmed_db"  # Cambia por tu esquema si aplica

# 3️⃣ Lista todas las tablas del esquema
tables = spark.catalog.listTables(DATABASE)

# 4️⃣ Recorre y recolecta metadatos
summary_data = []

for t in tables:
    table_name = t.name
    full_table_name = f"{DATABASE}.{table_name}"
    df = spark.table(full_table_name)
    row_count = df.count()
    schema = df.schema
    
    for field in schema:
        summary_data.append({
            "table_name": table_name,
            "column_name": field.name,
            "data_type": field.dataType.simpleString(),
            "row_count": row_count
        })

print(f"Tablas procesadas: {len(tables)}")

# 5️⃣ Pasa a Pandas para exportar fácil
summary_pd = pd.DataFrame(summary_data)

# 6️⃣ Guarda como Excel
local_path = "/tmp/schema_report.xlsx"
summary_pd.to_excel(local_path, index=False, engine="openpyxl")

# Copia a DBFS para descargar
os.makedirs("/dbfs/FileStore", exist_ok=True)
dbfs_path = "/dbfs/FileStore/schema_report.xlsx"
shutil.copy(local_path, dbfs_path)

print(f"✅ Reporte disponible en: {dbfs_path}")
print(f"Descárgalo en: https://3132215626649366.6.gcp.databricks.com/files/schema_report.xlsx")


Tablas procesadas: 15
✅ Reporte disponible en: /dbfs/FileStore/schema_report.xlsx
Descárgalo en: https://3132215626649366.6.gcp.databricks.com/files/schema_report.xlsx


## Decarga en CVS de base de datos **Pubmed**

In [0]:
import os
import shutil

# 1️⃣ Base de datos
DATABASE = "pubmed_db"

# 2️⃣ Lista tablas
tables = spark.catalog.listTables(DATABASE)
print(f"Tablas encontradas: {[t.name for t in tables]}")

# 3️⃣ Carpetas
EXPORT_BASE = "/dbfs/FileStore/exports/json_exports/"
LOCAL_TMP = "/tmp/json_exports_tmp"
ZIP_LOCAL = "/tmp/todas_tablas_json"
ZIP_DBFS = "/dbfs/FileStore/exports/todas_tablas_json.zip"

# 4️⃣ Asegúrate de que la carpeta exista
os.makedirs(EXPORT_BASE, exist_ok=True)

# 5️⃣ Exporta cada tabla a JSON (en DBFS)
for t in tables:
    table_name = t.name
    df = spark.table(f"{DATABASE}.{table_name}")
    export_path = os.path.join(EXPORT_BASE, f"{table_name}_json")
    print(f"Exportando {table_name} a {export_path}...")
    df.coalesce(1).write.mode("overwrite").json(export_path)

print("✅ Exportación a JSON completada en DBFS.")

# 6️⃣ Limpia carpeta local
if os.path.exists(LOCAL_TMP):
    shutil.rmtree(LOCAL_TMP)
os.makedirs(LOCAL_TMP, exist_ok=True)

# 7️⃣ Copia todo el contenido DBFS → local
print(f"Copiando de DBFS ({EXPORT_BASE}) a local ({LOCAL_TMP})...")
shutil.copytree("/dbfs" + EXPORT_BASE, LOCAL_TMP, dirs_exist_ok=True)

# 8️⃣ Crear ZIP en `/tmp`
print(f"Crea ZIP en {ZIP_LOCAL}.zip...")
shutil.make_archive(ZIP_LOCAL, 'zip', LOCAL_TMP)

# 9️⃣ Mueve ZIP a DBFS
shutil.move(ZIP_LOCAL + ".zip", ZIP_DBFS)

print(f"✅ ZIP guardado en DBFS: {ZIP_DBFS}")
print(f"📂 Descárgalo: https://<TU-WORKSPACE>.databricks.com/files/exports/todas_tablas_json.zip")



print(f"📂 Descárgalos desde: https://3132215626649366.6.gcp.databricks.com/files/exports/")


Tablas encontradas: ['author_orci', 'coautor', 'coautor_2020', 'coautor_2021', 'coautor_2022', 'coautor_2023', 'coautor_2024', 'coautor_2025', 'coautor_all', 'm_articulo', 'm_autor', 'm_revista', 'n_institucion', 'n_issn', 'n_language']
Exportando author_orci a /dbfs/FileStore/exports/json_exports/author_orci_json...
Exportando coautor a /dbfs/FileStore/exports/json_exports/coautor_json...
Exportando coautor_2020 a /dbfs/FileStore/exports/json_exports/coautor_2020_json...
Exportando coautor_2021 a /dbfs/FileStore/exports/json_exports/coautor_2021_json...
Exportando coautor_2022 a /dbfs/FileStore/exports/json_exports/coautor_2022_json...
Exportando coautor_2023 a /dbfs/FileStore/exports/json_exports/coautor_2023_json...
Exportando coautor_2024 a /dbfs/FileStore/exports/json_exports/coautor_2024_json...
Exportando coautor_2025 a /dbfs/FileStore/exports/json_exports/coautor_2025_json...
Exportando coautor_all a /dbfs/FileStore/exports/json_exports/coautor_all_json...
Exportando m_articulo

In [0]:
# Lista con tamaños en MB
for f in dbutils.fs.ls("dbfs:/FileStore/exports/"):
    print(f"{f.name} - {f.size / (1024*1024):.2f} MB")

json_exports/ - 0.00 MB
todas_tablas_json.zip - 4823.70 MB


## Tabla mapeada filtrada de PubMED 2020- 2025

In [0]:

DATABASE = "default"


# Lista todas
tables = spark.catalog.listTables(DATABASE)

# Filtra solo la tabla deseada
filtered_tables = [t for t in tables if t.name in ('pubmed_delta', 'pubmed_tmp_filtrado') ]



# 4️⃣ Recorre y recolecta metadatos
summary_data = []

for t in tables:
    table_name = t.name
    full_table_name = f"{DATABASE}.{table_name}"
    df = spark.table(full_table_name)
    row_count = df.count()
    schema = df.schema
    
    for field in schema:
        summary_data.append({
            "table_name": table_name,
            "column_name": field.name,
            "data_type": field.dataType.simpleString(),
            "row_count": row_count
        })

print(f"Tablas procesadas: {len(tables)}")

# 5️⃣ Pasa a Pandas para exportar fácil
summary_pd = pd.DataFrame(summary_data)

# 6️⃣ Guarda como Excel
local_path = "/tmp/schema_report.xlsx"
summary_pd.to_excel(local_path, index=False, engine="openpyxl")

# Copia a DBFS para descargar
os.makedirs("/dbfs/FileStore", exist_ok=True)
dbfs_path = "/dbfs/FileStore/schema_report_pubmeds_mapeadas.xlsx"
shutil.copy(local_path, dbfs_path)

print(f"✅ Reporte disponible en: {dbfs_path}")
print(f"Descárgalo en: https://3132215626649366.6.gcp.databricks.com/files/schema_report_pubmeds_mapeadas.xlsx")




Tablas procesadas: 22
✅ Reporte disponible en: /dbfs/FileStore/schema_report_pubmeds_mapeadas.xlsx
Descárgalo en: https://3132215626649366.6.gcp.databricks.com/files/schema_report_pubmeds_mapeadas.xlsx
