In [0]:
# =============================================
# 📌 0) Widgets para controlar el lote
# dbutils.notebook.run("parse_and_download_batch", 0, {"batch_size": "100", "batch_number": "0"})
# =============================================
dbutils.widgets.text("batch_size", "100")
dbutils.widgets.text("batch_number", "") # son 12  total

batch_size = int(dbutils.widgets.get("batch_size"))
batch_number = int(dbutils.widgets.get("batch_number"))

print(f"🚀 LOTE: batch_size={batch_size}, batch_number={batch_number}")

# =============================================
# 1) Construir lista de archivos SOLO para este lote
# =============================================
#pubmed25n1263
#pubmed25n1258.xml.gz
all_files = [f.name for f in dbutils.fs.ls("dbfs:/FileStore/pubmed_filtrado/") if f.name.endswith(".xml.gz") ]

#print(all_files)
# Ordenar para consistencia
all_files = sorted(all_files)

start = batch_number * batch_size
end = start + batch_size
batch_files = all_files[start:end]

print(f"✅ Archivos del lote: {len(batch_files)} ({start}–{end-1})")

if not batch_files:
    raise Exception(f"🚫 No hay archivos en este rango: {start}–{end}")

# =============================================
# 2) Crear RDD solo con archivos del lote
# =============================================
rdd = sc.binaryFiles(",".join([f"dbfs:/FileStore/pubmed_filtrado/{f}" for f in batch_files]))
#rdd = sc.binaryFiles(",".join([f"dbfs:/FileStore/pubmed_filtrado/{f}" for f in all_files]))
print(f"📚 Archivos cargados en RDD: {rdd.count()}")

# =============================================
# 3) Tu función optimizada: igual
# =============================================
import xml.etree.ElementTree as ET
import gzip
import io


def extract_country_from_affiliation(affiliation):
    countries = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda",
    "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan",
    "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium",
    "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana",
    "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
    "Cambodia", "Cameroon", "Canada", "Cape Verde", "Central African Republic",
    "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Costa Rica",
    "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti",
    "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador",
    "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
    "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany",
    "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau",
    "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia",
    "Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Japan",
    "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kuwait", "Kyrgyzstan",
    "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein",
    "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives",
    "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico",
    "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco",
    "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands",
    "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea", "North Macedonia",
    "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea",
    "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar",
    "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe",
    "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone",
    "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia",
    "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka",
    "Sudan", "Suriname", "Sweden", "Switzerland", "Syria", "Taiwan",
    "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga",
    "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu",
    "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
    "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City",
    "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"
]
    for country in countries:
        if country.lower() in affiliation.lower():
            return country
    return ""

def get_full_text(element):
    text = element.text or ""
    for child in element:
        text += get_full_text(child)
        text += child.tail or ""
    return text.strip()

def parse_gz_bytes1(pair):
    filename, content = pair
    registros = []

    try:
        with gzip.GzipFile(fileobj=io.BytesIO(content)) as f:
           
            context = ET.iterparse(f, events=("end",))
            for _, elem in context:
                if elem.tag == "PubmedArticle":
                    try:

                        pub_date = elem.find("MedlineCitation/Article/Journal/JournalIssue/PubDate")
                        pub_year = pub_date.findtext("Year") if pub_date is not None else ""
                        if not pub_year and pub_date is not None:
                            pub_year = pub_date.findtext("MedlineDate") or ""
                            #print(pub_year ) 
                        # ✅ Normaliza a solo 4 dígitos si viene como rango
                        #pub_year_clean = pub_year[:4] if pub_year else ""
                        #print(pub_year_clean )         
                        # ⚠️ Si NO está entre 2020 y 2025 → saltar este artículo
                        #if not pub_year_clean.isdigit() or not (2020 <= int(pub_year_clean) <= 2025):
                        # elem.clear()
                        # continue

                        pmid = elem.findtext("MedlineCitation/PMID") or ""
                       
                        title = elem.findtext("MedlineCitation/Article/ArticleTitle") or ""
                           
                       
                        journal = elem.findtext("MedlineCitation/Article/Journal/Title") or ""
                        issn = elem.findtext("MedlineCitation/Article/Journal/ISSN") or ""
                        volume = elem.findtext("MedlineCitation/Article/Journal/JournalIssue/Volume") or ""
                        language = elem.findtext("MedlineCitation/Article/Language") or ""
                        country = elem.findtext("MedlineCitation/MedlineJournalInfo/Country") or ""
                            
                        

                        doi = ""
                        pii = ""
                        pmc = ""
                        for aid in elem.iter("ArticleId"):
                            id_type = aid.attrib.get("IdType", "").lower()
                            if id_type == "doi" and aid.text:
                                doi = aid.text.strip()
                            elif id_type == "pii" and aid.text:
                                pii = aid.text.strip()
                            elif id_type == "pmc" and aid.text:
                                pmc = aid.text.strip()

                        mid = next((aid.text for aid in elem.iter("ArticleId") if aid.attrib.get("IdType") == "mid"), "")

                      
                        pub_types = [pt.text for pt in elem.iter("PublicationType") if pt.text]

                        journal_abbr = elem.findtext("MedlineCitation/Article/Journal/ISOAbbreviation") or ""
                        #country = elem.findtext("MedlineCitation/Article/Journal/Country") or ""
                        citation = f"{journal_abbr}, {volume}, {pub_year}"


                        
                       
                                           
                        
                        authors = []
                        for author in elem.iter("Author"):
                            fore = author.findtext("ForeName") or ""
                            last = author.findtext("LastName") or ""
                            fullname = f"{fore} {last}".strip()

                            orcid = ""
                            id_elem = author.find("Identifier")
                            if id_elem is not None and id_elem.attrib.get("Source") == "ORCID":
                                orcid = id_elem.text or ""
                            
                            #mesh_terms = []
                            #for mesh in elem.iter("MeshHeading"):
                            #    desc = mesh.find("DescriptorName")
                            #    term = desc.text if desc is not None else ""
                            #    major = desc.attrib.get("MajorTopicYN", "N") if desc is not None else "N"
                            #    mesh_terms.append(f"{term}::{'Major' if major == 'Y' else 'Minor'}")
        

                            affiliation = ""
                            for aff_info in author.findall("AffiliationInfo"):
                                aff_node = aff_info.find("Affiliation")
                                if aff_node is not None:
                                    affiliation = get_full_text(aff_node)

                                    if affiliation is not None:
                                        affiliation_country = extract_country_from_affiliation(affiliation)



                                    #affiliation = ""
                                    #aff = author.find("AffiliationInfo/Affiliation")
                                    #if aff is not None:
                                    #affiliation = aff.text or ""
                                    #affiliation_country = extract_country_from_affiliation(affiliation)


                                    mesh_terms = []
                                    for mesh in elem.iter("MeshHeading"):
                                        desc = mesh.find("DescriptorName")
                                        term = desc.text if desc is not None else ""
                                        major = desc.attrib.get("MajorTopicYN", "N") if desc is not None else "N"
                            
                                        registros.append((
                                          pmid, title, pub_year, journal, volume, issn, language, country,
                                          citation, mid,  doi, pii, pmc,fore, last,fullname, orcid
                                        , affiliation, affiliation_country, term , major
                                        
                                        ))
                        
                    except:
                        pass
                    elem.clear()
    except:
        pass
    return registros



def parse_gz_bytes2(pair):
    filename, content = pair
    registros = []

    try:
        with gzip.GzipFile(fileobj=io.BytesIO(content)) as f:
            context = ET.iterparse(f, events=("end",))
            for _, elem in context:
                if elem.tag == "PubmedArticle":
                    try:
                       
                        pmid = elem.findtext("MedlineCitation/PMID") or ""
                        authors = []
                        for author in elem.iter("Author"):
                            fore = author.findtext("ForeName") or ""
                            last = author.findtext("LastName") or ""
                            fullname = f"{fore} {last}".strip()
                                  
                                  
                            orcid = ""
                            id_elem = author.find("Identifier")
                            if id_elem is not None and id_elem.attrib.get("Source") == "ORCID":
                                orcid = id_elem.text or ""

                            affiliation = ""
                            for aff_info in author.findall("AffiliationInfo"):
                                aff_node = aff_info.find("Affiliation")
                                if aff_node is not None:
                                    affiliation = get_full_text(aff_node)
                                    country_node = aff_info.find("Country")
                                    if country_node is not None:
                                        affiliation_country = country_node.text
                                    else:
                                        affiliation_country = None
                                 

                                    # Una fila = artículo + autor + afiliación
                                    registros.append((
                                        fullname, orcid, affiliation, affiliation_country, pmid
                                        
                                    ))

                    except:
                        pass
                    elem.clear()
    except:
        pass
    return registros
# =============================================
# 4) Ejecutar transformación en paralelo
# =============================================
parsed_rdd1 = rdd.flatMap(parse_gz_bytes1)
#parsed_rdd2 = rdd.flatMap(parse_gz_bytes2)
# =============================================
# 5) Definir esquema igual
# =============================================
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

pmid_file_struct = StructType([
    StructField("pmid", StringType(), True),
    StructField("filename", StringType(), True)
   
])

author_struct = StructType([
    StructField("fullname", StringType(), True),
    StructField("orcid", StringType(), True),
    StructField("affiliation", StringType(), True),
    StructField("affiliation_country", StringType(), True),
    StructField("pmid", StringType(), True),
   
])

schema = StructType([
    StructField("pmid", StringType(), True),
    StructField("title", StringType(), True),
    StructField("pub_year", StringType(), True),
    StructField("journal", StringType(), True),
    StructField("volume", StringType(), True),
    StructField("issn", StringType(), True),
    StructField("language", StringType(), True),
    StructField("country", StringType(), True),
    StructField("citation", StringType(), True),
    StructField("mid", StringType(), True),
    StructField("doi", StringType(), True),
    StructField("pii", StringType(), True),
    StructField("pmc", StringType(), True),
    StructField("fore", StringType(), True),
    StructField("last", StringType(), True),
    StructField("fullname", StringType(), True),
    StructField("orcid", StringType(), True),
    StructField("affiliation", StringType(), True),
    StructField("affiliation_country", StringType(), True),
    StructField("term", StringType(), True),
    StructField("major", StringType(), True)
   
    
])


 
# =============================================
# 6) Crear DataFrame y guardar en append
# =============================================
df1 = spark.createDataFrame(parsed_rdd1, schema=schema)
#df2 = spark.createDataFrame(parsed_rdd2, schema=author_struct)

print(f"✅ Muestra lote #{batch_number}")
df1.show(5, truncate=False)

output_path = "/FileStore/pubmed_parsed_batches"
output_path1 = "dbfs:/FileStore/pubmed_parquet/articulos_aux"
#output_path2 = "dbfs:/FileStore/pubmed_parquet/authors_aux"

#.partitionBy("year", "country").

df1.write.format("parquet").mode("append").save(output_path1)
#df2.write.format("parquet").mode("append").save(output_path2)



print(f"✅ Lote #{batch_number} guardado en Delta: {output_path1}")
#print(f"✅ Lote #{batch_number} guardado en Delta: {output_path2}")


🚀 LOTE: batch_size=100, batch_number=0
['pubmed25n1264.xml.gz']
✅ Archivos del lote: 1 (0–99)
📚 Archivos cargados en RDD: 1
✅ Muestra lote #0
+--------+-------------------------------------------------------------------------------------------+--------+--------------------+------+---------+--------+-------------+---------------------+---+-----------------------------+---+---+--------------+-----+--------------------+-----+---------------------------------------------------------------------------------------------------------------------------------------+-------------------+-----------------+-----+
|pmid    |title                                                                                      |pub_year|journal             |volume|issn     |language|country      |citation             |mid|doi                          |pii|pmc|fore          |last |fullname            |orcid|affiliation                                                                                                  