# 1. Descarga Pubmed
### Hacemos una consulta por cada especialidad para obtener los IDs de los documentos de PubMed

In [21]:
import os
from Bio import Entrez
from urllib.request import urlopen

In [22]:
path_file_queries = 'total_queries.txt'
path_files_xmls = 'specialties_subespecialties_xml'
path_casesreports_xml = 'specialties_subespecialties_case_report_xml'
url_entrez = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id='

In [23]:
def open_url(url, file):
    try:
        xml = open(file, "a+")
        xml.write(urlopen(url).read().decode('utf-8'))
        xml.close()
    except urllib.error.HTTPError:
        print("**** URL demasiado larga")

In [24]:
def search_pubmed(query):
    Entrez.email = 'email@email.com'
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax='1000000',
                            retmode='xml', 
                            term=query)
    
    results = Entrez.read(handle)
    return results

In [25]:
def download_xmls(list_pmid, file_output):
    if len(list_pmid) < 400:

        url = ','.join(list_pmid)
        url_search = url_entrez + url

        print('[{}/{}]'.format(len(list_pmid), len(list_pmid)))

        open_url(url_search, file_output)

    else:
        
        # hacemos peripecias para hacer la llamada a la URL
        # creamos un fichero auxiliar para albergar varias llamadas a la URL
        new_list_pmid = [list_pmid[i:i+400] for i in range(0, len(list_pmid), 400)]

        for pos, list_files in enumerate(new_list_pmid):

            url = ','.join(list_files)
            url_search = url_entrez + url

            print('[{}/{}]'.format(pos, len(new_list_pmid)))

            open_url(url_search, file_output.replace(".xml", ".mal"))
    
        # construímos el fichero correcto 
        with open(file_output.replace(".xml", ".mal"),"r") as f:
            content = f.read()
            content = content.replace('</PubmedArticleSet><?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">\n<PubmedArticleSet>', '')
            with open(file_output, 'w') as f:
                f.write(content)
    

In [29]:
def read_file_queries():
    with open(path_file_queries, 'r') as fquery:
        
        for index, line in enumerate(fquery):
            line = line.strip()
            
            # contruímos un buen nombre para el documento
            tree_num = line.split("#")[0]
            specialty = line.split("#")[1]
            specialty = specialty.lower()
            specialty = specialty.replace(", ", " ")
            specialty = specialty.replace(" ", "_")
            specialty = specialty.replace("-", "_")
            
            specialty = tree_num + '_' + specialty
            query = line.split("#")[2]
            
            # comprobamos que no esté generado anteriormente
            if not os.path.isfile(os.path.join(path_files_xmls, specialty) + '.xml'):
                
                # query normal (sin cases reports)
                normal_query = query + ' and not Case Reports[PT]'
                
                # query para obtener los cases reports
                case_report_query = query + ' and Case Reports[PT]'
                
                results_total = search_pubmed(query)
                results_files = search_pubmed(normal_query)
                results_cr = search_pubmed(case_report_query)
                
                list_pmid_total = results_total['IdList']
                list_pmid_files = results_files['IdList']
                list_pmid_cr = results_cr['IdList']
                
                # comprobamos que: numFilesSinCasesReports + numFilesCasesReports = numFilesTotal
                print("{} - {}, nº doc total: {}, files: {}, cases reports: {}. -> {}".format(index, specialty, 
                                                                                        len(list_pmid_total), 
                                                                                        len(list_pmid_files), 
                                                                                        len(list_pmid_cr),
                                                                                        len(list_pmid_files) + len(list_pmid_cr) == len(list_pmid_total)    
                                                                                            ))
                if len(list_pmid_total) > 10000:

                    #DOWNLOAD NORMAL PUBLICATION
                    print("Download normal publication type.")
                    file_output = os.path.join(path_files_xmls, specialty) + '.xml'
                    download_xmls(list_pmid_files, file_output)

                    #DOWNLOAD CASES REPORTS
                    print("Download cases reports publication type.")
                    file_output = os.path.join(path_casesreports_xml, specialty) + '.xml'
                    download_xmls(list_pmid_cr, file_output)

In [30]:
read_file_queries()

0 - H02.403.225_dermatology, nº doc total: 18374, files: 10766, cases reports: 7608. -> True
Download normal publication type.
[0/27]
[1/27]
[2/27]
[3/27]
[4/27]
[5/27]
[6/27]
[7/27]
[8/27]
[9/27]
[10/27]
[11/27]
[12/27]
[13/27]
[14/27]
[15/27]
[16/27]
[17/27]
[18/27]
[19/27]
[20/27]
[21/27]
[22/27]
[23/27]
[24/27]
[25/27]
[26/27]
Download cases reports publication type.
[0/20]
[1/20]
[2/20]
[3/20]
[4/20]
[5/20]
[6/20]
[7/20]
[8/20]
[9/20]
[10/20]
[11/20]
[12/20]
[13/20]
[14/20]
[15/20]
[16/20]
[17/20]
[18/20]
[19/20]
1 - H02.403.720.500_epidemiology, nº doc total: 35329, files: 34403, cases reports: 926. -> True
Download normal publication type.
[0/87]
[1/87]
[2/87]
[3/87]
[4/87]
[5/87]
[6/87]
[7/87]
[8/87]
[9/87]
[10/87]
[11/87]
[12/87]
[13/87]
[14/87]
[15/87]
[16/87]
[17/87]
[18/87]
[19/87]
[20/87]
[21/87]
[22/87]
[23/87]
[24/87]
[25/87]
[26/87]
[27/87]
[28/87]
[29/87]
[30/87]
[31/87]
[32/87]
[33/87]
[34/87]
[35/87]
[36/87]
[37/87]
[38/87]
[39/87]
[40/87]
[41/87]
[42/87]
[43/87]
[44

[57/89]
[58/89]
[59/89]
[60/89]
[61/89]
[62/89]
[63/89]
[64/89]
[65/89]
[66/89]
[67/89]
[68/89]
[69/89]
[70/89]
[71/89]
[72/89]
[73/89]
[74/89]
[75/89]
[76/89]
[77/89]
[78/89]
[79/89]
[80/89]
[81/89]
[82/89]
[83/89]
[84/89]
[85/89]
[86/89]
[87/89]
[88/89]
Download cases reports publication type.
[0/28]
[1/28]
[2/28]
[3/28]
[4/28]
[5/28]
[6/28]
[7/28]
[8/28]
[9/28]
[10/28]
[11/28]
[12/28]
[13/28]
[14/28]
[15/28]
[16/28]
[17/28]
[18/28]
[19/28]
[20/28]
[21/28]
[22/28]
[23/28]
[24/28]
[25/28]
[26/28]
[27/28]
40 - H02.403.810.850_traumatology, nº doc total: 10238, files: 6731, cases reports: 3507. -> True
Download normal publication type.
[0/17]
[1/17]
[2/17]
[3/17]
[4/17]
[5/17]
[6/17]
[7/17]
[8/17]
[9/17]
[10/17]
[11/17]
[12/17]
[13/17]
[14/17]
[15/17]
[16/17]
Download cases reports publication type.
[0/9]
[1/9]
[2/9]
[3/9]
[4/9]
[5/9]
[6/9]
[7/9]
[8/9]
41 - H02.403.810.860_urology, nº doc total: 22643, files: 15509, cases reports: 7134. -> True
Download normal publication type.
[0/39]
[

[4/6]
[5/6]
