In [1]:
from Bio import Entrez
import pandas as pd
import numpy as np

In [2]:
MAX_RESULTS = '1000'
# TODO: paginate to get more results (set to 1000 for testing quickly, max is 10000)
def search(query):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax=MAX_RESULTS,
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results

In [3]:
def search_and_count(query):
    results = search(query)
    if results['IdList']:
        handle = Entrez.esummary(db="pubmed", id=','.join(results['IdList']), retmode="xml")
        records = Entrez.parse(handle)
        return(len(list(records)))
    else:
        return(0)

In [4]:
#Get MESH FILE USING THIS URL AND SAVE AS mesh-specialties.csv
#https://id.nlm.nih.gov/mesh/query?query=PREFIX+rdf%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F1999%2F02%2F22-rdf-syntax-ns%23%3E%0D%0APREFIX+rdfs%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0D%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0D%0APREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0APREFIX+meshv%3A+%3Chttp%3A%2F%2Fid.nlm.nih.gov%2Fmesh%2Fvocab%23%3E%0D%0APREFIX+mesh%3A+%3Chttp%3A%2F%2Fid.nlm.nih.gov%2Fmesh%2F%3E%0D%0APREFIX+mesh2015%3A+%3Chttp%3A%2F%2Fid.nlm.nih.gov%2Fmesh%2F2015%2F%3E%0D%0APREFIX+mesh2016%3A+%3Chttp%3A%2F%2Fid.nlm.nih.gov%2Fmesh%2F2016%2F%3E%0D%0APREFIX+mesh2017%3A+%3Chttp%3A%2F%2Fid.nlm.nih.gov%2Fmesh%2F2017%2F%3E%0D%0A%0D%0A+SELECT+DISTINCT+%3Fdescriptor+%3Flabel+%3FtreeNum+%3FmedicineTreeNum%0D%0A+FROM+%3Chttp%3A%2F%2Fid.nlm.nih.gov%2Fmesh%3E%0D%0A%0D%0A+WHERE+%7B%0D%0A+++mesh%3AD008511+meshv%3AtreeNumber+%3FmedicineTreeNum+.%0D%0A+++%3FtreeNum+meshv%3AparentTreeNumber+%3FmedicineTreeNum+.%0D%0A+++%3Fdescriptor+meshv%3AtreeNumber+%3FtreeNum+.%0D%0A+++%3Fdescriptor+rdfs%3Alabel+%3Flabel+.%0D%0A+%7D%0D%0A+%0D%0AORDER+BY+%3FchildTreeNum%0D%0A&format=CSV&inference=true&year=current&limit=1000&offset=0#lodestart-sparql-results



In [5]:
MAIN_SPECIALTY_TREE_LENGTH = 11
mesh_specialties = pd.read_csv('mesh-specialties.csv')
mesh_specialties = mesh_specialties.drop(columns=["descriptor", "medicineTreeNum"])
mesh_specialties = mesh_specialties.applymap(lambda x : x.replace("http://id.nlm.nih.gov/mesh/",""))
mesh_specialties = mesh_specialties.sort_values('treeNum')
mesh_specialties['mainSpecialty'] = mesh_specialties['treeNum'].str.slice(0,MAIN_SPECIALTY_TREE_LENGTH)
mesh_specialties['level'] = (mesh_specialties['treeNum'].str.len() - MAIN_SPECIALTY_TREE_LENGTH) // 4
mesh_specialties = mesh_specialties.reset_index(drop=True)
mesh_specialties[:15]

Unnamed: 0,label,treeNum,mainSpecialty,level
0,Addiction Medicine,H02.403.007,H02.403.007,0
1,Adolescent Medicine,H02.403.014,H02.403.014,0
2,Aerospace Medicine,H02.403.029,H02.403.029,0
3,Allergy and Immunology,H02.403.044,H02.403.044,0
4,Immunochemistry,H02.403.044.500,H02.403.044,1
5,Anesthesiology,H02.403.066,H02.403.066,0
6,Bariatric Medicine,H02.403.074,H02.403.074,0
7,Behavioral Medicine,H02.403.090,H02.403.090,0
8,Clinical Medicine,H02.403.200,H02.403.200,0
9,Evidence-Based Medicine,H02.403.200.400,H02.403.200,1


**Note:** Some specialties appear several times because they are multi-axial (e.g., Neurotology). https://meshb.nlm.nih.gov/record/ui?ui=D063165

In [6]:
mesh_specialties[mesh_specialties.label=="Neurotology"]

Unnamed: 0,label,treeNum,mainSpecialty,level
51,Neurotology,H02.403.600.500,H02.403.600,1
110,Neurotology,H02.403.810.526.500,H02.403.810,2


In [7]:
len(set(mesh_specialties.label.to_list())) # Removes all duplicates

119

In [8]:
mesh_specialties['direct_query'] = mesh_specialties['label'] + "[MH]"
mesh_specialties[:10]

Unnamed: 0,label,treeNum,mainSpecialty,level,direct_query
0,Addiction Medicine,H02.403.007,H02.403.007,0,Addiction Medicine[MH]
1,Adolescent Medicine,H02.403.014,H02.403.014,0,Adolescent Medicine[MH]
2,Aerospace Medicine,H02.403.029,H02.403.029,0,Aerospace Medicine[MH]
3,Allergy and Immunology,H02.403.044,H02.403.044,0,Allergy and Immunology[MH]
4,Immunochemistry,H02.403.044.500,H02.403.044,1,Immunochemistry[MH]
5,Anesthesiology,H02.403.066,H02.403.066,0,Anesthesiology[MH]
6,Bariatric Medicine,H02.403.074,H02.403.074,0,Bariatric Medicine[MH]
7,Behavioral Medicine,H02.403.090,H02.403.090,0,Behavioral Medicine[MH]
8,Clinical Medicine,H02.403.200,H02.403.200,0,Clinical Medicine[MH]
9,Evidence-Based Medicine,H02.403.200.400,H02.403.200,1,Evidence-Based Medicine[MH]


In [9]:
def get_specialty_and_children(df, s):
    return(df[df.treeNum.str.startswith(s)])
#get_specialty_and_children(mesh_specialties,"H02.403.429")

In [10]:
def query_specialty_and_children2(df, spec_tree_num):
    return("(" + get_specialty_and_children(df,spec_tree_num)['label'].str.cat(sep="[MH] OR ") + "[MH])")
#query_specialty_and_children2(mesh_specialties,"H02.403.429", "ENG")

In [11]:
def query_specialty_and_children(df):
    queries = []
    for index, row in df.iterrows():
        queries.append(query_specialty_and_children2(df, row['treeNum']))
    return pd.Series(queries)
#query_specialty_and_children(mesh_specialties)

In [12]:
mesh_specialties['full_query'] = query_specialty_and_children(mesh_specialties)
mesh_specialties[:12]

Unnamed: 0,label,treeNum,mainSpecialty,level,direct_query,full_query
0,Addiction Medicine,H02.403.007,H02.403.007,0,Addiction Medicine[MH],(Addiction Medicine[MH])
1,Adolescent Medicine,H02.403.014,H02.403.014,0,Adolescent Medicine[MH],(Adolescent Medicine[MH])
2,Aerospace Medicine,H02.403.029,H02.403.029,0,Aerospace Medicine[MH],(Aerospace Medicine[MH])
3,Allergy and Immunology,H02.403.044,H02.403.044,0,Allergy and Immunology[MH],(Allergy and Immunology[MH] OR Immunochemistry...
4,Immunochemistry,H02.403.044.500,H02.403.044,1,Immunochemistry[MH],(Immunochemistry[MH])
5,Anesthesiology,H02.403.066,H02.403.066,0,Anesthesiology[MH],(Anesthesiology[MH])
6,Bariatric Medicine,H02.403.074,H02.403.074,0,Bariatric Medicine[MH],(Bariatric Medicine[MH])
7,Behavioral Medicine,H02.403.090,H02.403.090,0,Behavioral Medicine[MH],(Behavioral Medicine[MH])
8,Clinical Medicine,H02.403.200,H02.403.200,0,Clinical Medicine[MH],(Clinical Medicine[MH] OR Evidence-Based Medic...
9,Evidence-Based Medicine,H02.403.200.400,H02.403.200,1,Evidence-Based Medicine[MH],(Evidence-Based Medicine[MH] OR Evidence-Based...


In [13]:
%%time
query_results_eng = mesh_specialties.copy(deep=True)
query_results_spa = mesh_specialties.copy(deep=True)

for index, specialty in mesh_specialties.iterrows():
    query_results_eng.at[index, 'direct_query'] = search_and_count(specialty['direct_query'] +  "ENG[LA]")
    query_results_eng.at[index, 'full_query'] = search_and_count(specialty['full_query'] +  "ENG[LA]")
    
    query_results_spa.at[index, 'direct_query'] = search_and_count(specialty['direct_query'] +  "SPA[LA]")
    query_results_spa.at[index, 'full_query'] = search_and_count(specialty['full_query'] +  "SPA[LA]")

IncompleteRead: IncompleteRead(182 bytes read)

In [16]:
query_results_spa

Unnamed: 0,label,treeNum,mainSpecialty,level,direct_query,full_query
0,Addiction Medicine,H02.403.007,H02.403.007,0,0,0
1,Adolescent Medicine,H02.403.014,H02.403.014,0,24,24
2,Aerospace Medicine,H02.403.029,H02.403.029,0,50,50
3,Allergy and Immunology,H02.403.044,H02.403.044,0,242,707
4,Immunochemistry,H02.403.044.500,H02.403.044,1,498,498
5,Anesthesiology,H02.403.066,H02.403.066,0,736,736
6,Bariatric Medicine,H02.403.074,H02.403.074,0,0,0
7,Behavioral Medicine,H02.403.090,H02.403.090,0,4,4
8,Clinical Medicine,H02.403.200,H02.403.200,0,1000,1000
9,Evidence-Based Medicine,H02.403.200.400,H02.403.200,1,705,705


In [15]:
query_results_eng.to_csv("counts-eng.csv")
query_results_spa.to_csv("counts-spa.csv")