In [174]:
# Some practice for Entrez API
# Load keywords from input and send query to Entrez API
# Get formatted data in xml file

import requests

# Download PubMed records with both th1 and th2 in titles and were also published in 2019.
# like this: (th1[ti]) AND (th2[ti]) AND (2019[pdat]) 

db = 'pubmed';
query = 'th1[ti]+AND+th2[ti]+AND+2019[pdat]';

# Assemble the esearch URL
base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
url = base + "esearch.fcgi?db="+db+"&term=$"+query+"&WebEnv=$Webenv&usehistory=y";
print(url)

# Send request to server
# response is our returned data
req = requests.get(url)
response = req.text
print("done")

# load ET module to parse xml
import xml.etree.ElementTree as ET 

# put all data in a string and use Et module to parse it
root = ET.fromstring(response)

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+2019[pdat]&WebEnv=$Webenv&usehistory=y
done


In [175]:
root

<Element 'eSearchResult' at 0x1121443b8>

In [176]:
root.tag

'eSearchResult'

In [177]:
root.attrib

{}

In [178]:
# print all attrib (or all tags of levels)

for child in root:
    print(child.tag, child.attrib)

Count {}
RetMax {}
RetStart {}
QueryKey {}
WebEnv {}
IdList {}
TranslationSet {}
TranslationStack {}
QueryTranslation {}


In [179]:
# Total data counts
root[0].text

'57'

In [180]:
# How many data counts we retrived every time
root[1].text

'20'

In [181]:
# In this case, we should have total 57 data and 20 data per page
# Therefore, we have total three pages
# Te retrived data will have an index (0-19) indicating data of the first page
root[2].text

'0'

In [182]:
# Print Id of each article of the first page (again, 20 data per page)

for i in range(20):
 print(root[5][i].text)

31371226
31341902
31331391
31316629
31316513
31310707
31297009
31277476
31262519
31260744
31244354
31234191
31231378
31229590
31227229
31221948
31214035
31178059
31177446
31129375


In [200]:
# get data of page2
# Index of data in page2 will be 20-39

url2 = url + "&retstart=20&retmax=20";
print(url2)
req2 = requests.get(url2)
response2 = req2.text

# put all data in a string and use ET module to parse it
root2 = ET.fromstring(response2)
print("done")

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+2019[pdat]&WebEnv=$Webenv&usehistory=y&retstart=20&retmax=20
done


In [201]:
# print all attrib (or all tags of levels)

for child in root2:
    print(child.tag, child.attrib)

Count {}
RetMax {}
RetStart {}
QueryKey {}
WebEnv {}
IdList {}
TranslationSet {}
TranslationStack {}
QueryTranslation {}


In [202]:
root2[0].text

'57'

In [203]:
root2[1].text

'20'

In [204]:
root2[2].text

'20'

In [207]:
# Print Id of each article of the second page (again, 20 data per page)

for i in range(20):
 print(root2[5][i].text)

31122908
31093011
31024043
31019160
31016523
30974054
30947977
30945491
30924169
30904436
30889402
30884431
30856346
30848408
30837040
30824840
30814543
30813927
30804001
30771740


In [209]:
# get data of page3
# Index of data in page3 will be 40-56

url3 = url + "&retstart=40&retmax=20";
print(url3)
req3 = requests.get(url3)
response3 = req3.text

# put all data in a string and use ET module to parse it
root3 = ET.fromstring(response3)
print("done")

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+2019[pdat]&WebEnv=$Webenv&usehistory=y&retstart=40&retmax=20
done


In [211]:
# Print Id of each article of the second page (now, 17 data per page)

for i in range(17):
 print(root3[5][i].text)

30742635
30703651
30681265
30668435
30651758
30642265
30629982
30589483
30587599
30576916
30484290
30415531
30389471
30308394
30246383
30223113
30047012
