In [9]:
# We can also use loop to automate the data retrieval process
# In this case, we are using a loop to automatically retrieve UID of each article

# Import modulus required for sending http requests to NCBI server
import requests

# Download PubMed records with both th1 and th2 in titles and were also published within recent 10 years.
# like this: (th1[ti]) AND (th2[ti]) AND ("2009/08/10"[PDat] : "2019/08/07"[PDat])

db = 'pubmed';
query = 'th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])';

# Assemble the esearch URL
base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
url = base + "esearch.fcgi?db="+db+"&term=$"+query+"&WebEnv=$Webenv&usehistory=y";
print(url)

# Send request to server
# 'response' is our returned data
# When we send a request, each time the NCBI server will return us a data in xml format 
# The returned data contains information of 20 articles
# We can get information of the first 20 articles first
# If we need to get the next 20 articles, then we need to send a new quest (see below)
req = requests.get(url)
response = req.text
print("done")

# Load ET module to parse the xml
import xml.etree.ElementTree as ET 

# Concatenate all information in a string and use ET module to process it
root = ET.fromstring(response)

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y
done


In [10]:
# Print all attrib (or all tags of levels)

for child in root:
    print(child.tag, child.attrib)

Count {}
RetMax {}
RetStart {}
QueryKey {}
WebEnv {}
IdList {}
TranslationSet {}
TranslationStack {}
QueryTranslation {}


In [11]:
# Get some ideas about how many data we retrieved this time

count = int(root[0].text)
count

969

In [12]:
# Now this is the loop to automatically retrieve UID of each article and store them as in a list: UID_list

# There are two loops in this function, the first loop, like we mentioned above, is to get a xml containing information of 20 articles
# The second loop is to retrieve UIDs from this 20 articles and store them into a list ('UID_list')


# Create an empty list to store all retrieved UIDs of articles
UID_list = []

# We also have to consider the last xml
# In this case, we have total 969 articles, which means in the last xml, we only have 9 articles
# Since our second loop is retrieving 20 data from each xml, there will be an "out-of-range" error throwed
# So we have add some codes to check how many articles in the last xml and append them into the UID_list

if count % 20 == 0:
    for k in range(0, count, 20):
        k = str(k)
        url_next_page = url + "&retstart="+k+"&retmax=20";
        req2 = requests.post(url_next_page)
        print(url_next_page)
        response2 = req2.text
        root2 = ET.fromstring(response2)
        for i in range(20):
            UID_list.append(root2[5][i].text.strip())
    print("done")
    print(len(UID_list))
else:
    for k in range(0, count - 20, 20):
        k = str(k)
        url_next_page = url + "&retstart="+k+"&retmax=20";
        req2 = requests.post(url_next_page)
        print(url_next_page)
        response2 = req2.text
        root2 = ET.fromstring(response2)
        for i in range(20):
            UID_list.append(root2[5][i].text.strip())
    print("done")
    print(len(UID_list))
remainder = count % 20
url3 = url + "&retstart="+str(count - 20)+"&retmax=20"
req3 = requests.post(url3)
print(url3)
response3 = req3.text
root3 = ET.fromstring(response3)
for i in range(remainder):
        UID_list.append(root3[5][i].text.strip())
print("done")
print(len(UID_list))

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=0&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=20&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=40&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=60&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=80&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fc

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=880&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=900&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=920&retmax=20
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=940&retmax=20
done
960
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$th1[ti]+AND+th2[ti]+AND+("2009/08/10"[PDat]:"2019/08/07"[PDat])&WebEnv=$Webenv&usehistory=y&retstart=949&retmax=20
done


In [14]:
# Now the UID of each article is saved in 'UID_list'
# UID is the identity of each article, we can use UID to get access to article's title, abstract ... 
# We can output the list into a CSV file

with open('output.csv','w') as file:
    for line in UID_list:
        file.write(line)
        file.write('\n')