In [1]:
import calendar
import json
import re
import ssl
import time
import urllib.request, urllib.parse, urllib.error

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:


# ESearch endpoint template; NUM and KEYWORD are placeholders filled in from user input below.
url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=NUM&sort=relevance&term=KEYWORD"

# We ask the user to provide the keyword and number of results and subsequently replace these elements in the url string
keyword = str(input('Please enter the keyword eg. "type+2+diabetes"')) 
num = int(input('Please enter the number of results - Numbers above 500 may cause it to fail'))
url = url.replace('NUM', str(num))
# Percent-encode the keyword so spaces, quotes, brackets or non-ASCII characters
# cannot produce an invalid URL or leak into other query parameters.
# '+' stays literal (the documented input format) and '%' stays literal so a
# keyword that is already percent-encoded is not encoded twice.
# `keyword` itself is left untouched because it is reused for the CSV file name.
url = url.replace('KEYWORD', urllib.parse.quote(keyword, safe='+%'))
print(url)

# NOTE(review): this replaces the default HTTPS context used by urllib with an
# unverified one for the whole process, i.e. TLS certificates are NOT checked.
# Kept because the original environment apparently required it; remove it
# wherever certificate verification works.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

# Use a context manager so the HTTP response is always closed.
with urllib.request.urlopen(url) as response:
    webpage = response.read()
dict_page = json.loads(webpage)
# List of PubMed IDs (as strings) matching the search.
idlist = dict_page["esearchresult"]["idlist"]

print(idlist)


# We create a function to delete brackets from titles
def strip_brackets(s):
    """Return ``s`` with every '[' and ']' character removed.

    All other characters are kept in their original order.
    """
    unwanted = ('[', ']')
    # Keep each character that is not one of the two bracket characters.
    return "".join(ch for ch in s if ch not in unwanted)


# Create a function which takes the soup and extracts all needed elements for the bibliography and abstract
def get_bibliography(soup):
    """Extract the bibliography fields and abstract from one parsed PubMed record.

    Parameters
    ----------
    soup : parsed record exposing ``find``, ``find_all`` and ``.text``
        (the caller passes a BeautifulSoup document built with "html.parser",
        which is why all tag names are looked up in lower case).

    Returns
    -------
    list of str
        ``[authors, ArticleTitle, date, abstract, keywords, affiliations]``.
        Every field defaults to an empty string and is only filled in when the
        corresponding element exists, so records lacking ``<Article>`` or
        ``<Journal>`` no longer raise ``AttributeError``.
    """
    article = soup.find('article')
    journal = soup.find('journal')

    # Extracting list of authors, formatted as "A. One, B. Two and C. Three, "
    authors = ""
    authorlist = article.find('authorlist') if article is not None else None
    if authorlist is not None:
        names = []
        # Read initials and last name from the same <Author> element so that an
        # author without initials cannot shift the pairing for the others.
        for author in authorlist.find_all('author'):
            last_name = author.find('lastname')
            if last_name is None:
                # No personal name on this entry; nothing to format.
                continue
            initials = author.find('initials')
            if initials is not None:
                names.append(initials.text + '. ' + last_name.text)
            else:
                names.append(last_name.text)
        if len(names) > 1:
            authors = ', '.join(names[:-1]) + ' and ' + names[-1] + ', '
        elif names:
            authors = names[0] + ', '

    # Extracting title of the article
    ArticleTitle = ''
    title_tag = article.find('articletitle') if article is not None else None
    if title_tag is not None:
        title_str = strip_brackets(title_tag.text)
        # If the title ends with a full stop, leave it and put the comma after
        # the quotation marks; otherwise the comma goes inside them.
        if title_str.endswith('.'):
            ArticleTitle = '"' + title_str + '", '
        else:
            ArticleTitle = '"' + title_str + '," '

    # Extracting date of the article as "Mon. YYYY", or whichever part exists
    date = ''
    JournalIssue = journal.find('journalissue') if journal is not None else None
    if JournalIssue is not None:
        date_parts = []
        month_tag = JournalIssue.find('month')
        year_tag = JournalIssue.find('year')
        if month_tag is not None:
            month = month_tag.text
            # Numeric months ("3", "03") are converted to their abbreviation.
            if month.isdecimal() and 1 <= int(month) <= 12:
                month = calendar.month_abbr[int(month)]
            date_parts.append(month)
        if year_tag is not None:
            date_parts.append(year_tag.text)
        date = '. '.join(date_parts)

    # Extracting abstract; structured abstracts have several <AbstractText>
    # sections, so all of them are joined instead of keeping only the first.
    abstract = ''
    abstract_sections = article.find_all('abstracttext') if article is not None else []
    if abstract_sections:
        abstract = '"' + ' '.join(section.text for section in abstract_sections) + '"'

    # Extracting list of keywords (first keyword list only), each followed by ", "
    keywords = ""
    keywordlist = soup.find('keywordlist')
    if keywordlist is not None:
        keywords = ''.join(kw.text + ', ' for kw in keywordlist.find_all('keyword'))

    # Extracting list of affiliations - NB! Duplicates may occur, handle this in later cleaning process
    affiliations = ''.join('"' + aff.text + '", ' for aff in soup.find_all('affiliation'))

    return [authors, ArticleTitle, date, abstract, keywords, affiliations]

articles_list = []

# EFetch endpoint template; the trailing "idlist" placeholder is replaced by one PubMed ID.
EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=idlist"
# NCBI asks clients without an API key to stay at or below 3 requests per second;
# pausing between requests keeps large result counts from being rejected.
REQUEST_PAUSE_SECONDS = 0.34
REQUEST_TIMEOUT_SECONDS = 30

# We loop over each element in the idlist to get the soup and feed it into our function
for link in idlist:
    url = EFETCH_URL.replace('idlist', link)

    # The ssl default-context override was dropped from this loop: `requests`
    # does not use it, so re-applying it on every iteration had no effect here.
    r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of parsing an
    # error page as if it were an article.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    article = get_bibliography(soup)
    articles_list.append(article)
    time.sleep(REQUEST_PAUSE_SECONDS)

# Passing the column names to the constructor also works when no articles were
# found; assigning `df.columns` afterwards raised on an empty frame.
df = pd.DataFrame(
    articles_list,
    columns=['Authors', 'ArticleTitle', 'Date', 'Abstract','Keywords','Affiliations'],
)
file_name = keyword + '_' + str(num) + '.csv'
df.to_csv(file_name)

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=500&sort=relevance&term=type+2+diabetes
['25249787', '29412061', '11800065', '27262256', '27159875', '23520370', '30409037', '34251351', '31247468', '27622231', '29699867', '29395440', '25867358', '27974926', '25027899', '11209329', '30046957', '32186603', '33795463', '10199747', '34565781', '34140928', '30360498', '35595482', '26666144', '33315097', '29914779', '36967777', '36297095', '28326652', '23551885', '28460155', '24527480', '9599750', '12211961', '12891055', '10943809', '11841952', '12092688', '33739145', '16466601', '16085165', '30135724', '34371302', '21525449', '36197636', '10609116', '18806081', '37510368', '35919809', '22135872', '12618558', '16329529', '20958092', '28282717', '8482235', '15823386', '15209090', '32447488', '15315864', '15047665', '27074851', '24070804', '19479186', '11809616', '27073949', '25629355', '20609952', '19019474', '10418073', '17331067', '32331901', '2517252

