In [None]:
# Retrieve PubMed article details, either for the PMIDs scraped from a web page or for the results of a keyword/phrase search
#  see https://biopython-tutorial.readthedocs.io/en/latest/notebooks/09%20-%20Accessing%20NCBIs%20Entrez%20databases.html
# (c) 2022-2023 RENCI, Chapel Hill, NC

In [None]:
import os
import re

import pandas as pd
import requests
import urllib3
from Bio import Entrez
from bs4 import BeautifulSoup

# NCBI E-utilities credentials.
# The API key is a secret and must not live in the notebook: it is read from the
# NCBI_API_KEY environment variable. When the variable is unset the key is None
# and requests are sent without an API key.
# NOTE(review): the key that was previously hard-coded here should be revoked.
Entrez.api_key = os.environ.get('NCBI_API_KEY')
# Contact address sent to NCBI with each request; overridable via NCBI_EMAIL.
Entrez.email = os.environ.get('NCBI_EMAIL', 'hubal@email.unc.edu')
Entrez.sleep_between_tries = 2 # seconds

In [None]:
def collect_articles_from_keyword(keyword, num_articles):
    """Search PubMed for a keyword or phrase and collect details of the matches.

    Parameters
    ----------
    keyword : str
        Search term passed to ESearch against the 'pubmed' database.
    num_articles : int
        Maximum number of articles to return.

    Returns
    -------
    pandas.DataFrame
        One row per article (at most `num_articles` rows), as built by
        collect_article_details().
    """
    limit = max(num_articles, 0)
    # ESearch returns only its default number of IDs (20) unless retmax is
    # given, so ask for exactly as many as the caller wants.
    handle = Entrez.esearch(db='pubmed', term=keyword, retmax=limit)
    try:
        pmids = Entrez.read(handle)['IdList']
    finally:
        handle.close()
    # Truncate the ID list *before* fetching details: every article costs
    # several network round trips, so unused ones should never be fetched.
    return collect_article_details(pmids[0: limit])

def collect_articles_from_website(text):
    """Collect article details for every PMID mentioned in a block of text.

    Parameters
    ----------
    text : str
        Text to scan for PMIDs (e.g. the output of retrieve_xml_from_url()).

    Returns
    -------
    pandas.DataFrame
        The table produced by collect_article_details() for the PMIDs that
        retrieve_pmids() extracts from `text`.
    """
    pmids = retrieve_pmids(text)
    return collect_article_details(pmids)

def collect_article_details(pmids):
    """Fetch each PubMed record and tabulate its key fields.

    Parameters
    ----------
    pmids : iterable
        PubMed IDs to look up, one efetch request per ID.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns defined by create_df(), one row per PMID in
        the order given.
    """
    table = create_df()
    for pmid in pmids:
        record = retrieve_xml_from_pmid(pmid)
        # The PMCID is both a column value and the key for the
        # supplemental-data lookup.
        pmcid = extract_pmcid(record)
        row = [
            pmid,
            pmcid,
            extract_title(record),
            extract_authors(record),
            extract_year(record),
            extract_journal(record),
            extract_keywords(record),
            extract_abstract(record),
            extract_grant(record),
            has_supplemental_data(pmcid),
        ]
        table = append_to_df(table, row)
    return table

def retrieve_xml_from_url(url, timeout=30):
    """Download a page and return its text content.

    Despite the name, the result is not XML: the response body is parsed with
    BeautifulSoup's 'html.parser' and only the text (markup stripped) is
    returned.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    str
        The text of the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser').get_text()

def retrieve_xml_from_pmid(pmid):
    """Fetch one PubMed record and return it parsed by Bio.Entrez.

    Parameters
    ----------
    pmid : int or str
        PubMed ID of the article to fetch.

    Returns
    -------
    The structure produced by Entrez.read() for the efetch XML response; the
    extract_* helpers index it as record['PubmedArticle'][0][...].
    """
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    try:
        return Entrez.read(handle)
    finally:
        # Release the network connection even if parsing fails.
        handle.close()

def retrieve_pmids(text):
    """Extract every PubMed ID that follows a 'PMID' label in `text`.

    Recognises forms such as 'PMID: 12345678', 'PMID:12345678' and
    'PMID 12345678'. At most eight digits are taken per ID, so a number glued
    directly onto the ID by text extraction is not absorbed into it.

    Parameters
    ----------
    text : str
        Text to scan, e.g. the text content of a publications page.

    Returns
    -------
    list of int
        The IDs in order of appearance (duplicates kept). Labels that are not
        followed by digits are skipped, so the result contains only usable
        IDs; an empty or label-free text yields an empty list.
    """
    if not text:
        return []
    return [int(digits) for digits in re.findall(r'PMID\s*:?\s*(\d{1,8})', text)]

def create_df():
    """Return an empty article table with the notebook's standard columns.

    Returns
    -------
    pandas.DataFrame
        A frame with no rows and one column per collected article field.
    """
    column_names = (
        'PMID', 'PMCID', 'Title', 'Author', 'Year', 'Journal', 'Keywords',
        'Abstract', 'Grant', 'Supplemental Data (Y/N)',
    )
    # Each column starts as an empty list, giving a zero-row frame.
    return pd.DataFrame({name: [] for name in column_names})

def append_to_df(df, row):
    """Add `row` to the bottom of `df` and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    row : list
        One value per column of `df`.

    Returns
    -------
    pandas.DataFrame
        The `df` object that was passed in, now one row longer.
    """
    # The new row is labelled with the current row count.
    next_label = len(df.index)
    df.loc[next_label] = row
    return df

def extract_abstract(xml):
    """Return the 'Abstract' entry of the first article in a parsed record.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    The value stored under MedlineCitation/Article/Abstract (returned as-is,
    not flattened to text), or None when the record has no article or no
    abstract.
    """
    try:
        article = xml['PubmedArticle'][0]['MedlineCitation']['Article']
        return article['Abstract']
    except (IndexError, KeyError):
        return None

def extract_authors(xml):
    """Return the authors of the first article in a parsed record.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    list of str or None
        One entry per author: last name immediately followed by initials
        (e.g. 'SmithJA'). An author without initials is listed by last name
        only, and an entry that has a 'CollectiveName' instead of a last name
        is listed under that name. Entries with neither are skipped. None is
        returned when the record has no article or no author list.
    """
    try:
        author_list = xml['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
    except (IndexError, KeyError):
        return None

    authors = []
    for author in author_list:
        # Handle each author on its own so that one incomplete entry does not
        # discard the whole list.
        last_name = author.get('LastName')
        if last_name:
            authors.append(last_name + author.get('Initials', ''))
        elif author.get('CollectiveName'):
            authors.append(author['CollectiveName'])
    return authors

def extract_grant(xml):
    """Return the funding entries of the first article in a parsed record.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    list of str or None
        One 'Agency, GrantID' string per grant. A grant missing either field
        is listed with the field it does have; a grant with neither is
        skipped. None is returned when the record has no article or no grant
        list.
    """
    try:
        grant_list = xml['PubmedArticle'][0]['MedlineCitation']['Article']['GrantList']
    except (IndexError, KeyError):
        return None

    grants = []
    for grant in grant_list:
        # Handle each grant on its own so that one entry without a GrantID
        # does not discard every other grant.
        parts = [grant.get('Agency'), grant.get('GrantID')]
        label = ', '.join(part for part in parts if part)
        if label:
            grants.append(label)
    return grants

def extract_journal(xml):
    """Return the journal title of the first article in a parsed record.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    The value under MedlineCitation/Article/Journal/Title, or None when the
    record has no article or the title is absent.
    """
    try:
        article = xml['PubmedArticle'][0]['MedlineCitation']['Article']
        return article['Journal']['Title']
    except (IndexError, KeyError):
        return None

def extract_keywords(xml):
    """Return the first keyword of the first article in a parsed record.

    NOTE(review): only the first entry of the first keyword list is returned,
    not the full set of keywords -- confirm this is intended.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    The element at MedlineCitation/KeywordList[0][0], or None when the record
    has no article, no keyword list, or an empty keyword list.
    """
    try:
        citation = xml['PubmedArticle'][0]['MedlineCitation']
        first_list = citation['KeywordList'][0]
        return first_list[0]
    except (IndexError, KeyError):
        return None

def extract_pmcid(xml):
    """Return the PubMed Central ID of the first article in a parsed record.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    str or None
        The first entry of PubmedData/ArticleIdList that starts with 'PMC'
        (as a full-length slice of that entry), or None when there is no such
        entry, no article, or no ID list.
    """
    try:
        article_ids = xml['PubmedArticle'][0]['PubmedData']['ArticleIdList']
        pmc_ids = (article_id[:] for article_id in article_ids if article_id.startswith('PMC'))
        return next(pmc_ids, None)
    except (IndexError, KeyError):
        return None

def extract_title(xml):
    """Return the title of the first article in a parsed record.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    The value under MedlineCitation/Article/ArticleTitle, or None when the
    record has no article or no title.
    """
    try:
        return xml['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
    except (IndexError, KeyError):
        # A missing key is treated like a missing article, matching the other
        # extract_* helpers, so one incomplete record cannot abort a whole
        # collection run.
        return None

def extract_year(xml):
    """Return the publication year of the first article in a parsed record.

    Parameters
    ----------
    xml : mapping
        Parsed efetch result, indexed as xml['PubmedArticle'][0][...].

    Returns
    -------
    str or None
        The 'Year' of Journal/JournalIssue/PubDate when present. If PubDate
        has no 'Year' but has a 'MedlineDate' (free-text dates such as
        '2020 Jan-Feb'), the first four-digit number in it is used. '0000' is
        returned when no year can be determined, and None when the record
        contains no article at all.
    """
    try:
        pub_date = xml['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
    except IndexError:
        return None
    except KeyError:
        return '0000'

    if 'Year' in pub_date:
        return pub_date['Year']
    # Some records carry the date only as free text; recover the year from it
    # rather than reporting the '0000' placeholder.
    match = re.search(r'\d{4}', str(pub_date.get('MedlineDate', '')))
    return match.group(0) if match else '0000'
def form_url_from_pmcid(pmcid):
    """Build the PMC search URL whose term is the given PMCID in parentheses.

    Parameters
    ----------
    pmcid : object
        Identifier to search for; it is converted with str().

    Returns
    -------
    str
        The search URL, to which callers may append further query filters.
    """
    return f'http://eutils.ncbi.nlm.nih.gov/pmc/?term=({pmcid!s})'

def has_supplemental_data(pmcid):
    """Report whether a PMC article is listed as having supplementary data.

    The PMC search page is queried for the PMCID combined with the
    'has suppdata' filter; the article is considered to have supplemental
    data unless the page text contains 'No items found'.

    Parameters
    ----------
    pmcid : str or None
        PubMed Central ID of the article, or None/empty when the article has
        no PMCID.

    Returns
    -------
    bool
        False when there is no PMCID or the search finds nothing, True
        otherwise.
    """
    if not pmcid:
        # Without a PMCID there is nothing to look up; skip the web request
        # instead of searching for the literal text "(None)".
        return False
    page_text = retrieve_xml_from_url(form_url_from_pmcid(pmcid) + '+AND+has+suppdata%5Bfilter%5D')
    return 'No items found' not in page_text

In [None]:
# option 1: download the HEAL publications page, pull the PMIDs out of its text,
# and collect the details of each referenced article
data = collect_articles_from_website(
    text=retrieve_xml_from_url(
        url='https://heal.nih.gov/research/publications#translation-of-research-to-practice-for-the-treatment-of-opioid-addiction'
    )
)

In [None]:
# option 2: collect article details for the top PubMed matches of a keyword or phrase
# (search term spelling corrected from 'Hepatitus' to 'Hepatitis')
data = collect_articles_from_keyword('Hepatitis B Virus', 20)