In [1]:
# retrieve Pubmed article details by either scraping a website or using a keyword or phrase
#  see https://biopython-tutorial.readthedocs.io/en/latest/notebooks/09%20-%20Accessing%20NCBIs%20Entrez%20databases.html
# (c) 2022-2023 RENCI, Chapel Hill, NC

In [102]:
import numpy as np
import pandas as pd
import requests
import urllib3
from Bio import Entrez
from bs4 import BeautifulSoup

Entrez.api_key = '7e2310a65401cdf4d5023cda2467c19de708'
Entrez.email = 'hubal@email.unc.edu'
Entrez.sleep_between_tries = 4 # seconds

In [118]:
def collect_articles_from_keyword(keyword, num_articles):
    handle = Entrez.esearch(db='pubmed', term=keyword)
    l = collect_article_details(Entrez.read(handle)['IdList'])
    if len(l) > num_articles:
        return l[0: num_articles]
    else:
        return l

def collect_articles_from_website(text):
    return collect_article_details(retrieve_pmids(text))


def collect_article_details(pmids):
    df = create_df()
    for pmid in pmids:
        a = retrieve_xml_from_pmid(pmid)
        df = append_to_df(df, [
            pmid, extract_pmcid(a), extract_title(a),
            extract_authors(a), extract_year(a), extract_journal(a),
            extract_keywords(a),
            extract_abstract(a), extract_grant(a)])
    return df

def retrieve_xml_from_url(url):
    q = requests.get(url)
    return BeautifulSoup(q.content, 'html.parser').get_text()

def retrieve_xml_from_pmid(pmid):
    return Entrez.read(Entrez.efetch(db='pubmed', id=pmid, retmode='xml'))

def retrieve_pmids(text):
    i = 1
    r = []
    while i > 0:
        i = text.find('PMID', i+1)
        t = text[i+6: i+14]
        try:
            r.append(int(t))
        except ValueError:
            r.append(-1)
    return r

def create_df():
    return pd.DataFrame({
        'PMID':[], 'PMCID':[], 'Title':[], 'Author':[], 'Year':[], 'Journal':[], 'Keywords':[], 'Abstract':[],
        'Grant':[]})

def append_to_df(df, row):
    df.loc[len(df.index)] = row
    return df

def extract_abstract(xml):
    try:
        abstract = xml['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']
        return abstract
    except IndexError:
        return None
    except KeyError:
        return None

def extract_authors(xml):
    authors = []
    try:
        for i in range (0, len(xml['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList'])):
            lname = xml['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList'][i]['LastName']
            inits = xml['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList'][i]['Initials']
            authors.append(lname + " " + inits)
        return authors
    except IndexError:
        return None
    except KeyError:
        return None

def extract_grant(xml):
    grants = []
    try:
        for i in range (0, len(xml['PubmedArticle'][0]['MedlineCitation']['Article']['GrantList'])):
            grant = xml['PubmedArticle'][0]['MedlineCitation']['Article']['GrantList'][i]['GrantID']
            agency = xml['PubmedArticle'][0]['MedlineCitation']['Article']['GrantList'][i]['Agency']
            grants.append(agency + ', ' + grant)
        return grants
    except IndexError:
        return None
    except KeyError:
        return None

def extract_journal(xml):
    try:
        journal = xml['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['Title']
        return journal
    except IndexError:
        return None
    except KeyError:
        return None

def extract_keywords(xml):
    try:
        keywords = xml['PubmedArticle'][0]['MedlineCitation']['KeywordList'][0][0]
        return keywords
    except IndexError:
        return None
    except KeyError:
        return None
    
def extract_pmcid(xml):
    try:
        pmc = xml['PubmedArticle'][0]['PubmedData']['ArticleIdList']
        j = 0
        for j in pmc:
            if j.startswith('PMC'):
                return j
    except IndexError:
        return None

def extract_title(xml):
    try:
        title = xml['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        return title
    except IndexError:
        return None
def extract_year(xml):
    try:
        year = xml['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year']
        return year
    except IndexError:
        return None
    except KeyError:
        return '0000'

def form_url_from_pmcid(pmcid):
    return 'http://eutils.ncbi.nlm.nih.gov/pmc/?term=(' + str(pmcid) + ')'

#def has_supplemental_data(pmcid):
    #t = retrieve_xml_from_url(form_url_from_pmcid(pmcid) + '+AND+has+suppdata%5Bfilter%5D')
    #if 'No items found' in t:
        #return False
    #else:
        #return True
    

In [119]:
# option 1: retrieve article information from scraping a website
data1 = collect_articles_from_website(retrieve_xml_from_url(
    'https://heal.nih.gov/research/publications#translation-of-research-to-practice-for-the-treatment-of-opioid-addiction'))

  return array(a, dtype, copy=False, order=order)


In [105]:
# option 2: extract article information related to a given string
data2 = collect_articles_from_keyword('Hepatitus B Virus', 10)

In [121]:
has_suppdata=[]
for l in data1['PMCID']:
    request = requests.get("https://www.ncbi.nlm.nih.gov/pmc/?term=(" + str(l) + ")+AND+has+suppdata%5Bfilter%5D")
    soupys = BeautifulSoup(request.content, "html.parser")
    texy = soupys.get_text()
    if "No items found" in texy:
        has_suppdata.append('No')
    elif l == None:
        has_suppdata.append('No')
    else:
        has_suppdata.append('Yes')

In [128]:
data1['Supplemental Data'] = has_suppdata
data1['PMID']=data1['PMID'].astype(int)
clean_data = data1.drop([data1.index[191], data1.index[195]]).sort_values('Supplemental Data', ascending=False)
clean_data

Unnamed: 0,PMID,PMCID,Title,Author,Year,Journal,Keywords,Abstract,Grant,Supplemental Data
151,34213886,PMC8328003,Kratom Alkaloids as Probes for Opioid Receptor...,"[Chakraborty S, Uprety R, Daibani AE, Rouzic V...",2021,ACS chemical neuroscience,Respiration,{'AbstractText': ['Dry leaves of kratom (mitra...,"[NIDA NIH HHS, UH3 DA048379, NIDA NIH HHS, R21...",Yes
68,34174513,PMC8246150,Developmental heatmaps of brain functional con...,"[Chen H, Liu J, Chen Y, Salzwedel A, Cornea E,...",2021,Developmental cognitive neuroscience,Childhood,{'AbstractText': ['Different functional networ...,"[NIDA NIH HHS, R01 DA042988, NIDA NIH HHS, R01...",Yes
81,32592504,PMC7765737,Evaluation of human cartilage endplate composi...,"[Wang L, Han M, Wong J, Zheng P, Lazar AA, Kru...",2021,Journal of orthopaedic research : official pub...,T2*,{'AbstractText': ['Cartilage endplate (CEP) bi...,"[NIAMS NIH HHS, P30AR075055, NIAMS NIH HHS, UH...",Yes
80,33547944,PMC8697722,Measurement of vertebral endplate bone marrow ...,"[Fields AJ, Ballatori A, Han M, Bailey JF, McC...",2021,European spine journal : official publication ...,Bone marrow,{'AbstractText': ['Vertebral endplate bone mar...,"[NIAMS NIH HHS, R01 AR063705, NIAMS NIH HHS, U...",Yes
136,33631465,PMC8009840,Verifying the role of 3-hydroxy of 17-cyclopro...,"[Huang B, Gunta R, Wang H, Li M, Cao D, Mendez...",2021,Bioorganic chemistry,3-Hydroxy group,"{'AbstractText': ['In the present study, the r...","[NIDA NIH HHS, P30 DA013429, NIDA NIH HHS, R01...",Yes
...,...,...,...,...,...,...,...,...,...,...
72,33386337,PMC7780957,Site-Level Variation in the Characteristics an...,,2021,Pediatrics,,{'AbstractText': ['Variation in pediatric medi...,"[NICHD NIH HHS, UG1 HD027853, NIH HHS, UG1 OD0...",No
73,33295951,,The Healthy Brain and Child Development Study-...,"[Volkow ND, Gordon JA, Freund MP]",2021,JAMA psychiatry,,,,No
74,35731889,,"Suvorexant ameliorated sleep disturbance, opio...","[Huhn AS, Finan PH, Gamaldo CE, Hammond AS, Um...",2022,Science translational medicine,,{'AbstractText': ['Increased orexin/hypocretin...,"[NIDA NIH HHS, UG3 DA048734]",No
75,36370080,,Biobehavioral Assessments in BACPAC: Recommend...,"[Greco CM, Wasan AD, Schneider MJ, Mehling W, ...",2022,"Pain medicine (Malden, Mass.)",Behavioral assessments,{'AbstractText': ['The Biobehavioral Working G...,,No


In [123]:
datay = data1[data1['Grant'].isnull()]
datay

Unnamed: 0,PMID,PMCID,Title,Author,Year,Journal,Keywords,Abstract,Grant,Supplemental Data
5,34598100,PMC8397502,The impact of the national stay-at-home order ...,"[Root ED, Slavova S, LaRochelle M, Feaster DJ,...",2021.0,Drug and alcohol dependence,COVID-19,{'AbstractText': ['Although national syndromic...,,Yes
24,33487517,,Justice community opioid innovation network (J...,"[Knight D, Becan J, Olson D, Davis NP, Jones J...",2021.0,Journal of substance abuse treatment,Implementation science,{'AbstractText': ['Recognizing the current opi...,,No
25,34495340,PMC8427378,Comparison of Treatment Retention of Adults Wi...,"[Lee JD, Malone M, McDonald R, Cheng A, Vasude...",2021.0,JAMA network open,,{'AbstractText': ['Extended-release buprenorph...,,Yes
44,34139948,,Examining the Relationship between Social Conn...,"[Cance JD, Saavedra LM, Wondimu B, Scaglione N...",2021.0,Substance use & misuse,Opioids,{'AbstractText': ['We used a Boolean search st...,,No
46,34080556,PMC8028689,Impact of COVID-19 on service delivery for an ...,"[Cruden G, Campbell M, Saldana L]",2021.0,Journal of substance abuse treatment,COVID-19,"{'AbstractText': ['The novel coronavirus, COVI...",,No
70,33386340,,It Is Time to ACT NOW to Improve Quality for O...,"[Patrick SW, Lorch SA]",2021.0,Pediatrics,,,,No
71,33433576,PMC7804920,Neonatal Abstinence Syndrome and Maternal Opio...,"[Hirai AH, Ko JY, Owens PL, Stocks C, Patrick SW]",2021.0,JAMA,,{'AbstractText': ['Substantial increases in bo...,,Yes
73,33295951,,The Healthy Brain and Child Development Study-...,"[Volkow ND, Gordon JA, Freund MP]",2021.0,JAMA psychiatry,,,,No
75,36370080,,Biobehavioral Assessments in BACPAC: Recommend...,"[Greco CM, Wasan AD, Schneider MJ, Mehling W, ...",2022.0,"Pain medicine (Malden, Mass.)",Behavioral assessments,{'AbstractText': ['The Biobehavioral Working G...,,No
76,36315069,,Deep Learning for Multi-Tissue Segmentation an...,"[Hess M, Allaire B, Gao KT, Tibrewala R, Inamd...",2022.0,"Pain medicine (Malden, Mass.)",BACPAC,{'AbstractText': ['In vivo retrospective study...,,No


In [89]:
#Post to github when I put in error codes for -1 and no info at all
#sort by supplemental data Yes on top and then PMID from there
