In [1]:
# retrieve Pubmed article details by either scraping a website or using a keyword or phrase
#  see https://biopython-tutorial.readthedocs.io/en/latest/notebooks/09%20-%20Accessing%20NCBIs%20Entrez%20databases.html
# (c) 2022-2023 RENCI, Chapel Hill, NC

In [18]:
import numpy as np
import pandas as pd
import requests
import urllib3
from Bio import Entrez
from bs4 import BeautifulSoup

Entrez.api_key = '7e2310a65401cdf4d5023cda2467c19de708'
Entrez.email = 'hubal@email.unc.edu'
Entrez.sleep_between_tries = 4 # seconds

In [19]:
def collect_articles_from_website(text):
    return collect_article_details(retrieve_pmids(text))


def collect_article_details(pmids):
    df = create_df()
    for pmid in pmids:
        a = retrieve_xml_from_pmid(pmid)
        df = append_to_df(df, [
            pmid, extract_pmcid(a), extract_title(a),
            extract_authors(a), extract_year(a), extract_journal(a),
            extract_keywords(a),
            extract_abstract(a), extract_grant(a)])
    return df

def retrieve_xml_from_url(url):
    q = requests.get(url)
    return BeautifulSoup(q.content, 'html.parser').get_text()

def retrieve_xml_from_pmid(pmid):
    return Entrez.read(Entrez.efetch(db='pubmed', id=pmid, retmode='xml'))

def retrieve_pmids(text):
    i = 1
    r = []
    while i > 0:
        i = text.find('PMID', i+1)
        t = text[i+6: i+14]
        try:
            r.append(int(t))
        except ValueError:
            r.append(-1)
    return r

def create_df():
    return pd.DataFrame({
        'PMID':[], 'PMCID':[], 'Title':[], 'Author':[], 'Year':[], 'Journal':[], 'Keywords':[], 'Abstract':[],
        'Grant':[]})

def append_to_df(df, row):
    df.loc[len(df.index)] = row
    return df

def extract_abstract(xml):
    try:
        abstract = xml['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']
        return abstract
    except IndexError:
        return None
    except KeyError:
        return None

def extract_authors(xml):
    authors = []
    try:
        for i in range (0, len(xml['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList'])):
            lname = xml['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList'][i]['LastName']
            inits = xml['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList'][i]['Initials']
            authors.append(lname + " " + inits)
        return authors
    except IndexError:
        return None
    except KeyError:
        return None

def extract_grant(xml):
    grants = []
    try:
        for i in range (0, len(xml['PubmedArticle'][0]['MedlineCitation']['Article']['GrantList'])):
            grant = xml['PubmedArticle'][0]['MedlineCitation']['Article']['GrantList'][i]['GrantID']
            agency = xml['PubmedArticle'][0]['MedlineCitation']['Article']['GrantList'][i]['Agency']
            grants.append(agency + ', ' + grant)
        return grants
    except IndexError:
        return None
    except KeyError:
        return None

def extract_journal(xml):
    try:
        journal = xml['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['Title']
        return journal
    except IndexError:
        return None
    except KeyError:
        return None

def extract_keywords(xml):
    try:
        keywords = xml['PubmedArticle'][0]['MedlineCitation']['KeywordList'][0][0]
        return keywords
    except IndexError:
        return None
    except KeyError:
        return None
    
def extract_pmcid(xml):
    try:
        pmc = xml['PubmedArticle'][0]['PubmedData']['ArticleIdList']
        j = 0
        for j in pmc:
            if j.startswith('PMC'):
                return j
    except IndexError:
        return None

def extract_title(xml):
    try:
        title = xml['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        return title
    except IndexError:
        return None
def extract_year(xml):
    try:
        year = xml['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year']
        return year
    except IndexError:
        return None
    except KeyError:
        return '0000'

def form_url_from_pmcid(pmcid):
    return 'http://eutils.ncbi.nlm.nih.gov/pmc/?term=(' + str(pmcid) + ')'    

In [20]:
data1 = collect_articles_from_website(retrieve_xml_from_url(
    'https://heal.nih.gov/research/publications#translation-of-research-to-practice-for-the-treatment-of-opioid-addiction'))

  return array(a, dtype, copy=False, order=order)


In [21]:
has_suppdata=[]
for l in data1['PMCID']:
    request = requests.get("https://www.ncbi.nlm.nih.gov/pmc/?term=(" + str(l) + ")+AND+has+suppdata%5Bfilter%5D")
    soupys = BeautifulSoup(request.content, "html.parser")
    texy = soupys.get_text()
    if "No items found" in texy:
        has_suppdata.append('No')
    elif l == None:
        has_suppdata.append('No')
    else:
        has_suppdata.append('Yes')

In [25]:
data1['Supplemental Data'] = has_suppdata
data1['PMID']=data1['PMID'].astype(int)
clean_data = data1.drop([data1.index[191], data1.index[195]]).sort_values('Supplemental Data', ascending=False)
clean_data

Unnamed: 0,PMID,PMCID,Title,Author,Year,Journal,Keywords,Abstract,Grant,Supplemental Data
152,34179783,PMC8190897,Synthesis and immunological effects of C14-lin...,"[Gutman ES, Irvin TC, Morgan JB, Barrientos RC...",2021,RSC chemical biology,,{'AbstractText': ['Active immunization is bein...,"[NIDA NIH HHS, DP1 DA034787, NIDA NIH HHS, UG3...",Yes
69,34606910,PMC8578417,Impact of prenatal exposure characterization o...,"[Massey SH, Allen NB, Pool LR, Miller ES, Poup...",2021,Neurotoxicology and teratology,Birthweight,{'AbstractText': ['A major challenge in prenat...,"[NIMH NIH HHS, R01 MH107652, NIDA NIH HHS, R34...",Yes
137,33229892,PMC8119287,"Glucagon-like peptide-1 receptor agonist, exen...","[Douton JE, Augusto C, Stoltzfus B, Carkaci-Sa...",2021,Behavioural pharmacology,,{'AbstractText': ['Opioid use disorder (OUD) c...,"[NIDA NIH HHS, R01 DA009815, NIDA NIH HHS, R37...",Yes
82,34130965,PMC8207994,Home-based virtual reality for chronic pain: p...,"[Birckhead B, Eberlein S, Alvarez G, Gale R, D...",2021,BMJ open,back pain,{'AbstractText': ['Chronic pain is highly prev...,"[NIAMS NIH HHS, UG3 AR076573, NIAMS NIH HHS, U...",Yes
81,32592504,PMC7765737,Evaluation of human cartilage endplate composi...,"[Wang L, Han M, Wong J, Zheng P, Lazar AA, Kru...",2021,Journal of orthopaedic research : official pub...,T2*,{'AbstractText': ['Cartilage endplate (CEP) bi...,"[NIAMS NIH HHS, P30AR075055, NIAMS NIH HHS, UH...",Yes
...,...,...,...,...,...,...,...,...,...,...
72,33386337,PMC7780957,Site-Level Variation in the Characteristics an...,,2021,Pediatrics,,{'AbstractText': ['Variation in pediatric medi...,"[NICHD NIH HHS, UG1 HD027853, NIH HHS, UG1 OD0...",No
73,33295951,,The Healthy Brain and Child Development Study-...,"[Volkow ND, Gordon JA, Freund MP]",2021,JAMA psychiatry,,,,No
74,35731889,,"Suvorexant ameliorated sleep disturbance, opio...","[Huhn AS, Finan PH, Gamaldo CE, Hammond AS, Um...",2022,Science translational medicine,,{'AbstractText': ['Increased orexin/hypocretin...,"[NIDA NIH HHS, UG3 DA048734]",No
75,36370080,,Biobehavioral Assessments in BACPAC: Recommend...,"[Greco CM, Wasan AD, Schneider MJ, Mehling W, ...",2022,"Pain medicine (Malden, Mass.)",Behavioral assessments,{'AbstractText': ['The Biobehavioral Working G...,,No


In [23]:
datay = clean_data[clean_data['Grant'].isnull()]
datay

Unnamed: 0,PMID,PMCID,Title,Author,Year,Journal,Keywords,Abstract,Grant,Supplemental Data
145,34043402,PMC8422285,Deep brain stimulation of the nucleus accumben...,"[Mahoney JJ, Haut MW, Hodder SL, Zheng W, Land...",2021,Experimental and clinical psychopharmacology,,{'AbstractText': ['Given high relapse rates an...,,Yes
71,33433576,PMC7804920,Neonatal Abstinence Syndrome and Maternal Opio...,"[Hirai AH, Ko JY, Owens PL, Stocks C, Patrick SW]",2021,JAMA,,{'AbstractText': ['Substantial increases in bo...,,Yes
116,33478555,PMC7819318,Non-SUMOylated CRMP2 decreases Na<sub>V</sub>1...,"[Gomez K, Ran D, Madura CL, Moutal A, Khanna R]",2021,Molecular brain,CRMP2,{'AbstractText': ['Voltage-gated sodium channe...,,Yes
111,34752775,PMC8776619,"Synchronized cluster firing, a distinct form o...","[Zheng Q, Xie W, Lückemeyer DD, Lay M, Wang XW...",2022,Neuron,DRG,{'AbstractText': ['Spontaneous pain refers to ...,,Yes
172,34861164,PMC8693769,Robotic high-throughput biomanufacturing and f...,"[Tristan CA, Ormanoglu P, Slamecka J, Malley C...",2021,Stem cell reports,CEPT cocktail,{'AbstractText': ['Efficient translation of hu...,,Yes
188,34447323,PMC8382852,An Effective and Safe Novel Treatment of Opioi...,"[Schiffer F, Khan A, Bolger E, Flynn E, Seltze...",2021,Frontiers in psychiatry,hemispheric laterality,{'AbstractText': ['<b>Background:</b> The opio...,,Yes
5,34598100,PMC8397502,The impact of the national stay-at-home order ...,"[Root ED, Slavova S, LaRochelle M, Feaster DJ,...",2021,Drug and alcohol dependence,COVID-19,{'AbstractText': ['Although national syndromic...,,Yes
25,34495340,PMC8427378,Comparison of Treatment Retention of Adults Wi...,"[Lee JD, Malone M, McDonald R, Cheng A, Vasude...",2021,JAMA network open,,{'AbstractText': ['Extended-release buprenorph...,,Yes
169,35119273,PMC9176367,"Thieno[2,3-<i>d</i>]pyrimidine-Based Positive ...","[Berhane I, Hin N, Thomas AG, Huang Q, Zhang C...",2022,Journal of medicinal chemistry,,{'AbstractText': ['Mas-related G protein-coupl...,,Yes
156,34454912,PMC8388132,Covid-19 interface with drug misuse and substa...,"[Cisneros IE, Cunningham KA]",2021,Neuropharmacology,Central nervous system,{'AbstractText': ['The coronavirus disease 201...,,No


In [26]:
big_data=pd.read_excel(r"/Users/riowombacher/Downloads/HEAL data publications  2.7.23 - suppl data.xlsx")

In [27]:
overlap = pd.merge(left=clean_data, right=big_data, left_on='PMID', right_on='PMID')
overlap

Unnamed: 0,PMID,PMCID,Title,Author,Year,Journal,Keywords,Abstract_x,Grant,Supplemental Data_x,...,FirstAuthor,JournalNameISO,FundingText,MLrelevanceprediction,MultipleLinks,ProjectPubMatch,NewFA,NewProgram,in_heal_website,Supplemental Data_y
0,34179783,PMC8190897,Synthesis and immunological effects of C14-lin...,"[Gutman ES, Irvin TC, Morgan JB, Barrientos RC...",2021,RSC chemical biology,,{'AbstractText': ['Active immunization is bein...,"[NIDA NIH HHS, DP1 DA034787, NIDA NIH HHS, UG3...",Yes,...,"Gutman, Eugene S",RSC Chem Biol,The research reported in this publication was ...,1,Yes,Y,Novel Therapeutic Options for Opioid Use Disor...,Development of Novel Immunotheraputics for Opi...,Yes,Yes
1,33229892,PMC8119287,"Glucagon-like peptide-1 receptor agonist, exen...","[Douton JE, Augusto C, Stoltzfus B, Carkaci-Sa...",2021,Behavioural pharmacology,,{'AbstractText': ['Opioid use disorder (OUD) c...,"[NIDA NIH HHS, R01 DA009815, NIDA NIH HHS, R37...",Yes,...,"Douton, Joaquin E",Behav Pharmacol,Authors thank the National Institute on Drug A...,1,Yes,Y,Novel Therapeutic Options for Opioid Use Disor...,Focused Medications Development to Treat Opioi...,Yes,Yes
2,34130965,PMC8207994,Home-based virtual reality for chronic pain: p...,"[Birckhead B, Eberlein S, Alvarez G, Gale R, D...",2021,BMJ open,back pain,{'AbstractText': ['Chronic pain is highly prev...,"[NIAMS NIH HHS, UG3 AR076573, NIAMS NIH HHS, U...",Yes,...,"Birckhead, Brandon",BMJ Open,This work is supported by the National Institu...,1,Yes,Y,Clinical Research in Pain Management,Back pain consortium (BACPAC),Yes,Yes
3,32592504,PMC7765737,Evaluation of human cartilage endplate composi...,"[Wang L, Han M, Wong J, Zheng P, Lazar AA, Kru...",2021,Journal of orthopaedic research : official pub...,T2*,{'AbstractText': ['Cartilage endplate (CEP) bi...,"[NIAMS NIH HHS, P30AR075055, NIAMS NIH HHS, UH...",Yes,...,"Wang, Linshanshan",J Orthop Res,North American Spine Society; School of Medici...,1,Yes,Y,Clinical Research in Pain Management,Back pain consortium (BACPAC),Yes,Yes
4,32592504,PMC7765737,Evaluation of human cartilage endplate composi...,"[Wang L, Han M, Wong J, Zheng P, Lazar AA, Kru...",2021,Journal of orthopaedic research : official pub...,T2*,{'AbstractText': ['Cartilage endplate (CEP) bi...,"[NIAMS NIH HHS, P30AR075055, NIAMS NIH HHS, UH...",Yes,...,"Wang, Linshanshan",J Orthop Res,North American Spine Society; School of Medici...,1,Yes,Y,Clinical Research in Pain Management,Back pain consortium (BACPAC),Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,33980315,PMC8115873,"Housing, opportunities, motivation and engagem...","[Slesnick N, Chavez L, Bunger A, Famelia R, Fo...",2021,Addiction science & clinical practice,Homelessness,{'AbstractText': ['Homeless youth experience h...,"[NIDA NIH HHS, UH3DA050174]",No,...,"Slesnick, Natasha",Addict Sci Clin Pract,The study is funded by the National Institute ...,1,Yes,Y,New Strategies to Prevent and Treat Opioid Add...,Optimizing Care for People with OUD and Mental...,Yes,No
116,34305753,PMC8294463,The Family Check-Up Online: A Telehealth Model...,"[Stormshak EA, Matulis JM, Nash W, Cheng Y]",2021,Frontiers in psychology,early childhood,{'AbstractText': ['Growing opioid misuse in th...,"[NIDA NIH HHS, P50 DA048756]",No,...,"Stormshak, Elizabeth A",Front Psychol,,1,No,Y,New Strategies to Prevent and Treat Opioid Add...,Prevention,Yes,No
117,33632932,PMC7919109,Phenobarbital and Clonidine as Secondary Medic...,,2021,Pediatrics,,{'AbstractText': ['Despite the neonatal opioid...,"[NIH HHS, UG1 OD024945, NIH HHS, U2C OD023375,...",No,...,"Merhar, Stephanie L",Pediatrics,,1,Yes,Y,Enhanced Outcomes for Infants and Children Exp...,Advancing Clinical Trials for Neonatal Opioid ...,Yes,No
118,33386337,PMC7780957,Site-Level Variation in the Characteristics an...,,2021,Pediatrics,,{'AbstractText': ['Variation in pediatric medi...,"[NICHD NIH HHS, UG1 HD027853, NIH HHS, UG1 OD0...",No,...,"Young, Leslie W",Pediatrics,,1,Yes,Y,Enhanced Outcomes for Infants and Children Exp...,Advancing Clinical Trials for Neonatal Opioid ...,Yes,No


In [36]:
unique=clean_data['PMID'].unique()

In [37]:
len(unique)

191

In [89]:
#Post to github when I put in error codes for -1 and no info at all
#sort by supplemental data Yes on top and then PMID from there


In [None]:
#Spreadsheet to website overlap