In [1]:
import pandas as pd 
import numpy as np

In [2]:
data = pd.read_csv('/Users/jonathantao/Downloads/narcolepsyscrape.csv', encoding='windows-1252')
data = data.drop_duplicates(keep='first', subset=['pubmedurls_Article_PMID'])
data.head()

Unnamed: 0,MoA,pubmedurls_Article_Title,pubmedurls_Article_URL,pubmedurls_Article_Authors,pubmedurls_Article_PMID,pubmedurls_Article_Misc
0,Agonist,Pitolisant versus placebo or modafinil in pati...,https://pubmed.ncbi.nlm.nih.gov/24107292/,"Dauvilliers Y, Bassetti C, Lammers GJ, Arnulf ...",24107292,Lancet Neurol. 2013 Nov;12(11):1068-75. doi: 1...
1,,Evaluation of the abuse potential of pitolisan...,https://pubmed.ncbi.nlm.nih.gov/31626696/,"Setnik B, McDonnell M, Mills C, Scart-GrÃ¨s C,...",31626696,Sleep. 2020 Apr 15;43(4):zsz252. doi: 10.1093/...
2,,"Safety, Tolerability, and Pharmacokinetics of ...",https://pubmed.ncbi.nlm.nih.gov/32399853/,"Nirogi R, Mudigonda K, Bhyrapuneni G, Muddana ...",32399853,Clin Drug Investig. 2020 Jul;40(7):603-615. do...
3,,Pharmacokinetics of pitolisant in children and...,https://pubmed.ncbi.nlm.nih.gov/31978866/,"Lecendreux M, Plazzi G, Franco P, Jacqz-Aigrai...",31978866,Sleep Med. 2020 Feb;66:220-226. doi: 10.1016/j...
4,,An inverse agonist of the histamine H(3) recep...,https://pubmed.ncbi.nlm.nih.gov/18295497/,"Lin JS, Dauvilliers Y, Arnulf I, Bastuji H, An...",18295497,Neurobiol Dis. 2008 Apr;30(1):74-83. doi: 10.1...


In [3]:
from pymed import PubMed
from bs4 import BeautifulSoup
import requests
import re
import math

In [4]:
# create a PubMed object
pubmed = PubMed(tool="macOS VSCode", email="beneric1232003@gmail.com")

# set the PMID of the article you want to fetch
#pmid = "33037856"
pmid = "35193545"
print(pmid)

# search for the article using its PMID
article = pubmed.query(pmid)
article_content = next(article)

# extract the article's abstract
abstract = article_content.abstract

# print out the abstract
print(article_content.title)
print(abstract)


35193545
Optimal dose determination of enerisant (TS-091) for patients with narcolepsy: two randomized, double-blind, placebo-controlled trials.
The histamine H3 receptor has emerged as one of the most promising targets of novel pharmacotherapy for narcolepsy. Studies now aim to investigate the optimal dose of enerisant, a novel H3 antagonist/inverse agonist, for the treatment of excessive daytime sleepiness in patients with narcolepsy.
We conducted two phase 2, fixed-dose, double-blind, randomized, placebo-controlled trials in patients with narcolepsy. The first phase 2 study (Study 1) was conducted to investigate the efficacy and safety of enerisant at dosages of 25, 50, and 100 mg/day administered for 3 weeks based on the results of a phase 1 study conducted on healthy volunteers. The primary endpoint was mean sleep latency in maintenance of wakefulness test (MWT), and the secondary endpoint was the total score on the Epworth Sleepiness Scale (ESS). The dosages of enerisant in the s

In [5]:
import csv 
with open('narc.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Indicator", "MoA", "Title", "URL", "Author", "PMID", "Abstract", "Trial ID", "Phase", "Date"])
    i = 0
    currMoa = "None"
    for index, row in data.iterrows(): 
        print(i)
        #PMID 
        ID = str(row['pubmedurls_Article_PMID'])
        
        #Abstract 
        try:
            article = pubmed.query(ID)
            article_content = next(article)
            abstract = article_content.abstract
            title = article_content.title
        except:
            print("Could not find abstract for: " + ID)
        
        #Mesh Terms and URL
        terms_list = []
        url = row["pubmedurls_Article_URL"]
        pmid = ID
        req = requests.get(url)
        soup = BeautifulSoup(req.content, "html.parser")
        try:
            mesh_section = soup.find("div", attrs={'id': 'mesh-terms', 'class': 'mesh-terms keywords-section'})
            keywords_list = mesh_section.find("ul", attrs={'class': 'keywords-list'})
            li_list = keywords_list.find_all("li")
            for li_indiv in li_list:
                try:
                    text_div = li_indiv.find("div", attrs={'class': 'keyword-actions dropdown-block'})
                    button_div = text_div.find("button")
                    #print(button_div.text)
                    button_text_raw = button_div.text
                    button_text_cleaned = button_text_raw.strip()
                    terms_list.append(button_text_cleaned)
                except:
                    pass
        except:
            print("Could not find mesh terms for: " + ID)
        abstract+=' '.join(terms_list)
    
        #MoA
        if isinstance(row["MoA"], str):
            moa = row["MoA"]
            currMoa = moa
        else: 
            moa = currMoa
        
        #Title and Authors
        title = row["pubmedurls_Article_Title"]
        authors = row["pubmedurls_Article_Authors"]
        
        #date 
        raw_data = row["pubmedurls_Article_Misc"]
        dates_regex = "(\d\d\d\d.*?);"
        dates = re.findall(f"{dates_regex}", raw_data)[0]
        
        #phase 
        req = requests.get(url)
        soup = BeautifulSoup(req.content, "html.parser")
        try:
            link = soup.find("a", attrs={'title': 'See in ClinicalTrials.gov'})
            raw_text = link.text
            trial_id = raw_text.strip()
            URL = "https://clinicaltrials.gov/ct2/show/" + trial_id
            try: 
                page = requests.get(URL)
                soup = BeautifulSoup(page.content, "html.parser")
                table_contents = soup.find_all("span", attrs={'style': 'display:block;margin-bottom:1ex;'})
                phase = table_contents[-1].text
            except:
                phase = "Not Found"
        except:
            trial_id = "None"
            phase = "Not Found"
        if phase.find("Phase") == -1:
            phase = "Not Found"    

        
        #write
        writer.writerow(["Narcolepsy", moa, title, url, authors, pmid, abstract, trial_id, phase, dates])
        i+=1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159


IndexError: list index out of range