In [1]:
from semlib import Session
import os
import csv
from dotenv import load_dotenv
from collections import Counter
from Bio import Entrez

In [None]:
# # Always include your email (required by NCBI)
# Entrez.email = "suchit.bhayani@gmail.com"

# # Define your PubMed query
# query = '"drug repurposing"[Title/Abstract] AND "Alzheimer"[Title/Abstract] AND ("2022"[Date - Publication] : "2025"[Date - Publication])'

# # Run the search and store results in history
# search_results = Entrez.read(
#     Entrez.esearch(
#         db="pubmed",
#         term=query,
#         usehistory="y"
#     )
# )

# count = int(search_results["Count"])
# print(f"Found {count} results")

# # Prepare output CSV
# output_filename = "PubMed_Conversations.csv"

# # Define the CSV header (so your reading code works)
# header = ["pmid", "title", "abstract"]

# batch_size = 20

# with open(output_filename, "w", newline="", encoding="latin-1", errors="ignore") as f_out:
#     writer = csv.writer(f_out)
#     writer.writerow(header)

#     for start in range(0, count, batch_size):
#         end = min(count, start + batch_size)
#         print(f"Downloading records {start + 1} to {end}...")

#         # Fetch data in XML for easy parsing
#         handle = Entrez.efetch(
#             db="pubmed",
#             rettype="abstract",
#             retmode="xml",
#             retstart=start,
#             retmax=batch_size,
#             webenv=search_results["WebEnv"],
#             query_key=search_results["QueryKey"]
#         )

#         records = Entrez.read(handle)
#         handle.close()

#         for article in records["PubmedArticle"]:
#             pmid = article["MedlineCitation"]["PMID"]
#             article_data = article["MedlineCitation"]["Article"]
#             title = article_data.get("ArticleTitle", "")
#             abstract_parts = article_data.get("Abstract", {}).get("AbstractText", [])
#             abstract = " ".join(abstract_parts)
            
#             # Write to CSV
#             writer.writerow([pmid, title, abstract])

# print(f"✅ Saved {count} abstracts to '{output_filename}'")

In [2]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
MAX_CONCURRENCY=5


session = Session(model="openai/gpt-4.1-mini", max_concurrency=MAX_CONCURRENCY)

In [3]:
with open("PubMed_Conversations.csv", encoding="latin-1") as f_in:
    csv_file = csv.reader(f_in)
    header = next(csv_file)
    abstracts = [dict(zip(header, row, strict=False)) for row in csv_file]

print(f"Loaded {len(abstracts)} abstracts\n")
print(f"Example abstract: {abstracts[0]['abstract']}")

Loaded 246 abstracts

Example abstract: BackgroundDrug repurposing offers a rapid, cost-effective approach for discovering therapies against multiple targets.ObjectiveHere, we screen virtual ligand libraries consisting of 3468 approved drugs against 11 protein targets associated with Alzheimer's disease (AD).MethodsWe employ blind molecular docking, and target amyloid- (A), microtubule-associated protein tau (MAPT), Apolipoprotein E4 (APOE4), acetylcholinesterase (AChE), butyrylcholinesterase (BChE), amyloid- protein precursor (APP), -secretase (BACE1), brain-derived neurotrophic factor (BDNF), presenilin 1 (PSEN1) and 2 (PSEN2), and -synuclein (SNCA) proteins using AutoDock Vina.ResultsNotably, multitarget binding recurs among the top-10 ligands with Ergotamine and Dihydroergotamine potentially binding 8; Dutasteride 7; Drospirenone and Nilotinib 6; Adapalene and Conivaptan 5; Bromocriptine 4; and Rolapitant, Irinotecan, Plerixafor, Saquinavir, and Telmisartan 3, out of 11 protein tar

In [7]:
extracted_candidates = await session.map(
    abstracts,
    template=lambda r: f"""
Extract the drug repurposing candidates. Repond only with the candidates, separated by a comma and space. If none were mentioned, respond with 'none'
{r['abstract']}
""".strip(),
)

In [8]:
all_items = []
for entry in extracted_candidates:
    parts = [item.strip().lower() for item in entry.split(",")]
    all_items.extend(parts)

# Step 2: Count occurrences
counts = Counter(all_items)

# Step 3: (optional) sort by most common
for item, count in counts.most_common():
    print(f"{item}: {count}")

none: 92
metformin: 12
baricitinib: 6
nilotinib: 4
simvastatin: 4
dihydroergotamine: 3
atorvastatin: 3
fluphenazine: 3
diazoxide: 3
tofacitinib: 3
sildenafil: 3
donepezil: 3
losartan: 3
bromocriptine: 2
irinotecan: 2
saquinavir: 2
levothyroxine: 2
letrozole: 2
celecoxib: 2
bazedoxifene: 2
fluspirilene: 2
lisuride: 2
ibudilast: 2
rg2833: 2
dibenzoylmethane: 2
bt-11: 2
gabapentin: 2
antineoplastics: 2
ruxolitinib: 2
ponatinib: 2
imatinib mesylate: 2
miconazole: 2
fluvoxamine: 2
aspirin: 2
dolutegravir: 2
fluoxetine: 2
empagliflozin: 2
ceftriaxone: 2
dimethyl fumarate: 2
tacrolimus: 2
cyclosporine: 2
cromoglicate: 2
etodolac: 2
nicotine: 2
bbb-crossing ace-inhs: 2
sirolimus: 2
beta blockers: 2
statins: 2
rosiglitazone: 2
pioglitazone: 2
ergotamine: 1
dutasteride: 1
drospirenone: 1
adapalene: 1
conivaptan: 1
rolapitant: 1
plerixafor: 1
telmisartan: 1
candesartan: 1
tolvaptan: 1
netupitant: 1
orantinib: 1
bromodomain inhibitors: 1
clemastine: 1
pitolisant: 1
serotonergic gpcr agonists (5-ht