In [9]:
from pathlib import Path
import requests, json, time, sys

EMAIL   = "you@example.org"     # polite to include
STEP    = 100_000               # any size you like in text mode
SLEEP   = 0.34                  # 3 req/s without an API key

TERM = (
    '('
    '"Abdomen, Acute"[Majr] OR "Abdominal Pain"[Majr] '
    'OR "acute abdominal pain"[tiab] OR "acute abdomen"[tiab] '
    'OR "Pancreatitis"[Majr]      OR "pancreatitis"[tiab] '
    'OR "Diverticulitis"[Majr]    OR "diverticulitis"[tiab] '
    'OR "Cholecystitis"[Majr]     OR "cholecystitis"[tiab] '
    'OR "Appendicitis"[Majr]      OR "appendicitis"[tiab]'
    ')'
)

BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# ------------------------------------------------------------------
# 1. Seed request – JSON, retmax=0, just to get count + history key
# ------------------------------------------------------------------
seed = requests.get(
    BASE + "esearch.fcgi",
    params=dict(db="pubmed", term=TERM, retmax=0, usehistory="y",
                retmode="json", email=EMAIL)
).json()["esearchresult"]

count, webenv, query_key = int(seed["count"]), seed["webenv"], seed["querykey"]
print(f"Total matches: {count:,}", file=sys.stderr)

# ------------------------------------------------------------------
# 2. Page through results – TEXT mode, rettype=uilist
# ------------------------------------------------------------------
common = dict(db="pubmed", WebEnv=webenv, query_key=query_key,
              rettype="uilist", retmode="text", email=EMAIL)

pmids, page = [], 0
while page < count:
    txt  = requests.get(BASE + "esearch.fcgi",
                        params=dict(retstart=page, retmax=STEP, **common)
                       ).text
    ids  = txt.splitlines()
    pmids.extend(ids)
    page += STEP
    print(f" fetched {len(pmids):,} / {count:,}", end="\r")
    time.sleep(SLEEP)           # honour public 3 req/s limit

print(f"\nDone – saving {len(pmids):,} PMIDs …")
Path("acute_abdpain_plus_inflam_pmids.json").write_text(json.dumps(pmids))


Total matches: 150,262


 fetched 10,014 / 150,262
Done – saving 10,014 PMIDs …


231585

In [10]:
#3 Streaming filter over the MedCPT chunks
import json, numpy as np, mmap
from pathlib import Path

keep = set(json.loads(Path("acute_abdpain_plus_inflam_pmids.json").read_text()))
chunks = sorted(Path(".").glob("pmids_chunk_*.json"))

out_dir = Path("acute_abdpain_subset")
out_dir.mkdir(exist_ok=True)

for pfile in chunks:
    cid = pfile.stem.split("_")[-1]          # e.g. "18"
    pmids   = json.load(pfile.open())
    idx_map = {p:i for i,p in enumerate(pmids) if p in keep}
    if not idx_map:
        continue                             # nothing in this chunk

    # --- slice the embedding matrix without loading it fully ---
    big_embeds = np.load(f"embeds_chunk_{cid}.npy", mmap_mode="r")
    rows       = np.fromiter(idx_map.values(), dtype=np.int64)
    sub_embeds = big_embeds[rows]            # fancy-indexing copies only what you need

    # --- write trimmed files ---
    np.save(out_dir / f"embeds_chunk_{cid}.npy", sub_embeds)
    json.dump(list(idx_map),  open(out_dir / f"pmids_chunk_{cid}.json", "w"))
    full_content = json.load(open(f"pubmed_chunk_{cid}.json"))
    sub_content  = {pm: full_content[pm] for pm in idx_map}
    json.dump(sub_content, open(out_dir / f"pubmed_chunk_{cid}.json", "w"))

    print(f"chunk {cid}: kept {len(idx_map)} articles")


In [None]:
# 4 (Optionally) merge all small chunks into one
mini = sorted(out_dir.glob("embeds_chunk_*.npy"))
all_vecs = np.concatenate([np.load(f) for f in mini])
all_pmids = sum([json.load(open(str(f).replace("embeds", "pmids").replace(".npy", ".json"))) 
                 for f in mini], [])
np.save("acute_abdpain_vectors.npy",  all_vecs)
json.dump(all_pmids, open("acute_abdpain_pmids.json","w"))


In [4]:
FILES = ["pmids_chunk_0.json", "pubmed_chunk_0.json", "embeds_chunk_0.npy"]
QTY = [38, 38, 38]
BASE_LINK = "https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/"
#write one single final linux command to download all files one by one
print("Download command:")
for i in range(len(FILES)):
    for j in range(QTY[i]):
        #replace the index in the filename
        filename = FILES[i].replace("0", str(j))
        #assemble the full URL and command, add semicolon at the end if not the last file
        url = BASE_LINK + filename
        command = f"wget {url}"
        if j < QTY[i] - 1:
            command += ";"
        print(command)

Download command:
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_0.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_1.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_2.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_3.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_4.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_5.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_6.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_7.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_8.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_9.json;
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_10.json;
wget https://ftp.ncbi.nlm.nih.gov/p