Import Libraries

In [1]:
from lxml import etree
import pandas as pd
import requests
from tqdm import tqdm
import numpy as np
import re
from rapidfuzz import process
from transformers import AutoTokenizer, AutoModel
import torch

Creation of drug embeddings

In [2]:
# Path to DrugBank XML
xml_path = r"...\drugbank_all_full_database.xml\full database.xml"  # Replace with your actual path

# Every element in the export is namespaced, so each XPath below uses the `db:` prefix.
ns = {'db': 'http://www.drugbank.ca'}
tree = etree.parse(xml_path)
root = tree.getroot()

# Resource labels (lower-cased) that identify an RxNorm concept id.
# NOTE(review): the DrugBank schema appears to label this resource "RxCUI" rather
# than "RxNorm" -- confirm against your release. Both spellings are accepted so the
# previous behaviour is a strict subset of the new one.
rxnorm_resource_labels = {"rxnorm", "rxcui"}

# One dict per small-molecule drug; turned into a DataFrame at the end.
drug_data = []

for drug in root.findall("db:drug", namespaces=ns):
    drug_type = drug.attrib.get("type")
    if drug_type != "small molecule":  # optional filter
        continue

    drug_id = drug.findtext("db:drugbank-id[@primary='true']", namespaces=ns)
    name = drug.findtext("db:name", namespaces=ns)
    description = drug.findtext("db:description", namespaces=ns)

    # RxNorm ID (optional, may not exist).
    # Only the drug's own <external-identifiers> children are inspected: a `.//`
    # search would also return same-named elements nested deeper in the record
    # (presumably those of target/enzyme polypeptides -- verify against the schema).
    rxnorm_id = None
    for ext_id in drug.findall("db:external-identifiers/db:external-identifier", namespaces=ns):
        # findtext returns None for a missing child instead of raising on `.text`.
        resource = ext_id.findtext("db:resource", namespaces=ns)
        identifier = ext_id.findtext("db:identifier", namespaces=ns)
        if resource and resource.strip().lower() in rxnorm_resource_labels:
            # Treat an empty identifier as "missing" so it can still be filled later.
            rxnorm_id = identifier.strip() if identifier and identifier.strip() else None
            break

    # Synonyms of the drug itself (direct <synonyms> children only); empty
    # <synonym/> elements have text None and are skipped instead of crashing.
    synonyms = [
        s.text.lower()
        for s in drug.findall("db:synonyms/db:synonym", namespaces=ns)
        if s.text
    ]

    drug_data.append({
        "DrugBank_ID": drug_id,
        # Guard against a record without a <name>; None becomes a missing value.
        "Name": name.lower() if name else None,
        "Synonyms": synonyms,
        "Description": description,
        "RxCUI": rxnorm_id
    })

drugbank_df = pd.DataFrame(drug_data)


In [3]:
# Preview the lower-cased drug names collected into drugbank_df.
drugbank_df["Name"]

0               bivalirudin
1                 goserelin
2              gramicidin d
3              desmopressin
4                cetrorelix
                ...        
13161                tdi-01
13162          ibuzatrelvir
13163          cetyl oleate
13164    cetyl myristoleate
13165    cetyl palmitoleate
Name: Name, Length: 13166, dtype: object

In [None]:
def get_rxcui(drug_name, timeout=10):
    """Look up the RxNorm concept id (RxCUI) for a drug name via the RxNav REST API.

    Parameters
    ----------
    drug_name : str
        Name to search for. It is passed as a query parameter so that spaces and
        reserved characters (``&``, ``#``, ``+`` ...) are URL-encoded correctly.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a single
        stalled connection cannot hang a long lookup loop.

    Returns
    -------
    str or None
        The first id listed under ``idGroup.rxnormId`` in the response, or ``None``
        when the name is unknown, the response is malformed, or the request fails.
    """
    try:
        response = requests.get(
            "https://rxnav.nlm.nih.gov/REST/rxcui.json",
            params={"name": drug_name},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()["idGroup"]["rxnormId"][0]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: network/HTTP errors, invalid JSON and "no match"
        # payloads all mean "no id available". KeyboardInterrupt still propagates.
        return None

# Back-fill RxCUI through the RxNav lookup for every row that has none yet.
for idx, record in tqdm(drugbank_df.iterrows(), total=len(drugbank_df)):
    if not pd.isna(record["RxCUI"]):
        continue  # already has an id, nothing to look up
    drugbank_df.at[idx, "RxCUI"] = get_rxcui(record["Name"])


  1%|▌                                                                            | 90/13166 [01:14<2:56:29,  1.23it/s]

In [None]:
# Read the MIMIC prescriptions table
mimic = pd.read_csv(r'...\PRESCRIPTIONS.csv.gz')

# Normalise the generic drug names (lower-case, trimmed) and collect the
# distinct non-null values.
mimic["DRUG_NAME_GENERIC"] = (
    mimic["DRUG_NAME_GENERIC"]
    .str.lower()
    .str.strip()
)
generics = mimic["DRUG_NAME_GENERIC"].dropna().unique()

# Query RxNav once per distinct generic name; names without a hit are left out.
mimic_to_rxcui = []
for name in tqdm(generics):
    rxcui = get_rxcui(name)
    if not rxcui:
        continue
    mimic_to_rxcui.append({"Generic_Name": name, "RxCUI": rxcui})

mimic_df = pd.DataFrame(mimic_to_rxcui)


In [None]:
# Inner-join the DrugBank and MIMIC tables on their shared RxCUI key.
matched_df = drugbank_df.merge(mimic_df, on="RxCUI")

# Resulting columns: DrugBank_ID, Name, Synonyms, Description, RxCUI, Generic_Name
print(matched_df.head())


In [230]:
# Load the PharmBERT tokenizer and encoder used for the embeddings.
pharmbert_checkpoint = "Lianglab/PharmBERT-uncased"
tokenizer = AutoTokenizer.from_pretrained(pharmbert_checkpoint)
model = AutoModel.from_pretrained(pharmbert_checkpoint)

# Put the encoder in evaluation mode before running inference.
model.eval()

# Embed text (description, generic name, etc.)
def get_embedding(text):
    """Return a mean-pooled embedding of ``text`` from the loaded ``model``.

    Parameters
    ----------
    text : str
        Text to embed (e.g. a drug description).

    Returns
    -------
    numpy.ndarray or None
        ``last_hidden_state`` averaged over ``dim=1`` and squeezed, converted to
        NumPy; ``None`` when ``text`` is not a string (``None``, ``NaN``, lists, ...).
    """
    # isinstance alone covers None and NaN. Unlike pd.isnull it never returns an
    # array for list-like input, which would make the boolean test raise ValueError.
    if not isinstance(text, str):
        return None
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    # Average over dim=1 (presumably the token axis), then drop singleton dimensions.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Embed every drug description, one row at a time, into a new 'embedding' column
matched_df["embedding"] = matched_df["Description"].map(get_embedding)


Some weights of BertModel were not initialized from the model checkpoint at Lianglab/PharmBERT-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [231]:
# get_embedding returns None for rows without a usable Description, and np.stack
# cannot combine None with vectors. Each missing entry is replaced by a NaN vector
# of the same shape/dtype so that row i of the .npy file still matches row i of the CSV.
embedding_list = list(matched_df["embedding"])
reference_vector = next(
    (vec for vec in embedding_list if isinstance(vec, np.ndarray)), None
)
if reference_vector is None:
    raise ValueError("No embeddings were computed; nothing to save.")
embedding_matrix = np.stack([
    vec if isinstance(vec, np.ndarray) else np.full_like(reference_vector, np.nan)
    for vec in embedding_list
])

np.save(r'..\drug_embeddings.npy', embedding_matrix)
# The mapping table is written without the embedding column (stored in the .npy file).
matched_df.drop("embedding", axis=1).to_csv(r'...\drugbank_mimic_rxcui_map.csv', index=False)


In [246]:
# Print the DrugBank/MIMIC mapping table as plain text.
print(matched_df)

    DrugBank_ID                            Name  \
0       DB00115                  cyanocobalamin   
1       DB00126                   ascorbic acid   
2       DB00126                   ascorbic acid   
3       DB00130                     l-glutamine   
4       DB00136                      calcitriol   
..          ...                             ...   
494     DB14488               ferrous gluconate   
495     DB14509               lithium carbonate   
496     DB14548  zinc sulfate, unspecified form   
497     DB14681                       cortisone   
498     DB15566            prednisolone acetate   

                                              Synonyms  \
0    ['cianocobalamina', 'cyanocob(iii)alamin', 'cy...   
1    ['acide ascorbique', 'ácido ascórbico', 'acidu...   
2    ['acide ascorbique', 'ácido ascórbico', 'acidu...   
3    ['(2s)-2-amino-4-carbamoylbutanoic acid', '(2s...   
4    ['(1s,3r,5z,7e)-9,10-secocholesta-5,7,10-trien...   
..                                     