In [52]:
import pandas as pd
import random
import requests
import re
import aiohttp
import asyncio
import nest_asyncio
import time
from time import sleep
from tqdm import tqdm
import json

In [3]:
# only needs to be run once to init async use
nest_asyncio.apply()

In [5]:
# load in csv of drugnames to normalize 
# drugname csv pipeline in drugname_load_in.ipynb
drug_df = pd.read_csv("all_drugnames.csv")
drug_df = drug_df.drop_duplicates()
drug_df['drugname_clean'] = drug_df['drugname'].str.lower().str.strip()

In [7]:
drug_df.head(5)

Unnamed: 0,drugname,drugname_clean
0,LETROZOLE,letrozole
1,LAPATINIB,lapatinib
2,FULVESTRANT,fulvestrant
3,CAPECITABINE,capecitabine
4,TRASTUZUMAB,trastuzumab


In [13]:
# preprocesses drug names
def strip_dose_form(text):
    if not isinstance(text, str):
        return ""

    # Lowercase and remove content in parentheses
    text = text.lower()
    text = re.sub(r"\([^)]*\)", "", text)

    # Remove dosage + form terms
    text = re.sub(r"\b\d+(\.\d+)?\s?(mg|mcg|g|ml|iu|units?)\b", "", text)
    text = re.sub(r"\b(tablet|tab|capsule|cap|injection|inj|oral|solution|suspension|spray|patch|cream|ointment|drop|dose|film)\b", "", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Remove leading/trailing non-alphanumeric (keep slashes & dashes inside)
    text = re.sub(r"^[^\w]+|[^\w]+$", "", text)

    return text.strip()


drug_df['rxnorm_input'] = drug_df['drugname_clean'].apply(strip_dose_form)
unique_inputs = sorted(drug_df['rxnorm_input'].dropna().unique())

In [62]:
print(f"Found {len(unique_inputs)} unique individual drug parts to normalize.")

Found 462849 unique individual drug parts to normalize.


In [17]:
def get_score(candidate):
    return float(candidate.get("score", 0))

In [64]:
# rxnorm querying function
# single function call -> single drug query
async def get_rxnorm_best_matches(session, name, attempt=1):
    url = "https://rxnav.nlm.nih.gov/REST/approximateTerm.json"
    params = {"term": name, "maxEntries": 15}
    # sleep to delay api calls
    await asyncio.sleep(0.2)

    try:
        async with session.get(url, params=params, timeout=5) as r:
            # time-out api error
            if r.status == 429:
                # exponential delay for each repeat attempt
                wait = 2 ** attempt
                await asyncio.sleep(wait)
                if attempt < 4:
                    return await get_rxnorm_best_matches(session, name, attempt + 1)
                else:
                    # populate list to possibly retry later if needed
                    failed_429_inputs.add(name)
                    return name, None, None, None, None

            elif r.status == 200:
                data = await r.json()
                candidates = data.get("approximateGroup", {}).get("candidate", [])
                # if the rxnorm info returns no known synonyms return tuple of Nones
                if not candidates:
                    return name, None, None, None, None

                # Sort candidates by score
                candidates = sorted(candidates, key=get_score, reverse=True)
                best = candidates[0]
                
                # highest score drug name (regardless of drug database source
                best_name = best.get("name", name)
                best_rxcui = best.get("rxcui")
                
                # get the best rxnorm source specific name if it exists
                for c in candidates:
                    rxcui = c.get("rxcui")
                    if not rxcui:
                        continue
                    props_url = f"https://rxnav.nlm.nih.gov/REST/rxcui/{rxcui}/properties.json"
                    try:
                        async with session.get(props_url, timeout=5) as pr:
                            if pr.status == 200:
                                props = await pr.json()
                                meta = props.get("properties", {})
                                if meta.get("suppress") != "Y" and meta.get("name"):
                                    if c.get("source") == "RXNORM":
                                        return name, best_name, best_rxcui, meta["name"], rxcui
                    except:
                        continue
                # if no rxnorm sourced names exist only populate with the highest scored drug name
                return name, best_name, best_rxcui, None, None

    except:
        return name, None, None, None, None


In [66]:
# batch function
# runs query function on entire inputs list using async to not time-out api
async def run_rxnorm_best_matches(inputs):
    results = []
    async with aiohttp.ClientSession() as session:
        tasks = [get_rxnorm_best_matches(session, name) for name in inputs]
        responses = await asyncio.gather(*tasks)
        results.extend(responses)
    # populated results tuple list for single batch
    return results

In [None]:
batch_size = 50
failed_429_inputs = set()
rxnorm_results = []

for i in range(0, len(unique_inputs), batch_size):
    batch_num = i // batch_size + 1
    batch = unique_inputs[i:i + batch_size]

    # uncomment line below to gauge time per batch
    # start = time.time()
    
    # use the batch query function to query on entire batch
    try:
        batch_result = await run_rxnorm_best_matches(batch)
        rxnorm_results.extend(batch_result)
    except Exception as e:
        print(f"Error in batch {batch_num}: {e}")

    # uncomment line below to gauge time per batch
    # end = time.time()
    
    # uncomment if statement below to gauge time per batch
    # Print timing for batches 1–10
    # if batch_num <= 10:
    #     print(f"Batch {batch_num} took {end - start:.2f} seconds.")

    # save checkpoint every 10 batches to prevent lost progress if error ocurrs
    if batch_num % 10 == 0:
        with open("rxnorm_results_checkpoint.json", "w") as f:
            json.dump(rxnorm_results, f)

    # progress print every 50 batches
    if batch_num % 50 == 0:
        print(f"Processed batch {batch_num} of {len(unique_inputs) // batch_size + 1}")

Batch 1 took 2.22 seconds.
Batch 2 took 1.69 seconds.
Batch 3 took 5.62 seconds.
Batch 4 took 1.66 seconds.
Batch 5 took 1.72 seconds.
Batch 6 took 2.51 seconds.
Batch 7 took 1.70 seconds.
Batch 8 took 1.71 seconds.
Batch 9 took 5.33 seconds.
Batch 10 took 5.38 seconds.
Processed batch 50 of 9257
Processed batch 100 of 9257
Processed batch 150 of 9257
Processed batch 200 of 9257
Processed batch 250 of 9257
Processed batch 300 of 9257
Processed batch 350 of 9257
Processed batch 400 of 9257
Processed batch 450 of 9257
Processed batch 500 of 9257
Processed batch 550 of 9257
Processed batch 600 of 9257
Processed batch 650 of 9257
Processed batch 700 of 9257
Processed batch 750 of 9257
Processed batch 800 of 9257
Processed batch 850 of 9257
Processed batch 900 of 9257
Processed batch 950 of 9257
Processed batch 1000 of 9257
Processed batch 1050 of 9257
Processed batch 1100 of 9257
Processed batch 1150 of 9257
Processed batch 1200 of 9257
Processed batch 1250 of 9257


In [None]:
len(failed_429_inputs)

In [5]:
# UNCOMMENT BELOW

# below code retries timed-out drug queries

# retry_inputs = failed_429_inputs.copy()
# # smaller batch size to attempt to populate failed queries
# batch_size = 25
# failed_429_inputs = set()
# retry_results = []

# for i in range(0, len(retry_inputs), batch_size):
#     batch_num = i // batch_size + 1
#     batch = retry_inputs[i:i + batch_size]

#     # Start timing
#     start = time.time()

#     try:
#         batch_result = await run_rxnorm_best_matches(batch)
#         retry_results.extend(batch_result)
#     except Exception as e:
#         print(f"Error in batch {batch_num}: {e}")

#     # End timing
#     end = time.time()
    
#     # Print timing for batches 1–10
#     if batch_num <= 10:
#         print(f"Batch {batch_num} took {end - start:.2f} seconds.")

#     # Save checkpoint every 10 batches
#     if batch_num % 10 == 0:
#         with open("retry_results_checkpoint.json", "w") as f:
#             json.dump(rxnorm_results, f)

#     # Progress print every 50 batches
#     if batch_num % 50 == 0:
#         print(f"Processed batch {batch_num} of {len(retry_inputs) // batch_size + 1}")

# rxnorm_results.extend(retry_results)

In [None]:
rxnorm_df = pd.DataFrame(
    rxnorm_results,
    columns=["rxnorm_input", "best_match_name", "best_match_rxcui", "rxnorm_name", "rxnorm_rxcui"]
)

In [None]:
# save query output to be safe
rxnorm_df.to_csv('data_files/rxnorm_output_df.csv', index=False) 

In [29]:
# merge new query-result columns into original drug_df
drug_df = drug_df.merge(rxnorm_df, on="rxnorm_input", how="left")

In [30]:
drug_df.head(5)

Unnamed: 0,drugname,drugname_clean,rxnorm_input,rxnorm_name
0,LETROZOLE,letrozole,letrozole,
1,LAPATINIB,lapatinib,lapatinib,
2,FULVESTRANT,fulvestrant,fulvestrant,
3,CAPECITABINE,capecitabine,capecitabine,
4,TRASTUZUMAB,trastuzumab,trastuzumab,
...,...,...,...,...
75471440,VANCOMYCIN,vancomycin,vancomycin,
75471441,DIOVAN,diovan,diovan,
75471442,DIOVAN,diovan,diovan,
75471443,ANTIDIABETICS,antidiabetics,antidiabetics,


In [53]:
num_none = drug_df['rxnorm_name'].isna().sum()
print(f"Number of missing normalized names: {num_none}")

Number of missing normalized names: 725403


In [None]:
drug_df.to_csv("data_files/full_final_mapping.csv", index=False)

In [109]:
# current fuzzy match function but other libraries have similar functions

from difflib import SequenceMatcher

a = ". nikoran"
b = "nikoran"

# optional cleaning step uncommont if needed
# # Preclean: remove non-letters
# def clean(text):
#     return ''.join(c for c in text.lower() if c.isalpha())

# similarity = SequenceMatcher(None, clean(a), clean(b)).ratio()
similarity = SequenceMatcher(None, a, b).ratio()
print(similarity)

0.875
