This section loads the generated CSV file of 50 notes, which serves as the ground truth, and reformats its prescription columns into one dictionary per note: each key is a drug name, and each value is a dictionary holding the lists of doses, units, routes, and forms recorded for that drug.

In [1]:
ls

20.ann_medications.csv
DatasetTruth.xlsx
Med7_Extracted_Results_RG.csv
README.md
[34mbreastca[m[m/
cleaned_extracted_medical_info.csv
cleaned_extracted_medical_info_all_drug_categories_RG.csv
clinicalbert_chunked_output.csv
[34mclinicalbert_outputs[m[m/
deepseek_python_20250420_936ff4.py
final_project.ipynb
final_project_GB _char.ipynb
final_project_GB.ipynb
[34mgemini_char[m[m/
[34mgemini_outputs[m[m/
[34mgemini_outputs_v2[m[m/
[34mgemini_outputs_word[m[m/
llama3_extracted_info.csv
[34mllama3_outputs[m[m/
[34mmed7[m[m/
med7.zip
merged_random50_discharge_prescriptions.csv
mimic4_formatting_jaccard.ipynb
[34mmodified_med7[m[m/
modified_medications_note_pdac1.json
patient_data_cleaned.xlsx
[34mpdac[m[m/
[34msymptom truth labels[m[m/
[34mtest[m[m/


In [2]:
import pandas as pd

# Read your CSV
df = pd.read_csv('merged_random50_discharge_prescriptions.csv')

# Columns whose cells are parsed into Python lists of strings further down in this cell
columns_to_convert = ['drug', 'dose_val_rx', 'dose_unit_rx', 'route', 'form_unit_disp']

# Step 1: Clean the columns
def parse_list_as_strings(x):
    """Parse one CSV cell holding a bracketed list into a list of strings.

    Args:
        x: The raw cell value. Usually a string such as "['a', 'b']" or
            "[a, b]"; may also be a missing value or an already-parsed
            list/tuple (which happens when this cell is re-run).

    Returns:
        list[str]: The items as strings. Missing values and empty lists
        ("[]") give an empty list.
    """
    import ast  # local import so the function does not rely on another cell

    # Already parsed (e.g. the cell was executed twice): just normalise to strings.
    if isinstance(x, (list, tuple)):
        return [str(item).strip() for item in x]
    if pd.isna(x):
        return []

    text = str(x).strip()

    # Prefer a real literal parse: it keeps commas that sit inside quoted
    # items (e.g. "['Insulin, Beef', 'Aspirin']") from splitting the item.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        return [item.strip() for item in parsed]

    # Fallback for unquoted or mixed content such as "[5, 500-1000, mg]":
    # remove outer brackets and split by commas.
    inner = text.strip('[]').strip()
    if not inner:
        return []  # "[]" must not turn into ['']
    return [item.strip().strip('"').strip("'") for item in inner.split(',')]

# Apply the cleaning function to all relevant columns
# (each column is overwritten, so afterwards its cells hold Python lists
# instead of the raw text read from the CSV)
for col in columns_to_convert:
    df[col] = df[col].apply(parse_list_as_strings)

# Step 2: Function to transform each row into a dictionary
def build_drug_info(drugs, doses, units, routes, forms):
    """Group parallel prescription lists by drug name.

    The five sequences are walked in lockstep (stopping at the shortest one).
    Every drug name maps to a dictionary with the keys 'dose', 'unit',
    'route' and 'form'; each of those holds the values seen for that drug,
    in order of appearance.

    Returns:
        dict: {drug: {'dose': [...], 'unit': [...], 'route': [...], 'form': [...]}}
    """
    fields = ('dose', 'unit', 'route', 'form')
    grouped = {}
    for name, *values in zip(drugs, doses, units, routes, forms):
        # First sighting of a drug creates its four empty lists.
        entry = grouped.setdefault(name, {field: [] for field in fields})
        for field, value in zip(fields, values):
            entry[field].append(value)
    return grouped

# Step 3: Apply this function to each row
# (axis=1 hands build_drug_info the five parsed list columns of one note at a time)
df['Drug_Info'] = df.apply(
    lambda row: build_drug_info(row['drug'], row['dose_val_rx'], row['dose_unit_rx'], row['route'], row['form_unit_disp']),
    axis=1
)

# Step 4: Check the output
print(df['Drug_Info'].iloc[0])  # Display the first row's drug info dictionary

# Save the table with the new Drug_Info column; the tuple-conversion cell
# writes to this same path again once Drug_Info_Tuple has been added.
df.to_csv('prescriptions_table_with_formatted_columns.csv')


{'CloniDINE': {'dose': ['0.1', '0.1'], 'unit': ['mg', 'mg'], 'route': ['PO', 'PO'], 'form': ['TAB', 'TAB']}, 'Nicotine Polacrilex': {'dose': ['1'], 'unit': ['STCK'], 'route': ['PO'], 'form': ['STCK']}, 'Oxcarbazepine': {'dose': ['150'], 'unit': ['mg'], 'route': ['PO'], 'form': ['TAB']}, 'DiphenhydrAMINE': {'dose': ['25', '50', '50', '25'], 'unit': ['mg', 'mg', 'mg', 'mg'], 'route': ['PO/NG', 'PO/NG', 'PO', 'PO'], 'form': ['CAP', 'CAP', 'CAP', 'CAP']}, 'Albuterol Inhaler': {'dose': ['1'], 'unit': ['PUFF'], 'route': ['IH'], 'form': ['INH']}, 'Lorazepam': {'dose': ['2', '2', '1', '1.5', '0.5', '2'], 'unit': ['mg', 'mg', 'mg', 'mg', 'mg', 'mg'], 'route': ['PO', 'PO', 'PO', 'PO', 'PO', 'PO'], 'form': ['TAB', 'TAB', 'TAB', 'TAB', 'TAB', 'TAB']}, 'Triamcinolone Acetonide 0.025% Cream': {'dose': ['1'], 'unit': ['Appl'], 'route': ['TP'], 'form': ['TUBE']}, 'Amitriptyline': {'dose': ['50', '100', '25'], 'unit': ['mg', 'mg', 'mg'], 'route': ['PO', 'PO', 'PO/NG'], 'form': ['TAB', 'TAB', 'TAB']}, '

Next, each drug's dictionary is converted into a list of (dose, unit, route, form) tuples, one tuple per prescription entry, so that entries can be compared later.

In [3]:
##Convert to tuple format
def convert_to_tuple_format(drug_info):
    """Turn {drug: {'dose': [...], 'unit': [...], 'route': [...], 'form': [...]}}
    into {drug: [(dose, unit, route, form), ...]} with every field as a string.
    """
    def _as_tuples(info):
        # Walk the four parallel lists together; one tuple per position.
        records = zip(info['dose'], info['unit'], info['route'], info['form'])
        return [tuple(str(value) for value in record) for record in records]

    return {drug: _as_tuples(info) for drug, info in drug_info.items()}

# Convert Drug_Info to the new tuple format
df['Drug_Info_Tuple'] = df['Drug_Info'].apply(convert_to_tuple_format)

# Check the output
print(df['Drug_Info_Tuple'].iloc[0])  # Display the first row's drug info in tuple format
# Overwrites the CSV written in the previous code cell, now including Drug_Info_Tuple
df.to_csv('prescriptions_table_with_formatted_columns.csv')

{'CloniDINE': [('0.1', 'mg', 'PO', 'TAB'), ('0.1', 'mg', 'PO', 'TAB')], 'Nicotine Polacrilex': [('1', 'STCK', 'PO', 'STCK')], 'Oxcarbazepine': [('150', 'mg', 'PO', 'TAB')], 'DiphenhydrAMINE': [('25', 'mg', 'PO/NG', 'CAP'), ('50', 'mg', 'PO/NG', 'CAP'), ('50', 'mg', 'PO', 'CAP'), ('25', 'mg', 'PO', 'CAP')], 'Albuterol Inhaler': [('1', 'PUFF', 'IH', 'INH')], 'Lorazepam': [('2', 'mg', 'PO', 'TAB'), ('2', 'mg', 'PO', 'TAB'), ('1', 'mg', 'PO', 'TAB'), ('1.5', 'mg', 'PO', 'TAB'), ('0.5', 'mg', 'PO', 'TAB'), ('2', 'mg', 'PO', 'TAB')], 'Triamcinolone Acetonide 0.025% Cream': [('1', 'Appl', 'TP', 'TUBE')], 'Amitriptyline': [('50', 'mg', 'PO', 'TAB'), ('100', 'mg', 'PO', 'TAB'), ('25', 'mg', 'PO/NG', 'TAB')], 'Zolpidem Tartrate': [('10', 'mg', 'PO', 'TAB')], 'Multivitamins': [('1', 'TAB', 'PO/NG', 'TAB')], 'Acetaminophen': [('650', 'mg', 'PO', 'TAB')], 'Hydrocortisone Cream 1%': [('1', 'Appl', 'TP', 'TUBE')], 'Aluminum-Magnesium Hydrox.-Simethicone': [('30', 'mL', 'PO', 'UDCUP')], 'Fexofenadine'

This repeats the same formatting for the LLM output instead of the ground truth.

In [4]:
import pandas as pd
from collections import defaultdict

# Read in your model outputs
# NOTE(review): the saved run of this cell failed with FileNotFoundError for
# this path, and the directory listing at the top of the notebook shows
# 'cleaned_extracted_medical_info.csv' but no '..._baseline.csv' — confirm
# which file is intended.
model_df = pd.read_csv('cleaned_extracted_medical_info_baseline.csv')

# Collect, per patient, a mapping of drug name -> list of attribute tuples
patient_drug_tuples = defaultdict(lambda: defaultdict(list))

# Iterate through each row (one extracted drug mention per row)
for idx, row in model_df.iterrows():
    patient_id = row['Patient Index']  # Adjust column name if needed
    drug = str(row['Drug Name']).strip()  # Make sure drug name is string and clean spaces
    dose = str(row['Drug Dosage']).strip()
    unit = str(row['Unit of Measurement of Dosage']).strip()
    form = str(row['Drug Form']).strip()
    route = str(row['Route']).strip()

    # Add the tuple to the correct patient and drug, using the same field
    # order as the ground-truth tuples: (dose, unit, route, form)
    patient_drug_tuples[patient_id][drug].append((dose, unit, route, form))

# Now create a new dataframe where each row is a patient.
# The inner defaultdict is converted to a plain dict so that looking up a
# missing drug later cannot silently create an empty entry, and so the value
# written to the CSV is a readable dict literal.
patients_list = [
    {'Patient Index': patient_id, 'drug_tuples_dict': dict(drug_dict)}
    for patient_id, drug_dict in patient_drug_tuples.items()
]

# llm_df has one row per patient and a 'drug_tuples_dict' column you can work with
llm_df = pd.DataFrame(patients_list)
llm_df.to_csv('llm_output_with_formatted_columns.csv')

# Last expression of the cell, so the notebook actually displays it
llm_df["drug_tuples_dict"].iloc[1]


FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_extracted_medical_info_baseline.csv'

In [5]:
import numpy as np

def jaccard_similarity_case_insensitive(tuple1, tuple2):
    """Jaccard similarity of two tuples, treated as sets of lowercased strings.

    Position inside the tuple is ignored and duplicate values collapse.
    Returns 0.0 when both tuples are empty.
    """
    lowered_a = {str(value).lower() for value in tuple1}
    lowered_b = {str(value).lower() for value in tuple2}
    combined = lowered_a | lowered_b
    if not combined:
        return 0.0
    return len(lowered_a & lowered_b) / len(combined)

def match_and_score_tuples(model_tuples, ground_truth_tuples):
    """
    Greedily pair model tuples with ground-truth tuples by Jaccard similarity.

    The pair with the highest similarity is matched first, then the best pair
    among the still-unmatched tuples, and so on. Pairs with similarity 0 are
    never matched. Ties go to the pair with the lowest model index, then the
    lowest ground-truth index.

    Returns:
        tuple: (average similarity of the matched pairs, or 0.0 if none;
                number of unmatched model tuples ("hallucinations");
                number of unmatched ground-truth tuples ("misses")).
        Two empty inputs count as a perfect match: (1.0, 0, 0).
    """
    predicted = list(model_tuples)
    reference = list(ground_truth_tuples)
    n_pred, n_ref = len(predicted), len(reference)

    if n_pred == 0 and n_ref == 0:
        return 1.0, 0, 0

    # Pairwise similarity table: rows are model tuples, columns are ground truth.
    scores = np.zeros((n_pred, n_ref))
    for row, pred in enumerate(predicted):
        for col, ref in enumerate(reference):
            scores[row, col] = jaccard_similarity_case_insensitive(pred, ref)

    # All pairs with positive similarity, best first. The generator yields
    # pairs in (row, col) order and sorted() is stable, so equal scores keep
    # that order.
    candidates = sorted(
        ((row, col) for row in range(n_pred) for col in range(n_ref) if scores[row, col] > 0),
        key=lambda pair: -scores[pair],
    )

    used_rows, used_cols = set(), set()
    picked = []
    for row, col in candidates:
        if row in used_rows or col in used_cols:
            continue  # one side of this pair is already matched
        used_rows.add(row)
        used_cols.add(col)
        picked.append(scores[row, col])

    hallucinations = n_pred - len(used_rows)
    misses = n_ref - len(used_cols)
    mean_similarity = np.mean(picked) if picked else 0.0

    return mean_similarity, hallucinations, misses


In [82]:
##Doing the metric calculations for similarity simply between the drugs from gemini and the EHR drugs

def proportion_overlap(set1, set2):
    """
    Fraction of the distinct elements of set1 that also occur in set2.

    Elements are compared as lowercased, whitespace-stripped strings.

    Args:
        set1 (iterable): Elements whose coverage is measured.
        set2 (iterable): Elements to look them up in.

    Returns:
        float: Share of set1's distinct elements found in set2;
               0.0 when set1 is empty.
    """
    def _normalise(items):
        return {str(item).lower().strip() for item in items}

    left = _normalise(set1)
    right = _normalise(set2)

    # Guard against dividing by zero when there is nothing to look up.
    return len(left & right) / len(left) if left else 0.0

# Average, over the rows of llm_df, of the share of LLM drug names that also
# appear among the ground-truth drug names.
# NOTE(review): rows are paired by position (row i of llm_df with row i of df);
# this assumes both frames list the notes in the same order — confirm.
prop_sum = 0
for i in range(len(llm_df)):    
    prop_sum += proportion_overlap(llm_df['drug_tuples_dict'].iloc[i].keys(), df['Drug_Info_Tuple'].iloc[i].keys())
avg_prop = prop_sum/len(llm_df)
print(avg_prop)

0.12437613600894937


In [None]:
#Given two drug dictionaries, find overlapping drugs, and then see how those tuples overlap generally
def compare_drug_dicts(model_dict, ground_truth_dict):
    """
    Compare two drug dictionaries and compute matching metrics.

    Drug names are paired case-insensitively and ignoring surrounding
    whitespace (the same normalisation proportion_overlap uses), so that
    e.g. 'CloniDINE' in the ground truth pairs with 'clonidine' from the
    model. If several names in one dictionary normalise to the same drug,
    their tuple lists are pooled.

    Args:
        model_dict (dict): {drug: list of 4-tuples} from model
        ground_truth_dict (dict): {drug: list of 4-tuples} from ground truth

    Returns:
        dict: mapping drug -> (avg_jaccard, hallucinations, misses), i.e. the
              tuple returned by match_and_score_tuples. Keys use the
              ground-truth spelling of the drug name. Drugs present in only
              one of the two dictionaries are not included.
    """
    def _group_by_normalised_name(drug_dict):
        # normalised name -> (first original spelling, pooled tuples)
        grouped = {}
        for name, tuples in drug_dict.items():
            key = str(name).lower().strip()
            if key not in grouped:
                grouped[key] = (name, [])
            grouped[key][1].extend(tuples)
        return grouped

    model_groups = _group_by_normalised_name(model_dict)
    truth_groups = _group_by_normalised_name(ground_truth_dict)

    results = {}
    for key, (truth_name, truth_tuples) in truth_groups.items():
        if key not in model_groups:
            continue  # drug only in the ground truth
        model_tuples = model_groups[key][1]
        results[truth_name] = match_and_score_tuples(model_tuples, truth_tuples)

    return results

In [39]:
from sklearn.model_selection import train_test_split
# make train test split for the fine tuning and validation
# test_size=0.1 holds out 10% of the rows of df; random_state=42 fixes the
# shuffle so the same split is produced on every run
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Now train_df and test_df are your training and testing sets
print("Train DataFrame:")
train_df.head()


Train DataFrame:


Unnamed: 0,subject_id,hadm_id,text,charttime,drug,dose_val_rx,dose_unit_rx,route,form_unit_disp,Drug_Info,Drug_Info_Tuple
48,17653539,29073906,\nName: ___ Unit No: ___\n ...,2181-12-18 00:00:00,"[Diazepam, Acetaminophen, Simvastatin, Senna, ...","[5, 500-1000, 10, 8.6, 650, 1000, 1000, 10, 10...","[mg, mg, mg, mg, mg, mL, mg, mg, mL, g, g, mg,...","[PO/NG, PO/NG, PO/NG, PO/NG, PO/NG, IV, PO/NG,...","[TAB, TAB, TAB, TAB, TAB, mL, TAB, SUPP, mL, B...","{'Diazepam': {'dose': ['5'], 'unit': ['mg'], '...","{'Diazepam': [('5', 'mg', 'PO/NG', 'TAB')], 'A..."
26,17029090,28374285,\nName: ___ Unit No: __...,2144-02-19 00:00:00,"[Influenza Vaccine Quadrivalent, Aspirin, Hepa...","[0.5, 81, 2400-4900, 325, 9700, 650, 80, 25, 2...","[mL, mg, UNIT, mg, UNIT, mg, mg, mg, mg, mg, m...","[IM, PO/NG, IV, PO/NG, IV, PO/NG, PO/NG, PO, P...","[SYR, TAB, mL, TAB, mL, TAB, TAB, TAB, TAB, TA...",{'Influenza Vaccine Quadrivalent': {'dose': ['...,"{'Influenza Vaccine Quadrivalent': [('0.5', 'm..."
25,12851703,21288145,\nName: ___ Unit No: ___\n ...,2168-03-22 00:00:00,"[Albuterol 0.083% Neb Soln, Sodium Chloride 0....","[1, 3, 1000, 2-4, 1, 40, 8, 500, 40, 5000, 5, ...","[NEB, mL, mL, mg, VIAL, mg, mg, mL, mEq, UNIT,...","[IH, IV, IV, IV, IV, IV, IV, IV, IV, SC, PO/NG...","[VIAL, SYR, BAG, VIAL, VIAL, VIAL, VIAL, mL, m...","{'Albuterol 0.083% Neb Soln': {'dose': ['1'], ...","{'Albuterol 0.083% Neb Soln': [('1', 'NEB', 'I..."
32,12363908,29380974,\nName: ___ Unit No: ___\...,2127-06-13 00:00:00,"[0.9% Sodium Chloride, Acetaminophen, Metoprol...","[1000, 325-650, 10, 100, 1000, 5, 3, 10, 1, 20...","[mL, mg, mg, mg, mL, mg, mL, mg, BAG, mg, mg, ...","[IV, PO/NG, IV, PO/NG, IV, IV, IV, IV, IV, IV,...","[mL, TAB, VIAL, UDCUP, mL, VIAL, SYR, VIAL, BA...","{'0.9% Sodium Chloride': {'dose': ['1000', '10...","{'0.9% Sodium Chloride': [('1000', 'mL', 'IV',..."
19,17349534,21534254,\nName: ___ Unit No: ___...,2144-09-04 00:00:00,"[Warfarin, Methimazole, Zolpidem Tartrate, Ome...","[5, 5, 5, 20, 5-10, 0.5, 5, 1000, 40, 81, 325,...","[mg, mg, mg, mg, mL, mL, mg, UNIT, mg, mg, mg,...","[PO/NG, PO/NG, PO, PO, PO/NG, IM, PO/NG, PO/NG...","[dose, TAB, TAB, CAP, UDCUP, VIAL, TAB, TAB, T...","{'Warfarin': {'dose': ['5', '5'], 'unit': ['mg...","{'Warfarin': [('5', 'mg', 'PO/NG', 'dose'), ('..."


In [40]:
# Show up to the first 10 rows of the held-out test set
print("\nTest DataFrame:")
test_df.head(10)


Test DataFrame:


Unnamed: 0,subject_id,hadm_id,text,charttime,drug,dose_val_rx,dose_unit_rx,route,form_unit_disp,Drug_Info,Drug_Info_Tuple
13,11192372,26590365,\nName: ___ Unit No: ___\...,2136-06-13 00:00:00,"[Metoprolol Tartrate, Metoprolol Tartrate, Fur...","[100, 25, 80, 1, 20, 1, 10, 100, 40, 100, 50, ...","[mg, mg, mg, NEB, mg, TAB, mg, mg, mEq, mg, mg...","[PO/NG, PO/NG, PO/NG, IH, PO/NG, PO/NG, PO/NG,...","[TAB, TAB, TAB, VIAL, TAB, TAB, UDCUP, TAB, TA...","{'Metoprolol Tartrate': {'dose': ['100', '25',...","{'Metoprolol Tartrate': [('100', 'mg', 'PO/NG'..."
39,12634755,25987676,\nName: ___ Unit No: ___...,2183-02-13 00:00:00,"[Diltiazem, Docusate Sodium (Liquid), Metoprol...","[30, 100, 5, 650, 25, 2.5, 2.5, 1, 3, 150, 5, ...","[mg, mg, mg, mg, mg, mg, mg, NEB, mL, mg, mg, ...","[PO, NG, IV, PO, PO/NG, IM, IM, IH, IV, IV BOL...","[TAB, UDCUP, VIAL, TAB, TAB, VIAL, VIAL, VIAL,...","{'Diltiazem': {'dose': ['30', '30', '30', '60'...","{'Diltiazem': [('30', 'mg', 'PO', 'TAB'), ('30..."
30,19178732,28337573,\nName: ___ Unit No: ___...,2165-07-17 00:00:00,"[Heparin, Enoxaparin Sodium, Ipratropium-Albut...","[1800-3500, 80, 1, 20, 600, 1, 2, 3, 650, 60, ...","[UNIT, mg, NEB, mg, mg, BAG, gm, mL, mg, mEq, ...","[IV, SC, NEB, PO/NG, PO, IV, IV, IV, PO/NG, PO...","[mL, SYR, NEB, TAB, TAB, BAG, BAG, SYR, TAB, T...","{'Heparin': {'dose': ['1800-3500', '7000', '1'...","{'Heparin': [('1800-3500', 'UNIT', 'IV', 'mL')..."
45,17219662,27119045,\nName: ___ Unit No: ___\...,2181-05-07 00:00:00,"[Nicotine Patch, Docusate Sodium, FoLIC Acid, ...","[21, 100, 1, 2, 100, 4, 1, 500, 100, 325-650, ...","[mg, mg, mg, PKT, mL, gm, mg, mg, mg, mg, mg, ...","[TD, PO, PO/NG, PO/NG, IV, IV, PO/NG, PO/NG, P...","[PTCH, CAP, TAB, PKT, BAG, BAG, TAB, TAB, UDCU...","{'Nicotine Patch': {'dose': ['21', '7'], 'unit...","{'Nicotine Patch': [('21', 'mg', 'TD', 'PTCH')..."
17,12850463,23058381,\nName: ___ Unit No: ___\n \n...,2196-06-26 00:00:00,"[Insulin, Glucose Gel, Warfarin, Lisinopril, S...","[22, 15, 5, 10, 3, 5, 1000, 25, 650, 1000, 12....","[UNIT, g, mg, mg, mL, mg, mL, mg, mg, mL, gm, ...","[SC, PO, PO/NG, PO/NG, IV, PO/NG, IV, PO/NG, P...","[VIAL, TUBE, TAB, TAB, SYR, TAB, mL, TAB, TAB,...","{'Insulin': {'dose': ['22', '0'], 'unit': ['UN...","{'Insulin': [('22', 'UNIT', 'SC', 'VIAL'), ('0..."


In [41]:
# function to get everything after a certain phrase in the note:
def extract_medications(text):
    """Return the part of a note that starts at "discharge medications".

    The heading is searched case-insensitively and the returned text keeps
    the note's original casing. (A disabled earlier variant of this function
    preferred a "medications on admission" heading when present; only the
    discharge heading is used.)

    Args:
        text: The full note. Non-string values (e.g. a missing note) are
            treated as having no medication section.

    Returns:
        str | None: The note from the first "discharge medications" heading
        to the end, or None if the heading is absent.
    """
    import re  # imported locally so this cell does not depend on a later cell's import

    if not isinstance(text, str):
        return None

    # Search the original text directly: taking the offset from a lowercased
    # copy can be wrong because lowercasing may change the string's length.
    match = re.search(r"discharge medications", text, flags=re.IGNORECASE)
    if match is None:
        return None  # or `return text` if you prefer to keep the whole text

    return text[match.start():]

# Apply to create new column
# (meds_section is None for notes in which extract_medications finds no
# "discharge medications" heading)
train_df['meds_section'] = train_df['text'].apply(extract_medications)
train_df.head()

Unnamed: 0,subject_id,hadm_id,text,charttime,drug,dose_val_rx,dose_unit_rx,route,form_unit_disp,Drug_Info,Drug_Info_Tuple,meds_section
48,17653539,29073906,\nName: ___ Unit No: ___\n ...,2181-12-18 00:00:00,"[Diazepam, Acetaminophen, Simvastatin, Senna, ...","[5, 500-1000, 10, 8.6, 650, 1000, 1000, 10, 10...","[mg, mg, mg, mg, mg, mL, mg, mg, mL, g, g, mg,...","[PO/NG, PO/NG, PO/NG, PO/NG, PO/NG, IV, PO/NG,...","[TAB, TAB, TAB, TAB, TAB, mL, TAB, SUPP, mL, B...","{'Diazepam': {'dose': ['5'], 'unit': ['mg'], '...","{'Diazepam': [('5', 'mg', 'PO/NG', 'TAB')], 'A...",Discharge Medications:\n1. Acetaminophen 650 ...
26,17029090,28374285,\nName: ___ Unit No: __...,2144-02-19 00:00:00,"[Influenza Vaccine Quadrivalent, Aspirin, Hepa...","[0.5, 81, 2400-4900, 325, 9700, 650, 80, 25, 2...","[mL, mg, UNIT, mg, UNIT, mg, mg, mg, mg, mg, m...","[IM, PO/NG, IV, PO/NG, IV, PO/NG, PO/NG, PO, P...","[SYR, TAB, mL, TAB, mL, TAB, TAB, TAB, TAB, TA...",{'Influenza Vaccine Quadrivalent': {'dose': ['...,"{'Influenza Vaccine Quadrivalent': [('0.5', 'm...",Discharge Medications:\n1. Aspirin 81 mg PO DA...
25,12851703,21288145,\nName: ___ Unit No: ___\n ...,2168-03-22 00:00:00,"[Albuterol 0.083% Neb Soln, Sodium Chloride 0....","[1, 3, 1000, 2-4, 1, 40, 8, 500, 40, 5000, 5, ...","[NEB, mL, mL, mg, VIAL, mg, mg, mL, mEq, UNIT,...","[IH, IV, IV, IV, IV, IV, IV, IV, IV, SC, PO/NG...","[VIAL, SYR, BAG, VIAL, VIAL, VIAL, VIAL, mL, m...","{'Albuterol 0.083% Neb Soln': {'dose': ['1'], ...","{'Albuterol 0.083% Neb Soln': [('1', 'NEB', 'I...",Discharge Medications:\n1. Acetaminophen 650 m...
32,12363908,29380974,\nName: ___ Unit No: ___\...,2127-06-13 00:00:00,"[0.9% Sodium Chloride, Acetaminophen, Metoprol...","[1000, 325-650, 10, 100, 1000, 5, 3, 10, 1, 20...","[mL, mg, mg, mg, mL, mg, mL, mg, BAG, mg, mg, ...","[IV, PO/NG, IV, PO/NG, IV, IV, IV, IV, IV, IV,...","[mL, TAB, VIAL, UDCUP, mL, VIAL, SYR, VIAL, BA...","{'0.9% Sodium Chloride': {'dose': ['1000', '10...","{'0.9% Sodium Chloride': [('1000', 'mL', 'IV',...",Discharge Medications:\n1. Docusate Sodium 100...
19,17349534,21534254,\nName: ___ Unit No: ___...,2144-09-04 00:00:00,"[Warfarin, Methimazole, Zolpidem Tartrate, Ome...","[5, 5, 5, 20, 5-10, 0.5, 5, 1000, 40, 81, 325,...","[mg, mg, mg, mg, mL, mL, mg, UNIT, mg, mg, mg,...","[PO/NG, PO/NG, PO, PO, PO/NG, IM, PO/NG, PO/NG...","[dose, TAB, TAB, CAP, UDCUP, VIAL, TAB, TAB, T...","{'Warfarin': {'dose': ['5', '5'], 'unit': ['mg...","{'Warfarin': [('5', 'mg', 'PO/NG', 'dose'), ('...",Discharge Medications:\n1. Metoprolol Succinat...


In [49]:
# train_df['meds_section']
# print(train_df['text'][20])
# print(train_df['Drug_Info_Tuple'][20])

In [52]:
#print(df['Drug_Info_Tuple'][0])
import re

def subset_drug_dict(meds_text, drug_dict):
    """Keep only the drugs that are mentioned in the medication section.

    For every drug in drug_dict, whole-name mentions are counted in meds_text
    (case-insensitive). A drug mentioned k times keeps its last k tuples;
    drugs that are not mentioned are dropped.

    Args:
        meds_text: Medication section of the note, or None / a non-string
            value when the note has no such section.
        drug_dict (dict): {drug: list of tuples}.

    Returns:
        dict | None: The reduced dictionary, or None when meds_text is not
        a string.
    """
    if not isinstance(meds_text, str):
        return None

    # Lowercase meds_section for case-insensitive matching
    meds_text_lower = meds_text.lower()

    new_dict = {}
    for drug, tuples_list in drug_dict.items():
        drug_lower = str(drug).lower().strip()
        if not drug_lower:
            continue  # an empty name would "match" almost everywhere

        # Whole-name match. Lookarounds are used instead of \b because \b
        # needs a word character on one side: names that end in ')' or '%'
        # (e.g. 'OxyCODONE (Immediate Release)', 'Hydrocortisone Cream 1%')
        # would never match when followed by a space.
        drug_pattern = r'(?<!\w)' + re.escape(drug_lower) + r'(?!\w)'
        occurrences = len(re.findall(drug_pattern, meds_text_lower))

        if occurrences > 0:
            # Keep only the last 'occurrences' number of tuples
            new_dict[drug] = tuples_list[-occurrences:]

    return new_dict

In [53]:
# Ground truth restricted to the drugs mentioned in each note's medication
# section (None where the note has no such section)
train_df['meds_section_gt'] = train_df.apply(
    lambda row: subset_drug_dict(row['meds_section'], row['Drug_Info_Tuple']),
    axis=1
)
train_df.head()

Unnamed: 0,subject_id,hadm_id,text,charttime,drug,dose_val_rx,dose_unit_rx,route,form_unit_disp,Drug_Info,Drug_Info_Tuple,meds_section,meds_section_gt
48,17653539,29073906,\nName: ___ Unit No: ___\n ...,2181-12-18 00:00:00,"[Diazepam, Acetaminophen, Simvastatin, Senna, ...","[5, 500-1000, 10, 8.6, 650, 1000, 1000, 10, 10...","[mg, mg, mg, mg, mg, mL, mg, mg, mL, g, g, mg,...","[PO/NG, PO/NG, PO/NG, PO/NG, PO/NG, IV, PO/NG,...","[TAB, TAB, TAB, TAB, TAB, mL, TAB, SUPP, mL, B...","{'Diazepam': {'dose': ['5'], 'unit': ['mg'], '...","{'Diazepam': [('5', 'mg', 'PO/NG', 'TAB')], 'A...",Discharge Medications:\n1. Acetaminophen 650 ...,"{'Diazepam': [('5', 'mg', 'PO/NG', 'TAB')], 'A..."
26,17029090,28374285,\nName: ___ Unit No: __...,2144-02-19 00:00:00,"[Influenza Vaccine Quadrivalent, Aspirin, Hepa...","[0.5, 81, 2400-4900, 325, 9700, 650, 80, 25, 2...","[mL, mg, UNIT, mg, UNIT, mg, mg, mg, mg, mg, m...","[IM, PO/NG, IV, PO/NG, IV, PO/NG, PO/NG, PO, P...","[SYR, TAB, mL, TAB, mL, TAB, TAB, TAB, TAB, TA...",{'Influenza Vaccine Quadrivalent': {'dose': ['...,"{'Influenza Vaccine Quadrivalent': [('0.5', 'm...",Discharge Medications:\n1. Aspirin 81 mg PO DA...,"{'Aspirin': [('325', 'mg', 'PO/NG', 'TAB')], '..."
25,12851703,21288145,\nName: ___ Unit No: ___\n ...,2168-03-22 00:00:00,"[Albuterol 0.083% Neb Soln, Sodium Chloride 0....","[1, 3, 1000, 2-4, 1, 40, 8, 500, 40, 5000, 5, ...","[NEB, mL, mL, mg, VIAL, mg, mg, mL, mEq, UNIT,...","[IH, IV, IV, IV, IV, IV, IV, IV, IV, SC, PO/NG...","[VIAL, SYR, BAG, VIAL, VIAL, VIAL, VIAL, mL, m...","{'Albuterol 0.083% Neb Soln': {'dose': ['1'], ...","{'Albuterol 0.083% Neb Soln': [('1', 'NEB', 'I...",Discharge Medications:\n1. Acetaminophen 650 m...,"{'Pantoprazole': [('40', 'mg', 'PO', 'TAB')], ..."
32,12363908,29380974,\nName: ___ Unit No: ___\...,2127-06-13 00:00:00,"[0.9% Sodium Chloride, Acetaminophen, Metoprol...","[1000, 325-650, 10, 100, 1000, 5, 3, 10, 1, 20...","[mL, mg, mg, mg, mL, mg, mL, mg, BAG, mg, mg, ...","[IV, PO/NG, IV, PO/NG, IV, IV, IV, IV, IV, IV,...","[mL, TAB, VIAL, UDCUP, mL, VIAL, SYR, VIAL, BA...","{'0.9% Sodium Chloride': {'dose': ['1000', '10...","{'0.9% Sodium Chloride': [('1000', 'mL', 'IV',...",Discharge Medications:\n1. Docusate Sodium 100...,"{'Docusate Sodium': [('100', 'mg', 'PO/NG', 'U..."
19,17349534,21534254,\nName: ___ Unit No: ___...,2144-09-04 00:00:00,"[Warfarin, Methimazole, Zolpidem Tartrate, Ome...","[5, 5, 5, 20, 5-10, 0.5, 5, 1000, 40, 81, 325,...","[mg, mg, mg, mg, mL, mL, mg, UNIT, mg, mg, mg,...","[PO/NG, PO/NG, PO, PO, PO/NG, IM, PO/NG, PO/NG...","[dose, TAB, TAB, CAP, UDCUP, VIAL, TAB, TAB, T...","{'Warfarin': {'dose': ['5', '5'], 'unit': ['mg...","{'Warfarin': [('5', 'mg', 'PO/NG', 'dose'), ('...",Discharge Medications:\n1. Metoprolol Succinat...,"{'Methimazole': [('5', 'mg', 'PO/NG', 'TAB'), ..."


In [56]:
# Subsetted ground truth for the row with index label 2
print(train_df['meds_section_gt'][2])

{'Acetaminophen': [('1000', 'mg', 'PO/NG', 'TAB')], 'Ibuprofen': [('400', 'mg', 'PO', 'TAB')], 'TraZODone': [('50', 'mg', 'PO/NG', 'TAB')], 'Escitalopram Oxalate': [('20', 'mg', 'PO/NG', 'TAB')], 'Spironolactone': [('100', 'mg', 'PO/NG', 'TAB')]}


In [57]:
# Full ground truth for the same row, for comparison with the subset
print(train_df['Drug_Info_Tuple'][2])

{'Heparin': [('5000', 'UNIT', 'SC', 'mL')], 'OxyCODONE (Immediate Release)': [('5', 'mg', 'PO/NG', 'TAB')], 'Morphine Sulfate': [('1', 'mg', 'IV', 'SYR'), ('1-2', 'mg', 'IV', 'SYR')], 'Sodium Chloride 0.9%  Flush': [('3-10', 'mL', 'IV', 'SYR'), ('3-10', 'mL', 'IV', 'SYR')], 'NS': [('100', 'mL', 'IV', 'mL')], 'MetroNIDAZOLE': [('500', 'mg', 'IV', 'BAG')], 'Acetaminophen': [('1000', 'mg', 'PO/NG', 'UDCUP'), ('1000', 'mg', 'PO/NG', 'TAB')], 'Ibuprofen': [('400', 'mg', 'PO', 'UDCUP'), ('400', 'mg', 'PO', 'TAB')], 'Acetaminophen IV': [('1000', 'mg', 'IV', 'VIAL')], 'Vial': [('1', 'VIAL', 'IV', 'VIAL')], 'CefTRIAXone': [('1', 'gm', 'IV', 'VIAL')], 'Lactated Ringers': [('1000', 'mL', 'IV', 'mL'), ('1000', 'mL', 'IV', 'mL')], 'TraZODone': [('50', 'mg', 'PO/NG', 'TAB')], 'Escitalopram Oxalate': [('20', 'mg', 'PO/NG', 'TAB')], 'Bag': [('1', 'BAG', 'IV', 'BAG')], 'Magnesium Sulfate': [('2', 'gm', 'IV', 'BAG')], 'Spironolactone': [('100', 'mg', 'PO/NG', 'TAB')]}


In [58]:
# Number of distinct drugs for the row with index label 2, before and after
# restricting the ground truth to the medication section
print(f"Length of dict b4 subseting: {len(train_df['Drug_Info_Tuple'][2].keys())}")
print(f"Length of dict after subseting: {len(train_df['meds_section_gt'][2].keys())}")
# print(train_df['text'][2])
# print(train_df['meds_section'][2])

Length of dict b4 subseting: 17
Length of dict after subseting: 5


Now I have the subsetted note and the subsetted ground-truth dictionary for fine-tuning.

Note stored in "meds_section"

Updated gt dictionary stored in "meds_section_gt"

In [76]:
# Select the test-set row(s) whose hadm_id is 26590365
textss = test_df[test_df['hadm_id'] == 26590365]
# print(test_df[test_df['hadm_id'] == 26590365]['text'])
print(textss)

    subject_id   hadm_id                                               text  \
13    11192372  26590365   \nName:  ___                  Unit No:   ___\...   

              charttime                                               drug  \
13  2136-06-13 00:00:00  [Metoprolol Tartrate, Metoprolol Tartrate, Fur...   

                                          dose_val_rx  \
13  [100, 25, 80, 1, 20, 1, 10, 100, 40, 100, 50, ...   

                                         dose_unit_rx  \
13  [mg, mg, mg, NEB, mg, TAB, mg, mg, mEq, mg, mg...   

                                                route  \
13  [PO/NG, PO/NG, PO/NG, IH, PO/NG, PO/NG, PO/NG,...   

                                       form_unit_disp  \
13  [TAB, TAB, TAB, VIAL, TAB, TAB, UDCUP, TAB, TA...   

                                            Drug_Info  \
13  {'Metoprolol Tartrate': {'dose': ['100', '25',...   

                                      Drug_Info_Tuple  
13  {'Metoprolol Tartrate': [('100', 'mg', 'PO/NG'...

In [77]:
# Full text of the first test-set note with hadm_id 26590365
note = test_df.loc[test_df['hadm_id'] == 26590365, 'text'].values[0]
print(note)

 
Name:  ___                  Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
Penicillins / Tegretol / Insulin,Beef / Insulin,Pork / Zaroxolyn
 
Attending: ___.
 
Chief Complaint:
shortness of breath  
 
Major Surgical or Invasive Procedure:
none

 
History of Present Illness:
___ w/ CAD s/p multiple PCI, CHF LVEF 70% (___), PVD, DMII, 
atrial fibrillation, COPD (on home O2), OSA was noted to become 
acutely dyspneic and hypoxic at ___. Per s/o from ED 
resident who spoke to ___ staff, pt was found in respiratory 
distress, but had been comfortable previously that day. She had 
some frothy sputum and chest pain. BIPAP initated in the 
ambulance, also given morphine. 
.  
On arrival to the ED, her HR was up to 130s she was given 5mg 
metoprolol, lasix 40mg IV, and nitro gtt with rapid improvement 
in her symptoms. Initial dose of lasix resulted in 600cc UOP. 
She was weaned from BIPAP t

In [59]:
# manual annotations
# hadm_id values of the five test-set rows shown in the test DataFrame above
hadm_ids = [26590365, 25987676, 28337573, 27119045, 23058381]
# for num in test_df['hadm_id']:
#     print(num)

# Placeholder for the manual annotation of admission 26590365 (still empty)
dict_26590365 = {}

26590365
25987676
28337573
27119045
23058381
