In [85]:
import re
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz
from sklearn.impute import KNNImputer

In [66]:
# Load the reference drug list and the TM_2017 price list, then derive the
# normalised (lower-cased, whitespace-stripped) names used as fuzzy-match keys.
df_drugs = pd.read_csv('/Users/hiruzen/Programming/Projects/mimic-iv-datawarehouse/resources/drugs.csv')
df_tm = pd.read_csv('/Users/hiruzen/Programming/Projects/mimic-iv-datawarehouse/resources/TM_2017.csv')
df_tm['Medicine_Name_std'] = df_tm['Medication_Name'].str.lower().str.strip()
df_drugs['brand_name_std'] = df_drugs['brand_name'].str.lower().str.strip()

# A missing brand name can never be a meaningful match, so keep NaN out of the pool.
drug_choices = df_drugs['brand_name_std'].dropna().tolist()

# First `active_ingredients` value per normalised brand name. This is the same row
# that a boolean filter followed by `.iloc[0]` would pick, without rescanning
# `df_drugs` once per price-list row.
ingredients_by_brand = (
    df_drugs.dropna(subset=['brand_name_std'])
    .drop_duplicates(subset='brand_name_std')
    .set_index('brand_name_std')['active_ingredients']
    .to_dict()
)

# NOTE(review): every best match is accepted regardless of its score, so low-score
# pairs end up in `df_mapping`; filter on `match_score` downstream if that matters.
mapping_results = []
for idx, row in df_tm.iterrows():
    medicine_std = row['Medicine_Name_std']
    cost = row.get('Max_Consumer_VAT_Price', None)
    match, score, active_ingredients = None, None, None
    if pd.notnull(medicine_std):
        best_match = process.extractOne(medicine_std, drug_choices, scorer=fuzz.token_set_ratio)
        # `extractOne` returns None when there is nothing to match against, so the
        # result must be checked *before* it is indexed.
        if best_match is not None:
            match, score = best_match[0], best_match[1]
            active_ingredients = ingredients_by_brand.get(match)

    mapping_results.append({
        'medicine_name': row['Medication_Name'],
        'matched_drug': match,
        'package_size': row['Package_Size'],
        'match_score': score,
        'cost': cost,
        'drugs': active_ingredients
    })

# Explicit columns keep the schema stable even when the price list is empty.
df_mapping = pd.DataFrame(
    mapping_results,
    columns=['medicine_name', 'matched_drug', 'package_size', 'match_score', 'cost', 'drugs'],
)
df_mapping

Unnamed: 0,medicine_name,matched_drug,package_size,match_score,cost,drugs
0,ABBOSYNAGIS 100MG PUL FOR INJ+SOLV,aminosyn ii 10% in plastic container,1.0,48.571429,4551.367114,AMINO ACIDS (10% (10GM/100ML))
1,ABBOSYNAGIS 50MG PUL FOR INJ+SOLV,ionosol b and dextrose 5% in plastic container,1.0,48.101266,2721.220772,"DEXTROSE (5GM/100ML), MAGNESIUM CHLORIDE (53MG..."
2,ABELCET 5MG/ML VIAL 20ML (100MG/20ML),abelcet,1.0,100.000000,668.209723,AMPHOTERICIN B (5MG/ML)
3,ABILIFY MAINTENA 300 MG VIAL,abilify,1.0,100.000000,1449.973105,ARIPIPRAZOLE (9.75MG/1.3ML (7.5MG/ML))
4,ABILIFY 10MG 28TAB,abilify,28.0,100.000000,267.489485,ARIPIPRAZOLE (9.75MG/1.3ML (7.5MG/ML))
...,...,...,...,...,...,...
2976,ZYPREXA INJ 10MG,zyprexa,1.0,100.000000,112.125863,OLANZAPINE (20MG)
2977,ZYPREXA VELOTAB 10MG 28TAB,zyprexa,28.0,100.000000,148.300420,OLANZAPINE (20MG)
2978,ZYPREXA VELOTAB 5MG 28TAB,zyprexa,28.0,100.000000,87.710212,OLANZAPINE (20MG)
2979,ZYTIGA 250MG 120TAB,zytiga,120.0,100.000000,17542.176269,ABIRATERONE ACETATE (500MG)


In [67]:
# Read the prescriptions table and collect the distinct values of its `drug` column.
df = pd.read_csv('/Users/hiruzen/Programming/Projects/mimic-iv-datawarehouse/mimic-iv/hosp/prescriptions.csv')
unique_drugs = df['drug'].unique()

# Normalise the names (lower-case, then strip surrounding whitespace) in one chain.
unique_drugs_series = pd.Series(unique_drugs).str.lower().str.strip()
unique_drugs_series

0                                  fentanyl citrate
1                                         lorazepam
2                                         midazolam
3      insulin pump (self administering medication)
4                                          propofol
                           ...                     
626                                artificial tears
627                           albuterol-ipratropium
628                             atropine sulfate 1%
629                  artificial tears preserv. free
630                         carbamide peroxide 6.5%
Length: 631, dtype: object

In [92]:
# Normalise both text columns of the mapping (lower-case, then strip) and write
# them back so later equality lookups compare like with like.
normalized_ingredients = df_mapping['drugs'].str.lower().str.strip()
normalized_names = df_mapping['medicine_name'].str.lower().str.strip()
df_mapping['drugs'] = normalized_ingredients
df_mapping['medicine_name'] = normalized_names

# Candidate pool for fuzzy matching: every distinct, non-null ingredient string
# followed by every distinct, non-null medicine name.
candidate_pool = pd.concat([normalized_ingredients, normalized_names])
combined_choices = candidate_pool.dropna().unique().tolist()

unique_drugs_list = unique_drugs_series.tolist()

def fuzzy_match_reverse(unique_drug, choices, threshold=80):
    """Return the best fuzzy match for ``unique_drug`` among ``choices``.

    Parameters
    ----------
    unique_drug : str or null
        Name to look up; a null value short-circuits to ``(None, None)``.
    choices : sequence of str
        Candidate strings handed to ``process.extractOne``.
    threshold : int or float, default 80
        Minimum ``fuzz.token_set_ratio`` score a candidate must reach.

    Returns
    -------
    tuple
        ``(matched_string, score)`` when the best candidate scores at least
        ``threshold``; otherwise ``(None, None)``.
    """
    if not pd.isnull(unique_drug):
        candidate = process.extractOne(unique_drug, choices, scorer=fuzz.token_set_ratio)
        if candidate:
            text, similarity = candidate[0], candidate[1]
            if similarity >= threshold:
                return text, similarity
    return None, None
# For every distinct prescription drug name, find the closest entry in the mapping
# (by ingredient string or medicine name) and carry over its name, package size
# and cost. Drugs without a match keep only their own name.
results = []
for drug in unique_drugs_list:
    match, score = fuzzy_match_reverse(drug, combined_choices, threshold=80)
    record = {'drug': drug}
    # Only scan the mapping when there is something to look for; comparing the
    # columns against None can never select a row.
    if match is not None:
        matches = df_mapping[(df_mapping['drugs'] == match) | (df_mapping['medicine_name'] == match)]
        if not matches.empty:
            # Take the first matching row
            row = matches.iloc[0]
            record.update({
                'medicine_name': row['medicine_name'],
                'package_size': row['package_size'],
                'cost': row['cost'],
            })
    results.append(record)

# Explicit columns guarantee that `cost` and `package_size` exist (as all-NaN
# columns) even if no drug matched, so later column lookups do not raise KeyError.
final_df = pd.DataFrame(results, columns=['drug', 'medicine_name', 'package_size', 'cost'])
final_df

Unnamed: 0,drug,medicine_name,package_size,cost
0,fentanyl citrate,abstral 100mcg 30tab sublingual,30.0,955.510329
1,lorazepam,notensyl syrup 110ml,110.0,
2,midazolam,omr-igg-am 5% sol for inj 100ml,100.0,1668.671489
3,insulin pump (self administering medication),,,
4,propofol,,,
...,...,...,...,...
626,artificial tears,,,
627,albuterol-ipratropium,,,
628,atropine sulfate 1%,,,
629,artificial tears preserv. free,,,


In [93]:
# Coerce both features to numeric. Plain column assignment is used on purpose:
# `df.loc[:, col] = ...` writes into the existing column in place (pandas >= 2.0),
# which would leave an object-dtype column as object even after `to_numeric`.
final_df['cost'] = pd.to_numeric(final_df['cost'], errors='coerce')
final_df['package_size'] = pd.to_numeric(final_df['package_size'], errors='coerce')

impute_df = final_df[['cost', 'package_size']]

# KNNImputer drops features that are entirely missing, which would make the
# imputed array narrower than `impute_df.columns`; fail with a clear message.
empty_features = [name for name in impute_df.columns if impute_df[name].isna().all()]
if empty_features:
    raise ValueError(f"Cannot impute: no observed values in column(s) {empty_features}")

# Fill missing costs from the 3 nearest rows in (cost, package_size) space.
imputer = KNNImputer(n_neighbors=3)
imputed_array = imputer.fit_transform(impute_df)
imputed_df = pd.DataFrame(imputed_array, columns=impute_df.columns, index=final_df.index)

# Only the imputed cost is written back (package_size keeps its gaps here),
# capped at 400.
final_df['cost'] = imputed_df['cost'].clip(upper=400)

def randomize_if_max(x, lower=100, upper=400, rng=None):
    """Replace a value sitting exactly at the cap with a random draw.

    Parameters
    ----------
    x : number
        Value to inspect.
    lower, upper : number, default 100 and 400
        Bounds of the uniform draw. ``upper`` is also the cap that triggers
        the replacement (the original compared against a hard-coded 400,
        which ignored a caller-supplied ``upper``).
    rng : numpy.random.Generator, optional
        Source of randomness; pass a seeded generator for reproducible
        output. Falls back to the global ``np.random`` state when omitted.

    Returns
    -------
    number
        A draw from ``uniform(lower, upper)`` when ``x == upper``; otherwise
        ``x`` unchanged (NaN never equals the cap and passes through).
    """
    if x == upper:
        draw = np.random.uniform if rng is None else rng.uniform
        return draw(lower, upper)
    return x

# Run every cost through `randomize_if_max`, then store the result back in place.
randomized_cost = final_df['cost'].apply(randomize_if_max)
final_df.loc[:, 'cost'] = randomized_cost

def extract_tablet_count_from_medicine(row):
    """Return the package size for ``row``, falling back to the medicine name.

    Parameters
    ----------
    row : mapping
        Must provide ``'package_size'`` and ``'medicine_name'``.

    Returns
    -------
    number or None
        ``row['package_size']`` when it is not null. Otherwise the integer
        written directly before ``tab`` in the medicine name (``28`` for
        ``'abilify 10mg 28tab'``), or ``None`` when the name is not a string
        or carries no such count.
    """
    if pd.notnull(row['package_size']):
        return row['package_size']
    med = row['medicine_name']
    if isinstance(med, str):
        # Anchor on "tab": the first number in a name is usually the strength
        # (e.g. "10mg"), not the number of tablets.
        m = re.search(r'(\d+)\s*tab', med, flags=re.IGNORECASE)
        if m:
            return int(m.group(1))
    return None

# Back-fill missing package sizes from the medicine name and make sure the column
# ends up numeric. Plain column assignment replaces the column, so the coerced
# dtype sticks.
final_df['package_size'] = pd.to_numeric(
    final_df.apply(extract_tablet_count_from_medicine, axis=1),
    errors='coerce',
)

# Keep one row per drug name (the first occurrence).
final_df = final_df.drop_duplicates(subset=['drug'])

final_df.to_csv('/Users/hiruzen/Programming/Projects/mimic-iv-datawarehouse/resources/final_drug_mapping.csv', index=False)

# Display last: a bare expression only renders when it is the final statement
# of the cell.
final_df