In [None]:
from detectdd import config
from detectdd.serializer import Serializer
import root_config as rc #this is needed to resolve the local modules
rc.configure()

import os
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from detectdd.config import *
from detectdd.auth_bigquery import BigQueryClient

# Announce which execution mode the project configuration selects.
print("running in FAST mode" if config.isFastMode() else "Running in FULL mode")

# Serializer instance used by the later cells that persist results.
serializer = Serializer()

In [None]:
def main():
    """Fetch the ICU item catalogue rows that link to ``inputevents``.

    Runs a BigQuery query against the table named by ``icu_d_items`` and
    returns the result as a DataFrame with the columns ``itemid``, ``label``,
    ``abbreviation``, ``category`` and ``unitname``.  Missing labels are
    replaced with the empty string and every label is lower-cased.
    """
    query = f"""
        SELECT itemid, label, abbreviation, category, unitname
        FROM {icu_d_items}
        WHERE linksto='inputevents' """

    items = BigQueryClient.auth().query(query).to_dataframe()
    # Fill nulls first so that str.lower only ever receives strings.
    items["label"] = items["label"].fillna("").map(str.lower)
    return items

icu_drugs = main()

In [None]:
from detectdd.drug_index import DrugIndex, index_mimic_drugs, clean

# Obtain the shared drug index; its `drug_bank_df` frame is extended below and
# read again by the fuzzy-matching cell.
drug_index = DrugIndex.get_drug_index()

# Add a 'cleaned' column holding clean() applied to each 'Common name'.
# NOTE(review): this mutates the frame owned by drug_index in place.
drug_index.drug_bank_df['cleaned'] = drug_index.drug_bank_df['Common name'].apply(clean)
drug_index.drug_bank_df

In [None]:
from detectdd.drug_index import clean

def read_hosp_drugs():
    """Return a DataFrame of the distinct ``medication`` values in the emar table.

    The table is looked up under the dataset named by ``config.hosp``.
    """
    distinct_medications_sql = f"""
    SELECT distinct medication from {config.hosp}.emar"""
    query_job = BigQueryClient.auth().query(distinct_medications_sql)
    return query_job.to_dataframe()

# Keep only the 'medication' column, coerced to str. Despite the df_ prefix
# this is a Series, not a DataFrame.
# NOTE(review): astype(str) presumably turns a NULL medication into a literal
# string rather than dropping it - confirm that is acceptable downstream.
df_hosp_drugs = read_hosp_drugs()['medication'].astype(str)

df_hosp_drugs

In [None]:
# Index the distinct eMAR medication names with index_mimic_drugs, using the
# 'medication' column for both column arguments.  The original cell also built
# a throw-away DataFrame on its first line whose result was discarded; that
# no-op statement has been removed.
# NOTE(review): later cells treat drugs_by_norm_name as a mapping from a drug
# name to an object exposing `db_identifier` - confirm against index_mimic_drugs.
drugs_by_norm_name = index_mimic_drugs(pd.DataFrame(df_hosp_drugs), "medication", "medication")
drugs = pd.DataFrame.from_dict(drugs_by_norm_name, orient="index")
drugs

In [None]:

def fuzzy_match(str1, str2):
    """Score the similarity of two strings with fuzzywuzzy's token-set ratio."""
    score = fuzz.token_set_ratio(str1, str2)
    return score

def match_dataframe(df1, key1, df2, key2, threshold=88): #threshold from inspection of data
    """Fuzzy-match every ``df1[key1]`` value against the values of ``df2[key2]``.

    For each row of ``df1`` the single best-scoring candidate from
    ``df2[key2]`` is selected (scored by ``fuzzy_match``); the pair is kept
    when its score is at least ``threshold``.  In fast mode only candidates
    starting with the same first character as the name are considered.

    Rows are skipped, instead of raising, when the name is missing, not a
    string or empty, and when no candidate is available to compare against
    (``process.extractOne`` returns ``None`` in that case).

    Parameters:
        df1: frame to match; must contain ``key1`` and an ``itemid`` column.
        key1: column of ``df1`` holding the names to look up.
        df2: frame providing the candidate names.
        key2: column of ``df2`` holding the candidate names.
        threshold: minimum score (inclusive) for a match to be kept.

    Returns:
        DataFrame with columns ``index``, ``itemid``, ``label``,
        ``norm_label``, ``matched_label``, ``score`` and ``norm_index``;
        empty (but with these columns) when nothing matched.
    """
    columns = ['index', 'itemid', 'label', 'norm_label', 'matched_label', 'score', 'norm_index']
    candidates = df2[key2]
    matches = []
    for i, row in df1.iterrows():
        drug_name = row[key1]
        if not isinstance(drug_name, str) or not drug_name:
            # Nothing to match on; indexing drug_name[0] below would fail.
            continue

        to_match = candidates
        if config.isFastMode():
            # speed up fuzzy matching by only considering synonyms that start with the same letter
            # na=False keeps missing candidates out of the boolean mask.
            to_match = candidates.loc[candidates.str.startswith(drug_name[0], na=False)]

        # With a Series as choices, extractOne yields (value, score, index label).
        match = process.extractOne(drug_name, to_match, scorer=fuzzy_match)
        if match is None:
            # No candidate to compare against.
            continue

        matched_label, match_score, match_index = match
        if match_score >= threshold:
            norm_label = df2.loc[match_index, key2]
            print(f"Found match with score ({str(match_score)}) : {drug_name} - {matched_label} -- norm label {norm_label}")
            matches.append([i, row['itemid'], drug_name, norm_label, matched_label, match_score, match_index])

    return pd.DataFrame(matches, columns=columns)



def fuzzy_merge():
    """Fuzzy-match ICU item labels against the cleaned DrugBank names.

    Reads the notebook-level ``icu_drugs`` and ``drug_index`` objects and
    returns the frame produced by ``match_dataframe``.
    """
    # NOTE(review): the mask is negated, so the rows kept are those whose
    # category is NOT 'Medications' - confirm that this is intended.
    outside_medications = ~icu_drugs['category'].isin(['Medications'])
    candidates = icu_drugs.loc[outside_medications]

    return match_dataframe(candidates, "label", drug_index.drug_bank_df, "cleaned")

fuzzy_matched = fuzzy_merge()
fuzzy_matched

In [None]:

fuzzy_matched


In [None]:
icu_drugs

In [None]:
def read_ddinter():
    """Load every file in ``ddinter_data_dir`` as CSV into one DataFrame.

    All files are read and concatenated in a single step (rather than growing
    a frame inside the loop), in sorted file-name order so the row order is
    reproducible.  Missing ``Drug_A`` / ``Drug_B`` values are replaced by the
    empty string and both columns are then passed through ``clean``.

    Raises:
        FileNotFoundError: if the directory contains no files.
    """
    file_names = sorted(os.listdir(ddinter_data_dir))
    if not file_names:
        raise FileNotFoundError(f"No DDInter files found in {ddinter_data_dir}")

    frames = [pd.read_csv(ddinter_data_dir / file_name) for file_name in file_names]
    df = pd.concat(frames)

    for column in ("Drug_B", "Drug_A"):
        df[column] = df[column].fillna("").apply(clean)
    return df

# Load the DDInter interactions, keep only those whose Level is 'Major',
# and drop exact duplicate rows.
ddinter = read_ddinter()
is_major = ddinter.Level.isin(['Major'])
cleaned = ddinter.loc[is_major].drop_duplicates()
cleaned

In [None]:
def get_ddinter_multimap(interactions=None):
    """Build a symmetric drug -> set-of-interacting-drugs mapping.

    Parameters:
        interactions: DataFrame with ``Drug_A`` and ``Drug_B`` columns, one
            row per interacting pair.  Defaults to the notebook-level
            ``cleaned`` frame, which preserves the original zero-argument call.

    Returns:
        dict mapping each drug name to the set of drugs it is paired with,
        regardless of which of the two columns it appeared in.
    """
    if interactions is None:
        interactions = cleaned

    multimap = interactions.groupby('Drug_A')['Drug_B'].apply(set).to_dict()
    opp_direction = interactions.groupby('Drug_B')['Drug_A'].apply(set).to_dict()
    # Fold the B -> A direction in so that lookups work from either drug.
    for drug, partners in opp_direction.items():
        multimap.setdefault(drug, set()).update(partners)
    return multimap

In [None]:
def get_interaction_clause_with_synonyms():
    """Build the SQL predicate that pairs interacting ICU items via fuzzy matches.

    For every drug in the DDInter multimap, the item ids whose fuzzy-matched
    ``norm_label`` equals that drug are paired with the item ids whose
    ``norm_label`` is one of its interaction partners.  Falsy item ids are
    left out, and a clause is emitted only when both sides are non-empty.

    Returns:
        The clauses joined with ``" OR "``; an empty string when there are
        none.  The result is NOT wrapped in parentheses.
    """
    multimap = get_ddinter_multimap()

    clauses = []
    for key, partners in multimap.items():
        first_ids = fuzzy_matched.loc[fuzzy_matched['norm_label'] == key, 'itemid']
        second_ids = fuzzy_matched.loc[fuzzy_matched['norm_label'].isin(partners), 'itemid']

        first_in = ','.join(str(item) for item in first_ids if item)
        second_in = ','.join(str(item) for item in second_ids if item)
        if first_in and second_in:
            clauses.append(f"(first_ie.itemid IN ({first_in}) AND second_ie.itemid IN ({second_in}))")

    clause = " OR ".join(clauses)
    print(clause)
    # The figure reported here is the number of clauses, not of item ids.
    print(f"Found {len(clauses)} clauses")
    return clause

In [None]:
def get_interaction_clause_raw():
    """Build the SQL predicate that pairs interacting ICU items by exact label.

    Each DDInter drug name is compared verbatim with the ``label`` column of
    ``icu_drugs``; a clause is produced when both the drug and at least one of
    its interaction partners have item ids.  Prints the multimap size, the
    number of DDInter keys found among the labels, and the clause count.

    Returns the clauses joined with ``" OR "`` (not parenthesised).
    """
    multimap = get_ddinter_multimap()
    print(len(multimap))

    labels = icu_drugs['label']
    clauses = []
    first_match_count = 0
    for key, partners in multimap.items():
        first_ids = icu_drugs.loc[labels == key, 'itemid']
        has_first = first_ids.any()
        if has_first:
            first_match_count += 1

        second_ids = icu_drugs.loc[labels.isin(partners), 'itemid']
        if has_first and second_ids.any():
            first_in = ','.join(str(item) for item in first_ids if item)
            second_in = ','.join(str(item) for item in second_ids if item)
            clauses.append(f"\n(first_ie.itemid IN ({first_in}) AND second_ie.itemid IN ({second_in}))")

    clause = " OR ".join(clauses)
    print(f"Found {first_match_count} administered ddinter keys")
    print(f"Found {len(clauses)} clauses")
    return clause

In [None]:
# Side-by-side view of the sorted DDInter drug names and the sorted ICU labels.
multimap = get_ddinter_multimap()
df = pd.DataFrame()
# sort_values() keeps the original (now shuffled) index labels.  Assigning
# that Series to the empty frame would make the frame adopt the shuffled
# index, and the positionally indexed icu_labels column below would then be
# aligned by label and appear out of order.  Resetting the index keeps both
# columns in sorted order.
df['dinter'] = pd.Series(multimap.keys()).sort_values().reset_index(drop=True)
val = list(icu_drugs['label'])
val.sort()
# NOTE(review): this column is aligned to the rows of 'dinter', so it is
# truncated or padded with NaN when the two lists differ in length.
df['icu_labels'] = pd.Series(val)
df

In [None]:

# Report how many DDInter drug names appear in drug_index.indexed_synonyms.
print(f"Found {len(df.loc[df['dinter'].isin(drug_index.indexed_synonyms)])} ddinter keys in drugbank of {len(df['dinter'])}")

# Single-column frame ('medication') holding the ICU labels, in the layout
# that compare_to_ddinter reads.
medication = pd.DataFrame()
medication['medication'] = icu_drugs['label']

def compare_to_ddinter(to_compare, descriptor):
    """Print how many drugs can be resolved to DDInter, step by step.

    Three counts are reported: names found directly among the DDInter names
    (``df['dinter']``), then - of the remainder - names found in
    ``drug_index.common_names``, then - of what is still left - names found
    in ``drug_index.indexed_synonyms``.

    Parameters:
        to_compare: DataFrame with a ``medication`` column.
        descriptor: label for the drug source, used in every printed line
            (the last two lines previously always said "icu").
    """
    names = to_compare['medication']
    print(f"\nTotal {len(to_compare)} {descriptor} drugs")

    in_ddinter = names.isin(df['dinter'])
    print(f"Found {int(in_ddinter.sum())} {descriptor} drugs in dinter without using synonyms")

    remaining = names.loc[~in_ddinter]
    in_common_names = remaining.isin(drug_index.common_names)
    print(f"Found {int(in_common_names.sum())} {descriptor} drugs in indexed common names")

    remaining = remaining.loc[~in_common_names]
    in_synonyms = remaining.isin(drug_index.indexed_synonyms)
    print(f"Found {int(in_synonyms.sum())} {descriptor} drugs in indexed synonyms")

# Coverage report for the ICU labels.
compare_to_ddinter(medication, 'icu')
# Rebuild the single-column frame from the hospital (eMAR) medication names
# and repeat the report for them.
medication = pd.DataFrame()
medication['medication'] = df_hosp_drugs
compare_to_ddinter(medication, 'hosp')

# Count the eMAR medication names that do not appear verbatim among the ICU labels.
print(f"Found {len(df_hosp_drugs[~df_hosp_drugs.isin(icu_drugs['label'])])} hosp drugs not in icu_labels")

In [None]:
def query_for_drug_interactions(type='synonyms'):
    """Query ICU input events for pairs of interacting drugs given in one stay.

    Parameters:
        type: ``'synonyms'`` to pair items through the fuzzy-matched names;
            any other value pairs items by exact label.

    Returns:
        DataFrame with ``subject_id``, ``hadm_id``, ``stay_id``,
        ``drug_a_item_id``, ``drug_b_item_id``, ``dose_b_time`` and
        ``event_count``.

    Raises:
        ValueError: when no interaction clause could be built, since the
            generated SQL would otherwise be syntactically invalid.
    """
    if type == 'synonyms':
        clause = get_interaction_clause_with_synonyms()
    else:
        clause = get_interaction_clause_raw()

    if not clause.strip():
        raise ValueError(f"No '{type}' interaction clauses were generated; nothing to query")

    icu = "physionet-data.mimiciv_icu"

    # The clause is a chain of OR-ed alternatives.  AND binds tighter than OR,
    # so it must be parenthesised; otherwise the amount and time-window
    # filters would apply to the last alternative only.
    # NOTE(review): the GROUP BY includes second_ie.starttime, so
    # MAX(second_ie.starttime) is just that start time - confirm intended.
    sql = f"""SELECT first_ie.subject_id, first_ie.hadm_id, first_ie.stay_id, first_ie.itemid as drug_a_item_id, second_ie.itemid as drug_b_item_id, MAX(second_ie.starttime) as dose_b_time, count(*) as event_count
        FROM `{icu}.inputevents` as first_ie
        INNER JOIN `{icu}.inputevents` as second_ie ON first_ie.stay_id = second_ie.stay_id
        WHERE ({clause})
            AND first_ie.amount > 0
            AND second_ie.amount > 0
            AND first_ie.starttime < second_ie.starttime
            AND DATETIME_DIFF(second_ie.starttime, first_ie.starttime, MINUTE) < 300
        GROUP BY first_ie.subject_id, first_ie.hadm_id, first_ie.stay_id, first_ie.itemid, second_ie.itemid, second_ie.starttime
    """
    print("\n\n",sql)
    mimic_job = BigQueryClient.auth().query(sql)
    return mimic_job.to_dataframe()

# NOTE(review): interaction_clause is assigned three times below but never
# read in this cell - both query types are always executed regardless of the
# fast-mode setting. Confirm whether it can be removed.
interaction_clause = 'synonyms'
if config.isFastMode():
    interaction_clause = 'raw'

interaction_clause = 'raw'
# Run both matching strategies, then combine their rows and drop exact duplicates.
icu_drug_interactions_raw = query_for_drug_interactions(type='raw')
icu_drug_interactions_synonyms = query_for_drug_interactions(type='synonyms')
print(f"raw interactions {len(icu_drug_interactions_raw)}")
print(f"synonym interactions {len(icu_drug_interactions_synonyms)}")
icu_drug_interactions = pd.concat([icu_drug_interactions_synonyms, icu_drug_interactions_raw]).drop_duplicates()
print(f"combined interactions {len(icu_drug_interactions)}")
print(f"Total unique hadms: {len(icu_drug_interactions.drop_duplicates()['hadm_id'].drop_duplicates())}")

In [None]:
icu_drug_interactions

In [None]:
icu_drug_interactions.drop_duplicates()['hadm_id'].drop_duplicates()

In [None]:
def get_emar_interaction_clause(multimap=None, drugs_by_name=None):
    """Build one SQL predicate per eMAR drug that has known interaction partners.

    Parameters:
        multimap: mapping of drug name -> iterable of interacting drug names.
            Defaults to ``get_ddinter_multimap()``.
        drugs_by_name: mapping of drug name -> object exposing
            ``db_identifier``.  Defaults to the notebook-level
            ``drugs_by_norm_name``.

    Returns:
        list of strings of the form
        ``(e1.medication = '<id>' AND e2.medication IN ('<id>','<id>'))\\n``,
        one per drug that is itself indexed and has at least one indexed
        partner.

    The original guard ``bool(first_id) & len(second_ids) > 0`` was parsed as
    ``(bool(first_id) & len(second_ids)) > 0`` and therefore dropped every
    drug with an even number of partners; a plain boolean test is used now.
    """
    if multimap is None:
        multimap = get_ddinter_multimap()
    if drugs_by_name is None:
        drugs_by_name = drugs_by_norm_name

    def identifier_for(name):
        # Identifier of an indexed drug, or None when the name is not indexed.
        entry = drugs_by_name.get(name)
        return entry.db_identifier if entry is not None else None

    clauses = []
    for key, partners in multimap.items():
        first_id = identifier_for(key)
        second_ids = [ident for ident in map(identifier_for, partners) if ident]

        if first_id and second_ids:
            # NOTE(review): identifiers are interpolated without escaping; a
            # value containing a single quote would break the generated SQL.
            inclause = "','".join(str(item) for item in second_ids)
            clauses.append(f"(e1.medication = '{first_id}' AND e2.medication IN ('{inclause}'))\n")
    print(f"Found {len(clauses)} hosp interactions")
    return clauses

get_emar_interaction_clause()

In [None]:
# emar table has more drugs administered than the icu records
def query_for_emar_drug_interactions():
    """Query the eMAR table for interacting drug pairs given close to an ICU stay.

    The SQL self-joins emar on ``hadm_id`` so that e2 is charted after e1 and
    less than 720 minutes later, then joins icustays on ``subject_id`` where
    either the stay starts after e2 and less than 24 hours later, or e2 is
    charted between the stay's intime and outtime.  Rows whose ``event_txt``
    is one of the "not given" statuses are excluded on both sides, and the
    pair must satisfy at least one clause from
    ``get_emar_interaction_clause()``.

    Returns the query result as a DataFrame.
    """
    # clauses = ["e1.medication = 'Citalopram' AND e2.medication IN ('Insulin')", "e1.medication = 'Insulin' AND e2.medication IN ('Citalopram')"]
    clauses = get_emar_interaction_clause()
    # Interpolated into the SQL through the tuple's Python repr, which renders
    # as a parenthesised list of single-quoted strings.
    event_txt_not_status = ('Not Given', 'Not Started', 'Not Confirmed')
    # NOTE(review): if `clauses` is empty the final parenthesised group below
    # is empty - confirm callers never hit that case.
    # NOTE(review): e1.event_txt and e2.event_txt are selected without aliases,
    # so the result has two columns with the same source name - check how they
    # are named in the returned frame.
    sql = f"""
        SELECT e1.subject_id, e1.hadm_id, 
        e1.medication as medication_1, 
        e2.medication as medication_2,
        e1.charttime as charttime_1 ,
        e2.charttime as charttime_2,
        e1.event_txt,
        e2.event_txt, 
        stays.stay_id, 
        stays.intime
        FROM `physionet-data.mimiciv_hosp.emar` as e1
        INNER JOIN `physionet-data.mimiciv_hosp.emar` as e2
            ON e1.hadm_id = e2.hadm_id 
                AND e2.charttime > e1.charttime
                AND DATETIME_DIFF(e2.charttime, e1.charttime, MINUTE) < 720
        INNER JOIN `physionet-data.mimiciv_icu.icustays` as stays ON e1.subject_id = stays.subject_id
            AND (stays.intime > e2.charttime AND DATETIME_DIFF(stays.intime, e2.charttime, HOUR) < 24 OR (e2.charttime BETWEEN stays.intime AND stays.outtime))
        WHERE
            e1.event_txt NOT IN {event_txt_not_status}
            AND e2.event_txt NOT IN {event_txt_not_status}
            AND 
            (
                {' OR '.join(clauses)}
            )
    """
    print(sql)
    return BigQueryClient.auth().query(sql).to_dataframe()
    
hosp_drug_interactions = query_for_emar_drug_interactions()    
print(f"Found {len(hosp_drug_interactions)} emar drug events")

hosp_drug_interactions = hosp_drug_interactions.drop_duplicates()
hosp_drug_interactions['hadm_id'].drop_duplicates()



In [None]:
icu_drug_interactions

In [None]:
# merge icu and emar drug interactions

# Reshape the eMAR result into the column layout of icu_drug_interactions so
# that the two frames can be stacked.
# NOTE(review): for eMAR rows the drug_*_item_id columns hold medication names
# (e1/e2.medication), whereas the ICU rows hold item ids.
mergeable_hosp_interactions = hosp_drug_interactions[['subject_id', 'hadm_id', 'stay_id']].assign(
    drug_a_item_id=hosp_drug_interactions.medication_1,
    drug_b_item_id=hosp_drug_interactions.medication_2,
    dose_b_time=hosp_drug_interactions.charttime_2,
    event_count=0,  # dummy value
)

combined_drug_interactions = pd.concat([icu_drug_interactions, mergeable_hosp_interactions])
combined_drug_interactions

In [None]:
from detectdd.serializer import Serializer 
serializer = Serializer()
serializer.write_total_drug_interactions(combined_drug_interactions)

In [None]:
serializer.write_icu_drug_interactions(icu_drug_interactions)

In [None]:
serializer.write_emar_drug_interactions(hosp_drug_interactions.drop_duplicates())

In [None]:
serializer.read_total_drug_interactions()

In [None]:
print(icu_drug_interactions.event_count.sum())

icu_drug_interactions

In [None]:
icu_drug_interactions.event_count.describe()

In [None]:
# Remove high-count outliers with Tukey's upper fence, Q3 + 1.5 * IQR.
# The fence is computed from the data: the previously hard-coded 18 did not
# match its own derivation (11 + 1.5 * 6 = 20) and would silently go stale
# whenever the query results change.
event_count_q1, event_count_q3 = icu_drug_interactions.event_count.quantile([0.25, 0.75])
event_count_upper_fence = event_count_q3 + 1.5 * (event_count_q3 - event_count_q1)
# Values equal to the fence are not outliers, hence <=.
drug_interactions_truncated = icu_drug_interactions.loc[icu_drug_interactions.event_count <= event_count_upper_fence]
print(len(drug_interactions_truncated))
# NOTE(review): `plt` here holds the object returned by DataFrame.boxplot,
# not the matplotlib.pyplot module; the name is kept for compatibility.
plt = drug_interactions_truncated.boxplot(column="event_count")
