In [None]:
%load_ext autoreload
%autoreload 2

import root_config as rc #this is needed to resolve the local modules
rc.configure()

import os
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from detectdd.config import *
from detectdd.auth_bigquery import BigQueryClient

In [None]:
def main():
    """Fetch the ICU item dictionary rows that link to ``inputevents``.

    Runs one BigQuery query against the table named by ``icu_d_items`` and
    normalises the ``label`` column.

    Returns:
        DataFrame with the columns itemid, label, abbreviation, category and
        unitname. Missing labels are replaced by "" and every label is
        lower-cased.
    """
    mimic_drugs = f"""
        SELECT itemid, label, abbreviation, category, unitname
        FROM {icu_d_items}
        WHERE linksto='inputevents' """

    items = BigQueryClient.auth().query(mimic_drugs).to_dataframe()
    # Blank out missing labels first so str.lower never sees a NaN.
    items["label"] = items["label"].fillna("").apply(str.lower)
    return items

# Query BigQuery once; exact_match, fuzzy_merge and get_interaction_clause_raw
# all read this module-level frame.
icu_drugs = main()

In [None]:
def read_drugs():
    """Read the NDC product table and normalise its two drug-name columns.

    The CSV ``NDC_product_table.csv`` is loaded from ``data_dir`` using the
    ISO-8859-1 encoding.

    Returns:
        The table as a DataFrame in which NONPROPRIETARYNAME and
        PROPRIETARYNAME have missing values replaced by "" and are lower-cased.
    """
    products = pd.read_csv(data_dir / "NDC_product_table.csv", encoding='ISO-8859-1')
    for name_column in ("NONPROPRIETARYNAME", "PROPRIETARYNAME"):
        # fillna before lower-casing so str.lower only ever receives strings.
        products[name_column] = products[name_column].fillna("").apply(str.lower)
    return products
# Load the NDC name table once for the matching cells below; the bare name
# renders the frame as this cell's output.
ndc_drug_synonyms = read_drugs()
ndc_drug_synonyms

In [None]:
from detectdd.drug_index import DrugIndex

# NOTE(review): drug_index is not referenced by any other cell visible in this
# notebook -- confirm it is still needed before keeping this (possibly slow) load.
drug_index = DrugIndex.init_with_drugbank()


In [None]:
def read_hosp_drugs():
    """Unfinished stub: builds a query fragment and returns None.

    TODO: the SQL text below is incomplete and is never sent to BigQuery, and
    no other cell visible here calls this function. Finish or remove it.
    """
    sql_hosp_drugs_query = """
    SELECT """
    # The authenticated client is created but discarded; nothing is queried.
    BigQueryClient.auth()

In [None]:
def exact_match():
    """Left-join ICU item labels onto exact NDC name matches.

    The ICU ``label`` is first joined to NDC ``NONPROPRIETARYNAME`` and the
    result is then joined to NDC ``PROPRIETARYNAME``. Each NDC side is
    de-duplicated on its join column beforehand. Columns present in both
    merges (e.g. PRODUCTID) end up with ``_x`` (first merge) and ``_y``
    (second merge) suffixes, and the ``_merge`` indicator in the result
    describes the second (PROPRIETARYNAME) merge only.

    Prints the per-column non-null counts of the result.

    Returns:
        The merged DataFrame, one row per ICU item.
    """
    by_generic_name = icu_drugs.merge(
        ndc_drug_synonyms.drop_duplicates(subset="NONPROPRIETARYNAME"),
        left_on=['label'],
        right_on=['NONPROPRIETARYNAME'],
        how='left',
        indicator=True,
    )
    # Keeping only these columns also discards the first merge's ``_merge``
    # indicator, which frees the name for the second merge.
    by_generic_name = by_generic_name[['itemid', 'label', 'category', 'PRODUCTID', 'NONPROPRIETARYNAME']]

    by_brand_name = by_generic_name.merge(
        ndc_drug_synonyms.drop_duplicates(subset="PROPRIETARYNAME"),
        left_on=['label'],
        right_on='PROPRIETARYNAME',
        how='left',
        indicator=True,
    )
    print(by_brand_name.count())
    return by_brand_name

# Exact-name matches; later cells read the PRODUCTID_x / PRODUCTID_y and _merge
# columns of this frame.
matched = exact_match()
matched

In [None]:
# Rows without a PROPRIETARYNAME partner: exact_match() keeps only the
# indicator of its second merge, so _merge says nothing about NONPROPRIETARYNAME.
matched.loc[matched._merge != 'both']

In [None]:

def fuzzy_match(str1, str2):
    """Return the ``fuzz.token_set_ratio`` similarity score of two strings.

    Used as the ``scorer`` callback for ``process.extractOne``.
    """
    similarity = fuzz.token_set_ratio(str1, str2)
    return similarity

def match_dataframe(df1, key1, df2, key2, threshold=90, fast=True, norm_key="NONPROPRIETARYNAME"):
    """Fuzzy-match each ``df1[key1]`` value against the values of ``df2[key2]``.

    For every row of ``df1`` the best candidate is selected with
    ``process.extractOne`` (scored by ``fuzzy_match``) and kept when its score
    reaches ``threshold``.

    Args:
        df1: Frame whose rows are matched; must contain ``key1`` and ``itemid``.
        key1: Column of ``df1`` holding the name to look up.
        df2: Frame providing the candidates; must contain ``key2`` and
            ``norm_key``.
        key2: Column of ``df2`` holding the candidate names.
        threshold: Minimum score (inclusive) for a match to be recorded.
        fast: When true, only candidates starting with the same first
            character as the looked-up name are scored. This is a speed-up
            that can miss matches whose first character differs.
        norm_key: Column of ``df2`` copied into ``norm_label`` for each match.

    Returns:
        DataFrame with one row per accepted match and the columns
        index, itemid, label, norm_label, matched_label, score, norm_index.
        Rows of ``df1`` with a blank / non-string name, or with no candidate
        to compare against, are skipped instead of raising.
    """
    all_candidates = df2[key2]
    # first character -> candidates starting with it; filled lazily so the
    # (expensive) startswith scan runs once per character instead of per row.
    candidates_by_initial = {}
    matches = []
    for i, row in df1.iterrows():
        drug_name = row[key1]

        # Blank labels (NaN labels are turned into "" upstream) have no first
        # character and nothing meaningful to match, so skip them.
        if not isinstance(drug_name, str) or not drug_name:
            print(i)
            continue

        if fast:
            initial = drug_name[0]
            if initial not in candidates_by_initial:
                same_initial = all_candidates.str.startswith(initial, na=False)
                candidates_by_initial[initial] = all_candidates.loc[same_initial]
            to_match = candidates_by_initial[initial]
        else:
            to_match = all_candidates

        # extractOne yields None when there is nothing to choose from; guard so
        # the tuple unpacking below cannot fail.
        match = process.extractOne(drug_name, to_match, scorer=fuzzy_match) if len(to_match) else None
        if match is None:
            print(i)
            continue

        print(match)
        # For a Series of choices the result is (value, score, index label).
        matched_label, match_score, match_index = match[0], match[1], match[2]
        if match_score >= threshold:
            raw_match = df2.loc[match_index]
            norm_label = raw_match[norm_key]
            print(f"Found match with score ({str(match[1])}) : {row[key1]} - {matched_label} -- norm label {norm_label}")
            matches.append([i, row['itemid'], row[key1], norm_label, matched_label, match_score, match_index])
        print(i)

    return pd.DataFrame(matches, columns=['index', 'itemid', 'label', 'norm_label', 'matched_label', 'score', 'norm_index'])



def fuzzy_merge():
    """Fuzzy-match ICU item labels against NDC proprietary names.

    Returns:
        The frame produced by ``match_dataframe`` for the selected ICU items.
    """
    # NOTE(review): the mask is negated, so this keeps the items whose category
    # is NOT 'Medications' -- the original local was called ``medications``.
    # Confirm which subset is intended.
    in_medications = icu_drugs['category'].isin(['Medications'])
    candidates = icu_drugs.loc[~in_medications]

    return match_dataframe(candidates, "label", ndc_drug_synonyms, "PROPRIETARYNAME")

# Slow: one fuzzy search per ICU item. get_interaction_clause_with_synonyms
# reads the norm_label / itemid columns of this frame.
fuzzy_matched = fuzzy_merge()
fuzzy_matched

In [None]:
# Ad-hoc inspection of NDC proprietary names beginning with "glyc".
# NOTE: m1 is reassigned with a different meaning in a later cell.
m1 = ndc_drug_synonyms['PROPRIETARYNAME'].str.startswith("glyc")
ndc_drug_synonyms.loc[m1]


In [None]:
# A row has a product id when either exact-match merge supplied a PRODUCTID
# (_x comes from the NONPROPRIETARYNAME merge, _y from the PROPRIETARYNAME merge).
m1 = matched['PRODUCTID_x'].notna()
m2 = matched['PRODUCTID_y'].notna()
has_product_id = m1 | m2
# Per-category non-null counts of the items that found no product id at all.
matched[~has_product_id].groupby('category').count()


In [None]:
# Exactly-matched items whose category is something other than 'Medications'.
matched.loc[(~matched['category'].isin(['Medications'])) & has_product_id]

In [None]:
# Display the ICU item table loaded by main().
icu_drugs

In [None]:
def read_ddinter(directory=None):
    """Load every DDInter CSV in a directory into one DataFrame.

    Args:
        directory: Folder containing the DDInter CSV exports. Defaults to
            ``ddinter_data_dir`` when omitted.

    Returns:
        The row-wise concatenation of all CSV files, read in sorted file-name
        order so the result is reproducible. ``Drug_A`` and ``Drug_B`` have
        missing values replaced by "" and are lower-cased. Each file keeps its
        own row index, so index labels repeat across files.

    Raises:
        FileNotFoundError: If the directory contains no ``.csv`` file.
    """
    if directory is None:
        directory = ddinter_data_dir

    # os.listdir order is arbitrary and may include non-data entries
    # (e.g. .ipynb_checkpoints), so keep only CSV files and sort them.
    csv_names = sorted(
        name for name in os.listdir(directory) if name.lower().endswith(".csv")
    )
    if not csv_names:
        raise FileNotFoundError(f"No DDInter CSV files found in {directory}")

    # Read everything first and concatenate once; growing a frame inside the
    # loop re-copies all previous rows on every iteration.
    df = pd.concat([pd.read_csv(os.path.join(directory, name)) for name in csv_names])

    for column in ("Drug_A", "Drug_B"):
        df[column] = df[column].fillna("").apply(str.lower)
    return df

# Load all DDInter interactions, then keep only those whose Level is 'Major'.
ddinter = read_ddinter()
cleaned = ddinter[ddinter['Level'].isin(['Major'])]
# Disabled filter kept for reference. NOTE(review): it refers to a
# 'norm_label_x' column, which match_dataframe does not produce.
# cleaned = cleaned.loc[(cleaned['Drug_B'].isin( fuzzy_matched['norm_label_x']) | cleaned['Drug_A'].isin( fuzzy_matched['norm_label_x']))]
cleaned

In [None]:
def get_interaction_clause_with_synonyms():
    """Build the SQL pair filter from the fuzzy-matched (synonym) item ids.

    For every ``Drug_A`` in ``cleaned`` the item ids whose ``norm_label``
    equals that drug are paired with the item ids whose ``norm_label`` is one
    of its ``Drug_B`` partners. Drugs for which either side has no item id are
    left out.

    Prints the number of generated clauses and the final clause text.

    Returns:
        The per-drug conditions joined with " OR "; an empty string when no
        drug produced a condition.
    """
    # NOTE(review): only the Drug_A -> Drug_B direction is emitted (first_ie is
    # always Drug_A) -- confirm the source lists each interaction both ways.
    partners_by_drug = cleaned.groupby('Drug_A')['Drug_B'].apply(list).to_dict()
    clauses = []
    for drug_a, partner_names in partners_by_drug.items():
        first_ids = fuzzy_matched.loc[fuzzy_matched['norm_label'] == drug_a, 'itemid']
        second_ids = fuzzy_matched.loc[fuzzy_matched['norm_label'].isin(partner_names), 'itemid']
        if not (first_ids.any() & second_ids.any()):
            continue
        first_sql = ','.join(str(item) for item in first_ids if item)
        second_sql = ','.join(str(item) for item in second_ids if item)
        clauses.append(f"(first_ie.itemid IN ({first_sql}) AND second_ie.itemid IN ({second_sql}))")
    print(len(clauses))
    clause = " OR ".join(clauses)
    print (clause)
    return clause

In [None]:
def get_interaction_clause_raw():
    """Build the SQL pair filter from exact ICU label matches.

    For every ``Drug_A`` in ``cleaned`` the ICU items whose ``label`` equals
    that drug are paired with the ICU items whose ``label`` is one of its
    ``Drug_B`` partners. Drugs for which either side has no item id are left
    out.

    Prints the number of distinct ``Drug_A`` values.

    Returns:
        The per-drug conditions (each starting on a new line) joined with
        " OR "; an empty string when no drug produced a condition.
    """
    partners_by_drug = cleaned.groupby('Drug_A')['Drug_B'].apply(list).to_dict()
    print(len(partners_by_drug))
    clauses = []
    for drug_a, partner_names in partners_by_drug.items():
        first_ids = icu_drugs.loc[icu_drugs['label'] == drug_a, 'itemid']
        second_ids = icu_drugs.loc[icu_drugs['label'].isin(partner_names), 'itemid']
        if not (first_ids.any() & second_ids.any()):
            continue
        first_sql = ','.join(str(item) for item in first_ids if item)
        second_sql = ','.join(str(item) for item in second_ids if item)
        clauses.append(f"\n(first_ie.itemid IN ({first_sql}) AND second_ie.itemid IN ({second_sql}))")
    return " OR ".join(clauses)

In [None]:
# NOTE(review): this module-level mapping is not read by any code visible here;
# both clause builders compute their own local ``multimap``.
multimap = cleaned.groupby('Drug_A')['Drug_B'].apply(list).to_dict()
def query_for_drug_interactions(type='synonyms'):
    """Query MIMIC-IV ICU input events for co-administered interacting drugs.

    Self-joins ``inputevents`` on ``stay_id`` and keeps pairs of events where
    the item ids satisfy the interaction clause, both amounts are positive,
    the first drug starts before the second, and the second starts less than
    300 minutes after the first.

    Args:
        type: 'synonyms' builds the pair filter with
            ``get_interaction_clause_with_synonyms``; any other value uses
            ``get_interaction_clause_raw``. (The name shadows the builtin but
            is kept because callers pass it by keyword.)

    Returns:
        DataFrame with the columns subject_id, hadm_id, stay_id,
        drug_a_item_id, drug_b_item_id, dose_b_time and event_count. When no
        drug pair could be mapped to item ids an empty frame with these
        columns is returned and BigQuery is not queried.
    """
    if type == 'synonyms':
        clause = get_interaction_clause_with_synonyms()
    else:
        clause = get_interaction_clause_raw()

    if not clause.strip():
        # An empty filter would render as "WHERE  AND ...", which is invalid SQL.
        return pd.DataFrame(columns=[
            'subject_id', 'hadm_id', 'stay_id', 'drug_a_item_id',
            'drug_b_item_id', 'dose_b_time', 'event_count',
        ])

    icu = "physionet-data.mimiciv_icu"

    # The clause is a chain of "(...) OR (...)" terms. AND binds tighter than
    # OR, so it must be parenthesised; otherwise the amount / ordering /
    # time-window conditions would only restrict the last OR term.
    sql = f"""SELECT first_ie.subject_id, first_ie.hadm_id, first_ie.stay_id, first_ie.itemid as drug_a_item_id, second_ie.itemid as drug_b_item_id, MAX(second_ie.starttime) as dose_b_time, count(*) as event_count
        FROM `{icu}.inputevents` as first_ie
        INNER JOIN `{icu}.inputevents` as second_ie ON first_ie.stay_id = second_ie.stay_id
        WHERE ({clause}) AND first_ie.amount > 0
            AND second_ie.amount > 0
            AND first_ie.starttime < second_ie.starttime
            AND DATETIME_DIFF(second_ie.starttime, first_ie.starttime, MINUTE) < 300
        GROUP BY first_ie.subject_id, first_ie.hadm_id, first_ie.stay_id, first_ie.itemid, second_ie.itemid, second_ie.starttime
    """
    print("\n\n",sql)
    mimic_job = BigQueryClient.auth().query(sql)
    return mimic_job.to_dataframe()

# Run both lookups (exact ICU labels and fuzzy-matched synonyms) and merge
# them, dropping rows that both queries returned identically.
drug_interactions_raw = query_for_drug_interactions(type='raw')
drug_interactions_synonyms = query_for_drug_interactions()
drug_interactions = pd.concat([drug_interactions_synonyms, drug_interactions_raw]).drop_duplicates()  

In [None]:
from detectdd.serializer import Serializer

# Persist the combined interaction cohort through the project's Serializer.
serializer = Serializer()
serializer.write_cohort(drug_interactions)

In [None]:
# Read the cohort back and show it -- presumably a round-trip check of the write above.
serializer.read_cohort()

In [None]:
# Total of event_count over all result rows, followed by the frame itself.
print(drug_interactions.event_count.sum())

drug_interactions

In [None]:
# Summary statistics for the combined interaction cohort.
drug_interactions.describe()

In [None]:
# Summary statistics restricted to rows with fewer than 18 events.
# NOTE(review): 18 is an unexplained cut-off -- document where it comes from.
drug_interactions.loc[drug_interactions.event_count < 18].describe()
