In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from detectdd.auth_bigquery import BigQueryClient
from detectdd.serializer import Serializer

# Read the total drug-interaction cohort through the project Serializer
# (presumably persisted by an earlier pipeline step -- confirm). This is the
# input that the ICD-code filtering below narrows down.
serializer = Serializer()
cohort_before_icd_filter = serializer.read_total_drug_interactions()
# Bare last expression so the notebook renders the loaded cohort.
cohort_before_icd_filter

In [None]:
from detectdd.config import icd_data_file
# Read the raw ICD code reference table from the configured data file.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
icd_codes_raw = pd.read_csv(icd_data_file, encoding='ISO-8859-1')

# Category prefixes accepted by filter_icd_codes.
# Alternative kept for reference: ('A', 'B', 'C').
category_prefixes = ('A', 'B', 'C', 'D')


def filter_icd_codes(to_filter):
    """Keep only the rows whose category label starts with an accepted prefix.

    The category label is read from the third column of ``to_filter`` (selected
    by position, not by name) and compared against the module-level
    ``category_prefixes``. Rows with a missing category are dropped.

    Meaning of the category labels found in the data:

        A.1  Induced by medication
        A.2  Induced by medication or other causes
        B.1  Poisoning by medication
        B.2  Poisoning by or harmful use of medication or other causes
        C    ADE very likely
        D    ADE likely
        E    ADE possible
        U    ADE unlikely
        V    Induced by vaccine (added)

    This is a simple filter: only categories A through D are accepted.

    TODO: exclude codes that are unlikely to be a drug-drug interaction,
    e.g. T40.
    """
    category_labels = to_filter.iloc[:, 2]
    is_accepted = category_labels.str.startswith(category_prefixes, na=False)
    return to_filter.loc[is_accepted]
    
    
# Apply the category filter to the raw ICD table and display what remains.
icd_codes_filtered = filter_icd_codes(icd_codes_raw)
icd_codes_filtered

In [None]:
hosp = "physionet-data.mimiciv_hosp"
def get_hadm_id_with_matching_icd_code(icd_codes_filtered=icd_codes_filtered, hadm_ids = [24233473]):
    """Query BigQuery for the admissions in ``hadm_ids`` that carry a matching ICD diagnosis.

    A diagnosis matches when its code

    * equals one of the codes in ``icd_codes_filtered['Code']`` (dots removed), or
    * starts with one of those codes padded with ``X`` (only codes shorter than
      five characters are padded), or
    * is an ICD-9 code starting with ``E93``, ``E94``, ``E85`` or ``E86``.

    Parameters
    ----------
    icd_codes_filtered : DataFrame with a ``Code`` column.
        NOTE: the default is bound when this ``def`` is executed, so re-running
        the filtering cell alone does not change the default used here.
    hadm_ids : iterable of integer-like hospital admission ids.

    Returns
    -------
    DataFrame with one row per (hadm_id, subject_id) and the columns
    ``hadm_id``, ``subject_id``, ``num_icd_codes``, ``icd_code``,
    ``icd_versions`` and ``icd_titles``.

    Raises
    ------
    ValueError
        If ``hadm_ids`` is empty (the query would contain an invalid ``IN ()``).
    """
    # int() normalises numpy / float-typed ids and guarantees that only digits
    # are interpolated into the SQL text.
    hadm_in_values = ','.join(str(int(hadm_id)) for hadm_id in hadm_ids)
    if not hadm_in_values:
        raise ValueError("hadm_ids must contain at least one admission id")

    # Codes are stored without dots in the diagnoses tables. Missing codes are
    # dropped so they neither crash ``replace`` nor end up as the literal 'nan'.
    # NOTE(review): codes are interpolated into the SQL as-is; they are assumed
    # to come from a trusted reference file.
    codes = [str(code).replace('.', '') for code in icd_codes_filtered['Code'].dropna()]

    matchers = []
    if codes:
        icd_in_values = ', '.join(f"'{code}'" for code in codes)
        matchers.append(f"d_icd.icd_code IN ({icd_in_values})")

    # Short codes are additionally matched as prefixes using the 'X' placeholder.
    matchers.extend(f"d_icd.icd_code LIKE '{code}X%'" for code in codes if len(code) < 5)

    # ICD 9 therapeutic use and poisoning prefixes, maps to B and A categories.
    # These must be limited to ICD-9 rows: in ICD-10 the same prefixes denote
    # unrelated endocrine/metabolic diagnoses (e.g. E86 is volume depletion).
    # The CAST keeps the comparison valid whether icd_version is stored as an
    # integer or as a string.
    icd9_prefixes = ["E93", "E94", "E85", "E86"]
    icd9_likes = " OR ".join(f"d_icd.icd_code LIKE '{prefix}%'" for prefix in icd9_prefixes)
    matchers.append(f"(CAST(d_icd.icd_version AS STRING) = '9' AND ({icd9_likes}))")

    match_clause = "\n OR ".join(matchers)

    sql = f"""SELECT diag.hadm_id,
        diag.subject_id,

        COUNT(distinct d_icd.icd_code) AS num_icd_codes,

        ARRAY_AGG(d_icd.icd_code) AS icd_code,
        ARRAY_AGG(DISTINCT d_icd.icd_version) AS icd_versions,
        STRING_AGG(d_icd.long_title) AS icd_titles
        FROM `{hosp}.diagnoses_icd` AS diag
        INNER JOIN `{hosp}.d_icd_diagnoses` AS d_icd
            ON diag.icd_code = d_icd.icd_code
            AND diag.icd_version= d_icd.icd_version
            AND ( {match_clause})
        WHERE diag.hadm_id IN ({hadm_in_values})
        GROUP BY diag.hadm_id, diag.subject_id
        ORDER BY diag.subject_id"""
    print(sql)
    mimic_job = BigQueryClient.auth().query(sql)
    return mimic_job.to_dataframe()

# Look up the matching ICD diagnoses for every distinct admission in the cohort.
# The query groups by admission, so the result holds one row per hadm_id.
hadm_with_icd_codes = get_hadm_id_with_matching_icd_code(hadm_ids = cohort_before_icd_filter["hadm_id"].drop_duplicates())
# The row count is a number of admissions, not a number of codes.
print(f"Found {len(hadm_with_icd_codes)} hospital admissions with ICD codes in categories: {category_prefixes}")
hadm_with_icd_codes

In [None]:
def create_word_cloud_from_column(df, column):
    """Show a word cloud built from the comma-joined values of ``df[column]``.

    The column values are passed to ``str.join``, so they must all be strings.
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    joined_text = ",".join(df[column])
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud.generate(joined_text)

    # Render the cloud as an image with the axes hidden.
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()
        
# Visualise the aggregated ICD titles of the matched admissions as a word cloud.
create_word_cloud_from_column(hadm_with_icd_codes, "icd_titles")

In [None]:
# Left-join the ICD lookup onto the cohort so that every drug interaction is
# kept; interactions whose admission has no matching ICD code get NaN in the
# ICD columns. validate='many_to_one' makes pandas raise if the lookup holds
# more than one row for the same (hadm_id, subject_id).
drug_interactions_joined_with_icd = cohort_before_icd_filter.merge(
    hadm_with_icd_codes,
    how='left',
    on=['hadm_id', 'subject_id'],
    validate='many_to_one',
)

drug_interactions_joined_with_icd

In [None]:
# After the left join, num_icd_codes is NaN exactly for the interactions whose
# admission had no matching ICD code; split the cohort on that.
has_icd_match = drug_interactions_joined_with_icd.num_icd_codes.notna()
cohort_with_icd = drug_interactions_joined_with_icd[has_icd_match]
cohort_without_icd = drug_interactions_joined_with_icd[~has_icd_match]

print(f"Total {len(drug_interactions_joined_with_icd)} drug interactions ")
print(f"Found {len(cohort_with_icd)} drug interactions with icd codes")
print(f"Found {len(cohort_without_icd)} drug interactions without icd codes")

In [None]:
# Persist both halves of the split through the project Serializer.
# NOTE(review): a later cell calls write_cohort again with the truncated
# cohort; if both calls write to the same location, this first write is
# overwritten -- confirm that is intended.
serializer.write_cohort(cohort_with_icd)
serializer.write_cohort_with_no_icd(cohort_without_icd)

In [None]:
# Summarise how many rows each stay_id contributes (non-null count per column),
# with extra upper percentiles to expose the long tail of stays that have many
# interactions.
cohort_with_icd.groupby('stay_id').count().describe(percentiles=[ .25, .5, .75, 0.9, 0.95, 0.975, .99])

In [None]:
# Toy example of the truncation pattern: drop every row whose key occurs more
# often than a threshold.
data = {
    'key': [1, 2, 2, 3, 3, 3, 4, 4, 4, 4],
    'values': list('ABCDEFGHIJ'),
}
df = pd.DataFrame(data)

# Number of occurrences of each key, indexed by the key value.
counts = df['key'].value_counts()

# Map each row's key to its occurrence count and keep only the keys that occur
# at most 3 times (key 4 occurs four times and is removed).
occurrences_per_row = df['key'].map(counts)
filtered_df = df.loc[occurrences_per_row.le(3)]
filtered_df

In [None]:
# Upper bound on the number of interaction rows a stay may have and still be
# kept. The value 10 is hard-coded; the message below labels it as the 50th
# percentile of rows per stay, so re-check it whenever the cohort changes.
MAX_INTERACTIONS_PER_STAY = 10

print(f"Truncating cohort to 50th percentile ({MAX_INTERACTIONS_PER_STAY})")
counts = cohort_with_icd['stay_id'].value_counts()
# Keep only the rows that belong to stays with at most the allowed number of rows.
truncacted_cohort = cohort_with_icd[cohort_with_icd['stay_id'].map(counts) <= MAX_INTERACTIONS_PER_STAY]
serializer.write_cohort(truncacted_cohort)
# Kept as the last expression so the notebook actually renders the summary;
# placed mid-cell its result was silently discarded.
truncacted_cohort.describe()

In [None]:
# Same per-stay summary as for the untruncated cohort, to check the effect of
# the truncation on the distribution of rows per stay_id.
truncacted_cohort.groupby('stay_id').count().describe(percentiles=[ .25, .5, .75, 0.9, 0.95, 0.975, .99])