In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from detectdd.auth_bigquery import BigQueryClient
from detectdd.serializer import Serializer

# Read the cohort through the project's Serializer; the bare name on the last
# line makes the notebook display it.
cohort_before_icd_filter = Serializer().read_cohort()
cohort_before_icd_filter

In [None]:
from detectdd.config import icd_data_file
# Read the ICD code data; the file is decoded as ISO-8859-1 rather than pandas'
# default UTF-8.
icd_codes_raw = pd.read_csv(icd_data_file, encoding='ISO-8859-1')

def filter_icd_codes(to_filter, category_prefixes=('A', 'B', 'C', 'D')):
    """Keep only the ICD rows whose category starts with an accepted prefix.

    The category is read from the third column of ``to_filter`` (selected by
    position, not by name). The data uses the following category labels:

        A.1  Induced by medication
        A.2  Induced by medication or other causes
        B.1  Poisoning by medication
        B.2  Poisoning by or harmful use of medication or other causes
        C    ADE very likely
        D    ADE likely
        E    ADE possible
        U    ADE unlikely
        V    Induced by vaccine (added)

    Parameters
    ----------
    to_filter : pandas.DataFrame
        ICD code table whose third column holds the category label.
    category_prefixes : str or iterable of str, optional
        Category prefixes to accept. The default accepts A through D; pass
        ``('A', 'B', 'C')`` for a stricter filter.

    Returns
    -------
    pandas.DataFrame
        The rows of ``to_filter`` whose category starts with one of the
        prefixes. Rows with a missing category are excluded.
    """
    # ``str.startswith`` only accepts a string or a tuple, so normalise lists
    # and other iterables; a lone string is treated as a single prefix.
    if isinstance(category_prefixes, str):
        category_prefixes = (category_prefixes,)
    else:
        category_prefixes = tuple(category_prefixes)

    # TODO - exclude codes where it is unlikely to be drug drug interaction, i.e. T40
    is_accepted = to_filter.iloc[:, 2].str.startswith(category_prefixes, na=False)
    return to_filter.loc[is_accepted]
    
    
# Apply the category filter to the raw ICD table and display the result.
icd_codes_filtered = filter_icd_codes(icd_codes_raw)
icd_codes_filtered

In [None]:
hosp = "physionet-data.mimiciv_hosp"


def _build_icd_code_predicate(codes, extra_prefixes=()):
    """Build the SQL condition that matches ``d_icd.icd_code`` against ``codes``.

    Dots are removed from every code. All codes go into an exact ``IN`` list;
    codes shorter than five characters additionally get an ``'X'`` appended and
    are matched as ``LIKE`` prefixes, as are the ``extra_prefixes``.
    """
    normalized = [str(code).replace('.', '') for code in codes]

    conditions = []
    if normalized:
        in_values = ', '.join(f"'{code}'" for code in normalized)
        conditions.append(f"d_icd.icd_code IN ({in_values})")

    like_prefixes = [code + 'X' for code in normalized if len(code) < 5]
    like_prefixes.extend(extra_prefixes)
    conditions.extend(f"d_icd.icd_code LIKE '{prefix}%'" for prefix in like_prefixes)

    # With nothing to match, the predicate must still be valid SQL.
    return "\n OR ".join(conditions) if conditions else "FALSE"


def get_hadm_id_with_matching_icd_code(hadm_ids=(24233473,)):
    """Fetch the matching ICD diagnoses for the given hospital admissions.

    The accepted codes come from the module-level ``icd_codes_filtered`` frame
    (column ``Code``) plus a fixed set of ICD-9 prefixes; the tables are read
    from the dataset named by the module-level ``hosp``.

    Parameters
    ----------
    hadm_ids : iterable
        Admission ids to look up (a list, tuple or pandas Series). Duplicates
        are sent to BigQuery only once.

    Returns
    -------
    pandas.DataFrame
        One row per ``(hadm_id, subject_id)`` having at least one matching
        diagnosis, with the columns ``hadm_id``, ``subject_id``,
        ``num_icd_codes``, ``icd_code``, ``icd_versions`` and ``icd_titles``.
        Empty ``hadm_ids`` yields an empty frame without querying BigQuery.
    """
    # Order-preserving de-duplication keeps the generated query text small when
    # the caller passes a column with repeated admissions.
    hadm_id_values = list(dict.fromkeys(str(hadm_id) for hadm_id in hadm_ids))
    if not hadm_id_values:
        # "IN ()" is not valid SQL, and no admissions means no matches.
        return pd.DataFrame(columns=[
            "hadm_id", "subject_id", "num_icd_codes",
            "icd_code", "icd_versions", "icd_titles",
        ])

    # Drop missing codes once so the IN list and the LIKE prefixes are built
    # from the same values (a missing code has no ``replace`` method).
    codes = icd_codes_filtered['Code'].dropna()
    icd_predicate = _build_icd_code_predicate(
        codes,
        # ICD 9 therapeutic use and poisoning prefixes, map to B and A categories
        extra_prefixes=("E93", "E94", "E85", "E86"),
    )

    sql = f"""SELECT diag.hadm_id,
        diag.subject_id,

        COUNT(distinct d_icd.icd_code) AS num_icd_codes,

        ARRAY_AGG(d_icd.icd_code) AS icd_code,
        ARRAY_AGG(DISTINCT d_icd.icd_version) AS icd_versions,
        STRING_AGG(d_icd.long_title) AS icd_titles
        FROM `{hosp}.diagnoses_icd` AS diag
        INNER JOIN `{hosp}.d_icd_diagnoses` AS d_icd
            ON diag.icd_code = d_icd.icd_code
            AND diag.icd_version= d_icd.icd_version
            AND ( {icd_predicate})
        WHERE diag.hadm_id IN ({','.join(hadm_id_values)})
        GROUP BY diag.hadm_id, diag.subject_id
        ORDER BY diag.subject_id"""
    print(sql)
    mimic_job = BigQueryClient.auth().query(sql)
    return mimic_job.to_dataframe()

# Look up the matching ICD diagnoses for every admission in the cohort and
# display the result.
hadm_with_icd_codes = get_hadm_id_with_matching_icd_code(hadm_ids = cohort_before_icd_filter["hadm_id"])
hadm_with_icd_codes

In [None]:
def create_word_cloud_from_column(df, column):
    """Show a word cloud built from the text in one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text.
    column : str
        Name of the column whose values are joined with commas and fed to the
        word cloud. Missing values are skipped and non-string values are
        converted with ``str``.

    Returns
    -------
    None
        The figure is shown with matplotlib.
    """
    # ``str.join`` raises a TypeError on None/NaN or other non-string cells,
    # so drop the missing ones and stringify the rest first.
    text = ",".join(df[column].dropna().astype(str))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Display the word cloud using matplotlib
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
        
# Visualise the aggregated ICD titles of the matched admissions.
create_word_cloud_from_column(hadm_with_icd_codes, "icd_titles")

In [None]:
# Ad-hoc inspection: fetch every diagnosis row (no admission filter) whose ICD
# code matches the filtered code list, using the same IN / LIKE matching scheme
# as get_hadm_id_with_matching_icd_code.

# Drop missing codes once so the IN list and the LIKE prefixes are built from
# the same values (a missing code has no ``replace`` method).
codes = icd_codes_filtered['Code'].dropna()
normalized_codes = [str(code).replace('.', '') for code in codes]
icd_in_values = ', '.join(f"'{code}'" for code in normalized_codes)

# Codes shorter than five characters get an 'X' appended and are matched as
# LIKE prefixes.
cleaned_codes = [code + 'X' for code in normalized_codes if len(code) < 5]
# NOTE(review): get_hadm_id_with_matching_icd_code also matches the "E85" and
# "E86" prefixes - confirm whether this cell is meant to differ.
cleaned_codes += ["E93", "E94"]

likes = [f"d_icd.icd_code LIKE '{code}%'" for code in cleaned_codes]
like_clause = "\n OR ".join(likes)
print(like_clause)

sql = f"""SELECT *, 
        d_icd.icd_code AS icd_code
        FROM `{hosp}.diagnoses_icd` AS diag
        INNER JOIN `{hosp}.d_icd_diagnoses` AS d_icd
        ON diag.icd_code = d_icd.icd_code
            AND diag.icd_version= d_icd.icd_version
            AND ( d_icd.icd_code IN ({icd_in_values}) OR
             {like_clause})
                """


print(sql)
BigQueryClient.auth().query(sql).to_dataframe()
