# diagnoses_icd

In [1]:
import pandas as pd

In [2]:
diagnoses = pd.read_csv('/data/share/AKI/3.0/hosp/diagnoses_icd.csv.gz', compression = 'gzip')

In [None]:
diagnoses.columns

In [None]:
print(diagnoses.head(10))
print('-----------------')
print(diagnoses.shape)

In [5]:
target_value = ['5845', '5846', '5847', '5848', '5849', '66930', '66932', '66934', 'N17', 'N170', 'N171', 'N172', 'N178', 'N179', 'Q904']

In [6]:
diagnoses = diagnoses.drop(['seq_num', 'icd_version'], axis = 1)

In [None]:
# Flag each diagnosis row whose ICD code is one of the AKI target codes
# (vectorised membership test instead of a per-row Python lambda).
diagnoses['target'] = diagnoses['icd_code'].isin(target_value).astype('int64')
# Class balance recorded by the original author:
#   without AKI 0.99
#   with AKI 0.01
# Kept as comments: a bare string as the last expression would be displayed
# by the notebook in place of the value_counts() result.
diagnoses['target'].value_counts()

In [8]:
# Admissions that have at least one AKI-coded diagnosis row.
aki_list = diagnoses.loc[diagnoses['target'] == 1, 'hadm_id'].unique()
# Admissions with no AKI-coded row at all. Selecting on target == 0 alone
# would also return any AKI admission that has other (non-AKI) diagnosis
# rows, so the two lists would overlap.
not_aki_list = diagnoses.loc[~diagnoses['hadm_id'].isin(aki_list), 'hadm_id'].unique()

# Chartevents

In [9]:
import gc
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
from statsmodels.stats.proportion import proportions_ztest

# Presumably the directory holding the ICU tables; not referenced again in
# the visible cells -- TODO confirm it is still needed.
icu_path = '/data/share/AKI/3.0/icu/'
# Per-admission label table read from a user-local CSV.
label_data = pd.read_csv('/home/stateun/aki_hadm.csv')

# Convenience handles on the three label columns.
label_subject = label_data['subject_id']
label_hadm = label_data['hadm_id']
label_aki = label_data['aki']

In [28]:
filtered_chartevents = pd.read_csv('filtered_chartevents.csv', index_col = 0)

In [None]:
filtered_chartevents.columns

In [30]:
filtered_chartevents = filtered_chartevents.drop([ 'charttime', 'storetime', 'value', 'warning'], axis= 1)

In [None]:
# Count the missing (NaN) entries in value_numeric. Series.sum() is vectorised;
# the builtin sum() would iterate the Series element by element in Python.
filtered_chartevents['value_numeric'].isna().sum()

In [None]:
drop_index = filtered_chartevents[filtered_chartevents['value_numeric'].isna()].index
drop_index

In [None]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result must be assigned back for the NaN rows to actually be removed.
filtered_chartevents = filtered_chartevents.drop(drop_index)

In [None]:
filtered_chartevents['value_numeric'].dtypes

In [None]:
plt.boxplot(filtered_chartevents[(filtered_chartevents['itemid'] == 220245)]['value_numeric'])

In [36]:
aki_hadms = filtered_chartevents.loc[filtered_chartevents['hadm_id'].isin(aki_list)]['hadm_id'].unique().tolist()

In [38]:
# Build a per-admission label table: aki is 1 when the admission is one of
# aki_hadms and 0 otherwise.
aki_hadms_set = set(aki_hadms)
hadm_ids = filtered_chartevents['hadm_id'].unique().tolist()
hadm_aki_dict = {
    'hadm_id': hadm_ids,
    'aki': [1 if hadm in aki_hadms_set else 0 for hadm in hadm_ids],
}
hadm_aki_df = pd.DataFrame(hadm_aki_dict)

In [39]:
# Unique (subject, admission) pairs present in the chart events.
chartevents_icd = filtered_chartevents[['subject_id', 'hadm_id']].drop_duplicates()
# Map each admission id (stringified) to the subject it belongs to.
hadm_to_subject = dict(zip(
    map(str, chartevents_icd['hadm_id'].tolist()),
    chartevents_icd['subject_id'].tolist(),
))
# Subjects owning at least one admission found in aki_hadms_set.
aki_subject = [
    subject
    for hadm, subject in hadm_to_subject.items()
    if int(hadm) in aki_hadms_set
]

In [40]:
# Simple code
aki_subject = filtered_chartevents[filtered_chartevents['hadm_id'].isin(aki_list)]['subject_id'].tolist()

In [41]:
aki_subject_set = set(aki_subject)

In [None]:
# For every subject with at least one AKI admission, collect that subject's
# admissions whose hadm_id is <= the smallest / largest AKI hadm_id.
# NOTE(review): this uses the numeric hadm_id as the ordering; confirm that
# hadm_id order matches admission time before relying on "previous".
aki_first_prev_hadm = []
aki_last_prev_hadm = []
aki_subject_for_check = []
# Group by the column name (not a one-element list) so the key is the scalar
# subject_id; pandas >= 2 yields 1-tuples for a length-1 list of keys.
for subject_id, subject_df in tqdm(chartevents_icd.groupby('subject_id')):
    had_aki = aki_hadms_set.intersection(subject_df.hadm_id)
    if not had_aki:
        continue
    first_h = min(had_aki)
    last_h = max(had_aki)
    first_prev_h = subject_df.loc[subject_df.hadm_id <= first_h, 'hadm_id'].tolist()
    last_prev_h = subject_df.loc[subject_df.hadm_id <= last_h, 'hadm_id'].tolist()
    aki_first_prev_hadm.extend(first_prev_h)
    # extend (not append): the result must stay a flat list of hadm_ids, like
    # aki_first_prev_hadm, because it is later passed to Series.isin().
    aki_last_prev_hadm.extend(last_prev_h)
    aki_subject_for_check.append(subject_id)

In [44]:
from statsmodels.stats.proportion import proportions_ztest

In [45]:
def do_everything4(filtered_df, colname, aki_last_prev_hadm, aki_subject_set):
    """Pick the codes in ``colname`` that are both AKI-associated and common.

    Each distinct code is counted once per (subject_id, hadm_id) group. A code
    is kept when (a) a two-sample proportions z-test between the AKI and
    non-AKI groups gives p < 0.05, or the code never occurs outside the AKI
    group, and (b) its admission count is above the mean count over all codes.

    Args:
        filtered_df: DataFrame with 'subject_id', 'hadm_id' and ``colname``.
        colname: Name of the column holding the codes to screen.
        aki_last_prev_hadm: Flat collection of hadm_id values used to size the
            AKI group.
        aki_subject_set: Set of subject_id values; admissions of these subjects
            count as AKI occurrences of a code.

    Returns:
        List of the selected codes (unordered); empty if no code was counted.
    """
    # Remove rows with null 'hadm_id'
    filtered_df = filtered_df.dropna(subset=['hadm_id'])
    hadm_ids = filtered_df['hadm_id']
    # Count distinct admissions, not rows: n_hadms below is a distinct count,
    # so a row count here could exceed it and make the non-AKI size negative.
    n_aki_hadms = hadm_ids[hadm_ids.isin(aki_last_prev_hadm)].nunique()
    n_hadms = hadm_ids.nunique()
    print(f'# unique AKI hadms: {n_aki_hadms}\n# unique hadms: {n_hadms}')

    # Per-code admission counts: overall, and restricted to AKI subjects.
    res_counts = {}
    aki_res_counts = {}

    for (subject_id, _), h_df in filtered_df.groupby(['subject_id', 'hadm_id']):
        is_aki_subject = subject_id in aki_subject_set
        for code in h_df[colname].unique():
            res_counts[code] = res_counts.get(code, 0) + 1
            if is_aki_subject:
                aki_res_counts[code] = aki_res_counts.get(code, 0) + 1

    if not res_counts:
        # Nothing was counted; the mean below would divide by zero.
        print('No codes found; nothing to select.')
        return []

    code_test_pval = {}
    print('Calculating p-values for each code...')

    # NOTE(review): the counts are keyed on subject membership
    # (aki_subject_set) while the group sizes are keyed on admission
    # membership (aki_last_prev_hadm); confirm these describe the same group.
    nobs = [n_aki_hadms, n_hadms - n_aki_hadms]
    for code, aki_count in aki_res_counts.items():
        other_count = res_counts[code] - aki_count
        count = [aki_count, other_count]

        if all(nobs) and all(count):
            _, pval = proportions_ztest(count, nobs)
            code_test_pval[code] = pval
        elif other_count == 0:
            # Code occurs only in the AKI group: treat as maximally significant.
            code_test_pval[code] = 0
        else:
            # Only reachable when one group size is zero; aki_count is always
            # >= 1 here, so the old "zero counts in AKI group" text was wrong.
            print(f'Code {code} skipped: one comparison group has no admissions.')

    aki_code_top = {code for code, pval in code_test_pval.items() if pval < 0.05}
    print(f'There are {len(aki_code_top)} codes with p-value < 0.05.')

    # Filter codes with counts higher than the mean
    mean_count = sum(res_counts.values()) / len(res_counts)
    code_top = {code for code, count in res_counts.items() if count > mean_count}
    print(f'# codes with counts higher than the mean: {len(code_top)}')

    # Final selection
    final_aki_code_top = aki_code_top & code_top
    print(f'Finally, we have {len(final_aki_code_top)} codes to maintain.')

    return list(final_aki_code_top)

def do_everything4_2(df, colname, val_colname, fact, prefix=None):
    """Build a wide table of mean values per admission for the selected codes.

    Rows whose ``colname`` is not in ``fact`` are discarded. ``val_colname`` is
    averaged per (subject_id, hadm_id, code) and the codes are pivoted into
    columns named ``value_{prefix}_{code}``. Missing combinations are filled
    with 0.

    Args:
        df: DataFrame with 'subject_id', 'hadm_id', ``colname``, ``val_colname``.
        colname: Column holding the codes that become output columns.
        val_colname: Numeric column to average.
        fact: Collection of codes to keep.
        prefix: Text inserted into the generated column names. When None (the
            default) it is asked for interactively via ``input()``, as before.

    Returns:
        DataFrame with 'subject_id', 'hadm_id' and one column per kept code.
    """
    # Prepare data
    print('\nCreating new DataFrame...')
    df = df[['subject_id', 'hadm_id', colname, val_colname]]
    df = df[df[colname].isin(fact)]

    # Calculate mean values for each code per hadm_id
    grouped = df.groupby(['subject_id', 'hadm_id', colname])[val_colname].mean().reset_index()
    pivot_df = grouped.pivot_table(
        index=['subject_id', 'hadm_id'],
        columns=colname,
        values=val_colname,
        fill_value=0
    ).reset_index()

    # Only prompt when the caller did not supply a prefix, so the function can
    # also run unattended (scripts, tests, batch jobs).
    if prefix is None:
        prefix = input('Column name prefix: ')

    # Rename the code columns; the two id columns keep their names.
    id_columns = ('subject_id', 'hadm_id')
    pivot_df.columns = [
        col if col in id_columns else f'value_{prefix}_{col}'
        for col in pivot_df.columns
    ]

    return pivot_df


In [None]:
fact = do_everything4(filtered_chartevents, 'itemid', aki_last_prev_hadm = aki_last_prev_hadm, aki_subject_set = aki_subject_set)
del filtered_chartevents; gc.collect()

In [48]:
import pickle

with open('fact_pickle.pkl', 'wb') as f:
    pickle.dump(fact, f)

In [49]:
with open('fact_pickle.pkl', 'rb') as f:
    fact = pickle.load(f)

In [None]:
df = pd.read_csv('chartevents.csv')

In [None]:
final_df = do_everything4_2(df, 'itemid','value_numeric', fact)

In [None]:
final_df = do_everything4_2(df, 'itemid','value_numeric', fact)

In [None]:
final_df.to_csv('final_chartevents.csv')