In [395]:
from pathlib import Path, PurePath
import glob
import os
import numpy as np
import csv
import pandas as pd
import time
import matplotlib.pyplot as plt

In [396]:
file_path = PurePath(os.getcwd())
dataset_path = file_path.parent.parent.parent.joinpath('eicu')

In [397]:
column_names = [
    'patientunitstayid', 'labresultoffset', 'labname', 'labresultrevisedoffset']

In [398]:
lab_file = pd.read_csv(dataset_path.joinpath('lab.csv'), usecols=column_names)

d_lab_file = pd.read_csv(dataset_path.joinpath('lab_delirium.csv'), usecols=column_names)



In [399]:
lab_file = lab_file[lab_file['labresultoffset'] >0]
d_lab_file = d_lab_file[d_lab_file['labresultoffset'] > 0]

In [400]:
unique_ids = lab_file['patientunitstayid'].unique()
d_unique_ids = d_lab_file['patientunitstayid'].unique()
unique_labs = lab_file['labname'].unique()
lab_index = {lab: idx for idx, lab in enumerate(unique_labs)}


In [401]:
mapped_lab_file = lab_file.replace(lab_index)
d_mapped_lab_file = d_lab_file.replace(lab_index)

#saves me time in the long run by reducing computing times

In [402]:
mapped_lab_arr = mapped_lab_file.to_numpy(dtype=int)
d_mapped_lab_arr = d_mapped_lab_file.to_numpy(dtype=int)

In [403]:
def get_stats(lab, arr):
    if np.size(arr):
        q3, q1 = np.percentile(arr, [75, 25])
        IQR = q3 - q1
    else:
        return [unique_labs[lab], np.NaN, np.NaN, np.NaN, np.NaN]
    # mean = arr.mean()
    # median = arr.median()
    # var = arr.var()
    # minima = arr.min()
    # maxima = arr.max()
    # return mean, var, median, IQR, minima, maxima
    return [unique_labs[lab], np.mean(arr), np.var(arr), np.median(arr), IQR, np.min(arr), np.max(arr)]
    

In [428]:
freq_stats = []
count_stats = []
for lab in range(158):
    df_lab = mapped_lab_file[mapped_lab_file['labname'] == lab]
    if df_lab.empty:
        freq_stats.append(get_stats(lab, []))
        continue
    
    df_lab_sorted = df_lab.sort_values(by=['patientunitstayid','labresultoffset']).reset_index()
    lab_counts = df_lab_sorted['patientunitstayid'].value_counts()

    count_stats.append([unique_labs[lab], lab_counts.mean(), lab_counts.var(), lab_counts.median(), lab_counts.sum(), len(lab_counts)])
    
    lab_counts = lab_counts.to_dict()
    arr_lab = df_lab_sorted.to_numpy(dtype=int)
    
    index_map = [0]
    length_map = []
    for i, id in enumerate(df_lab['patientunitstayid'].unique()):
        index_map.append(index_map[i] + lab_counts[id])
        length_map.append(lab_counts[id])
    index_map = index_map[:-1]
    index_m_arr = np.asarray(index_map)
    length_m_arr = np.asarray(length_map)

    all_times = arr_lab[:, -3]
    time_diff = all_times[1:] - all_times[:-1]
    afil1 = index_m_arr - 1
    afil2 = index_m_arr[length_m_arr == 1]
    anti_filter = np.append(afil1, afil2)
    true_filter = ~np.isin(np.arange(all_times.size - 1), anti_filter)
    filtered_time_diff = time_diff[true_filter]
    filtered_time_diff = filtered_time_diff[filtered_time_diff != 0]


    freq_stats.append(get_stats(lab, filtered_time_diff))
freq_stats_df = pd.DataFrame(freq_stats, columns = ["lab", "mean", "var", "median", "IQR", "min", "max"])

    

In [430]:
count_stats_df = pd.DataFrame(count_stats, columns = ["lab", "mean", "var", "median", "total labs", "patients w/ lab"])
print(count_stats_df)

                      lab      mean        var  median  total labs  \
0              fibrinogen  2.275553  11.554940     1.0       30456   
1                PT - INR  3.551265  22.282565     2.0      296694   
2               magnesium  4.592734  36.319997     3.0      576062   
3                      PT  3.526298  21.938640     2.0      287150   
4                      pH  4.717114  60.218341     2.0      318825   
..                    ...       ...        ...     ...         ...   
153            Vent Other  2.175074   4.513137     1.0        1466   
154  HSV 1&2 IgG AB titer  1.000000   0.000000     1.0           7   
155             RPR titer  1.000000        NaN     1.0           1   
156            HIV 1&2 AB  1.000000   0.000000     1.0           6   
157        HSV 1&2 IgG AB  1.096774   0.090323     1.0          34   

     patients w/ lab  
0              13384  
1              83546  
2             125429  
3              81431  
4              67589  
..               ... 

In [405]:
print(freq_stats_df)
# freq_stats_df.to_csv(file_path.joinpath('Lab_Frequency.csv'))

                      lab         mean           var  median      IQR    min  \
0              fibrinogen  1353.188729  1.193593e+07   515.0  1048.00    1.0   
1                PT - INR  1724.244717  8.861066e+06  1423.0   543.00    1.0   
2               magnesium  1477.887199  3.226104e+06  1405.0   770.00    1.0   
3                      PT  1722.247553  8.799621e+06  1423.0   533.00    1.0   
4                      pH   973.109490  4.292318e+06   445.0  1174.00    1.0   
..                    ...          ...           ...     ...      ...    ...   
153            Vent Other  1816.184010  1.239065e+07   651.0  1416.75    2.0   
154  HSV 1&2 IgG AB titer          NaN           NaN     NaN      NaN    NaN   
155             RPR titer          NaN           NaN     NaN      NaN    NaN   
156            HIV 1&2 AB          NaN           NaN     NaN      NaN    NaN   
157        HSV 1&2 IgG AB   830.000000  0.000000e+00   830.0     0.00  830.0   

          max  
0    149635.0  
1    47

In [408]:
d_freq_stats = []
for lab in range(158):
    df_lab = d_mapped_lab_file[d_mapped_lab_file['labname'] == lab]
    if df_lab.empty:
        d_freq_stats.append(get_stats(lab, []))
    # Filters out one type of lab only
    df_lab_sorted = df_lab.sort_values(by=['patientunitstayid', 'labresultoffset']).reset_index()
    # Sorts by patient, then time to make subtraction later on easier

    # Creates
    lab_counts = df_lab_sorted['patientunitstayid'].value_counts().to_dict()
    arr_lab = df_lab_sorted.to_numpy(dtype=int)

    index_map = [0]
    length_map = []
    for i, id in enumerate(df_lab['patientunitstayid'].unique()):
        index_map.append(index_map[i] + lab_counts[id])
        length_map.append(lab_counts[id])
    index_map = index_map[:-1]
    index_m_arr = np.asarray(index_map)
    length_m_arr = np.asarray(length_map)

    all_times = arr_lab[:, -3]
    if all_times.size != 0:
        time_diff = all_times[1:] - all_times[:-1]
        afil1 = index_m_arr - 1
        afil2 = index_m_arr[length_m_arr == 1]
        anti_filter = np.append(afil1, afil2)
        true_filter = ~np.isin(np.arange(all_times.size - 1), anti_filter)
        filtered_time_diff = time_diff[true_filter]
        filtered_time_diff = filtered_time_diff[filtered_time_diff != 0]
    d_freq_stats.append(get_stats(lab, filtered_time_diff))
d_freq_stats_df = pd.DataFrame(d_freq_stats, columns = ["lab", "mean", "var", "median", "IQR", "min", "max"])

In [409]:
print(d_freq_stats_df)
# d_freq_stats_df.to_csv(file_path.joinpath('Lab_Frequency_delirium.csv'))

                      lab         mean           var  median      IQR  min  \
0              fibrinogen  1569.987567  1.241292e+07   615.0  1089.00  1.0   
1                PT - INR  1878.614975  1.473883e+07  1425.0   522.75  1.0   
2               magnesium  1609.303855  3.662724e+06  1418.0   702.50  1.0   
3                      PT  1881.034873  1.477390e+07  1425.0   523.00  1.0   
4                      pH  1116.792751  5.275647e+06   542.0  1208.00  1.0   
..                    ...          ...           ...     ...      ...  ...   
155  HSV 1&2 IgG AB titer  1998.791667  1.582595e+07   789.5  1697.50  4.0   
156             RPR titer          NaN           NaN     NaN      NaN  NaN   
157            HIV 1&2 AB          NaN           NaN     NaN      NaN  NaN   
158            HIV 1&2 AB          NaN           NaN     NaN      NaN  NaN   
159        HSV 1&2 IgG AB          NaN           NaN     NaN      NaN  NaN   

          max  
0     66170.0  
1    477685.0  
2     87915.0  