In [169]:
from pathlib import Path, PurePath
import glob
import os
import numpy as np
import csv
import pandas as pd
import time
import matplotlib.pyplot as plt

In [170]:
file_path = PurePath(os.getcwd())
dataset_path = file_path.parent.parent.parent.joinpath('eicu')

In [171]:
column_names = [
    'patientunitstayid', 'labresultoffset', 'labname', 'labresultrevisedoffset']

In [172]:
lab_file = pd.read_csv(dataset_path.joinpath('lab.csv'), usecols=column_names)

d_lab_file = pd.read_csv(dataset_path.joinpath('lab_delirium.csv'), usecols=column_names)


In [173]:
unique_ids = lab_file['patientunitstayid'].unique()
d_unique_ids = d_lab_file['patientunitstayid'].unique()
unique_labs = lab_file['labname'].unique()


In [174]:
lab_index = {lab: idx for idx, lab in enumerate(unique_labs)}
# print(lab_index)

In [175]:
lab_value_counts = d_lab_file['labname'].value_counts()
print(lab_value_counts)

bedside glucose              496367
potassium                    251013
glucose                      244919
sodium                       242468
Hgb                          233979
                              ...  
Site                              3
Legionella pneumophila Ab         2
Procainamide                      1
NAPA                              1
RPR titer                         1
Name: labname, Length: 156, dtype: int64


In [176]:
mapped_lab_file = lab_file.replace(lab_index)
d_mapped_lab_file = d_lab_file.replace(lab_index)
#saves me time in the long run by reducing computing times

In [179]:
mapped_lab_arr = mapped_lab_file.to_numpy(dtype=int)
d_mapped_lab_arr = d_mapped_lab_file.to_numpy(dtype=int)

In [241]:
def get_stats(lab, arr):
    if np.size(arr):
        q3, q1 = np.percentile(arr, [75, 25])
        IQR = q3 - q1
    else:
        return [np.NaN, np.NaN, np.NaN, np.NaN, np.NaN]
    # mean = arr.mean()
    # median = arr.median()
    # var = arr.var()
    # minima = arr.min()
    # maxima = arr.max()
    # return mean, var, median, IQR, minima, maxima
    return [unique_labs[lab], np.mean(arr), np.var(arr), np.median(arr), IQR, np.min(arr), np.max(arr)]
    

In [247]:
rev_freq_stats = []
for lab in range(158):
    df_lab = mapped_lab_file[mapped_lab_file['labname'] == lab]
    if df_lab.empty:
        continue
    # Filters out one type of lab only
    df_lab_sorted = df_lab.sort_values(by=['patientunitstayid', 'labresultoffset','labresultrevisedoffset']).reset_index()
    # Sorts by patient, then time to make subtraction later on easier

    # Creates
    lab_counts = df_lab_sorted['patientunitstayid'].value_counts().to_dict()
    # print(lab_counts)
    arr_lab = df_lab_sorted.to_numpy(dtype=int)
    # print(arr_lab)
    index_map = [0]
    length_map = []
    for i, id in enumerate(df_lab['patientunitstayid'].unique()):
        index_map.append(index_map[i] + lab_counts[id])
        length_map.append(lab_counts[id])
    index_map = index_map[:-1]
    index_m_arr = np.asarray(index_map)
    length_m_arr = np.asarray(length_map)

    # print(index_map)
    # length_map = [lab_counts[id] for i, id in enumerate(df_lab['patientunitstayid'].unique())]
    # print(length_map)
    all_times = arr_lab[:, 4]
    if all_times.size != 0:
        # print(all_times)

        time_diff = all_times[1:] - all_times[:-1]
        afil1 = index_m_arr - 1
        afil2 = index_m_arr[length_m_arr == 1]
        anti_filter = np.append(afil1, afil2)
        filtering = ~np.isin(np.arange(all_times.size - 1), anti_filter)
    rev_freq_stats.append(get_stats(lab, time_diff[filtering]))
    print(lab, 'finished')
rev_freq_stats_df = pd.DataFrame(rev_freq_stats, columns = ["lab", "mean", "var", "median", "IQR", "min", "max"])

    # print(arr_lab[:,3])
    # print(arr_lab)
    

0 finished
1 finished
2 finished
3 finished
4 finished
5 finished
6 finished
7 finished
8 finished
9 finished
10 finished
11 finished
12 finished
13 finished
14 finished
15 finished
16 finished
17 finished
18 finished
19 finished
20 finished
21 finished
22 finished
23 finished
24 finished
25 finished
26 finished
27 finished
28 finished
29 finished
30 finished
31 finished
32 finished
33 finished
34 finished
35 finished
36 finished
37 finished
38 finished
39 finished
40 finished
41 finished
42 finished
43 finished
44 finished
45 finished
46 finished
47 finished
48 finished
49 finished
50 finished
51 finished
52 finished
53 finished
54 finished
55 finished
56 finished
57 finished
58 finished
59 finished
60 finished
61 finished
62 finished
63 finished
64 finished
65 finished
66 finished
67 finished
68 finished
69 finished
70 finished
71 finished
72 finished
73 finished
74 finished
75 finished
76 finished
77 finished
78 finished
79 finished
80 finished
81 finished
82 finished
83 finished
84

In [None]:
rev_freq_stats_df.to_csv(file_path.joinpath('Lab_Frequency.csv'))

In [245]:
rev_freq_stats = []
for lab in range(158):
    df_lab = mapped_lab_file[mapped_lab_file['labname'] == lab]
    if df_lab.empty:
        continue
    # Filters out one type of lab only
    df_lab_sorted = df_lab.sort_values(by=['patientunitstayid', 'labresultoffset','labresultrevisedoffset']).reset_index()
    # Sorts by patient, then time to make subtraction later on easier

    # Creates
    lab_counts = df_lab_sorted['patientunitstayid'].value_counts().to_dict()
    # print(lab_counts)
    arr_lab = df_lab_sorted.to_numpy(dtype=int)
    # print(arr_lab)
    index_map = [0]
    length_map = []
    for i, id in enumerate(df_lab['patientunitstayid'].unique()):
        index_map.append(index_map[i] + lab_counts[id])
        length_map.append(lab_counts[id])
    index_map = index_map[:-1]
    index_m_arr = np.asarray(index_map)
    length_m_arr = np.asarray(length_map)

    # print(index_map)
    # length_map = [lab_counts[id] for i, id in enumerate(df_lab['patientunitstayid'].unique())]
    # print(length_map)
    all_times = arr_lab[:, 4]
    if all_times.size != 0:
        # print(all_times)

        time_diff = all_times[1:] - all_times[:-1]
        afil1 = index_m_arr - 1
        afil2 = index_m_arr[length_m_arr == 1]
        anti_filter = np.append(afil1, afil2)
        filtering = ~np.isin(np.arange(all_times.size - 1), anti_filter)
    rev_freq_stats.append(get_stats(lab, time_diff[filtering]))
rev_freq_stats_df = pd.DataFrame(rev_freq_stats, columns = ["lab", "mean", "var", "median", "IQR", "min", "max"])

    # print(arr_lab[:,3])
    # print(arr_lab)
    

[ 0  1  4  5 13 15 16 17 18 19]
[1 3 1 8 2 1 1 1 1 1]
[ 0  5  6  9 10 13 30 32 33 37]
[ 5  1  3  1  3 17  2  1  4  1]
[ 0  1  4  5  6  9 13 14 19 21]
[1 3 1 1 3 4 1 5 2 1]
[ 0  5  6  9 10 13 30 32 33 37]
[ 5  1  3  1  3 17  2  1  4  1]
[ 0  4  5  6  7 11 12 35 39 42]
[ 4  1  1  1  4  1 23  4  3  1]
[ 0  1  2  3  4  9 10 11 12 13]
[1 1 1 1 5 1 1 1 1 1]
[ 0  3  4  5 15 18 19 23 24 28]
[ 3  1  1 10  3  1  4  1  4  2]
[ 0  3  4  7 21 23 24 28 29 34]
[ 3  1  3 14  2  1  4  1  5  4]
[ 0  1  2  3  4  5 10 11 12 13]
[1 1 1 1 1 5 1 1 1 1]
[ 0  4  5  6  7 11 12 35 39 42]
[ 4  1  1  1  4  1 23  4  3  1]
[ 0  1  5  6  7 11 12 15 17 18]
[1 4 1 1 4 1 3 2 1 1]
[ 0  2  3  6  7  8  9 10 11 12]
[2 1 3 1 1 1 1 1 1 1]
[ 0  2  6  7  8 11 15 18 19 20]
[2 4 1 1 3 4 3 1 1 1]
[ 0  3  4  6 10 26 30 32 33 34]
[ 3  1  2  4 16  4  2  1  1  1]
[ 0  3  4 12 15 16 20 21 25 27]
[3 1 8 3 1 4 1 4 2 1]
[ 0  4  5  9 31 32 33 36 37 46]
[ 4  1  4 22  1  1  3  1  9 10]
[ 0  1  2  3  5 12 13 14 15 16]
[1 1 1 2 7 1 1 1 1 2]
[ 

KeyboardInterrupt: 

THIS IS FAR FROM PERFECT:

- Min should not be negative
- Mean is not exactly correct but close enough

BUT IT IS 60x faster so idc

In [248]:
print(d_rev_freq_stats_df)
d_rev_freq_stats_df.to_csv(file_path.joinpath('Lab_Frequency_delirium.csv'))

                 lab         mean           var  median     IQR      min  \
0         fibrinogen  1368.310847  1.249498e+07   477.0  1064.0  -3052.0   
1           PT - INR  1810.384789  9.736522e+06  1410.0   719.0 -28602.0   
2          magnesium  1508.227676  3.638306e+06  1393.0   792.0 -31145.0   
3                 PT  1810.023781  9.678817e+06  1411.0   712.0 -28602.0   
4                 pH  1014.280210  6.819083e+09   335.0  1000.0 -51652.0   
..               ...          ...           ...     ...     ...      ...   
153  Amikacin - peak  3136.300000  2.252629e+06  2895.5  2219.5    286.0   
154       Vent Other  2001.214932  2.220917e+07   665.0  1518.0  -2273.0   
155              NaN          NaN           NaN     NaN     NaN      NaN   
156              NaN          NaN           NaN     NaN     NaN      NaN   
157   HSV 1&2 IgG AB   276.666667  1.530889e+05     0.0   415.0      0.0   

            max  
0      149784.0  
1      477665.0  
2      211640.0  
3      477665.0

In [None]:
# d_lab_pat_idx_dict = {}
# for lab in range(158):
#     d_lab_df = d_mapped_lab_file[d_mapped_lab_file['labname'] == lab]
#     d_lab_pat_idx_dict.update({lab: (d_lab_df['patientunitstayid'].unique())})
# # This saves so much time

In [25]:
# #For every lab
# def get_stats(name, arr):
#     if arr:
#         q3, q1 = np.percentile(arr, [75, 25])
#         IQR = q3 - q1
#     else:
#         return [name, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 1]
#     # mean = arr.mean()
#     # median = arr.median()
#     # var = arr.var()
#     # minima = arr.min()
#     # maxima = arr.max()
#     # return mean, var, median, IQR, minima, maxima
#     return [name, np.mean(arr), np.var(arr), np.median(arr), IQR, np.min(arr), np.max(arr), len(arr) + 1]

# for lab in range(158):

#     # Create array of only that lab
#     per_lab_arr = d_mapped_lab_arr[d_mapped_lab_arr[:,2] == lab,:]
#     freq_arr = []
#     freq_stats = np.ndarray((158, 8))
#     for id in d_lab_pat_idx_dict[lab]:
#         filtered_arr = per_lab_arr[per_lab_arr[:,0] == id,:]
#         times = np.sort(filtered_arr[:,3])
#         freq = times[1:] - times[:-1]
#         freq = freq[freq != 0].tolist()
#         freq_arr.extend(freq)
#     freq_stats[lab, :] = get_stats(lab, freq_arr)
#     print(get_stats(lab, freq_arr))
#     print(lab, 'finished')



[0, 1638.9254285714285, 13645824.149581961, 571.5, 1113.25, 1, 66118, 3501]
0 finished
[1, 1972.715886506829, 14685650.939703222, 1413.0, 706.0, 1, 477665, 47299]
1 finished
[2, 1636.0709426627793, 4322151.854733902, 1409.0, 743.0, 1, 90605, 82321]
2 finished
[3, 1974.1854325699746, 14710107.789215269, 1413.5, 708.0, 1, 477665, 47161]
3 finished
[4, 1992.917956945118, 41807029191.754974, 449.0, 1206.0, 1, 47319220, 53607]
4 finished
[5, 5545.5, 77343074.4826389, 2791.0, 5924.75, 1, 130819, 577]
5 finished
[6, 1435.134252510799, 1404353.920941453, 1417.0, 531.0, 1, 96552, 149555]
6 finished
[7, 1299.0396308556733, 984958.6234029284, 1393.0, 754.0, 1, 46587, 193638]
7 finished
[8, 5544.255033557047, 51862818.975226335, 3041.0, 5843.75, 1, 83304, 597]
8 finished
[9, 1994.0727035344587, 41800010846.34041, 450.0, 1206.5, 1, 47319220, 53616]
9 finished
[10, 1247.6298725701206, 38387195.64511084, 375.0, 845.0, 1, 525001, 29272]
10 finished
[11, 1254.2338345864662, 8321806.541561987, 474.0, 68

finishes 5 seconds faster, significantly more complexity