In [1]:
import pandas as pd
import json
import os
import numpy as np
import random
from collections import defaultdict, Counter
import seaborn as sns

In [33]:
# Raw ED events export (CSV) that the cleaning step started from.
RAW_DATA_PATH = (
    '/work/InternalMedicine/s223850/raw_data/ED Events - 11.21.23.csv'
)

# Root directory holding one sub-folder per cleaning run.
CLEAN_DATA_DIR = (
    '/work/InternalMedicine/s223850/ED-StaticDynamic/clean_target'
)

# Sub-folder of CLEAN_DATA_DIR whose artefacts are loaded in this notebook.
clean_date_folder = "12_1_23"

In [3]:
# Read the raw event export, then convert both timestamp columns to datetimes
# so they can be sorted and compared chronologically.
df_raw = pd.read_csv(RAW_DATA_PATH)
for _ts_col in ('Calculated_DateTime', 'Arrived_Time'):
    df_raw[_ts_col] = pd.to_datetime(df_raw[_ts_col])

In [34]:
# Read the cleaned event table of the selected cleaning run and convert its
# timestamp columns to datetimes.
clean_csv_path = os.path.join(CLEAN_DATA_DIR, clean_date_folder, 'df_clean.csv')
df_clean = pd.read_csv(clean_csv_path)
for _ts_col in ('Calculated_DateTime', 'Arrived_Time'):
    df_clean[_ts_col] = pd.to_datetime(df_clean[_ts_col])

In [35]:
# Load error_pat.json from the cleaning run. Its entries are compared against
# PAT_ENC_CSN_ID further down, so they are treated as encounter IDs.
# NOTE(review): the criterion that puts an encounter in this file is not
# visible here -- confirm against the cleaning code.
error_pat_path = os.path.join(CLEAN_DATA_DIR, clean_date_folder, 'error_pat.json')
with open(error_pat_path, 'r') as fh:
    errors_pat_list = json.load(fh)

# Number of entries loaded.
len(errors_pat_list)

10209

In [36]:
# Load orderbflag.json from the same cleaning run.
# NOTE(review): the meaning of "orderbflag" is not visible here -- confirm
# against the cleaning code.
orderbflag_path = os.path.join(CLEAN_DATA_DIR, clean_date_folder, 'orderbflag.json')
with open(orderbflag_path, 'r') as fh:
    orderbflag_list = json.load(fh)

# Number of entries loaded.
len(orderbflag_list)

1135

In [37]:
# Pick one entry of errors_pat_list at random for manual inspection.
# NOTE(review): no random seed is set in the visible cells, so a fresh run
# selects a different entry and the displayed sample will not be reproduced.
random_error_pat_id = random.choice(errors_pat_list)

In [38]:
# Chronological event timeline of the randomly chosen encounter, restricted to
# the identifying, timing and event-description columns.
# NOTE(review): the displayed table includes PAT_MRN_ID -- clear the output
# before sharing the notebook.
timeline_cols = ['PAT_MRN_ID', 'PAT_ENC_CSN_ID', 'Arrived_Time', 'Calculated_DateTime', 'Type', 'EVENT_NAME']
encounter_rows = df_raw.loc[df_raw['PAT_ENC_CSN_ID'] == random_error_pat_id]
df_sample = encounter_rows.sort_values(by='Calculated_DateTime')[timeline_cols]
df_sample

Unnamed: 0,PAT_MRN_ID,PAT_ENC_CSN_ID,Arrived_Time,Calculated_DateTime,Type,EVENT_NAME
14131605,97335200,686478795,2023-08-03 23:42:00,2023-08-03 23:42:16,Event,Emergency encounter created
14131606,97335200,686478795,2023-08-03 23:42:00,2023-08-03 23:42:29,Event,Admission Med List Generated
14131607,97335200,686478795,2023-08-03 23:42:00,2023-08-03 23:42:29,Event,Admission FAM List Generated
14131608,97335200,686478795,2023-08-03 23:42:00,2023-08-03 23:42:29,Event,Patient arrived in ED
14131609,97335200,686478795,2023-08-03 23:42:00,2023-08-03 23:42:29,Event,Prev Admission Rec Generated
14131610,97335200,686478795,2023-08-03 23:42:00,2023-08-03 23:44:33,Event,ED SECONDARY TRIAGE
14131611,97335200,686478795,2023-08-03 23:42:00,2023-08-03 23:49:17,Event,Triage Completed
14131612,97335200,686478795,2023-08-03 23:42:00,2023-08-04 00:01:00,Order - Imaging,XR KNEE RIGHT COMPLETE W/PATELLA
14131613,97335200,686478795,2023-08-03 23:42:00,2023-08-04 00:02:28,Event,RME Note Filed
14131614,97335200,686478795,2023-08-03 23:42:00,2023-08-04 00:02:28,Event,ED Census


In [9]:
# How often each event name occurs in the encounter currently held in
# df_sample (most frequent first).
df_sample['EVENT_NAME'].value_counts()

US DOPPLER VENOUS DVT LOWER EXTREMITY BILATERAL           2
Patient Moving                                            2
Emergency encounter created                               1
Patient arrived in ED                                     1
Admission FAM List Generated                              1
Prev Admission Rec Generated                              1
Admission Med List Generated                              1
Triage Started                                            1
ED SECONDARY TRIAGE                                       1
Triage Completed                                          1
MATH (PE DC) Research Study Page                          1
Patient roomed in ED                                      1
Patient transferred                                       1
ED Ret to WR                                              1
ED Course - Free Text                                     1
LWBS Documented -Patient is not responding when called    1
Name: EVENT_NAME, dtype: int64

In [10]:
# Hand-picked IDs noted during manual inspection.
# NOTE(review): presumably PAT_ENC_CSN_ID values of encounters whose event
# stream looks incomplete -- confirm.
incomplete = [
    685745685,
    676224350,
    651612649,
    651577643,
    651643314,
]

# NOTE(review): presumably an encounter carrying pregnancy-related
# attributes -- confirm.
pregnant_attrs = [
    663583112,
]

In [11]:
# Collect the entries of errors_pat_list whose encounter has at most
# SHORT_ENCOUNTER_MAX_EVENTS rows in df_raw.
#
# The rows per encounter are counted once up front. Filtering df_raw with a
# boolean mask for every single ID re-scans the whole frame each time, and it
# also overwrote the df_sample that an earlier cell displays.
SHORT_ENCOUNTER_MAX_EVENTS = 10
events_per_encounter = df_raw['PAT_ENC_CSN_ID'].value_counts().to_dict()

# IDs that never occur in df_raw count as 0 events and are therefore kept,
# which matches what an empty filtered frame produced before.
small_df = [
    enc_id
    for enc_id in errors_pat_list
    if events_per_encounter.get(enc_id, 0) <= SHORT_ENCOUNTER_MAX_EVENTS
]

In [12]:
# Share of the errors_pat_list entries that ended up in small_df.
len(small_df)/len(errors_pat_list)

0.5520370918417677

In [13]:
# Peek at the raw events of one short encounter (entry 9 of small_df),
# showing only the encounter ID and the event name.
df_raw.loc[df_raw['PAT_ENC_CSN_ID'] == small_df[9], ['PAT_ENC_CSN_ID', 'EVENT_NAME']]

Unnamed: 0,PAT_ENC_CSN_ID,EVENT_NAME
30280,651643314,Emergency encounter created


In [14]:
def _ends_admitted(status):
    """Return True when the last Admitted_YN value of one encounter is 'Admitted'."""
    return status.iloc[-1] == 'Admitted'


# Number of encounters whose final row is labelled 'Admitted'.
# NOTE(review): "final" is positional within df_clean's current row order --
# confirm the rows are chronological at this point.
admitted_by_encounter = df_clean.groupby('PAT_ENC_CSN_ID', group_keys=False)['Admitted_YN']
admitted_by_encounter.apply(_ends_admitted).sum()

44130

In [15]:
def _ends_not_admitted(status):
    """Return True when the last Admitted_YN value of one encounter is 'Not Admitted'."""
    return status.iloc[-1] == 'Not Admitted'


# Number of encounters whose final row is labelled 'Not Admitted'.
# NOTE(review): "final" is positional within df_clean's current row order --
# confirm the rows are chronological at this point.
status_by_encounter = df_clean.groupby('PAT_ENC_CSN_ID', group_keys=False)['Admitted_YN']
status_by_encounter.apply(_ends_not_admitted).sum()

66563

In [16]:
# Admitted share: 44130 admitted encounters out of 110693 in total.
# NOTE(review): both numbers are copied by hand from displayed cell outputs
# and will go stale if df_clean changes.
44130/110693

0.39867019594734987

In [17]:
# Total encounters = admitted (44130) + not admitted (66563).
# NOTE(review): both numbers are copied by hand from displayed cell outputs.
44130+66563

110693

# If I am filtering out "short" encounters, what threshold should be used?

**Assumptions** <br>
1. A short encounter is one with somewhat fewer than 10 events.

In [18]:
# Order all events chronologically before looking at per-encounter positions.
# Many events share the same Calculated_DateTime, and the default quicksort is
# not stable, so tied rows could come out in an arbitrary order. A stable sort
# keeps tied rows in their existing relative order.
# NOTE(review): this assumes the row order of df_clean.csv is meaningful for
# events with identical timestamps -- confirm.
df_clean = df_clean.sort_values(by='Calculated_DateTime', kind='mergesort')

In [19]:
# One group per encounter; rows inside each group keep df_clean's current
# row order.
groups = df_clean.groupby('PAT_ENC_CSN_ID')

In [32]:
# For every zero-based position i, collect the EVENT_NAME found at that
# position in each encounter: events_cnt[i] holds one entry per encounter
# that has more than i events.
#
# Only the first MAX_EVENT_POSITION events of an encounter are tallied. The
# names are read by slicing the EVENT_NAME column once per encounter rather
# than building a full row with .iloc[i] for every event.
MAX_EVENT_POSITION = 200
events_cnt = defaultdict(list)
for pat_id, df_grb in groups:
    leading_event_names = df_grb['EVENT_NAME'].iloc[:MAX_EVENT_POSITION]
    for position, event_name in enumerate(leading_event_names):
        events_cnt[position].append(event_name)
        
    
    

In [31]:
# Event names seen at zero-based position 6 (the 7th event of an encounter),
# most frequent first.
Counter(events_cnt[6]).most_common()

[('Triage Completed', 30162),
 ('ED SECONDARY TRIAGE', 16300),
 ('Patient Moving', 6871),
 ('Patient roomed in ED', 6589),
 ('Bed was Held', 3980),
 ('Initial ED Provider Contact', 3250),
 ('BLOOD GLUCOSE POCT', 3026),
 ('POSS Documented', 2917),
 ('IP NUR BLOOD SUGAR BELOW 70', 2441),
 ('Swallow Screen Trigger Event', 2189),
 ('TRIAGE DIRECT TO ROOM', 2160),
 ('RN Re-Assement Note Filed', 2101),
 ('GLUCOSE POC METER', 2074),
 ('CBC WITH DIFFERENTIAL', 2021),
 ('CBC W/ DIFF', 1806),
 ('URINALYSIS + REFLEX MICROSCOPIC', 1573),
 ('Triage Started', 1445),
 ('CMPL (COMPREHENSIVE METABOLIC PANEL)', 1319),
 ('RME Note Filed', 1101),
 ('TROPONIN I HS', 1048),
 ('IP NUR PHYSICIAN COMMUNICATION TO NURSING', 739),
 ('IP NUR INSERT PERIPHERAL IV', 715),
 ('IP NUR NOTIFY PHYSICIAN (SPECIFY)', 707),
 ('SODIUM CHLORIDE 0.9 % SYRINGE', 700),
 ('IP NUR IN AND OUT CATHETERIZATION', 673),
 ('CORONAVIRUS, COVID-19 W/ OR W/O RESPIRATORY TESTING', 627),
 ('BMP (BASIC METABOLIC PANEL)', 584),
 ('XR CHEST 1 

In [97]:
# Number of rows (events) per encounter, indexed by PAT_ENC_CSN_ID.
# GroupBy.size() is the built-in for this and avoids calling a Python lambda
# on every group.
df_len = groups.size()

In [98]:
# Encounters that have at least 6 events, with their event counts.
df_len.loc[df_len.ge(6)]

PAT_ENC_CSN_ID
86725001      99
619927095     10
651542751     30
651542792     31
651542795    110
            ... 
693399528     96
693399583     83
693401044    112
693401129     16
693401341     33
Length: 110300, dtype: int64

In [99]:
# Share of encounters with at least 6 events. The numerator is computed from
# df_len instead of being typed in from a previous output, so it cannot go
# stale when the data changes.
int((df_len >= 6).sum()) / len(df_len)

0.9964496399953023

In [101]:
# Set of encounter IDs that have at least 6 events.
# NOTE(review): as the cell's last expression this prints the whole set
# (110300 IDs per the count displayed above); consider assigning it to a
# variable and displaying only its length.
set(df_len[df_len>=6].index)

{669253632,
 680001536,
 677117954,
 686555140,
 670302213,
 681050118,
 685768709,
 691535876,
 674496522,
 666894347,
 669515787,
 681574410,
 687341579,
 685768721,
 662175762,
 664797204,
 662437909,
 685768728,
 651690013,
 665321502,
 661913631,
 674496541,
 681050144,
 684982307,
 669253668,
 681574436,
 687341610,
 686817325,
 687341618,
 688128051,
 691798068,
 662437942,
 691535927,
 651690045,
 687341629,
 686555201,
 676069442,
 665583683,
 691535940,
 668467269,
 674758728,
 687341640,
 657981514,
 673972298,
 671613004,
 679215183,
 664010835,
 683671635,
 690487380,
 672399447,
 665583704,
 674496601,
 684982362,
 664535136,
 675020897,
 686555235,
 689963109,
 655097958,
 690487398,
 691798119,
 688334806,
 664797295,
 672923762,
 690487411,
 691535988,
 654311541,
 681574517,
 662438007,
 684720246,
 651952261,
 659292294,
 662438029,
 684982413,
 667943056,
 671088786,
 653787283,
 653787284,
 667156627,
 687341714,
 657457305,
 666370202,
 674496667,
 685244572,
 681