In [5]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import simplefilter
simplefilter('ignore')

In [6]:
ROOT_PATH = '/work/InternalMedicine/s223850/ED-StaticDynamic/static_dynamic_ds_parallel'

In [7]:
file_1_s = 'static_7.csv'
file_1_d = 'dynamic_7.csv'

In [8]:
file_2_s = 'static_100.csv'
file_2_d = 'dynamic_100.csv'

In [9]:
file_3_s = 'static_230.csv'
file_3_d = 'dynamic_230.csv'

In [10]:
def basic_preprocess(df):
    if 'Arrived_Time' in df.columns:
        df['Arrived_Time'] = pd.to_datetime(df['Arrived_Time'])
    if 'Arrived_Time_appx' in df.columns:
        df = df.loc[df['Arrived_Time_appx']!='-1']
        df['Arrived_Time_appx'] = pd.to_datetime(df['Arrived_Time_appx'])
    if 'Calculated_DateTime' in df.columns:
        df['Calculated_DateTime'] = pd.to_datetime(df['Calculated_DateTime'])
    unamed_cols = [col for col in df.columns if 'unnamed' in col.lower()]
    if len(unamed_cols) >= 1:
        df.drop(columns=unamed_cols, inplace=True)
    return df

In [11]:
df_static_1 = pd.read_csv(os.path.join(ROOT_PATH, file_1_s))
df_static_2 = pd.read_csv(os.path.join(ROOT_PATH, file_2_s))
df_static_3 = pd.read_csv(os.path.join(ROOT_PATH, file_3_s))

df_dynamic_1 = pd.read_csv(os.path.join(ROOT_PATH, file_1_d))
df_dynamic_2 = pd.read_csv(os.path.join(ROOT_PATH, file_2_d))
df_dynamic_3 = pd.read_csv(os.path.join(ROOT_PATH, file_3_d))



In [12]:
df_static_1 = basic_preprocess(df_static_1)
df_static_2 = basic_preprocess(df_static_2)
df_static_3 = basic_preprocess(df_static_3)

df_dynamic_1 = basic_preprocess(df_dynamic_1)
df_dynamic_2 = basic_preprocess(df_dynamic_2)
df_dynamic_3 = basic_preprocess(df_dynamic_3)

In [57]:
class CustomCrossValidator:
    def __init__(self, date_col, training_period, testing_perid, overlap_perc=0):
        self.tr_pr = training_period
        self.te_pr = testing_perid
        self.overlap = overlap_perc
        self.date_col = date_col
        self.size_of_overlap = int(overlap_perc*training_period)

    def split(self, X, y=None):
        Xc = X.sort_values(by=self.date_col)
        start_date = X[self.date_col].iloc[0]
        end_date = X[self.date_col].iloc[-1]

        self.total_duration_in_month = (end_date-start_date).total_seconds()/(60*60*24*30);
        
        for idx, df_grp in df_sorted.groupby(['arr_mnth', 'arr_year']):
            if len(group_training) < training_size:
                group_training.append(df_grp)
            elif len(group_testing) < testing_size:
                group_testing.append(df_grp)
            else:
                df_train_fold = pd.concat(group_training)
                df_test_fold = pd.concat(group_testing)
                print(f"Between {group_training[0]['Arrived_Time_appx'].min()} to {group_training[-1]['Arrived_Time_appx'].max()}, the training data contains {df_train_fold['PAT_ENC_CSN_ID'].nunique()} encounters with total of {len(df_train_fold)} rows ...")
                print(f"Between {group_testing[0]['Arrived_Time_appx'].min()} to {group_testing[-1]['Arrived_Time_appx'].max()}, the testing data contains {df_test_fold['PAT_ENC_CSN_ID'].nunique()} encounters with total of {len(df_test_fold)} rows ...")
                print('-----------------------------')
                group_training = []
                group_testing = []
                yield df_train_fold, df_test_fold
                
            
        

In [10]:
CustomCrossValidator('Arrived_Time_appx', 6, 2, 

Index(['PAT_ENC_CSN_ID', 'Ethnicity', 'Sex', 'MultiRacial', 'Admitted_YN',
       'Patient_Age', 'Coverage_Financial_Class_Grouper',
       'Has Completed Appt in Last Seven Days',
       'Has Hospital Encounter in Last Seven Days',
       'Number of Inpatient Admissions in the last 30 Days',
       'Number of past appointments in last 60 days',
       'Number of past inpatient admissions over ED visits in last three years',
       'Chief_Complaint_All', 'Count_of_Chief_Complaints', 'Means_Of_Arrival',
       'Acuity_Level', 'FirstRace', 'Arrived_Time_appx'],
      dtype='object')

In [74]:
df_sorted = df_static_1.sort_values(by='Arrived_Time_appx')

In [14]:
df_sorted['Arrived_Time_appx']

0        2017-04-02 09:58:34
1        2019-11-14 10:19:52
2        2021-11-01 00:44:44
4        2021-11-01 00:55:01
5        2021-11-01 01:09:37
                 ...        
111760   2023-11-30 19:21:55
111761   2023-11-30 19:50:05
111762   2023-11-30 20:01:41
111763   2023-11-30 21:08:44
111764   2023-11-30 22:34:46
Name: Arrived_Time_appx, Length: 111762, dtype: datetime64[ns]

In [75]:
df_sorted['arr_mnth'] = df_sorted['Arrived_Time_appx'].dt.month
df_sorted['arr_year'] = df_sorted['Arrived_Time_appx'].dt.year

In [51]:
start_date = df_sorted['Arrived_Time_appx'].iloc[0]
end_date = df_sorted['Arrived_Time_appx'].iloc[-1]
start_date, end_date

(Timestamp('2017-04-02 09:58:34'), Timestamp('2023-11-30 22:34:46'))

In [78]:
training_size = 6
testing_size = 2
group_training = []
group_testing = []

for idx, df_grp in df_sorted.groupby(['arr_year', 'arr_mnth']):
    if len(group_training) < training_size:
        print(f'stacking {idx} to training group ...')
        group_training.append(df_grp)
    elif len(group_testing) < testing_size:
        print(f'stacking {idx} to testing group ...')
        group_testing.append(df_grp)
    else:
        df_train_fold = pd.concat(group_training)
        df_test_fold = pd.concat(group_testing)
        print(f"Between {group_training[0]['Arrived_Time_appx'].min()} to {group_training[-1]['Arrived_Time_appx'].max()}, the training data contains {df_train_fold['PAT_ENC_CSN_ID'].nunique()} encounters with total of {len(df_train_fold)} rows ...")
        print(f"Between {group_testing[0]['Arrived_Time_appx'].min()} to {group_testing[-1]['Arrived_Time_appx'].max()}, the testing data contains {df_test_fold['PAT_ENC_CSN_ID'].nunique()} encounters with total of {len(df_test_fold)} rows ...")
        print('-----------------------------')
        # group_training = []
        # group_testing = []
        # break
        # yield df_train_fold, df_test_fold
        

stacking (2017, 4) to training group ...
stacking (2019, 11) to training group ...
stacking (2021, 11) to training group ...
stacking (2021, 12) to training group ...
stacking (2022, 1) to training group ...
stacking (2022, 2) to training group ...
stacking (2022, 3) to testing group ...
stacking (2022, 4) to testing group ...
Between 2017-04-02 09:58:34 to 2022-02-28 22:13:01, the training data contains 16842 encounters with total of 16842 rows ...
Between 2022-03-01 00:17:41 to 2022-04-30 23:42:28, the testing data contains 8914 encounters with total of 8914 rows ...
-----------------------------
Between 2017-04-02 09:58:34 to 2022-02-28 22:13:01, the training data contains 16842 encounters with total of 16842 rows ...
Between 2022-03-01 00:17:41 to 2022-04-30 23:42:28, the testing data contains 8914 encounters with total of 8914 rows ...
-----------------------------
Between 2017-04-02 09:58:34 to 2022-02-28 22:13:01, the training data contains 16842 encounters with total of 16842 r

In [65]:
df_sorted[['Arrived_Time_appx', 'arr_mnth', 'arr_year']].iloc[0]

Arrived_Time_appx    2017-04-02 09:58:34
arr_mnth                               4
arr_year                            2017
Name: 0, dtype: object

In [66]:
df_sorted[['Arrived_Time_appx', 'arr_mnth', 'arr_year']].iloc[2]

Arrived_Time_appx    2021-11-01 00:44:44
arr_mnth                              11
arr_year                            2021
Name: 2, dtype: object

In [67]:
df_sorted[['Arrived_Time_appx', 'arr_mnth', 'arr_year']].iloc[3]

Arrived_Time_appx    2021-11-01 00:55:01
arr_mnth                              11
arr_year                            2021
Name: 4, dtype: object

In [72]:
idx

(4, 2023)

In [3]:
df = pd.read_csv('/work/InternalMedicine/s223850/ED-StaticDynamic/raw_data/ED Events - 12.1.23.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16832198 entries, 0 to 16832197
Data columns (total 30 columns):
 #   Column                                                                  Dtype  
---  ------                                                                  -----  
 0   PAT_ENC_CSN_ID                                                          int64  
 1   PAT_MRN_ID                                                              int64  
 2   PAT_ID                                                                  object 
 3   Ethnicity                                                               object 
 4   FirstRace                                                               object 
 5   MultiRacial                                                             int64  
 6   Sex                                                                     object 
 7   Arrived_Time                                                            object 
 8   ED_Disposition                

In [17]:
df['FirstRace'].value_counts()

White                                        9502845
Black or African American                    5329002
Unavailable/Unknown                           923066
Asian                                         517078
Some other race                               392277
American Indian or Alaska Native               76618
Declined                                       59346
Native Hawaiian or Other Pacific Islander      20277
Name: FirstRace, dtype: int64