In [1]:
import pandas as pd
import csv
import numpy as np
import seaborn as sns
import os
import psycopg2
from psycopg2 import OperationalError, DatabaseError, sql
import fuzzywuzzy
from fuzzywuzzy import process
from datetime import timedelta

def read_csv(csv_file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)
    print(csv_file_path)
    print('Shape:', df.shape)
    print(df.head())
    return df
"""
Saves pandas DataFrame as a CSV file.
"""
def save_df_as_csv(df, csv_name, directory='dataframes'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    file_path = os.path.join(directory, csv_name)
    df.to_csv(file_path, index=False)

    print(f'DataFrame has been saved as {file_path}')

In [2]:
### Environment Variables for Connection ###
DB_NAME = 'smcdougall'
USERNAME = 'postgres'
PASSWORD = 'postgres'
HOST = 'localhost'
PORT = 5432 

def connect_to_postgres(db_name, username, password, host, port):
    connection = None
    try:
        connection = psycopg2.connect(
            dbname=db_name,
            user=username,
            password=password,
            host=host,
            port=port
        )
        print('Connected to db:', db_name)
        return connection
    except OperationalError as e:
        print('Received the following error:', e)
        return None

def verify_postgres_connection(connection):
    if connection is not None:
        try:
            cur = connection.cursor()
            cur.execute('SELECT version();')
            db_version = cur.fetchone()
            print('The Postgres database version is:', db_version)
            cur.close()
        except DatabaseError as e:
            print('Received the following error:', e)
    else:
        print('Connection to Postgres failed.')

def close_connection(connection):
    if connection is not None:
        connection.close()
        print('Postgres connection has been closed.')

connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
verify_postgres_connection(connection)
close_connection(connection)

Connected to db: smcdougall
The Postgres database version is: ('PostgreSQL 14.5 on aarch64-apple-darwin20.6.0, compiled by Apple clang version 12.0.5 (clang-1205.0.22.9), 64-bit',)
Postgres connection has been closed.


### Creating Hourly Intervals
- use pandas resample method: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html -- used to resample time-series data and used for frequency conversion. Object must have a datetime-like index
- Then use pandas aggregation method: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.aggregate.html -- need to include functions for aggregating the data - define as a dictionary for each of the data fields
- need to figure out the best way to handle the `itemid` field, and how to one-hot encode after the resampling
- resample --> aggregate --> one-hot encode --> fill in missing values

In [4]:
final_timeseries_df = read_csv('final_dfs/timeseries.csv')
final_timeseries_df = final_timeseries_df.set_index('charttime')

final_dfs/timeseries.csv
Shape: (6366718, 36)
   labevent_id  subject_id  hadm_id          itemid            charttime  \
0          NaN    18300046      NaN  Blood Pressure  2109-04-03 00:00:00   
1          NaN    18300046      NaN     Weight (kg)  2109-04-03 00:00:00   
2          NaN    16224440      NaN     Weight (kg)  2109-04-07 00:00:00   
3          NaN    16224440      NaN     BMI (kg/m2)  2109-04-07 00:00:00   
4          NaN    16224440      NaN  Blood Pressure  2109-04-07 00:00:00   

        value valueuom  ref_range_lower  ref_range_upper labevents_flag  ...  \
0      100/70     mmHg              NaN              NaN            NaN  ...   
1  56.3361264       kg              NaN              NaN            NaN  ...   
2  52.7981088       kg              NaN              NaN            NaN  ...   
3        19.6    kg/m2              NaN              NaN            NaN  ...   
4      100/70     mmHg              NaN              NaN            NaN  ...   

  medication_dis

In [5]:
final_timeseries_df.columns

Index(['labevent_id', 'subject_id', 'hadm_id', 'itemid', 'value', 'valueuom',
       'ref_range_lower', 'ref_range_upper', 'labevents_flag',
       'labevents_priority', 'source', 'stay_id', 'procedure_locationcategory',
       'procedure_ordercategoryname', 'procedure_ordercategorydescription',
       'patientweight', 'duration', 'microbiology_orgname', 'curr_service',
       'transfer_id', 'eventtype', 'care_unit_group', 'disposition',
       'arrived_by_urgent_transport', 'medication_reconciliation',
       'medication_dispensation', 'temperature', 'heartrate', 'resprate',
       'o2sat', 'sbp', 'dbp', 'pain', 'dose_unit_rx',
       'medication_prescription'],
      dtype='object')

In [6]:
# drop valueuom because there are no itemid's for which valueuom is not consistent across
# the rows for that itemid
# drop ref_range_lower and ref_range_upper because they are metadata and not specific to a patient's
# actual measurement
# drop dose_unit_rx because it is also metadata and we are just recording the presence of a medication, not its dosage
final_timeseries_df = final_timeseries_df.drop(columns=['valueuom', 'ref_range_lower',
                                                        'ref_range_upper', 'dose_unit_rx',
                                                       'labevent_id', 'source'])

In [7]:
# convert labevents flag to binary
# used to flag whether the labevents result is abnormal
final_timeseries_df['is_abnormal_lab_result'] = final_timeseries_df['labevents_flag'].apply(lambda x: 1 if x == 'abnormal' else 0)
final_timeseries_df['is_urgent_lab_event'] = final_timeseries_df['labevents_priority'].apply(lambda x: 1 if x == 'STAT' else 0)
final_timeseries_df = final_timeseries_df.drop(columns=['labevents_flag', 'labevents_priority'])

In [8]:
final_timeseries_df = final_timeseries_df.rename(columns={'eventtype': 'transfer_event'})

In [9]:
final_timeseries_df.index = pd.to_datetime(final_timeseries_df.index)

In [10]:
final_timeseries_df[final_timeseries_df['stay_id'].notna() & final_timeseries_df['hadm_id'].notna()].shape

(1053519, 29)

Based on this above logic, there are many rows for which there is both a stay id and hadm id defined. This means that the patient, throughout their ED stay, was admitted to the hospital, or in the case of the ICU data, it means that the stay id corresponds to the individual ward but the hadm id corresponds to the overall stay.

In this case, let's first try to do aggregation based on hadm_id, and if that is undefined, apply it to the stay id instead.

In [11]:
final_timeseries_df[final_timeseries_df['stay_id'].isna() & final_timeseries_df['hadm_id'].isna()].shape

(2487484, 29)

TODO: figure out how to handle data for which stay if and hadm id are both null

#### Aggregation approach
- for `itemid` (and similar columns), don't use `"first"` for the aggregation -- we will lose info if multiple events are recorded within the same hour
- should one-hot encode this field first, save it to a variable, resample and use `max()` for the aggregation (since the field will now be binary), and then handle all the numeric fields separately and combine (`"join"`) with the aggregated one-hot-encoded data
- since `itemid` is tied to the `value` field, we need to preserve the context of each `itemid` -- (1) resample and aggregate the `value` field separately for each item id, (2) combine the aggregated values into dataframe, (3) combine with rest of the dataset

In [12]:
# purpose: for the pivot table, the 'value' column needs to be numeric so that we can aggregate
# by the mean
def contains_forward_slash(value):
    return '/' in str(value)

# Filter rows where 'value' contains a forward slash
filtered_df = final_timeseries_df[final_timeseries_df['value'].apply(contains_forward_slash)]
accumulated_itemids = filtered_df.groupby('itemid').size().reset_index(name='count')
print(accumulated_itemids)

             itemid   count
0    Blood Pressure  339041
1  Ventilation Rate    3008


Blood pressure is the only itemid that is not numeric - We do not currently have readings for systolic and diastolic within `itemid` but we do have columns for them within the final dataframe...

Approach - for rows where itemid is 'Blood Pressure', separate out into `systolic` and `diastolic` (because blood pressure is recorded as `systolic / diastolic`) and populate the `sbp` and `dbp` columns for that row (i.e. for the timestamp)

In [13]:
bp_rows = final_timeseries_df['itemid'] == 'Blood Pressure'
final_timeseries_df.loc[bp_rows, ['sbp', 'dbp']] = final_timeseries_df.loc[bp_rows, 'value'].str.split('/', expand=True)

final_timeseries_df['sbp'] = pd.to_numeric(final_timeseries_df['sbp'], errors='coerce')
final_timeseries_df['dbp'] = pd.to_numeric(final_timeseries_df['dbp'], errors='coerce')

final_timeseries_df.loc[bp_rows, ['itemid', 'value']] = None
final_timeseries_df['value'] = pd.to_numeric(final_timeseries_df['value'], errors='coerce')

In [14]:
final_timeseries_df[final_timeseries_df['stay_id'].notna() & final_timeseries_df['hadm_id'].isna()].shape

(384196, 29)

In [15]:
# if hadm_id is null but stay_id is defined, fill hadm_id with the stay_id -- ids will not be factored
# into the final model, so we are not manipulating any of the information
# this will be used for the aggregation going forward

# if hadm_id null but stay_id is defined, then this was an ED visit (if ICU visit, hadm_id will not be null)
final_timeseries_df['is_ed_visit'] = 0
final_timeseries_df.loc[final_timeseries_df['hadm_id'].isna() & final_timeseries_df['stay_id'].notna(), 'is_ed_visit'] = 1
final_timeseries_df['hadm_id'] = final_timeseries_df['hadm_id'].fillna(final_timeseries_df['stay_id'])
# rows with undefined hadm id and stay id will attempt to be filled with transfer id (from ICU)
final_timeseries_df['hadm_id'] = final_timeseries_df['hadm_id'].fillna(final_timeseries_df['transfer_id'])

In [16]:
# pivot itemid column into separate columns so that we can do proper aggregation
# wide-format DataFrame where each unique itemid becomes a column, and the values are populated according to the value field
# pivot requires the combination of index and columns to be unique
df_cleaned = final_timeseries_df.dropna(subset=['itemid', 'value'])
pivot_df = final_timeseries_df.pivot_table(index=['charttime', 'hadm_id'], columns='itemid', values='value', aggfunc='mean')

In [18]:
pivot_df = pivot_df.astype(pd.SparseDtype("float", np.nan))

#### Modify medication fields

In [20]:
# apply to 'medication_prescription'
from scipy.sparse import csr_matrix
# create column for each medication
prescription_df = final_timeseries_df[['subject_id', 'hadm_id', 'medication_prescription']].copy()
# drops from 6 mill to ~92k
prescription_df = prescription_df.dropna()
print(prescription_df.shape)
prescription_dummies = pd.get_dummies(prescription_df['medication_prescription'], prefix='prescription')
prescription_dummies_sparse = csr_matrix(prescription_dummies)
prescription_dummies_df = pd.DataFrame.sparse.from_spmatrix(prescription_dummies_sparse, 
                                                            index=prescription_df.index, 
                                                            columns=prescription_dummies.columns)
 
# drop original column now that it is expanded
prescription_df = pd.concat([prescription_df, prescription_dummies_df], axis=1)
prescription_df.drop('medication_prescription', axis=1, inplace=True)

(92311, 3)


  prescription_dummies_df = pd.DataFrame.sparse.from_spmatrix(prescription_dummies_sparse,


In [21]:
prescription_df.shape

(92311, 291)

In [22]:
exclude_cols = ['charttime', 'subject_id', 'hadm_id']
cols_to_agg = [col for col in prescription_df.columns if col not in exclude_cols]

prescription_df = prescription_df.reset_index()
prescription_df = prescription_df.groupby('hadm_id').agg({
    'charttime': 'first',
    **{col: 'mean' for col in cols_to_agg}
}).reset_index()
prescription_df.set_index('charttime', inplace=True)

  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = prescription_df.groupby('hadm_id').agg({
  prescription_df = pres

In [23]:
prescription_df.head()

Unnamed: 0_level_0,hadm_id,prescription_acetaminophen,prescription_acetazolamide,prescription_acetylcysteine,prescription_acyclovir sodium,prescription_adenosine,prescription_albumin (human),prescription_albuterol sulfate,prescription_alemtuzumab,prescription_allopurinol,...,prescription_valganciclovir hydrochloride,prescription_valproate sodium,prescription_vancomycin hydrochloride,prescription_vecuronium bromide,prescription_vedolizumab,prescription_vinblastine sulfate,prescription_vincristine sulfate,prescription_vinorelbine,prescription_voriconazole,prescription_zoledronic acid
charttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2190-01-15 22:00:00,20000057.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2135-10-25 12:00:00,20000102.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2137-06-29 09:00:00,20001461.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2198-08-24 08:00:00,20001720.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2192-06-08 20:00:00,20001863.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# drop these fields from the main df
final_timeseries_df = final_timeseries_df.drop(columns=['medication_prescription', 'medication_dispensation',
                                                        'medication_reconciliation'])

Apply aggregation to the remaining fields:

In [34]:
# drop since they are contained in separate pivot table
final_timeseries_df = final_timeseries_df.drop(columns=['itemid', 'value'])

In [35]:
# ensure there are no rows where stay_id defined but hadm_id not defined
final_timeseries_df[final_timeseries_df['stay_id'].notna() & final_timeseries_df['hadm_id'].isna()].shape

(0, 25)

In [36]:
# see how many rows are not attached to any kind of stay
final_timeseries_df[final_timeseries_df['stay_id'].isna() & final_timeseries_df['hadm_id'].isna()].shape[0] / \
final_timeseries_df.shape[0]

0.38553914277340384

In [37]:
# drop rows where everything is undefined -- likely due to pulling out itemid/value
exclude_columns = ['subject_id', 'is_abnormal_lab_result', 'is_urgent_lab_event', 'is_ed_visit']

# Identify columns to check for NaN values
columns_to_check = [col for col in final_timeseries_df.columns if col not in exclude_columns]
final_timeseries_df = final_timeseries_df.dropna(how='all', subset=columns_to_check)

In [38]:
# see which fields are undefined for these 
final_timeseries_df[final_timeseries_df['stay_id'].isna() & final_timeseries_df['hadm_id'].isna()].isna().sum()

subject_id                                 0
hadm_id                               343969
stay_id                               343969
procedure_locationcategory            343969
procedure_ordercategoryname           343969
procedure_ordercategorydescription    343969
patientweight                          36399
duration                              343969
microbiology_orgname                  307570
curr_service                          343969
transfer_id                           343969
transfer_event                        343969
care_unit_group                       343969
disposition                           343969
arrived_by_urgent_transport           343969
temperature                           343969
heartrate                             343969
resprate                              343969
o2sat                                 343969
sbp                                   343969
dbp                                   343969
pain                                  343969
is_abnorma

In [39]:
final_timeseries_df[final_timeseries_df['patientweight'].notna()]['subject_id'].nunique()

15988

In [40]:
final_timeseries_df['subject_id'].nunique()

19076

In [41]:
final_timeseries_df[final_timeseries_df['patientweight'].notna()]['hadm_id'].isna().sum()

307570

In [42]:
final_timeseries_df[final_timeseries_df['patientweight'].notna()]['hadm_id'].notna().sum()

36910

We have important patient weight data available, and we have it available for a lot of patients, so we don't want to drop this column. For patient weights that are not NA, the overwhelming majority of hadm id's are NA... decision therefore is to try and take an aggregation of the patient's weight from what's available in the time series dataframe, and make it a static field instead... Will need to add this to list of assumptions

In [43]:
# do mean aggregation grouped by subject id so that we end up with single value per patient id
mean_weight_df = final_timeseries_df.groupby('subject_id')['patientweight'].mean().reset_index()
mean_weight_df.rename(columns={'patientweight': 'mean_patient_weight'}, inplace=True)
mean_weight_df.head()

Unnamed: 0,subject_id,mean_patient_weight
0,10000719,107.047712
1,10001319,
2,10001472,78.83429
3,10001884,67.278856
4,10002266,65.118802


In [44]:
# add to static data
static_df = read_csv('final_dfs/static_data.csv')

final_dfs/static_data.csv
Shape: (668038, 22)
   subject_id     stay_id icd_code        source  hadm_id  temperature  \
0    10001884  31306678.0      J45  ed_diagnosis      NaN          NaN   
1    10001884  31742950.0      J44  ed_diagnosis      NaN          NaN   
2    10001884  33281437.0      R06  ed_diagnosis      NaN          NaN   
3    10001884  33281437.0      I10  ed_diagnosis      NaN          NaN   
4    10001884  33281437.0      J45  ed_diagnosis      NaN          NaN   

   heartrate  resprate  o2sat  sbp  ...  acuity  is_preg_chief_complaint  \
0        NaN       NaN    NaN  NaN  ...     NaN                      NaN   
1        NaN       NaN    NaN  NaN  ...     NaN                      NaN   
2        NaN       NaN    NaN  NaN  ...     NaN                      NaN   
3        NaN       NaN    NaN  NaN  ...     NaN                      NaN   
4        NaN       NaN    NaN  NaN  ...     NaN                      NaN   

   admittime  dischtime admission_location discharge

  df = pd.read_csv(csv_file_path)


In [46]:
save_df_as_csv(static_df, 'static_data.csv', 'final_dfs')

DataFrame has been saved as final_dfs/static_data.csv


In [47]:
final_timeseries_df = final_timeseries_df.drop(columns=['patientweight'])
final_timeseries_df = final_timeseries_df.dropna(subset=[col for col in final_timeseries_df.columns if col != 'subject_id'], how='all')

#### Handling undefined hadm_id for microbiology rows
The documentation suggests that microbiology rows that do not have `hadm_id` defined should use the nearest `hadm_id` based on time. This will require sorting the dataframe by subject_id and charttime, and using backfill to fill in the hadm_id values where microbiology_orgname is defined but hadm_id is missing.

In [48]:
final_timeseries_df['microbiology_orgname'].head()

charttime
2109-04-03    NaN
2109-04-07    NaN
2109-04-08    NaN
2109-04-26    NaN
2109-06-14    NaN
Name: microbiology_orgname, dtype: object

In [49]:
final_timeseries_df.sort_index(inplace=True)
def fill_hadm_id(group):
    group['hadm_id'] = group['hadm_id'].fillna(method='bfill')
    # if backfill doesn't work, try forward fill
    group['hadm_id'] = group['hadm_id'].fillna(method='ffill')
    return group

final_timeseries_df = final_timeseries_df.groupby('subject_id').apply(fill_hadm_id)

  group['hadm_id'] = group['hadm_id'].fillna(method='bfill')
  group['hadm_id'] = group['hadm_id'].fillna(method='ffill')
  final_timeseries_df = final_timeseries_df.groupby('subject_id').apply(fill_hadm_id)


In [50]:
# see updated results
exclude_columns = ['subject_id', 'is_abnormal_lab_result', 'is_urgent_lab_event', 'is_ed_visit']

# Identify columns to check for NaN values
columns_to_check = [col for col in final_timeseries_df.columns if col not in exclude_columns]
final_timeseries_df = final_timeseries_df.dropna(how='all', subset=columns_to_check)

In [52]:
final_timeseries_df[final_timeseries_df['stay_id'].isna() & final_timeseries_df['hadm_id'].isna()].isna().sum()

subject_id                            0
hadm_id                               0
stay_id                               0
procedure_locationcategory            0
procedure_ordercategoryname           0
procedure_ordercategorydescription    0
duration                              0
microbiology_orgname                  0
curr_service                          0
transfer_id                           0
transfer_event                        0
care_unit_group                       0
disposition                           0
arrived_by_urgent_transport           0
temperature                           0
heartrate                             0
resprate                              0
o2sat                                 0
sbp                                   0
dbp                                   0
pain                                  0
is_abnormal_lab_result                0
is_urgent_lab_event                   0
is_ed_visit                           0
dtype: int64

In [53]:
agg_functions = {
    #'subject_id': 'first',  # Assuming same subject per hour
    #'hadm_id': 'first',  # Assuming same admission per hour
    'stay_id': 'first',  # Assuming same stay per hour
    'procedure_locationcategory': 'first',
    'procedure_ordercategoryname': 'first',
    'procedure_ordercategorydescription': 'first',
    # total duration per hour
    'duration': 'sum',
    # assume one per hour
    'microbiology_orgname': 'first',
    'curr_service': 'first',
    # assuming same transfer per hour
    'transfer_id': 'first',
    'transfer_event': 'first',
    'care_unit_group': 'first',
    # assume only one disposition within the hour
    'disposition': 'first',
    # tracking presence of field
    'arrived_by_urgent_transport': 'max',
    'heartrate': 'mean',
    'resprate': 'mean',
    'o2sat': 'mean',
    'sbp': 'mean',
    'dbp': 'mean',
    'pain': 'mean',
    'charttime': 'first',
    'is_ed_visit': 'max',
    'is_urgent_lab_event': 'max',
    'is_abnormal_lab_result': 'max'
}

In [54]:
# get the indices as columns
df_reset = final_timeseries_df.drop(columns=['subject_id']).reset_index()
aggregated_df = df_reset.groupby('hadm_id').agg(agg_functions).reset_index()
print(aggregated_df.columns)

df_with_hadm_id = df_reset[['hadm_id', 'subject_id']]
aggregated_df = pd.merge( aggregated_df, df_with_hadm_id, on='hadm_id', how='left')

# Set 'subject_id' and 'charttime' back as MultiIndex if required
aggregated_df.set_index(['subject_id', 'charttime'], inplace=True)
aggregated_df.drop_duplicates(inplace=True)

Index(['hadm_id', 'stay_id', 'procedure_locationcategory',
       'procedure_ordercategoryname', 'procedure_ordercategorydescription',
       'duration', 'microbiology_orgname', 'curr_service', 'transfer_id',
       'transfer_event', 'care_unit_group', 'disposition',
       'arrived_by_urgent_transport', 'heartrate', 'resprate', 'o2sat', 'sbp',
       'dbp', 'pain', 'charttime', 'is_ed_visit', 'is_urgent_lab_event',
       'is_abnormal_lab_result'],
      dtype='object')


In [55]:
aggregated_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hadm_id,stay_id,procedure_locationcategory,procedure_ordercategoryname,procedure_ordercategorydescription,duration,microbiology_orgname,curr_service,transfer_id,transfer_event,...,arrived_by_urgent_transport,heartrate,resprate,o2sat,sbp,dbp,pain,is_ed_visit,is_urgent_lab_event,is_abnormal_lab_result
subject_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
16925328,2151-05-25 15:03:00,20000024.0,35408147.0,,,,0 days 09:00:000 days 09:00:000 days 18:33:480...,,MED,35408147.0,ED,...,0.0,,,,,,,0,1,1
11146739,2190-01-15 14:41:00,20000057.0,34390189.0,,,,0 days 03:59:000 days 03:59:002 days 21:25:240...,,MED,34390189.0,ED,...,0.0,,,,,,,0,1,1
14546051,2130-12-10 00:00:00,20000069.0,,,,,0 days 05:15:531 days 19:46:180 days 00:00:00,GRAM POSITIVE BACTERIA,OBS,32483896.0,admit,...,,,,,,,,0,0,0
13074106,2135-05-20 00:00:00,20000102.0,,,,,0 days 08:33:123 days 03:31:050 days 23:00:003...,,OBS,38042169.0,admit,...,,,,,,,,0,0,1
13559141,2174-12-06 00:00:00,20000347.0,37412591.0,,,,0 days 06:10:000 days 06:10:001 days 23:47:430...,SERRATIA MARCESCENS,MED,37412591.0,ED,...,0.0,,,,,,,0,0,1


In [56]:
aggregated_df.shape

(110296, 22)

In [57]:
aggregated_df.isna().sum()

hadm_id                                    0
stay_id                                44466
procedure_locationcategory            105689
procedure_ordercategoryname           105689
procedure_ordercategorydescription    105689
duration                                   0
microbiology_orgname                   99324
curr_service                           53474
transfer_id                            20609
transfer_event                         20609
care_unit_group                        20609
disposition                            67158
arrived_by_urgent_transport            67158
heartrate                              68838
resprate                               68893
o2sat                                  69982
sbp                                    68848
dbp                                    68852
pain                                   71971
is_ed_visit                                0
is_urgent_lab_event                        0
is_abnormal_lab_result                     0
dtype: int

In [58]:
aggregated_df.drop(columns=['stay_id'], inplace=True)

In [59]:
# verify that number of rows in the aggregated dataframe correspond to number of unique admission ids
final_timeseries_df['hadm_id'].nunique() == aggregated_df.shape[0]

True

### Static Data Inspection and Interpolation

In [60]:
patient_df = read_csv('final_dfs/patient_data.csv')

final_dfs/patient_data.csv
Shape: (19106, 10)
   subject_id     hadm_id insurance language marital_status   race  dod  \
0    10000719  24558333.0     Other  ENGLISH         SINGLE  White  NaN   
1    10001319  23005466.0     Other  ENGLISH        MARRIED  White  NaN   
2    10001319  24591241.0     Other  ENGLISH        MARRIED  White  NaN   
3    10001319  29230609.0     Other  ENGLISH        MARRIED  White  NaN   
4    10001472  23506139.0     Other  ENGLISH        MARRIED  White  NaN   

   age_at_admission  admit_date    source  
0                34  2140-04-15  patients  
1                30  2135-07-20  patients  
2                33  2138-11-09  patients  
3                29  2134-04-15  patients  
4                35  2186-01-10  patients  


Now that we are aggregated by admission, we need to aggregate the patients table so that each patient is represented by a single row.
- Columns to keep: `subject_id`, `insurance`, `language`, `marital_status`, `race`, `dod`
- Columns to completely remove: `admit_date`, `source`
- Columns to move to aggregated time series table: `age_at_admission` (join on `hadm_id`)

In [61]:
age_df = patient_df[['hadm_id', 'age_at_admission']]
patient_df = patient_df.drop(columns=['hadm_id', 'age_at_admission', 'admit_date', 'source'])
# preserve multindex by resetting it
aggregated_df = aggregated_df.reset_index()
aggregated_df = pd.merge(aggregated_df, age_df, on='hadm_id', how='left')
aggregated_df.set_index(['subject_id', 'charttime'], inplace=True)

In [62]:
aggregated_df['age_at_admission'].isna().sum()

94264

In [63]:
patient_df['dod_binary'] = patient_df['dod'].notna().astype(int)
patient_df.drop(columns=['dod'], inplace=True)
patient_df['dod_binary'].value_counts()

dod_binary
0    19062
1       44
Name: count, dtype: int64

In [64]:
aggregated_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hadm_id,procedure_locationcategory,procedure_ordercategoryname,procedure_ordercategorydescription,duration,microbiology_orgname,curr_service,transfer_id,transfer_event,care_unit_group,...,heartrate,resprate,o2sat,sbp,dbp,pain,is_ed_visit,is_urgent_lab_event,is_abnormal_lab_result,age_at_admission
subject_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
16925328,2151-05-25 15:03:00,20000024.0,,,,0 days 09:00:000 days 09:00:000 days 18:33:480...,,MED,35408147.0,ED,Emergency,...,,,,,,,0,1,1,
11146739,2190-01-15 14:41:00,20000057.0,,,,0 days 03:59:000 days 03:59:002 days 21:25:240...,,MED,34390189.0,ED,Emergency,...,,,,,,,0,1,1,
14546051,2130-12-10 00:00:00,20000069.0,,,,0 days 05:15:531 days 19:46:180 days 00:00:00,GRAM POSITIVE BACTERIA,OBS,32483896.0,admit,Labor & Delivery,...,,,,,,,0,0,0,32.0
13074106,2135-05-20 00:00:00,20000102.0,,,,0 days 08:33:123 days 03:31:050 days 23:00:003...,,OBS,38042169.0,admit,Labor & Delivery,...,,,,,,,0,0,1,18.0
13559141,2174-12-06 00:00:00,20000347.0,,,,0 days 06:10:000 days 06:10:001 days 23:47:430...,SERRATIA MARCESCENS,MED,37412591.0,ED,Emergency,...,,,,,,,0,0,1,


In [65]:
static_df.head()

Unnamed: 0,subject_id,stay_id,icd_code,source,hadm_id,temperature,heartrate,resprate,o2sat,sbp,...,acuity,is_preg_chief_complaint,admittime,dischtime,admission_location,discharge_location,hospital_expire_flag,mean_patient_weight_x,mean_patient_weight_y,mean_patient_weight
0,10001884,31306678.0,J45,ed_diagnosis,,,,,,,...,,,,,,,,67.278856,67.278856,67.278856
1,10001884,31742950.0,J44,ed_diagnosis,,,,,,,...,,,,,,,,67.278856,67.278856,67.278856
2,10001884,33281437.0,R06,ed_diagnosis,,,,,,,...,,,,,,,,67.278856,67.278856,67.278856
3,10001884,33281437.0,I10,ed_diagnosis,,,,,,,...,,,,,,,,67.278856,67.278856,67.278856
4,10001884,33281437.0,J45,ed_diagnosis,,,,,,,...,,,,,,,,67.278856,67.278856,67.278856


In [66]:
static_df['hadm_id'] = static_df['hadm_id'].fillna(static_df['stay_id'])
static_df.drop(columns=['stay_id', 'source', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp',
                        'pain', 'acuity', 'mean_patient_weight_x', 'mean_patient_weight_y'], inplace=True)

### Combining Dataframes
- Decision - combine admission-level data and static data to use in FFN
- vitals data is time series, so keep that separate
- the admission-aggregated data is considered "static" because it is still static in the context of the admission
- Dataframes to combine on `hadm_id`: `prescription_df`, `dispensation_df`, `reconciliation_df`, `static_df`, `aggregated_df`, `patient_df`
- use a sequential approach to avoid errors

In [67]:
prescription_df.head()

Unnamed: 0_level_0,hadm_id,prescription_acetaminophen,prescription_acetazolamide,prescription_acetylcysteine,prescription_acyclovir sodium,prescription_adenosine,prescription_albumin (human),prescription_albuterol sulfate,prescription_alemtuzumab,prescription_allopurinol,...,prescription_valganciclovir hydrochloride,prescription_valproate sodium,prescription_vancomycin hydrochloride,prescription_vecuronium bromide,prescription_vedolizumab,prescription_vinblastine sulfate,prescription_vincristine sulfate,prescription_vinorelbine,prescription_voriconazole,prescription_zoledronic acid
charttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2190-01-15 22:00:00,20000057.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2135-10-25 12:00:00,20000102.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2137-06-29 09:00:00,20001461.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2198-08-24 08:00:00,20001720.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2192-06-08 20:00:00,20001863.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
# merge aggregated_df with prescription_df
aggregated_df = aggregated_df.reset_index()
print("Shape of aggregated_df", aggregated_df.shape)
print("Shape of prescription_df", prescription_df.shape)
prescription_columns = prescription_df.columns
merged_df = pd.merge(aggregated_df, prescription_df, on='hadm_id', how='left')
for col in prescription_columns:
    merged_df[col] = merged_df[col].astype(float)
fill_values = {col: 0 for col in prescription_columns}
merged_df.fillna(fill_values, inplace=True)
print("Shape of merged df", merged_df.shape)
merged_df.head()

Shape of aggregated_df (110296, 24)
Shape of prescription_df (24443, 290)
Shape of merged df (110296, 313)


Unnamed: 0,subject_id,charttime,hadm_id,procedure_locationcategory,procedure_ordercategoryname,procedure_ordercategorydescription,duration,microbiology_orgname,curr_service,transfer_id,...,prescription_valganciclovir hydrochloride,prescription_valproate sodium,prescription_vancomycin hydrochloride,prescription_vecuronium bromide,prescription_vedolizumab,prescription_vinblastine sulfate,prescription_vincristine sulfate,prescription_vinorelbine,prescription_voriconazole,prescription_zoledronic acid
0,16925328,2151-05-25 15:03:00,20000024.0,,,,0 days 09:00:000 days 09:00:000 days 18:33:480...,,MED,35408147.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11146739,2190-01-15 14:41:00,20000057.0,,,,0 days 03:59:000 days 03:59:002 days 21:25:240...,,MED,34390189.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14546051,2130-12-10 00:00:00,20000069.0,,,,0 days 05:15:531 days 19:46:180 days 00:00:00,GRAM POSITIVE BACTERIA,OBS,32483896.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13074106,2135-05-20 00:00:00,20000102.0,,,,0 days 08:33:123 days 03:31:050 days 23:00:003...,,OBS,38042169.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13559141,2174-12-06 00:00:00,20000347.0,,,,0 days 06:10:000 days 06:10:001 days 23:47:430...,SERRATIA MARCESCENS,MED,37412591.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
static_df.columns

Index(['subject_id', 'icd_code', 'hadm_id', 'is_preg_chief_complaint',
       'admittime', 'dischtime', 'admission_location', 'discharge_location',
       'hospital_expire_flag', 'mean_patient_weight'],
      dtype='object')

In [72]:
static_df['is_preg_chief_complaint'] = static_df['is_preg_chief_complaint'].fillna(0)
static_df['hospital_expire_flag']= static_df['hospital_expire_flag'].fillna(0)
static_df = static_df.drop_duplicates()
print(static_df.shape)

(668038, 10)


In [73]:
static_df.head()

Unnamed: 0,subject_id,icd_code,hadm_id,is_preg_chief_complaint,admittime,dischtime,admission_location,discharge_location,hospital_expire_flag,mean_patient_weight
0,10001884,J45,31306678.0,0.0,,,,,0.0,67.278856
1,10001884,J44,31742950.0,0.0,,,,,0.0,67.278856
2,10001884,R06,33281437.0,0.0,,,,,0.0,67.278856
3,10001884,I10,33281437.0,0.0,,,,,0.0,67.278856
4,10001884,J45,33281437.0,0.0,,,,,0.0,67.278856


In [74]:
# strip white space
static_df['icd_code'] = static_df['icd_code'].str.strip()
static_df_dummies = pd.get_dummies(static_df['icd_code'])
static_df_dummies = static_df_dummies.astype(int)
print('length of static df with dummies for icd code', static_df_dummies.shape[1])


count_occurrences = static_df_dummies.sum()

# Filter columns where the count of 1s is greater than 3
columns_to_keep = count_occurrences[count_occurrences > 3].index

# Keep only the relevant columns in static_df_dummies
static_df_dummies_filtered = static_df_dummies[columns_to_keep]
print('length of static df with dummies after filtering', static_df_dummies_filtered.shape[1])

static_df_with_dummies = pd.concat([static_df[['hadm_id']], static_df_dummies_filtered], axis=1)
static_df = static_df.drop(columns=['icd_code'])
static_df = static_df.drop_duplicates()
static_df = static_df.groupby('hadm_id').first().reset_index()
static_df_with_dummies = static_df_with_dummies.groupby('hadm_id').max().reset_index()
static_df_with_dummies = pd.concat([static_df, static_df_with_dummies], axis=1)
print(static_df['hadm_id'].nunique())
print(static_df_with_dummies.shape)

length of static df with dummies for icd code 2158
length of static df with dummies after filtering 1554
100020
(100020, 1564)


In [75]:
print(static_df.shape)

(100020, 9)


In [76]:
static_df_with_dummies = static_df_with_dummies.loc[:, ~static_df_with_dummies.columns.duplicated()]

In [77]:
# merge merged_df with static_df
print("Shape of merged_df", merged_df.shape)
print("Shape of static_df", static_df_with_dummies.shape)
merged_df = pd.merge(merged_df, static_df_with_dummies, on='hadm_id', how='left')
print("Shape of merged df", merged_df.shape)
merged_df.head()

Shape of merged_df (110296, 313)
Shape of static_df (100020, 1563)
Shape of merged df (110296, 1875)


Unnamed: 0,subject_id_x,charttime,hadm_id,procedure_locationcategory,procedure_ordercategoryname,procedure_ordercategorydescription,duration,microbiology_orgname,curr_service,transfer_id,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
0,16925328,2151-05-25 15:03:00,20000024.0,,,,0 days 09:00:000 days 09:00:000 days 18:33:480...,,MED,35408147.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11146739,2190-01-15 14:41:00,20000057.0,,,,0 days 03:59:000 days 03:59:002 days 21:25:240...,,MED,34390189.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14546051,2130-12-10 00:00:00,20000069.0,,,,0 days 05:15:531 days 19:46:180 days 00:00:00,GRAM POSITIVE BACTERIA,OBS,32483896.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13074106,2135-05-20 00:00:00,20000102.0,,,,0 days 08:33:123 days 03:31:050 days 23:00:003...,,OBS,38042169.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13559141,2174-12-06 00:00:00,20000347.0,,,,0 days 06:10:000 days 06:10:001 days 23:47:430...,SERRATIA MARCESCENS,MED,37412591.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
merged_df = merged_df.rename(columns={'subject_id_x': 'subject_id'})

In [79]:
# merge merged_df with patient_df
patient_df = patient_df.drop_duplicates(subset='subject_id', keep='first')
print("Shape of merged_df", merged_df.shape)
print("Shape of patient_df", patient_df.shape)
merged_df = pd.merge(merged_df, patient_df, on='subject_id', how='left')
print("Shape of merged df", merged_df.shape)
merged_df.head()

Shape of merged_df (110296, 1875)
Shape of patient_df (13951, 6)
Shape of merged df (110296, 1880)


Unnamed: 0,subject_id,charttime,hadm_id,procedure_locationcategory,procedure_ordercategoryname,procedure_ordercategorydescription,duration,microbiology_orgname,curr_service,transfer_id,...,Z95,Z96,Z97,Z98,Z99,insurance,language,marital_status,race,dod_binary
0,16925328,2151-05-25 15:03:00,20000024.0,,,,0 days 09:00:000 days 09:00:000 days 18:33:480...,,MED,35408147.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
1,11146739,2190-01-15 14:41:00,20000057.0,,,,0 days 03:59:000 days 03:59:002 days 21:25:240...,,MED,34390189.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
2,14546051,2130-12-10 00:00:00,20000069.0,,,,0 days 05:15:531 days 19:46:180 days 00:00:00,GRAM POSITIVE BACTERIA,OBS,32483896.0,...,0.0,0.0,0.0,0.0,0.0,Other,ENGLISH,MARRIED,Hispanic or Latino,0.0
3,13074106,2135-05-20 00:00:00,20000102.0,,,,0 days 08:33:123 days 03:31:050 days 23:00:003...,,OBS,38042169.0,...,0.0,0.0,0.0,0.0,0.0,Other,ENGLISH,SINGLE,Black or African American,0.0
4,13559141,2174-12-06 00:00:00,20000347.0,,,,0 days 06:10:000 days 06:10:001 days 23:47:430...,SERRATIA MARCESCENS,MED,37412591.0,...,0.0,0.0,0.0,0.0,0.0,Other,ENGLISH,MARRIED,White,1.0


In [80]:
print(merged_df['hadm_id'].nunique())

110296


### Handle NA Values

In [82]:
# handle insurance, language, marital_status, race
fallback_df = merged_df.groupby('subject_id').agg({
    'insurance': 'first',
    'language': 'first',
    'marital_status': 'first',
    'race': 'first'
}).reset_index()
df_merged = pd.merge(merged_df, fallback_df, on='subject_id', suffixes=('', '_fallback'))

df_merged['insurance'] = df_merged['insurance'].fillna(df_merged['insurance_fallback'])
df_merged['language'] = df_merged['language'].fillna(df_merged['language_fallback'])
df_merged['marital_status'] = df_merged['marital_status'].fillna(df_merged['marital_status_fallback'])
df_merged['race'] = df_merged['race'].fillna(df_merged['race_fallback'])

merged_df_with_dems = df_merged.drop(columns=['insurance_fallback', 'language_fallback', 'marital_status_fallback', 'race_fallback'])

In [83]:
merged_df_with_dems['race'].isna().sum()

56219

In [84]:
subject_ids_with_na = merged_df[merged_df['insurance'].isna()]['subject_id'].unique()
subject_ids_with_nat = subject_ids_with_na.tolist()
filtered_df = merged_df[merged_df['subject_id'].isin(subject_ids_with_na)]
print(filtered_df.shape[0])

56219


In [85]:
# fill NA with other
merged_df['insurance'] = merged_df['insurance'].fillna('UNKNOWN')
merged_df['language'] = merged_df['language'].fillna('UNKNOWN')
merged_df['marital_status'] = merged_df['marital_status'].fillna('UNKNOWN')
merged_df['race'] = merged_df['race'].fillna('Unknown')
merged_df['dod_binary'] = merged_df['dod_binary'].fillna(0)
merged_df['hospital_expire_flag'] = merged_df['hospital_expire_flag'].fillna(0)
merged_df['arrived_by_urgent_transport'] = merged_df['arrived_by_urgent_transport'].fillna(0)

In [86]:
# drop field because most are NA and those that aren't are primarily Unknown
merged_df = merged_df.drop(columns=['procedure_locationcategory'])

In [87]:
# information is already captured by 'transfer_event' column
merged_df = merged_df.drop(columns=['transfer_id'])

In [88]:
merged_df['procedure_during_admission'] = merged_df['procedure_ordercategoryname'].notna().astype(int)
merged_df = merged_df.drop(columns=['procedure_ordercategoryname', 'procedure_ordercategorydescription'])

In [89]:
merged_df['microbiology_orgname'] = merged_df['microbiology_orgname'].fillna('None')

In [91]:
merged_df['curr_service'] = np.where(
    (merged_df['is_ed_visit'] == 1) & (merged_df['curr_service'].isna()),
    'ED',
    merged_df['curr_service'].fillna('UNKNOWN'))

In [93]:
merged_df['discharge_location'].value_counts()

discharge_location
Home/Home Health Care                  39388
Unknown                                10100
Facility Care                           6289
Died                                     456
Left against Medical Advice              302
Another Medical/Healthcare Facility      287
Name: count, dtype: int64

In [94]:
# map "discharge_location" and "disposition" to single column
disposition_mapping = {
    "Home/Home Health Care": "HOME",
    "Unknown": "UNKNOWN",
    "Facility Care": "TRANSFER",
    "Died": "EXPIRED",
    "Left against Medical Advice": "LEFT",
    "Another Medical/Healthcare Facility": "TRANSFER"
}
merged_df['mapped_discharge_location'] = merged_df['discharge_location'].map(disposition_mapping)
merged_df['disposition'] = merged_df['mapped_discharge_location'].combine_first(merged_df['disposition'])
merged_df['disposition'] = merged_df['disposition'].fillna('UNKNOWN')
merged_df = merged_df.drop(columns=['discharge_location', 'mapped_discharge_location'])

In [95]:
merged_df['disposition'].value_counts()

disposition
HOME        59642
UNKNOWN     41130
TRANSFER     7340
LEFT         1431
EXPIRED       473
OTHER         204
ADMITTED       76
Name: count, dtype: int64

In [96]:
merged_df['transfer_event'].value_counts()

transfer_event
ED       63537
admit    26150
Name: count, dtype: int64

In [97]:
merged_df[merged_df['transfer_event'].isna()]['is_ed_visit'].value_counts()

is_ed_visit
1    20609
Name: count, dtype: int64

In [98]:
# create new field 'transfer_status' so we can keep the existing information of transfer to ED/hospital admission,
# will making sure that the ED rows are not NA
merged_df['transfer_status'] = merged_df['transfer_event']
merged_df['transfer_status'] = merged_df['transfer_status'].fillna('Stayed in ED')
merged_df = merged_df.drop(columns=['transfer_event'])
merged_df['transfer_status'].value_counts()

transfer_status
ED              63537
admit           26150
Stayed in ED    20609
Name: count, dtype: int64

In [99]:
merged_df['care_unit_group'] = merged_df['care_unit_group'].fillna('Emergency')

In [100]:
# all rows are either set to 0 or NA -- no useful information from this column
merged_df = merged_df.drop(columns=['arrived_by_urgent_transport'])

In [101]:
# field not needed
merged_df = merged_df.drop(columns=['subject_id_y'])

In [102]:
merged_df['mean_patient_weight'].describe()

count    93903.000000
mean        78.799214
std         39.141122
min          0.000000
25%         62.656175
50%         73.539842
75%         88.959377
max       1763.088797
Name: mean_patient_weight, dtype: float64

In [103]:
pat_weight_median = merged_df['mean_patient_weight'].median()
q25 = merged_df['mean_patient_weight'].quantile(0.25)
# fill NA values with median
merged_df['mean_patient_weight'] = merged_df['mean_patient_weight'].fillna(pat_weight_median)
merged_df.loc[merged_df['mean_patient_weight'] < q25, 'mean_patient_weight'] = pat_weight_median
merged_df.loc[merged_df['mean_patient_weight'] > 250, 'mean_patient_weight'] = pat_weight_median

In [104]:
merged_df['mean_patient_weight'].describe()

count    110296.000000
mean         81.447121
std          17.763564
min          62.656175
25%          73.539842
50%          73.539842
75%          85.479122
max         245.072734
Name: mean_patient_weight, dtype: float64

In [105]:
merged_df[merged_df['age_at_admission'].notna()]['subject_id'].nunique()

11723

In [106]:
def get_patient_table_data(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT subject_id, gender, anchor_age, anchor_year, anchor_year_group, dod
        FROM "mimiciv_hosp.filtered_patients"
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "gender", "anchor_age",
                                    "anchor_year", "anchor_year_group",
                                    "dod"])
    return df
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
patients_df = get_patient_table_data(connection)
patients_df.head()

Connected to db: smcdougall


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000719,F,34,2140,2008 - 2010,
1,10001319,F,28,2133,2008 - 2010,
2,10001472,F,35,2186,2011 - 2013,
3,10001884,F,68,2122,2008 - 2010,2131-01-20
4,10002266,F,31,2124,2011 - 2013,


In [107]:
patients_df['birth_year'] = patients_df['anchor_year'] - patients_df['anchor_age']
merged_df = pd.merge(merged_df, patients_df[['subject_id', 'birth_year']], on='subject_id', how='left')

In [108]:
merged_df['age_at_admission'] = merged_df['charttime'].dt.year - merged_df['birth_year']

In [109]:
merged_df = merged_df.drop(columns=['birth_year'])

Vital signs considered outside thresholds:
- heart rate = > 350 or < 86
- sbp/dbp - > 180 < 90, > 120, < 60
- resp rate - > 25 or < 12

In [110]:
merged_df['triage_heartrate_outside_thresh'] = np.where(
    merged_df['heartrate'].isna(),
    'Not applicable',
    np.where(
        (merged_df['heartrate'] > 350) | (merged_df['heartrate'] < 86),
        "1", # above/below threshold
        "0"
    )
)

In [111]:
merged_df['triage_sbp_outside_thresh'] = np.where(
    merged_df['sbp'].isna(),
    'Not applicable',
    np.where(
        (merged_df['sbp'] > 180) | (merged_df['sbp'] < 90),
        "1", # above/below threshold
        "0"
    )
)

In [112]:
merged_df['triage_dbp_outside_thresh'] = np.where(
    merged_df['dbp'].isna(),
    'Not applicable',
    np.where(
        (merged_df['dbp'] > 120) | (merged_df['dbp'] < 60),
        "1", # above/below threshold
        "0"
    )
)

In [113]:
merged_df['triage_resprate_outside_thresh'] = np.where(
    merged_df['resprate'].isna(),
    'Not applicable',
    np.where(
        (merged_df['resprate'] > 25) | (merged_df['resprate'] < 12),
        "1", # above/below threshold
        "0"
    )
)

In [114]:
merged_df['triage_o2sat_outside_thresh'] = np.where(
    merged_df['o2sat'].isna(),
    'Not applicable',
    np.where(
        (merged_df['o2sat'] > 100) | (merged_df['o2sat'] < 95),
        "1", # above/below threshold
        "0"
    )
)

In [115]:
merged_df['triage_o2sat_outside_thresh'].value_counts()

triage_o2sat_outside_thresh
Not applicable    69982
0                 38954
1                  1360
Name: count, dtype: int64

In [116]:
merged_df = merged_df.drop(columns=[
    'heartrate', 'sbp', 'dbp', 'resprate', 'pain', 'o2sat'
])

In [117]:
merged_df['admission_duration'] = pd.to_datetime(merged_df['dischtime']) - pd.to_datetime(merged_df['admittime'])

In [118]:
merged_df['duration'] = np.where(
    merged_df['admission_duration'].notna(),
    merged_df['admission_duration'],
    merged_df['duration']
)

In [119]:
merged_df = merged_df.drop(columns=['admission_duration', 'dischtime', 'admittime'])

In [120]:
merged_df['admission_location'].value_counts()

admission_location
Emergency/Urgent Care    27674
Referral                 26762
Walk-in                   1748
Procedure Site             605
Unknown                     33
Name: count, dtype: int64

In [121]:
merged_df[merged_df['admission_location'].isna()]['is_ed_visit'].value_counts()

is_ed_visit
1    43053
0    10421
Name: count, dtype: int64

In [122]:
merged_df['admission_location'] = np.where(
    (merged_df['is_ed_visit'] == 1) & (merged_df['admission_location'].isna()),
    'Emergency/Urgent Care',
    merged_df['admission_location']
)
merged_df['admission_location'] = merged_df['admission_location'].fillna('Unknown')

In [123]:
# fill everything else with 0
columns_with_na = merged_df.columns[merged_df.isna().any()].tolist()
chunk_size = 5

# Process columns in chunks
for i in range(0, len(columns_with_na), chunk_size):
    chunk = columns_with_na[i:i + chunk_size]
    merged_df[chunk] = merged_df[chunk].fillna(0)
    print(f'Processed columns: {chunk}')

Processed columns: ['is_preg_chief_complaint', 'A02', 'A04', 'A05', 'A07']
Processed columns: ['A08', 'A09', 'A15', 'A17', 'A31']
Processed columns: ['A32', 'A36', 'A40', 'A41', 'A46']
Processed columns: ['A48', 'A49', 'A52', 'A53', 'A54']
Processed columns: ['A56', 'A57', 'A59', 'A60', 'A63']
Processed columns: ['A64', 'A69', 'A74', 'A87', 'B00']
Processed columns: ['B01', 'B02', 'B08', 'B15', 'B17']
Processed columns: ['B18', 'B19', 'B20', 'B25', 'B27']
Processed columns: ['B30', 'B34', 'B35', 'B36', 'B37']
Processed columns: ['B44', 'B48', 'B49', 'B59', 'B69']
Processed columns: ['B86', 'B91', 'B94', 'B95', 'B96']
Processed columns: ['B97', 'B99', 'C01', 'C02', 'C15']
Processed columns: ['C16', 'C17', 'C18', 'C19', 'C20']
Processed columns: ['C21', 'C22', 'C23', 'C24', 'C25']
Processed columns: ['C26', 'C32', 'C33', 'C34', 'C41']
Processed columns: ['C43', 'C44', 'C48', 'C49', 'C50']
Processed columns: ['C52', 'C53', 'C54', 'C55', 'C56']
Processed columns: ['C57', 'C58', 'C64', 'C67

In [124]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Display NaN count for each column
nan_counts = merged_df.isna().sum()
nonzero_missing_counts = nan_counts[nan_counts > 0]
print(nonzero_missing_counts)

# Reset pandas options to default after viewing
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')

Series([], dtype: int64)


## Save Dataframes

In [125]:
save_df_as_csv(merged_df, 'final_static_data.csv', 'final_dfs')

DataFrame has been saved as final_dfs/final_static_data.csv


In [126]:
unique_ids = merged_df['subject_id'].unique()
unique_ids_df = pd.DataFrame(unique_ids, columns=['subject_id'])
save_df_as_csv(unique_ids_df, 'final_subject_id_list.csv', 'final_dfs')

DataFrame has been saved as final_dfs/final_subject_id_list.csv


In [127]:
save_df_as_csv(pivot_df, 'pivot_df.csv', 'final_dfs')

DataFrame has been saved as final_dfs/pivot_df.csv


## Filter out admissions occurring after the delivery (exluding post-partum period)
- in clinical contexts, this is 6 weeks after delivery
- this will effectively only capture one pregnancy per patient
- relevant ICD-10 codes for delivery: O80-O84, Z38 (liveborn infants), O70-O71 (complications afterwrads)
- will want to remove these codes because they are redundant based on the output value (ex. if someone has code for hemorrhage, they will definitely be marked as part of the "positive" class for a hemorrhage)

In [3]:
input_data = read_csv('final_dfs/final_static_data.csv')

  df = pd.read_csv(csv_file_path)


final_dfs/final_static_data.csv
Shape: (110296, 1871)
   subject_id            charttime     hadm_id         duration  \
0    16925328  2151-05-25 15:03:00  20000024.0   75120000000000   
1    11146739  2190-01-15 14:41:00  20000057.0  255480000000000   
2    14546051  2130-12-10 00:00:00  20000069.0  175560000000000   
3    13074106  2135-05-20 00:00:00  20000102.0  296880000000000   
4    13559141  2174-12-06 00:00:00  20000347.0  177780000000000   

     microbiology_orgname curr_service   care_unit_group disposition  \
0                     NaN          MED         Emergency        HOME   
1                     NaN          MED         Emergency     UNKNOWN   
2  GRAM POSITIVE BACTERIA          OBS  Labor & Delivery        HOME   
3                     NaN          OBS  Labor & Delivery        HOME   
4     SERRATIA MARCESCENS          MED         Emergency     UNKNOWN   

   is_ed_visit  is_urgent_lab_event  ...  marital_status  \
0            0                    1  ...         U

In [4]:
print(input_data.shape)

(110296, 1871)


In [11]:
print(input_data['subject_id'].nunique())

19076


In [6]:
delivery_code_prefixes = ['O6', 'O7', 'O80', 'O81', 'O82', 'Z38']
delivery_code_cols = [col for col in input_data.columns 
                      if any(col.startswith(prefix) for prefix in delivery_code_prefixes)]
print(delivery_code_cols)

['O6002', 'O6003', 'O6012X0', 'O6012X1', 'O6012X2', 'O6013X0', 'O6014X0', 'O6014X1', 'O6014X2', 'O6023X0', 'O610', 'O611', 'O620', 'O621', 'O622', 'O623', 'O624', 'O628', 'O629', 'O630', 'O631', 'O639', 'O640XX0', 'O641XX0', 'O648XX0', 'O655', 'O658', 'O659', 'O660', 'O663', 'O6640', 'O6641', 'O665', 'O670', 'O678', 'O679', 'O68', 'O690XX0', 'O691XX0', 'O692XX0', 'O694XX0', 'O695XX0', 'O6981X0', 'O6981X1', 'O6981X2', 'O6982X0', 'O6989X0', 'O699XX0', 'O700', 'O701', 'O702', 'O7020', 'O7021', 'O703', 'O704', 'O709', 'O711', 'O713', 'O714', 'O715', 'O716', 'O717', 'O7181', 'O7182', 'O7189', 'O720', 'O721', 'O722', 'O723', 'O730', 'O731', 'O745', 'O751', 'O752', 'O753', 'O754', 'O7581', 'O7582', 'O7589', 'O76', 'O770', 'O80', 'O82']


In [8]:
filtered_input_data = input_data.copy()
filtered_input_data['charttime'] = pd.to_datetime(filtered_input_data['charttime'])
filtered_input_data['delivery_admission'] = filtered_input_data[delivery_code_cols].any(axis=1)

delivery_times = filtered_input_data[filtered_input_data['delivery_admission']].groupby('subject_id')['charttime'].min().reset_index()
# delivery time == the charttime when the delivery occurred
delivery_times.columns = ['subject_id', 'delivery_time']

# merge with original df
filtered_input_data = filtered_input_data.merge(delivery_times, on='subject_id', how='left')
# calculate postpartum
filtered_input_data['postpartum_period_end'] = filtered_input_data['delivery_time'] + timedelta(weeks=6)
filtered_input_data['is_postpartum'] = (filtered_input_data['delivery_admission']) & (filtered_input_data['charttime'] <= filtered_input_data['postpartum_period_end'])
filtered_input_data = filtered_input_data[filtered_input_data['is_postpartum'] | ~filtered_input_data['delivery_admission']]
filtered_input_data = filtered_input_data.drop(columns=delivery_code_cols + ['delivery_admission', 'delivery_time', 'postpartum_period_end']) 


In [9]:
print(filtered_input_data.shape)

(109751, 1789)


In [10]:
filtered_input_data['subject_id'].nunique()

19076

In [12]:
save_df_as_csv(filtered_input_data, 'final_static_data.csv', 'final_dfs')

DataFrame has been saved as final_dfs/final_static_data.csv
