# Cleaning:
1. Remove admission_ids which have both icu_mortality and mortality_after_discharge as True
2. Remove 0 hours
3. Remove row in each admission which are very close (<=5 hrs) to discharge. This is to avoid data leakage on discharge decision.
4. Add A
5. Add Y (Mortality within 7 days after discharge)
6. Add D (In-ICU mortality as a competing event)

In [1]:
import pandas as pd
import numpy as np

In [2]:
mimicdata = pd.read_parquet('../../../Datasets/Rounded_12h/df_model_12h_forward_rounded_12h_windows.parquet')

In [3]:
mimicdata = mimicdata.rename(columns={'hours_since_admission__last__overall' : 'hours_since_admission', 'prediction_timestamp': 'observation_window_end'})

In [4]:
mimicdata = mimicdata.sort_values(by=['hours_since_admission'])

In [5]:
mimicdata

Unnamed: 0,hash_patient_id,admission_timestamp,observation_window_end,activated_partial_thromboplastin_time__change_since_previous__last_12h,activated_partial_thromboplastin_time__last__last_12h,age__last__overall,alanine_transaminase__change_since_previous__last_12h,alanine_transaminase__last__last_12h,albumin__change_since_previous__last_12h,albumin__last__last_12h,...,readmission_to_icu,readmission_to_mcu,readmission_to_icu_or_mcu,readmission_or_mortality_after_discharge,outcome_days,table_source,s2g_run_timestamp,s2g_version,g2p_run_timestamp,g2p_version
0,10000032,2180-07-23 14:00:00,2180-07-23 14:00:00,,,52.000000,,,,,...,False,False,False,False,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
437190,18579410,2160-10-03 08:00:00,2160-10-03 08:00:00,,,55.599998,,,,,...,True,False,True,True,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
437041,18574585,2139-09-29 08:00:00,2139-09-29 08:00:00,,,57.200001,,,,,...,False,False,False,True,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
342499,16701193,2188-03-03 16:00:00,2188-03-03 16:00:00,,,48.000000,,,,,...,False,False,False,False,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
149589,12962986,2187-02-19 01:00:00,2187-02-19 01:00:00,,,57.000000,,,,,...,False,False,False,False,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364245,17153664,2135-09-25 02:29:00,2136-01-11 03:00:00,,,57.400002,,,,,...,False,False,False,False,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
364246,17153664,2135-09-25 02:29:00,2136-01-11 15:00:00,,,57.400002,,,,,...,False,False,False,False,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
364247,17153664,2135-09-25 02:29:00,2136-01-12 03:00:00,,,57.400002,,,,,...,False,False,False,False,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1
364248,17153664,2135-09-25 02:29:00,2136-01-12 15:00:00,,,57.400002,,,,,...,False,False,False,False,7,,NaT,,2025-01-15 15:03:36.792654,2.1.1


In [6]:
mimicdata.shape

(509380, 190)

In [7]:
mimicdata['admission_id'] = mimicdata.admission_timestamp.dt.strftime("%y%m%d%H") + "_" + mimicdata.hash_patient_id.str[:10]  # 10 chars enough to make unique

In [8]:
mimicdata.admission_id.nunique()

69181

In [9]:
nan_columns = mimicdata.columns[mimicdata.isna().all()].tolist()
nan_columns

['origin_department',
 'destination_department',
 'table_source',
 's2g_run_timestamp',
 's2g_version']

In [10]:
mimicdata = mimicdata.drop(columns=nan_columns)

In [11]:
mimicdata.columns

Index(['hash_patient_id', 'admission_timestamp', 'observation_window_end',
       'activated_partial_thromboplastin_time__change_since_previous__last_12h',
       'activated_partial_thromboplastin_time__last__last_12h',
       'age__last__overall',
       'alanine_transaminase__change_since_previous__last_12h',
       'alanine_transaminase__last__last_12h',
       'albumin__change_since_previous__last_12h', 'albumin__last__last_12h',
       ...
       'length_of_stay_hours', 'mortality_after_discharge',
       'readmission_to_icu', 'readmission_to_mcu', 'readmission_to_icu_or_mcu',
       'readmission_or_mortality_after_discharge', 'outcome_days',
       'g2p_run_timestamp', 'g2p_version', 'admission_id'],
      dtype='object', length=186)

In [12]:
mimicdata.iloc[0]

hash_patient_id                                                                             10000032
admission_timestamp                                                              2180-07-23 14:00:00
observation_window_end                                                           2180-07-23 14:00:00
activated_partial_thromboplastin_time__change_since_previous__last_12h                           NaN
activated_partial_thromboplastin_time__last__last_12h                                            NaN
                                                                                     ...            
readmission_or_mortality_after_discharge                                                       False
outcome_days                                                                                       7
g2p_run_timestamp                                                         2025-01-15 15:03:36.792654
g2p_version                                                                                

# Remove admissions which have both icu_mortality=True and mortality_after_discharge=True

In [13]:
# Identify admission_ids where both icu_mortality and mortality_after_discharge are True. These are admissions which have death very close to discharge (<=12 hours)
# means that they were likely icu_mortality but were marked as mortality_after_discharge due to inconsistencies in recording?
admissions_set_icumortality = mimicdata.loc[
    (mimicdata['icu_mortality'] == True) & (mimicdata['mortality_after_discharge'] == True),
    'admission_id'
].unique()

In [14]:
len(admissions_set_icumortality)

297

In [15]:
# Filter mimicdata for those admissions
subset = mimicdata[mimicdata['admission_id'].isin(admissions_set_icumortality)]

# Keep only the last row for each admission_id (assuming time ordering)
last_rows = subset.groupby('admission_id').tail(1)

# Convert to datetime if not already
last_rows['discharge_timestamp'] = pd.to_datetime(last_rows['discharge_timestamp'])
last_rows['observation_window_end'] = pd.to_datetime(last_rows['observation_window_end'])

# Calculate time difference in hours
time_diff_hours = (last_rows['discharge_timestamp'] - last_rows['observation_window_end']).dt.total_seconds() / 3600

# Count how many are within 12 hours
within_hr = time_diff_hours <= 12
result = within_hr.value_counts()
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_rows['discharge_timestamp'] = pd.to_datetime(last_rows['discharge_timestamp'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_rows['observation_window_end'] = pd.to_datetime(last_rows['observation_window_end'])


True    297
Name: count, dtype: int64

In [16]:
#mimicdata = mimicdata[~mimicdata['admission_id'].isin(admissions_to_remove)]
# Update icu_mortality as True and mortality_after_discharge as False for the admissions who have both flags as True
mimicdata.loc[mimicdata['admission_id'].isin(admissions_set_icumortality), 'icu_mortality'] = True
mimicdata.loc[mimicdata['admission_id'].isin(admissions_set_icumortality), 'mortality_after_discharge'] = False

In [17]:
mimicdata.admission_id.nunique()

69181

# Remove first row - 0 hours since admission. Check later if can be used for declaring baseline variables

In [18]:
# Remove the first row for each admission_id
mimicdata_filtered = mimicdata.loc[mimicdata.index.difference(mimicdata.groupby('admission_id').head(1).index)]
#mimicdata = mimicdata.groupby('admission_id', group_keys=False).apply(lambda group: group.iloc[1:])

# Remove last row if time difference is less than or equal to 5 hours

In [19]:
# Group by admission_id and apply logic
def filter_last_row(group):
    # Get the last row
    last_row = group.iloc[-1]
    # Calculate time difference in hours
    time_diff = (last_row['discharge_timestamp'] - last_row['observation_window_end']).total_seconds() / 3600
    # Remove the last row if time difference <= 5 hours
    if time_diff <= 5.0:
        group = group.iloc[:-1]
    return group

# Apply the filtering function
mimicdata = mimicdata.groupby('admission_id', group_keys=False).apply(filter_last_row)

  mimicdata = mimicdata.groupby('admission_id', group_keys=False).apply(filter_last_row)


In [20]:
mimicdata.shape

(480205, 186)

In [21]:
mimicdata.admission_id.nunique()

68409

# Add 't0'

In [22]:
mimicdata.loc[:, 't0'] = np.nan

In [23]:
# Group by 'admission_id' and create an incremental index for 't0'
mimicdata['t0'] = mimicdata.groupby('admission_id').cumcount()

# Add A, Y and D

In [24]:
mimicdata.loc[:, 'A'] = np.nan
mimicdata.loc[:, 'D'] = np.nan #Only when A=0 AND (icu_mortality occurs, D=1 or no icu mortality, D=0). If A=1, D=np.nan
mimicdata.loc[:, 'Y'] = np.nan

In [25]:
# Case 1: discharge_timestamp > observation_window_end (A = 0)
mimicdata.loc[mimicdata['discharge_timestamp'] > mimicdata['observation_window_end'], 'A'] = 0 #1 / (1 + np.exp(-0))  # A = 0

# Case 2: discharge_timestamp <= observation_window_end (A = 1)
mimicdata.loc[mimicdata['discharge_timestamp'] <= mimicdata['observation_window_end'], 'A'] = 1 #1 / (1 + np.exp(-1))  # A = 1


# Update "A" to 0 if "icu_mortality" is True at the last row of each 'admission_id'
mimicdata.loc[mimicdata.groupby('admission_id').tail(1).index, 'A'] = mimicdata.groupby('admission_id').tail(1)['icu_mortality'].apply(lambda x: 0 if x else 1)

In [26]:
# Fill in D and Y

# A = 0 (continued ICU care)
mimicdata.loc[(mimicdata['A'] == 0) & (mimicdata['icu_mortality'] == False), ['Y', 'D']] = [np.nan, 0]
mimicdata.loc[(mimicdata['A'] == 0) & (mimicdata['icu_mortality'] == True),  ['Y', 'D']] = [np.nan, 1]

# A = 1 (discharged)
mimicdata.loc[(mimicdata['A'] == 1) & (mimicdata['mortality_after_discharge'] == False), ['Y', 'D']] = [0, np.nan]
mimicdata.loc[(mimicdata['A'] == 1) & (mimicdata['mortality_after_discharge'] == True),  ['Y', 'D']] = [1, np.nan]


In [27]:
mimicdata.mortality_after_discharge.value_counts()

mortality_after_discharge
False    461027
True      19178
Name: count, dtype: int64

In [28]:
mimicdata = mimicdata.reset_index(drop=True)

In [29]:
# Sort by admission_id and time first
mimicdata = mimicdata.sort_values(['admission_id', 't0'])

# Identify last rows for each admission
last_rows = mimicdata.groupby('admission_id').tail(1).copy()

# Compute All_M = Y OR D for those last rows only (no NaN filling)
last_rows['All_M'] = (
    (last_rows['mortality_after_discharge'] | last_rows['icu_mortality'])
    .astype('Int64')
)

# Merge back: assign All_M to only the last rows
mimicdata = mimicdata.merge(
    last_rows[['admission_id', 't0', 'All_M']],
    on=['admission_id', 't0'],
    how='left'
)

# Rename columns
mimicdata = mimicdata.reset_index(drop=True)
mimicdata = mimicdata.rename(columns={'Y': 'Y_old', 'All_M': 'Y'})


In [30]:
mimicdata.groupby('admission_id').tail(1)['Y'].mean()

0.11087722375710798

In [32]:
mimicdata.admission_id.nunique()

68409

In [31]:
mimicdata.to_parquet('../../../Datasets/Rounded_12h/cleaned-remove-0-5hrsdisch_A_allY_D_df_model_12h_forward_rounded_12h_windows.parquet')

In [33]:
mimicdata.admission_id.unique()

array(['00010109_19373873', '00010401_19576610', '00010411_13282748', ...,
       '99122709_18855147', '99123001_18821803', '99123123_11304959'],
      dtype=object)

In [34]:
mimicdata.loc[mimicdata.admission_id == '00010401_19576610']

Unnamed: 0,hash_patient_id,admission_timestamp,observation_window_end,activated_partial_thromboplastin_time__change_since_previous__last_12h,activated_partial_thromboplastin_time__last__last_12h,age__last__overall,alanine_transaminase__change_since_previous__last_12h,alanine_transaminase__last__last_12h,albumin__change_since_previous__last_12h,albumin__last__last_12h,...,readmission_or_mortality_after_discharge,outcome_days,g2p_run_timestamp,g2p_version,admission_id,t0,A,D,Y_old,Y
7,19576610,2200-01-04 01:00:00,2200-01-04 01:00:00,,,79.300003,,,,,...,False,7,2025-01-15 15:03:36.792654,2.1.1,00010401_19576610,0,0.0,0.0,,
8,19576610,2200-01-04 01:00:00,2200-01-04 13:00:00,,28.1,79.300003,,99.0,,28.0,...,False,7,2025-01-15 15:03:36.792654,2.1.1,00010401_19576610,1,0.0,0.0,,
9,19576610,2200-01-04 01:00:00,2200-01-05 01:00:00,,,79.300003,,,,,...,False,7,2025-01-15 15:03:36.792654,2.1.1,00010401_19576610,2,0.0,0.0,,
10,19576610,2200-01-04 01:00:00,2200-01-05 13:00:00,,27.799999,79.300003,,87.0,,,...,False,7,2025-01-15 15:03:36.792654,2.1.1,00010401_19576610,3,1.0,,0.0,0.0


In [35]:
mimicdata.groupby('admission_id').tail(1)['Y_old'].mean()

0.03208147676639083

# Missing percentage of all columns per admission_id