In [36]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset
import sys
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pickle
from sklearn.utils import resample

In [37]:
# Load Custom Functions
sys.path.append('./model')
from custom_functions import load_raw_data, extract_icd_codes, extract_dynamic_data_dict, extract_demographic_features, summarize_dynamic_features

# Load Data

In [38]:
# Define the percentage to load
percentage = '100%'  # Change this to '5%', '10%', etc., as needed

# Base directory for the data subsets
base_dir = f'./data/subsets/{percentage}_subsets/'

# Load Labels
labels = pd.read_csv(f'{base_dir}labels.csv')
stay_ids = labels['stay_id'].unique()

# Load static features
icd_features = pd.read_pickle(f'{base_dir}icd_code_features.pkl')

# Load summarized dynamic features
dynamic_data_df = pd.read_pickle(f'{base_dir}dynamic_data_df.pkl')

# Load demographic features
demographic_features = pd.read_pickle(f'{base_dir}demographic_features.pkl')

# Print information to confirm the files are loaded
print(f"Files for {percentage} subset loaded successfully.")
print(f"Number of stays: {len(stay_ids)}")
print(f"ICD Features shape: {icd_features.shape}")
print(f"Dynamic Features shape: {dynamic_data_df.shape}")
print(f"Demographic Features shape: {demographic_features.shape}")

Files for 100% subset loaded successfully.
Number of stays: 47581
ICD Features shape: (47581, 1458)
Dynamic Features shape: (570972, 835)
Demographic Features shape: (47581, 4)


In [39]:
# Concatenating the DataFrames along the columns (axis=1)
all_static_features = pd.concat([icd_features, demographic_features], axis=1)

# Train Test Split before any other operation to avoid Data Leakage

In [40]:
# Split data into training and temp sets (temp will be split into validation and test)
train_stays, temp_stays = train_test_split(labels, test_size=0.3, random_state=42, stratify=labels['label'])

# Now split temp into test and validation sets equally
test_stays, val_stays = train_test_split(temp_stays, test_size=0.5, random_state=42, stratify=temp_stays['label'])

In [41]:
# Order by stay_id
train_stays = train_stays.sort_values("stay_id")
test_stays = test_stays.sort_values("stay_id")
val_stays = val_stays.sort_values("stay_id")

In [42]:
# Convert label columns directly to tensors
label_tensor_train = torch.tensor(train_stays['label'].values, dtype=torch.float32)
label_tensor_test = torch.tensor(test_stays['label'].values, dtype=torch.float32)
label_tensor_val = torch.tensor(val_stays['label'].values, dtype=torch.float32)

# Print shapes to confirm
print("Label Train Tensor shape:", label_tensor_train.shape)
print("Label Test Tensor shape:", label_tensor_test.shape)
print("Label Validation Tensor shape:", label_tensor_val.shape)

# Calculate the fraction of positive labels (label=1)
fraction_positive_train = train_stays['label'].mean()
fraction_positive_test = test_stays['label'].mean()
fraction_positive_val = val_stays['label'].mean()

print("Fraction of labels that are 1:", fraction_positive_train)
print("Fraction of labels that are 1:", fraction_positive_test)
print("Fraction of labels that are 1:", fraction_positive_val)

Label Train Tensor shape: torch.Size([33306])
Label Test Tensor shape: torch.Size([7137])
Label Validation Tensor shape: torch.Size([7138])
Fraction of labels that are 1: 0.08280790248003363
Fraction of labels that are 1: 0.08280790248003363
Fraction of labels that are 1: 0.08279630148500981


# Scaling & Encoding of Static Features

In [43]:
# Create Slices of all_static features for each set
static_features_train = all_static_features.loc[train_stays['stay_id']]
static_features_val = all_static_features.loc[val_stays['stay_id']]
static_features_test = all_static_features.loc[test_stays['stay_id']]

Continous Features

In [44]:
# Selecting Continious Columns for the Different Sets
continous_static_columns_train = static_features_train[["Age"]]
continous_static_columns_val = static_features_val[["Age"]]
continous_static_columns_test = static_features_test[["Age"]]

In [45]:
# Train Standard Scaler on the Train Set and apply it to Train, Validation and Test Set 
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(continous_static_columns_train)

# Scale Training Set in a way that yields a Data Frame again
continous_static_columns_train = pd.DataFrame(
    scaler.transform(continous_static_columns_train),
    index=continous_static_columns_train.index,
    columns=continous_static_columns_train.columns
)

# Scale Training Set in a way that yields a Data Frame again
continous_static_columns_val = pd.DataFrame(
    scaler.transform(continous_static_columns_val),
    index=continous_static_columns_val.index,
    columns=continous_static_columns_val.columns
)

# Scale Training Set in a way that yields a Data Frame again
continous_static_columns_test = pd.DataFrame(
    scaler.transform(continous_static_columns_test),
    index=continous_static_columns_test.index,
    columns=continous_static_columns_test.columns
)

Categorical Features

In [46]:
# Selecting Categorical Columns for the Different Sets
categorical_columns = ['gender', 'ethnicity', 'insurance']  
categorical_static_columns_train = static_features_train[categorical_columns]
categorical_static_columns_val = static_features_val[categorical_columns]
categorical_static_columns_test = static_features_test[categorical_columns]

In [47]:
# Initialize the encoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder on the training data
encoder.fit(categorical_static_columns_train)

# Apply the encoder to the train, validation, and test data
categorical_static_columns_train_encoded = pd.DataFrame(
    encoder.transform(categorical_static_columns_train),
    index=categorical_static_columns_train.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)

categorical_static_columns_val_encoded = pd.DataFrame(
    encoder.transform(categorical_static_columns_val),
    index=categorical_static_columns_val.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)

categorical_static_columns_test_encoded = pd.DataFrame(
    encoder.transform(categorical_static_columns_test),
    index=categorical_static_columns_test.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)



Boolean Features (ICD Codes)

In [48]:
# Select Boolean columns for each set (ICD Codes)
bool_static_columns_train = static_features_train[icd_features.columns]
bool_static_columns_val = static_features_val[icd_features.columns]
bool_static_columns_test = static_features_test[icd_features.columns]

In [49]:
# Convert Boolean Columns to Float for Consistent Formatting
bool_static_columns_train = bool_static_columns_train.astype(float)
bool_static_columns_val = bool_static_columns_val.astype(float)
bool_static_columns_test = bool_static_columns_test.astype(float)

Combine Data Frames Back into one per Set

In [50]:
# Combine all parts into one DataFrame using pd.concat
processed_static_features_train = pd.concat([continous_static_columns_train, categorical_static_columns_train_encoded, bool_static_columns_train], axis=1)
processed_static_features_test = pd.concat([continous_static_columns_test, categorical_static_columns_test_encoded, bool_static_columns_test], axis=1)
processed_static_features_val = pd.concat([continous_static_columns_val, categorical_static_columns_val_encoded, bool_static_columns_val], axis=1)

Create Tensors

In [51]:
# Ensure sorting by stay_id
processed_static_features_train = processed_static_features_train.sort_index()
processed_static_features_test = processed_static_features_test.sort_index()
processed_static_features_val = processed_static_features_val.sort_index()

In [52]:
# Convert reindexed static data to tensors
static_train_tensor = torch.tensor(processed_static_features_train.values, dtype=torch.float32)
static_test_tensor = torch.tensor(processed_static_features_test.values, dtype=torch.float32)
static_val_tensor = torch.tensor(processed_static_features_val.values, dtype=torch.float32)

# Print the shapes to confirm the reordering
print("Reordered Static Train Tensor shape:", static_train_tensor.shape)
print("Reordered Static Test Tensor shape:", static_test_tensor.shape)
print("Reordered Static Validation Tensor shape:", static_val_tensor.shape)

Reordered Static Train Tensor shape: torch.Size([33306, 1497])
Reordered Static Test Tensor shape: torch.Size([7137, 1497])
Reordered Static Validation Tensor shape: torch.Size([7138, 1497])


# Preprocess Dynamic Data

In [53]:
# Flatten the multi-level columns into a single level
flattened_columns = ['_'.join(map(str, col)) for col in dynamic_data_df.columns]

# Update the DataFrame with flattened columns
dynamic_data_df.columns = flattened_columns

# Rename 'stay_id_' column to 'stay_id'
dynamic_data_df = dynamic_data_df.rename(columns={'stay_id_': 'stay_id'})

In [54]:
# Keep timestep as column to ensure right temporal order
dynamic_data_df["timestep"] = dynamic_data_df.index

In [55]:
# Create Subsets of Dynamic Data Dataframe for Test Train and Validation Set
dynamic_train_data = dynamic_data_df[dynamic_data_df["stay_id"].isin(train_stays["stay_id"])]
dynamic_test_data = dynamic_data_df[dynamic_data_df["stay_id"].isin(test_stays["stay_id"])]
dynamic_val_data = dynamic_data_df[dynamic_data_df["stay_id"].isin(val_stays["stay_id"])]

Scaling

In [56]:
# Initialize a scaler for dynamic features
dynamic_scaler = StandardScaler()

In [57]:
# Separate the stay_id before scaling
train_stay_ids = dynamic_train_data.pop('stay_id')
test_stay_ids = dynamic_test_data.pop('stay_id')
val_stay_ids = dynamic_val_data.pop('stay_id')

In [58]:
# Normalize the dynamic features
dynamic_train_norm_array = dynamic_scaler.fit_transform(dynamic_train_data)
dynamic_test_norm_array = dynamic_scaler.transform(dynamic_test_data)
dynamic_val_norm_array = dynamic_scaler.transform(dynamic_val_data)

In [59]:
# Convert the resulting arrays back to a DataFrame
dynamic_train_data_norm = pd.DataFrame(
    dynamic_train_norm_array, 
    index=dynamic_train_data.index, 
    columns=dynamic_train_data.columns
)
dynamic_train_data_norm['stay_id'] = train_stay_ids.values

dynamic_test_data_norm = pd.DataFrame(
    dynamic_test_norm_array, 
    index=dynamic_test_data.index, 
    columns=dynamic_test_data.columns
)
dynamic_test_data_norm['stay_id'] = test_stay_ids.values

dynamic_val_data_norm = pd.DataFrame(
    dynamic_val_norm_array, 
    index=dynamic_val_data.index, 
    columns=dynamic_val_data.columns
)
dynamic_val_data_norm['stay_id'] = val_stay_ids.values

In [60]:
# Add timestep as column to ensure temporal order
dynamic_train_data_norm["timestep"] = dynamic_train_data_norm.index
dynamic_test_data_norm["timestep"] = dynamic_test_data_norm.index
dynamic_val_data_norm["timestep"] = dynamic_val_data_norm.index

In [61]:
# Sort data by stay_id and timestep
dynamic_train_data_norm = (dynamic_train_data_norm
                                  .sort_values('stay_id')
                                  .groupby('stay_id')
                                  .apply(lambda x: x.sort_index())
                                  .reset_index(drop=True))

dynamic_test_data_norm = (dynamic_test_data_norm
                                  .sort_values('stay_id')
                                  .groupby('stay_id')
                                  .apply(lambda x: x.sort_index())
                                  .reset_index(drop=True))

dynamic_val_data_norm = (dynamic_val_data_norm
                                  .sort_values('stay_id')
                                  .groupby('stay_id')
                                  .apply(lambda x: x.sort_index())
                                  .reset_index(drop=True))

In [62]:
# Make sure dynamic data is ordered by stay_id and timestep
dynamic_train_data_norm = dynamic_train_data_norm.sort_values(["stay_id", "timestep"])
dynamic_test_data_norm = dynamic_test_data_norm.sort_values(["stay_id", "timestep"])
dynamic_val_data_norm = dynamic_val_data_norm.sort_values(["stay_id", "timestep"])

Create Dynamic Tensors

In [63]:
# Copy dataframes before conversion to tensor to keep the original for further processing
dynamic_train_data_norm_copy = dynamic_train_data_norm.copy()
dynamic_test_data_norm_copy = dynamic_test_data_norm.copy()
dynamic_val_data_norm_copy = dynamic_val_data_norm.copy()

In [64]:
# Drop stay_id and timestep before conversion to tensor
dynamic_train_data_norm_copy.drop(columns=["stay_id", "timestep"], inplace=True)
dynamic_test_data_norm_copy.drop(columns=["stay_id", "timestep"], inplace=True)
dynamic_val_data_norm_copy.drop(columns=["stay_id", "timestep"], inplace=True)

In [65]:
num_time_steps = 12

def reshape_dynamic_data(df, num_time_steps):
    # This will ensure all stays have the required number of time steps
    output_array = []
    stay_id_order = []
    for stay_id, group in df.groupby('stay_id'):
        if len(group) == num_time_steps:
            output_array.append(group.drop(columns=['stay_id']).values)
            stay_id_order.append(stay_id)
        else:
            print(f"Stay ID {stay_id} has {len(group)} time steps, expected {num_time_steps}")
    return np.array(output_array), stay_id_order

In [66]:
# Reshape the data and capture stay_id order
dynamic_features_train_array, train_stay_order = reshape_dynamic_data(dynamic_train_data_norm, num_time_steps)
dynamic_features_test_array, test_stay_order = reshape_dynamic_data(dynamic_test_data_norm, num_time_steps)
dynamic_features_val_array, val_stay_order = reshape_dynamic_data(dynamic_val_data_norm, num_time_steps)

# Convert numpy arrays to tensors
dynamic_train_tensor = torch.tensor(dynamic_features_train_array, dtype=torch.float32)
dynamic_test_tensor = torch.tensor(dynamic_features_test_array, dtype=torch.float32)
dynamic_val_tensor = torch.tensor(dynamic_features_val_array, dtype=torch.float32)

# Print shapes to confirm
print("Dynamic Train Tensor shape:", dynamic_train_tensor.shape)  # Expected: [num_stays, 12, num_features]
print("Dynamic Test Tensor shape:", dynamic_test_tensor.shape)
print("Dynamic Validation Tensor shape:", dynamic_val_tensor.shape)

Dynamic Train Tensor shape: torch.Size([33306, 12, 835])
Dynamic Test Tensor shape: torch.Size([7137, 12, 835])
Dynamic Validation Tensor shape: torch.Size([7138, 12, 835])


In [67]:
# Path to save tensors
tensor_save_path = f'./data/tensors/{percentage}_subset'

# Create directory if it doesn't exist
if not os.path.exists(tensor_save_path):
    os.makedirs(tensor_save_path)

# Save tensors
torch.save(dynamic_train_tensor, os.path.join(tensor_save_path, 'dynamic_train_tensor.pt'))
torch.save(static_train_tensor, os.path.join(tensor_save_path, 'static_train_tensor.pt'))
torch.save(label_tensor_train, os.path.join(tensor_save_path, 'label_train_tensor.pt'))

torch.save(dynamic_test_tensor, os.path.join(tensor_save_path, 'dynamic_test_tensor.pt'))
torch.save(static_test_tensor, os.path.join(tensor_save_path, 'static_test_tensor.pt'))
torch.save(label_tensor_test, os.path.join(tensor_save_path, 'label_test_tensor.pt'))

torch.save(dynamic_val_tensor, os.path.join(tensor_save_path, 'dynamic_val_tensor.pt'))
torch.save(static_val_tensor, os.path.join(tensor_save_path, 'static_val_tensor.pt'))
torch.save(label_tensor_val, os.path.join(tensor_save_path, 'label_val_tensor.pt'))

# Create Oversampled Data

Oversample Train Labels

In [68]:
# Separate the classes
majority = train_stays[train_stays['label'] == 0]
minority = train_stays[train_stays['label'] == 1]

# Upsample minority class
minority_upsampled = resample(minority,
                              replace=True,            # sample with replacement
                              n_samples=len(majority), # to match majority class
                              random_state=123)        # reproducible results

# Combine majority class with upsampled minority class
oversampled_train_labels = pd.concat([majority, minority_upsampled])

# Add a new column to uniquely identify each row even after upsampling
oversampled_train_labels['stay_id_replica'] = oversampled_train_labels.groupby('stay_id').cumcount()

# Create a unique identifier combining the original stay_id with the replica indicator
oversampled_train_labels['unique_stay_id'] = oversampled_train_labels['stay_id'].astype(str) + "_" + oversampled_train_labels['stay_id_replica'].astype(str)

In [69]:
# Sort the oversampled labels by stay_id and replica number
oversampled_train_labels = oversampled_train_labels.sort_values(by=["stay_id", "stay_id_replica"])

# Convert label columns directly to tensors
label_tensor_train_oversampled = torch.tensor(oversampled_train_labels['label'].values, dtype=torch.float32)

print("Oversampled Label Tensor shape:", label_tensor_train_oversampled.shape)

Oversampled Label Tensor shape: torch.Size([61096])


Oversample Dynamic Train Data

In [70]:
# Count stay_ids repetitions
stay_id_counts = oversampled_train_labels['stay_id'].value_counts()

# Add the count to dynamic_train_data
dynamic_train_data_norm_with_count = dynamic_train_data_norm.merge(stay_id_counts, on='stay_id')

: 

In [35]:
# Calculate the total number of rows for the expanded dataframe
total_rows = dynamic_train_data_norm_with_count['count'].sum()

# Prepare an empty list to hold the expanded data
expanded_data = []

# Prepare an empty DataFrame with the same columns plus the unique_stay_id
columns_with_id = dynamic_train_data_norm_with_count.columns.tolist() + ['replica']
oversampled_dynamic_train_data_norm = pd.DataFrame(columns=columns_with_id)

# Use iterrows to duplicate each row and add a unique identifier
all_rows = []  # List to collect all new rows

for index, row in dynamic_train_data_norm_with_count.iterrows():
    stay_id = row['stay_id']  # Get the stay_id from the row
    for i in range(int(row['count'])):  # Duplicate the row 'count' times
        new_row = row.copy()  # Copy the row to modify it without affecting the original DataFrame
        new_row['replica'] = i  # Create a unique identifier for each duplicated row
        all_rows.append(new_row)  # Append the modified row to the list

# Convert the list of rows back into a DataFrame
oversampled_dynamic_train_data_norm = pd.DataFrame(all_rows, columns=columns_with_id)

In [36]:
# Sort Dataframe by stay_id, replica and timestep
oversampled_dynamic_train_data_norm = oversampled_dynamic_train_data_norm.sort_values(["stay_id", "replica", "timestep"])

# Create unique_stay_id column
oversampled_dynamic_train_data_norm["unique_stay_id"] = oversampled_dynamic_train_data_norm['stay_id'].astype(str) + "_" + oversampled_dynamic_train_data_norm["replica"].astype(str)

# Drop meaningless columns before coversion to tensor
oversampled_dynamic_train_data_norm = oversampled_dynamic_train_data_norm.drop(columns=["stay_id", "replica", "timestep", "count"])

In [37]:
### Oversampled Dynamic Tensor
def reshape_oversampled_dynamic_data(df, num_time_steps):
    # This will ensure all stays have the required number of time steps
    output_array = []
    stay_id_order = []
    for unique_stay_id, group in df.groupby('unique_stay_id'):
        if len(group) == num_time_steps:
            # Drop both 'unique_stay_id' and 'stay_id' if present to clean up non-predictive identifiers
            clean_group = group.drop(columns=['unique_stay_id'], errors='ignore')
            output_array.append(clean_group.values)
            stay_id_order.append(unique_stay_id)  # Keep tracking 'stay_id' for mapping predictions back to entities
        else:
            print(f"Stay ID {stay_id} has {len(group)} time steps, expected {num_time_steps}")
    return np.array(output_array), stay_id_order
# Reshape the data and capture stay_id order

dynamic_features_train_array_oversampled, train_stay_order = reshape_oversampled_dynamic_data(oversampled_dynamic_train_data_norm, 12)

In [38]:
# Convert numpy arrays to tensors
dynamic_train_tensor_oversampled = torch.tensor(dynamic_features_train_array_oversampled, dtype=torch.float32)

print("Oversampled Dynamic Train Tensor shape:", dynamic_train_tensor_oversampled.shape)

Oversampled Dynamic Train Tensor shape: torch.Size([30548, 12, 834])


Oversample Static Train Data

In [39]:
# Make stay_id string
oversampled_train_labels["stay_id"] = oversampled_train_labels["stay_id"].astype(str)

# Reset index of static_features_train to make 'stay_id' a column
processed_static_features_train.reset_index(inplace=True)
processed_static_features_train.rename(columns={'index': 'stay_id'}, inplace=True)

# Convert stay_id to string
processed_static_features_train["stay_id"] = processed_static_features_train["stay_id"].astype(str)
stay_id_counts.index = stay_id_counts.index.astype(str)

# Add counts and unique stay_id to features
proc_static_features_train_with_counts = processed_static_features_train.merge(stay_id_counts, left_on='stay_id', right_index=True)

In [40]:
# Expand the DataFrame while tracking the repetition index
oversampled_list = []
for idx, row in proc_static_features_train_with_counts.iterrows():
    repeat_count = int(row['count'])
    for n in range(repeat_count):
        # Create a new row as a copy of the current one
        new_row = row.copy()
        # Generate a unique identifier for each repeated instance
        new_row['replica'] = n
        oversampled_list.append(new_row)

# Convert the list of Series objects into a DataFrame
proc_static_features_train_oversampled = pd.DataFrame(oversampled_list)

# Reset index as the append operation in the loop above will generate an index based on the original DataFrame
proc_static_features_train_oversampled.reset_index(drop=True, inplace=True)

In [41]:
# Sort both dataframes by stay_id and the replica number
proc_static_features_train_oversampled.sort_values(by=["stay_id", "replica"], inplace=True)
oversampled_train_labels.sort_values(by=["stay_id", "stay_id_replica"], inplace=True)

In [42]:
## Check that the unique_stay_ids are identical 
# Extract unique_stay_id columns
proc_static_features_train_oversampled["unique_stay_id"] = proc_static_features_train_oversampled["stay_id"].astype(str) + "_" + proc_static_features_train_oversampled["replica"].astype(str)
proc_unique_ids = proc_static_features_train_oversampled['unique_stay_id']
labels_unique_ids = oversampled_train_labels['unique_stay_id']

# Reset indices before comparison to avoid issues with misaligned indices
proc_unique_ids_reset = proc_unique_ids.reset_index(drop=True)
labels_unique_ids_reset = labels_unique_ids.reset_index(drop=True)

# Check if both columns are identical
ids_match = proc_unique_ids_reset.equals(labels_unique_ids_reset)

# Print the result
print("Do the unique_stay_id columns match across both DataFrames?", ids_match)

# Find differences (if any)
if not ids_match:
    diff = proc_unique_ids_reset[proc_unique_ids_reset != labels_unique_ids_reset]
    print("Differences found at indices:", diff.index.tolist())

Do the unique_stay_id columns match across both DataFrames? True


In [43]:
# Drop meaningless columns before creating tensors
proc_static_features_train_oversampled.drop(columns=["unique_stay_id", "count", "replica", "stay_id"], inplace=True)

# Convert reindexed static data to tensors
static_train_tensor_oversampled = torch.tensor(proc_static_features_train_oversampled.values, dtype=torch.float32)
# Print the shapes to confirm the reordering
print("Reordered Static Train Tensor shape:", static_train_tensor_oversampled.shape)

Reordered Static Train Tensor shape: torch.Size([30548, 1497])


In [61]:
# Oversampled Train Tensors
torch.save(label_tensor_train_oversampled, os.path.join(tensor_save_path, 'label_tensor_train_oversampled.pt'))
torch.save(static_train_tensor_oversampled, os.path.join(tensor_save_path, 'static_train_tensor_oversampled.pt'))
torch.save(dynamic_train_tensor_oversampled, os.path.join(tensor_save_path, 'dynamic_train_tensor_oversampled.pt'))

In [64]:
# Delete Oversampled Objects to free up memory
del proc_static_features_train_oversampled, oversampled_dynamic_train_data_norm, dynamic_features_train_array_oversampled

# Create Data with repeated static features

In [33]:
# Make stay_id a column for all sets
# already a column for train set
processed_static_features_test.reset_index(inplace=True)
processed_static_features_val.reset_index(inplace=True)

processed_static_features_test.rename(columns={'index': 'stay_id'}, inplace=True)
processed_static_features_val.rename(columns={'index': 'stay_id'}, inplace=True)

In [34]:
# Repeat each row 12 times
repeats = 12

repeated_static_train_features = processed_static_features_train.loc[processed_static_features_train.index.repeat(repeats)].reset_index(drop=True)
repeated_static_test_features = processed_static_features_test.loc[processed_static_features_test.index.repeat(repeats)].reset_index(drop=True)
repeated_static_val_features = processed_static_features_val.loc[processed_static_features_val.index.repeat(repeats)].reset_index(drop=True)

In [35]:
# Create a running number for each repeat of 12 and concatenate with stay_id
repeated_static_train_features['sequence'] = np.tile(np.arange(repeats), len(processed_static_features_train))
repeated_static_train_features['stay_id_x'] = repeated_static_train_features['stay_id'].astype(str) + "_" + repeated_static_train_features['sequence'].astype(str)

repeated_static_test_features['sequence'] = np.tile(np.arange(repeats), len(processed_static_features_test))
repeated_static_test_features['stay_id_x'] = repeated_static_test_features['stay_id'].astype(str) + "_" + repeated_static_test_features['sequence'].astype(str)

repeated_static_val_features['sequence'] = np.tile(np.arange(repeats), len(processed_static_features_val))
repeated_static_val_features['stay_id_x'] = repeated_static_val_features['stay_id'].astype(str) + "_" + repeated_static_val_features['sequence'].astype(str)

KeyError: 'stay_id'

In [47]:
# Drop the sequence column if it's no longer needed
repeated_static_train_features.drop(columns='sequence', inplace=True)
repeated_static_test_features.drop(columns='sequence', inplace=True)
repeated_static_val_features.drop(columns='sequence', inplace=True)

Create stay_id_time_step index in Dynamic Data for merging with repeated static data

In [48]:
# Create a new column 'stay_id_x' by combining 'stay_id' and 'time_step'
dynamic_train_data_norm['stay_id_x'] = dynamic_train_data_norm['stay_id'].astype(str) + "_" + dynamic_train_data_norm['timestep'].astype(str)
dynamic_test_data_norm['stay_id_x'] = dynamic_test_data_norm['stay_id'].astype(str) + "_" + dynamic_test_data_norm['timestep'].astype(str)
dynamic_val_data_norm['stay_id_x'] = dynamic_val_data_norm['stay_id'].astype(str) + "_" + dynamic_val_data_norm['timestep'].astype(str)

In [71]:
# Convert all columns to float32 except 'stay_id' and 'stay_id_x'
cols = dynamic_train_data_norm.select_dtypes(include=['float64', 'int']).columns.difference(['stay_id', 'stay_id_x'])
dynamic_train_data_norm[cols] = dynamic_train_data_norm[cols].astype('float32')
dynamic_test_data_norm[cols] = dynamic_test_data_norm[cols].astype('float32')
dynamic_val_data_norm[cols] = dynamic_val_data_norm[cols].astype('float32')

NameError: name 'dynamic_train_data_norm' is not defined

Merge dynamic and static data on stay_id_x

In [65]:
# Perform the merge
all_train_features = pd.merge(
    dynamic_train_data_norm, 
    repeated_static_train_features, 
    on='stay_id_x', 
    how='inner', 
    suffixes=('_dynamic', '_static')  # Suffixes to resolve any other column name conflicts
)

all_test_features = pd.merge(
    dynamic_test_data_norm, 
    repeated_static_test_features, 
    on='stay_id_x', 
    how='inner', 
    suffixes=('_dynamic', '_static')  # Suffixes to resolve any other column name conflicts
)

all_val_features = pd.merge(
    dynamic_val_data_norm, 
    repeated_static_val_features, 
    on='stay_id_x', 
    how='inner', 
    suffixes=('_dynamic', '_static')  # Suffixes to resolve any other column name conflicts
)

In [69]:
# Make sure all Dataframes are sorted by stay_id and timestep
all_train_features = all_train_features.sort_values(["stay_id_static", "timestep"])
all_test_features = all_test_features.sort_values(["stay_id_static", "timestep"])
all_val_features = all_val_features.sort_values(["stay_id_static", "timestep"])

MemoryError: Unable to allocate 2.23 GiB for an array with shape (1497, 199836) and data type float64

In [51]:
# Drop redundant stay_id columns
all_train_features.drop(columns='stay_id_dynamic', inplace=True)
all_train_features.drop(columns='stay_id_x', inplace=True)

all_test_features.drop(columns='stay_id_dynamic', inplace=True)
all_test_features.drop(columns='stay_id_x', inplace=True)

all_val_features.drop(columns='stay_id_dynamic', inplace=True)
all_val_features.drop(columns='stay_id_x', inplace=True)

In [52]:
# Rename stay_id_static back to stay_id
all_train_features.rename(columns={'stay_id_static': 'stay_id'}, inplace=True)
all_test_features.rename(columns={'stay_id_static': 'stay_id'}, inplace=True)
all_val_features.rename(columns={'stay_id_static': 'stay_id'}, inplace=True)

In [53]:
# Drop not needed columns
dynamic_train_data_norm.drop(columns='stay_id_x', inplace=True)
dynamic_train_data_norm.drop(columns='timestep', inplace=True)

dynamic_test_data_norm.drop(columns='stay_id_x', inplace=True)
dynamic_test_data_norm.drop(columns='timestep', inplace=True)

dynamic_val_data_norm.drop(columns='stay_id_x', inplace=True)
dynamic_val_data_norm.drop(columns='timestep', inplace=True)

# Convert to Tensors

Repeated Static Features

In [54]:
# Reshape the data and capture stay_id order for all features
all_features_train_array, train_stay_order = reshape_dynamic_data(all_train_features, num_time_steps)
all_features_test_array, test_stay_order = reshape_dynamic_data(all_test_features, num_time_steps)
all_features_val_array, val_stay_order = reshape_dynamic_data(all_val_features, num_time_steps)

# Convert numpy arrays to tensors
train_tensor_repeated_static_features = torch.tensor(all_features_train_array, dtype=torch.float32)
test_tensor_repeated_static_features = torch.tensor(all_features_test_array, dtype=torch.float32)
val_tensor_repeated_static_features = torch.tensor(all_features_val_array, dtype=torch.float32)

# Print shapes to confirm
print("Repeated Static Train Tensor shape:", train_tensor_repeated_static_features.shape)  # Expected: [num_stays, 12, num_features]
print("Repeated Static Tensor shape:", test_tensor_repeated_static_features.shape)
print("Repeated Static Tensor shape:", val_tensor_repeated_static_features.shape)

Repeated Static Train Tensor shape: torch.Size([8326, 12, 2332])
Repeated Static Tensor shape: torch.Size([1784, 12, 2332])
Repeated Static Tensor shape: torch.Size([1785, 12, 2332])


# Save the tensors

In [55]:
# Save Repeated Static Tensors
torch.save(train_tensor_repeated_static_features, os.path.join(tensor_save_path, 'train_tensor_repeated_static_features.pt'))
torch.save(test_tensor_repeated_static_features, os.path.join(tensor_save_path, 'test_tensor_repeated_static_features.pt'))
torch.save(val_tensor_repeated_static_features, os.path.join(tensor_save_path, 'val_tensor_repeated_static_features.pt'))

Save Feature Names for Feature Importance Computation

In [56]:
# Define and save feature names for Feature Importance Calculation
feature_names_dynamic_features = dynamic_train_data_norm.columns

# Save feature names
feature_name_file = os.path.join(tensor_save_path, 'feature_names_dynamic_features.pkl')
with open(feature_name_file, 'wb') as file:
    pickle.dump(feature_names_dynamic_features, file)

# Define Feature Names for Feature importance estimation
feature_names_all_train_feaures=all_train_features.columns

# Save feature names
feature_name_file = os.path.join(tensor_save_path, 'feature_names_all_train_feaures.pkl')
with open(feature_name_file, 'wb') as file:
    pickle.dump(feature_names_all_train_feaures, file)