In [1]:
import os
import sys

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load Custom Functions
sys.path.append('./model')
from custom_functions import load_raw_data, extract_icd_codes, extract_dynamic_data_dict, extract_demographic_features, summarize_dynamic_features

Load labels_original.csv

In [3]:
# Read the original (full) labels table from disk
labels = pd.read_csv('./data/csv/labels_original.csv')

# Distinct stay identifiers present in the labels table, in order of first appearance
stay_ids = pd.unique(labels['stay_id'])

# Loop for Creating all Subsets

In [4]:
# Subset sizes to generate, as whole-number percentages of the full labels table.
# A value of 100 means "use every row".
percentages = [
    1, 5, 10, 25,
    50, 75, 100,
]

In [5]:
# For every requested percentage: draw a stratified subset of `labels`, write it
# to ./data/subsets/{pct}%_subsets/labels.csv, and then build and pickle the
# derived feature tables for that subset in the same folder.
for pct in percentages:
    # Each subset has its own output folder. Create it up front: pandas does not
    # create missing parent directories, so the writes below would otherwise
    # fail on a fresh checkout.
    subset_dir = f'./data/subsets/{pct}%_subsets'
    os.makedirs(subset_dir, exist_ok=True)

    # Fraction of rows to discard so that pct% of the rows remain
    test_size = 1 - (pct / 100.0)

    # If the percentage is 100, copy the full dataset; otherwise, perform
    # stratified sampling on the 'label' column (fixed seed for reproducibility)
    if pct == 100:
        subset_labels = labels.copy()
    else:
        subset_labels, _ = train_test_split(labels, test_size=test_size, random_state=42, stratify=labels['label'])

    # Save the subset to a new file
    subset_file = f'{subset_dir}/labels.csv'
    subset_labels.to_csv(subset_file, index=False)

    # Read the subset back so the steps below work from exactly what is on disk
    labels_subset = pd.read_csv(subset_file)
    # NOTE: stay_ids is not read again inside this loop; it is kept because it
    # overwrites the notebook-level variable of the same name.
    stay_ids = labels_subset['stay_id'].unique()

    # Load the raw data for the stays in this subset (custom helper)
    all_data = load_raw_data(labels_subset['stay_id'], data_path='./data/csv/')

    # ICD code features (custom helper) -> pickle
    icd_code_features = extract_icd_codes(all_data)
    icd_code_features.to_pickle(f'{subset_dir}/icd_code_features.pkl')

    # Demographic features (custom helper) -> pickle
    demographic_features = extract_demographic_features(all_data)
    demographic_features.to_pickle(f'{subset_dir}/demographic_features.pkl')

    # Dynamic data comes back as a mapping of stay_id -> frame (custom helper).
    # Tag each frame with its stay_id so they can be stacked into one table;
    # assign() returns a new frame, leaving the entries in the dict untouched.
    # assumes every value in the mapping is a DataFrame — TODO confirm
    dynamic_data_dict = extract_dynamic_data_dict(all_data)
    all_dynamic_values_with_id = [
        df.assign(stay_id=stay_id)
        for stay_id, df in dynamic_data_dict.items()
    ]

    # Concatenate all dynamic data with the stay_id -> pickle
    dynamic_data_df = pd.concat(all_dynamic_values_with_id)
    dynamic_data_df.to_pickle(f'{subset_dir}/dynamic_data_df.pkl')

    # Summary statistics for the dynamic data (custom helper) -> pickle
    sum_dynamic_features = summarize_dynamic_features(all_data)
    sum_dynamic_features.to_pickle(f'{subset_dir}/sum_dynamic_features.pkl')

    # Print progress
    print(f"Completed processing and saving for the {pct}% subset.")

Completed processing and saving for the 75% subset.
Completed processing and saving for the 100% subset.
