In [6]:
!pip install imblearn
!pip install lib
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [2]:
# Import libraries
import math
import warnings
from typing import Dict, Literal
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore
from imblearn.over_sampling import SMOTE
import torch
import torch.nn.functional as F
import torch.optim
from torch import Tensor
from tqdm.std import tqdm
warnings.resetwarnings()
import lib
import torchsummary
!pip install torchinfo
from torchinfo import summary
!pip install pytorch-ignite
from ignite.handlers import EarlyStopping
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report



In [3]:
# Load dataset
df = pd.read_csv('./SepsisData.csv', header=None, low_memory=False)

# Step 1: Drop the first row (original header) and set a new header
df = df.drop(index=0)
df.columns = df.iloc[0]
df = df.drop(index=1)

# Define the columns based on the categories
numeric_features = [
    'age', 'BMI', 'gcs', 'sirs', 'apsiii', 'lods', 'oasis', 'sapsii', 'sofa_total',
    'sofa_respiration', 'sofa_coagulation', 'sofa_liver', 'sofa_cardiovascular',
    'sofa_cns', 'sofa_renal', 'urineoutput_1stday', 'hematocrit_min', 'hematocrit_max',
    'hemoglobin_min', 'hemoglobin_max', 'platelets_min', 'platelets_max', 'wbc_min',
    'wbc_max', 'albumin_min', 'albumin_max', 'aniongap_min', 'aniongap_max', 'bicarbonate_min',
    'bicarbonate_max', 'calcium_min', 'calcium_max', 'chloride_min', 'chloride_max',
    'glucose_mean', 'sodium_min', 'sodium_max', 'potassium_min', 'potassium_max', 'bun_max',
    'creatinine_max', 'INR_min', 'INR_max', 'PT_min', 'PT_max', 'ptt_min', 'ptt_max',
    'ALT_max', 'ALP_max', 'AST_max', 'bilirubin_total_max', 'ld_ldh_max', 'heart_rate_max',
    'SBP_mean', 'DBP_mean', 'mbp_mean', 'resp_rate_min', 'resp_rate_max', 'temperature_min',
    'temperature_max', 'SpO2_min', 'lactate_max_bg', 'pCO2_min_bg', 'pCO2_max_bg',
    'baseexcess_min_bg', 'baseexcess_max_bg'
]

categorical_features = [
    'gender_M1F0', 'Myocardial_infarction', 'Congestive_heart_failure', 'Peripheral_vascular_disease',
    'Cerebrovascular_disease', 'Dementia', 'Chronic_pulmonary_disease', 'Rheumatic_disease',
    'peptic_ulcer_disease', 'mild_liver_disease', 'Diabetes', 'Hemiplegia_paraplegia',
    'renal_disease', 'malignancy', 'Moderate_or_severe_liver_disease', 'Metastatic_solid_tumor',
    'AIDS', 'vasoactive drug ', 'dobutamine', 'vasopressin', 'phenylephrine', 'norepinephrine',
    'dopamine', 'milrinone', 'epinephrine', 'MV'
]

label_encoding_feature = 'race'
output_features = ['death_28day', 'death_90day', 'death_1year']

# Step 2: Handle missing values
# Fill numeric columns with mean
for col in numeric_features:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Ensure numeric dtype
    df[col] = df[col].fillna(df[col].mean())
    df[col] = df[col].astype('int64')

# Fill categorical columns with mode
for col in categorical_features:
    df[col] = df[col].fillna(df[col].mode()[0])
    df[col] = df[col].astype('int64')

# Step 3: Label encode the 'race' column
label_encoder = LabelEncoder()
df[label_encoding_feature] = label_encoder.fit_transform(df[label_encoding_feature].fillna(df[label_encoding_feature].mode()[0]))

for col in output_features:
    df[col] = df[col].astype('int64')

# Step 4: Normalize all numeric columns using z-score
for col in numeric_features:
    df[col] = zscore(df[col])

# Step 5: Extract X (features) and Y (target)
Y = df['death_1year']  # Target column
X = df.drop(columns=['death_1year', 'death_28day', 'death_90day','MV'])  # Features

#Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

# Use RandomUnderSampler for imbalanced dataset
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_train_resampled, Y_train_resampled = rus.fit_resample(X_train, Y_train)
categorical_features.pop(25)
# print("X_train_resampled shape:", X_train_resampled.shape)
# print("Y_train_resampled shape:", Y_train_resampled.shape)
# Split training data into training and validation sets (e.g., 80% train, 20% val)
X_train_resampled, X_val, Y_train_resampled, Y_val = train_test_split(
    X_train_resampled, Y_train_resampled, test_size=0.2, random_state=42
)

# Convert the DataFrames to NumPy arrays before converting them to PyTorch tensors
X_train_tensor = torch.tensor(X_train_resampled.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train_resampled.to_numpy(), dtype=torch.long)
Y_test_tensor = torch.tensor(Y_test.to_numpy(), dtype=torch.long)

X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
Y_val_tensor = torch.tensor(Y_val.to_numpy(), dtype=torch.long)

In [4]:
# Count the number of rows with label 0 and label 1 in the training set
label_counts = df['death_1year'].value_counts()
print("Label Counts:")
print(label_counts)

# print("X_train_resampled shape:", X_train_resampled.shape)
# print("Y_train_resampled shape:", Y_train_resampled.shape)

print("X_train_tensor shape:", X_train_tensor.shape)
print("Y_train_tensor shape:", Y_train_tensor.shape)
print("X_val_tensor shape:", X_val_tensor.shape)
print("Y_val_tensor shape:", Y_val_tensor.shape)
print("X_test_tensor shape:", X_test_tensor.shape)
print("Y_test_tensor shape:", Y_test_tensor.shape)


# Check the count of each label after undersampling
label_counts = pd.Series(Y_train_resampled).value_counts()
print("Count of each label after undersampling:")
print(label_counts)

label_counts_testset = pd.Series(Y_test_tensor).value_counts()
print("Count of each label in testset:")
print(label_counts_testset)

Label Counts:
death_1year
0    14570
1     4230
Name: count, dtype: int64
X_train_tensor shape: torch.Size([5414, 92])
Y_train_tensor shape: torch.Size([5414])
X_val_tensor shape: torch.Size([1354, 92])
Y_val_tensor shape: torch.Size([1354])
X_test_tensor shape: torch.Size([3760, 92])
Y_test_tensor shape: torch.Size([3760])
Count of each label after undersampling:
death_1year
1    2719
0    2695
Name: count, dtype: int64
Count of each label in testset:
0    2914
1     846
Name: count, dtype: int64
