<a href="https://colab.research.google.com/github/qiyangsun/BDHA/blob/main/code/cse6250_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix, roc_auc_score
from torch.utils.data import DataLoader, TensorDataset,Dataset
import numpy as np

In [105]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [110]:
class SequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        """
        sequences: List of padded tensors, each tensor is a sequence.
        labels: Tensor containing labels corresponding to each sequence.
        """
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [118]:
# Hyperparameters
INPUT_SIZE = 69  # Features in the dataset
NUM_CLASSES = 2  # Binary classification

In [119]:
csv_file_path = "/content/drive/MyDrive/Colab Notebooks/tensor_sequence.csv"  # Update path accordingly
data = pd.read_csv(csv_file_path)
data.fillna(data.median(), inplace=True)


In [120]:
data

Unnamed: 0,batch_idx,time_idx,temperature_r,heartrate_r,resprate_r,o2sat_r,sbp_r,dbp_r,sepsis,stay_length_hrs,...,dbp_1,dbp_2,dbp_3,dbp_4,dbp_5,dbp_6,dbp_7,dbp_8,gender_encoded,race_r_encoded
0,0,0,97.5,102.0,18.0,0.0,131.0,71.0,0.0,9.95,...,5.837371,6.002668,5.910049,6.038254,5.900535,5.819177,5.943805,5.903796,0.0,3.0
1,0,1,98.0,91.0,16.0,98.0,138.0,63.0,0.0,9.95,...,3.916302,4.062122,3.905994,3.867325,4.052969,3.990040,3.869399,3.909681,0.0,3.0
2,0,2,98.1,100.0,18.0,98.0,132.0,78.0,0.0,9.95,...,7.116859,7.010448,7.137728,7.043869,6.927984,7.061059,7.034362,7.084827,0.0,3.0
3,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490386,4498,104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
490387,4498,105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
490388,4498,106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
490389,4498,107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0


In [121]:
X = data.loc[:, data.columns != "sepsis"]  # All columns except the last
y = data["sepsis"].values   # Last column

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [122]:
# Convert to PyTorch tensors and create DataLoader
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [123]:
# RNN Model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Add a dummy sequence dimension for RNN
        x = x.unsqueeze(1)
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Use the last time step
        return out
# Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}")

# Evaluation Function
def evaluate_model(model, test_loader):
    model.eval()
    y_true, y_pred, y_probs = [], [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, preds = torch.max(outputs, 1)
            probs = torch.softmax(outputs, dim=1)[:, 1]
            y_true.extend(y_batch.numpy())
            y_pred.extend(preds.numpy())
            y_probs.extend(probs.numpy())

    cm = confusion_matrix(y_true, y_pred)
    auc = roc_auc_score(y_true, y_probs)
    return cm, auc

In [125]:
import itertools

# Function to perform hyperparameter tuning
def hyperparameter_tuning(train_loader, model,test_loader, input_size, num_classes, param_grid, epochs):
    best_auc = 0
    best_params = None
    best_model = None

    # Generate all combinations of hyperparameters
    for params in itertools.product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Testing parameters: {param_dict}")

        # Initialize model, criterion, and optimizer with current parameters
        if model == 'RNN':
          model = RNNModel(input_size,
                          hidden_size=param_dict['hidden_size'],
                          num_layers=param_dict['num_layers'],
                          num_classes=num_classes)
        elif model == 'BiLSTM':
          model = BiLSTM(input_size,
                          hidden_size=param_dict['hidden_size'],
                          num_layers=param_dict['num_layers'],
                          num_classes=num_classes)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=param_dict['learning_rate'])

        # Train the model
        train_model(model, train_loader, criterion, optimizer, epochs)

        # Evaluate the model
        cm, auc = evaluate_model(model, test_loader)
        print(f"Confusion Matrix:\n{cm}")
        print(f"AUC: {auc:.4f}")

        # Update best model if necessary
        if auc > best_auc:
            best_auc = auc
            best_params = param_dict
            best_model = model

    print(f"Best Parameters: {best_params}")
    print(f"Best AUC: {best_auc:.4f}")
    return best_model, best_params, best_auc

# Hyperparameter grid
param_grid = {
    'hidden_size': [64, 128],
    'num_layers': [1, 2],
    'learning_rate': [0.0003,0.0005,0.0008],
}

# Perform hyperparameter tuning
best_model, best_params, best_auc = hyperparameter_tuning(
    train_loader=train_loader,
    test_loader=test_loader,
    input_size=INPUT_SIZE,
    model='RNN',
    num_classes=NUM_CLASSES,
    param_grid=param_grid,
    epochs=10,
)

print("Hyperparameter tuning complete.")
print(f"Best Model: {best_model}")
print(f"Best Parameters: {best_params}")
print(f"Best AUC: {best_auc:.4f}")

Testing parameters: {'hidden_size': 64, 'num_layers': 1, 'learning_rate': 0.0003}
Epoch [1/10], Loss: 0.0311
Epoch [2/10], Loss: 0.0184
Epoch [3/10], Loss: 0.0182
Epoch [4/10], Loss: 0.0179
Epoch [5/10], Loss: 0.0178
Epoch [6/10], Loss: 0.0177
Epoch [7/10], Loss: 0.0176
Epoch [8/10], Loss: 0.0175
Epoch [9/10], Loss: 0.0174
Epoch [10/10], Loss: 0.0174
Confusion Matrix:
[[96928   149]
 [  649   353]]
AUC: 0.9945
Testing parameters: {'hidden_size': 64, 'num_layers': 1, 'learning_rate': 0.0005}
Epoch [1/10], Loss: 0.0178
Epoch [2/10], Loss: 0.0177
Epoch [3/10], Loss: 0.0176
Epoch [4/10], Loss: 0.0176
Epoch [5/10], Loss: 0.0175
Epoch [6/10], Loss: 0.0176
Epoch [7/10], Loss: 0.0175


KeyboardInterrupt: 

###bi-LSTM

In [126]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Step 1: Dataset Preparation
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

def preprocess_data(filepath, target_col='sepsis', chunk_size=5000):
    # Load the dataset
    data = pd.read_csv(filepath)
    data.fillna(data.median(), inplace=True)  # Fill missing values
    features = [col for col in data.columns if col not in [target_col, 'unique_id']]

    # Sample a chunk if needed
    data = data.sample(n=chunk_size, random_state=42)
    X = data[features].values.astype(np.float32)
    y = data[target_col].values.astype(np.float32)

    # Normalize features
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Reshape X for LSTM (batch_size, seq_len=1, input_size)
    X = X.reshape(X.shape[0], 1, X.shape[1])

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    return X_train, X_test, y_train, y_test

# Step 2: Define Bi-LSTM Model
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # *2 for bidirectional

    def forward(self, x):
        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # 2 for bidirectional
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_len, hidden_size*2)

        # Decode the last hidden state
        out = self.fc(out[:, -1, :])  # Only take the output of the last time step
        return out

# Step 3: Train and Evaluate the Model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            labels = labels.unsqueeze(1)  # Match output dimensions

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

def evaluate_model(model, test_loader):
    model.eval()
    all_labels = []
    all_outputs = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)

            # Collect true labels and predicted probabilities
            all_labels.append(labels.cpu())
            all_outputs.append(torch.sigmoid(outputs).cpu())  # Apply sigmoid to get probabilities

    # Concatenate all predictions and labels
    all_labels = torch.cat(all_labels).numpy()
    all_outputs = torch.cat(all_outputs).numpy()

    # Calculate AUC
    auc_score = roc_auc_score(all_labels, all_outputs)
    print(f"AUC: {auc_score:.4f}")

In [128]:
# Filepath to dataset
filepath = "/content/drive/MyDrive/Colab Notebooks/tensor_sequence.csv"  # Replace with your dataset path

# Preprocess data
X_train, X_test, y_train, y_test = preprocess_data(filepath)

# Create PyTorch Datasets and DataLoaders
train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
test_dataset = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[2]
hidden_size = 64
output_size = 1  # Binary classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM(input_size, hidden_size, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Train and evaluate the model
train_model(model, train_loader, criterion, optimizer, num_epochs=30)
evaluate_model(model, test_loader)

Epoch [1/30], Loss: 0.6873
Epoch [2/30], Loss: 0.6686
Epoch [3/30], Loss: 0.6045
Epoch [4/30], Loss: 0.5619
Epoch [5/30], Loss: 0.5380
Epoch [6/30], Loss: 0.4748
Epoch [7/30], Loss: 0.4015
Epoch [8/30], Loss: 0.3958
Epoch [9/30], Loss: 0.3487
Epoch [10/30], Loss: 0.2818
Epoch [11/30], Loss: 0.2603
Epoch [12/30], Loss: 0.2161
Epoch [13/30], Loss: 0.1542
Epoch [14/30], Loss: 0.1405
Epoch [15/30], Loss: 0.1292
Epoch [16/30], Loss: 0.0960
Epoch [17/30], Loss: 0.0841
Epoch [18/30], Loss: 0.0653
Epoch [19/30], Loss: 0.1812
Epoch [20/30], Loss: 0.0842
Epoch [21/30], Loss: 0.0615
Epoch [22/30], Loss: 0.0345
Epoch [23/30], Loss: 0.0668
Epoch [24/30], Loss: 0.0897
Epoch [25/30], Loss: 0.0359
Epoch [26/30], Loss: 0.0224
Epoch [27/30], Loss: 0.0974
Epoch [28/30], Loss: 0.0190
Epoch [29/30], Loss: 0.0389
Epoch [30/30], Loss: 0.0191
AUC: 0.9951


### Cross validation


In [None]:
import numpy as np
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn

# Step 1: Define the RNN Model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Use the last output of the sequence
        return out

# Step 2: Cross-Validation Function
def cross_validate_rnn(X, y, model_class, input_size, hidden_size, output_size, num_layers=1, k=5, num_epochs=10, batch_size=32):
    kf = KFold(n_splits=k)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"Fold {fold+1}/{k}")

        # Split data into training and validation sets
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Create DataLoaders
        train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
        val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32))
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model, loss function, and optimizer
        model = model_class(input_size, hidden_size, output_size, num_layers).to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Train the model
        for epoch in range(num_epochs):
            model.train()
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)

                # Forward pass
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                # Backward pass and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Evaluate on validation set
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                predicted = (torch.sigmoid(outputs) > 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Average validation loss and accuracy
        val_loss /= len(val_loader)
        val_accuracy = correct / total
        fold_results.append({'loss': val_loss, 'accuracy': val_accuracy})
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Aggregate results across folds
    avg_loss = np.mean([result['loss'] for result in fold_results])
    avg_accuracy = np.mean([result['accuracy'] for result in fold_results])
    print(f"Average Loss: {avg_loss:.4f}, Average Accuracy: {avg_accuracy:.4f}")

    return fold_results

# Step 3: Example Usage
if __name__ == "__main__":
    # Example dataset (replace with real data)
    X = np.random.rand(1000, 10, 5)  # 1000 samples, 10 sequence length, 5 features
    y = np.random.randint(0, 2, size=(1000,))  # Binary labels

    input_size = 5
    hidden_size = 64
    output_size = 1
    num_layers = 1
    num_epochs = 10
    batch_size = 32

    cross_validate_rnn(X, y, RNNModel, input_size, hidden_size, output_size, num_layers, k=5, num_epochs=num_epochs, batch_size=batch_size)