In [1]:
# pip install transformers

In [1]:
# Standard libraries
import os
import sys
import random

# Data handling and processing
import pandas as pd
import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Sklearn - Preprocessing, Model Selection, Metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, confusion_matrix, precision_recall_curve, fbeta_score
)
from sklearn.utils import resample

# Imbalanced Learning
from imblearn.over_sampling import RandomOverSampler

# Transformers
from transformers import AutoModel, AutoTokenizer

# Captum for model interpretability
from captum.attr import IntegratedGradients

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load Custom Functions
sys.path.append('./model')
from custom_functions import load_raw_data, extract_icd_codes, extract_dynamic_data_dict, extract_demographic_features, summarize_dynamic_features

In [4]:
# Set Seed for all packages
def set_seed(seed=42):
    """Set seeds to make the experiment more reproducible."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Load Data

In [5]:
# Define the percentage to load
percentage = '10%'  # Change this to '5%', '10%', etc., as needed

# Base directory for the data subsets
base_dir = f'./data/subsets/{percentage}_subsets/'

# Load Labels
labels = pd.read_csv(f'{base_dir}labels.csv')
stay_ids = labels['stay_id'].unique()

# Load static features
icd_features = pd.read_pickle(f'{base_dir}icd_code_features.pkl')

# Load summarized dynamic features
sum_dynamic_features = pd.read_pickle(f'{base_dir}sum_dynamic_features.pkl')

# Load demographic features
demographic_features = pd.read_pickle(f'{base_dir}demographic_features.pkl')

# Print information to confirm the files are loaded
print(f"Files for {percentage} subset loaded successfully.")
print(f"Number of stays: {len(stay_ids)}")
print(f"ICD Features shape: {icd_features.shape}")
print(f"Summarized Dynamic Features shape: {sum_dynamic_features.shape}")
print(f"Demographic Features shape: {demographic_features.shape}")

Files for 5% subset loaded successfully.
Number of stays: 2379
ICD Features shape: (2379, 1458)
Summarized Dynamic Features shape: (2379, 16)
Demographic Features shape: (2379, 4)


In [6]:
# Merge the DataFrames along the columns (axis=1)
all_static_features = pd.concat([icd_features, demographic_features], axis=1)

# Train Test Split

In [7]:
# Split data into training and temp sets (temp will be split into validation and test)
train_stays, temp_stays = train_test_split(labels, test_size=0.3, random_state=42, stratify=labels['label'])

# split temp into test and validation sets equally
test_stays, val_stays = train_test_split(temp_stays, test_size=0.5, random_state=42, stratify=temp_stays['label'])

# Scaling & Encoding of Static Features

In [8]:
# Create Slices of icd_code features for each set
icd_features_train = icd_features.loc[train_stays['stay_id']]
icd_features_val = icd_features.loc[val_stays['stay_id']]
icd_features_test = icd_features.loc[test_stays['stay_id']]

In [9]:
# Create Slices of all_static features for each set
static_features_train = all_static_features.loc[train_stays['stay_id']]
static_features_val = all_static_features.loc[val_stays['stay_id']]
static_features_test = all_static_features.loc[test_stays['stay_id']]

Continous Features

In [10]:
# Selecting Continious Columns for the Different Sets
continous_static_columns_train = static_features_train[["Age"]]
continous_static_columns_val = static_features_val[["Age"]]
continous_static_columns_test = static_features_test[["Age"]]

In [11]:
# Train Standard Scaler on the Train Set and apply it to Train, Validation and Test Set 
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(continous_static_columns_train)

# Scale Training Set in a way that yields a Data Frame again
continous_static_columns_train = pd.DataFrame(
    scaler.transform(continous_static_columns_train),
    index=continous_static_columns_train.index,
    columns=continous_static_columns_train.columns
)

# Scale Training Set in a way that yields a Data Frame again
continous_static_columns_val = pd.DataFrame(
    scaler.transform(continous_static_columns_val),
    index=continous_static_columns_val.index,
    columns=continous_static_columns_val.columns
)

# Scale Training Set in a way that yields a Data Frame again
continous_static_columns_test = pd.DataFrame(
    scaler.transform(continous_static_columns_test),
    index=continous_static_columns_test.index,
    columns=continous_static_columns_test.columns
)

Categorical Features

In [12]:
# Selecting Categorical Columns for the Different Sets
categorical_columns = ['gender', 'ethnicity', 'insurance']  
categorical_static_columns_train = static_features_train[categorical_columns]
categorical_static_columns_val = static_features_val[categorical_columns]
categorical_static_columns_test = static_features_test[categorical_columns]

In [13]:
# Initialize the encoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder on the training data
encoder.fit(categorical_static_columns_train)

# Apply the encoder to the train, validation, and test data
categorical_static_columns_train_encoded = pd.DataFrame(
    encoder.transform(categorical_static_columns_train),
    index=categorical_static_columns_train.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)

categorical_static_columns_val_encoded = pd.DataFrame(
    encoder.transform(categorical_static_columns_val),
    index=categorical_static_columns_val.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)

categorical_static_columns_test_encoded = pd.DataFrame(
    encoder.transform(categorical_static_columns_test),
    index=categorical_static_columns_test.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)



Boolean Features (ICD Codes)

In [14]:
# Select Boolean columns for each set (ICD Codes)
bool_static_columns_train = static_features_train[icd_features.columns]
bool_static_columns_val = static_features_val[icd_features.columns]
bool_static_columns_test = static_features_test[icd_features.columns]

In [15]:
# Convert Boolean Columns to Float for Consistent Formatting
bool_static_columns_train = bool_static_columns_train.astype(float)
bool_static_columns_val = bool_static_columns_val.astype(float)
bool_static_columns_test = bool_static_columns_test.astype(float)

Combine Data Frames Back into one per Set

In [16]:
# Combine all parts into one DataFrame using pd.concat
processed_static_features_train = pd.concat([continous_static_columns_train, categorical_static_columns_train_encoded, bool_static_columns_train], axis=1)
processed_static_features_test = pd.concat([continous_static_columns_test, categorical_static_columns_test_encoded, bool_static_columns_test], axis=1)
processed_static_features_val = pd.concat([continous_static_columns_val, categorical_static_columns_val_encoded, bool_static_columns_val], axis=1)

# Tokenization and Embedding of ICD Codes Using Med-BERT

Create ICD Sequences

In [17]:
# Function to create a sequence of ICD codes for each stay_id
def create_icd_sequence(row):
    # Filter the ICD codes that are present (value == 1.0)
    present_codes = [code for code in row.index if row[code] == 1.0]
    return ' '.join(present_codes)  # Joining codes into a single string

# Apply the function to each row and create a new DataFrame
train_icd_sequences = bool_static_columns_train.apply(create_icd_sequence, axis=1).reset_index()
train_icd_sequences.columns = ['stay_id', 'icd_sequence']

# Apply the function to each row and create a new DataFrame
val_icd_sequences = bool_static_columns_val.apply(create_icd_sequence, axis=1).reset_index()
val_icd_sequences.columns = ['stay_id', 'icd_sequence']

# Apply the function to each row and create a new DataFrame
test_icd_sequences = bool_static_columns_test.apply(create_icd_sequence, axis=1).reset_index()
test_icd_sequences.columns = ['stay_id', 'icd_sequence']

# Oversample Minority Class aligning With Oversampled Labels

In [18]:
#### The Sequences are oversampled to align with the oversampled labels.
# Separate the classes
majority = train_stays[train_stays['label'] == 0]
minority = train_stays[train_stays['label'] == 1]

# Upsample minority class
minority_upsampled = resample(minority,
                              replace=True,           # sample with replacement
                              n_samples=len(majority), # to match majority class
                              random_state=123)       # reproducible results

# Combine majority class with upsampled minority class
oversampled_train_labels = pd.concat([majority, minority_upsampled])

In [19]:
# Ensure that the stay_id in train_icd_sequences is set as the index if not already
train_icd_sequences.set_index('stay_id', inplace=True)

# Use the loc method to align and replicate rows in train_icd_sequences according to oversampled_train_labels
train_icd_sequences = train_icd_sequences.loc[oversampled_train_labels['stay_id']].reset_index()

In [20]:
# Sort both sequences and labels by stay_id
train_icd_sequences.sort_values(["stay_id"], inplace=True)
oversampled_train_labels.sort_values(["stay_id"], inplace=True)

# Tokenization

In [21]:
# Calculate the length of each ICD sequence
train_icd_sequences['length'] = train_icd_sequences['icd_sequence'].apply(lambda x: len(x.split()))
# Calculate the length of each ICD sequence
val_icd_sequences['length'] = val_icd_sequences['icd_sequence'].apply(lambda x: len(x.split()))
# Calculate the length of each ICD sequence
test_icd_sequences['length'] = test_icd_sequences['icd_sequence'].apply(lambda x: len(x.split()))

# Calculate percentiles
train_percentiles = np.percentile(train_icd_sequences['length'], [0, 25, 50, 75, 100])  # Modify if you need other percentiles
val_percentiles = np.percentile(val_icd_sequences['length'], [0, 25, 50, 75, 100])  # Modify if you need other percentiles
test_percentiles = np.percentile(test_icd_sequences['length'], [0, 25, 50, 75, 100])  # Modify if you need other percentiles

print("Percentiles of Training sequence lengths: 0th, 25th, 50th, 75th, 100th:", train_percentiles)
print("Percentiles of Training sequence lengths: 0th, 25th, 50th, 75th, 100th:", val_percentiles)
print("Percentiles of Training sequence lengths: 0th, 25th, 50th, 75th, 100th:", test_percentiles)

Percentiles of Training sequence lengths: 0th, 25th, 50th, 75th, 100th: [ 0. 12. 17. 24. 39.]
Percentiles of Training sequence lengths: 0th, 25th, 50th, 75th, 100th: [ 2. 12. 16. 22. 38.]
Percentiles of Training sequence lengths: 0th, 25th, 50th, 75th, 100th: [ 1. 11. 15. 22. 37.]


In [22]:
# Load the tokenizer for MedBERT
tokenizer = AutoTokenizer.from_pretrained('Charangan/MedBERT')

# Specify a max_length for tokenization to ensure truncation
max_length = 38  # This is a common choice for BERT models, adjust based on your specific needs

# Tokenize the sequences with explicit padding and truncation
tokenized_train_data = tokenizer(train_icd_sequences['icd_sequence'].tolist(),
                           padding="max_length",  # ensures all sequences are padded to the same length
                           truncation=True,       # enables truncation to max_length
                           max_length=max_length, # explicitly set max_length
                           return_tensors="pt")   # returns PyTorch tensors

tokenized_val_data = tokenizer(val_icd_sequences['icd_sequence'].tolist(),
                           padding="max_length",  # ensures all sequences are padded to the same length
                           truncation=True,       # enables truncation to max_length
                           max_length=max_length, # explicitly set max_length
                           return_tensors="pt")   # returns PyTorch tensors

tokenized_test_data = tokenizer(test_icd_sequences['icd_sequence'].tolist(),
                           padding="max_length",  # ensures all sequences are padded to the same length
                           truncation=True,       # enables truncation to max_length
                           max_length=max_length, # explicitly set max_length
                           return_tensors="pt")   # returns PyTorch tensors


# Embedding

In [23]:
# Load MedBERT model
MedBERTmodel = AutoModel.from_pretrained('Charangan/MedBERT')

# Ensure the model is in evaluation mode
MedBERTmodel.eval()

# Define input ids and attention masks
input_train_ids = tokenized_train_data['input_ids']
train_attention_mask = tokenized_train_data['attention_mask']

input_val_ids = tokenized_val_data['input_ids']
val_attention_mask = tokenized_val_data['attention_mask']

input_test_ids = tokenized_test_data['input_ids']
test_attention_mask = tokenized_test_data['attention_mask']

# Define the batch size
batch_size = 100  # Adjust this based on your available memory

# Calculate the number of batches
num_batches = (input_train_ids.size(0) + batch_size - 1) // batch_size

# List to store all embeddings
all_embeddings = []

In [24]:
## Embedding Generation for Training Set
# Process each batch
for i in range(num_batches):
    batch_start = i * batch_size
    batch_end = min(batch_start + batch_size, input_train_ids.size(0))
    
    # Slice the batch data
    batch_input_ids = input_train_ids[batch_start:batch_end]
    batch_attention_mask = train_attention_mask[batch_start:batch_end]

    # Forward pass to get embeddings
    with torch.no_grad():
        batch_outputs = MedBERTmodel(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        batch_embeddings = batch_outputs.last_hidden_state  # Extract embeddings
    
    # Store the embeddings
    all_embeddings.append(batch_embeddings)
    
    # Print progress
    print(f"Processed batch {i+1} of {num_batches}")

# Concatenate all embeddings to form a single tensor
train_embeddings = torch.cat(all_embeddings, dim=0)

# Now 'embeddings' contains the embeddings for the entire dataset
print("Embedding generation complete.")

Processed batch 1 of 31
Processed batch 2 of 31
Processed batch 3 of 31
Processed batch 4 of 31
Processed batch 5 of 31
Processed batch 6 of 31
Processed batch 7 of 31
Processed batch 8 of 31
Processed batch 9 of 31
Processed batch 10 of 31
Processed batch 11 of 31
Processed batch 12 of 31
Processed batch 13 of 31
Processed batch 14 of 31
Processed batch 15 of 31
Processed batch 16 of 31
Processed batch 17 of 31
Processed batch 18 of 31
Processed batch 19 of 31
Processed batch 20 of 31
Processed batch 21 of 31
Processed batch 22 of 31
Processed batch 23 of 31
Processed batch 24 of 31
Processed batch 25 of 31
Processed batch 26 of 31
Processed batch 27 of 31
Processed batch 28 of 31
Processed batch 29 of 31
Processed batch 30 of 31
Processed batch 31 of 31
Embedding generation complete.


In [25]:
## Embedding Generation for Validation Set

all_embeddings = []  # Initialize the list to store embeddings

# Process each batch
for i in range(num_batches):
    batch_start = i * batch_size
    batch_end = min(batch_start + batch_size, input_val_ids.size(0))
    
    # Slice the batch data
    batch_input_ids = input_val_ids[batch_start:batch_end]
    batch_attention_mask = val_attention_mask[batch_start:batch_end]

    # Forward pass to get embeddings
    with torch.no_grad():
        batch_outputs = MedBERTmodel(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        batch_embeddings = batch_outputs.last_hidden_state  # Extract embeddings
    
    # Store the embeddings
    all_embeddings.append(batch_embeddings)
    
    # Print progress
    print(f"Processed batch {i+1} of {num_batches}")

# Concatenate all embeddings to form a single tensor
val_embeddings = torch.cat(all_embeddings, dim=0)

# Now 'embeddings' contains the embeddings for the entire dataset
print("Embedding generation complete.")

Processed batch 1 of 31
Processed batch 2 of 31
Processed batch 3 of 31
Processed batch 4 of 31
Processed batch 5 of 31
Processed batch 6 of 31
Processed batch 7 of 31
Processed batch 8 of 31
Processed batch 9 of 31
Processed batch 10 of 31
Processed batch 11 of 31
Processed batch 12 of 31
Processed batch 13 of 31
Processed batch 14 of 31
Processed batch 15 of 31
Processed batch 16 of 31
Processed batch 17 of 31
Processed batch 18 of 31
Processed batch 19 of 31
Processed batch 20 of 31
Processed batch 21 of 31
Processed batch 22 of 31
Processed batch 23 of 31
Processed batch 24 of 31
Processed batch 25 of 31
Processed batch 26 of 31
Processed batch 27 of 31
Processed batch 28 of 31
Processed batch 29 of 31
Processed batch 30 of 31
Processed batch 31 of 31
Embedding generation complete.


In [26]:
## Embedding Generation for Testing Set

all_embeddings = []  # Initialize the list to store embeddings

# Process each batch
for i in range(num_batches):
    batch_start = i * batch_size
    batch_end = min(batch_start + batch_size, input_test_ids.size(0))
    
    # Slice the batch data
    batch_input_ids = input_test_ids[batch_start:batch_end]
    batch_attention_mask = test_attention_mask[batch_start:batch_end]

    # Forward pass to get embeddings
    with torch.no_grad():
        batch_outputs = MedBERTmodel(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        batch_embeddings = batch_outputs.last_hidden_state  # Extract embeddings
    
    # Store the embeddings
    all_embeddings.append(batch_embeddings)
    
    # Print progress
    print(f"Processed batch {i+1} of {num_batches}")

# Concatenate all embeddings to form a single tensor
test_embeddings = torch.cat(all_embeddings, dim=0)

# Now 'embeddings' contains the embeddings for the entire dataset
print("Embedding generation complete.")

Processed batch 1 of 31
Processed batch 2 of 31
Processed batch 3 of 31
Processed batch 4 of 31
Processed batch 5 of 31
Processed batch 6 of 31
Processed batch 7 of 31
Processed batch 8 of 31
Processed batch 9 of 31
Processed batch 10 of 31
Processed batch 11 of 31
Processed batch 12 of 31
Processed batch 13 of 31
Processed batch 14 of 31
Processed batch 15 of 31
Processed batch 16 of 31
Processed batch 17 of 31
Processed batch 18 of 31
Processed batch 19 of 31
Processed batch 20 of 31
Processed batch 21 of 31
Processed batch 22 of 31
Processed batch 23 of 31
Processed batch 24 of 31
Processed batch 25 of 31
Processed batch 26 of 31
Processed batch 27 of 31
Processed batch 28 of 31
Processed batch 29 of 31
Processed batch 30 of 31
Processed batch 31 of 31
Embedding generation complete.


In [27]:
print("Train Embedding Shape:", train_embeddings.shape)
print("Validation Embedding Shape:", val_embeddings.shape)
print("Test Embedding Shape:", test_embeddings.shape)

Train Embedding Shape: torch.Size([3054, 38, 768])
Validation Embedding Shape: torch.Size([357, 38, 768])
Test Embedding Shape: torch.Size([357, 38, 768])


# Define Label Tensor

In [28]:
# Convert label columns directly to tensors
label_tensor_train = torch.tensor(oversampled_train_labels['label'].values, dtype=torch.float32)
label_tensor_test = torch.tensor(test_stays['label'].values, dtype=torch.float32)
label_tensor_val = torch.tensor(val_stays['label'].values, dtype=torch.float32)

# Print shapes to confirm
print("Label Train Tensor shape:", label_tensor_train.shape)
print("Label Test Tensor shape:", label_tensor_test.shape)
print("Label Validation Tensor shape:", label_tensor_val.shape)

Label Train Tensor shape: torch.Size([3054])
Label Test Tensor shape: torch.Size([357])
Label Validation Tensor shape: torch.Size([357])


# Load Pre-defined Tensors

In [29]:
# Path to load tensors
tensor_save_path = f'./data/tensors/{percentage}_subset'

# Load tensors
dynamic_train_tensor_oversampled = torch.load(os.path.join(tensor_save_path, 'dynamic_train_tensor_oversampled.pt'))
dynamic_test_tensor = torch.load(os.path.join(tensor_save_path, 'dynamic_test_tensor.pt'))
dynamic_val_tensor = torch.load(os.path.join(tensor_save_path, 'dynamic_val_tensor.pt'))

label_tensor_train_oversampled = torch.load(os.path.join(tensor_save_path, 'label_tensor_train_oversampled.pt'))
label_tensor_test = torch.load(os.path.join(tensor_save_path, 'label_test_tensor.pt'))
label_tensor_val = torch.load(os.path.join(tensor_save_path, 'label_val_tensor.pt'))

In [30]:
# Print shapes to confirm
print("Label Train Tensor shape:", label_tensor_train_oversampled.shape)
print("Dynamic Train Tensor shape:", dynamic_train_tensor_oversampled.shape)
print("Train Embedding Shape:", train_embeddings.shape)

print("Label Test Tensor shape:", label_tensor_test.shape)
print("Dynamic Test Tensor shape:", dynamic_test_tensor.shape)
print("Test Embedding Shape:", test_embeddings.shape)


print("Label Validation Tensor shape:", label_tensor_val.shape)
print("Dynamic Validation Tensor shape:", dynamic_val_tensor.shape)
print("Validation Embedding Shape:", val_embeddings.shape)

Label Train Tensor shape: torch.Size([3054])
Dynamic Train Tensor shape: torch.Size([3054, 12, 834])
Train Embedding Shape: torch.Size([3054, 38, 768])
Label Test Tensor shape: torch.Size([357])
Dynamic Test Tensor shape: torch.Size([357, 12, 835])
Test Embedding Shape: torch.Size([357, 38, 768])
Label Validation Tensor shape: torch.Size([357])
Dynamic Validation Tensor shape: torch.Size([357, 12, 835])
Validation Embedding Shape: torch.Size([357, 38, 768])


In [31]:
# Verify that the redefined oversampled labels in this notebook are identical to the oversampled labels in the saved tensors
print("Label Tensors Are Identical?")
print(torch.equal(label_tensor_train, label_tensor_train_oversampled))

Label Tensors Are Identical?
False


# Train Model on Embeddings and Dynamic Data

In [32]:
class HybridModel(nn.Module):
    def __init__(self, num_dynamic_features, embedding_dim, lstm_hidden_dim, fc_hidden_dim, output_dim):
        super(HybridModel, self).__init__()
        # LSTM for processing dynamic data
        self.lstm = nn.LSTM(input_size=num_dynamic_features, hidden_size=lstm_hidden_dim, batch_first=True)
        
        # Dense layers for processing embeddings with two layers
        self.embedding_fc1 = nn.Linear(embedding_dim, fc_hidden_dim)
        self.embedding_fc2 = nn.Linear(fc_hidden_dim, fc_hidden_dim)  # Additional layer
        self.relu = nn.ReLU()
        
        # Final dense layer after concatenation
        self.final_fc = nn.Linear(lstm_hidden_dim + fc_hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, dynamic_input, embedding_input):
        # LSTM pathway
        lstm_out, (hidden, _) = self.lstm(dynamic_input)
        lstm_out = lstm_out[:, -1, :]  # Take the last hidden state

        # Check if averaging is necessary for embeddings
        if embedding_input.dim() > 2:
            embedding_input = torch.mean(embedding_input, dim=1)

        # Embedding pathway
        embedding_out = self.relu(self.embedding_fc1(embedding_input))
        embedding_out = self.relu(self.embedding_fc2(embedding_out))  # Pass through second dense layer

        # Concatenate outputs
        combined_out = torch.cat((lstm_out, embedding_out), dim=1)
        
        # Final output
        final_out = self.sigmoid(self.final_fc(combined_out))
        return final_out
    
# Parameters setup
num_dynamic_features = 834  # As per your dynamic data
embedding_dim = 768         # Dimension of Med-BERT embeddings
lstm_hidden_dim = 128       # Hidden dimension for LSTM
fc_hidden_dim = 128         # Hidden dimension for the embedding fully connected layer
output_dim = 1              # Output dimension (binary classification)

# Instantiate the model
model = HybridModel(num_dynamic_features, embedding_dim, lstm_hidden_dim, fc_hidden_dim, output_dim)

# Setup the criterion and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define your training loop
def train_model(model, criterion, optimizer, dynamic_train, embedding_train, labels_train, dynamic_val, embedding_val, labels_val, epochs=100):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(dynamic_train, embedding_train)
        loss = criterion(outputs.squeeze(), labels_train)
        
        # Backward and optimize
        loss.backward()
        optimizer.step()
        
        # Evaluation phase
        model.eval()
        with torch.no_grad():
            val_outputs = model(dynamic_val, embedding_val)
            val_loss = criterion(val_outputs.squeeze(), labels_val)
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

# Example use (assuming you have tensors for dynamic data and embeddings ready)
train_model(model, criterion, optimizer, dynamic_train_tensor_oversampled, train_embeddings, label_tensor_train_oversampled, dynamic_val_tensor, val_embeddings, label_tensor_val)


Epoch [1/100], Loss: 0.6918, Val Loss: 0.6692
Epoch [2/100], Loss: 0.6660, Val Loss: 0.6673
Epoch [3/100], Loss: 0.6419, Val Loss: 0.6695
Epoch [4/100], Loss: 0.6188, Val Loss: 0.6531
Epoch [5/100], Loss: 0.5961, Val Loss: 0.6440
Epoch [6/100], Loss: 0.5740, Val Loss: 0.6478
Epoch [7/100], Loss: 0.5523, Val Loss: 0.6389
Epoch [8/100], Loss: 0.5310, Val Loss: 0.6304
Epoch [9/100], Loss: 0.5102, Val Loss: 0.6343
Epoch [10/100], Loss: 0.4899, Val Loss: 0.6261
Epoch [11/100], Loss: 0.4704, Val Loss: 0.6189
Epoch [12/100], Loss: 0.4515, Val Loss: 0.6247
Epoch [13/100], Loss: 0.4335, Val Loss: 0.6009
Epoch [14/100], Loss: 0.4160, Val Loss: 0.6211
Epoch [15/100], Loss: 0.3989, Val Loss: 0.5824
Epoch [16/100], Loss: 0.3817, Val Loss: 0.5947
Epoch [17/100], Loss: 0.3644, Val Loss: 0.5719
Epoch [18/100], Loss: 0.3471, Val Loss: 0.5514
Epoch [19/100], Loss: 0.3299, Val Loss: 0.5533
Epoch [20/100], Loss: 0.3128, Val Loss: 0.5092
Epoch [21/100], Loss: 0.2959, Val Loss: 0.5238
Epoch [22/100], Loss: 

In [33]:
def evaluate_hybrid_med_bert_model(model, dynamic_test, embedding_test, labels_test, directory, model_name, beta=2):
    model.eval()  # Set the model to evaluation mode
    sns.set()  # For better plot styling

    with torch.no_grad():
        # Predictions
        outputs = model(dynamic_test, embedding_test).squeeze()
        test_probs = outputs.numpy()  # Probability predictions

    # True labels for comparison
    true_labels = labels_test.numpy()

    # Calculate precision-recall curve and corresponding thresholds
    precision, recall, thresholds = precision_recall_curve(true_labels, test_probs)

    # Calculate F-beta scores for each possible threshold
    f_beta_scores = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    f_beta_scores = np.nan_to_num(f_beta_scores)  # Handling NaNs

    # Find the threshold that maximizes the F-beta score
    optimal_idx = np.argmax(f_beta_scores)
    best_threshold = thresholds[optimal_idx]

    # Use the best threshold found
    test_predictions = (test_probs > best_threshold).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, test_predictions)
    precision = precision_score(true_labels, test_predictions, zero_division=0)
    recall = recall_score(true_labels, test_predictions)
    f1 = f1_score(true_labels, test_predictions)
    auc_roc = roc_auc_score(true_labels, test_probs)
    auprc = average_precision_score(true_labels, test_probs)

    # Print the results
    print(f"Best Threshold: {best_threshold:.2f}")
    print(f"{model_name} Model Performance on Test Set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
    print(f"AUPRC: {auprc:.2f}")

    # Ensure the directory exists
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Save the confusion matrix
    cm = confusion_matrix(true_labels, test_predictions)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f'{directory}/{model_name}_confusion_matrix.png')
    plt.close()  # Close the plot to avoid display

    # Save performance metrics to a text file
    metrics_filepath = f'{directory}/{model_name}_metrics.txt'
    with open(metrics_filepath, 'w') as f:
        f.write(f"{model_name} Model Performance on Test Set:\n")
        f.write(f"Best Threshold: {best_threshold:.2f}\n")
        f.write(f"Accuracy: {accuracy:.4f}\n")
        f.write(f"Precision: {precision:.4f}\n")
        f.write(f"Recall: {recall:.4f}\n")
        f.write(f"F1 Score: {f1:.4f}\n")
        f.write(f"AUC-ROC: {auc_roc:.4f}\n")
        f.write(f"AUPRC: {auprc:.2f}\n")

    print(f"Performance metrics saved to: {metrics_filepath}")

# Example usage
evaluate_hybrid_med_bert_model(model, dynamic_test_tensor, test_embeddings, label_tensor_test, f'./saved_models/Hybrid_Med-BERT/trained_on_{percentage}', 'Hybrid_Med_BERT')

Best Threshold: 0.24
Hybrid_Med_BERT Model Performance on Test Set:
Accuracy: 0.7311
Precision: 0.1827
Recall: 0.6333
F1 Score: 0.2836
AUC-ROC: 0.6700
AUPRC: 0.21
Performance metrics saved to: ./saved_models/Hybrid_Med-BERT/trained_on_5%/Hybrid_Med_BERT_metrics.txt


  f_beta_scores = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)


In [34]:
# Save model and optimizer state
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, f'./saved_models/Hybrid_Med-BERT/trained_on_{percentage}/best_Hybrid_Med-BERT_model.pth')