In [None]:
import pandas as pd

# Raise pandas' display limits so wide frames and long text cells are shown
# in full instead of being elided in notebook output.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000

In [None]:
# Load the training sample; the path is relative to the notebook's working directory.
data = pd.read_csv('sample_train.csv')

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Keep only the three event-level columns and preview that subset
# (the full frame was already previewed in the cell above).
sample = data[['event_type', 'event_text', 'relative_time_to_final_event']]
sample.head()

### Data Preprocessing

In [None]:
import os

# Credentials must never be hardcoded in a notebook: anything written here is
# visible to every reader and stays in version-control history. The token is
# therefore read from the HF_TOKEN environment variable (export it in the shell
# that launches Jupyter). Any token that was ever pasted into a notebook cell
# should be treated as compromised and revoked.
if os.environ.get("HF_TOKEN"):
    print("Hugging Face API token configured!")
else:
    print("HF_TOKEN is not set; continuing without authentication (public models only).")

# Cache directory: honour an existing HF_CACHE value, otherwise default to a
# folder under the current working directory instead of a user-specific
# absolute path, so the notebook runs on any machine.
# NOTE(review): the Hugging Face libraries document HF_HOME / HF_HUB_CACHE as
# their cache variables -- confirm that HF_CACHE is actually read by anything.
os.environ.setdefault('HF_CACHE', os.path.join(os.getcwd(), 'hf_cache'))
print("Hugging Face cache configured!")

import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from typing import Dict, List, Tuple, Optional

In [None]:
class ClinicalDataPreprocessor:
    """
    Turns raw clinical event rows into numeric feature arrays.

    Provides four encoders:
    - one-hot encoding of event_type
    - ClinicalBERT [CLS] embeddings of event_text, linearly projected to
      ``bert_embed_dim``
    - fixed sinusoidal encoding of the standardised relative_time_to_final_event
    - static features: one-hot for categorical columns, standardisation for
      ``patient_age``

    The event-type and static encoders are fitted on the first call that sees
    them and reused afterwards. The time scaler is refitted on every call unless
    ``fit=False`` is passed to ``encode_relative_time``.
    """

    def __init__(self, bert_model_name: str = "medicalai/ClinicalBERT",
                 bert_embed_dim: int = 64, time_embed_dim: int = 16):
        """
        Load the BERT tokenizer/model and create the (unfitted) encoders.

        Args:
            bert_model_name: HuggingFace model name for ClinicalBERT
            bert_embed_dim: Target dimension for the projected BERT embeddings
            time_embed_dim: Dimension of the sinusoidal time encoding
        """
        self.bert_model_name = bert_model_name
        self.bert_embed_dim = bert_embed_dim
        self.time_embed_dim = time_embed_dim

        # Initialize BERT tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        self.bert_model = AutoModel.from_pretrained(bert_model_name)
        self.bert_model.eval()  # Set to evaluation mode

        # Projection from the model's hidden size down to bert_embed_dim. The
        # input width is read from the model config (falling back to 768) so a
        # different bert_model_name does not silently break the layer shape.
        # NOTE(review): this layer is randomly initialised and is not trained
        # anywhere in this class, so projected embeddings differ between runs
        # unless the torch seed is fixed.
        hidden_size = getattr(self.bert_model.config, "hidden_size", 768)
        self.bert_reducer = nn.Linear(hidden_size, bert_embed_dim)

        # Initialize encoders (will be fitted during preprocessing)
        self.event_type_encoder = None
        self.static_encoders = {}
        self.time_scaler = StandardScaler()

        # Reserved for caching embeddings; no method of this class uses it yet.
        self.precomputed_embeddings = {}

    def encode_event_type(self, event_types: pd.Series) -> np.ndarray:
        """
        One-hot encode the event_type column.

        The encoder is fitted on the unique values of the first Series passed
        in; later calls only transform.

        Args:
            event_types: Series containing event type strings

        Returns:
            One-hot encoded array of shape (n_samples, n_event_types)
        """
        if self.event_type_encoder is None:
            # np.asarray guards against pandas returning an extension array,
            # which has no reshape().
            unique_types = np.asarray(event_types.unique())
            self.event_type_encoder = OneHotEncoder(sparse_output=False)
            self.event_type_encoder.fit(unique_types.reshape(-1, 1))
            print(f"Event types found: {unique_types}")

        # Transform event types
        encoded = self.event_type_encoder.transform(event_types.to_numpy().reshape(-1, 1))
        print(f"Event type encoding shape: {encoded.shape}")
        return encoded

    def encode_event_text_batch(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """
        Generate projected ClinicalBERT embeddings for event_text in batches.

        Args:
            texts: Clinical text descriptions (any iterable of str)
            batch_size: Number of texts to process in each batch

        Returns:
            Array of embeddings with shape (n_texts, bert_embed_dim); an empty
            (0, bert_embed_dim) array when ``texts`` is empty.
        """
        # The tokenizer needs a plain list; this also makes slicing positional
        # when a pandas Series is passed.
        texts = list(texts)
        if not texts:
            # np.vstack([]) would raise, so return a correctly shaped empty array.
            return np.empty((0, self.bert_embed_dim), dtype=np.float32)

        all_embeddings = []

        # Process in batches to manage memory
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]

            # Tokenize batch
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )

            # Generate embeddings without gradients (frozen BERT)
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
                # Use CLS token embedding (first token) as sentence representation
                cls_embeddings = outputs.last_hidden_state[:, 0, :]

                # Reduce dimensionality to bert_embed_dim
                reduced_embeddings = self.bert_reducer(cls_embeddings)

            all_embeddings.append(reduced_embeddings.cpu().numpy())

            if (i // batch_size + 1) % 10 == 0:
                print(f"Processed {min(i + batch_size, len(texts))}/{len(texts)} texts")

        embeddings = np.vstack(all_embeddings)
        print(f"Generated BERT embeddings shape: {embeddings.shape}")
        return embeddings

    def encode_relative_time(self, times: pd.Series, fit: bool = True) -> np.ndarray:
        """
        Encode relative_time_to_final_event with a fixed sinusoidal encoding.

        The values are first standardised with ``self.time_scaler``; even
        output columns hold sines and odd columns cosines, with column ``i``
        using the divisor ``10000 ** (i / time_embed_dim)``.

        Args:
            times: Series containing relative time values
            fit: When True (default) the scaler is refitted on ``times`` before
                transforming. Pass False to reuse the statistics of an earlier
                call, e.g. for validation/test data.

        Returns:
            Time embeddings of shape (n_samples, time_embed_dim)
        """
        if len(times) == 0:
            # The scaler cannot be fitted on zero samples.
            return np.zeros((0, self.time_embed_dim))

        raw_times = np.asarray(times, dtype=float).reshape(-1, 1)
        if fit:
            times_scaled = self.time_scaler.fit_transform(raw_times)
        else:
            times_scaled = self.time_scaler.transform(raw_times)
        flat_times = times_scaled.flatten()

        # Sinusoidal encoding (similar to transformer positional encoding)
        embeddings = np.zeros((len(flat_times), self.time_embed_dim))
        for i in range(self.time_embed_dim):
            angles = flat_times / (10000 ** (i / self.time_embed_dim))
            embeddings[:, i] = np.sin(angles) if i % 2 == 0 else np.cos(angles)

        print(f"Time encoding shape: {embeddings.shape}")
        return embeddings

    def encode_static_features(self, df: pd.DataFrame, static_columns: List[str]) -> np.ndarray:
        """
        Encode static features (ICU careunit, gender, age, etc.).

        Categorical columns are one-hot encoded with missing values mapped to
        'Unknown'. 'patient_age' is standardised, with missing values replaced
        by the mean seen when its scaler was fitted. Encoders are fitted on the
        first call that sees a column and reused afterwards. ``df`` itself is
        never modified.

        Args:
            df: DataFrame containing static features
            static_columns: List of column names to encode

        Returns:
            Horizontally stacked encoded features, one row per row of ``df``;
            shape (len(df), 0) when ``static_columns`` is empty.
        """
        all_static_encoded = []

        for col in static_columns:
            is_age = col == 'patient_age'
            # Work on a local Series so the caller's frame is left untouched.
            column = df[col]

            if col not in self.static_encoders:
                if is_age:
                    column = column.fillna(column.mean())
                    encoder = StandardScaler()
                    encoder.fit(column.to_numpy().reshape(-1, 1))
                    # A scaler has mean_/scale_, not categories_.
                    print(f"Static feature '{col}' mean: {encoder.mean_[0]:.3f}, "
                          f"std: {encoder.scale_[0]:.3f}")
                else:
                    column = column.fillna('Unknown')
                    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                    encoder.fit(column.to_numpy().reshape(-1, 1))
                    print(f"Static feature '{col}' categories: {encoder.categories_[0]}")
                self.static_encoders[col] = encoder
            elif is_age:
                # Reuse the fitted mean so later frames are imputed consistently.
                column = column.fillna(self.static_encoders[col].mean_[0])
            else:
                column = column.fillna('Unknown')

            # Transform feature
            encoded = self.static_encoders[col].transform(column.to_numpy().reshape(-1, 1))
            all_static_encoded.append(encoded)

        if all_static_encoded:
            static_features = np.hstack(all_static_encoded)
            print(f"Static features encoding shape: {static_features.shape}")
            return static_features
        else:
            return np.empty((len(df), 0))

In [None]:
class ClinicalSequenceDataset(Dataset):
    """
    Dataset of per-patient clinical event sequences.

    Each item is the patient's event features truncated or zero-padded to
    ``max_seq_length`` rows, together with the patient's static features and
    label.
    """

    _SEQUENCE_KEYS = ('event_type', 'event_text', 'time')

    def __init__(self, sequences: Dict[str, List[np.ndarray]],
                 static_features: np.ndarray,
                 labels: np.ndarray,
                 max_seq_length: Optional[int] = None):
        """
        Initialize dataset

        Args:
            sequences: Dict with keys 'event_type', 'event_text', 'time'; each
                value holds one (seq_len, dim) array (or list of 1-D arrays)
                per patient
            static_features: Array of static features, one row per patient
            labels: Binary labels for hospital mortality, one per patient
            max_seq_length: Length every sequence is padded/truncated to;
                defaults to the longest 'event_type' sequence
        """
        self.sequences = sequences
        self.static_features = static_features
        self.labels = labels
        self.max_seq_length = max_seq_length or self._get_max_length()

    def _get_max_length(self) -> int:
        """Longest 'event_type' sequence across all patients (0 if there are none)."""
        return max((len(seq) for seq in self.sequences['event_type']), default=0)

    def _feature_dim(self) -> int:
        """Width of one concatenated timestep, taken from the first non-empty patient."""
        per_patient = zip(*(self.sequences[key] for key in self._SEQUENCE_KEYS))
        for parts in per_patient:
            if len(parts[0]) > 0:
                return int(sum(np.size(part[0]) for part in parts))
        return 0

    def __len__(self) -> int:
        return len(self.labels)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Get a single sample

        Returns:
            transformer_input: float32 tensor (max_seq_length, feature_dim);
                each row is event_type, event_text and time features
                concatenated, with all-zero rows as padding
            static_features: float32 tensor of this patient's static features
            label: float32 tensor of shape (1,) holding the label
        """
        event_type_seq = self.sequences['event_type'][idx]
        event_text_seq = self.sequences['event_text'][idx]
        time_seq = self.sequences['time'][idx]

        # Truncate to max_seq_length (keeps the first events of the sequence).
        seq_len = min(len(event_type_seq), self.max_seq_length)

        if seq_len > 0:
            # One concatenate over the feature axis replaces the per-timestep loop.
            features = np.concatenate(
                [
                    np.asarray(event_type_seq[:seq_len]),   # One-hot event type
                    np.asarray(event_text_seq[:seq_len]),   # BERT embeddings
                    np.asarray(time_seq[:seq_len]),         # Time encoding
                ],
                axis=1,
            )
        else:
            # A patient with no events has no row to copy the width from, so
            # take it from the rest of the dataset instead of raising.
            features = np.zeros((0, self._feature_dim()))

        # Zero-pad up to max_seq_length rows.
        pad_rows = self.max_seq_length - seq_len
        if pad_rows > 0:
            padding = np.zeros((pad_rows, features.shape[1]), dtype=features.dtype)
            features = np.concatenate([features, padding], axis=0)

        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(np.asarray(self.static_features[idx]), dtype=torch.float32),
            torch.tensor([float(self.labels[idx])], dtype=torch.float32),
        )

In [None]:
def prepare_clinical_data(df: pd.DataFrame, 
                         patient_id_col: str = 'patient_id',
                         test_size: float = 0.2,
                         max_seq_length: int = 100) -> Tuple:
    """
    Data preparation pipeline for the clinical transformer model (unfinished).

    As written, the function creates a ClinicalDataPreprocessor, groups ``df``
    by patient, and for each patient sorts the events and truncates them to
    ``max_seq_length`` rows. Nothing is encoded, stored, split or returned yet.

    Args:
        df: Raw clinical dataframe; must contain ``patient_id_col`` and
            'relative_time_to_final_event'
        patient_id_col: Column name for patient ID
        test_size: Proportion for test split (not used by the current body)
        max_seq_length: Maximum number of events kept per patient

    Returns:
        Intended: Tuple of (train_dataset, val_dataset, test_dataset,
        preprocessor, model_params).
        NOTE(review): the body has no return statement, so the call currently
        evaluates to None.
    """
    print("Starting clinical data preparation...")
    
    # Initialize preprocessor with its default arguments; its constructor loads
    # the BERT tokenizer and model.
    preprocessor = ClinicalDataPreprocessor()
    
    # Define static feature columns (not referenced again in the current body)
    static_columns = [
        'patient_gender', 'first_icu_careunit', 'last_icu_careunit',
        'admission_type', 'admission_location', 'discharge_location',
        'insurance', 'marital_status', 'patient_race', 
        'patient_age'
    ]
    
    # Group by patient to create sequences
    print("Grouping data by patient...")
    patient_groups = df.groupby(patient_id_col)
    
    # Prepare containers for processed data
    # NOTE(review): none of these containers is filled by the loop below.
    all_sequences = {'event_type': [], 'event_text': [], 'time': []}
    all_static_features = []
    all_labels = []
    patient_ids = []
    
    print("Processing patients...")
    for patient_id, patient_data in patient_groups:
        if len(patient_data) == 0:
            continue
            
        # Order this patient's events by relative_time_to_final_event, largest
        # value first.
        # NOTE(review): presumably this is chronological order -- confirm the
        # sign convention of the column.
        patient_data = patient_data.sort_values('relative_time_to_final_event', ascending=False)
        
        # Limit sequence length: keep only the first max_seq_length rows of the
        # sorted events.
        if len(patient_data) > max_seq_length:
            patient_data = patient_data.head(max_seq_length)
        # TODO(review): the loop body ends here. The sorted/truncated
        # patient_data is discarded on each iteration; encoding, label
        # extraction, dataset construction, splitting and the return value
        # still need to be written.
        

In [None]:
# Run the preparation pipeline on the loaded frame.
# NOTE(review): prepare_clinical_data as defined in this notebook has no return
# statement, so it evaluates to None and this five-way unpacking raises
# TypeError until the function is completed.
train_ds, val_ds, test_ds, preprocessor, model_params = prepare_clinical_data(data)