In [None]:
import pandas as pd

#read the filtered jump point events from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/filtered_jump_points.csv")

#preview the first five rows
df.head(n=5)

Unnamed: 0,patient_id,label,timestamp,event_type,event_value
0,1,0,2025-01-01 08:00:26.600,flow,25.733894
1,1,0,2025-01-01 08:00:31.000,flow,29.680849
2,1,0,2025-01-01 08:00:40.000,flow,9.589884
3,1,0,2025-01-01 08:00:40.200,flow,4.657168
4,1,0,2025-01-01 08:00:40.600,flow,6.713505


In [2]:
#1. Sort and calculate time gaps
#each jump point row becomes a triple: (event_type, event_value, time_gap)
#convert timestamp column to datetime (values that are already datetimes pass through)
df["timestamp"] = pd.to_datetime(df["timestamp"])

#order events chronologically within each patient so a patient's rows stay contiguous
#(sorting by timestamp alone would interleave patients once the file holds more than one)
df = df.sort_values(["patient_id", "timestamp"]).reset_index(drop=True)

#time gap (in seconds) to the previous event of the SAME patient;
#a patient's first event has no predecessor, so its gap is filled with 0
df["time_gap"] = (
    df.groupby("patient_id")["timestamp"]
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

#show first 10 results
df.head(10)

Unnamed: 0,patient_id,label,timestamp,event_type,event_value,time_gap
0,1,0,2025-01-01 08:00:01.200,pressure,13.766793,0.0
1,1,0,2025-01-01 08:00:01.400,pressure,5.55084,0.2
2,1,0,2025-01-01 08:00:02.400,pressure,10.001789,1.0
3,1,0,2025-01-01 08:00:04.000,pressure,12.761725,1.6
4,1,0,2025-01-01 08:00:04.200,pressure,6.026644,0.2
5,1,0,2025-01-01 08:00:04.600,pressure,5.751649,0.4
6,1,0,2025-01-01 08:00:05.600,pressure,9.020591,1.0
7,1,0,2025-01-01 08:00:05.800,pressure,14.415737,0.2
8,1,0,2025-01-01 08:00:06.000,pressure,5.148983,0.2
9,1,0,2025-01-01 08:00:06.600,pressure,7.398954,0.6


In [3]:
from sklearn.preprocessing import LabelEncoder

#2. Encode event_type as integer tokens (IDs)
event_type_encoder = LabelEncoder() #maps each distinct event_type string to an integer ID
#learn the distinct event types and encode the column in one pass
df["event_type_id"] = event_type_encoder.fit_transform(df["event_type"])

#save mapping - dictionary from each event type to its encoded integer
#LabelEncoder assigns IDs by position in classes_ (0..n-1), so enumerate reproduces them;
#plain Python ints keep the mapping readable when printed and serializable when saved
#(the previous version stored np.int64 scalars)
event_type_vocab = {
    event_name: event_id
    for event_id, event_name in enumerate(event_type_encoder.classes_.tolist())
}
print("Event type to ID mapping: ", event_type_vocab)

#shows unique event_type/id combination
df[["event_type", "event_type_id"]].drop_duplicates()

Event type to ID mapping:  {'flow': np.int64(0), 'leak': np.int64(1), 'minutevent': np.int64(2), 'pressure': np.int64(3), 'resrate': np.int64(4)}


Unnamed: 0,event_type,event_type_id
0,pressure,3
26,minutevent,2
28,leak,1
29,resrate,4
48,flow,0


In [4]:
from sklearn.preprocessing import MinMaxScaler

#3. Normalize event_value and time gap
#min-max scaling: each column is rescaled with its own observed minimum and maximum
#NOTE(review): event_value is scaled with one shared range across all event types - confirm this is intended
scaler = MinMaxScaler()

#first learn the per-column min/max, then apply the scaling formula to the same columns
scaler.fit(df[["event_value", "time_gap"]])
df[["event_value_scaled", "time_gap_scaled"]] = scaler.transform(df[["event_value", "time_gap"]])

#preview the first five rows with the new scaled columns
df.head(5)

Unnamed: 0,patient_id,label,timestamp,event_type,event_value,time_gap,event_type_id,event_value_scaled,time_gap_scaled
0,1,0,2025-01-01 08:00:01.200,pressure,13.766793,0.0,3,0.670504,0.0
1,1,0,2025-01-01 08:00:01.400,pressure,5.55084,0.2,3,0.549097,0.076923
2,1,0,2025-01-01 08:00:02.400,pressure,10.001789,1.0,3,0.614869,0.384615
3,1,0,2025-01-01 08:00:04.000,pressure,12.761725,1.6,3,0.655652,0.615385
4,1,0,2025-01-01 08:00:04.200,pressure,6.026644,0.2,3,0.556128,0.076923


In [5]:
#4. Format data into sequences
#one row per jump point, one column per model input:
#(event_type_id, event_value_scaled, time_gap_scaled)
sequence = df[["event_type_id", "event_value_scaled", "time_gap_scaled"]].to_numpy()

#sanity check on the shape: (number_of_events, number_of_features_per_event)
print("Sequence shape:", sequence.shape)

#peek at the first five event vectors
print(sequence[0:5])

Sequence shape: (109, 3)
[[3.         0.67050374 0.        ]
 [3.         0.54909723 0.07692308]
 [3.         0.61486855 0.38461538]
 [3.         0.65565192 0.61538462]
 [3.         0.55612814 0.07692308]]


In [6]:
import torch

#5 Convert sequence to PyTorch tensors
#copy the NumPy array into a float32 tensor (the event_type_id column becomes float as well)
sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
print("Tensor shape:", sequence_tensor.size()) #(number_of_events, number_of_features)

#prepend a batch axis of size 1 so several patients can be stacked along it later
sequence_tensor = sequence_tensor[None, ...]
print("Batch tensor shape:", sequence_tensor.size()) #(1, number_of_events, number_of_features)

Tensor shape: torch.Size([109, 3])
Batch tensor shape: torch.Size([1, 109, 3])


In [10]:
import torch.nn as nn

#6.1 Set model parameters
embedding_dim = 8 #length of the learned vector for each event type (tune if needed)
feature_dim = 2   #numeric features per step: event_value_scaled + time_gap_scaled
event_type_vocab_size = len(event_type_vocab) #number of unique event types (mapping built in step 2)

#width of every per-event vector handed to the transformer: embedding + numeric features
transformer_input_dim = feature_dim + embedding_dim
#NOTE(review): despite its name, this holds the full tensor shape, not only the sequence length
sequence_length = sequence_tensor.size()
print("Tensor shape:", sequence_length)

Tensor shape: torch.Size([1, 109, 3])


In [None]:
#6.2 Transformer model
class AECOPDTransformer(nn.Module):
    """Transformer encoder that condenses an event sequence into one vector per batch item.

    Each time step is represented by a learned embedding of its event type
    concatenated with its numeric features. The concatenated sequence is run
    through a stack of transformer encoder layers and averaged over time.

    Args:
        event_type_vocab_size: number of distinct event type IDs to embed.
        embedding_dim: size of the learned vector for each event type.
        transformer_input_dim: width of each per-step vector given to the
            encoder; must equal embedding_dim plus the number of numeric
            features in x_num, and must be divisible by nhead.
        nhead: number of attention heads in every encoder layer.
        dim_feedforward: hidden size of the feed-forward sublayer.
        dropout: dropout probability used inside the encoder layers.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, event_type_vocab_size, embedding_dim, transformer_input_dim,
                 nhead=2, dim_feedforward=128, dropout=0.1, num_layers=2):
        super().__init__() #initialize the nn.Module base class before registering submodules

        #lookup table: one trainable row of length embedding_dim per event type ID
        self.event_embedding = nn.Embedding(event_type_vocab_size, embedding_dim)

        #Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=transformer_input_dim,   #total input vector size per event
            nhead=nhead,                     #number of attention heads
            dim_feedforward=dim_feedforward, #size of internal FFN
            dropout=dropout,                 #regularization
            batch_first=True                 #shape: (batch, seq, feature)
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        #Final output
        #mean over all time steps -> one vector per batch item
        #layer expects shape: (batch, feature_dim, seq_len)
        self.pooling = nn.AdaptiveAvgPool1d(1)

    #forward runs when calling model(x_cat, x_num)
    def forward(self, x_cat, x_num):
        """Encode a batch of event sequences.

        Args:
            x_cat: event type IDs, shape (batch, seq_len). Float tensors holding
                whole-number IDs are accepted and cast to integer indices.
            x_num: numeric features, shape (batch, seq_len, num_features).

        Returns:
            Tensor of shape (batch, transformer_input_dim): the mean of the
            encoder outputs over the time dimension.
        """
        #1. embed categorical input; nn.Embedding requires integer indices
        embedded = self.event_embedding(x_cat.long()) #(batch, seq_len, embedding_dim)

        #2. concatenate embedded + numeric; match dtypes so the result fits the encoder weights
        combined = torch.cat([embedded, x_num.to(embedded.dtype)], dim=2) #(batch, seq_len, embedding_dim + num_features)

        #3. Run through transformer
        transformed = self.transformer(combined) #(batch, seq_len, transformer_input_dim)

        #4. Pool over time steps: move time to the last axis, average it to length 1,
        #then drop that singleton axis
        pooled = self.pooling(transformed.transpose(1, 2)).squeeze(2) #(batch, transformer_input_dim)
        return pooled