In [1]:
import pandas as pd

#read the filtered jump point events from disk
jump_points_csv = "../data/filtered_jump_points.csv"
df = pd.read_csv(jump_points_csv)

#preview the first five rows
df.head()

Unnamed: 0,patient_id,label,timestamp,event_type,event_value
0,1,0,2025-01-01 08:00:10.200,flow,22.988039
1,1,0,2025-01-01 08:00:33.800,flow,26.679169
2,1,0,2025-01-01 08:00:34.600,flow,8.291424
3,1,0,2025-01-01 08:00:35.400,flow,12.047348
4,1,0,2025-01-01 08:00:35.600,flow,4.242724


In [2]:
#1. Sort and calculate time gaps
#each jump point row becomes a triple: (event_type, event_value, time_gap)
#convert timestamp column to datetime
df["timestamp"] = pd.to_datetime(df["timestamp"])

#sort by timestamp; a stable sort keeps rows that share a timestamp in their
#original relative order, so the result is reproducible across runs
df = df.sort_values("timestamp", kind="stable").reset_index(drop=True)

#calculate time gap (in seconds) between consecutive events of the same patient
#diffing within each patient_id avoids measuring a gap from one patient's event
#to another patient's event; the first event of each patient has no predecessor,
#so its gap is filled with 0
df["time_gap"] = (
    df.groupby("patient_id")["timestamp"]
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

#show first 10 results
df.head(10)

Unnamed: 0,patient_id,label,timestamp,event_type,event_value,time_gap
0,1,0,2025-01-01 08:00:02.200,pressure,12.871221,0.0
1,1,0,2025-01-01 08:00:02.400,pressure,3.561661,0.2
2,1,0,2025-01-01 08:00:02.800,pressure,7.087086,0.4
3,1,0,2025-01-01 08:00:06.800,pressure,9.052294,4.0
4,1,0,2025-01-01 08:00:07.000,pressure,4.497117,0.2
5,1,0,2025-01-01 08:00:07.400,pressure,9.563705,0.4
6,1,0,2025-01-01 08:00:07.600,pressure,16.166415,0.2
7,1,0,2025-01-01 08:00:08.400,pressure,8.416227,0.8
8,1,0,2025-01-01 08:00:09.200,pressure,13.74428,0.8
9,1,0,2025-01-01 08:00:09.400,pressure,5.945877,0.2


In [3]:
from sklearn.preprocessing import LabelEncoder

#2. Turn each event_type string into an integer token (ID)
#fit() learns the unique event_type values and returns the fitted encoder
event_type_encoder = LabelEncoder().fit(df["event_type"])

#replace every event type with its learned integer ID
df["event_type_id"] = event_type_encoder.transform(df["event_type"])

#keep the mapping: event type name -> encoded integer
known_types = event_type_encoder.classes_
event_type_vocab = {
    name: code
    for name, code in zip(known_types, event_type_encoder.transform(known_types))
}
print("Event type to ID mapping: ", event_type_vocab)

#one row per distinct event_type / ID pair
df[["event_type", "event_type_id"]].drop_duplicates()

Event type to ID mapping:  {'flow': np.int64(0), 'leak': np.int64(1), 'minutevent': np.int64(2), 'pressure': np.int64(3), 'resrate': np.int64(4)}


Unnamed: 0,event_type,event_type_id
0,pressure,3
11,flow,0
25,minutevent,2
30,resrate,4
38,leak,1


In [4]:
from sklearn.preprocessing import MinMaxScaler

#3. Min-max scale event_value and time_gap
raw_columns = ["event_value", "time_gap"]
scaled_columns = ["event_value_scaled", "time_gap_scaled"]

#learn the per-column min/max, then apply the scaling formula to the same columns
scaler = MinMaxScaler()
scaler.fit(df[raw_columns])
df[scaled_columns] = scaler.transform(df[raw_columns])

#preview the first rows with the new scaled columns
df.head()

Unnamed: 0,patient_id,label,timestamp,event_type,event_value,time_gap,event_type_id,event_value_scaled,time_gap_scaled
0,1,0,2025-01-01 08:00:02.200,pressure,12.871221,0.0,3,0.606321,0.0
1,1,0,2025-01-01 08:00:02.400,pressure,3.561661,0.2,3,0.423843,0.05
2,1,0,2025-01-01 08:00:02.800,pressure,7.087086,0.4,3,0.492946,0.1
3,1,0,2025-01-01 08:00:06.800,pressure,9.052294,4.0,3,0.531466,1.0
4,1,0,2025-01-01 08:00:07.000,pressure,4.497117,0.2,3,0.442179,0.05


In [5]:
#4. Build the model input sequence
#every jump point (row) contributes one 3-element vector:
#(event_type_id, event_value_scaled, time_gap_scaled)
feature_columns = ["event_type_id", "event_value_scaled", "time_gap_scaled"]
sequence = df[feature_columns].to_numpy()

#shape is (number_of_events, number_of_features_per_event)
print("Sequence shape:", sequence.shape)

#first five event vectors
print(sequence[:5])

Sequence shape: (106, 3)
[[3.         0.60632134 0.        ]
 [3.         0.42384331 0.05      ]
 [3.         0.49294566 0.1       ]
 [3.         0.53146599 1.        ]
 [3.         0.44217932 0.05      ]]


In [6]:
import torch

#5. Turn the NumPy sequence into a float32 PyTorch tensor
sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
print("Tensor shape:", sequence_tensor.shape) #shape before batching

#prepend a batch dimension so several patients can be fed together later
sequence_tensor = torch.unsqueeze(sequence_tensor, 0)
print("Batch tensor shape:", sequence_tensor.shape) #shape after batching

Tensor shape: torch.Size([106, 3])
Batch tensor shape: torch.Size([1, 106, 3])
