In [1]:
# Import neccessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, auc

import pickle

# Set plotting style
plt.style.use('dark_background')

In [2]:
# Data Loading

# Load ECG data for 1 subject, day 4
file_id = '1URua1BRmcTgkwU1nGPWRkrxhabWVotxH'
url = f'https://drive.google.com/uc?id={file_id}&export=download'
df_tochunk = pd.read_feather(url)  # ECG data

# Load glucose data for the same subject, all days
file_id = '1qGfSIb9EEJ4ZxlWnBcsILgh9LbHAiMld'
url = f'https://drive.google.com/uc?id={file_id}&export=download'
gl_d1 = pd.read_feather(url)  # Glucose data

In [6]:
df_tochunk.tail()

Unnamed: 0_level_0,EcgWaveform
datetime,Unnamed: 1_level_1
2014-10-04 13:48:01.403,1956
2014-10-04 13:48:01.407,1956
2014-10-04 13:48:01.411,1957
2014-10-04 13:48:01.415,1957
2014-10-04 13:48:01.419,1957


In [5]:
gl_d1.tail()

Unnamed: 0_level_0,glucose,type
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-10-06 16:34:02,12.3,cgm
2014-10-06 16:39:02,12.5,cgm
2014-10-06 16:44:02,12.5,cgm
2014-10-06 16:49:02,12.3,cgm
2014-10-06 16:54:02,12.1,cgm


In [7]:
# Get the time range of the ECG data
ecg_start_time = df_tochunk.index.min()
ecg_end_time = df_tochunk.index.max()

# Filter glucose data to match the ECG time range
gl_d1_filtered = gl_d1[(gl_d1.index >= ecg_start_time) & (gl_d1.index <= ecg_end_time)]

In [8]:
gl_d1_filtered

Unnamed: 0_level_0,glucose,type
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-10-04 06:39:01,6.9,cgm
2014-10-04 06:44:01,7.3,cgm
2014-10-04 06:49:01,7.8,cgm
2014-10-04 06:54:01,8.3,cgm
2014-10-04 06:59:01,8.8,cgm
...,...,...
2014-10-04 13:24:01,5.9,cgm
2014-10-04 13:29:01,6.3,cgm
2014-10-04 13:34:01,6.7,cgm
2014-10-04 13:39:01,7.0,cgm


In [9]:
def chunkify_with_time(df: pd.DataFrame, chunk_size: int, step_size: int) -> list[tuple[pd.DataFrame, pd.Timestamp, pd.Timestamp]]:
    """
    Splits sensor data into overlapping chunks and returns chunks with their time ranges.
    """
    chunks = []
    for start in range(0, df.shape[0], step_size):
        end = start + chunk_size
        if end < df.shape[0]:
            chunk = df.iloc[start:end]
        else:
            chunk = df.iloc[start:]
        chunks.append((chunk, chunk.index[0], chunk.index[-1]))
    return chunks

# Define chunk size and step size
CHUNK_SIZE = 60 * 60 * 250  # 60 minutes in samples
STEP_SIZE = 60 * 250  # 60 seconds in samples

# Chunk the ECG data with time ranges
chunks_with_time = chunkify_with_time(df_tochunk, CHUNK_SIZE, STEP_SIZE)

In [10]:
def generate_target_labels_aligned(chunks_with_time, glucose_df, threshold=3.9, forecast_window=pd.Timedelta(minutes=30)):
    """
    Generates binary target labels for each chunk based on aligned glucose data.
    """
    target_labels = []
    for chunk, start_time, end_time in chunks_with_time:
        # Get glucose data within the chunk's time range
        glucose_in_chunk = glucose_df[(glucose_df.index >= start_time) & (glucose_df.index <= end_time)]
        
        # Check if hypoglycemia occurs within the forecast window after the chunk
        label = 0
        for onset_time in glucose_df[glucose_df['glucose'] <= threshold].index:
            if end_time < onset_time <= end_time + forecast_window:
                label = 1
                break
        target_labels.append(label)
    return target_labels

# Generate labels for aligned chunks
target_labels = generate_target_labels_aligned(chunks_with_time, gl_d1_filtered)

In [None]:
#Show me where target labels == 1
indices_with_label_1 = [i for i, label in enumerate(target_labels) if label == 1]
print(f"Indices where target_labels == 1: {indices_with_label_1}")
print(f"Total count: {len(indices_with_label_1)}")
print(f"Total labels: {len(target_labels)}")
print(f"Label distribution: {sum(target_labels)} ones, {len(target_labels) - sum(target_labels)} zeros")

Indices where target_labels == 1: [71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324]
Total count: 165
Total labels: 434
Label distribution: 165 ones, 269 zeros


In [19]:
def train_test_split_chunks(chunks: list[pd.DataFrame], target_labels: list[int], test_size: float = 0.2) -> tuple:
    """
    Splits chunks and labels into train and test sets.
    """
    total_chunks = len(chunks)
    num_test_chunks = int(np.ceil(total_chunks * test_size))
    split_index = total_chunks - num_test_chunks
    X_train = chunks[:split_index]
    y_train = target_labels[:split_index]
    X_test = chunks[split_index:]
    y_test = target_labels[split_index:]
    return X_train, X_test, y_train, y_test


In [20]:
# Extract chunks without time ranges for training/testing
aligned_chunks = [chunk for chunk, _, _ in chunks_with_time]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split_chunks(aligned_chunks, target_labels, test_size=0.2)

### Feature Engineering

In [21]:
def extract_features(chunks: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Extracts statistical features (mean, std, min, max, etc.) from each chunk.
    """
    feature_rows = []
    for chunk in chunks:
        feats = pd.concat([
            chunk.mean().add_suffix("_mean"),
            chunk.std().add_suffix("_std"),
            chunk.min().add_suffix("_min"),
            chunk.max().add_suffix("_max"),
            chunk.quantile(0.25).add_suffix("_q25"),
            chunk.median().add_suffix("_median"),
            chunk.quantile(0.75).add_suffix("_q75"),
            chunk.skew().add_suffix("_skew"),
            chunk.kurtosis().add_suffix("_kurtosis")
        ])
        feature_rows.append(feats)
    return pd.DataFrame(feature_rows)

# Extract features for train and test sets
X_train_features = extract_features(X_train)
X_test_features = extract_features(X_test)

### Model Training

In [22]:
#Define and train an SVM model
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(kernel="rbf", C=1.0, gamma="scale", probability=True))
])
svm_clf.fit(X_train_features, y_train)

0,1,2
,steps,"[('scaler', ...), ('svm', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


### Model evaluation

In [23]:
# Evaluate the model using Precision-Recall AUC
y_train_pred_proba = svm_clf.predict_proba(X_train_features)[:, 1]
precision, recall, _ = precision_recall_curve(y_train, y_train_pred_proba)
pr_auc = auc(recall, precision)
print(f"Training PR-AUC: {pr_auc:.4f}")

y_test_pred_proba = svm_clf.predict_proba(X_test_features)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_test_pred_proba)
pr_auc = auc(recall, precision)
print(f"Testing PR-AUC: {pr_auc:.4f}")

Training PR-AUC: 0.9964
Testing PR-AUC: 0.5000




In [28]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Predict on test data
y_test_pred = svm_clf.predict(X_test_features)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Generate classification report
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9770
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        87
           1       0.00      0.00      0.00         0

    accuracy                           0.98        87
   macro avg       0.50      0.49      0.49        87
weighted avg       1.00      0.98      0.99        87



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [26]:
# Save the trained SVM model
import os

# Ensure the directory exists
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

# Save the model
with open(os.path.join(model_dir, 'jnnyvyr_svm_model01.pkl'), 'wb') as f:
    pickle.dump(svm_clf, f)

### Load saved model and Predict on New X_test

In [27]:
# Load the saved model
with open('../models/jnnyvyr_svm_model01.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Predict on test data
predictions = loaded_model.predict(X_test_features)
print(predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [29]:
X_test_features

Unnamed: 0,EcgWaveform_mean,EcgWaveform_std,EcgWaveform_min,EcgWaveform_max,EcgWaveform_q25,EcgWaveform_median,EcgWaveform_q75,EcgWaveform_skew,EcgWaveform_kurtosis
0,1981.419878,113.700492,157.0,3907.0,1953.0,1978.0,2001.0,2.537326,34.218455
1,1981.192236,113.002445,869.0,3907.0,1952.0,1978.0,2000.0,2.944858,31.156786
2,1981.378200,114.282679,869.0,3907.0,1951.0,1978.0,2001.0,3.036356,30.819956
3,1981.316801,116.738077,869.0,3907.0,1950.0,1978.0,2001.0,3.044352,29.770245
4,1981.333033,116.766462,869.0,3907.0,1950.0,1978.0,2001.0,3.041763,29.736839
...,...,...,...,...,...,...,...,...,...
82,1975.860393,242.893072,284.0,3916.0,1912.0,1970.0,2027.0,1.349558,25.995475
83,1979.118630,201.167979,284.0,3916.0,1924.0,1973.0,2021.0,0.956801,43.978571
84,1983.616452,236.713606,284.0,3916.0,1929.0,1973.0,2017.0,0.822200,33.265748
85,1987.850563,319.228721,284.0,3916.0,1943.0,1982.0,2017.0,0.598438,18.200970


In [30]:
X_train_features

Unnamed: 0,EcgWaveform_mean,EcgWaveform_std,EcgWaveform_min,EcgWaveform_max,EcgWaveform_q25,EcgWaveform_median,EcgWaveform_q75,EcgWaveform_skew,EcgWaveform_kurtosis
0,1994.801601,192.068531,6.0,4088.0,1941.0,1979.0,2015.0,2.027784,35.374922
1,1990.102278,147.017435,6.0,4088.0,1940.0,1979.0,2013.0,2.107751,32.375335
2,1987.327432,140.040414,6.0,4088.0,1940.0,1978.0,2011.0,2.055524,35.680523
3,1985.392579,135.885434,6.0,4088.0,1940.0,1978.0,2010.0,2.167978,40.124159
4,1985.219796,133.984398,6.0,4088.0,1941.0,1978.0,2008.0,2.276933,42.497851
...,...,...,...,...,...,...,...,...,...
342,1981.144971,107.048111,157.0,3907.0,1955.0,1979.0,2001.0,2.264916,36.806211
343,1981.301862,109.707060,157.0,3907.0,1954.0,1979.0,2002.0,2.343107,35.746480
344,1981.408157,110.802451,157.0,3907.0,1954.0,1979.0,2002.0,2.359111,34.822753
345,1981.245118,113.459860,157.0,3907.0,1953.0,1979.0,2002.0,2.414423,33.903066
