In [1]:
import os
import random

import numpy as np
import pandas as pd

In [2]:
# Seed both random number generators from one value so reruns are repeatable
SEED = 42
random.seed(SEED)     # Python's built-in random module
np.random.seed(SEED)  # NumPy's global generator

In [3]:
# Set window size (1240 time steps per sample)
window_size = 1240

In [4]:
# Empty accumulators, one list per input variant; each starts out empty
# Single-signal inputs
X_ECG, X_PPG, X_resp = [], [], []
# Two-signal inputs
X_ECG_PPG, X_ECG_resp, X_PPG_resp = [], [], []
# All three signals together
X_all = []
# Class labels
y = []

In [5]:
# Reading AF data: every window cut from these recordings is labelled 1
filename = "data/mimic_perform_af_csv/mimic_perform_af_{}_data.csv"

for file_no in range(1, 20):  # files 001 .. 019
    f = filename.format(f"{file_no:03d}")
    df = pd.read_csv(f)
    # Skip recordings that do not have exactly 4 columns
    if len(df.columns) != 4:
        continue
    # Remove rows containing NaN (new frame rather than in-place mutation).
    # NOTE(review): windows are cut from the rows that remain, so a window may
    # join samples that were not contiguous in the raw recording - confirm
    # this is acceptable.
    df = df.dropna()
    # Walk the recording in non-overlapping windows of `window_size` rows; a
    # trailing partial window is dropped. The window index has its own name so
    # it does not shadow the file counter of the outer loop.
    for start in range(0, len(df) - window_size + 1, window_size):
        stop = start + window_size
        # Same row span from each signal column
        PPG_window = df['PPG'].iloc[start:stop].values
        ECG_window = df['ECG'].iloc[start:stop].values
        resp_window = df['resp'].iloc[start:stop].values

        # Multi-channel inputs: one column per signal, in the order listed
        record_ECG_PPG = np.column_stack((ECG_window, PPG_window))    # (window_size, 2)
        record_ECG_resp = np.column_stack((ECG_window, resp_window))  # (window_size, 2)
        record_PPG_resp = np.column_stack((PPG_window, resp_window))  # (window_size, 2)
        record_all = np.column_stack((ECG_window, PPG_window, resp_window))  # (window_size, 3)

        X_ECG.append(ECG_window)
        X_PPG.append(PPG_window)
        X_resp.append(resp_window)
        X_ECG_PPG.append(record_ECG_PPG)
        X_ECG_resp.append(record_ECG_resp)
        X_PPG_resp.append(record_PPG_resp)
        X_all.append(record_all)

        # Label is 1 (AF) for each window
        y.append(1)

In [6]:
# Reading non-AF data: every window cut from these recordings is labelled 0
filename = "data/mimic_perform_non_af_csv/mimic_perform_non_af_{}_data.csv"

# NOTE(review): range(1, 16) covers files 001 .. 015 only - confirm that no
# higher-numbered non-AF recording is meant to be included.
for file_no in range(1, 16):
    f = filename.format(f"{file_no:03d}")
    df = pd.read_csv(f)
    # Skip recordings that do not have exactly 4 columns
    if len(df.columns) != 4:
        continue
    # Remove rows containing NaN (new frame rather than in-place mutation).
    # NOTE(review): windows are cut from the rows that remain, so a window may
    # join samples that were not contiguous in the raw recording - confirm
    # this is acceptable.
    df = df.dropna()
    # Walk the recording in non-overlapping windows of `window_size` rows; a
    # trailing partial window is dropped. The window index has its own name so
    # it does not shadow the file counter of the outer loop.
    for start in range(0, len(df) - window_size + 1, window_size):
        stop = start + window_size
        # Same row span from each signal column
        PPG_window = df['PPG'].iloc[start:stop].values
        ECG_window = df['ECG'].iloc[start:stop].values
        resp_window = df['resp'].iloc[start:stop].values

        # Multi-channel inputs: one column per signal, in the order listed
        record_ECG_PPG = np.column_stack((ECG_window, PPG_window))    # (window_size, 2)
        record_ECG_resp = np.column_stack((ECG_window, resp_window))  # (window_size, 2)
        record_PPG_resp = np.column_stack((PPG_window, resp_window))  # (window_size, 2)
        record_all = np.column_stack((ECG_window, PPG_window, resp_window))  # (window_size, 3)

        X_ECG.append(ECG_window)
        X_PPG.append(PPG_window)
        X_resp.append(resp_window)
        X_ECG_PPG.append(record_ECG_PPG)
        X_ECG_resp.append(record_ECG_resp)
        X_PPG_resp.append(record_PPG_resp)
        X_all.append(record_all)

        # Label is 0 (non-AF) for each window
        y.append(0)

In [7]:
# Rebind every accumulator (and the labels) to a NumPy array built from it
X_ECG, X_PPG, X_resp, X_ECG_PPG, X_ECG_resp, X_PPG_resp, X_all, y = (
    np.array(seq)
    for seq in (X_ECG, X_PPG, X_resp, X_ECG_PPG, X_ECG_resp, X_PPG_resp, X_all, y)
)

data = [X_ECG, X_PPG, X_resp, X_ECG_PPG, X_ECG_resp, X_PPG_resp, X_all]

# Report, for each input array, the windows that occur more than once
for X in data:
    # Per distinct window along axis 0: index of its first occurrence and
    # the number of times it appears
    _, first_seen, n_seen = np.unique(X, axis=0, return_index=True, return_counts=True)

    # First occurrences of the repeated windows, in their original row order
    repeated_first = np.sort(first_seen[n_seen > 1])
    duplicated_rows = X[repeated_first]

    print("Duplicated rows:")
    print(duplicated_rows)
    print(X.shape, y.shape)

Duplicated rows:
[]
(3000, 1240) (3000,)
Duplicated rows:
[]
(3000, 1240) (3000,)
Duplicated rows:
[]
(3000, 1240) (3000,)
Duplicated rows:
[]
(3000, 1240, 2) (3000,)
Duplicated rows:
[]
(3000, 1240, 2) (3000,)
Duplicated rows:
[]
(3000, 1240, 2) (3000,)
Duplicated rows:
[]
(3000, 1240, 3) (3000,)


In [8]:
# Build 5 random train/test splits of the windows and save each one to disk.
# NOTE(review): the permutation is over individual windows, so windows cut
# from the same recording can end up in both the train and the test set.

# Fraction of windows assigned to the training set.
# NOTE(review): 0.2 puts 20% of the windows in train and 80% in test -
# confirm this is intended and not an inverted 80/20 split.
train_ratio = 0.2

# Names used in the output file names, in the same order as the arrays in `data`
data_names = ['X_ECG', 'X_PPG', 'X_resp', 'X_ECG_PPG', 'X_ECG_resp', 'X_PPG_resp', 'X_all']
# zip() below would silently drop entries if the two lists got out of step
assert len(data_names) == len(data), "data and data_names must have the same length"

# np.save does not create missing directories, so make sure the target exists
output_dir = 'data/train_test_data'
os.makedirs(output_dir, exist_ok=True)

for n in range(1, 6):
    # Shuffle row indices (one permutation per dataset, shared by all inputs and labels)
    indices = np.random.permutation(y.shape[0])
    split_idx = int(train_ratio * len(indices))

    # Split indices
    train_idx = indices[:split_idx]
    test_idx = indices[split_idx:]

    # `name` (not `filename`) so the CSV path template of the loading cells is not overwritten
    for X, name in zip(data, data_names):
        # Create train/test sets
        train_data = X[train_idx]
        test_data = X[test_idx]

        # Save to .npy files (binary format)
        np.save(f'{output_dir}/dataset_{n}_{name}_train_data.npy', train_data)
        np.save(f'{output_dir}/dataset_{n}_{name}_test_data.npy', test_data)

    # Labels use the same index split as the inputs
    train_label = y[train_idx]
    test_label = y[test_idx]
    np.save(f'{output_dir}/dataset_{n}_train_label.npy', train_label)
    np.save(f'{output_dir}/dataset_{n}_test_label.npy', test_label)