In [1]:
import numpy as np
from wfdb import processing

import os
import sys

import data_visualisation as dv

In [2]:
# Path to the data
brugada_path = 'DAT China/DAT China Brugada ECGs/dat'
RBBB_path = 'DAT China/DAT China RBBB ECGs/datnew'
Normal_path = 'DAT China/Normal/dat'

In [3]:
def load_china_data(dir_path, label):
    
    # init array
    data = []
    
    # read all files in the directory
    for file in os.listdir(dir_path):
        path = os.path.join(dir_path, file)
        f = open(path, 'r')
        ecg = np.fromfile(f, dtype=np.int16)
        ecg = np.reshape(ecg, (8, 5000))
        
        ## Downsample to 100hz
        ecg = ecg.reshape(8, 1000, 5)
        ecg = np.mean(ecg, axis=2)
        
        data.append(ecg)
        f.close()
        
    # Convert to numpy array
    data = np.array(data)
    # Reshape to (n_samples, n_samples_per_lead, n_leads)
    data = np.swapaxes(data, 1, 2)
    
    # Calculate remaining leads
    lead_III = data[:,:,1] - data[:,:,0]           # ecg_dict['III'] = ecg_dict['II'] - ecg_dict['I']
    lead_aVR = -0.5 * (data[:,:,0] + data[:,:,1])  # ecg_dict['aVR'] = -0.5 * (ecg_dict['I'] + ecg_dict['II'])
    lead_aVL = data[:,:,0] - 0.5 * data[:,:,1]     # ecg_dict['aVL'] = ecg_dict['I'] - 0.5 * ecg_dict['II']
    lead_aVF = data[:,:,1] - 0.5 * data[:,:,0]     # ecg_dict['aVF'] = ecg_dict['II'] - 0.5 * ecg_dict['I']
    
    # Add remaining leads to data
    data = np.concatenate((data,  
                              np.expand_dims(lead_III, axis=2), 
                              np.expand_dims(lead_aVR, axis=2), 
                              np.expand_dims(lead_aVL, axis=2), 
                              np.expand_dims(lead_aVF, axis=2)), axis=2)
    
    return data, np.full(data.shape[0], label)

In [4]:
Normal_data, normal_labels = load_china_data(Normal_path, 0)
brugada_data, brugada_labels = load_china_data(brugada_path, 1)
RBBB_data, RBBB_labels = load_china_data(RBBB_path, 2)

print("Normal:", Normal_data.shape)
print("Brugada:", brugada_data.shape)
print("RBBB:", RBBB_data.shape)

Normal: (9998, 1000, 12)
Brugada: (176, 1000, 12)
RBBB: (10000, 1000, 12)


In [6]:
# Normalise data
from sklearn.preprocessing import StandardScaler

all_data = np.concatenate((Normal_data, brugada_data, RBBB_data), axis=0)
scaler = StandardScaler()
sclaer = scaler.fit(all_data.reshape(-1, all_data.shape[-1]))

Normal_data = scaler.transform(Normal_data.reshape(-1, Normal_data.shape[-1])).reshape(Normal_data.shape)
brugada_data = scaler.transform(brugada_data.reshape(-1, brugada_data.shape[-1])).reshape(brugada_data.shape)
RBBB_data = scaler.transform(RBBB_data.reshape(-1, RBBB_data.shape[-1])).reshape(RBBB_data.shape)


# Downsmaple classes
RBBB_data = RBBB_data[np.random.choice(RBBB_data.shape[0], brugada_data.shape[0], replace=False)]
Normal_data = Normal_data[np.random.choice(Normal_data.shape[0], 2*brugada_data.shape[0], replace=False)]

print("Normal:", Normal_data.shape)
print("Brugada:", brugada_data.shape)
print("RBBB:", RBBB_data.shape)

Normal: (352, 1000, 12)
Brugada: (176, 1000, 12)
RBBB: (176, 1000, 12)


In [11]:
# shuffle data
np.random.shuffle(Normal_data)
np.random.shuffle(RBBB_data)
np.random.shuffle(brugada_data)

data = np.concatenate((Normal_data, RBBB_data, brugada_data), axis=0)
labels = np.concatenate((normal_labels[:Normal_data.shape[0]], RBBB_labels[:RBBB_data.shape[0]], brugada_labels), axis=0)

fnc = lambda x: "Normal" if x == 0 else "Brugada" if x == 2 else "RBBB"
text_labels = np.array([fnc(label) for label in labels])

print("\nData shape:", data.shape)
print("Labels shape:", labels.shape)
print("Text labels shape:", text_labels.shape)

# shuffle data
permutation = np.random.permutation(data.shape[0])
np.random.shuffle(permutation)

shuffled_data = data[permutation]
shuffled_labels = labels[permutation]


Data shape: (704, 1000, 12)
Labels shape: (704,)
Text labels shape: (704,)


In [None]:
test_split = 0.75

test_split = int(data.shape[0] * test_split)

X_train = shuffled_data[test_split:]
Y_train = shuffled_labels[test_split:]
Y_train_text = text_labels[test_split:]

X_test = shuffled_data[:test_split]
Y_test = shuffled_labels[:test_split]
Y_test_text = text_labels[:test_split]

# label distribution
print("\nTrain label distribution:")
print("Total:", Y_train.shape[0])
print("Normal:", np.sum(Y_train == 0), "(", np.sum(Y_train == 0) / Y_train.shape[0], "%)")
print("Brugada:", np.sum(Y_train == 1), "(", np.sum(Y_train == 1) / Y_train.shape[0], "%)")
print("RBBB:", np.sum(Y_train == 2), "(", np.sum(Y_train == 2) / Y_train.shape[0], "%)")

print("\nTest label distribution:")
print("Total:", Y_test.shape[0])
print("Normal:", np.sum(Y_test == 0), "(", np.sum(Y_test == 0) / Y_test.shape[0], "%)")
print("Brugada:", np.sum(Y_test == 1), "(", np.sum(Y_test == 1) / Y_test.shape[0], "%)")
print("RBBB:", np.sum(Y_test == 2), "(", np.sum(Y_test == 2) / Y_test.shape[0], "%)")



Train label distribution:
Total: 176
Normal: 89 ( 0.5056818181818182 %)
Brugada: 42 ( 0.23863636363636365 %)
RBBB: 45 ( 0.2556818181818182 %)

Test label distribution:
Total: 528
Normal: 263 ( 0.4981060606060606 %)
Brugada: 134 ( 0.2537878787878788 %)
RBBB: 131 ( 0.2481060606060606 %)


In [13]:
NP_DATA_LR = 'data/DAT_China_025.npz'

save_args = {
    'X_train': X_train,
    'Y_train': Y_train,
    'Y_train_text': Y_train_text,
    'X_test': X_test,
    'Y_test': Y_test,
    'Y_test_text': Y_test_text
}

np.savez(NP_DATA_LR, **save_args)

In [14]:
# Load data and check

thismodule = sys.modules[__name__]
NP_DATA = 'data/DAT_China_025.npz'

with np.load(NP_DATA, allow_pickle=True) as data:
    for k in data.keys():
        if 'text' in k:
            setattr(thismodule, k, data[k])
        else:
            setattr(thismodule, k, data[k].astype(float))
            
print("\nLoaded data:")
print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)
print("Y_train_text:", Y_train_text.shape)

print("\nX_test:", X_test.shape)
print("Y_test:", Y_test.shape)
print("Y_test_text:", Y_test_text.shape)


Loaded data:
X_train: (176, 1000, 12)
Y_train: (176,)
Y_train_text: (176,)

X_test: (528, 1000, 12)
Y_test: (528,)
Y_test_text: (528,)


In [15]:
# split train data into folds for cross validation
k = 5
train_len = X_train.shape[0]
fold_size = train_len // k

# create a list mapping each sample to a fold
folds = np.zeros(train_len, dtype=int)
for i in range(k):
    folds[i*fold_size:(i+1)*fold_size] = i

In [16]:
def split_into_beats(X, Y, Y_text, folds=None, window_size=50):
    X_HB = []
    Y_HB = []
    Y_HB_text = []
    Folds_HB = []
    
    get_text_label = lambda x: label_dict[x]
    
    for i, sample in enumerate(X):
        print("Progress:", i, "/", X.shape[0], end="\r")
        xqrs = processing.XQRS(sig=sample[:,0], fs=sampling_rate)
        xqrs.detect(verbose=False)
        qrs_inds = xqrs.qrs_inds
        for indx in qrs_inds:
            if indx + window_size > sample.shape[0] or indx - window_size < 0:
                continue
            X_HB.append(sample[indx-window_size:indx+window_size])
            Y_HB.append(Y[i])
            Y_HB_text.append(Y_text[i])
            if folds is not None:
                Folds_HB.append(folds[i])
    
    return np.array(X_HB), np.array(Y_HB), np.array(Y_HB_text), np.array(Folds_HB)

In [17]:
sampling_rate = 100
window_size = int(sampling_rate * 0.5)

X_train_HB, Y_train_HB, Y_train_HB_text, folds_train_HB = split_into_beats(X_train, Y_train, Y_train_text, folds=folds, window_size=window_size)
X_test_HB, Y_test_HB, Y_test_HB_text, _ = split_into_beats(X_test, Y_test, Y_test_text, window_size=window_size)

print(f"Train set: Extracted {X_train_HB.shape[0]} heartbeats from {X_train.shape[0]} samples")
print(f"Test set: Extracted {X_test_HB.shape[0]} heartbeats from {X_test.shape[0]} samples")

Train set: Extracted 2624 heartbeats from 176 samples
Test set: Extracted 7887 heartbeats from 528 samples


In [23]:
# convert RBBB labels to normal
Y_train_HB[Y_train_HB == 2] = 0
Y_test_HB[Y_test_HB == 2] = 0

In [24]:
NP_DATA_HB = 'data/DAT_China_1s_025.npz'

save_args = {
    'X_train': X_train_HB,
    'Y_train': Y_train_HB,
    'Y_train_text': Y_train_HB_text,
    'folds_train': folds_train_HB,
    'X_test': X_test_HB,
    'Y_test': Y_test_HB,
    'Y_test_text': Y_test_HB_text
}

np.savez(NP_DATA_HB, **save_args)

In [35]:
# load and check
thismodule = sys.modules[__name__]
NP_DATA = 'data/DAT_China_1s_025.npz'

with np.load(NP_DATA, allow_pickle=True) as data:
    for k in data.keys():
        if 'text' in k:
            setattr(thismodule, k, data[k])
        else:
            setattr(thismodule, k, data[k].astype(float))
            
print("Train set:")
print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)
print("Y_train_text:", Y_train_text.shape)
print("Folds_train:", folds_train.shape)

print("\nTest set:")
print("X_test:", X_test.shape)
print("Y_test:", Y_test.shape)
print("Y_test_text:", Y_test_text.shape)

print(np.unique(folds_train, return_counts=True))

Train set:
X_train: (2624, 100, 12)
Y_train: (2624,)
Y_train_text: (2624,)
Folds_train: (2624,)

Test set:
X_test: (7887, 100, 12)
Y_test: (7887,)
Y_test_text: (7887,)
(array([0., 1., 2., 3., 4.]), array([480, 539, 544, 529, 532]))


In [None]:
# label dist of each fold  
# print counts of each label in each fold
for i in range(5):
    print(f'Fold {i} label counts: {np.unique(Y_train[folds_train == i], return_counts=True)}')
    

Fold 0 size: 480
Fold 0 true labels: 148.0
Fold 1 size: 539
Fold 1 true labels: 144.0
Fold 2 size: 544
Fold 2 true labels: 74.0
Fold 3 size: 529
Fold 3 true labels: 110.0
Fold 4 size: 532
Fold 4 true labels: 120.0
Fold 0 label counts: (array([0., 1.]), array([332, 148]))
Fold 1 label counts: (array([0., 1.]), array([395, 144]))
Fold 2 label counts: (array([0., 1.]), array([470,  74]))
Fold 3 label counts: (array([0., 1.]), array([419, 110]))
Fold 4 label counts: (array([0., 1.]), array([412, 120]))


# Apply SMOTE

In [28]:
from imblearn.over_sampling import SMOTE

In [36]:
# convert the labels to binary
Y_train = np.where(Y_train == 1, True, False)
Y_test = np.where(Y_test == 1, True, False)

In [37]:
# smote per fold

# empty arrays to store the resampled data of shape (0, 100, 12)
Resampled_X_train = np.array([], dtype=np.float32).reshape(0, 100, 12)
Resampled_Y_train = np.array([], dtype=bool).reshape(0,)
Resampled_folds_train = np.array([], dtype=int).reshape(0,)

new_X_train = np.array([], dtype=np.float32).reshape(0, 100, 12)

for k in range(5):
    # get the fold samples + labels
    X_fold = X_train[folds_train == k]
    Y_fold = Y_train[folds_train == k]
    X_fold_flattened = X_fold.reshape((X_fold.shape[0], -1))
    
    # resample the fold
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled_fold_flattened, y_resampled_fold = smote.fit_resample(X_fold_flattened, Y_fold)
    X_resampled_fold = X_resampled_fold_flattened.reshape((-1, 100, 12))
    
    new_X = X_resampled_fold[X_fold.shape[0]:]
    
    folds = np.full((X_resampled_fold.shape[0],), k)

    print(f"\nFold {k} shape:           {X_fold.shape}")
    print(f"Resampled Fold {k} shape: {X_resampled_fold.shape}")
    print(f"Fold {k} class distribution:           {np.bincount(Y_fold)}")
    print(f"Resampled Fold {k} class distribution: {np.bincount(y_resampled_fold)}")

    
    # concatenate the resampled fold with the original data
    Resampled_X_train = np.concatenate([Resampled_X_train, X_resampled_fold])
    Resampled_Y_train = np.concatenate([Resampled_Y_train, y_resampled_fold])
    Resampled_folds_train = np.concatenate([Resampled_folds_train, folds])
    
    new_X_train = np.concatenate([new_X_train, new_X])
    
    
print(X_train.shape, Y_train.shape, folds_train.shape)
print(Resampled_X_train.shape, Resampled_Y_train.shape, Resampled_folds_train.shape)
print(new_X_train.shape)


Fold 0 shape:           (480, 100, 12)
Resampled Fold 0 shape: (664, 100, 12)
Fold 0 class distribution:           [332 148]
Resampled Fold 0 class distribution: [332 332]

Fold 1 shape:           (539, 100, 12)
Resampled Fold 1 shape: (790, 100, 12)
Fold 1 class distribution:           [395 144]
Resampled Fold 1 class distribution: [395 395]

Fold 2 shape:           (544, 100, 12)
Resampled Fold 2 shape: (940, 100, 12)
Fold 2 class distribution:           [470  74]
Resampled Fold 2 class distribution: [470 470]

Fold 3 shape:           (529, 100, 12)
Resampled Fold 3 shape: (838, 100, 12)
Fold 3 class distribution:           [419 110]
Resampled Fold 3 class distribution: [419 419]

Fold 4 shape:           (532, 100, 12)
Resampled Fold 4 shape: (824, 100, 12)
Fold 4 class distribution:           [412 120]
Resampled Fold 4 class distribution: [412 412]
(2624, 100, 12) (2624,) (2624,)
(4056, 100, 12) (4056,) (4056,)
(1432, 100, 12)


In [38]:
# save the resampled data per fold
np.savez_compressed('data/DAT_China_1s_025_SMOTE.npz',
                    X_train=X_train, Y_train=Y_train,
                    X_train_resampled=Resampled_X_train, Y_train_resampled=Resampled_Y_train,
                    folds_train=folds_train, folds_train_resampled=Resampled_folds_train,
                    X_test=X_test, Y_test=Y_test)