In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast
import ecg_plot
import os
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Functions to prepare data

Reading data has 2 parts to it.

1. Reading the csv file which has patients demographic information and diagnostic statements.
2. Reading the wave form based on the specified sampling frequency, they are storing in signal formats, we use wfdb to read these kind of signals

In [2]:
def load_raw_data(df, sampling_rate, path):
    '''
    Function to read signal data based on the sampling rate
    '''
    if sampling_rate == 100:
        data = [wfdb.rdsamp(path+f) for f in tqdm.tqdm(df.filename_lr)]
    else:
        data = [wfdb.rdsamp(path+f) for f in tqdm.tqdm(df.filename_hr)]
    data = np.array([signal for signal, meta in data])
    return data

In [3]:
def aggregate_supclass_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))

In [4]:
def aggregate_subclass_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_subclass)
    ret = list(set(tmp))
    ret = ['sub_'+r for r in ret] # to distinguish between subclass and superclass columns
    return ret

In [5]:
class ClassUpdate():
    def __init__(self, cols):
        self.cols = cols

    def __call__(self, row):
        for sc in row['diagnostic_superclass']:
            row[sc] = 1
        for sc in row['diagnostic_subclass']:
            row[sc] = 1
            
        return row

In [6]:
def get_data_by_folds(folds, x, y, update_cols, feature_cols):
    assert len(folds)  > 0, '# of provided folds should longer than 1'
    #print(y.strat_fold)
    filt = np.isin(y.strat_fold.values, folds)
    x_selected = x[filt]
    y_selected = y[filt]
    
    for sc in update_cols:
        y_selected[sc] = 0
        
    cls_updt = ClassUpdate(update_cols)
    
    y_selected = y_selected.apply(cls_updt, axis=1)
    
    return x_selected, y_selected[list(feature_cols)+list(update_cols)+['strat_fold']]

## Reading csv signal files

In [7]:
path = ''
data = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id')
# SCP codes are stored as string, so converting them to dictionary using ast.literal_eval
# This function evaluates the original string type and converts the input to the same
data.scp_codes = data.scp_codes.apply(lambda x: ast.literal_eval(x))
print(" shape of data ", data.shape)

 shape of data  (21837, 27)


### Reading signal files

In [8]:
sampling_rate=100
signal_data = load_raw_data(data, sampling_rate, path)
print('shape of signal data', signal_data.shape)

100%|██████████| 21837/21837 [00:30<00:00, 724.88it/s]


shape of signal data (21837, 1000, 12)


## Reading SCP statements

In [9]:
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)
print(agg_df.shape)
agg_df.head()
agg_df = agg_df[agg_df.diagnostic == 1]
print(agg_df.shape)

(71, 12)
(44, 12)


## Diagnostic super class and subclass aggregation

In [10]:
data['diagnostic_superclass'] = data.scp_codes.apply(aggregate_supclass_diagnostic)
data['diagnostic_superclass_len'] = data['diagnostic_superclass'].apply(len)
# data.loc[data.diagnostic_superclass_len > 1, 'diagnostic_superclass']

In [11]:
Y = data
Y['diagnostic_subclass'] = Y.scp_codes.apply(aggregate_subclass_diagnostic)
Y['diagnostic_subclass_len'] = Y['diagnostic_subclass'].apply(len)
# Y.loc[Y.diagnostic_subclass_len > 1, 'diagnostic_subclass']

In [12]:
all_superclass = pd.Series(np.concatenate(Y['diagnostic_superclass'].values))
all_subclass = pd.Series(np.concatenate(Y['diagnostic_subclass'].values))
superclass_cols = all_superclass.unique()
subclass_cols = all_subclass.unique()
update_cols = np.concatenate([superclass_cols, subclass_cols]) # add meta data columns
meta_cols = ['age', 'sex', 'height', 'weight', 'nurse', 'site', 'device',] # could add more columns as features

In [13]:
X = signal_data
x_all, y_all = get_data_by_folds(np.arange(1, 11), X, Y, update_cols, meta_cols)

# Train-Valid-Test Set Splitting

According to source it is recommended 10-fold train-test splits (strat_fold) obtained via stratified sampling while respecting patient assignments, i.e. all records of a particular patient were assigned to the same fold. Records in fold 9 and 10 underwent at least one human evaluation and are therefore of a particularly high label quality. We therefore propose to use folds 1-8 as training set, fold 9 as validation set and fold 10 as test set.

Here, we will split compile fold 1-8 as train sets, fold 9 as validation set, and fold 10 as test set.

## Train

In [14]:
x_train, y_train = get_data_by_folds(np.arange(1, 9), X, Y, update_cols, meta_cols)
print('data shape', x_train.shape)

data shape (17441, 1000, 12)


## Valid

In [15]:
x_valid, y_valid = get_data_by_folds([9], X, Y, update_cols, meta_cols)
print('data shape', x_valid.shape)

data shape (2193, 1000, 12)


## Test

In [16]:
x_test, y_test = get_data_by_folds([10], X, Y, update_cols, meta_cols)
print('data shape', x_test.shape)

data shape (2203, 1000, 12)


## Saving the generated data

Saving the generated split data in csv files to reuse them in future

In [17]:
id_cols = ['ecg_id']
channel_cols = ['channel-{}'.format(i) for i in range(12)]

y_train.to_csv('train_meta.csv', index=True)
y_train_signal = pd.DataFrame(columns=id_cols+channel_cols, index=np.arange(y_train.shape[0]*1000), dtype=np.float32)

ecg_ids = []
signals = []
for i, ecg_id in enumerate(y_train.index.values):
    y_train_signal.loc[i*1000:(i+1)*1000-1, 'ecg_id'] = [ecg_id]*1000
    y_train_signal.loc[i*1000:(i+1)*1000-1, channel_cols] = x_train[i,:,:]

y_train_signal['ecg_id'] = y_train_signal['ecg_id'].astype(np.int)
y_train_signal.to_csv('train_signal.csv', index=False)

In [18]:
y_valid.to_csv('valid_meta.csv', index=True)
y_valid_signal = pd.DataFrame(columns=id_cols+channel_cols, index=np.arange(y_valid.shape[0]*1000), dtype=np.float32)

ecg_ids = []
signals = []
for i, ecg_id in enumerate(y_valid.index.values):
    y_valid_signal.loc[i*1000:(i+1)*1000-1, 'ecg_id'] = [ecg_id]*1000
    y_valid_signal.loc[i*1000:(i+1)*1000-1, channel_cols] = x_valid[i,:,:]

y_valid_signal['ecg_id'] = y_valid_signal['ecg_id'].astype(np.int)
y_valid_signal.to_csv('valid_signal.csv', index=False)

# display(y_valid) 
# display(y_valid_signal)

In [19]:
y_test.to_csv('test_meta.csv', index=True)
y_test_signal = pd.DataFrame(columns=id_cols+channel_cols, index=np.arange(y_test.shape[0]*1000), dtype=np.float32)

ecg_ids = []
signals = []
for i, ecg_id in enumerate(y_test.index.values):
    y_test_signal.loc[i*1000:(i+1)*1000-1, 'ecg_id'] = [ecg_id]*1000
    y_test_signal.loc[i*1000:(i+1)*1000-1, channel_cols] = x_test[i,:,:]

y_test_signal['ecg_id'] = y_test_signal['ecg_id'].astype(np.int)
y_test_signal.to_csv('test_signal.csv', index=False)

# display(y_test) 
# display(y_test_signal)

# Generate images of signals

In [20]:
def generate_ecg_images(data_array, folder):
    if not os.path.exists(folder):
        os.mkdir(folder)
    single_img = os.path.join(folder, 'single_img')
    twelve_plots = os.path.join(folder, 'tweleve_img')

    if not os.path.exists(single_img):
        os.mkdir(single_img)
    if not os.path.exists(twelve_plots):
        os.mkdir(twelve_plots)
    for idx, pt in tqdm.tqdm(enumerate(data_array)):
        ecg_plot.plot(pt.T, sample_rate = 100, show_grid=False, style = 'bw')
        ecg_plot.save_as_png(str(idx),twelve_plots+'/')
        ecg_plot.plot_1(pt.T)
        ecg_plot.save_as_png(str(idx),single_img+'/')

In [21]:
def remove_all_0(target_train):
    target_train = target_train.loc[~((target_train.NORM ==0) & (target_train.MI ==0) 
                & (target_train.STTC ==0) & (target_train.HYP ==0) & (target_train.CD ==0))]
    return target_train

## Reshaping and pre processing signal data

In [22]:
signal_train = y_train_signal.values[:, 1:].reshape(-1, 1000, 12)
signal_valid = y_valid_signal.values[:, 1:].reshape(-1, 1000, 12)
signal_test = y_valid_signal.values[:, 1:].reshape(-1, 1000, 12)
print(signal_train.shape)
print(signal_valid.shape)
print(signal_test.shape)

(17441, 1000, 12)
(2193, 1000, 12)
(2193, 1000, 12)


In [23]:
superclass_cols
target_train = y_train[superclass_cols]
target_valid = y_valid[superclass_cols]
target_test = y_test[superclass_cols]
print(target_train.shape)
print(target_valid.shape)
print(target_test.shape)

(17441, 5)
(2193, 5)
(2203, 5)


## Generating signal images

In [24]:
generate_ecg_images(x_train, 'train_images')
generate_ecg_images(x_test, 'test_images')
generate_ecg_images(x_valid, 'valid_images')

17441it [4:34:32,  1.06it/s] 
2203it [40:09,  1.09s/it]
2193it [39:54,  1.09s/it]


In [25]:
train_folder = (os.path.join('train_images', 'tweleve_img'))
test_folder = (os.path.join('test_images', 'tweleve_img'))
valid_folder = (os.path.join('valid_images', 'tweleve_img'))

In [26]:
target_train['index_img'] = [ str(i)+'.png' for i in range(target_train.shape[0]) ] 
target_test['index_img'] = [ str(i)+'.png' for i in range(target_test.shape[0]) ] 
target_valid['index_img'] = [ str(i)+'.png' for i in range(target_valid.shape[0]) ] 

In [27]:
target_train = remove_all_0(target_train)
target_test = remove_all_0(target_test)
target_valid = remove_all_0(target_valid)

In [28]:
target_train.to_csv('target_train.csv',index=False)
target_valid.to_csv('target_valid.csv',index=False)
target_test.to_csv('target_test.csv',index=False)