# Create arrays

This notebook reads the *PTB-XL* data and converts the useful parts into NumPy arrays, one per stratified fold. The arrays are later uploaded to my personal Drive so they can be accessed from Google Colab.

This is done to reduce the computing time associated with loading the data.

## Download the data

If you have not done so already, download the PhysioNet data. I ran this in the terminal, but the command is included here for completeness.

In [None]:
# Mirror PTB-XL v1.0.2 from PhysioNet. Flags: -r recurse into the directory,
# -N skip files that are already up to date locally, -c resume partial downloads,
# -np never ascend above the given directory.
# NOTE: a recursive wget writes into ./physionet.org/files/ptb-xl/1.0.2/; the cells
# below expect that folder to have been renamed to "data".
!wget -r -N -c -np https://physionet.org/files/ptb-xl/1.0.2/

## Import required libraries

In [1]:
import ast
import os

import numpy as np
import pandas as pd
import sklearn
import wfdb
import wfdb.processing  # used as wfdb.processing.* below; `import wfdb` alone may not load it
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder

## Load signal data

In [None]:
# Root folder of the PTB-XL files. The folder produced by the PhysioNet download
# was renamed by hand to "data".
path = "data/"

# Record-level metadata: one row per ECG, indexed by its ecg_id.
Y = pd.read_csv(path + 'ptbxl_database.csv', index_col='ecg_id')
# scp_codes is stored in the CSV as the text of a Python dict literal;
# parse each cell back into a real dict.
Y['scp_codes'] = Y['scp_codes'].apply(ast.literal_eval)

# SCP statement table used for the diagnostic aggregation, restricted to the
# rows whose `diagnostic` flag equals 1.
agg_df = (
    pd.read_csv(path + 'scp_statements.csv', index_col=0)
    .loc[lambda statements: statements['diagnostic'] == 1]
)

def load_raw_data(df, path):
    """Read every record listed in ``df.filename_lr`` with ``wfdb.rdsamp``.

    Parameters
    ----------
    df : table with a ``filename_lr`` column holding record paths relative to ``path``.
    path : str
        Prefix that is concatenated (plain string ``+``) in front of each record path.

    Returns
    -------
    list
        One entry per row of ``df``, in row order. Each entry is exactly what
        ``wfdb.rdsamp`` returned for that record; elsewhere in this notebook that
        value is unpacked as a ``(signal, fields)`` pair, so callers that only
        need the samples must take element 0 of each entry.
    """
    records = []
    for relative_name in df.filename_lr:
        records.append(wfdb.rdsamp(path + relative_name))
    return records

# Read every record listed in Y.filename_lr; X[k] is whatever wfdb.rdsamp
# returned for the k-th row of Y (one file read per row).
X = load_raw_data(Y, path)

## Save signal data

In [None]:
# X is a plain Python list with one (signal, fields) pair per row of Y, so it
# cannot be indexed with the tuple returned by np.where. For each fold, pick the
# row positions that belong to it and stack only the signal part of those records.
os.makedirs("data/nparrays", exist_ok=True)  # np.save does not create missing folders
fold_of_record = Y.strat_fold.to_numpy()

for i in range(1, 11):
    name = f"data/nparrays/{str(i).zfill(2)}"
    positions = np.flatnonzero(fold_of_record == i)
    # Element 0 of each record is the sample matrix; element 1 (the header
    # fields) is not saved.
    X_fold = np.array([X[pos][0] for pos in positions])
    np.save(name, X_fold)

## Aggregate the diagnostic

In [2]:
def aggregate_diagnostic(y_dic):
    """Map the SCP codes of one record to their diagnostic superclasses.

    Parameters
    ----------
    y_dic : dict
        Only the keys are used; each key is looked up in the index of the
        module-level ``agg_df`` table.

    Returns
    -------
    list
        The distinct ``diagnostic_class`` values of the keys found in
        ``agg_df.index`` (no guaranteed order). Keys that are not in the table
        are ignored, so the list is empty when none of them match.
    """
    superclasses = {
        agg_df.loc[code].diagnostic_class
        for code in y_dic
        if code in agg_df.index
    }
    return list(superclasses)

# Add a `diagnostic_superclass` column to Y: for every record, the list of
# superclasses derived from its scp_codes dict (an empty list when none of its
# codes appear in agg_df). This modifies Y in place.
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

## Save label data

In [17]:
# Multi-hot encoder with a fixed column order (NORM, MI, STTC, CD, HYP), so every
# fold is encoded with the same five columns regardless of which classes it contains.
# MultiLabelBinarizer is imported in the import cell at the top of the notebook.
enc = MultiLabelBinarizer(classes = ["NORM", "MI", "STTC", "CD", "HYP"])
os.makedirs("data/nparrays/labels", exist_ok=True)  # np.save does not create missing folders

for i in range(1, 11):
    name = f"data/nparrays/labels/{str(i).zfill(2)}"
    fold_labels = Y.loc[Y.strat_fold == i, "diagnostic_superclass"].tolist()
    # The class list is fixed above, so fitting again on each fold does not
    # change the encoding; records with an empty label list become all-zero rows.
    fold_targets = enc.fit_transform(fold_labels)

    np.save(name, fold_targets)

## Compute heart rate (beats per minute)

In [None]:
def compute_bpm(df, path):
    """Estimate a mean heart rate for every channel of every record in ``df``.

    Each record listed in ``df.filename_lr`` is read again from disk with
    ``wfdb.rdsamp``. For each channel, QRS complexes are located with
    ``wfdb.processing.XQRS``, the detected indices are turned into RR intervals
    with ``calc_rr``, and those into a mean heart rate with ``calc_mean_hr``.

    Parameters
    ----------
    df : table with a ``filename_lr`` column holding record paths relative to ``path``.
    path : str
        Prefix that is concatenated in front of each record path.

    Returns
    -------
    list of list
        One inner list per record, in row order, holding one
        ``calc_mean_hr`` value per channel of that record.
    """
    HB = []
    for idx, f in enumerate(df.filename_lr):
        sig, fields = wfdb.rdsamp(path + f)
        fs = fields['fs']

        bpm = []
        # Use the number of channels the record actually has instead of assuming 12.
        for ch_idx in range(sig.shape[1]):
            detector = wfdb.processing.XQRS(sig=sig[:, ch_idx], fs=fs)
            detector.detect(verbose=False)
            rr_intervals = wfdb.processing.calc_rr(detector.qrs_inds)
            bpm.append(wfdb.processing.calc_mean_hr(rr_intervals, fs=fs))

        # Progress message every 100 records.
        if idx % 100 == 0:
            print(f"Processed {idx} samples")
        HB.append(bpm)
    return HB

# Heart-rate estimates for every row of Y: a list with one inner list per record.
# Every record is re-read from disk inside compute_bpm.
BPM = compute_bpm(Y, path)

## Save heart rate data

In [None]:
# compute_bpm returns a plain list of per-record lists, which cannot be indexed
# with an array of positions; convert it to an array (one row per record) first.
bpm_all = np.asarray(BPM)
os.makedirs("data/nparrays/bpm", exist_ok=True)  # np.save does not create missing folders
fold_of_record = Y.strat_fold.to_numpy()

for i in range(1, 11):
    name = f"data/nparrays/bpm/{str(i).zfill(2)}"
    # Boolean mask over the rows of Y; BPM was built in the same row order.
    X_ = bpm_all[fold_of_record == i]
    np.save(name, X_)