#### Convert the original SPH dataset files to CSV format.
 - The original files are in HDF5 format.
 - Each file represents an ECG exam and has an array of shape 12 x L, where 12 is the number of leads and L is the number of samples.
 - The number of samples in each file ranges from 5000 to 30000 (records lasting 10 to 60 seconds at a sample frequency of 500 Hz).

#### Import necessary libraries.

In [1]:

import pandas as pd
import csv
import h5py
import glob
import os
import numpy as np


#### Generate the CSV file.
- All 25770 ECGs were used.
- Only the first 10 seconds of each ECG were used.


In [2]:

def get_arrhythmia_codes(dataframe):
    """Index the metadata by primary arrhythmia code and by ECG id.

    For every metadata row, the "AHA_Code" text is reduced to its primary
    statement: everything before the first ";" and, within that, everything
    before the first "+" (i.e. without modifiers).

    Parameters:
        dataframe (pandas.DataFrame): metadata with "ECG_ID" and "AHA_Code"
            columns; "AHA_Code" values must be strings.

    Return:
        tuple of two dicts:
            - primary code (str) -> list of ECG ids carrying that code,
              in row order;
            - ECG id -> primary code (str). A repeated ECG id keeps the
              code of its last row.
    """
    ecgs_by_code = {}
    code_by_ecg = {}
    for _, record in dataframe.iterrows():
        # Keep only the primary statement, dropping any modifiers.
        primary_code = record["AHA_Code"].partition(";")[0].partition("+")[0]
        ecg_id = record["ECG_ID"]
        ecgs_by_code.setdefault(primary_code, []).append(ecg_id)
        code_by_ecg[ecg_id] = primary_code
    return ecgs_by_code, code_by_ecg

# Build a single '|'-delimited CSV from all HDF5 records.
# Each output row is one time step of one ECG: a running row index, the ECG
# id (file name without ".h5"), the 12 lead values, and the ECG's primary
# arrhythmia code taken from the metadata file.
column_names = ["idx", "ecg_id", "lead1", "lead2", "lead3", "aVR", "aVL", "aVF", "V1", "V2", "V3", "V4", "V5", "V6", "arrhythmia_code"]
number_of_steps = 5000
csv_file_path = "../dataset/csv_files/ecg_sph_dataset.csv"

# Load the h5 files list.
# glob returns paths in arbitrary order; sort them so the row order and the
# "idx" column of the generated CSV are reproducible across runs and machines.
files_list = sorted(glob.glob('../dataset/original_files/records/*.h5'))
print('Number of available files: {}'.format(len(files_list)))

try:
    print("\nLoading metadata...")
    df_metadata = pd.read_csv("../dataset/original_files/metadata.csv", sep=",")
    arrhythmia_codes, ecg_codes = get_arrhythmia_codes(df_metadata)
    if os.path.exists(csv_file_path):
        os.remove(csv_file_path)
    print("\nStart writing ecg_sph_dataset.csv file...\n")
    dict_idx = 0
    # The 12 lead columns sit between "ecg_id" and "arrhythmia_code".
    lead_names = column_names[2:14]
    with open(csv_file_path, "w", newline='') as csvfile:
        csvwriter = csv.DictWriter(csvfile, delimiter='|', fieldnames=column_names)
        csvwriter.writeheader()
        for file_name in files_list:
            ecg_id = os.path.basename(file_name).replace(".h5", "")
            # The code is the same for every row of this ECG: look it up once.
            arrhythmia_code = int(ecg_codes[ecg_id])
            with h5py.File(file_name, 'r') as file:
                # Use the first 10 seconds (5000 samples) of each ECG.
                # Slicing the dataset reads only that window from disk.
                signal = file['ecg'][:12, :number_of_steps]
            if signal.shape != (12, number_of_steps):
                # Fail with a clear message instead of an opaque IndexError.
                raise ValueError(
                    "ECG {} has shape {}, expected at least (12, {}).".format(
                        ecg_id, signal.shape, number_of_steps))
            # Rows become time steps, columns become leads (float64 values).
            signal_matrix = np.asarray(signal, dtype=np.float64).T
            for step_values in signal_matrix:
                dict_aux = dict(zip(lead_names, step_values))
                dict_aux["idx"] = dict_idx
                dict_aux["ecg_id"] = ecg_id
                dict_aux["arrhythmia_code"] = arrhythmia_code
                dict_idx += 1
                csvwriter.writerow(dict_aux)
                if dict_idx % 100000 == 0:
                    print("{} samples processed.".format(dict_idx))
    print("\nFinish writing ecg_sph_dataset.csv file...")
except Exception as e:
    # Top-level boundary of the notebook cell: report the failure and stop.
    # A partially written CSV may be left on disk in this case.
    print("\nFail to generate CSV file.")
    print("Error: {}".format(e))


Number of available files: 25770

Loading metadata...

Start writing ecg_sph_dataset.csv file...

100000 samples processed.
200000 samples processed.
300000 samples processed.
400000 samples processed.
500000 samples processed.
600000 samples processed.
700000 samples processed.
800000 samples processed.
900000 samples processed.
1000000 samples processed.
1100000 samples processed.
1200000 samples processed.
1300000 samples processed.
1400000 samples processed.
1500000 samples processed.
1600000 samples processed.
1700000 samples processed.
1800000 samples processed.
1900000 samples processed.
2000000 samples processed.
2100000 samples processed.
2200000 samples processed.
2300000 samples processed.
2400000 samples processed.
2500000 samples processed.
2600000 samples processed.
2700000 samples processed.
2800000 samples processed.
2900000 samples processed.
3000000 samples processed.
3100000 samples processed.
3200000 samples processed.
3300000 samples processed.
3400000 samples proc