In [2]:
import os
import scipy.io
from tqdm import tqdm

In [3]:
# Copy every ECG record whose first listed diagnosis (Dx) code is one of the
# selected classes from main_directory into save_directory, preserving the
# sub-folder layout, and keep the loaded contents in loaded_data.
main_directory = r'E:\Coding\Jupyter_files\ECG_2\Original_data'
save_directory = r'E:\Coding\Jupyter_files\ECG_2\Data_saves\Selected_data'

# The bar below advances once per file of any type, so count every file.
total_files = sum(len(files) for _, _, files in os.walk(main_directory))

# Keyed by bare file name:
#   '<record>.hea' -> header text
#   '<record>.mat' -> dict returned by scipy.io.loadmat
# NOTE(review): records with the same file name in different sub-folders would
# overwrite each other here -- confirm names are unique across the tree.
loaded_data = {}

# Selected classes (Dx code = number of files)
# Sinus rhythm                  # Dx: 426783006 = 10629 files
# Myocardial infarction         # Dx: 164865005 = 4904 files
# Right bundle branch block     # Dx: 59118001  = 2090 files
# Atrial fibrillation           # Dx: 164889003 = 2033 files
# Nonspecific st t abnormality  # Dx: 428750005 = 2201 files
# Myocardial ischemia           # Dx: 164861001 = 2389 files

# Target disease codes
target_codes = {
    '426783006',
    '164865005',
    '59118001',
    '164889003',
    '428750005',
    '164861001'
}


def _first_dx_code(header_text):
    """Return the first code on the first '# Dx' line of header_text.

    Returns None when there is no '# Dx' line, or when that line has no
    ': ' separator (instead of raising IndexError and aborting the run).
    Only the first comma-separated code is considered.
    """
    for line in header_text.split('\n'):
        if line.startswith('# Dx'):
            parts = line.split(': ')
            if len(parts) < 2:
                return None
            return parts[1].strip().split(',')[0].strip()
    return None


# The context manager closes the bar even if the loop raises.
with tqdm(total=total_files, desc="Data is loading...") as progress_bar:
    for root, dirs, files in os.walk(main_directory):
        for filename in files:
            if filename.endswith('.hea'):
                # Swap only the extension; str.replace would also rewrite any
                # other '.hea' occurrence inside the name.
                mat_filename = os.path.splitext(filename)[0] + '.mat'
                hea_file_path = os.path.join(root, filename)
                mat_file_path = os.path.join(root, mat_filename)

                # Read the header once; the same text is parsed, cached and
                # written to the destination.
                with open(hea_file_path, 'r') as f:
                    hea_data = f.read()

                if _first_dx_code(hea_data) in target_codes:
                    if os.path.exists(mat_file_path):
                        mat_data = None
                        try:
                            mat_data = scipy.io.loadmat(mat_file_path)
                        except Exception as e:
                            print(f"Error loading .mat file {mat_file_path}: {e}")

                        if mat_data is not None:
                            loaded_data[filename] = hea_data
                            loaded_data[mat_filename] = mat_data

                            # Recreate the record's sub-folder under save_directory.
                            relative_path = os.path.relpath(root, main_directory)
                            save_path = os.path.join(save_directory, relative_path)
                            try:
                                os.makedirs(save_path, exist_ok=True)

                                save_hea_path = os.path.join(save_path, filename)
                                with open(save_hea_path, 'w') as f:
                                    f.write(hea_data)

                                save_mat_path = os.path.join(save_path, mat_filename)
                                scipy.io.savemat(save_mat_path, mat_data)
                            except Exception as e:
                                # Reported separately so a write failure is not
                                # mistaken for a load failure.
                                print(f"Error saving record {filename} to {save_path}: {e}")
                    else:
                        print(f".mat file does not exist: {mat_file_path}")

            progress_bar.update(1)


Data is loading...: 100%|███████████████████████████████████████████████████████| 86100/86100 [04:26<00:00, 323.40it/s]


In [4]:
#DEBUGGING
# Sanity check on loaded_data: report how many records were kept, list the
# first key pairs, and count keys per extension.

print(f"Total entries loaded: {len(loaded_data) // 2}")
print("First 50 keys in loaded_data:")
keys = list(loaded_data.keys())
# Keys are inserted as consecutive (.hea, .mat) pairs, so step by two and show
# up to 50 pairs. The upper bound stops before a trailing unpaired key, so
# keys[idx + 1] can never run past the end of the list.
for idx in range(0, min(100, len(keys) - 1), 2):
    print(keys[idx], keys[idx + 1])

# Tally keys by extension; the two counts should match if every header has
# its signal file.
mat_files_count = sum(key.endswith('.mat') for key in loaded_data)
hea_files_count = sum(key.endswith('.hea') for key in loaded_data)

print(f"Number of .mat files: {mat_files_count}")
print(f"Number of .hea files: {hea_files_count}")

Total entries loaded: 24246
First 50 keys in loaded_data:
A0001.hea A0001.mat
A0002.hea A0002.mat
A0003.hea A0003.mat
A0004.hea A0004.mat
A0006.hea A0006.mat
A0007.hea A0007.mat
A0009.hea A0009.mat
A0010.hea A0010.mat
A0015.hea A0015.mat
A0016.hea A0016.mat
A0017.hea A0017.mat
A0019.hea A0019.mat
A0020.hea A0020.mat
A0022.hea A0022.mat
A0023.hea A0023.mat
A0026.hea A0026.mat
A0027.hea A0027.mat
A0028.hea A0028.mat
A0029.hea A0029.mat
A0030.hea A0030.mat
A0035.hea A0035.mat
A0037.hea A0037.mat
A0038.hea A0038.mat
A0041.hea A0041.mat
A0043.hea A0043.mat
A0051.hea A0051.mat
A0053.hea A0053.mat
A0059.hea A0059.mat
A0061.hea A0061.mat
A0064.hea A0064.mat
A0065.hea A0065.mat
A0066.hea A0066.mat
A0068.hea A0068.mat
A0069.hea A0069.mat
A0071.hea A0071.mat
A0073.hea A0073.mat
A0075.hea A0075.mat
A0078.hea A0078.mat
A0079.hea A0079.mat
A0083.hea A0083.mat
A0085.hea A0085.mat
A0086.hea A0086.mat
A0087.hea A0087.mat
A0089.hea A0089.mat
A0090.hea A0090.mat
A0094.hea A0094.mat
A0095.hea A0095.mat
A0