In [1]:
# Required Packages
import numpy as np
import pandas as pd
import wfdb
import pywt
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
import random
random.seed(42)

# Directory prefix for the MIT-BIH records; record names are appended to it
# verbatim, so the trailing slash is required.
data = 'mitdb/'

# Record identifiers to load (one per recording)
patients = (
    '100 101 102 103 104 105 106 107 '
    '108 109 111 112 113 114 115 116 '
    '117 118 119 121 122 123 124 200 '
    '201 202 203 205 207 208 209 210 '
    '212 213 214 215 217 219 220 221 '
    '222 223 228 230 231 232 233 234'
).split()

# Beat annotation symbols: anything in `abnormal` is labelled 1, 'N' is labelled 0.
# `normal` is defined for reference; the dataset code in this notebook spells
# the 'N' symbol out directly rather than reading this list.
abnormal = list('LRV/AfFjaEJeS')
normal = ['N']

# Wavelet denoising
def wavelet_denoising(signal, wavelet='db6', level=1):
    """Denoise a 1-D signal by soft-thresholding its wavelet detail coefficients.

    Parameters
    ----------
    signal : sequence or 1-D array
        Samples to denoise.
    wavelet : str
        Wavelet name forwarded to ``pywt.wavedec`` / ``pywt.waverec``.
    level : int
        Selects the detail band used for the noise estimate as
        ``coeff[-level]`` (1 = last entry returned by ``wavedec``). It does
        NOT set the decomposition depth: ``wavedec`` is called without a
        ``level`` argument.

    Returns
    -------
    numpy.ndarray
        Reconstructed signal, trimmed to ``len(signal)`` samples.
    """
    n_samples = len(signal)
    coeff = pywt.wavedec(signal, wavelet, mode='per')

    # Noise scale from the median absolute coefficient of the chosen detail band.
    sigma = np.median(np.abs(coeff[-level])) / 0.6745
    # Threshold grows with sqrt(2 * ln(n)) of the signal length.
    uthresh = sigma * np.sqrt(2 * np.log(n_samples))

    # coeff[0] (approximation) is left untouched; only detail bands are shrunk.
    coeff[1:] = [pywt.threshold(i, value=uthresh, mode='soft') for i in coeff[1:]]

    # Periodization pads odd-length input to an even length, so the
    # reconstruction can come back one sample longer than the input.
    # Trim so callers always get an output aligned with `signal`.
    return pywt.waverec(coeff, wavelet, mode='per')[:n_samples]

# Apply denoising to dataset
def denoise_ecg_signals(signals):
    """Run ``wavelet_denoising`` (default settings) over each signal.

    ``signals`` is iterated once; the denoised results are collected in
    order and returned as a single NumPy array.
    """
    cleaned = []
    for one_signal in signals:
        cleaned.append(wavelet_denoising(one_signal))
    return np.array(cleaned)

# Load ECG signal & annotation
def load_ecg(file):
    """Read one WFDB record together with its 'atr' annotation file.

    Parameters
    ----------
    file : str
        Record path without extension, as accepted by ``wfdb.rdrecord``.

    Returns
    -------
    tuple
        ``(signal, symbols, samples)`` where ``signal`` is column 0 of the
        record's ``p_signal`` and the other two are the annotation's
        ``symbol`` and ``sample`` attributes.
    """
    rec = wfdb.rdrecord(file)
    ann = wfdb.rdann(file, 'atr')
    # Only the first channel of the recording is used.
    first_channel = rec.p_signal[:, 0]
    return first_channel, ann.symbol, ann.sample

# Extract X (ECG segment), Y (label), sym (beat symbol)
def build_XY(p_signal, df_ann, num_cols, abnormal):
    """Cut a fixed-width window around every annotated beat.

    Parameters
    ----------
    p_signal : 1-D array
        Signal samples to slice windows from.
    df_ann : pandas.DataFrame
        Must expose ``atr_sample`` (sample index of the beat) and
        ``atr_sym`` (beat symbol) columns.
    num_cols : int
        Window width in samples. The window covers ``num_cols // 2`` samples
        before the annotation and the remaining samples from the annotation
        onward.
    abnormal : container of str
        Symbols that receive label 1; every other symbol receives label 0.

    Returns
    -------
    X : numpy.ndarray, shape (n_kept, num_cols)
        One window per kept beat.
    Y : numpy.ndarray, shape (n_kept, 1)
        1.0 for abnormal beats, 0.0 otherwise.
    sym : list of str
        Symbol of each kept beat, aligned with the rows of X and Y.

    Beats whose window would run past either end of ``p_signal`` are skipped.
    """
    # Derive the window from the num_cols argument so the function does not
    # depend on notebook-level globals that may disagree with the caller.
    before = num_cols // 2
    after = num_cols - before
    signal_len = len(p_signal)

    num_rows = len(df_ann)
    X = np.zeros((num_rows, num_cols))
    Y = np.zeros((num_rows, 1))
    sym = []
    max_row = 0  # next free row; also the count of beats kept so far

    for atr_sample, atr_sym in zip(df_ann.atr_sample.values, df_ann.atr_sym.values):
        # Clamp to the signal bounds; a clamped window comes out short and
        # is rejected by the length check below.
        left = max(0, atr_sample - before)
        right = min(signal_len, atr_sample + after)
        x = p_signal[left:right]
        if len(x) == num_cols:
            X[max_row, :] = x
            Y[max_row, :] = int(atr_sym in abnormal)
            sym.append(atr_sym)
            max_row += 1

    # Drop the pre-allocated rows that were never filled.
    return X[:max_row, :], Y[:max_row, :], sym

# Create dataset from all patients
def make_dataset(patients, num_sec, fs, abnormal):
    """Build the beat-window dataset across all requested records.

    Parameters
    ----------
    patients : iterable of str
        Record names; each is appended to the module-level ``data`` prefix.
    num_sec : int
        Seconds of signal on each side of a beat.
    fs : int
        Samples per second; window width is ``2 * num_sec * fs`` samples.
    abnormal : iterable of str
        Beat symbols labelled as abnormal. Annotations whose symbol is
        neither in this collection nor ``'N'`` are discarded.

    Returns
    -------
    X_all : numpy.ndarray, shape (n_beats, 2 * num_sec * fs)
    Y_all : numpy.ndarray, shape (n_beats, 1)
    sym_all : list of str

    When no beat is collected (empty ``patients`` or no matching
    annotations) correctly shaped empty arrays are returned instead of
    letting ``np.vstack`` fail on an empty list.
    """
    num_cols = 2 * num_sec * fs
    # list(...) lets callers pass a tuple or set as well as a list.
    keep_symbols = list(abnormal) + ['N']
    X_all = []
    Y_all = []
    sym_all = []

    for pt in patients:
        file = data + pt
        p_signal, atr_sym, atr_sample = load_ecg(file)
        df_ann = pd.DataFrame({'atr_sym': atr_sym, 'atr_sample': atr_sample})
        # Keep only beat annotations we know how to label.
        df_ann = df_ann[df_ann.atr_sym.isin(keep_symbols)]
        X, Y, sym = build_XY(p_signal, df_ann, num_cols, abnormal)
        if len(X) > 0:
            X_all.append(X)
            Y_all.append(Y)
            sym_all += sym

    if not X_all:
        # np.vstack([]) raises ValueError; hand back an empty dataset instead.
        return np.empty((0, num_cols)), np.empty((0, 1)), sym_all

    X_all = np.vstack(X_all)
    Y_all = np.vstack(Y_all)
    return X_all, Y_all, sym_all

# Parameters
# num_sec: seconds of signal taken on each side of a beat annotation.
num_sec = 3
# fs: samples per second, used to turn num_sec into a sample count.
# NOTE(review): presumably the sampling rate of the MIT-BIH records - confirm
# against the record headers.
fs = 360

# Prepare dataset: one row per kept beat, 2 * num_sec * fs = 2160 columns wide.
X_all, Y_all, sym_all = make_dataset(patients, num_sec, fs, abnormal)

# Denoise each beat window independently (default wavelet settings).
X_all_denoised = denoise_ecg_signals(X_all)

# Normalize
# NOTE(review): MinMaxScaler is documented to rescale each feature (column)
# separately, i.e. each time index is scaled across all beats rather than each
# beat being scaled to its own min/max. Confirm this is the intended
# normalization; the fitted `scaler` must also be reused for any new data.
scaler = MinMaxScaler()
X_all_normalized = scaler.fit_transform(X_all_denoised)

In [2]:
import os
import pickle
import numpy as np
import pandas as pd

# Create directory if it doesn't exist
# (exist_ok=True makes re-running this cell safe).
save_dir = 'preprocessed_data'
os.makedirs(save_dir, exist_ok=True)

# Save normalized signal data
# NOTE(review): only X_all_normalized and sym_all are written by this cell;
# Y_all and the fitted scaler are not persisted here - confirm that is intended.
np.save(os.path.join(save_dir, 'X_all_normalized.npy'), X_all_normalized)

# Save beat symbols (as strings)
# Pickle keeps sym_all as a Python list, row-aligned with X_all_normalized.
with open(os.path.join(save_dir, 'sym_all.pkl'), 'wb') as f:
    pickle.dump(sym_all, f)

# Optional: save symbols as CSV
# Same content as the pickle, one symbol per row under a 'beat_symbol' header.
pd.DataFrame(sym_all, columns=['beat_symbol']).to_csv(os.path.join(save_dir, 'sym_all.csv'), index=False)

In [3]:
# Load normalized signals
# This cell has no imports of its own: it relies on `np` and `pickle` having
# been imported by an earlier cell. The paths below repeat the
# 'preprocessed_data' directory name used when saving; keep them in sync.
X_all_normalized = np.load('preprocessed_data/X_all_normalized.npy')

# Load beat symbols
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open('preprocessed_data/sym_all.pkl', 'rb') as f:
    sym_all = pickle.load(f)