In [1]:
!pip install wfdb numpy pandas matplotlib scipy

Collecting wfdb
  Downloading wfdb-4.3.1-py3-none-any.whl.metadata (3.8 kB)
Collecting pandas
  Downloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading wfdb-4.3.1-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, wfdb
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages

In [2]:
import wfdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from scipy.signal import butter, filtfilt
from google.colab import drive

In [3]:
drive.mount('/content/drive')
!unzip -q /content/drive/MyDrive/mit-bih-arrhythmia-database-1.0.0.zip

Mounted at /content/drive


In [4]:
data_path = "mit-bih-arrhythmia-database-1.0.0"

# Build the sorted list of record names from the ".dat" files in data_path:
# the part of each filename before the first dot is kept ("100.dat" -> "100").
records = sorted(
    entry.partition(".")[0]
    for entry in os.listdir(data_path)
    if entry.endswith(".dat")
)

print(records[:10])
print("Total Records:", len(records))

['100', '101', '102', '103', '104', '105', '106', '107', '108', '109']
Total Records: 48


In [5]:
def bandpass_filter(signal, lowcut=0.5, highcut=40, fs=360, order=2, axis=-1):
    """Band-pass filter a signal with a zero-phase Butterworth filter.

    Used to remove noise from ECG signals by keeping only the
    ``lowcut``-``highcut`` band (0.5-40 Hz by default, the useful ECG range).
    The filter is applied forward and backward with ``filtfilt``.

    Parameters
    ----------
    signal : array_like
        Samples to filter.
    lowcut, highcut : float
        Lower and upper edge of the pass band, in Hz.
    fs : float
        Sampling rate in Hz (360 means 360 values recorded per second).
    order : int
        Order passed to ``scipy.signal.butter``.
    axis : int
        Axis of ``signal`` along which to filter. Defaults to the last axis,
        which is what ``filtfilt`` uses when no axis is given.

    Returns
    -------
    numpy.ndarray
        Filtered signal with the same shape as ``signal``.

    Raises
    ------
    ValueError
        If the band does not satisfy ``0 < lowcut < highcut < fs / 2``.
    """
    # Nyquist frequency: the highest frequency that can be captured at rate fs.
    nyq = 0.5 * fs
    if not 0 < lowcut < highcut < nyq:
        raise ValueError(
            f"Invalid pass band: need 0 < lowcut < highcut < fs/2, "
            f"got lowcut={lowcut}, highcut={highcut}, fs={fs}"
        )
    # butter() expects the band edges as fractions of the Nyquist frequency.
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return filtfilt(b, a, signal, axis=axis)

In [6]:
# Accumulators filled across all records: one fixed-length window per annotation
# and the annotation symbol that goes with it.
all_beats = []
all_labels = []

window = 100   # samples kept on each side of the annotated position -> 200 samples per beat

for rec in records:
    print("Processing Record:", rec)
    record = wfdb.rdrecord(f"{data_path}/{rec}")
    annotation = wfdb.rdann(f"{data_path}/{rec}", "atr")

    # Only the first channel is used; NaNs are replaced before filtering.
    signal = np.nan_to_num(record.p_signal[:, 0])

    # Filter
    filtered = bandpass_filter(signal)

    # Normalize (z-score over the whole record). A constant signal has zero
    # standard deviation; skip it instead of dividing by zero and storing NaN beats.
    std = np.std(filtered)
    if std == 0:
        print("Skipping record with flat signal:", rec)
        continue
    normalized = (filtered - np.mean(filtered)) / std

    # Extract beats: a window of 2*window samples around every annotation,
    # whatever its symbol. The bounds are inclusive because the slice
    # [sample-window, sample+window) is still complete when it starts at
    # index 0 or ends exactly at len(normalized).
    for sample, label in zip(annotation.sample, annotation.symbol):
        if sample - window >= 0 and sample + window <= len(normalized):
            beat = normalized[sample - window:sample + window]
            all_beats.append(beat)
            all_labels.append(label)


Processing Record: 100
Processing Record: 101
Processing Record: 102
Processing Record: 103
Processing Record: 104
Processing Record: 105
Processing Record: 106
Processing Record: 107
Processing Record: 108
Processing Record: 109
Processing Record: 111
Processing Record: 112
Processing Record: 113
Processing Record: 114
Processing Record: 115
Processing Record: 116
Processing Record: 117
Processing Record: 118
Processing Record: 119
Processing Record: 121
Processing Record: 122
Processing Record: 123
Processing Record: 124
Processing Record: 200
Processing Record: 201
Processing Record: 202
Processing Record: 203
Processing Record: 205
Processing Record: 207
Processing Record: 208
Processing Record: 209
Processing Record: 210
Processing Record: 212
Processing Record: 213
Processing Record: 214
Processing Record: 215
Processing Record: 217
Processing Record: 219
Processing Record: 220
Processing Record: 221
Processing Record: 222
Processing Record: 223
Processing Record: 228
Processing 

In [7]:
# Convert the collected beat windows and their symbols into NumPy arrays.
X, y = (np.array(collected) for collected in (all_beats, all_labels))
print("Dataset Shape:", X.shape)
print("Labels Shape:", y.shape)

Dataset Shape: (112571, 200)
Labels Shape: (112571,)


In [8]:
# Keep only the rows whose annotation symbol is one of the listed classes.
valid_classes = ['N','V','A','L','R']
mask = np.isin(y, valid_classes)
X, y = X[mask], y[mask]
print("After Filtering:", X.shape)

After Filtering: (100033, 200)


In [9]:
# Binary target: 0 where the symbol is 'N' (normal), 1 for any other symbol.
y_binary = np.where(y != 'N', 1, 0)
print("Binary Label Distribution:")
unique, counts = np.unique(y_binary, return_counts=True)
print({value: count for value, count in zip(unique, counts)})

Binary Label Distribution:
{np.int64(0): np.int64(75028), np.int64(1): np.int64(25005)}


In [10]:
# Write the feature matrix and the binary labels to Google Drive as .npy files.
for target, payload in (
    ("/content/drive/MyDrive/X_mitbih_binary.npy", X),
    ("/content/drive/MyDrive/y_mitbih_binary.npy", y_binary),
):
    np.save(target, payload)
print("Saved to Drive Successfully!")

Saved to Drive Successfully!


In [11]:
# List the contents of the Drive folder (e.g. to check that the saved .npy files are there).
!ls /content/drive/MyDrive

'Colab Notebooks'			 X_mitbih_binary.npy
 mit-bih-arrhythmia-database-1.0.0.zip	 X_mitbih_clean.npy
 model_results.txt			 y_mitbih_binary.npy
 xgboost_mitbih_model.pkl		 y_mitbih_clean.npy
