In [None]:
print("Hello")

In [1]:
import pandas as pd
import numpy as np
import os
from scipy.signal import butter, filtfilt, resample_poly
import matplotlib.pyplot as plt

# --- Step 1: Load CSV file ---
df = pd.read_csv('combined_patient_ecg_outcome.csv')

# --- Step 2: Parsing function for space-separated ECG string to 1D numpy array ---

def string_to_float_array_fixed(s):
    values = []
    for val in s.strip().split():
        try:
            values.append(float(val))
        except:
            values.append(np.nan)
    return np.array(values)

df['data_array'] = df['data'].apply(string_to_float_array_fixed)

# --- Step 3: NaN interpolation helper ---

def interpolate_nan(data):
    n = len(data)
    if n == 0:
        return data

    indices = np.arange(n)
    valid = ~np.isnan(data)

    if valid.sum() == 0:
        return np.zeros_like(data)

    return np.interp(indices, indices[valid], data[valid])

# --- Step 4: Save raw numpy arrays with NaNs interpolated ---

np_folder = 'ecg_numpy_arrays'
os.makedirs(np_folder, exist_ok=True)

for _, row in df.iterrows():
    patient_id = row['patient_id']
    ecg_array = row['data_array']
    ecg_array_clean = interpolate_nan(ecg_array)
    np.save(os.path.join(np_folder, f"{patient_id}_ecg.npy"), ecg_array_clean)

print(f"Saved {len(df)} raw ECG numpy files with NaNs interpolated.")

Saved 99 raw ECG numpy files with NaNs interpolated.


In [2]:
df.head()

Unnamed: 0,patient_id,data,label,data_array
0,284,24433.0 24317.0 24354.0 24377.0 24270.0 24272....,1,"[24433.0, 24317.0, 24354.0, 24377.0, 24270.0, ..."
1,286,-31920.0 -31920.0 -31920.0 -31920.0 -31920.0 -...,1,"[-31920.0, -31920.0, -31920.0, -31920.0, -3192..."
2,296,3221.0 3274.0 3380.0 3098.0 2843.0 2910.0 3274...,1,"[3221.0, 3274.0, 3380.0, 3098.0, 2843.0, 2910...."
3,299,18584.0 18488.0 18648.0 19008.0 19239.0 19347....,1,"[18584.0, 18488.0, 18648.0, 19008.0, 19239.0, ..."
4,303,26985.0 10038.0 -9400.0 -20411.0 -18704.0 -419...,1,"[26985.0, 10038.0, -9400.0, -20411.0, -18704.0..."


In [3]:
from scipy.signal import butter, filtfilt, resample_poly, medfilt
import numpy as np
from scipy.signal import medfilt
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [4]:
# Butterworth low-pass filter and helpers

def butter_lowpass(cutoff, fs, order=4):
    from scipy.signal import butter
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a

def butter_lowpass_filter(data, cutoff, fs, order=3):
    from scipy.signal import filtfilt
    b, a = butter_lowpass(cutoff, fs, order=order)
    data = data[:, np.newaxis] if data.ndim == 1 else data
    filtered = filtfilt(b, a, data, axis=0)
    return filtered.squeeze()

def resample_ecg(data, original_fs, target_fs):
    from scipy.signal import resample_poly
    from math import gcd
    g = gcd(int(original_fs), int(target_fs))
    up = int(target_fs // g)
    down = int(original_fs // g)
    data = data[:, np.newaxis] if data.ndim == 1 else data
    resampled = resample_poly(data, up, down, axis=0)
    return resampled.squeeze()

def median_filter_denoise(signal, kernel_size=5):
    return medfilt(signal, kernel_size=kernel_size)

In [None]:
# Process raw ECG numpy files: filtering, resampling, median filtering

original_fs = 310.0
target_fs = 45.0
cutoff_frequency = 20.0
filter_order = 3
median_kernel_size = 5

filtered_folder = 'ecg_resampled_filtered_arrays_order3'
os.makedirs(filtered_folder, exist_ok=True)

print("Starting ECG signal processing (filter, resample, median filter)...")
for filename in tqdm(os.listdir(np_folder)):
    if filename.endswith('.npy'):
        file_path = os.path.join(np_folder, filename)
        ecg_data = np.load(file_path)

        # Resample
        ecg_resampled = resample_ecg(ecg_data, original_fs, target_fs)

        # Median filter to remove spikes
        ecg_median = median_filter_denoise(ecg_resampled, kernel_size=median_kernel_size)

        # Butterworth low-pass filter to smooth signal
        ecg_filtered = butter_lowpass_filter(ecg_median, cutoff_frequency, target_fs, filter_order)

        # Save filtered ECG
        save_path = os.path.join(filtered_folder, filename.replace('_ecg.npy', '_processed.npy'))
        np.save(save_path, ecg_filtered)

print(f"Processing completed. Saved filtered ECG files to '{filtered_folder}'.")

Starting ECG signal processing (filter, resample, median filter)...


100%|██████████| 99/99 [00:06<00:00, 15.98it/s]

Processing completed. Saved filtered ECG files to 'ecg_resampled_filtered_arrays_order3'.





In [6]:
# Define Dataset class to load processed raw ECG and segment into windows

class RawECGSegmentDataset(Dataset):
    def __init__(self, np_folder, labels_dict, fs=45, window_sec=60, overlap_sec=10):
        self.fs = fs
        self.window_sec = window_sec
        self.overlap_sec = overlap_sec
        self.samples = []
        self.labels = []

        for file in tqdm(os.listdir(np_folder), desc="Loading processed ECG files"):
            if file.endswith('.npy'):
                pid = int(file.split('_')[0])
                ecg = np.load(os.path.join(np_folder, file))

                window_len = int(window_sec * fs)
                overlap_len = int(overlap_sec * fs)
                step = window_len - overlap_len

                for start in range(0, len(ecg) - window_len + 1, step):
                    segment = ecg[start:start+window_len]
                    self.samples.append(segment)
                    self.labels.append(labels_dict[pid])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return torch.tensor(self.samples[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)


In [7]:
# Instantiate dataset and split into train/val

labels_dict = dict(zip(df['patient_id'].astype(int), df['label']))
raw_dataset = RawECGSegmentDataset(np_folder=filtered_folder, labels_dict=labels_dict,
                                  fs=target_fs, window_sec=60, overlap_sec=10)

print(f"Total segments in dataset: {len(raw_dataset)}")

Loading processed ECG files: 100%|██████████| 198/198 [00:01<00:00, 103.53it/s]

Total segments in dataset: 14006





In [8]:
# Extract features and labels arrays to split
features = [raw_dataset[i][0].numpy() for i in range(len(raw_dataset))]
labels = [raw_dataset[i][1].item() for i in range(len(raw_dataset))]

X_train, X_val, y_train, y_val = train_test_split(features, labels,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=labels)

class ECGTensorDataset(Dataset):
    def __init__(self, signals, labels):
        self.signals = torch.tensor(signals, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return len(self.signals)
    def __getitem__(self, idx):
        return self.signals[idx], self.labels[idx]

train_dataset = ECGTensorDataset(X_train, y_train)
val_dataset = ECGTensorDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


  self.signals = torch.tensor(signals, dtype=torch.float32)


In [9]:
# --- Step 9: Define 1D CNN + Attention model ---

class RawECGAttention1DCNN(nn.Module):
    def __init__(self, input_len, num_classes=2, dropout=0.3):
        super(RawECGAttention1DCNN, self).__init__()

        self.conv1 = nn.Conv1d(1, 32, kernel_size=7, padding=3)
        self.bn1 = nn.BatchNorm1d(32)

        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(64)

        self.conv3 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(128)

        self.dropout = nn.Dropout(dropout)

        self.attention = nn.Sequential(
            nn.Conv1d(128, 128, kernel_size=1),
            nn.Tanh(),
            nn.Conv1d(128, 1, kernel_size=1),
            nn.Softmax(dim=2)
        )

        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))

        attn_weights = self.attention(x)
        x = (x * attn_weights).sum(dim=2)

        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


In [11]:
# Train and validate the model with AdamW and early stopping

import torch.optim as optim
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

input_len = X_train[0].shape[0]
num_classes = len(set(y_train))

model = RawECGAttention1DCNN(input_len=input_len, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

num_epochs = 50
patience = 7
best_val_acc = 0.0
early_stop_counter = 0

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} Training"):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    train_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_acc = correct / total

    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Accuracy={val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        early_stop_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print("Saved best model")
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered")
            break


Using device: cpu


Epoch 1/50 Training: 100%|██████████| 351/351 [02:48<00:00,  2.08it/s]


Epoch 1: Train Loss=0.6689, Val Accuracy=0.6463
Saved best model


Epoch 2/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 2: Train Loss=0.6446, Val Accuracy=0.6574
Saved best model


Epoch 3/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.25it/s]


Epoch 3: Train Loss=0.6170, Val Accuracy=0.6820
Saved best model


Epoch 4/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.25it/s]


Epoch 4: Train Loss=0.5940, Val Accuracy=0.6706


Epoch 5/50 Training: 100%|██████████| 351/351 [02:36<00:00,  2.24it/s]


Epoch 5: Train Loss=0.5776, Val Accuracy=0.7152
Saved best model


Epoch 6/50 Training: 100%|██████████| 351/351 [02:37<00:00,  2.24it/s]


Epoch 6: Train Loss=0.5800, Val Accuracy=0.7320
Saved best model


Epoch 7/50 Training: 100%|██████████| 351/351 [02:44<00:00,  2.13it/s]


Epoch 7: Train Loss=0.5352, Val Accuracy=0.6842


Epoch 8/50 Training: 100%|██████████| 351/351 [02:33<00:00,  2.28it/s]


Epoch 8: Train Loss=0.5429, Val Accuracy=0.6788


Epoch 9/50 Training: 100%|██████████| 351/351 [02:33<00:00,  2.29it/s]


Epoch 9: Train Loss=0.5157, Val Accuracy=0.7641
Saved best model


Epoch 10/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 10: Train Loss=0.5017, Val Accuracy=0.7452


Epoch 11/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 11: Train Loss=0.4828, Val Accuracy=0.7762
Saved best model


Epoch 12/50 Training: 100%|██████████| 351/351 [02:37<00:00,  2.22it/s]


Epoch 12: Train Loss=0.4712, Val Accuracy=0.7937
Saved best model


Epoch 13/50 Training: 100%|██████████| 351/351 [02:38<00:00,  2.21it/s]


Epoch 13: Train Loss=0.4724, Val Accuracy=0.7741


Epoch 14/50 Training: 100%|██████████| 351/351 [02:37<00:00,  2.23it/s]


Epoch 14: Train Loss=0.4450, Val Accuracy=0.7859


Epoch 15/50 Training: 100%|██████████| 351/351 [02:36<00:00,  2.24it/s]


Epoch 15: Train Loss=0.4407, Val Accuracy=0.7855


Epoch 16/50 Training: 100%|██████████| 351/351 [02:39<00:00,  2.20it/s]


Epoch 16: Train Loss=0.4414, Val Accuracy=0.7919


Epoch 17/50 Training: 100%|██████████| 351/351 [02:42<00:00,  2.17it/s]


Epoch 17: Train Loss=0.4181, Val Accuracy=0.8048
Saved best model


Epoch 18/50 Training: 100%|██████████| 351/351 [02:37<00:00,  2.23it/s]


Epoch 18: Train Loss=0.4374, Val Accuracy=0.8012


Epoch 19/50 Training: 100%|██████████| 351/351 [02:37<00:00,  2.23it/s]


Epoch 19: Train Loss=0.4006, Val Accuracy=0.8123
Saved best model


Epoch 20/50 Training: 100%|██████████| 351/351 [02:40<00:00,  2.19it/s]


Epoch 20: Train Loss=0.3885, Val Accuracy=0.7716


Epoch 21/50 Training: 100%|██████████| 351/351 [02:34<00:00,  2.27it/s]


Epoch 21: Train Loss=0.3997, Val Accuracy=0.8169
Saved best model


Epoch 22/50 Training: 100%|██████████| 351/351 [02:38<00:00,  2.21it/s]


Epoch 22: Train Loss=0.3758, Val Accuracy=0.8051


Epoch 23/50 Training: 100%|██████████| 351/351 [02:39<00:00,  2.20it/s]


Epoch 23: Train Loss=0.3623, Val Accuracy=0.8448
Saved best model


Epoch 24/50 Training: 100%|██████████| 351/351 [02:44<00:00,  2.13it/s]


Epoch 24: Train Loss=0.3667, Val Accuracy=0.8248


Epoch 25/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 25: Train Loss=0.3477, Val Accuracy=0.8476
Saved best model


Epoch 26/50 Training: 100%|██████████| 351/351 [02:36<00:00,  2.24it/s]


Epoch 26: Train Loss=0.3358, Val Accuracy=0.7662


Epoch 27/50 Training: 100%|██████████| 351/351 [03:02<00:00,  1.93it/s]


Epoch 27: Train Loss=0.3395, Val Accuracy=0.8480
Saved best model


Epoch 28/50 Training: 100%|██████████| 351/351 [02:59<00:00,  1.96it/s]


Epoch 28: Train Loss=0.3281, Val Accuracy=0.8587
Saved best model


Epoch 29/50 Training: 100%|██████████| 351/351 [02:51<00:00,  2.05it/s]


Epoch 29: Train Loss=0.3235, Val Accuracy=0.8469


Epoch 30/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 30: Train Loss=0.3018, Val Accuracy=0.8490


Epoch 31/50 Training: 100%|██████████| 351/351 [02:38<00:00,  2.22it/s]


Epoch 31: Train Loss=0.2891, Val Accuracy=0.8947
Saved best model


Epoch 32/50 Training: 100%|██████████| 351/351 [02:54<00:00,  2.02it/s]


Epoch 32: Train Loss=0.2819, Val Accuracy=0.8298


Epoch 33/50 Training: 100%|██████████| 351/351 [02:36<00:00,  2.25it/s]


Epoch 33: Train Loss=0.2827, Val Accuracy=0.8851


Epoch 34/50 Training: 100%|██████████| 351/351 [02:36<00:00,  2.25it/s]


Epoch 34: Train Loss=0.2755, Val Accuracy=0.8801


Epoch 35/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 35: Train Loss=0.2706, Val Accuracy=0.8708


Epoch 36/50 Training: 100%|██████████| 351/351 [02:39<00:00,  2.20it/s]


Epoch 36: Train Loss=0.2721, Val Accuracy=0.8997
Saved best model


Epoch 37/50 Training: 100%|██████████| 351/351 [02:38<00:00,  2.22it/s]


Epoch 37: Train Loss=0.2432, Val Accuracy=0.8665


Epoch 38/50 Training: 100%|██████████| 351/351 [02:37<00:00,  2.23it/s]


Epoch 38: Train Loss=0.2514, Val Accuracy=0.8951


Epoch 39/50 Training: 100%|██████████| 351/351 [02:39<00:00,  2.20it/s]


Epoch 39: Train Loss=0.2521, Val Accuracy=0.8829


Epoch 40/50 Training: 100%|██████████| 351/351 [02:33<00:00,  2.29it/s]


Epoch 40: Train Loss=0.2393, Val Accuracy=0.8969


Epoch 41/50 Training: 100%|██████████| 351/351 [02:32<00:00,  2.30it/s]


Epoch 41: Train Loss=0.2330, Val Accuracy=0.8961


Epoch 42/50 Training: 100%|██████████| 351/351 [02:34<00:00,  2.27it/s]


Epoch 42: Train Loss=0.2310, Val Accuracy=0.9040
Saved best model


Epoch 43/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 43: Train Loss=0.2314, Val Accuracy=0.8694


Epoch 44/50 Training: 100%|██████████| 351/351 [02:36<00:00,  2.25it/s]


Epoch 44: Train Loss=0.2705, Val Accuracy=0.9065
Saved best model


Epoch 45/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.26it/s]


Epoch 45: Train Loss=0.2258, Val Accuracy=0.8412


Epoch 46/50 Training: 100%|██████████| 351/351 [02:35<00:00,  2.25it/s]


Epoch 46: Train Loss=0.2185, Val Accuracy=0.8865


Epoch 47/50 Training: 100%|██████████| 351/351 [02:30<00:00,  2.34it/s]


Epoch 47: Train Loss=0.2177, Val Accuracy=0.9126
Saved best model


Epoch 48/50 Training: 100%|██████████| 351/351 [02:30<00:00,  2.33it/s]


Epoch 48: Train Loss=0.2219, Val Accuracy=0.9026


Epoch 49/50 Training: 100%|██████████| 351/351 [02:29<00:00,  2.34it/s]


Epoch 49: Train Loss=0.2112, Val Accuracy=0.9108


Epoch 50/50 Training: 100%|██████████| 351/351 [02:30<00:00,  2.34it/s]


Epoch 50: Train Loss=0.2231, Val Accuracy=0.9201
Saved best model


In [None]:
# plt.figure(figsize=(12,5))

# plt.subplot(1,2,1)
# plt.plot(range(1, len(train_losses)+1), train_losses, label='Train Loss')
# plt.plot(range(1, len(val_losses)+1), val_losses, label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training and Validation Loss')
# plt.legend()

# plt.subplot(1,2,2)
# plt.plot(range(1, len(val_accuracies)+1), val_accuracies, label='Validation Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.title('Validation Accuracy')
# plt.legend()

# plt.tight_layout()
# plt.show()


In [12]:
from sklearn.metrics import classification_report

# Make sure your model and device are defined as in training
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

class_names = ['Poor', 'Good']  # Update as per your classes

report = classification_report(all_labels, all_preds, target_names=class_names, digits=2)
print("Detailed Classification Report on Validation Set:")
print(report)


Detailed Classification Report on Validation Set:
              precision    recall  f1-score   support

        Poor       0.93      0.91      0.92      1442
        Good       0.91      0.93      0.92      1360

    accuracy                           0.92      2802
   macro avg       0.92      0.92      0.92      2802
weighted avg       0.92      0.92      0.92      2802



In [13]:
# Load best model and print classification report

from sklearn.metrics import classification_report

# After model evaluation and collecting all_preds and all_labels

class_names = ['Poor', 'Good']  # Replace with your actual class names

report = classification_report(all_labels, all_preds, target_names=class_names, digits=4)
print("Detailed classification report on validation set:")
print(report)


Detailed classification report on validation set:
              precision    recall  f1-score   support

        Poor     0.9307    0.9126    0.9216      1442
        Good     0.9092    0.9279    0.9185      1360

    accuracy                         0.9201      2802
   macro avg     0.9200    0.9203    0.9200      2802
weighted avg     0.9203    0.9201    0.9201      2802



In [None]:
# Steps 1–4: Your original code to load and save raw ECG arrays.

# Steps 5–6: Filter, resample, median-filter, and save processed ECG arrays.

# Step 7: New Dataset that loads these processed ECG signals and segments into smaller windows.

# Step 8: Split dataset and create PyTorch DataLoader.

# Step 9: Define the 1D CNN + attention model that takes raw ECG windows.

# Step 10: Train with AdamW optimizer, CrossEntropyLoss, early stopping.

# Step 11: Evaluate best model and print detailed classification report.

In [None]:
# Input: Takes a raw ECG signal (a sequence of numbers representing heart activity over time).

# Pre-process: median and butterworth-low pass filter

# CNN Layers:

# The input raw ECG window (a sequence of voltage values over time) passes through several 1D convolutional layers.

# Each convolution layer extracts increasingly complex features:

# Early layers might detect simple patterns like edges or peaks (e.g., QRS complexes).

# Deeper layers combine those to capture rhythms, morphology, and higher-level ECG patterns.

# These layers output a feature map — a set of learned features across time and channels.

# Attention Layer:

# The attention mechanism takes these CNN feature maps as input.

# It assigns weights (importance scores) to each time step or segment of the feature map.

# This means it highlights which parts of the signal are most relevant for classification.

# It produces a weighted summary (a context vector), emphasizing important time regions dynamically.

# Classification: Passes the summarized features through two dense layers to predict the class (e.g., normal or abnormal heart activity).

# Regularization: Uses dropout to prevent overfitting and batch normalization to stabilize training.