In [None]:
import sys
# Show the path of the Python interpreter that is running this kernel.
print(sys.executable)

: 

In [None]:
import pandas as pd
import os

# Dataset folder, relative to the notebook's working directory.
DATA_DIR = os.path.join("data", "ptbxl")
# Metadata table for the recordings (presumably one row per ECG -- confirm against the CSV).
csv_path = os.path.join(DATA_DIR, "ptbxl_database.csv")

df = pd.read_csv(csv_path)
# Preview the first rows of the metadata table.
df.head()

In [None]:
import wfdb
import matplotlib.pyplot as plt
import os

# pick the first ECG in the dataset
row = df.iloc[0]

# build the full path to the ECG waveform file; reuse DATA_DIR (defined with
# csv_path) so the dataset location is configured in exactly one place
ecg_path = os.path.join(DATA_DIR, row["filename_hr"])

print("Loading ECG from:", ecg_path)

# load the waveform (signal) and metadata
signal, meta = wfdb.rdsamp(ecg_path)

# presumably (samples, leads) -- the later cells index leads on axis 1
print("Signal shape:", signal.shape)
signal[:5]

In [None]:
# Raw trace of the first signal column (Lead I) on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(signal[:, 0])  # Lead I
ax.set_title("Example ECG – Lead I")
ax.set_xlabel("Time (samples)")
ax.set_ylabel("Amplitude")
plt.show()


In [None]:
import numpy as np

def normalize_signal(sig, eps=1e-8):
    """Standardise each column of ``sig`` to zero mean and unit variance.

    Statistics are computed along axis 0, so every column (lead) is
    normalised independently.

    Parameters
    ----------
    sig : array-like
        Signal whose first axis is the one to normalise over.
    eps : float, optional
        Columns whose standard deviation is below this threshold are treated
        as constant and are only mean-centred (divided by 1 instead of ~0).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``sig``. Constant columns come back as
        all zeros rather than NaN/inf.
    """
    sig = np.asarray(sig)
    mean = np.mean(sig, axis=0)
    std = np.std(sig, axis=0)
    # A flat lead has std == 0; dividing by it would yield NaN and poison
    # everything downstream (filters, model inputs).
    safe_std = np.where(std < eps, 1.0, std)
    return (sig - mean) / safe_std

In [None]:
norm_signal = normalize_signal(signal)

# First column of the output of normalize_signal.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(norm_signal[:, 0])
ax.set_title("Normalized Lead I")
plt.show()

In [None]:
from scipy.signal import butter, filtfilt

def bandpass_filter(sig, low=0.5, high=40, fs=500):
    """Apply a zero-phase first-order Butterworth band-pass along axis 0.

    Parameters
    ----------
    sig : array-like
        Signal to filter; filtering runs along the first axis.
    low, high : float
        Pass-band edges in Hz.
    fs : float
        Sampling frequency in Hz.

    Returns
    -------
    numpy.ndarray
        Filtered signal with the same shape as ``sig``.
    """
    nyquist = fs / 2
    # butter expects the band edges as fractions of the Nyquist frequency.
    band = [low / nyquist, high / nyquist]
    numerator, denominator = butter(1, band, btype='band')
    # filtfilt runs the filter forwards and backwards (no phase shift).
    return filtfilt(numerator, denominator, sig, axis=0)

In [None]:
filt_signal = bandpass_filter(signal)

# First column of the band-passed recording.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(filt_signal[:, 0])
ax.set_title("Filtered Lead I (0.5–40 Hz)")
plt.show()

In [None]:
def preprocess(sig):
    """Run the full cleaning chain: band-pass filter first, then normalise.

    Both steps use their default settings; the result of
    ``bandpass_filter`` is fed straight into ``normalize_signal``.
    """
    return normalize_signal(bandpass_filter(sig))

In [None]:
clean = preprocess(signal)

# First column after filtering + normalisation.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(clean[:, 0])
ax.set_title("Fully Preprocessed Lead I")
plt.show()

In [None]:
# 'diagnostic_superclass' is assigned by a later cell
# (df['diagnostic_superclass'] = df.apply(...)), so on a fresh
# Restart & Run All it may not exist yet and selecting it would raise KeyError.
# Show whichever of these columns are present at this point.
preview_cols = [
    col for col in ['filename_hr', 'scp_codes', 'diagnostic_superclass']
    if col in df.columns
]
df[preview_cols].head()

In [None]:
# List every column currently present in the metadata frame.
df.columns

In [None]:
import os
import pandas as pd
import ast

# Load scp_statements.csv from the shared dataset folder (DATA_DIR) rather than
# re-typing the folder names here
scp_path = os.path.join(DATA_DIR, "scp_statements.csv")
scp = pd.read_csv(scp_path)

# Keep only rows where diagnostic_class is not empty
scp_diag = scp[scp['diagnostic_class'].notna()]

# Build mapping: scp_code (in 'Unnamed: 0') -> diagnostic_class
# ('Unnamed: 0' is the name pandas gives to a header-less first CSV column)
diag_map = scp_diag.set_index('Unnamed: 0')['diagnostic_class'].to_dict()

# Show the mapping size and a small sample of its entries
len(diag_map), list(diag_map.items())[:10]

In [None]:
# Turn the stringified dicts read from the CSV into real dicts.
# Only strings are parsed: ast.literal_eval raises ValueError on a value that
# is already a dict, which is what this column holds if the cell is run twice.
df['scp_codes'] = df['scp_codes'].apply(
    lambda codes: ast.literal_eval(codes) if isinstance(codes, str) else codes
)
df['scp_codes'].iloc[0]

In [None]:
def get_superclass_from_scp_codes(row):
    """Return the diagnostic class of the first SCP code in ``row`` that is mapped.

    Walks the keys of ``row['scp_codes']`` in their stored order and looks each
    one up in the module-level ``diag_map``. The first hit wins; if none of the
    codes is in ``diag_map`` the function returns ``None``.
    """
    mapped_classes = (
        diag_map[code] for code in row['scp_codes'].keys() if code in diag_map
    )
    # next() with a default gives "first match or None" in a single step.
    return next(mapped_classes, None)

# Derive one superclass per record; axis=1 hands each row to the function.
df['diagnostic_superclass'] = df.apply(get_superclass_from_scp_codes, axis=1)

# Class frequencies (value_counts() leaves out missing values by default).
df['diagnostic_superclass'].value_counts()


In [None]:
# Integer id for each superclass name, numbered in the order listed here.
label_map = {
    name: class_id
    for class_id, name in enumerate(['NORM', 'MI', 'STTC', 'CD', 'HYP'])
}

# Superclasses missing from label_map (including None) become NaN.
df['label'] = df['diagnostic_superclass'].map(label_map)
# dropna=False keeps the NaN bucket visible in the counts.
df['label'].value_counts(dropna=False)

In [None]:
# Keep only records that received a label; copy so later edits don't touch df.
df_model = df.dropna(subset=['label']).copy()
df_model.shape

In [None]:
# Work on the first 200 labelled ECGs.
subset = df_model.head(200)
subset.loc[:, ['filename_hr', 'diagnostic_superclass', 'label']].head()

In [None]:
import os
import wfdb

def load_ecg_from_row(row):
    """Load the waveform referenced by a metadata row and preprocess it.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row["filename_hr"]``, the record path relative to the
        dataset folder ``DATA_DIR``.

    Returns
    -------
    The output of ``preprocess`` (band-pass + normalise) applied to the
    signal array returned by ``wfdb.rdsamp``.
    """
    # Use the shared DATA_DIR so the dataset location lives in one place.
    path = os.path.join(DATA_DIR, row["filename_hr"])
    # rdsamp also returns record metadata, which is not needed here.
    sig, _meta = wfdb.rdsamp(path)
    return preprocess(sig)   # bandpass + normalize

In [None]:
# One preprocessed signal per row of the subset, in row order.
X = [load_ecg_from_row(record) for _, record in subset.iterrows()]
# Matching labels, cast to int.
y = [int(label) for label in subset["label"]]

len(X), len(y)


In [None]:
import numpy as np

# Stack the per-record signals into one array and the labels into an int vector.
# NOTE(review): X becomes 3-D only if every signal has the same shape -- confirm.
X = np.array(X)
y = np.array(y, dtype=int)

X.shape, y.shape

In [None]:
import matplotlib.pyplot as plt

# First column of the first preprocessed sample, with its label in the title.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(X[0][:, 0])
ax.set_title(f"Example preprocessed ECG — label = {y[0]}")
ax.set_xlabel("Time (samples)")
ax.set_ylabel("Normalized amplitude")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. stratify=y asks for class
# proportions to be preserved in both parts; random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed Python, NumPy and TensorFlow so repeated runs are comparable.
tf.keras.utils.set_random_seed(42)

# Take the shapes from the data/label map instead of hard-coding (5000, 12)
# and 5, so the model cannot drift out of sync with what was loaded above.
input_shape = tuple(X_train.shape[1:])   # (time, leads)
num_classes = len(label_map)             # one output unit per label id

model = models.Sequential([
    # Explicit Input layer; passing input_shape= to the first Conv1D is
    # deprecated in newer Keras releases.
    layers.Input(shape=input_shape),

    layers.Conv1D(32, kernel_size=7, activation='relu'),
    layers.MaxPooling1D(pool_size=2),

    layers.Conv1D(64, kernel_size=7, activation='relu'),
    layers.MaxPooling1D(pool_size=2),

    layers.Conv1D(128, kernel_size=7, activation='relu'),
    layers.MaxPooling1D(pool_size=2),

    # Collapse the time axis, then classify.
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

# Labels are integer ids, hence the sparse variant of the loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()


In [None]:
# Train for 10 epochs in mini-batches of 16, scoring the validation split after
# each epoch; the returned object is kept in `history`.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=16
)

In [None]:
# Score the trained model on the validation split; evaluate() returns the loss
# followed by the metrics passed to compile() (here: accuracy).
val_loss, val_acc = model.evaluate(X_val, y_val)
print("Validation loss:", val_loss)
print("Validation accuracy:", val_acc)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Class probabilities -> predicted class id (index of the largest probability).
y_val_pred_probs = model.predict(X_val)
y_val_pred = np.argmax(y_val_pred_probs, axis=1)

# Report against the full, fixed set of classes from label_map. Without
# `labels`, classes that happen to be absent from this small validation split
# are silently dropped, so the rows/columns of the confusion matrix can no
# longer be mapped back to class ids.
class_names = sorted(label_map, key=label_map.get)
class_ids = [label_map[name] for name in class_names]

print("Classification report:")
print(classification_report(
    y_val, y_val_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,   # classes with no samples/predictions score 0 without warnings
))

print("Confusion matrix:")
print(confusion_matrix(y_val, y_val_pred, labels=class_ids))
