# PBT-XL Classification

## Links
DATASET: https://physionet.org/content/ptb-xl/1.0.1/

Project Requirements: https://apmonitor.com/pds/index.php/Main/CourseProject

Github Repository: https://github.com/scraven4/EKG-Classification


In [1]:
##SAMPLE CODE FROM DATASET INFO PAGE - IMPORTS DATA AND SPLITS INTO TEST/TRAIN SET

import pandas as pd
import numpy as np
import wfdb
import ast

def load_raw_data(df, sampling_rate, path):
    #choose 100 for the 100hz file (500mb) and 500hz (2.7gB)
    if sampling_rate == 100:
        data = [wfdb.rdsamp(path+f) for f in df.filename_lr]
    else:
        data = [wfdb.rdsamp(path+f) for f in df.filename_hr]
    data = np.array([signal for signal, meta in data])
    return data

path = r"C:\Users\Hayden Wilde\PYTHON\ML Project\PTB-XL Extracted\\"
sampling_rate = 100

# load and convert annotation data
Y = pd.read_csv(r"C:\Users\Hayden Wilde\PYTHON\ML Project\PTB-XL Extracted\ptbxl_database.csv", index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

# Load raw signal data
X = load_raw_data(Y, sampling_rate, path)

# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

def aggregate_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))

# Apply diagnostic superclass
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

In [2]:
mdata = pd.DataFrame(Y)
mdata.head()

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_superclass
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr,[NORM]
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr,[NORM]
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr,[NORM]
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr,[NORM]
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr,[NORM]


In [3]:
del mdata['patient_id'] #using ecg_id for indexing
del mdata['recording_date']
del mdata['nurse']
del mdata['site']
del mdata['device']
del mdata['heart_axis'] #I don't think the doctor has this information easily from an ekg (refers to signal direction direction)
del mdata['infarction_stadium1'] #from what I can tell this is the likelihood of an infarction which should be unknown
del mdata['infarction_stadium2']
del mdata['validated_by']
del mdata['second_opinion']
del mdata['initial_autogenerated_report']
del mdata['validated_by_human']
del mdata['scp_codes'] #too meta
del mdata['report'] # too much german and this isn't usable by an algorithm

del mdata['filename_lr']
del mdata['filename_hr']

mdata.drop(mdata.dropna(subset=['pacemaker']).index) #deletes ~40 entries
del mdata['pacemaker'] # we are not going to deal with this
mdata.drop(mdata.dropna(subset=['electrodes_problems']).index) # we are not going to deal with this
del mdata['electrodes_problems'] #deletes ~300 entrie

#drop entries with no age (~80 entries)
mdata = mdata.dropna(axis=0, subset=['age'])

#methods of meta data usage: (1) Delete height and wiehgt columns, (2) delete all rows with missing height or weight
#(1)
del mdata['height']
del mdata['weight']

# turn extra beats, static noise, baseline drift, burst noise to a true false column, (1 = True)
mdata['extra_beats'] = mdata['extra_beats'].notnull().astype(int)
mdata['burst_noise'] = mdata['burst_noise'].notnull().astype(int)
mdata['static_noise'] = mdata['static_noise'].notnull().astype(int)
mdata['baseline_drift'] = mdata['baseline_drift'].notnull().astype(int)

# we could delete entries with burst noise(~600)(extra spike)
# maybe delete baseline drift (~1600)(lower baseline), extra beats (~2000) (smooth peak)
# static noise(~3000) probably don't delete

# keeping strat_fold for validation test (strat_fold relates accuracy of diagnostic)
print(len(mdata))
mdata.head()
#mdata.to_csv('mdata.csv',sep = '\t')

21748


Unnamed: 0_level_0,age,sex,baseline_drift,static_noise,burst_noise,extra_beats,strat_fold,diagnostic_superclass
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,56.0,1,0,1,0,0,3,[NORM]
2,19.0,0,0,0,0,0,2,[NORM]
3,37.0,1,0,0,0,0,5,[NORM]
4,24.0,0,1,0,0,0,3,[NORM]
5,19.0,1,1,0,0,0,4,[NORM]


In [4]:
# Split data into train and test
test_fold = 10
# Train
X_train = X[np.where(Y.strat_fold != test_fold)]
y_train = Y[(Y.strat_fold != test_fold)].diagnostic_superclass
# Test
X_test = X[np.where(Y.strat_fold == test_fold)]
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass