In [44]:
import numpy as np
from helper_code import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import neurokit2 as nk
from tqdm import tqdm

# Feature Extraction

In [None]:
header_files, recording_files = find_challenge_files('data/data8')

In [None]:
def get_peaks_df(header_file,recording_file,lead):
    header = load_header(header_file)
    recording_temp = load_recording(recording_file)
    leads = get_leads(header)
    recording = choose_leads(recording_temp, header, leads)
    fs = get_frequency(header)
    for i in range(len(leads)):
        if lead == leads[i]:
            cleaned = nk.ecg_clean(recording[i], sampling_rate=fs)
            _, rpeaks = nk.ecg_peaks(cleaned, sampling_rate=fs)
            signal, waves = nk.ecg_delineate(cleaned, rpeaks, sampling_rate=fs, method="peak")
            peaks = {"ECG_P_Peaks":waves['ECG_P_Peaks'],"ECG_Q_Peaks":waves['ECG_Q_Peaks'],"ECG_R_Peaks":rpeaks['ECG_R_Peaks'],"ECG_S_Peaks":waves['ECG_S_Peaks'],"ECG_T_Peaks":waves['ECG_T_Peaks']}
            df = pd.DataFrame(peaks)
            df = df.dropna(axis=0)
            return df

In [None]:
with open('snomed_ct_code.txt') as f:
    mylist = f.read().splitlines()
dict = {}
for line in mylist:
    split_string = line.split(',')
    if split_string[1].isnumeric():
        dict_temp = {}
        dict_temp['short'] = split_string[2]
        dict_temp['full'] = split_string[0]
        dict[split_string[1]] = dict_temp

def get_disease(label,short = False):
    if short:
        return dict[label]['short']
    else :
        return dict[label]['full']

In [None]:
error = list()
def get_heart_rate(ecg,fs):
    cleaned = nk.ecg_clean(ecg, sampling_rate=fs)
    _, rpeaks = nk.ecg_peaks(cleaned, sampling_rate=fs)
    rate = nk.signal_rate(rpeaks, sampling_rate=fs, desired_length=len(cleaned))
    return np.mean(rate),np.std(rate)
def get_statistics(arr):
    return np.mean(arr),np.min(arr),np.max(arr),np.std(arr)
def get_r_peaks_amp(ecg,fs):
    cleaned = nk.ecg_clean(ecg, sampling_rate=fs)
    _, rpeaks = nk.ecg_peaks(cleaned, sampling_rate=fs)
    return get_statistics(cleaned[rpeaks['ECG_R_Peaks']])
def get_features(header_files,recording_files,lead):
    dict = defaultdict(list)
    num_recordings = len(recording_files)
    for i in tqdm(range(num_recordings),desc="Loading..."):
        header = load_header(header_files[i])
        recording_temp = load_recording(recording_files[i])
        leads = get_leads(header)
        labels = get_labels(header)
        recording = choose_leads(recording_temp, header, leads)
        fs = get_frequency(header)
        for j in range(len(leads)):
            if leads[j] == lead:
                    try :
                        f1,f2 = get_heart_rate(recording[j],fs)
                        pq_dur = list()
                        qrs_dur = list()
                        st_dur = list()
                        peaks_df = get_peaks_df(header_files[i],recording_files[i],lead)
                        for index, row in peaks_df.iterrows():
                            pq_dur.append((row['ECG_Q_Peaks']-row['ECG_P_Peaks'])/fs)
                            qrs_dur.append((row['ECG_S_Peaks']-row['ECG_Q_Peaks'])/fs)
                            st_dur.append((row['ECG_T_Peaks']-row['ECG_S_Peaks'])/fs)
                        f3,f4,f5,f6 = get_statistics(pq_dur)
                        f7,f8,f9,f10 = get_statistics(qrs_dur)
                        f11,f12,f13,f14 = get_statistics(st_dur)
                        f15,f16,f17,f18 = get_r_peaks_amp(recording[j],fs)
                        diseases = list()
                        for label in labels:
                            diseases.append(get_disease(label,True))
                    except (KeyError,ZeroDivisionError,ValueError,IndexError) :
                        error.append(i)
                        continue
                    dict['Index'].append(i)
                    dict['Heart_Rate_Mean'].append(f1)
                    dict['Heart_Rate_Std'].append(f2)
                    dict['PQ_Mean'].append(f3)
                    dict['PQ_Min'].append(f4)
                    dict['PQ_Max'].append(f5)
                    dict['PQ_Std'].append(f6)
                    dict['QRS_Mean'].append(f7)
                    dict['QRS_Min'].append(f8)
                    dict['QRS_Max'].append(f9)
                    dict['QRS_Std'].append(f10)
                    dict['ST_Mean'].append(f11)
                    dict['ST_Min'].append(f12)
                    dict['ST_Max'].append(f13)
                    dict['ST_Std'].append(f14)
                    dict['R_Peak_Mean'].append(f15)
                    dict['R_Peak_Min'].append(f16)
                    dict['R_Peak_Max'].append(f17)
                    dict['R_Peak_Std'].append(f18)
                    dict['Diseases'].append(diseases)
    df = pd.DataFrame(dict)
    return df

In [None]:
df = get_features(header_files, recording_files,'II')

# Box-Plot


In [46]:
df = pd.read_csv('features/data_overall.csv')

In [47]:
print(len(df))

77046


In [49]:
df_mapping = pd.read_csv('score.csv')
scored = df_mapping['Abbreviation']
dict_map = {}
for index, abbr in scored.iteritems():
    dict_map[abbr]=abbr
'''
Here mapping is according to this
URL : https://github.com/physionetchallenges/evaluation-2021/blob/main/dx_mapping_scored.csv

So If two maps to same then replace 2nd name with First
eg:- We score 733534002 and 164909002 as the same diagnosis
So 164909002 will be treated as 733534002
'''
dict_map['LBBB'] = 'CLBBB'
dict_map['RBBB'] = 'CRBBB'
dict_map['SVPB'] = 'PAC'
dict_map['VPB'] = 'PVC'
final_classes = list()
for key,val in dict_map.items():
    if key==val:
        final_classes.append(key)
print(final_classes)

['AF', 'AFL', 'BBB', 'Brady', 'CLBBB', 'CRBBB', 'IAVB', 'IRBBB', 'LAD', 'LAnFB', 'LQRSV', 'NSIVCB', 'NSR', 'PAC', 'PR', 'PRWP', 'PVC', 'LPR', 'LQT', 'QAb', 'RAD', 'SA', 'SB', 'STach', 'TAb', 'TInv']


In [50]:
classes = df['Diseases']
dfwc = df.drop(['Diseases'],axis=1)

In [51]:
dict_class_df = {}
for val in final_classes:
    df_temp = pd.DataFrame()
    for i in tqdm(range(len(df)),desc="Loading..."):
        if val=='CLBBB':
            if 'LBBB' in classes.loc[i]:
                df_temp = df_temp.append(dfwc.loc[i])
        if val=='CRBBB':
            if 'RBBB' in classes.loc[i]:
                df_temp = df_temp.append(dfwc.loc[i])
        if val=='PAC':
            if 'SVPB' in classes.loc[i]:
                df_temp = df_temp.append(dfwc.loc[i])
        if val=='PVC':
            if 'VPB' in classes.loc[i]:
                df_temp = df_temp.append(dfwc.loc[i])
        if val in classes.loc[i]:
            df_temp = df_temp.append(dfwc.loc[i])
    dict_class_df[val]=df_temp
f_list = list(dict_class_df['AF'].columns.values)

Loading...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 77046/77046 [00:21<00:00, 3522.14it/s]
Loading...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 77046/77046 [00:11<00:00, 6540.26it/s]
Loading...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 77046/77046 [00:20<00:00, 3774.41it/s]
Loading...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 77046/77046 [00:01<00:00, 58268.03it/s]
Loading...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 77046/77046 [00:04<00:00, 18093.18it/s]
Loading...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 77046/77046 [00:14<00:00, 5251.08it/s]
Loading...: 100%|███████████████████████████████████

In [54]:
def plot_boxplot(dict_class_df,f_list):
    def f(disease,feature):
        sns.boxplot(y = dict_class_df[disease][feature])
    interact(f,disease = dict_class_df.keys(),feature = f_list)

In [55]:
plot_boxplot(dict_class_df,f_list)

interactive(children=(Dropdown(description='disease', options=('AF', 'AFL', 'BBB', 'Brady', 'CLBBB', 'CRBBB', …

In [58]:
dict_class_df['AF'].describe()

Unnamed: 0.2,Heart_Rate_Mean,Heart_Rate_Std,Index,PQ_Max,PQ_Mean,PQ_Min,PQ_Std,QRS_Max,QRS_Mean,QRS_Min,...,R_Peak_Max,R_Peak_Mean,R_Peak_Min,R_Peak_Std,ST_Max,ST_Mean,ST_Min,ST_Std,Unnamed: 0,Unnamed: 0.1
count,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,...,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0,10756.0
mean,96.278217,14.940122,6022.967646,0.18992,0.103934,0.041061,0.045738,0.23766,0.149351,0.091651,...,563.154204,433.289772,308.365564,66.438307,0.23676,0.166258,0.089602,0.044209,40256.15833,4990.890108
std,27.064813,7.585019,4063.814035,0.051995,0.02881,0.023854,0.019066,0.083803,0.056149,0.043352,...,364.040122,266.437479,267.674811,77.757916,0.064946,0.054167,0.064368,0.02845,18105.471663,3885.608405
min,20.472412,0.0,0.0,0.01,0.00975,0.004,0.0,0.034,0.0335,0.024,...,-131.640367,-284.663922,-1194.74652,4.522548,0.02,0.02,0.004,0.0,2.0,0.0
25%,75.95821,10.044389,3240.75,0.156,0.0854,0.026,0.03309,0.178,0.108182,0.066,...,331.826317,244.892637,115.184485,29.544194,0.198,0.12844,0.038,0.023053,28770.75,2568.75
50%,91.433361,14.640635,5484.5,0.194,0.101,0.034,0.046444,0.228,0.1392,0.078,...,500.694159,395.235902,273.100435,44.193009,0.236,0.164,0.072,0.040844,49855.5,4295.0
75%,113.783764,19.513034,7839.25,0.224,0.119,0.048,0.05847,0.29,0.180953,0.106,...,709.119913,574.412829,457.98442,70.783393,0.274,0.203073,0.12,0.060909,53897.25,6041.0
max,186.576847,53.995219,21803.0,0.666,0.4175,0.41,0.192937,0.628,0.55,0.55,...,6223.627291,4030.1445,3716.218445,1393.793433,1.264,0.4955,0.36,0.469145,57248.0,21037.0
