In [1]:
import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import wfdb
import statistics
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
!pip3 install xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier



# Извлекаем данные

In [2]:
class Wave:
    def __init__(self, beg, peak, end):
        self.beg=beg
        self.peak=peak
        self.end=end

class Complex:
    def __init__(self, p, qrs, t):
        self.p=p
        self.qrs=qrs
        self.t=t
class Lead():
    def __init__(self, name, num):
        self.name=name
        self.complexes=[]
        header = wfdb.rdsamp(f'data/{num}')[1]
        names=header['sig_name']
        self.signal=(wfdb.rdsamp(f'data/{num}', channels=[names.index(name)]))[0]

In [3]:
def fillLead(ann, num):
    isymbol = ann.symbol
    isample = ann.sample
    col = []
    col = ann.symbol.copy()
    lead=Lead(ann.extension, num)
    for k in range(len(col)):
        if col[k] == 'N':
            qrs_peak=isample[k]
            qrs_beg=isample[k-1]
            qrs_end=isample[k+1]
            QRS=Wave(qrs_beg, qrs_peak, qrs_end)
            if k-3 < 0:
                P=None
            elif col[k-3]=='p':
                p_peak=isample[k-3]
                p_beg=isample[k-4]
                p_end=isample[k-2]
                P=Wave(p_beg, p_peak, p_end)
            elif col[k-3]=='N' or col[k-3]=='t':
                P=None
            if k+3>len(col):
                T=None
            elif col[k+3]=='t':
                t_peak=isample[k+3]
                t_beg=isample[k+2]
                t_end=isample[k+4]
                T=Wave(t_beg, t_peak, t_end)
            elif col[k+3]=='N' or col[k+3]=='p':
                T=None
            A=Complex(P, QRS, T)
            lead.complexes.append(A)
    return lead

In [4]:
def createECG(num):
    ECG={}
    header = wfdb.rdsamp(f'data/{num}')[1]
    names=header['sig_name']
    for lead_name in names:
        name = wfdb.rdann(f'data/{num}', lead_name)
        ECG[lead_name]=fillLead(name, num)
    ECG['age']=int(header['comments'][0][header['comments'][1].find(':') + 2:])
    ECG['sex']=0 if header['comments'][1][-1] == 'F' else 1
    ECG['diagnosis']=0 if header['comments'][3] == 'Rhythm: Sinus rhythm.' else 1
    return ECG

In [5]:
createECG(1)

{'i': <__main__.Lead at 0x1c7ad1cb4c0>,
 'ii': <__main__.Lead at 0x1c7ad1cb310>,
 'iii': <__main__.Lead at 0x1c7a7633670>,
 'avr': <__main__.Lead at 0x1c7a76332b0>,
 'avl': <__main__.Lead at 0x1c7ad20f820>,
 'avf': <__main__.Lead at 0x1c7ad211130>,
 'v1': <__main__.Lead at 0x1c7ad211a00>,
 'v2': <__main__.Lead at 0x1c7ad214310>,
 'v3': <__main__.Lead at 0x1c7ad214be0>,
 'v4': <__main__.Lead at 0x1c7ad2174f0>,
 'v5': <__main__.Lead at 0x1c7ad217dc0>,
 'v6': <__main__.Lead at 0x1c7ad2186d0>,
 'age': 51,
 'sex': 0,
 'diagnosis': 1}

In [6]:
ECGS=[]
ECGS.append(None)
errors=[7, 8, 34, 95, 104, 111, 116, 198]
for i in range(1, 201):
    if i in errors:
        ECGS.append(None)
    if i not in(errors):
        ECG=createECG(i)
        ECGS.append(ECG)

# Формируем датафрейм

In [7]:
# def duration_stats(func, num):
#     header = wfdb.rdsamp(f'data/{num}')[1]
#     names=header['sig_name']
#     len_means=[]
#     for name in names:
#         waves = [func(c) for c in ECGS[num][name].complexes if func(c) is not None]
#         duration = np.array([w.end - w.beg for w in waves]) / 500
#         len_means.append(duration.mean() if len(duration) else None)
#     return (np.mean(len_means), np.std(len_means)
# def amplitude_stats(func, num):
#     header = wfdb.rdsamp(f'data/{num}')[1]
#     names=header['sig_name']
#     hei_means=[]
#     k=0
#     for name in names:
#         waves = [func(c) if func(c) is not None else None for c in ECGS[num][name].complexes]
#         subsignals = [ECGS[num][name].signal[w.beg:w.end] if waves is not None else 0.0 for w in waves]
#         hei_means.append(np.mean([np.max(s) - np.min(s) for s in subsignals]))
#     height_means=sum(hei_means)/k
#     stds=[np.sqrt(abs(l - height_means**2)) for l in hei_means if l is not None]
#     stds_means=sum(stds)/k 
#     return np.mean(hei_means), np.std(hei_means)

In [8]:
def duration_stats(func, num):
    header = wfdb.rdsamp(f'data/{num}')[1]
    names=header['sig_name']
    len_means=[]
    for name in names:
        waves = [func(c) for c in ECGS[num][name].complexes if func(c) is not None]
        duration = np.array([w.end - w.beg for w in waves]) / 500
        len_means.append(duration.mean() if len(duration) else np.nan)
    return (np.nanmean(len_means), np.nanstd(len_means))
            
def amplitude_stats(func, num):
    header = wfdb.rdsamp(f'data/{num}')[1]
    names=header['sig_name']
    hei_means=[]
    for name in names:
        waves = [func(c) for c in ECGS[num][name].complexes if func(c) is not None]
        subsignals = [ECGS[num][name].signal[w.beg:w.end] for w in waves]
        if not subsignals:
            hei_means.append(np.nan)
        hei_means.append(np.mean([np.max(s) - np.min(s) for s in subsignals]))
    return (np.nanmean(hei_means), np.nanstd(hei_means))

In [9]:
def create_df():
    cols=['age', 'sex', 'diagnosis']
    header = wfdb.rdsamp(f'data/1')[1]
    leads = header['sig_name']
    for wave in ('p', 'qrs', 't'):
        for value in (f'{wave}_len_mean', f'{wave}_len_std', f'{wave}_height_mean', f'{wave}_height_std'):
            cols.append(value)
    df=pd.DataFrame(columns=cols)
    return df
create_df()

Unnamed: 0,age,sex,diagnosis,p_len_mean,p_len_std,p_height_mean,p_height_std,qrs_len_mean,qrs_len_std,qrs_height_mean,qrs_height_std,t_len_mean,t_len_std,t_height_mean,t_height_std


In [10]:
def create_row(num):
    row={}
    row['age']=ECGS[num]['age']
    row['sex']=ECGS[num]['sex']
    row['diagnosis']=ECGS[num]['diagnosis']
    header = wfdb.rdsamp(f'data/{num}')[1]
    leads = header['sig_name']
    row['p_len_mean']=duration_stats(lambda c: c.p, num)[0]
    row['p_len_std']=duration_stats(lambda c: c.p, num)[1]
    row['p_height_mean']=amplitude_stats(lambda c: c.p, num)[0]
    row['p_height_std']=amplitude_stats(lambda c: c.p, num)[1]
    row['qrs_len_mean']=duration_stats(lambda c: c.qrs, num)[0]
    row['qrs_len_std']=duration_stats(lambda c: c.qrs, num)[1]
    row['qrs_height_mean']=amplitude_stats(lambda c: c.qrs, num)[0]
    row['qrs_height_std']=amplitude_stats(lambda c: c.qrs, num)[1]
    row['t_len_mean']=duration_stats(lambda c: c.t, num)[0]
    row['t_len_std']=duration_stats(lambda c: c.t, num)[1]
    row['t_height_mean']=amplitude_stats(lambda c: c.t, num)[0]
    row['t_height_std']=amplitude_stats(lambda c: c.t, num)[1]
    return row

In [11]:
def fill_dataframe():
    df=create_df()
    errors=[7, 8, 34, 95, 104, 111, 116, 198]
    valid=[*range(1, 201)]
    for num in errors:
        valid.remove(num)
    for num in valid:
        row=create_row(num)
        df=df.append(row, ignore_index=True)
    return df

In [12]:
df=fill_dataframe()

  return (np.nanmean(len_means), np.nanstd(len_means))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return (np.nanmean(hei_means), np.nanstd(hei_means))


In [13]:
df.head(46)

Unnamed: 0,age,sex,diagnosis,p_len_mean,p_len_std,p_height_mean,p_height_std,qrs_len_mean,qrs_len_std,qrs_height_mean,qrs_height_std,t_len_mean,t_len_std,t_height_mean,t_height_std
0,51.0,0.0,1.0,0.0964,0.014603,0.055155,0.021621,0.084222,0.008329,0.91959,0.035564,0.2226,0.02057,0.193471,0.093905
1,64.0,1.0,0.0,0.107833,0.021063,0.06493,0.029809,0.086111,0.009784,0.771826,0.118742,0.181312,0.01916,0.176244,0.067563
2,53.0,1.0,0.0,0.107167,0.015954,0.048464,0.031404,0.110278,0.003878,0.912847,0.045068,0.171104,0.043954,0.154999,0.077537
3,56.0,1.0,0.0,0.100521,0.013571,0.048817,0.025599,0.095759,0.007129,0.737953,0.12843,0.215167,0.032645,0.295983,0.114301
4,61.0,1.0,0.0,0.082028,0.010249,0.069191,0.038759,0.080095,0.009401,0.836427,0.117373,0.169194,0.030031,0.206417,0.049602
5,76.0,1.0,1.0,0.098333,0.01908,0.091271,0.061005,0.0845,0.011159,0.815281,0.085803,0.197167,0.035235,0.234846,0.199589
6,57.0,0.0,0.0,0.108479,0.016663,0.08067,0.024015,0.081944,0.008262,0.820893,0.116177,0.181708,0.016951,0.224188,0.114622
7,79.0,0.0,0.0,0.109778,0.006538,0.045444,0.021223,0.11175,0.017323,0.894097,0.058107,0.176476,0.021689,0.161338,0.071147
8,54.0,1.0,0.0,0.111619,0.017295,0.063464,0.031383,0.100292,0.008888,0.856046,0.08554,0.228905,0.030157,0.316179,0.192698
9,61.0,1.0,1.0,0.091972,0.009899,0.069203,0.040033,0.086929,0.006264,0.71133,0.162562,0.176861,0.025955,0.193821,0.059785


In [14]:
df.isna().any()

age                False
sex                False
diagnosis          False
p_len_mean          True
p_len_std           True
p_height_mean       True
p_height_std        True
qrs_len_mean       False
qrs_len_std        False
qrs_height_mean    False
qrs_height_std     False
t_len_mean         False
t_len_std          False
t_height_mean      False
t_height_std       False
dtype: bool

In [15]:
y=df['diagnosis']
X=df.drop('diagnosis', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)

Зависимость эффективности обучения от выбора модели:

In [117]:
def best_results(model, params):
    pipeline=Pipeline(steps=[('preprocessor', SimpleImputer(strategy='median')), ('normalize', StandardScaler()), ('model', model())])
    clf = GridSearchCV(pipeline, params, scoring='f1', refit=True)
    return(clf)

In [135]:
model=DecisionTreeClassifier
clf=best_results(model, {'model__min_samples_split': list(range(2, 10)), 'model__max_depth': list(range(1, 80))})
clf.fit(X, y)
preds=cross_val_predict(clf.best_estimator_, X, y, cv=5)
print(confusion_matrix(y, preds), clf.best_score_, clf.best_params_)

[[99 38]
 [32 23]] 0.4373523624575518 {'model__max_depth': 78, 'model__min_samples_split': 6}


In [139]:
X.isna().any()

age                False
sex                False
p_len_mean          True
p_len_std           True
p_height_mean       True
p_height_std        True
qrs_len_mean       False
qrs_len_std        False
qrs_height_mean    False
qrs_height_std     False
t_len_mean         False
t_len_std          False
t_height_mean      False
t_height_std       False
dtype: bool

In [137]:
preds

array([0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
       1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1.,
       0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.,
       1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0.,
       0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [138]:
y

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
187    0.0
188    0.0
189    1.0
190    0.0
191    0.0
Name: diagnosis, Length: 192, dtype: float64

In [23]:
ass = wfdb.rdann(f'data/1', 'ii')

In [25]:
ass.sample

array([ 644,  662,  682,  776,  843,  878, 1250, 1278, 1302, 1324, 1342,
       1374, 1458, 1524, 1572, 1911, 1935, 1955, 1979, 2000, 2028, 2120,
       2176, 2224, 2546, 2578, 2599, 2624, 2642, 2668, 2765, 2824, 2871,
       3223, 3247, 3270, 3286, 3314, 3347, 3434, 3491, 3539, 3879, 3903,
       3926, 3950, 3969, 3996])

In [31]:
 signals, fields = wfdb.rdsamp('data/1')

In [35]:
ass = wfdb.rdann('data/1', 'ii')

In [40]:
ass.symbol

['(',
 'N',
 ')',
 '(',
 't',
 ')',
 '(',
 'p',
 ')',
 '(',
 'N',
 ')',
 '(',
 't',
 ')',
 '(',
 'p',
 ')',
 '(',
 'N',
 ')',
 '(',
 't',
 ')',
 '(',
 'p',
 ')',
 '(',
 'N',
 ')',
 '(',
 't',
 ')',
 '(',
 'p',
 ')',
 '(',
 'N',
 ')',
 '(',
 't',
 ')',
 '(',
 'p',
 ')',
 '(',
 'N',
 ')']