In [1]:
# Importando bibliotecas
import os
import wfdb # to read physionet files
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For Classification Report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix, cohen_kappa_score

# For Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# For K-Fold
#from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold

# For SQI
from scipy import signal
from scipy.stats import skew, kurtosis
from scipy.signal import find_peaks, butter, lfilter
from scipy.signal.windows import tukey
from scipy.fft import fft
from scipy.signal import filtfilt, butter, bessel
from scipy.signal import welch

#### Classification Report Functions

In [2]:
# Generate binary classification analysis
def classification_metrics_binary(y_true, y_pred, model, fold):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    perc_tn, perc_fp, perc_fn, perc_tp = list(map(lambda x:x/len(y_true)*100, [tn, fp, fn, tp]))
    sensitivity = tp/(tp+fn)
    false_positive_rate = fp/(tn+fp)
    precision = tp/(tp+fp)
    specificity = tn/(tn+fp)
    accuracy = (tp+tn)/(tp+fp+tn+fn)
    f1_score = 2*precision*sensitivity/(sensitivity+precision)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    auc_value = auc(fpr, tpr)
    kappa = cohen_kappa_score(y_true, y_pred)
    
    return {"model":model, "fold": fold,
            "tn":tn, "fp":fp, "fn":fn, "tp":tp, "perc_tn":perc_tn, "perc_fp":perc_fp, 
            "perc_fn":perc_fn, "perc_tp":perc_tp, "sensitivity":sensitivity, 
            "tpr":sensitivity, "recall":sensitivity, "fpr":false_positive_rate, 
            "precision":precision, "ppv":precision, "specificity":specificity, 
            "tnr":specificity,"f1_score":f1_score, "auc":auc_value, "kappa":kappa, 
            "accuracy":accuracy}

In [3]:
# Plot Confusion Matrix
def plot_confusion(y_true, y_pred):
    # Generate the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    class_name = ['Good', 'Bad']
    cm = pd.DataFrame(cm, index=class_name, columns=class_name)

    # Plot the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')

    # Salva a figura como .eps
    #plt.savefig('confusion_matrix_FAxNORMAL_preprocessed.eps', format='eps')

    # Salva a figura como .png com 300 DPI
    #plt.savefig('confusion_matrix_FAxNORMAL_preprocessed.png', format='png', dpi=300)

    plt.show()

#### Classifier Functions

In [4]:
RANDOM_SEED = 32

# Método random forest estoura a memória. Utilizar o HPC para rodar.
methods = { 'xgboost': xgboost.XGBClassifier(seed=RANDOM_SEED),
            'logistic_regression': LogisticRegression(random_state=RANDOM_SEED),
            'random_forest': RandomForestClassifier(random_state=RANDOM_SEED),
            'gradient_boosting': GradientBoostingClassifier(random_state=RANDOM_SEED),
            'svm': SVC(random_state=RANDOM_SEED),
            'knn': KNeighborsClassifier(),
            'decision_tree': DecisionTreeClassifier(random_state=RANDOM_SEED),
            'naive_bayes': GaussianNB()
           }

#### Defining Signal Quality Indexes Functions

In [5]:
def compute_iSQI(detected_peaks, n_samples):
    '''
    This function calculates the iSQI based on the number of detected peaks in an ECG signal. 
    The iSQI is expressed as a percentage, representing the ratio of detected peaks to the total number of samples in the signal.

    Args.
        detected_peaks: A list or array containing the indices of detected peaks in the ECG signal.
        n_samples: The total number of samples in the ECG signal.

    Returns.
        iSQI: The computed iSQI expressed as a percentage.
    '''

    if n_samples == 0:
        return 0
    
    return len(detected_peaks) / n_samples * 100

In [6]:
def compute_bSQI(detected_peaks_method1, detected_peaks_method2):
    '''
    This function calculates the bSQI based on the common peaks detected by two different methods. 
    The bSQI is expressed as a percentage, representing the ratio of common peaks to the total number of peaks detected by the first method.

    Args.
        detected_peaks_method1: A list or array containing the indices of peaks detected by the first method.
        detected_peaks_method2: A list or array containing the indices of peaks detected by the second method.

    Returns.
        bSQI: The computed bSQI expressed as a percentage.
    '''
    common_peaks = set(detected_peaks_method1).intersection(detected_peaks_method2)

    if len(detected_peaks_method1) == 0:
        return 0
        
    return len(common_peaks) / len(detected_peaks_method1) * 100

In [7]:
def compute_fSQI(ecg_signal, sampling_rate=1000, freq_range=(5, 20)):
    '''
    This function calculates the fSQI based on the power spectrum of the ECG signal in a specified frequency range. 
    The fSQI is expressed as the ratio of power between 5 and 20 Hz to the power up to half of the sampling rate.

    Args.
        ecg_signal: The ECG signal for which to compute the fSQI.
        sampling_rate: The sampling rate of the ECG signal (default is 1000 Hz).
        freq_range: The frequency range for computing the power ratio (default is (5, 20) Hz).

    Returns.
        fSQI
    '''
    f, pxx = welch(ecg_signal, fs=sampling_rate, nperseg=len(ecg_signal))
    
    freq_mask = (f >= freq_range[0]) & (f <= freq_range[1])
    power_5_20Hz = np.sum(pxx[freq_mask])
    
    freq_mask_0_fnHz = f <= 0.5 * sampling_rate
    power_0_fnHz = np.sum(pxx[freq_mask_0_fnHz])

    if power_0_fnHz == 0:
        return np.nan  # Retorna NaN se power_0_fnHz for zero para evitar warning
    
    return power_5_20Hz / power_0_fnHz

In [8]:
def compute_sSQI(ecg_signal):
    '''
    This function calculates the sSQI based on the skewness of the ECG signal.

    Args.
        ecg_signal: The ECG signal for which to compute the sSQI.

    Returns.
        sSQI
    '''
    return skew(ecg_signal)

In [9]:
def compute_kSQI(ecg_signal):
    '''
    This function calculates the kSQI based on the kurtosis of the ECG signal.

    Args.
        ecg_signal: The ECG signal for which to compute the kSQI.

    Returns.
        kSQI
    '''
    return kurtosis(ecg_signal)

In [10]:
def compute_pSQI(ecg_signal, epsilon=0.01):
    '''
    This function calculates the pSQI based on the percentage of flat lines in the ECG signal. 
    Flat lines are identified using the gradient of the signal.

    Args.
        ecg_signal: The ECG signal for which to compute the pSQI.
        epsilon: The threshold for considering a gradient as flat (default is 0.01).

    Returns.
            pSQI: The computed Percentage of Flat Lines.
    '''
    dxm_dt = np.gradient(ecg_signal)
    flat_line_percentage = np.sum(np.abs(dxm_dt) < epsilon) / len(ecg_signal) * 100
    return flat_line_percentage

In [11]:
def compute_zcrSQI(y, threshold=1e-10, zero_pos=False, axis=-1):
    '''
    This function calculates the zero_crossings_rate_sqi based on the rate of sign changes in the processed signal. 
    It represents the rate at which the signal changes from positive to negative or vice versa. The zero_crossings_rate_sqi is expressed as a percentage.

    Args.
        y: The signal for which to compute the zero_crossings_rate_sqi.
        threshold: Values below this threshold are clipped to 0 (default is 1e-10).
        zero_pos: Boolean indicating whether zero crossings should be marked (default is False).
        axis: Axis along which to compute zero-crossings (default is -1).

    Returns.
        zcrSQI: The computed zcrSQI expressed as a percentage.
    '''
    if threshold is not None:
        y[np.abs(y) <= threshold] = 0

    # Extract the sign bit
    if zero_pos:
        y_sign = np.signbit(y).astype(int)
    else:
        y_sign = np.sign(y)

    crossings = np.diff(y_sign, axis=axis) != 0

    return np.mean(crossings, axis=axis, keepdims=False)

In [12]:
def compute_mzcrSQI(y, threshold=1e-10, zero_pos=True, axis=-1):
    '''
    This function calculates the mean_crossing_rate_sqi, similar to zero_crossings_rate_sqi, but it focuses on the rate of crossing the signal mean. 
    The mean_crossing_rate_sqi is expressed as a percentage.

    Args.
        y: The signal for which to compute the mean_crossing_rate_sqi.
        threshold: Values below this threshold are clipped to 0 (default is 1e-10).
        zero_pos: Boolean indicating whether zero crossings should be marked (default is True).
        axis: Axis along which to compute mean-crossings (default is -1).

    Returns.
        mzcrSQI: The computed Mean zero crossing rate expressed as a percentage.
    '''
    return compute_zcrSQI(y-np.mean(y), threshold, zero_pos, axis)

In [13]:
def find_max_lenght_repeated_true(x):
    '''
    This function finds the maximum length of repeated true values in a boolean array. 
    It is used internally for the flat_line_sqi and saturation_sqi functions.

    Args.
        x: Boolean array.

    Returns.
        max_length: Maximum length of repeated true values.
    '''
    repeated = np.concatenate(([0], x, [0]))
    rep_diff = np.diff(repeated)
    rep_in = np.nonzero(rep_diff == 1)[0]
    rep_out = np.nonzero(rep_diff == -1)[0]
    
    if rep_in.size == 0:
        return 1
    
    if len(rep_out) > len(rep_in):
        rep_out = rep_out[:-2]

    # Garantir que ambos tenham o mesmo tamanho
    min_len = min(len(rep_out), len(rep_in))
    rep_out = rep_out[:min_len]
    rep_in = rep_in[:min_len]

    # Calcular a diferença entre os resultados
    diff_lengths = rep_out - rep_in

    if len(diff_lengths) == 0:
        return 1  # Retorna 1 se não houver diferenças (array vazio)

    return np.max(diff_lengths) + 1

In [14]:
def flat_line_sqi(x, fs, axis=-1):
    '''
    This function calculates the flat_line_sqi based on the length of repeated flat lines in the ECG signal. 
    Flat lines are identified using the find_max_length_repeated_true function.

    Args.
        x: The ECG signal for which to compute the flat_line_sqi.
        fs: The sampling rate of the ECG signal.
        axis: Axis along which to compute flat-line detection (default is -2).

    Returns.
        flat_line_sqi: The computed Flat Line Signal Quality Index.
    '''
    def find_lenght_repeated(x_):
        return find_max_lenght_repeated_true(np.diff(x_) == 0) - 1
    return np.apply_along_axis(find_lenght_repeated, axis, x) / fs

In [15]:
def saturation_sqi(x, fs, axis=-1, threshold=2.0):
    '''
    This function calculates the saturation_sqi based on the length of repeated values exceeding a specified threshold in the ECG signal.

    Args.
        x: The ECG signal for which to compute the saturation_sqi.
        fs: The sampling rate of the ECG signal.
        axis: Axis along which to compute saturation detection (default is -2).
        threshold: The threshold for identifying saturation (default is 2.0).

    Returns.
        saturation_sqi: The computed Saturation Signal Quality Index.
    '''
    def find_lenght_saturation(x_):
        return find_max_lenght_repeated_true(np.abs(x_) > threshold) - 1
    return np.apply_along_axis(find_lenght_saturation, axis, x) / fs

In [16]:
# noinspection PyTupleAssignmentBalance
def baseline_sqi(x, fs, axis=-1):
    '''
    This function calculates the baseline_sqi based on the maximum absolute values of the baseline of the ECG signal.

    Args.
        x: The ECG signal for which to compute the baseline_sqi.
        fs: The sampling rate of the ECG signal.
        axis: Axis along which to compute baseline detection (default is -2).

    Returns.
        max_baseline: The maximum absolute values of the baseline.
        baseline: The baseline of the ECG signal.
    '''
    b, a = butter(6, 1, 'low', analog=False, fs=fs)
    baseline = filtfilt(b, a, x, axis=axis)
    return np.max(np.abs(baseline), axis=axis), baseline

In [17]:
# noinspection PyTupleAssignmentBalance
def amplitude_sqi(x, fs, baseline=None, axis=-1):
    '''
    This function calculates the amplitude_sqi based on the maximum absolute values of the signal after subtracting the baseline.

    Args.
        x: The ECG signal for which to compute the amplitude_sqi.
        fs: The sampling rate of the ECG signal.
        baseline: The baseline of the ECG signal (if not provided, it will be calculated).
        axis: Axis along which to compute amplitude detection (default is -2).

    Returns.
        amplitude_sqi: The computed Amplitude Signal Quality Index.
    '''
    if baseline is None:
        baseline = filtfilt(*butter(6, 1, 'low', analog=False, fs=fs),
                            x, axis=axis)
    x = x - baseline
    x = filtfilt(*butter(2, 40, 'low', analog=False, fs=fs),
                 x, axis=axis)
    return np.max(np.abs(x), axis=axis)

In [18]:
def rr_variability_sqi(x, fs):
    '''
    This function calculates the RR Variability Signal Quality Index based on the mean, standard deviation, and coefficient of variation of RR intervals.

    Args.
        x: The signal as (n_channel, samples).
        fs: The sampling rate of the ECG signal.

    Returns.
        rr_mean: The mean RR interval.
        rr_std: The standard deviation of RR intervals.
        rr_cv: The coefficient of variation of RR intervals.
    '''

    # Verifica se x é unidimensional e o converte para um array bidimensional se necessário
    if x.ndim == 1:
        x = x.reshape(1, -1)
    
    nsig = x.shape[0]
    sig_len = x.shape[1]
    window = np.expand_dims(tukey(sig_len, 0.1), axis=(0))
    x = x * window

    x = filtfilt(*butter(2, 2, 'high', fs=fs), x, axis=1)
    x = filtfilt(*butter(2, [5, 15], 'bandpass', fs=fs), x, axis=1)
    x = filtfilt(*bessel(2, [0.5, 5], 'bandpass', fs=fs), x ** 2, axis=1)
    x = x / x.std(axis=1, keepdims=True)

    # Detect local peaks
    local_peaks = np.zeros(x.shape)
    local_peaks[:, 1:-1] = (x[:, :-2] < x[:, 1:-1]) & (x[:, 2:] < x[:, 1:-1]) & (x[:, 1:-1] > 0.5)
    rr_mean = np.zeros(nsig)
    rr_std = np.zeros(nsig)
    for i in range(nsig):
        sig_peaks = np.nonzero(local_peaks[i, :])[0]
        if len(sig_peaks) <= 1:
            rr_mean[i] = 1
            rr_std[i] = 0
        else:
            rr_mean[i] = np.mean(np.diff(sig_peaks))
            rr_std[i] = np.std(np.diff(sig_peaks))
    rr_cv = rr_std/rr_mean

    return rr_mean/fs, rr_std/fs, rr_cv

In [19]:
def power_sqi(x, fs):
    '''
    This function calculates the power_sqi and bsqi based on the power spectral density of the ECG signal.

    Args.
        x: The ECG signal for which to compute the power_sqi.
        fs: The sampling rate of the ECG signal.

    Returns.
        psqi: The Power Spectral Density Signal Quality Index.
        bsqi: The Baseline Power Spectral Density Signal Quality Index.
    '''
    
    if x.ndim == 1:
        x = x.reshape(1, -1)
    
    n = x.shape[1]

    y = fft(x, axis=1)

    # Estimates the Power Spectral Density
    y = (1 / (fs * n)) * abs(y)**2

    thresh_0 = 0
    thresh_1 = int(n/(fs/2) * 1)
    thresh_5 = int(n / (fs / 2) * 5)
    thresh_15 = int(n / (fs / 2) * 15)
    thresh_40 = int(n / (fs / 2) * 40)

    psqi = np.sum(y[:, thresh_5:thresh_15 ], axis=1) / np.sum(y[:, thresh_5:thresh_40], axis=1)
    bsqi = 1-np.sum(y[:, thresh_0:thresh_1], axis=1) / np.sum(y[:, thresh_0:thresh_40], axis=1)

    return psqi, bsqi

#### Join all SQIs in one Dictionary

In [20]:
def get_sqi_values(ecg_signal, fs, duration, detected_peaks_method1, detected_peaks_method2 = None):
    
    # Parameters
    t = np.arange(0, duration, 1/fs)
    n_samples = len(t)

    # If don't get detected_peaks_method2, make it equals to detected_peaks_method1
    if detected_peaks_method2 == None:
        detected_peaks_method2 = detected_peaks_method1
    
    # Calculate SQIs
    iSQI = compute_iSQI(detected_peaks_method1, n_samples)
    bSQI = compute_bSQI(detected_peaks_method1, detected_peaks_method2)
    fSQI = compute_fSQI(ecg_signal, fs)
    sSQI = compute_sSQI(ecg_signal)
    kSQI = compute_kSQI(ecg_signal)
    pSQI = compute_pSQI(ecg_signal)
    zcrSQI = compute_zcrSQI(ecg_signal)
    mzcrSQI = compute_mzcrSQI(ecg_signal)
    max_len_sqi = find_max_lenght_repeated_true(ecg_signal)
    flat_line_sqi_result = flat_line_sqi(ecg_signal, fs)
    saturation_sqi_result = saturation_sqi(ecg_signal, fs)
    baseline_sqi_result, _ = baseline_sqi(ecg_signal, fs)
    amplitude_sqi_result = amplitude_sqi(ecg_signal, fs)
    rr_mean, rr_std, rr_cv = rr_variability_sqi(ecg_signal, fs)
    power_sqi_result, bsqi_result = power_sqi(ecg_signal, fs)

    # Store values in dictionary
    sqi_functions_dir = {
        'iSQI': iSQI,
        'bSQI': bSQI,
        'fSQI': fSQI,
        'sSQI': sSQI,
        'kSQI': kSQI,
        'pSQI': pSQI,
        'zero_crossings_rate_sqi': zcrSQI,
        'mean_crossing_rate_sqi': mzcrSQI,
        'find_max_lenght_repeated_true': max_len_sqi,
        'flat_line_sqi': flat_line_sqi_result,
        'saturation_sqi': saturation_sqi_result,
        'baseline_sqi': baseline_sqi_result,
        'amplitude_sqi': amplitude_sqi_result,
        'rr_variability_sqi_mean': float(rr_mean),
        'rr_variability_sqi_std': float(rr_std),
        'rr_variability_sqi_cv': float(rr_cv),
        'power_sqi': float(power_sqi_result),
        'bsqi': float(bsqi_result)
    }


    sqi_functions_list = [
        iSQI,
        bSQI,
        fSQI,
        sSQI,
        kSQI,
        pSQI,
        zcrSQI,
        mzcrSQI,
        max_len_sqi,
        flat_line_sqi_result,
        saturation_sqi_result,
        baseline_sqi_result,
        amplitude_sqi_result,
        float(rr_mean),
        float(rr_std),
        float(rr_cv),
        float(power_sqi_result),
        float(bsqi_result)
        ]
    
    # print('SQIS: \n', sqi_functions_dir)
    return sqi_functions_dir, sqi_functions_list

#### Plot Detected Peaks

In [21]:
# Plot
def plot_detected_peaks(t, ecg_signal, detected_peaks):
    # Plotar os sinais original e filtrado com os picos QRS detectados
    plt.figure(figsize=(12, 8))
    plt.plot(t, ecg_signal, label='Sinal original', alpha=0.7)
    plt.scatter(t[detected_peaks], ecg_signal[detected_peaks], color='red', marker='o', label='Picos QRS')
    plt.title('Sinal de ECG com Picos R Detectados')
    plt.vlines(t[detected_peaks], *plt.ylim(), label="Picos", color=".2", alpha=0.3, ls="--")
    plt.xlabel('Tempo (s)')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.show()

#### Getting Physionet CINC 2011 Dataset

In [22]:
# Gerando o dataset
def get_dataset(dir):
    # Lista para armazenar os sinais, nomes de arquivo e classes
    df_ecg_cinc2011 = []
    
    # Lê os nomes dos pacientes da classe 'positiva' a partir do arquivo .txt
    with open(dir + 'RECORDS-acceptable', 'r') as file_positive:
        good_quality = [line.strip() for line in file_positive]

    # Lê os nomes dos pacientes da classe 'negativa' a partir do arquivo .txt
    with open(dir + 'RECORDS-unacceptable', 'r') as file_negative:
        bad_quality = [line.strip() for line in file_negative]

    # Itera sobre todos os arquivos no diretório
    for file in os.listdir(dir):
        if file.endswith('.hea'):
            # Extrai o nome do arquivo sem a extensão
            name = os.path.splitext(file)[0]

            # Carrega as informações do cabeçalho (.hea)
            record = wfdb.rdheader(os.path.join(dir, name))

            # Carrega os dados do sinal (.dat)
            signals, _ = wfdb.rdsamp(os.path.join(dir, name))

            # Como vou usar apenas a DII, seleciono apenas essa derivação
            ecg_signal = signals[:,1]

            # Determina a classe do paciente
            if name in good_quality:
                label = 1 # bom
            elif name in bad_quality:
                label = 0 # ruim
            else:
                label = np.nan

            # Identifica os picos nos sinais de ECG
            sampling_rate = record.fs
            duration = 10 # segundos
            t = np.arange(0, duration, 1/sampling_rate)
            peaks, _ = find_peaks(ecg_signal, height = 0.2, distance = sampling_rate/2)
            
            # plot_detected_peaks(t, ecg_signal, peaks)

            # Gera os indices de qualidade
            indexes_dict, indexes_list = get_sqi_values(ecg_signal, sampling_rate, duration, peaks)

            # Adiciona os sinais, informações adicionais e a classe à lista
            df_ecg_cinc2011.append({'Subject': name, 'Signal': ecg_signal, 'Sampling Rate': record.fs, 
                                    'Labels': label, 'Peaks': peaks, 'Features_dict': indexes_dict, 'Features_list': indexes_list})

    return df_ecg_cinc2011

## TESTING FUNCTIONS ON PHYSIONET CINC 2011 DATASET

In [23]:
# Diretório que contém os arquivos .hea e .dat
dir = 'C:/Users/estel/Documents/Python_Codes/set-a/'

In [24]:
df_orig = get_dataset(dir)

  return skew(ecg_signal)
  'rr_variability_sqi_mean': float(rr_mean),
  'rr_variability_sqi_std': float(rr_std),
  'rr_variability_sqi_cv': float(rr_cv),
  'power_sqi': float(power_sqi_result),
  'bsqi': float(bsqi_result)
  float(rr_mean),
  float(rr_std),
  float(rr_cv),
  float(power_sqi_result),
  float(bsqi_result)
  x = x / x.std(axis=1, keepdims=True)
  psqi = np.sum(y[:, thresh_5:thresh_15 ], axis=1) / np.sum(y[:, thresh_5:thresh_40], axis=1)
  bsqi = 1-np.sum(y[:, thresh_0:thresh_1], axis=1) / np.sum(y[:, thresh_0:thresh_40], axis=1)


In [25]:
df_orig = pd.DataFrame(df_orig)
df_orig.head()

Unnamed: 0,Subject,Signal,Sampling Rate,Labels,Peaks,Features_dict,Features_list
0,1002603,"[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -80...",500,0.0,[],"{'iSQI': 0.0, 'bSQI': 0, 'fSQI': nan, 'sSQI': ...","[0.0, 0, nan, nan, nan, 100.0, 0.0, 0.0, 1, 9...."
1,1002867,"[-3.1, -3.07, -3.04, -3.02, -2.995, -2.955, -2...",500,1.0,"[248, 731, 1218, 1706, 2197, 2679, 3167, 3648,...","{'iSQI': 0.2, 'bSQI': 100.0, 'fSQI': 0.5440391...","[0.2, 100.0, 0.5440391449948266, -0.4661209318..."
2,1003574,"[-0.78, -0.77, -0.755, -0.74, -0.72, -0.7, -0....",500,0.0,"[713, 1283, 2344, 2655, 3277, 3548, 4240, 4605]","{'iSQI': 0.16, 'bSQI': 100.0, 'fSQI': 0.005520...","[0.16, 100.0, 0.005520486057508684, -2.1746740..."
3,1004502,"[0.32, 0.3, 0.275, 0.24, 0.215, 0.21, 0.22, 0....",500,0.0,"[245, 499, 848, 1228, 1723, 1974, 2450, 2813, ...","{'iSQI': 0.3, 'bSQI': 100.0, 'fSQI': 0.2788567...","[0.3, 100.0, 0.2788567246878214, -0.6684783077..."
4,1005639,"[-0.04, -0.015, 0.0, 0.0, 0.0, 0.015, 0.03, 0....",500,1.0,"[240, 713, 1196, 1674, 2154, 2630, 3109, 3590,...","{'iSQI': 0.2, 'bSQI': 100.0, 'fSQI': 0.6196047...","[0.2, 100.0, 0.6196047753776014, 4.41984119786..."


In [26]:
df_orig['Features_list'][0]

[0.0,
 0,
 nan,
 nan,
 nan,
 100.0,
 0.0,
 0.0,
 1,
 9.998,
 10.0,
 80.01300202817336,
 0.01300202832038576,
 4.726,
 4.278,
 0.9052052475666525,
 0.7557418802263444,
 0.0]

In [27]:
df_orig['Features_dict'][0]

{'iSQI': 0.0,
 'bSQI': 0,
 'fSQI': nan,
 'sSQI': nan,
 'kSQI': nan,
 'pSQI': 100.0,
 'zero_crossings_rate_sqi': 0.0,
 'mean_crossing_rate_sqi': 0.0,
 'find_max_lenght_repeated_true': 1,
 'flat_line_sqi': 9.998,
 'saturation_sqi': 10.0,
 'baseline_sqi': 80.01300202817336,
 'amplitude_sqi': 0.01300202832038576,
 'rr_variability_sqi_mean': 4.726,
 'rr_variability_sqi_std': 4.278,
 'rr_variability_sqi_cv': 0.9052052475666525,
 'power_sqi': 0.7557418802263444,
 'bsqi': 0.0}

In [28]:
float(df_orig['Features_dict'][0]['bsqi'])

0.0

In [29]:
df_orig['Features_dict'][1]

{'iSQI': 0.2,
 'bSQI': 100.0,
 'fSQI': 0.5440391449948266,
 'sSQI': -0.46612093186253545,
 'kSQI': 19.392805175501238,
 'pSQI': 64.56,
 'zero_crossings_rate_sqi': 0.07801560312062412,
 'mean_crossing_rate_sqi': 0.039407881576315265,
 'find_max_lenght_repeated_true': 1,
 'flat_line_sqi': 0.016,
 'saturation_sqi': 0.048,
 'baseline_sqi': 1.9390373857759158,
 'amplitude_sqi': 1.8678861817318133,
 'rr_variability_sqi_mean': 0.9717777777777779,
 'rr_variability_sqi_std': 0.005202088849208723,
 'rr_variability_sqi_cv': 0.005353167121298708,
 'power_sqi': 0.9237084305624395,
 'bsqi': 0.690461960113415}

In [30]:
# Verificar se há NaN nos dados
df_orig.isna().any()

Subject          False
Signal           False
Sampling Rate    False
Labels            True
Peaks            False
Features_dict    False
Features_list    False
dtype: bool

In [31]:
# É provavel que no LABEL haja NaN. Vou remover esses:
df_orig = df_orig.dropna()

In [32]:
# Separar o dataset para fazer a classificação

df = pd.DataFrame(df_orig['Features_list'].tolist(), columns=df_orig['Features_dict'][0].keys())
df['Labels'] = df_orig['Labels']

In [33]:
df.head(10)

Unnamed: 0,iSQI,bSQI,fSQI,sSQI,kSQI,pSQI,zero_crossings_rate_sqi,mean_crossing_rate_sqi,find_max_lenght_repeated_true,flat_line_sqi,saturation_sqi,baseline_sqi,amplitude_sqi,rr_variability_sqi_mean,rr_variability_sqi_std,rr_variability_sqi_cv,power_sqi,bsqi,Labels
0,0.0,0.0,,,,100.0,0.0,0.0,1,9.998,10.0,80.013002,0.013002,4.726,4.278,0.905205,0.755742,0.0,0.0
1,0.2,100.0,0.5440391,-0.466121,19.392805,64.56,0.078016,0.039408,1,0.016,0.048,1.939037,1.867886,0.971778,0.005202,0.005353,0.923708,0.690462,1.0
2,0.16,100.0,0.005520486,-2.174674,9.449453,49.3,0.030206,0.013203,1,0.012,0.36,2.977677,1.656793,1.005111,0.277088,0.275679,0.912231,0.033376,0.0
3,0.3,100.0,0.2788567,-0.668478,2.535006,45.08,0.047009,0.042408,1,0.012,0.0,0.243427,1.078492,0.497889,0.083689,0.168087,0.89694,0.804223,0.0
4,0.2,100.0,0.6196048,4.419841,25.069053,47.04,0.106821,0.067213,1,0.016,0.0,0.046265,0.863515,0.955333,0.008944,0.009362,0.839863,0.963011,1.0
5,0.0,0.0,1.9034410000000002e-33,,,100.0,0.0,0.0,1,9.998,10.0,79.910072,0.02004,4.726,4.278,0.905205,0.665845,0.0,0.0
6,0.22,100.0,0.2369448,-4.129357,36.067994,54.8,0.119424,0.086617,1,0.02,0.0,0.283151,1.387522,1.11,0.0,0.0,0.861823,0.460622,1.0
7,0.04,100.0,0.3202616,-0.820782,4.241162,69.58,0.093019,0.062613,1,0.032,0.0,0.059582,0.277474,0.675077,0.026725,0.039588,0.823916,0.71805,1.0
8,0.32,100.0,0.7125465,-0.928658,6.246902,49.92,0.132026,0.105021,1,0.012,0.0,0.072389,0.553734,0.614714,0.092036,0.149722,0.967226,0.847245,1.0
9,0.24,100.0,0.6193395,3.91853,17.626171,52.98,0.067013,0.05161,1,0.02,0.0,0.079264,1.264574,0.831455,0.003421,0.004114,0.975185,0.882593,1.0


In [34]:
# Substituir NaN por 0 em todo o DataFrame
df.fillna(0, inplace=True)

In [35]:
# Verifica se há NaN
df.isna().any()

iSQI                             False
bSQI                             False
fSQI                             False
sSQI                             False
kSQI                             False
pSQI                             False
zero_crossings_rate_sqi          False
mean_crossing_rate_sqi           False
find_max_lenght_repeated_true    False
flat_line_sqi                    False
saturation_sqi                   False
baseline_sqi                     False
amplitude_sqi                    False
rr_variability_sqi_mean          False
rr_variability_sqi_std           False
rr_variability_sqi_cv            False
power_sqi                        False
bsqi                             False
Labels                           False
dtype: bool

In [36]:
#df.to_csv('dataframe_cinc2011.csv')

In [37]:
# Criando X e y para classificação
X = df.drop(columns=['Labels'])
y = df['Labels']

In [38]:
X

Unnamed: 0,iSQI,bSQI,fSQI,sSQI,kSQI,pSQI,zero_crossings_rate_sqi,mean_crossing_rate_sqi,find_max_lenght_repeated_true,flat_line_sqi,saturation_sqi,baseline_sqi,amplitude_sqi,rr_variability_sqi_mean,rr_variability_sqi_std,rr_variability_sqi_cv,power_sqi,bsqi
0,0.00,0.0,0.000000,0.000000,0.000000,100.00,0.000000,0.000000,1,9.998,10.000,80.013002,0.013002,4.726000,4.278000,0.905205,0.755742,0.000000
1,0.20,100.0,0.544039,-0.466121,19.392805,64.56,0.078016,0.039408,1,0.016,0.048,1.939037,1.867886,0.971778,0.005202,0.005353,0.923708,0.690462
2,0.16,100.0,0.005520,-2.174674,9.449453,49.30,0.030206,0.013203,1,0.012,0.360,2.977677,1.656793,1.005111,0.277088,0.275679,0.912231,0.033376
3,0.30,100.0,0.278857,-0.668478,2.535006,45.08,0.047009,0.042408,1,0.012,0.000,0.243427,1.078492,0.497889,0.083689,0.168087,0.896940,0.804223
4,0.20,100.0,0.619605,4.419841,25.069053,47.04,0.106821,0.067213,1,0.016,0.000,0.046265,0.863515,0.955333,0.008944,0.009362,0.839863,0.963011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0.04,100.0,0.237600,4.682297,20.321459,78.22,0.000800,0.000800,1,2.618,10.000,96.460091,101.184824,2.161000,1.825000,0.844516,0.795078,0.010157
994,0.26,100.0,0.486928,-1.396489,5.821627,56.62,0.096819,0.059812,1,0.022,0.000,0.100702,0.710963,0.439300,0.069436,0.158060,0.913264,0.817194
995,0.24,100.0,0.599578,3.107946,14.621376,69.52,0.043409,0.031606,1,0.030,0.000,0.023694,0.698364,0.815273,0.033491,0.041080,0.906902,0.887943
996,0.24,100.0,0.624298,3.345171,16.979691,59.98,0.166833,0.126025,1,0.020,0.000,0.054890,0.520235,0.805273,0.076226,0.094658,0.926429,0.884870


In [39]:
y

0      0.0
1      1.0
2      0.0
3      0.0
4      1.0
      ... 
993    0.0
994    1.0
995    0.0
996    0.0
997    1.0
Name: Labels, Length: 998, dtype: float64

In [40]:
# Classificando K-FOLD
resultados = []

for method in methods.keys():
    m = methods[method]
    
    # Configuração do k-fold
    n_splits = 5
    kfold = KFold(n_splits=n_splits, shuffle=False)
    
    # Loop sobre os k-folds
    for idx, (train_idx, test_idx) in enumerate(kfold.split(X)):
        
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        print(method, X_train.shape, y_train.shape, X_test.shape, y_test.shape)
        model = m.fit(X_train, y_train)
        
        # Faz predições no conjunto de teste
        y_pred = model.predict(X_test)
        
        # Avalia o desempenho do modelo e armazena os resultados
        results_fold = classification_metrics_binary(y_test, y_pred, method, idx)
        resultados.append(results_fold)
    

xgboost (798, 18) (798,) (200, 18) (200,)
xgboost (798, 18) (798,) (200, 18) (200,)
xgboost (798, 18) (798,) (200, 18) (200,)
xgboost (799, 18) (799,) (199, 18) (199,)
xgboost (799, 18) (799,) (199, 18) (199,)
logistic_regression (798, 18) (798,) (200, 18) (200,)
logistic_regression (798, 18) (798,) (200, 18) (200,)
logistic_regression (798, 18) (798,) (200, 18) (200,)
logistic_regression (799, 18) (799,) (199, 18) (199,)
logistic_regression (799, 18) (799,) (199, 18) (199,)
random_forest (798, 18) (798,) (200, 18) (200,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

random_forest (798, 18) (798,) (200, 18) (200,)
random_forest (798, 18) (798,) (200, 18) (200,)
random_forest (799, 18) (799,) (199, 18) (199,)
random_forest (799, 18) (799,) (199, 18) (199,)
gradient_boosting (798, 18) (798,) (200, 18) (200,)
gradient_boosting (798, 18) (798,) (200, 18) (200,)
gradient_boosting (798, 18) (798,) (200, 18) (200,)
gradient_boosting (799, 18) (799,) (199, 18) (199,)
gradient_boosting (799, 18) (799,) (199, 18) (199,)
svm (798, 18) (798,) (200, 18) (200,)
svm (798, 18) (798,) (200, 18) (200,)
svm (798, 18) (798,) (200, 18) (200,)
svm (799, 18) (799,) (199, 18) (199,)
svm (799, 18) (799,) (199, 18) (199,)
knn (798, 18) (798,) (200, 18) (200,)
knn (798, 18) (798,) (200, 18) (200,)
knn (798, 18) (798,) (200, 18) (200,)
knn (799, 18) (799,) (199, 18) (199,)
knn (799, 18) (799,) (199, 18) (199,)
decision_tree (798, 18) (798,) (200, 18) (200,)
decision_tree (798, 18) (798,) (200, 18) (200,)
decision_tree (798, 18) (798,) (200, 18) (200,)
decision_tree (799, 18) 

In [41]:
# Imprime os resultados
for i, resultado in enumerate(resultados):
    print(resultados[i], '\n')

{'model': 'xgboost', 'fold': 0, 'tn': 5, 'fp': 59, 'fn': 11, 'tp': 125, 'perc_tn': 2.5, 'perc_fp': 29.5, 'perc_fn': 5.5, 'perc_tp': 62.5, 'sensitivity': 0.9191176470588235, 'tpr': 0.9191176470588235, 'recall': 0.9191176470588235, 'fpr': 0.921875, 'precision': 0.6793478260869565, 'ppv': 0.6793478260869565, 'specificity': 0.078125, 'tnr': 0.078125, 'f1_score': 0.7812500000000001, 'auc': 0.49862132352941174, 'kappa': -0.0034403669724769603, 'accuracy': 0.65} 

{'model': 'xgboost', 'fold': 1, 'tn': 7, 'fp': 43, 'fn': 11, 'tp': 139, 'perc_tn': 3.5000000000000004, 'perc_fp': 21.5, 'perc_fn': 5.5, 'perc_tp': 69.5, 'sensitivity': 0.9266666666666666, 'tpr': 0.9266666666666666, 'recall': 0.9266666666666666, 'fpr': 0.86, 'precision': 0.7637362637362637, 'ppv': 0.7637362637362637, 'specificity': 0.14, 'tnr': 0.14, 'f1_score': 0.8373493975903613, 'auc': 0.5333333333333333, 'kappa': 0.0847457627118644, 'accuracy': 0.73} 

{'model': 'xgboost', 'fold': 2, 'tn': 2, 'fp': 34, 'fn': 15, 'tp': 149, 'perc_

In [42]:
df_resultados = pd.DataFrame(resultados)
df_resultados

Unnamed: 0,model,fold,tn,fp,fn,tp,perc_tn,perc_fp,perc_fn,perc_tp,...,recall,fpr,precision,ppv,specificity,tnr,f1_score,auc,kappa,accuracy
0,xgboost,0,5,59,11,125,2.5,29.5,5.5,62.5,...,0.919118,0.921875,0.679348,0.679348,0.078125,0.078125,0.78125,0.498621,-0.00344,0.65
1,xgboost,1,7,43,11,139,3.5,21.5,5.5,69.5,...,0.926667,0.86,0.763736,0.763736,0.14,0.14,0.837349,0.533333,0.084746,0.73
2,xgboost,2,2,34,15,149,1.0,17.0,7.5,74.5,...,0.908537,0.944444,0.814208,0.814208,0.055556,0.055556,0.85879,0.482046,-0.045222,0.755
3,xgboost,3,3,40,16,140,1.507538,20.100503,8.040201,70.351759,...,0.897436,0.930233,0.777778,0.777778,0.069767,0.069767,0.833333,0.483602,-0.041106,0.718593
4,xgboost,4,5,29,13,152,2.512563,14.572864,6.532663,76.38191,...,0.921212,0.852941,0.839779,0.839779,0.147059,0.147059,0.878613,0.534135,0.083954,0.788945
5,logistic_regression,0,0,64,0,136,0.0,32.0,0.0,68.0,...,1.0,1.0,0.68,0.68,0.0,0.0,0.809524,0.5,0.0,0.68
6,logistic_regression,1,0,50,0,150,0.0,25.0,0.0,75.0,...,1.0,1.0,0.75,0.75,0.0,0.0,0.857143,0.5,0.0,0.75
7,logistic_regression,2,0,36,1,163,0.0,18.0,0.5,81.5,...,0.993902,1.0,0.819095,0.819095,0.0,0.0,0.898072,0.496951,-0.009825,0.815
8,logistic_regression,3,0,43,3,153,0.0,21.60804,1.507538,76.884422,...,0.980769,1.0,0.780612,0.780612,0.0,0.0,0.869318,0.490385,-0.029002,0.768844
9,logistic_regression,4,1,33,0,165,0.502513,16.582915,0.0,82.914573,...,1.0,0.970588,0.833333,0.833333,0.029412,0.029412,0.909091,0.514706,0.047847,0.834171


In [43]:
selected_features = ['iSQI', 'fSQI', 'sSQI', 'kSQI', 'pSQI']
X_1 = df[selected_features]

In [44]:
X_1

Unnamed: 0,iSQI,fSQI,sSQI,kSQI,pSQI
0,0.00,0.000000,0.000000,0.000000,100.00
1,0.20,0.544039,-0.466121,19.392805,64.56
2,0.16,0.005520,-2.174674,9.449453,49.30
3,0.30,0.278857,-0.668478,2.535006,45.08
4,0.20,0.619605,4.419841,25.069053,47.04
...,...,...,...,...,...
993,0.04,0.237600,4.682297,20.321459,78.22
994,0.26,0.486928,-1.396489,5.821627,56.62
995,0.24,0.599578,3.107946,14.621376,69.52
996,0.24,0.624298,3.345171,16.979691,59.98


In [45]:
# Classificando K-FOLD
resultados_1 = []

for method in methods.keys():
    m = methods[method]
    
    # Configuração do k-fold
    n_splits = 5
    kfold = KFold(n_splits=n_splits, shuffle=False)
    
    # Loop sobre os k-folds
    for idx, (train_idx, test_idx) in enumerate(kfold.split(X_1)):
        
        X_train, X_test = X_1.iloc[train_idx], X_1.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        print(method, X_train.shape, y_train.shape, X_test.shape, y_test.shape)
        model = m.fit(X_train, y_train)
        
        # Faz predições no conjunto de teste
        y_pred = model.predict(X_test)
        
        # Avalia o desempenho do modelo e armazena os resultados
        results_fold = classification_metrics_binary(y_test, y_pred, method, idx)
        resultados_1.append(results_fold)
    

xgboost (798, 5) (798,) (200, 5) (200,)
xgboost (798, 5) (798,) (200, 5) (200,)
xgboost (798, 5) (798,) (200, 5) (200,)
xgboost (799, 5) (799,) (199, 5) (199,)
xgboost (799, 5) (799,) (199, 5) (199,)
logistic_regression (798, 5) (798,) (200, 5) (200,)
logistic_regression (798, 5) (798,) (200, 5) (200,)
logistic_regression (798, 5) (798,) (200, 5) (200,)
logistic_regression (799, 5) (799,) (199, 5) (199,)
logistic_regression (799, 5) (799,) (199, 5) (199,)
random_forest (798, 5) (798,) (200, 5) (200,)
random_forest (798, 5) (798,) (200, 5) (200,)
random_forest (798, 5) (798,) (200, 5) (200,)
random_forest (799, 5) (799,) (199, 5) (199,)
random_forest (799, 5) (799,) (199, 5) (199,)
gradient_boosting (798, 5) (798,) (200, 5) (200,)
gradient_boosting (798, 5) (798,) (200, 5) (200,)
gradient_boosting (798, 5) (798,) (200, 5) (200,)
gradient_boosting (799, 5) (799,) (199, 5) (199,)
gradient_boosting (799, 5) (799,) (199, 5) (199,)
svm (798, 5) (798,) (200, 5) (200,)
svm (798, 5) (798,) (200

In [46]:
df_resultados_1 = pd.DataFrame(resultados_1)
df_resultados_1

Unnamed: 0,model,fold,tn,fp,fn,tp,perc_tn,perc_fp,perc_fn,perc_tp,...,recall,fpr,precision,ppv,specificity,tnr,f1_score,auc,kappa,accuracy
0,xgboost,0,7,57,8,128,3.5,28.5,4.0,64.0,...,0.941176,0.890625,0.691892,0.691892,0.109375,0.109375,0.797508,0.525276,0.063401,0.675
1,xgboost,1,6,44,17,133,3.0,22.0,8.5,66.5,...,0.886667,0.88,0.751412,0.751412,0.12,0.12,0.813456,0.503333,0.00813,0.695
2,xgboost,2,4,32,17,147,2.0,16.0,8.5,73.5,...,0.896341,0.888889,0.821229,0.821229,0.111111,0.111111,0.857143,0.503726,0.0089,0.755
3,xgboost,3,5,38,16,140,2.512563,19.095477,8.040201,70.351759,...,0.897436,0.883721,0.786517,0.786517,0.116279,0.116279,0.838323,0.506857,0.016834,0.728643
4,xgboost,4,4,30,19,146,2.01005,15.075377,9.547739,73.366834,...,0.884848,0.882353,0.829545,0.829545,0.117647,0.117647,0.856305,0.501248,0.002863,0.753769
5,logistic_regression,0,0,64,0,136,0.0,32.0,0.0,68.0,...,1.0,1.0,0.68,0.68,0.0,0.0,0.809524,0.5,0.0,0.68
6,logistic_regression,1,0,50,0,150,0.0,25.0,0.0,75.0,...,1.0,1.0,0.75,0.75,0.0,0.0,0.857143,0.5,0.0,0.75
7,logistic_regression,2,0,36,0,164,0.0,18.0,0.0,82.0,...,1.0,1.0,0.82,0.82,0.0,0.0,0.901099,0.5,0.0,0.82
8,logistic_regression,3,0,43,0,156,0.0,21.60804,0.0,78.39196,...,1.0,1.0,0.78392,0.78392,0.0,0.0,0.878873,0.5,0.0,0.78392
9,logistic_regression,4,0,34,0,165,0.0,17.085427,0.0,82.914573,...,1.0,1.0,0.829146,0.829146,0.0,0.0,0.906593,0.5,0.0,0.829146
