##  Sklearn PCG classification pipeline for the SIPAIM conference paper

In this notebook we tested two sets of features and some classifiers according to the SIPAIM paper.
The features came from a MP-LPC representation of a PCG signal, two sets were creating according to average the parameters or the samples of each PCG signal.

In [1]:
# Loading libraries
import os, sys 
import numpy as np
from os import listdir, path
import math
import pandas as pd
from pandas import DataFrame
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE

# Functions used

In [2]:
# Function to count the elements in the series or list, give as output a frequency table
def to_frequency_table(data):
    frequencytable = {}
    for key in data:
        if key in frequencytable:
            frequencytable[key] += 1
        else:
            frequencytable[key] = 1
    return frequencytable
#  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
# Function to split the data into train, test and validation datasets 
def MyTrainTestSplit (csv_data,PerTest):
    X , y = csv_data.iloc[:,:-1], csv_data.iloc[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=PerTest, random_state=1)
    return  X_train, y_train, X_test, y_test 
#  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
# Cross validation test and metrics calculation function
# The input arguments are the Features "X_data" and classes "y_data", a random seed = "seed", 
# the number of Folds for the CV test "nFolds" and a list of objects which contains the bunch of classifiers= "models"
#  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
def AllCVTest(X_data,y_data,seed,nFolds,models):
    # Empty lists to save at each iteration the Mean, Std of the Se, Sp, MCC and ACC. The number of classidier also
    Mean_Acc = []
    Std_Acc = []
    Mean_Se = []
    Std_Se = []
    Mean_Sp = []
    Std_Sp = []
    Mean_MCC = []
    Std_MCC = []
    nameClassifier = []
 # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  
 # We have to define our own functions to calculate the metrix in order to use the "make_scorer" approach. Noticed 
 # that each function have to return just one value
 # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  
    # Returning Matthews correlation coefficient MCC
    def MCC(y_true,y_pred): return matthews_corrcoef(y_true,y_pred)
    # Returning Accuracy Score ACC
    def Acc(y_true,y_pred): return accuracy_score(y_true,y_pred)
    # Function to calculate Sensitivity SE 
    def Se(y_true,y_pred):
        # True positives and False negatives can be calculated from confusion matrix
        cm =confusion_matrix(y_true,y_pred)
        TP = cm[0, 0]
        FN = cm[1, 0]
        Se = (TP/(TP+FN))*100
        return Se
    def Sp(y_true,y_pred):
        cm =confusion_matrix(y_true,y_pred)
        FP = cm[0,1]
        TN = cm[1,1]
        Sp = (TN/(TN+FP))*100
        return Sp    
    scoring = {'MCC':make_scorer(MCC),'Acc':make_scorer(Acc),'Se':make_scorer(Se),'Sp':make_scorer(Sp)}
    for name, model in models:
        # If the model is SVM, parameters are being normalized 
        if  name =='SVM':
            X_scaled = preprocessing.scale(X_data)
            min_max_scaler = preprocessing.MinMaxScaler()
            X_data = min_max_scaler.fit_transform(X_scaled)
            le = preprocessing.LabelEncoder()
            le.fit(y_data)
            y_data = le.transform(y_data)            
        # The stratified (due to data unbalancing) k-fold indices to split the data. 
        kfold = model_selection.StratifiedKFold(n_splits=nFolds, random_state=seed, shuffle=True)
        # Accuracy scores coming out for each fold in the test (a list of k-fold values at each iteration)
        cv_results = model_selection.cross_validate(model,X_data,y_data, cv = kfold, scoring = scoring)
        # Saving the names of the models or classifiers to be ordered in a dataset
        Mean_Acc.append(round(cv_results['test_Acc'].mean(),2))
        Std_Acc.append(round(cv_results['test_Acc'].std(),3))
        Mean_Se.append(round(cv_results['test_Se'].mean(),2))
        Std_Se.append(round(cv_results['test_Se'].std(),2))
        Mean_Sp.append(round(cv_results['test_Sp'].mean(),2))
        Std_Sp.append(round(cv_results['test_Sp'].std(),2))
        Mean_MCC.append(round(cv_results['test_MCC'].mean(),2))
        Std_MCC.append(round(cv_results['test_MCC'].std(),3))
        nameClassifier.append(name)
    # Variables to return (output metrics)    
    return Mean_Acc, Std_Acc, Mean_Se, Std_Se, Mean_Sp, Std_Sp, Mean_MCC, Std_MCC, nameClassifier
    

#  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
# Convert the list of accuracy scores to a data frame for the seaborn boxplotting (python)
#  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
def Results2DataFrame (AccScores,nameClassifier,FeaturesLabel):
    # Reshape all the Accuracy Scores in a column
    ColRes = np.reshape(AccScores,(len(AccScores)*nFolds,1))
    # Saving the values 
    data = (ColRes)
    # Creating the dataFrame
    dFrame = pd.DataFrame()
    # Replicating the classifier names for each fold 
    namesClassifier = [nameClassifier for j in range(nFolds)]
    dFrame.loc[:,0] = np.reshape(namesClassifier,len(ColRes),1)
    # Replicating the name of the features set 
    dFrame.loc[:,1] = [FeaturesLabel]*len(ColRes)
    # Column containing all the data
    dFrame.loc[:,2] = data
    # Names of the columns 
    dFrame.columns = ['Model','Features','Value']
    return dFrame



## Reading the csv files from paths

In [3]:
allFeaturesPath = r'/Users/roilhi/Documents/AllCSVPCGFeatures/'
MP = path.join(allFeaturesPath,'MP_PCGFeatures_PerCycle.csv')
MP2 = path.join(allFeaturesPath,'MP_LPCFeatures2.csv')

## Creating the datasets from the files

In [4]:
dataset_A = pd.read_csv(MP)
dataset_B = pd.read_csv(MP2)
# Separating data from labels (target) values
XA, yA = dataset_A.iloc[:,:-1],dataset_A.iloc[:,-1]
XB, yB = dataset_B.iloc[:,:-1],dataset_B.iloc[:,-1]
#XA , yA, XtsA, ytsA = MyTrainTestSplit(dataset_A, 0.2)
#XB , yB, XtsB, ytsB = MyTrainTestSplit(dataset_B, 0.2)

## Creating an array with the classifiers to be tested

In [5]:
seed = 1 
nFolds = 10
scoring = 'accuracy'
# Defining a list of classifiers to be tested 
models = []
models.append(('LR', LogisticRegression(penalty='l2', dual=False, tol=0.0001,random_state=seed)))
models.append(('LDA', LinearDiscriminantAnalysis(solver='svd')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier(criterion='gini', splitter = 'best', random_state=seed)))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(C=2.37, kernel='rbf',random_state=seed, class_weight='balanced')))
models.append(('RF',RandomForestClassifier(n_estimators = 100, min_samples_split = 2, n_jobs = -1, verbose = 1,random_state=seed)))

## Calculating the metrics without oversampling

In [6]:
AMean_Acc, AStd_Acc, AMean_Se, AStd_Se, AMean_Sp, AStd_Sp, AMean_MCC, AStd_MCC, nameClassifier = AllCVTest(XA,yA,seed,nFolds,models)
BMean_Acc, BStd_Acc, BMean_Se, BStd_Se, BMean_Sp, BStd_Sp, BMean_MCC, BStd_MCC, nameClassifier = AllCVTest(XB,yB,seed,nFolds,models)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]

## Creating a Data frame with the obtained metrics

In [7]:
dfNoSmoteMetrics = pd.DataFrame({'Dataset':['SetA']*7+['SetB']*7 ,'Classifier':nameClassifier*2,
                          'Mean_Se':AMean_Se+BMean_Se,'Std_Se':AStd_Se+BStd_Se,'Mean_Sp':AMean_Sp+BMean_Sp,
                          'Std_Sp': AStd_Sp+BStd_Sp,'Mean_Acc':AMean_Acc+BMean_Acc,'Std_Acc':AStd_Acc+BStd_Acc,
                          'Mean_MCC':AMean_MCC+BMean_MCC,'Std_MCC':AStd_MCC+BStd_MCC })

In [8]:
dfNoSmoteMetrics

Unnamed: 0,Classifier,Dataset,Mean_Acc,Mean_MCC,Mean_Se,Mean_Sp,Std_Acc,Std_MCC,Std_Se,Std_Sp
0,LR,SetA,0.81,0.33,59.88,83.5,0.017,0.067,8.31,1.03
1,LDA,SetA,0.81,0.31,58.32,83.36,0.013,0.054,6.23,0.87
2,KNN,SetA,0.78,0.35,48.26,86.36,0.022,0.058,4.86,1.21
3,CART,SetA,0.79,0.38,50.19,87.3,0.029,0.082,6.33,1.91
4,NB,SetA,0.79,0.18,49.56,80.84,0.015,0.061,10.41,0.68
5,SVM,SetA,0.76,0.45,46.41,92.29,0.015,0.026,1.99,0.76
6,RF,SetA,0.86,0.55,75.8,88.03,0.016,0.059,5.32,1.32
7,LR,SetB,0.79,0.17,54.31,80.43,0.013,0.082,12.73,0.88
8,LDA,SetB,0.79,0.17,48.11,80.65,0.013,0.07,10.78,0.76
9,KNN,SetB,0.82,0.36,60.63,84.54,0.008,0.039,3.82,0.85


## Performing the oversampling

In [9]:
# Create the oversampling SMOTE object, calling the 'minority' option to oversample the minority class 
# to equalize the number of elements
sm = SMOTE(random_state = seed, ratio = 'minority')
# Oversampling the dataset A
xA_res, yA_res = sm.fit_sample(XA, yA)
# Oversampling the dataset B
xB_res, yB_res = sm.fit_sample(XB, yB)

## Changing the SVM parameters for the balanced data

In [10]:
models[5] = (('SVM', SVC(kernel='rbf', C=1 ,random_state=seed)))

## Calculating metrics for balanced data

In [11]:
AMean_Acc, AStd_Acc, AMean_Se, AStd_Se, AMean_Sp, AStd_Sp, AMean_MCC, AStd_MCC, nameClassifier = AllCVTest(xA_res,yA_res,seed,nFolds,models)
BMean_Acc, BStd_Acc, BMean_Se, BStd_Se, BMean_Sp, BStd_Sp, BMean_MCC, BStd_MCC, nameClassifier = AllCVTest(xB_res,yB_res,seed,nFolds,models)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]

In [12]:
dfSmoteMetrics = pd.DataFrame({'Dataset':['SetA']*7+['SetB']*7 ,'Classifier':nameClassifier*2,
                          'Mean_Se':AMean_Se+BMean_Se,'Std_Se':AStd_Se+BStd_Se,'Mean_Sp':AMean_Sp+BMean_Sp,
                          'Std_Sp': AStd_Sp+BStd_Sp,'Mean_Acc':AMean_Acc+BMean_Acc,'Std_Acc':AStd_Acc+BStd_Acc,
                          'Mean_MCC':AMean_MCC+BMean_MCC,'Std_MCC':AStd_MCC+BStd_MCC })

In [13]:
dfSmoteMetrics

Unnamed: 0,Classifier,Dataset,Mean_Acc,Mean_MCC,Mean_Se,Mean_Sp,Std_Acc,Std_MCC,Std_Se,Std_Sp
0,LR,SetA,0.8,0.61,78.6,82.38,0.018,0.036,1.6,2.5
1,LDA,SetA,0.79,0.59,76.95,82.37,0.014,0.028,1.27,2.14
2,KNN,SetA,0.79,0.62,70.74,97.1,0.015,0.029,1.39,1.63
3,CART,SetA,0.84,0.68,82.61,84.98,0.016,0.032,1.6,1.85
4,NB,SetA,0.56,0.21,84.54,53.29,0.011,0.028,4.58,0.63
5,SVM,SetA,0.78,0.56,77.2,78.77,0.013,0.026,1.09,1.99
6,RF,SetA,0.92,0.84,91.6,92.1,0.014,0.027,1.77,1.7
7,LR,SetB,0.69,0.38,69.9,67.97,0.018,0.036,1.79,1.94
8,LDA,SetB,0.71,0.41,70.55,70.89,0.019,0.038,1.52,2.44
9,KNN,SetB,0.86,0.73,79.24,95.87,0.012,0.021,1.56,0.79
