In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For Classification Report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix, cohen_kappa_score

# For Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# For K-Fold
#from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import KFold

In [2]:
# Generate binary classification analysis
def classification_metrics_binary(y_true, y_pred, model, fold):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    perc_tn, perc_fp, perc_fn, perc_tp = list(map(lambda x:x/len(y_true)*100, [tn, fp, fn, tp]))
    sensitivity = tp/(tp+fn)
    false_positive_rate = fp/(tn+fp)
    precision = tp/(tp+fp)
    specificity = tn/(tn+fp)
    accuracy = (tp+tn)/(tp+fp+tn+fn)
    f1_score = 2*precision*sensitivity/(sensitivity+precision)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    auc_value = auc(fpr, tpr)
    kappa = cohen_kappa_score(y_true, y_pred)
    
    return {"model":model, "fold": fold,
            "tn":tn, "fp":fp, "fn":fn, "tp":tp, "perc_tn":perc_tn, "perc_fp":perc_fp, 
            "perc_fn":perc_fn, "perc_tp":perc_tp, "sensitivity":sensitivity, 
            "tpr":sensitivity, "recall":sensitivity, "fpr":false_positive_rate, 
            "precision":precision, "ppv":precision, "specificity":specificity, 
            "tnr":specificity,"f1_score":f1_score, "auc":auc_value, "kappa":kappa, 
            "accuracy":accuracy}

In [3]:
# Plot Confusion Matrix
def plot_confusion(y_true, y_pred):
    # Generate the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    class_name = ['Good', 'Bad']
    cm = pd.DataFrame(cm, index=class_name, columns=class_name)

    # Plot the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')

    # Salva a figura como .eps
    #plt.savefig('confusion_matrix_FAxNORMAL_preprocessed.eps', format='eps')

    # Salva a figura como .png com 300 DPI
    #plt.savefig('confusion_matrix_FAxNORMAL_preprocessed.png', format='png', dpi=300)

    plt.show()

In [4]:
RANDOM_SEED = 32

# Método random forest estoura a memória. Utilizar o HPC para rodar.
methods = { 'xgboost': xgboost.XGBClassifier(seed=RANDOM_SEED),
            'logistic_regression': LogisticRegression(random_state=RANDOM_SEED),
            'random_forest': RandomForestClassifier(random_state=RANDOM_SEED),
            'gradient_boosting': GradientBoostingClassifier(random_state=RANDOM_SEED),
            'svm': SVC(random_state=RANDOM_SEED),
            'knn': KNeighborsClassifier(),
            'decision_tree': DecisionTreeClassifier(random_state=RANDOM_SEED),
            'naive_bayes': GaussianNB()
           }

In [5]:
dir = 'C:/Users/estel/Documents/Python_Codes/'
file = 'dataframe_cinc2011.csv'
df = pd.read_csv(dir + file)

In [6]:
df

Unnamed: 0.1,Unnamed: 0,iSQI,bSQI,fSQI,sSQI,kSQI,pSQI,zero_crossings_rate_sqi,mean_crossing_rate_sqi,find_max_lenght_repeated_true,flat_line_sqi,saturation_sqi,baseline_sqi,amplitude_sqi,rr_variability_sqi_mean,rr_variability_sqi_std,rr_variability_sqi_cv,power_sqi,bsqi,Labels
0,0,0.00,0.0,0.000000,0.000000,0.000000,100.00,0.000000,0.000000,1,9.998,10.000,80.013002,0.013002,4.726000,4.278000,0.905205,0.755742,0.000000,1.0
1,1,0.20,100.0,0.544039,-0.466121,19.392805,64.56,0.078016,0.039408,1,0.016,0.048,1.939037,1.867886,0.971778,0.005202,0.005353,0.923708,0.690462,0.0
2,2,0.16,100.0,0.005520,-2.174674,9.449453,49.30,0.030206,0.013203,1,0.012,0.360,2.977677,1.656793,1.005111,0.277088,0.275679,0.912231,0.033376,1.0
3,3,0.30,100.0,0.278857,-0.668478,2.535006,45.08,0.047009,0.042408,1,0.012,0.000,0.243427,1.078492,0.497889,0.083689,0.168087,0.896940,0.804223,1.0
4,4,0.20,100.0,0.619605,4.419841,25.069053,47.04,0.106821,0.067213,1,0.016,0.000,0.046265,0.863515,0.955333,0.008944,0.009362,0.839863,0.963011,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,993,0.04,100.0,0.237600,4.682297,20.321459,78.22,0.000800,0.000800,1,2.618,10.000,96.460091,101.184824,2.161000,1.825000,0.844516,0.795078,0.010157,1.0
994,994,0.26,100.0,0.486928,-1.396489,5.821627,56.62,0.096819,0.059812,1,0.022,0.000,0.100702,0.710963,0.439300,0.069436,0.158060,0.913264,0.817194,0.0
995,995,0.24,100.0,0.599578,3.107946,14.621376,69.52,0.043409,0.031606,1,0.030,0.000,0.023694,0.698364,0.815273,0.033491,0.041080,0.906902,0.887943,1.0
996,996,0.24,100.0,0.624298,3.345171,16.979691,59.98,0.166833,0.126025,1,0.020,0.000,0.054890,0.520235,0.805273,0.076226,0.094658,0.926429,0.884870,1.0


In [7]:
X = df.drop(columns=['Unnamed: 0', 'Labels'])
y = df['Labels']

In [8]:
X

Unnamed: 0,iSQI,bSQI,fSQI,sSQI,kSQI,pSQI,zero_crossings_rate_sqi,mean_crossing_rate_sqi,find_max_lenght_repeated_true,flat_line_sqi,saturation_sqi,baseline_sqi,amplitude_sqi,rr_variability_sqi_mean,rr_variability_sqi_std,rr_variability_sqi_cv,power_sqi,bsqi
0,0.00,0.0,0.000000,0.000000,0.000000,100.00,0.000000,0.000000,1,9.998,10.000,80.013002,0.013002,4.726000,4.278000,0.905205,0.755742,0.000000
1,0.20,100.0,0.544039,-0.466121,19.392805,64.56,0.078016,0.039408,1,0.016,0.048,1.939037,1.867886,0.971778,0.005202,0.005353,0.923708,0.690462
2,0.16,100.0,0.005520,-2.174674,9.449453,49.30,0.030206,0.013203,1,0.012,0.360,2.977677,1.656793,1.005111,0.277088,0.275679,0.912231,0.033376
3,0.30,100.0,0.278857,-0.668478,2.535006,45.08,0.047009,0.042408,1,0.012,0.000,0.243427,1.078492,0.497889,0.083689,0.168087,0.896940,0.804223
4,0.20,100.0,0.619605,4.419841,25.069053,47.04,0.106821,0.067213,1,0.016,0.000,0.046265,0.863515,0.955333,0.008944,0.009362,0.839863,0.963011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0.04,100.0,0.237600,4.682297,20.321459,78.22,0.000800,0.000800,1,2.618,10.000,96.460091,101.184824,2.161000,1.825000,0.844516,0.795078,0.010157
994,0.26,100.0,0.486928,-1.396489,5.821627,56.62,0.096819,0.059812,1,0.022,0.000,0.100702,0.710963,0.439300,0.069436,0.158060,0.913264,0.817194
995,0.24,100.0,0.599578,3.107946,14.621376,69.52,0.043409,0.031606,1,0.030,0.000,0.023694,0.698364,0.815273,0.033491,0.041080,0.906902,0.887943
996,0.24,100.0,0.624298,3.345171,16.979691,59.98,0.166833,0.126025,1,0.020,0.000,0.054890,0.520235,0.805273,0.076226,0.094658,0.926429,0.884870


In [9]:
y

0      1.0
1      0.0
2      1.0
3      1.0
4      0.0
      ... 
993    1.0
994    0.0
995    1.0
996    1.0
997    0.0
Name: Labels, Length: 998, dtype: float64

In [12]:
# Classificando K-FOLD
resultados = []

for method in methods.keys():
    m = methods[method]
    
    # Número de folds desejado
    num_folds = 5
    
    # Inicialize o objeto StratifiedKFold
    stratkf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    
    # Loop sobre os k-folds
    for idx, (train_idx, test_idx) in enumerate(stratkf.split(X, y)):
        
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        print(method, X_train.shape, y_train.shape, X_test.shape, y_test.shape)
        model = m.fit(X_train, y_train)
        
        # Faz predições no conjunto de teste
        y_pred = model.predict(X_test)
        
        # Avalia o desempenho do modelo e armazena os resultados
        results_fold = classification_metrics_binary(y_test, y_pred, method, idx)
        resultados.append(results_fold)
    

xgboost (798, 18) (798,) (200, 18) (200,)
xgboost (798, 18) (798,) (200, 18) (200,)
xgboost (798, 18) (798,) (200, 18) (200,)
xgboost (799, 18) (799,) (199, 18) (199,)
xgboost (799, 18) (799,) (199, 18) (199,)
logistic_regression (798, 18) (798,) (200, 18) (200,)
logistic_regression (798, 18) (798,) (200, 18) (200,)
logistic_regression (798, 18) (798,) (200, 18) (200,)
logistic_regression (799, 18) (799,) (199, 18) (199,)
logistic_regression (799, 18) (799,) (199, 18) (199,)
random_forest (798, 18) (798,) (200, 18) (200,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  f1_score = 2*precision*sensitivity/(sensitivity+precision)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  f1_score = 2*precision*sensitivity/(sensitivity+precision)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    ht

random_forest (798, 18) (798,) (200, 18) (200,)
random_forest (798, 18) (798,) (200, 18) (200,)
random_forest (799, 18) (799,) (199, 18) (199,)


  f1_score = 2*precision*sensitivity/(sensitivity+precision)


random_forest (799, 18) (799,) (199, 18) (199,)
gradient_boosting (798, 18) (798,) (200, 18) (200,)
gradient_boosting (798, 18) (798,) (200, 18) (200,)
gradient_boosting (798, 18) (798,) (200, 18) (200,)
gradient_boosting (799, 18) (799,) (199, 18) (199,)
gradient_boosting (799, 18) (799,) (199, 18) (199,)
svm (798, 18) (798,) (200, 18) (200,)
svm (798, 18) (798,) (200, 18) (200,)
svm (798, 18) (798,) (200, 18) (200,)
svm (799, 18) (799,) (199, 18) (199,)
svm (799, 18) (799,) (199, 18) (199,)
knn (798, 18) (798,) (200, 18) (200,)


  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)


knn (798, 18) (798,) (200, 18) (200,)
knn (798, 18) (798,) (200, 18) (200,)
knn (799, 18) (799,) (199, 18) (199,)
knn (799, 18) (799,) (199, 18) (199,)
decision_tree (798, 18) (798,) (200, 18) (200,)
decision_tree (798, 18) (798,) (200, 18) (200,)
decision_tree (798, 18) (798,) (200, 18) (200,)
decision_tree (799, 18) (799,) (199, 18) (199,)
decision_tree (799, 18) (799,) (199, 18) (199,)
naive_bayes (798, 18) (798,) (200, 18) (200,)
naive_bayes (798, 18) (798,) (200, 18) (200,)
naive_bayes (798, 18) (798,) (200, 18) (200,)
naive_bayes (799, 18) (799,) (199, 18) (199,)
naive_bayes (799, 18) (799,) (199, 18) (199,)


In [13]:
# Imprime os resultados
for i, resultado in enumerate(resultados):
    print(resultados[i], '\n')

{'model': 'xgboost', 'fold': 0, 'tn': 136, 'fp': 19, 'fn': 42, 'tp': 3, 'perc_tn': 68.0, 'perc_fp': 9.5, 'perc_fn': 21.0, 'perc_tp': 1.5, 'sensitivity': 0.06666666666666667, 'tpr': 0.06666666666666667, 'recall': 0.06666666666666667, 'fpr': 0.12258064516129032, 'precision': 0.13636363636363635, 'ppv': 0.13636363636363635, 'specificity': 0.8774193548387097, 'tnr': 0.8774193548387097, 'f1_score': 0.08955223880597016, 'auc': 0.4720430107526882, 'kappa': -0.06830122591943977, 'accuracy': 0.695} 

{'model': 'xgboost', 'fold': 1, 'tn': 149, 'fp': 6, 'fn': 40, 'tp': 5, 'perc_tn': 74.5, 'perc_fp': 3.0, 'perc_fn': 20.0, 'perc_tp': 2.5, 'sensitivity': 0.1111111111111111, 'tpr': 0.1111111111111111, 'recall': 0.1111111111111111, 'fpr': 0.03870967741935484, 'precision': 0.45454545454545453, 'ppv': 0.45454545454545453, 'specificity': 0.9612903225806452, 'tnr': 0.9612903225806452, 'f1_score': 0.17857142857142855, 'auc': 0.5362007168458782, 'kappa': 0.09892262487757097, 'accuracy': 0.77} 

{'model': 'x

In [14]:
df_resultados = pd.DataFrame(resultados)
df_resultados

Unnamed: 0,model,fold,tn,fp,fn,tp,perc_tn,perc_fp,perc_fn,perc_tp,...,recall,fpr,precision,ppv,specificity,tnr,f1_score,auc,kappa,accuracy
0,xgboost,0,136,19,42,3,68.0,9.5,21.0,1.5,...,0.066667,0.122581,0.136364,0.136364,0.877419,0.877419,0.089552,0.472043,-0.068301,0.695
1,xgboost,1,149,6,40,5,74.5,3.0,20.0,2.5,...,0.111111,0.03871,0.454545,0.454545,0.96129,0.96129,0.178571,0.536201,0.098923,0.77
2,xgboost,2,139,16,41,4,69.5,8.0,20.5,2.0,...,0.088889,0.103226,0.2,0.2,0.896774,0.896774,0.123077,0.492832,-0.017857,0.715
3,xgboost,3,139,15,39,6,69.849246,7.537688,19.59799,3.015075,...,0.133333,0.097403,0.285714,0.285714,0.902597,0.902597,0.181818,0.517965,0.04429,0.728643
4,xgboost,4,145,9,42,3,72.864322,4.522613,21.105528,1.507538,...,0.066667,0.058442,0.25,0.25,0.941558,0.941558,0.105263,0.504113,0.011108,0.743719
5,logistic_regression,0,154,1,45,0,77.0,0.5,22.5,0.0,...,0.0,0.006452,0.0,0.0,0.993548,0.993548,,0.496774,-0.009879,0.77
6,logistic_regression,1,154,1,45,0,77.0,0.5,22.5,0.0,...,0.0,0.006452,0.0,0.0,0.993548,0.993548,,0.496774,-0.009879,0.77
7,logistic_regression,2,155,0,45,0,77.5,0.0,22.5,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.775
8,logistic_regression,3,154,0,45,0,77.386935,0.0,22.613065,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.773869
9,logistic_regression,4,152,2,45,0,76.38191,1.005025,22.613065,0.0,...,0.0,0.012987,0.0,0.0,0.987013,0.987013,,0.493506,-0.019623,0.763819


In [16]:
df.keys()

Index(['Unnamed: 0', 'iSQI', 'bSQI', 'fSQI', 'sSQI', 'kSQI', 'pSQI',
       'zero_crossings_rate_sqi', 'mean_crossing_rate_sqi',
       'find_max_lenght_repeated_true', 'flat_line_sqi', 'saturation_sqi',
       'baseline_sqi', 'amplitude_sqi', 'rr_variability_sqi_mean',
       'rr_variability_sqi_std', 'rr_variability_sqi_cv', 'power_sqi', 'bsqi',
       'Labels'],
      dtype='object')

In [23]:
selected_features = ['iSQI', 'fSQI', 'sSQI', 'kSQI', 'pSQI']

In [24]:
X_1 = df[selected_features]

In [25]:
X_1

Unnamed: 0,iSQI,fSQI,sSQI,kSQI,pSQI
0,0.00,0.000000,0.000000,0.000000,100.00
1,0.20,0.544039,-0.466121,19.392805,64.56
2,0.16,0.005520,-2.174674,9.449453,49.30
3,0.30,0.278857,-0.668478,2.535006,45.08
4,0.20,0.619605,4.419841,25.069053,47.04
...,...,...,...,...,...
993,0.04,0.237600,4.682297,20.321459,78.22
994,0.26,0.486928,-1.396489,5.821627,56.62
995,0.24,0.599578,3.107946,14.621376,69.52
996,0.24,0.624298,3.345171,16.979691,59.98


In [28]:
# Classificando K-FOLD
resultados_1 = []

for method in methods.keys():
    m = methods[method]
    
    # Número de folds desejado
    num_folds = 5
    
    # Inicialize o objeto StratifiedKFold
    stratkf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    
    # Loop sobre os k-folds
    for idx, (train_idx, test_idx) in enumerate(stratkf.split(X_1, y)):
        
        X_train, X_test = X_1.iloc[train_idx], X_1.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        print(method, X_train.shape, y_train.shape, X_test.shape, y_test.shape)
        model = m.fit(X_train, y_train)
        
        # Faz predições no conjunto de teste
        y_pred = model.predict(X_test)
        
        # Avalia o desempenho do modelo e armazena os resultados
        results_fold = classification_metrics_binary(y_test, y_pred, method, idx)
        resultados_1.append(results_fold)
    

xgboost (798, 5) (798,) (200, 5) (200,)
xgboost (798, 5) (798,) (200, 5) (200,)
xgboost (798, 5) (798,) (200, 5) (200,)
xgboost (799, 5) (799,) (199, 5) (199,)
xgboost (799, 5) (799,) (199, 5) (199,)
logistic_regression (798, 5) (798,) (200, 5) (200,)
logistic_regression (798, 5) (798,) (200, 5) (200,)
logistic_regression (798, 5) (798,) (200, 5) (200,)
logistic_regression (799, 5) (799,) (199, 5) (199,)


  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)


logistic_regression (799, 5) (799,) (199, 5) (199,)
random_forest (798, 5) (798,) (200, 5) (200,)
random_forest (798, 5) (798,) (200, 5) (200,)
random_forest (798, 5) (798,) (200, 5) (200,)
random_forest (799, 5) (799,) (199, 5) (199,)
random_forest (799, 5) (799,) (199, 5) (199,)


  f1_score = 2*precision*sensitivity/(sensitivity+precision)


gradient_boosting (798, 5) (798,) (200, 5) (200,)
gradient_boosting (798, 5) (798,) (200, 5) (200,)
gradient_boosting (798, 5) (798,) (200, 5) (200,)
gradient_boosting (799, 5) (799,) (199, 5) (199,)
gradient_boosting (799, 5) (799,) (199, 5) (199,)
svm (798, 5) (798,) (200, 5) (200,)
svm (798, 5) (798,) (200, 5) (200,)
svm (798, 5) (798,) (200, 5) (200,)
svm (799, 5) (799,) (199, 5) (199,)
svm (799, 5) (799,) (199, 5) (199,)
knn (798, 5) (798,) (200, 5) (200,)
knn (798, 5) (798,) (200, 5) (200,)
knn (798, 5) (798,) (200, 5) (200,)


  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)


knn (799, 5) (799,) (199, 5) (199,)
knn (799, 5) (799,) (199, 5) (199,)
decision_tree (798, 5) (798,) (200, 5) (200,)
decision_tree (798, 5) (798,) (200, 5) (200,)
decision_tree (798, 5) (798,) (200, 5) (200,)
decision_tree (799, 5) (799,) (199, 5) (199,)
decision_tree (799, 5) (799,) (199, 5) (199,)
naive_bayes (798, 5) (798,) (200, 5) (200,)
naive_bayes (798, 5) (798,) (200, 5) (200,)
naive_bayes (798, 5) (798,) (200, 5) (200,)
naive_bayes (799, 5) (799,) (199, 5) (199,)
naive_bayes (799, 5) (799,) (199, 5) (199,)


In [29]:
df_resultados_1 = pd.DataFrame(resultados_1)
df_resultados_1

Unnamed: 0,model,fold,tn,fp,fn,tp,perc_tn,perc_fp,perc_fn,perc_tp,...,recall,fpr,precision,ppv,specificity,tnr,f1_score,auc,kappa,accuracy
0,xgboost,0,139,16,39,6,69.5,8.0,19.5,3.0,...,0.133333,0.103226,0.272727,0.272727,0.896774,0.896774,0.179104,0.515054,0.036778,0.725
1,xgboost,1,146,9,41,4,73.0,4.5,20.5,2.0,...,0.088889,0.058065,0.307692,0.307692,0.941935,0.941935,0.137931,0.515412,0.041227,0.75
2,xgboost,2,139,16,39,6,69.5,8.0,19.5,3.0,...,0.133333,0.103226,0.272727,0.272727,0.896774,0.896774,0.179104,0.515054,0.036778,0.725
3,xgboost,3,131,23,36,9,65.829146,11.557789,18.090452,4.522613,...,0.2,0.149351,0.28125,0.28125,0.850649,0.850649,0.233766,0.525325,0.056417,0.703518
4,xgboost,4,129,25,40,5,64.824121,12.562814,20.100503,2.512563,...,0.111111,0.162338,0.166667,0.166667,0.837662,0.837662,0.133333,0.474387,-0.058078,0.673367
5,logistic_regression,0,155,0,45,0,77.5,0.0,22.5,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.775
6,logistic_regression,1,155,0,45,0,77.5,0.0,22.5,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.775
7,logistic_regression,2,155,0,45,0,77.5,0.0,22.5,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.775
8,logistic_regression,3,154,0,45,0,77.386935,0.0,22.613065,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.773869
9,logistic_regression,4,154,0,45,0,77.386935,0.0,22.613065,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.773869


In [30]:
selected_features_2 = ['iSQI', 'sSQI', 'kSQI']
X_2 = df[selected_features_2]

In [31]:
X_2

Unnamed: 0,iSQI,sSQI,kSQI
0,0.00,0.000000,0.000000
1,0.20,-0.466121,19.392805
2,0.16,-2.174674,9.449453
3,0.30,-0.668478,2.535006
4,0.20,4.419841,25.069053
...,...,...,...
993,0.04,4.682297,20.321459
994,0.26,-1.396489,5.821627
995,0.24,3.107946,14.621376
996,0.24,3.345171,16.979691


In [32]:
# Classificando K-FOLD
resultados_2 = []

for method in methods.keys():
    m = methods[method]
    
    # Número de folds desejado
    num_folds = 5
    
    # Inicialize o objeto StratifiedKFold
    stratkf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    
    # Loop sobre os k-folds
    for idx, (train_idx, test_idx) in enumerate(stratkf.split(X_2, y)):
        
        X_train, X_test = X_2.iloc[train_idx], X_2.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        print(method, X_train.shape, y_train.shape, X_test.shape, y_test.shape)
        model = m.fit(X_train, y_train)
        
        # Faz predições no conjunto de teste
        y_pred = model.predict(X_test)
        
        # Avalia o desempenho do modelo e armazena os resultados
        results_fold = classification_metrics_binary(y_test, y_pred, method, idx)
        resultados_2.append(results_fold)
    

xgboost (798, 3) (798,) (200, 3) (200,)
xgboost (798, 3) (798,) (200, 3) (200,)
xgboost (798, 3) (798,) (200, 3) (200,)
xgboost (799, 3) (799,) (199, 3) (199,)
xgboost (799, 3) (799,) (199, 3) (199,)
logistic_regression (798, 3) (798,) (200, 3) (200,)
logistic_regression (798, 3) (798,) (200, 3) (200,)
logistic_regression (798, 3) (798,) (200, 3) (200,)
logistic_regression (799, 3) (799,) (199, 3) (199,)
logistic_regression (799, 3) (799,) (199, 3) (199,)
random_forest (798, 3) (798,) (200, 3) (200,)


  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)


random_forest (798, 3) (798,) (200, 3) (200,)
random_forest (798, 3) (798,) (200, 3) (200,)
random_forest (799, 3) (799,) (199, 3) (199,)
random_forest (799, 3) (799,) (199, 3) (199,)
gradient_boosting (798, 3) (798,) (200, 3) (200,)


  f1_score = 2*precision*sensitivity/(sensitivity+precision)


gradient_boosting (798, 3) (798,) (200, 3) (200,)
gradient_boosting (798, 3) (798,) (200, 3) (200,)
gradient_boosting (799, 3) (799,) (199, 3) (199,)
gradient_boosting (799, 3) (799,) (199, 3) (199,)
svm (798, 3) (798,) (200, 3) (200,)
svm (798, 3) (798,) (200, 3) (200,)
svm (798, 3) (798,) (200, 3) (200,)
svm (799, 3) (799,) (199, 3) (199,)
svm (799, 3) (799,) (199, 3) (199,)


  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)


knn (798, 3) (798,) (200, 3) (200,)
knn (798, 3) (798,) (200, 3) (200,)
knn (798, 3) (798,) (200, 3) (200,)
knn (799, 3) (799,) (199, 3) (199,)
knn (799, 3) (799,) (199, 3) (199,)
decision_tree (798, 3) (798,) (200, 3) (200,)
decision_tree (798, 3) (798,) (200, 3) (200,)
decision_tree (798, 3) (798,) (200, 3) (200,)
decision_tree (799, 3) (799,) (199, 3) (199,)
decision_tree (799, 3) (799,) (199, 3) (199,)
naive_bayes (798, 3) (798,) (200, 3) (200,)
naive_bayes (798, 3) (798,) (200, 3) (200,)
naive_bayes (798, 3) (798,) (200, 3) (200,)
naive_bayes (799, 3) (799,) (199, 3) (199,)
naive_bayes (799, 3) (799,) (199, 3) (199,)


In [33]:
df_resultados_2 = pd.DataFrame(resultados_2)
df_resultados_2

Unnamed: 0,model,fold,tn,fp,fn,tp,perc_tn,perc_fp,perc_fn,perc_tp,...,recall,fpr,precision,ppv,specificity,tnr,f1_score,auc,kappa,accuracy
0,xgboost,0,145,10,40,5,72.5,5.0,20.0,2.5,...,0.111111,0.064516,0.333333,0.333333,0.935484,0.935484,0.166667,0.523297,0.061033,0.75
1,xgboost,1,142,13,43,2,71.0,6.5,21.5,1.0,...,0.044444,0.083871,0.133333,0.133333,0.916129,0.916129,0.066667,0.480287,-0.051643,0.72
2,xgboost,2,130,25,37,8,65.0,12.5,18.5,4.0,...,0.177778,0.16129,0.242424,0.242424,0.83871,0.83871,0.205128,0.508244,0.018211,0.69
3,xgboost,3,133,21,37,8,66.834171,10.552764,18.592965,4.020101,...,0.177778,0.136364,0.275862,0.275862,0.863636,0.863636,0.216216,0.520707,0.047375,0.708543
4,xgboost,4,138,16,42,3,69.346734,8.040201,21.105528,1.507538,...,0.066667,0.103896,0.157895,0.157895,0.896104,0.896104,0.09375,0.481385,-0.046798,0.708543
5,logistic_regression,0,155,0,45,0,77.5,0.0,22.5,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.775
6,logistic_regression,1,155,0,45,0,77.5,0.0,22.5,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.775
7,logistic_regression,2,155,0,45,0,77.5,0.0,22.5,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.775
8,logistic_regression,3,154,0,45,0,77.386935,0.0,22.613065,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.773869
9,logistic_regression,4,154,0,45,0,77.386935,0.0,22.613065,0.0,...,0.0,0.0,,,1.0,1.0,,0.5,0.0,0.773869
