In [1]:
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB

In [2]:
#dados = pd.read_csv("Vertebral.csv")
dados = pd.read_csv("./datasets/diabetes_012_health_indicators_BRFSS2015.csv")
dados = shuffle(dados)
X = dados.iloc[:,1:]
Y = dados.iloc[:,0:1]

**Gerando os conjuntos de treino, teste e validação**


In [3]:
x_treino,x_temp,y_treino,y_temp=train_test_split(X,Y,test_size=0.5,stratify=Y)
x_validacao,x_teste,y_validacao,y_teste=train_test_split(x_temp,y_temp,test_size=0.5, stratify = y_temp)

print("Treino")
x_treino.info()
y_treino.info()

print("\nValidação")
x_validacao.info()
y_validacao.info()

print("\nTeste")
x_teste.info()
y_teste.info()

Treino
<class 'pandas.core.frame.DataFrame'>
Index: 126840 entries, 62460 to 112673
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HighBP                126840 non-null  float64
 1   HighChol              126840 non-null  float64
 2   CholCheck             126840 non-null  float64
 3   BMI                   126840 non-null  float64
 4   Smoker                126840 non-null  float64
 5   Stroke                126840 non-null  float64
 6   HeartDiseaseorAttack  126840 non-null  float64
 7   PhysActivity          126840 non-null  float64
 8   Fruits                126840 non-null  float64
 9   Veggies               126840 non-null  float64
 10  HvyAlcoholConsump     126840 non-null  float64
 11  AnyHealthcare         126840 non-null  float64
 12  NoDocbcCost           126840 non-null  float64
 13  GenHlth               126840 non-null  float64
 14  MentHlth              126840 non-null  float64

In [4]:
from sklearn import metrics
def plot_roc_curve(fper,tper,cor,classsificador):
    plt.plot(fper, tper, color=cor, label=classsificador)
    plt.plot([0, 1], [0, 1], color="green", linestyle='--')
    plt.xlabel('Taxa de Falsos Positivos (FPR)')
    plt.ylabel('Taxa de Verdadeiros Positivos (TPR)')
    plt.title('Curva ROC')
    plt.legend()

In [9]:
#atribuindo valores aos hiperparâmetros
#n_neighbors corresponde ao tamanho da vizinhança
#weights indica se os vizinhos terão pesos diferentes ou não. Pode assumir os valores uniform ou distante (ou callabe)
def confusionMatrix(conf_matrix):
    for i in range(conf_matrix.shape[0]):
        tp = conf_matrix[i, i]
        fp = conf_matrix[:, i].sum() - tp
        fn = conf_matrix[i, :].sum() - tp
        tn = conf_matrix.sum() - (tp + fp + fn)
        print(f"Classe {i}: TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")
    return tn, fp, fn, tp

KNN = KNeighborsClassifier(n_neighbors=13,weights="distance")
KNN.fit(x_treino,y_treino)
opiniao = KNN.predict(x_teste)

print("Desempenho KNN")
conf_matrix = confusion_matrix(y_teste, opiniao)
print("Matriz de Confusão\n", conf_matrix)


TN, FP, FN, TP = confusionMatrix(conf_matrix)
print("TP: ", TP, " FN: ",FN," FP: ",FP," TN: ",TN)
print("Acurácia: ",accuracy_score(y_teste, opiniao))
print("Sensibilidade: ",(TP/(TP+FN)))
print("Especificade: ",(TN/(FP+TN)))
print("AUC: ",roc_auc_score(y_teste,opiniao, multi_class="ovr"))
print("F-Score: ",f1_score(y_teste, opiniao))
print("Precision: ",precision_score(y_teste, opiniao))
print("Recall: ",recall_score(y_teste, opiniao))
print("\n\n")

#print(KNN.predict_proba(x_teste))

y_score = KNN.predict_proba(x_teste)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_teste,y_score,pos_label=2)
plot_roc_curve(fpr, tpr,"red","KNN")


NB = GaussianNB()
NB.fit(x_treino,y_treino)
opiniao = NB.predict(x_teste)

print("Desempenho Naive Bayes")
conf_matrix = confusion_matrix(y_teste, opiniao)
print("Matriz de Confusão\n", conf_matrix)
TN, FP, FN, TP = confusionMatrix(conf_matrix)
print("TP: ", TP, " FN: ",FN," FP: ",FP," TN: ",TN)
print("Acurácia: ",accuracy_score(y_teste, opiniao))
print("Sensibilidade: ",(TP/(TP+FN)))
print("Especificade: ",(TN/(FP+TN)))
print("AUC: ",roc_auc_score(y_teste,opiniao))
print("F-Score: ",f1_score(y_teste, opiniao))
print("Precision: ",precision_score(y_teste, opiniao))
print("Recall: ",recall_score(y_teste, opiniao))
print("\n\n")

y_score = NB.predict_proba(x_teste)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_teste,y_score,pos_label=2)
plot_roc_curve(fpr, tpr,"blue","Naïve Bayes")



DT = tree.DecisionTreeClassifier()
DT.fit(x_treino,y_treino)
opiniao = DT.predict(x_teste)

print("Desempenho Decision Tree")
conf_matrix = confusion_matrix(y_teste, opiniao)
print("Matriz de Confusão\n", conf_matrix)
TN, FP, FN, TP = confusionMatrix(conf_matrix)
print("TP: ", TP, " FN: ",FN," FP: ",FP," TN: ",TN)
print("Acurácia: ",accuracy_score(y_teste, opiniao))
print("Sensibilidade: ",(TP/(TP+FN)))
print("Especificade: ",(TN/(FP+TN)))
print("AUC: ",roc_auc_score(y_teste,opiniao))
print("F-Score: ",f1_score(y_teste, opiniao))
print("Precision: ",precision_score(y_teste, opiniao))
print("Recall: ",recall_score(y_teste, opiniao))
print("\n\n")

y_score = DT.predict_proba(x_teste)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_teste,y_score,pos_label=2)
plot_roc_curve(fpr, tpr,"black","DT")

MLP = MLPClassifier(hidden_layer_sizes=(10,10),activation="relu",max_iter=500)
MLP.fit(x_treino,y_treino)
opiniao = MLP.predict(x_teste)


print("Desempenho Multilayer Perceptron")
conf_matrix = confusion_matrix(y_teste, opiniao)
print("Matriz de Confusão\n", conf_matrix)
TN, FP, FN, TP = confusionMatrix(conf_matrix)

print("TP: ", TP, " FN: ",FN," FP: ",FP," TN: ",TN)
print("Acurácia: ",accuracy_score(y_teste, opiniao))
print("Sensibilidade: ",(TP/(TP+FN)))
print("Especificade: ",(TN/(FP+TN)))
print("AUC: ",roc_auc_score(y_teste,opiniao))
print("F-Score: ",f1_score(y_teste, opiniao))
print("Precision: ",precision_score(y_teste, opiniao))
print("Recall: ",recall_score(y_teste, opiniao))
print("\n\n")

y_score = MLP.predict_proba(x_teste)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_teste,y_score,pos_label=2)
plot_roc_curve(fpr, tpr,"green","MLP")




SVM = SVC(kernel='rbf',C=0.9,probability=True)
SVM.fit(x_treino,y_treino)
opiniao = SVM.predict(x_teste)

print("Desempenho SVM")
conf_matrix = confusion_matrix(y_teste, opiniao)
print("Matriz de Confusão\n", conf_matrix)
TN, FP, FN, TP = confusionMatrix(conf_matrix)
print("TP: ", TP, " FN: ",FN," FP: ",FP," TN: ",TN)
print("Acurácia: ",accuracy_score(y_teste, opiniao))
print("Sensibilidade: ",(TP/(TP+FN)))
print("Especificade: ",(TN/(FP+TN)))
print("AUC: ",roc_auc_score(y_teste,opiniao))
print("F-Score: ",f1_score(y_teste, opiniao))
print("Precision: ",precision_score(y_teste, opiniao))
print("Recall: ",recall_score(y_teste, opiniao))
print("\n\n")

y_score = SVM.predict_proba(x_teste)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_teste,y_score,pos_label=2)
plot_roc_curve(fpr, tpr,"yellow","SVM")



  return self._fit(X, y)


Desempenho KNN
Matriz de Confusão
 [[51730    39  1657]
 [ 1034     0   124]
 [ 7466     8  1362]]
Classe 0: TP: 51730, FP: 8500, FN: 1696, TN: 1494
Classe 1: TP: 0, FP: 47, FN: 1158, TN: 62215
Classe 2: TP: 1362, FP: 1781, FN: 7474, TN: 52803
TP:  1362  FN:  7474  FP:  1781  TN:  52803
Acurácia:  0.8371491643014822
Sensibilidade:  0.15414214576731552
Especificade:  0.9673713908837754


AxisError: axis 1 is out of bounds for array of dimension 1