# Análise de diabetes com algoritmo KNN

## Instalando bibliotecas necessárias

In [None]:
pip install scikit-learn

## Importando bibliotecas necessárias

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

## Lendo base de dados

In [3]:
# Load the diabetes records; the relative path is resolved against the
# current working directory.
dataset = pd.read_csv(filepath_or_buffer="diabetes.csv")

## Mostrando primeiros 10 registros

In [4]:
# Preview the first ten rows of the table as loaded.
dataset.iloc[:10]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


## Normalizando dados (min-max, para o intervalo [0, 1])

In [5]:
def _min_max_scale(column):
    """Rescale one numeric column linearly to the [0, 1] range.

    Parameters
    ----------
    column : pandas.Series
        A single column, as handed over by ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        ``(column - min) / (max - min)`` for numeric columns. Non-numeric and
        boolean columns are returned untouched. A constant numeric column is
        returned as all zeros instead of all NaN.
    """
    numeric = pd.api.types.is_numeric_dtype(column)
    boolean = pd.api.types.is_bool_dtype(column)
    if not numeric or boolean:
        return column

    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # Every value equals the minimum: dividing would give 0/0 = NaN for the
        # whole column, and NaN features make the classifier fit fail later.
        return (column - lowest).astype("float64")
    return (column - lowest) / value_range


# Every numeric column is rescaled, including the 0/1 `Outcome` label, which
# simply becomes 0.0/1.0.
# NOTE(review): min and max are taken over the full data set, before the
# train/test split further down, so the test rows influence the scaling.
# Consider computing them on the training rows only.
dataset = dataset.apply(_min_max_scale)

## Mostrando primeiros 10 registros (normalizados)

In [6]:
# Preview the first ten rows again, now holding the rescaled values.
dataset.iloc[:10]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0
5,0.294118,0.582915,0.606557,0.0,0.0,0.38152,0.052519,0.15,0.0
6,0.176471,0.39196,0.409836,0.323232,0.104019,0.461997,0.072588,0.083333,1.0
7,0.588235,0.577889,0.0,0.0,0.0,0.52608,0.023911,0.133333,0.0
8,0.117647,0.98995,0.57377,0.454545,0.641844,0.454545,0.034159,0.533333,1.0
9,0.470588,0.628141,0.786885,0.0,0.0,0.0,0.065756,0.55,1.0


## Mostrando informações sobre a base de dados

In [7]:
# info() prints the column dtypes and non-null counts; the describe() table
# is the value displayed by the cell. The percentiles listed are the pandas
# defaults, written out explicitly.
dataset.info()
dataset.describe(percentiles=[0.25, 0.5, 0.75])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    float64
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    float64
dtypes: float64(9)
memory usage: 54.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.22618,0.60751,0.566438,0.207439,0.094326,0.47679,0.168179,0.204015,0.348958
std,0.19821,0.160666,0.158654,0.161134,0.136222,0.117499,0.141473,0.196004,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.058824,0.497487,0.508197,0.0,0.0,0.406855,0.070773,0.05,0.0
50%,0.176471,0.58794,0.590164,0.232323,0.036052,0.4769,0.125747,0.133333,0.0
75%,0.352941,0.704774,0.655738,0.323232,0.150414,0.545455,0.234095,0.333333,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Separando a base entre X (atributos descritivos) e y (atributo classe) e dividindo em treino e teste

In [8]:
# Target vector: the `Outcome` column (lowercase name, by the usual
# convention for vectors).
y = dataset["Outcome"]
# Feature matrix: every column except the target (uppercase name, by the
# usual convention for matrices).
X = dataset.drop("Outcome", axis=1)

print("\n----------------- Atributos descritivos -----------------\n")
print(X)

print("\n----------------- Atributo classe -----------------\n")
print(y)

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)


----------------- Atributos descritivos -----------------

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0       0.352941  0.743719       0.590164       0.353535  0.000000  0.500745   
1       0.058824  0.427136       0.540984       0.292929  0.000000  0.396423   
2       0.470588  0.919598       0.524590       0.000000  0.000000  0.347243   
3       0.058824  0.447236       0.540984       0.232323  0.111111  0.418778   
4       0.000000  0.688442       0.327869       0.353535  0.198582  0.642325   
..           ...       ...            ...            ...       ...       ...   
763     0.588235  0.507538       0.622951       0.484848  0.212766  0.490313   
764     0.117647  0.613065       0.573770       0.272727  0.000000  0.548435   
765     0.294118  0.608040       0.590164       0.232323  0.132388  0.390462   
766     0.058824  0.633166       0.491803       0.000000  0.000000  0.448584   
767     0.058824  0.467337       0.573770       0.313131  0.

## Mostrando as dimensões dos conjuntos de treino e de teste

In [9]:
# Print the shape of each of the four split parts on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(514, 8) (254, 8) (514,) (254,)


## Instanciando algoritmo KNN

In [10]:
# k-nearest-neighbours classifier that votes among the 5 closest training
# samples; "uniform" (equal weight per neighbour) is the library default,
# spelled out here.
model = KNeighborsClassifier(
    n_neighbors=5,
    weights="uniform",
)

## Fazendo treinamento do modelo

In [11]:
# Fit the classifier on the training part of the split only.
model.fit(X=X_train, y=y_train)

## Fazendo previsão e calculando a acurácia

In [12]:
# Predict the class of every held-out row ("y hat" is the usual notation for
# a model's predictions).
yhat = model.predict(X=X_test)
# Accuracy: the fraction of test rows whose prediction matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=yhat)

## Resultados

In [13]:
# Report the accuracy measured on the test rows.
print(f"\nAcurácia: {acc}\n")


Acurácia: 0.7834645669291339

