Implementando KNN no dataset WINE


1. Importando as bibliotecas e visualizando o dataset

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import datasets


In [None]:
# Load the Wine dataset with pandas-backed containers (as_frame=True).
wine = datasets.load_wine(as_frame=True)

In [None]:
# Display the feature table as the cell output.
wine.data

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


2. Definindo os valores de X e y e, em seguida, dividindo os dados em conjuntos de treino e teste

In [None]:
# Pull the feature matrix and the class labels out as plain NumPy arrays.
X = wine.data.to_numpy()
y = wine.target.to_numpy()

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

3. Padronização (normalização z-score) dos dados com o StandardScaler


In [None]:
# Create the StandardScaler used to scale the features.
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split and transform it in one step, then
# apply that same fitted scaler to the test split (transform only, no refit).
# Both names are rebound to their scaled versions.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

4. Criação da classe KNN

In [None]:
class KNN():
  """K-nearest-neighbours classifier using a brute-force distance search.

  Parameters
  ----------
  k : int, default 5
      Number of nearest training samples that vote on each prediction.
  distance : str, default 'euclidian'
      Metric name. 'euclidian' (historical spelling), 'euclidean' and 'l2'
      select the L2 distance; 'manhattan', 'cityblock' and 'l1' select the
      L1 distance.

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1 or ``distance`` is not a known metric name.
  """

  # Accepted spellings/aliases mapped to the metric they select. Unknown names
  # are rejected instead of silently falling back to the Manhattan distance.
  _METRICS = {
    'euclidian': 'euclidean',  # misspelled default, kept for compatibility
    'euclidean': 'euclidean',
    'l2': 'euclidean',
    'manhattan': 'manhattan',
    'cityblock': 'manhattan',
    'l1': 'manhattan',
  }

  def __init__(self, k=5, distance = 'euclidian'):
    if k < 1:
      raise ValueError(f"k must be at least 1, got {k!r}")
    if distance not in self._METRICS:
      raise ValueError(
        f"unknown distance {distance!r}; expected one of {sorted(self._METRICS)}"
      )
    self.k = k
    # Training data; stays None until fit() is called.
    self.X = None
    self.y = None
    self.dist = distance

  def _require_fitted(self):
    """Raise RuntimeError when no training data has been stored yet."""
    if self.X is None or self.y is None:
      raise RuntimeError("this KNN instance is not fitted yet; call fit(X, y) first")

  def distance (self, X_pred):
    """Return the distance from one sample to every stored training sample.

    Parameters
    ----------
    X_pred : array-like of shape (n_features,)
        A single sample.

    Returns
    -------
    numpy.ndarray of shape (n_train,)
        Distances under the configured metric.

    Raises
    ------
    RuntimeError
        If fit() has not been called.
    ValueError
        If ``self.dist`` was changed to an unknown metric name.
    """
    self._require_fitted()
    metric = self._METRICS.get(self.dist)
    if metric is None:
      raise ValueError(
        f"unknown distance {self.dist!r}; expected one of {sorted(self._METRICS)}"
      )
    # Work in float so unsigned/integer inputs cannot wrap around or overflow.
    diff = np.asarray(X_pred, dtype=float) - self.X
    if metric == 'euclidean':
      return np.sqrt((diff**2).sum(axis = 1))
    return np.abs(diff).sum(axis = 1)

  def fit (self, X, y):
    """Store the training data (lazy learner: no model is built).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features; converted to a float array.
    y : array-like of shape (n_samples,)
        Class labels of any dtype ``numpy.unique`` can sort (ints, strings, ...).

    Raises
    ------
    ValueError
        If X is not 2-D, is empty, or y does not hold one label per row of X.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    if X.ndim != 2 or X.shape[0] == 0:
      raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
      raise ValueError("y must be 1-D with exactly one label per row of X")
    self.X = X
    self.y = y

  def predict (self, X_pred):
    """Predict a label for each sample by majority vote of its k nearest neighbours.

    Parameters
    ----------
    X_pred : array-like of shape (n_queries, n_features) or (n_features,)
        Samples to classify; a 1-D input is treated as a single sample.

    Returns
    -------
    numpy.ndarray of shape (n_queries,)
        Predicted labels, in the dtype of the training labels. Vote ties go to
        the smallest label; if k exceeds the number of training samples, all
        of them vote.

    Raises
    ------
    RuntimeError
        If fit() has not been called.
    ValueError
        If ``self.k`` is smaller than 1 or the feature count does not match
        the training data.
    """
    self._require_fitted()
    if self.k < 1:
      raise ValueError(f"k must be at least 1, got {self.k!r}")

    # Encode labels as 0..n_classes-1 so the vote count also works for
    # negative or non-integer labels (np.bincount needs non-negative ints).
    classes, codes = np.unique(np.asarray(self.y), return_inverse=True)
    codes = codes.reshape(-1)

    X_pred = np.asarray(X_pred, dtype=float)
    if X_pred.size == 0:
      return classes[:0]
    if X_pred.ndim == 1:
      X_pred = X_pred.reshape(1, -1)
    if X_pred.ndim != 2 or X_pred.shape[1] != np.shape(self.X)[1]:
      raise ValueError(
        f"X_pred must have {np.shape(self.X)[1]} features per sample, "
        f"got shape {X_pred.shape}"
      )

    n_neighbors = min(self.k, len(codes))
    winners = np.empty(X_pred.shape[0], dtype=int)
    for i, sample in enumerate(X_pred):
      # Stable sort keeps equal-distance neighbours in training order.
      nearest = np.argsort(self.distance(sample), kind='stable')[:n_neighbors]
      votes = np.bincount(codes[nearest], minlength=len(classes))
      winners[i] = np.argmax(votes)

    return classes[winners]


In [None]:
# Instantiate the custom classifier with its default arguments.
knn = KNN()

In [None]:
# Fit on the scaled training split only. The earlier fit on the full,
# unscaled (X, y) was removed: it included the test samples and was
# immediately overwritten by this call.
knn.fit(X_train, y_train)

5. Determinando a acurácia

In [None]:
# Classify the held-out samples and report the fraction of predictions that
# match the true labels.
resultados = knn.predict(X_test)
acuracia = np.mean(resultados == y_test)
print(acuracia)

0.9722222222222222


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# NOTE(review): this rebinds `knn`, replacing the custom KNN instance. The
# comparison loop in the next cell builds its own `knn_sk` models, so this
# object looks unused in the visible notebook — confirm and consider removing.
knn = KNeighborsClassifier(5)

In [None]:
# Compare against scikit-learn's KNeighborsClassifier for several neighbour
# counts, collecting each test-set accuracy in `accuracy`.
accuracy = []

for i in (1, 3, 5, 7):
  knn_sk = KNeighborsClassifier(n_neighbors=i)
  knn_sk.fit(X_train, y_train)
  resultado = knn_sk.predict(X_test)
  acuracia = np.mean(resultado == y_test)
  accuracy.append(acuracia)
  print(acuracia)

0.9722222222222222
0.9722222222222222
0.9722222222222222
0.9722222222222222
