### KNeighborsClassifier

In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix,roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the red-wine quality table from the working directory and preview it.
df = pd.read_csv("winequality-red.csv")
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Target is the `quality` column; every other column is used as a feature.
y = df['quality']
x = df.drop(columns=['quality'])

In [4]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=100,
)

In [5]:
# Sanity-check the split sizes: one shape per line, train first, then test.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(959, 11)
(959,)
(640, 11)
(640,)


In [6]:
# Baseline: k-NN with scikit-learn's default settings, fitted on the raw
# training features.
# NOTE(review): the columns are on very different scales (compare total sulfur
# dioxide with density in the preview) -- consider standardising the features
# before using a distance-based model.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [7]:
# Mean accuracy of the current model on the held-out test split.
knn.score(x_test, y_test)

0.5171875

In [8]:
# Mean accuracy on the training split, for comparison with the test score.
knn.score(X=x_train, y=y_train)

0.6287799791449427

In [9]:
# Hyperparameter tuning using GridSearchCV.
# The grid below has 11 * 4 * 8 * 2 * 2 = 1,408 combinations, each of which is
# cross-validated, so the search is by far the slowest cell in the notebook.
# NOTE(review): per the scikit-learn docs, `algorithm` and `leaf_size` select
# how neighbours are searched (speed/memory) rather than which model is fitted,
# so they are unlikely to change accuracy -- consider dropping them to shrink
# the grid 32x.
pram = {
    'n_neighbors': [3, 5, 7, 9, 12, 13, 15, 17, 21, 23, 25],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 15, 20, 25, 30, 35, 45, 50],
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
}

# n_jobs=-1 runs the independent fits on all available CPU cores; the selected
# parameters and scores are the same as for a serial search.
gridcv = GridSearchCV(knn, param_grid=pram, n_jobs=-1)

gridcv.fit(x_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [10, 15, 20, 25, 30, 35, 45, 50],
                         'n_neighbors': [3, 5, 7, 9, 12, 13, 15, 17, 21, 23,
                                         25],
                         'p': [1, 2], 'weights': ['uniform', 'distance']})

In [10]:
# Show the parameter combination that the grid search selected.
gridcv.best_params_

{'algorithm': 'auto',
 'leaf_size': 10,
 'n_neighbors': 21,
 'p': 2,
 'weights': 'distance'}

In [11]:
# Rebuild the classifier from the grid-search result instead of hand-copying
# the printed values: if the data, seed or grid changes, the model here always
# follows whatever `gridcv` actually selected.
knn = KNeighborsClassifier(**gridcv.best_params_)
knn.fit(x_train, y_train)

KNeighborsClassifier(leaf_size=10, n_neighbors=21, weights='distance')

In [12]:
# Training accuracy of the tuned model.
knn.score(X=x_train, y=y_train)

1.0

In [13]:
# Held-out accuracy of the tuned model.
knn.score(X=x_test, y=y_test)

0.5859375

In [16]:
# Predicted quality label for every row of the test split.
y_test_pred = knn.predict(x_test)

In [17]:
# Confusion matrix of true (rows) vs predicted (columns) quality labels.
# Uses the `confusion_matrix` name imported in the notebook's import cell
# rather than importing `sklearn.metrics` again in the middle of the notebook.
confusion_matrix(y_test, y_test_pred)

array([[  0,   0,   3,   0,   1,   0],
       [  0,   0,   9,   9,   2,   0],
       [  0,   0, 189,  76,   2,   0],
       [  0,   0,  83, 163,  17,   0],
       [  0,   0,  20,  38,  23,   0],
       [  0,   0,   2,   3,   0,   0]], dtype=int64)

In [19]:
# Root-mean-squared error between true and predicted labels (treats the
# quality labels as numbers).
np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_test_pred))

0.8129806270754549

In [21]:
# Mean absolute percentage error between true and predicted labels.
metrics.mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

0.08889508928571428

In [25]:
# Inspect the feature values of the first row of the test split.
x_test.iloc[0]

fixed acidity            7.80000
volatile acidity         0.70000
citric acid              0.06000
residual sugar           1.90000
chlorides                0.07900
free sulfur dioxide     20.00000
total sulfur dioxide    35.00000
density                  0.99628
pH                       3.40000
sulphates                0.69000
alcohol                 10.90000
Name: 1254, dtype: float64

In [27]:
# Predict for the first test row. `iloc[[0]]` (list indexer) keeps a one-row
# DataFrame with its column names, whereas wrapping a Series in a list hands
# the model an unnamed array even though it was fitted on named columns.
knn.predict(x_test.iloc[[0]])

array([6], dtype=int64)

In [32]:
# Predict for the tenth test row. `iloc[[9]]` (list indexer) keeps a one-row
# DataFrame with its column names, whereas wrapping a Series in a list hands
# the model an unnamed array even though it was fitted on named columns.
knn.predict(x_test.iloc[[9]])

array([7], dtype=int64)