# k - Nearest Neighbours

In [2]:
# import necessary packages
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter

In [3]:
# Load the data
data = pd.read_csv('car_evaluation.csv')
data.head(2)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc


In [4]:
# Basic Checks
data.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
outcome     0
dtype: int64

In [5]:
data.describe()

Unnamed: 0,doors,persons
count,1728.0,1728.0
mean,3.5,3.666667
std,1.118358,1.24758
min,2.0,2.0
25%,2.75,2.0
50%,3.5,4.0
75%,4.25,5.0
max,5.0,5.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null int64
persons     1728 non-null int64
lug_boot    1728 non-null object
safety      1728 non-null object
outcome     1728 non-null object
dtypes: int64(2), object(5)
memory usage: 94.6+ KB


In [7]:
Counter(data.outcome)

Counter({'unacc': 1210, 'acc': 384, 'vgood': 65, 'good': 69})

In [9]:
data.shape

(1728, 7)

In [12]:
# Define 
X = data.iloc[:,:6] #X = data.iloc[:,:-1]
X.head(2)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med


In [13]:
y = data.outcome

In [16]:
enc = LabelEncoder()
X.buying = enc.fit_transform(X.buying)
X.maint = enc.fit_transform(X.maint)
X.lug_boot = enc.fit_transform(X.lug_boot)
X.safety = enc.fit_transform(X.safety)
X.head(2)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2


In [17]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10)

In [18]:
# define model
model = KNeighborsClassifier(n_neighbors=5)

In [19]:
# Train / fit
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
y_predict = model.predict(X_test)

In [21]:
accuracy_score(y_test,y_predict)

0.9421296296296297

In [23]:
confusion_matrix(y_test,y_predict)

array([[ 77,   1,   6,   0],
       [  5,  14,   0,   0],
       [  3,   0, 304,   0],
       [  7,   2,   1,  12]], dtype=int64)

In [24]:
pd.crosstab(y_test,y_predict)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,77,1,6,0
good,5,14,0,0
unacc,3,0,304,0
vgood,7,2,1,12


In [25]:
Counter(y_test)

Counter({'unacc': 307, 'good': 19, 'acc': 84, 'vgood': 22})