## KNN klasifikasi

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
%matplotlib notebook

In [2]:
dataset = pd.read_csv('clustered_analysis.csv')

In [3]:
dataset.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),Segment
0,1,Male,19,15,39,5
1,2,Male,21,15,81,4
2,3,Female,20,16,6,5
3,4,Female,23,16,77,4
4,5,Female,31,17,40,5


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
 5   Segment                 200 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 9.5+ KB


In [5]:
X = dataset.iloc[:, 3:5]

In [6]:
X.head()

Unnamed: 0,Annual Income (k$),Spending Score (1-100)
0,15,39
1,15,81
2,16,6
3,16,77
4,17,40


In [7]:
X = pd.get_dummies(X)

In [8]:
X = X.values

In [9]:
X

array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],
       [ 21,  66],
       [ 23,  29],
       [ 23,  98],
       [ 24,  35],
       [ 24,  73],
       [ 25,   5],
       [ 25,  73],
       [ 28,  14],
       [ 28,  82],
       [ 28,  32],
       [ 28,  61],
       [ 29,  31],
       [ 29,  87],
       [ 30,   4],
       [ 30,  73],
       [ 33,   4],
       [ 33,  92],
       [ 33,  14],
       [ 33,  81],
       [ 34,  17],
       [ 34,  73],
       [ 37,  26],
       [ 37,  75],
       [ 38,  35],
       [ 38,  92],
       [ 39,  36],
       [ 39,  61],
       [ 39,  28],
       [ 39,  65],
       [ 40,  55],
       [ 40,  47],
       [ 40,  42],
       [ 40,  42],
       [ 42,  52],
       [ 42,  60],
       [ 43,

In [10]:
Y = dataset.iloc[:, -1].values

In [11]:
Y

array([5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4,
       5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 0, 4, 0, 3,
       5, 4, 0, 3, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3,
       0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 3, 0, 0, 3, 0, 0, 3,
       3, 0, 0, 3, 0, 3, 3, 3, 0, 3, 0, 3, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0,
       0, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, 3, 3, 1, 3, 1, 2, 1, 2, 1, 2, 1,
       3, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
       2, 1], dtype=int64)

## Split data

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=.25, random_state = 42)

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
KNN = KNeighborsClassifier(
    algorithm='kd_tree',
    n_jobs=-1
)

In [16]:
KNN.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='kd_tree', n_jobs=-1)

In [17]:
Y_pred = KNN.predict(X_test)

## accuracy

In [18]:
from sklearn.metrics import classification_report, confusion_matrix

In [19]:
print(classification_report(Y_test.reshape(-1, 1), Y_pred))

              precision    recall  f1-score   support

           0       0.50      0.80      0.62        10
           1       1.00      1.00      1.00         8
           2       0.83      1.00      0.91        10
           3       0.60      0.25      0.35        12
           4       1.00      0.80      0.89         5
           5       1.00      1.00      1.00         5

    accuracy                           0.76        50
   macro avg       0.82      0.81      0.79        50
weighted avg       0.77      0.76      0.74        50



In [20]:
from sklearn.model_selection import cross_val_score

In [26]:
print('Cross val: ', cross_val_score(KNN, Y_test.reshape(-1, 1), Y_pred, cv = 5))
print('Accuracy : ', np.mean(cross_val_score(KNN, Y_test.reshape(-1, 1), Y_pred)))

Cross val:  [0.8 0.9 0.8 0.8 0.9]
Accuracy :  0.8400000000000001


