# KNN Notebook
----
We will use Social_Network_Ads.csv dataset in this notebook.

The dataset has information on customers, and whether they bought a certain product or not.

In this notebook, we will first implement KNN from scratch, then use sklearn [KNN](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

## 1. Reading Data

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


## (EXTRA) To replace strings with unique integers WITHOUT OHB encoding

In [3]:
df2 = df.copy()
df2

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
df2 = df2.drop('User ID', axis='columns')
df2

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


#### I want to change values in Gender column to integers
----
ASSUMING we don't know the unique values in this column.

In [5]:
unique_values = pd.unique(df2['Gender'])
print(unique_values)
i = 0
for value in unique_values:
    df2['Gender'] = df2['Gender'].replace(value, i)
    i += 1

df2

['Male' 'Female']


Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,0,19,19000,0
1,0,35,20000,0
2,1,26,43000,0
3,1,27,57000,0
4,0,19,76000,0
...,...,...,...,...
395,1,46,41000,1
396,0,51,23000,1
397,1,50,20000,1
398,0,36,33000,0


## 2. Split Data

In [6]:
X = df.loc[:, 'Age':'EstimatedSalary']
y = df['Purchased']
X

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000
...,...,...
395,46,41000
396,51,23000
397,50,20000
398,36,33000


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

y_train = y_train.values
y_test = y_test.values

(300, 2) (300,)
(100, 2) (100,)


In [8]:
import numpy as np

class UserStandardScaler:
    def fit(self, X):
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        return self

    def transform(self, X):
        return (X - self.mean) / self.std

In [11]:
def fit(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    return mean,std

def transform( X,m,s):
    return (X - m) / s

In [14]:
m,s=fit(X_test)

In [15]:
Xstd_train=transform(X_test,m,s)
Xstd_train

Unnamed: 0,Age,EstimatedSalary
132,-0.547490,0.513073
309,0.154420,-0.618256
341,-0.108796,0.146155
196,-0.547490,0.268461
246,-0.108796,-0.618256
...,...,...
146,-0.810706,0.788261
135,-1.161661,-0.220762
390,1.031808,-1.138055
264,1.031808,0.604802


In [9]:
scaler = UserStandardScaler()
scaler.fit(X_train)
scaler.fit(X_test)
Xstd_train = scaler.transform(X_train)
Xstd_test=scaler.transform(X_test)

In [10]:
#Xstd_test[0:5]
Xstd_test

Unnamed: 0,Age,EstimatedSalary
132,-0.547490,0.513073
309,0.154420,-0.618256
341,-0.108796,0.146155
196,-0.547490,0.268461
246,-0.108796,-0.618256
...,...,...
146,-0.810706,0.788261
135,-1.161661,-0.220762
390,1.031808,-1.138055
264,1.031808,0.604802


# With Using SKlearn

In [11]:
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)
#X_test[0:5]

## 3. Model Building

### 3.1. From Scratch

In [12]:
#def eucledian(p1, p2):
   # dist = np.sqrt(np.sum((p1-p2)**2))
    #return dist

To get the most votes from a classes array, we can use [mode](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html) function from scipy library.

In [13]:
#accuracy = accuracy_score(y_test, y_pred)
#print('For K = %d, accuracy = %.2f%%\n' % (K, accuracy * 100))
#cm = confusion_matrix(y_test, y_pred)
#print(cm)

### 3.2. Using Sklearn 

In [14]:
# we will test using k from 1 to 25
scores = {}
for k in range(1, 26):
    knn = KNN(n_neighbors=k)
    knn = knn.fit(Xstd_train, y_train)
    y_pred = knn.predict(Xstd_test)
    scores[k] = accuracy_score(y_test, y_pred)

In [15]:
besk_K = 0
best_acc = 0
for k in scores:
    print('For K = %d, accuracy = %.2f%%' % (k, scores[k] * 100))
    if scores[k] > best_acc:
        best_acc = scores[k]
        best_K = k

For K = 1, accuracy = 89.00%
For K = 2, accuracy = 89.00%
For K = 3, accuracy = 93.00%
For K = 4, accuracy = 92.00%
For K = 5, accuracy = 93.00%
For K = 6, accuracy = 93.00%
For K = 7, accuracy = 93.00%
For K = 8, accuracy = 93.00%
For K = 9, accuracy = 93.00%
For K = 10, accuracy = 93.00%
For K = 11, accuracy = 93.00%
For K = 12, accuracy = 93.00%
For K = 13, accuracy = 93.00%
For K = 14, accuracy = 93.00%
For K = 15, accuracy = 93.00%
For K = 16, accuracy = 93.00%
For K = 17, accuracy = 93.00%
For K = 18, accuracy = 93.00%
For K = 19, accuracy = 93.00%
For K = 20, accuracy = 93.00%
For K = 21, accuracy = 93.00%
For K = 22, accuracy = 93.00%
For K = 23, accuracy = 93.00%
For K = 24, accuracy = 93.00%
For K = 25, accuracy = 93.00%


In [16]:
knn = KNN(n_neighbors=best_K)
knn = knn.fit(Xstd_train, y_train)
y_pred = knn.predict(Xstd_test)

In [17]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[64  4]
 [ 3 29]]
