In [81]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import mode

In [82]:
# Load the dataset from "cancer.csv" (path is relative to the notebook's
# working directory) into a DataFrame, then display it as the cell output.
df = pd.read_csv(r"cancer.csv")
df

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,5,1,1,1,2,1,3,1,1
1,0,5,4,4,5,7,10,3,2,1
2,0,3,1,1,1,2,2,3,1,1
3,0,6,8,8,1,3,4,3,7,1
4,0,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...,...
678,0,3,1,1,1,3,2,1,1,1
679,0,2,1,1,1,2,1,1,1,1
680,1,5,10,10,3,7,3,8,10,2
681,1,4,8,6,4,3,4,10,6,1


In [83]:
# Show the column labels of the loaded frame.
df.columns

Index(['Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
       'deg-malig', 'breast', 'breast-quad', 'irradiat'],
      dtype='object')

In [84]:
# Show the frame dimensions as (rows, columns).
df.shape

(683, 10)

In [85]:
# Count the missing values in each column.
df.isna().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

In [86]:
# Predictor columns: every column of the frame except the target 'Class'.
col = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat',
]

# Copy the selected columns out of the frame as NumPy arrays:
# X is the 2-D feature matrix, Y the 1-D label vector.
X = np.array(df.loc[:, col])
Y = np.array(df.loc[:, 'Class'])

In [87]:
# Split the data into train and test sets: 20% of the rows are held out for
# testing, and random_state=42 fixes the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

### **From scratch**

In [88]:
class Knn:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Usage: construct with the number of neighbours, call ``train`` to store
    the labelled training data, then call ``predict`` on new samples. Each
    prediction is the most frequent label among the ``n`` closest training
    points.
    """

    # constructor of the class
    def __init__(self, n):
        """Store the number of neighbours that vote on each prediction.

        Parameters
        ----------
        n : int
            Number of nearest neighbours to consult; must be at least 1.

        Raises
        ------
        ValueError
            If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n (number of neighbours) must be at least 1")
        self.neighbors = n

    # function to find the euclidean distance
    def euclidean(self, k, x_train):
        """Return the Euclidean distance between the points ``k`` and ``x_train``."""
        return np.sqrt(np.sum(np.square(k - x_train)))

    def train(self, X_train, y_train):
        """Memorise the training set (k-NN has no fitting step).

        Parameters
        ----------
        X_train : array-like, shape (m, n_features)
            Training samples, one per row.
        y_train : array-like, shape (m,)
            Numeric label of each training sample.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, or if the number of labels differs
            from the number of samples.
        """
        # Coerce to ndarrays so lists and DataFrames can be indexed by position.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if X_train.ndim != 2:
            raise ValueError("X_train must be a 2-D array of shape (m, n_features)")
        if y_train.shape[0] != X_train.shape[0]:
            raise ValueError("X_train and y_train must contain the same number of samples")

        self.m, self.n = X_train.shape
        self.X_train = X_train
        self.y_train = y_train

    # function to calculate the euclidean distances of passed x with all the data points in X_train
    def calc_euclidean(self, x):
        """Return the labels of the ``self.neighbors`` training points closest to ``x``.

        Labels are ordered from nearest to farthest. If fewer than
        ``self.neighbors`` training points exist, all of their labels are
        returned.
        """
        # Broadcasting subtracts x from every training row at once, replacing
        # the per-row Python loop with a single NumPy expression.
        distances = np.sqrt(np.sum(np.square(self.X_train - x), axis=1))
        nearest = distances.argsort()[:self.neighbors]
        return self.y_train[nearest]

    # function to predict the class the test points belongs to
    def predict(self, X_test):
        """Predict a label for every row of ``X_test`` by majority vote.

        Parameters
        ----------
        X_test : array-like, shape (q, n_features)
            Samples to classify, one per row.

        Returns
        -------
        numpy.ndarray, shape (q,)
            Predicted labels stored as float64 (labels must therefore be
            numeric). Vote ties resolve to the smallest tied label.
        """
        self.X_test = np.asarray(X_test)

        y_pred = np.zeros(self.X_test.shape[0])
        for i, x in enumerate(self.X_test):
            neigh = self.calc_euclidean(x)
            # Majority vote. np.unique returns the labels sorted, and argmax
            # picks the first maximum, so ties go to the smallest label. This
            # replaces scipy.stats.mode(neigh)[0][0], which fails on SciPy
            # releases where mode() returns scalars for 1-D input.
            labels, counts = np.unique(neigh, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]
        return y_pred

In [89]:
# Build the from-scratch classifier with 3 neighbours and hand it the
# training split (train only stores the data for later distance lookups).
clf = Knn(n=3)
clf.train(X_train,y_train)

In [90]:
# Predict a label for every row of the held-out test split with the
# from-scratch classifier, then display the predictions.
y_pred = clf.predict(X_test)
y_pred

array([1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1.,
       0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0.,
       1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
       0.])

In [91]:
# Accuracy of the from-scratch classifier on the test split.
accuracy_score(y_test,y_pred)

0.948905109489051

### **Using sklearn**

In [92]:
# Fit scikit-learn's k-NN classifier on the same training split.
# NOTE(review): this uses n_neighbors=9 while the from-scratch model above was
# built with n=3, so the two accuracy figures are not a like-for-like comparison.
# This also rebinds `clf`, replacing the from-scratch classifier instance.
clf = KNeighborsClassifier(n_neighbors=9)
clf.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=9)

In [93]:
# Predict the test split with the scikit-learn model and display the result.
# This rebinds `y_pred`, replacing the from-scratch predictions.
y_pred = clf.predict(X_test)
y_pred

array([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0], dtype=int64)

In [94]:
# Accuracy of the scikit-learn classifier on the test split.
accuracy_score(y_test,y_pred)

0.9562043795620438