### Breast Cancer Wisconsin (BCW) Classification using k-Nearest Neighbors

In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw records and report how many rows were read.
dataset = pd.read_csv('bc.csv')
print(len(dataset))

# Recode the target labels 2 -> 0 and 4 -> 1 so the class is a 0/1 indicator.
# NOTE(review): presumably 2 = benign and 4 = malignant -- confirm against the data source.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `dataset['Class']`: an in-place call on a column
# selection is chained assignment, which triggers a FutureWarning in recent pandas
# and does not update `dataset` at all once copy-on-write is enabled.
dataset['Class'] = dataset['Class'].replace({2: 0, 4: 1})
print(dataset.tail())

699
     Sample_Code_No  Clump_Thickness  Cell_Size_Uniformity  \
694          776715                3                     1   
695          841769                2                     1   
696          888820                5                    10   
697          897471                4                     8   
698          897471                4                     8   

     Cell_Shape_Uniformity  Marginal_Adhesion  Single_Epithelial_Cell_Size  \
694                      1                  1                            3   
695                      1                  1                            2   
696                     10                  3                            7   
697                      6                  4                            3   
698                      8                  5                            4   

     Bare_Nuclei  Bland_Chromatin  Normal_Nucleoli  Mitoses  Class  
694            2                1                1        1      0  
695            1

In [3]:
# Separate predictors from the target, then hold out 30% of the rows for testing.
# Selection is positional: columns 1..9 are the predictors and column 10 is the
# target; column 0 is not used as a feature.
feature_cols = slice(1, 10)
target_col = 10
X = dataset.iloc[:, feature_cols]
y = dataset.iloc[:, target_col]

# A fixed random_state makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [4]:
# Standardise the features.
# The scaler is fitted on the training rows only and the same fitted scaler is
# then applied to both subsets, so no statistics are learned from the test rows.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [5]:
# Show the number of test rows and its square root, one value per line.
# NOTE(review): presumably a rule-of-thumb guide for picking n_neighbors -- confirm.
n_test = len(y_test)
print(n_test, math.sqrt(n_test), sep='\n')

210
14.491376746189438


In [6]:
# Build the k-nearest-neighbours classifier (not fitted yet).
# Votes are taken over the 13 closest training points under Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=13,
    metric='euclidean',
    p=2,
)

In [7]:
# Train the classifier on the scaled training features and their labels.
# The fitted estimator is the cell's last expression, so its repr is displayed.
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=13)

In [8]:
# Predict a class label for every row of the scaled test features.
y_pred = classifier.predict(X=X_test)
# Bare expression so the notebook displays the predicted label array.
y_pred

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int64)

In [9]:
# Compare the predictions with the true test labels.
# In scikit-learn's confusion matrix, rows are the true classes and columns are
# the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[130   5]
 [  5  70]]


In [10]:
# Report overall accuracy and the F1 score, each rounded to four decimal places.
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_test, y_pred):.4f}')

Accuracy: 0.9524
F1 Score: 0.9333
