In [1]:
import numpy as np
from sklearn import preprocessing, neighbors
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the data set, then:
#   * replace every '?' entry (presumably the file's missing-value marker) with
#     the numeric sentinel -99999, and
#   * drop the id column, which does not help the classifier decide whether a
#     sample is benign or malignant.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument to drop().
df = (
    pd.read_csv('/breast-cancer-wisconsin.data')
    .replace('?', -99999)
    .drop(columns=['id'])
)

In [4]:
# Display the prepared frame ('?' replaced, id column dropped) for a visual check.
df

Unnamed: 0,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epi_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [5]:
# Build the feature matrix (every column except the label) and the label vector.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts in drop().
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

In [7]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore the accuracy measured on it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Create the k-nearest-neighbours classifier; n_neighbors=5 spells out the library default.
clf = neighbors.KNeighborsClassifier(n_neighbors=5)

In [9]:
# Train the classifier on the training split.
clf.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Score the fitted classifier on the held-out test split.
accuracy = clf.score(X=X_test, y=y_test)

In [11]:
# Show the test-set score computed in the previous cell.
accuracy

0.9714285714285714

In [12]:
# An accuracy of about 97% is a strong result, especially for a first attempt.
# If you doubt that dropping the id column in the initial steps was necessary, copy this notebook, keep that column, and see how badly it hurts the accuracy.

In [23]:
# Try the classifier on a few hand-made samples: three rows of nine feature values each.
samples = [
    [5, 4, 1, 1, 1, 2, 3, 4, 1],
    [4, 2, 1, 9, 3, 1, 3, 2, 1],
    [8, 7, 1, 9, 5, 1, 4, 1, 1],
]
# Keep one row per sample; -1 lets numpy infer the number of feature columns.
example_measures = np.array(samples).reshape(len(samples), -1)
prediction = clf.predict(example_measures)
print(prediction)

[2 2 4]


In [None]:
# As the output above shows, the first two samples are predicted to be benign (2) while the third is predicted to be malignant (4). That seems plausible: in the third sample the clump thickness, cell size, marginal adhesion and epithelial cell size all look out of the ordinary compared with the benign samples.

In [None]:
"""
Here is the attribute information we used to create our own values for prediction.
Remember that the 1st attribute (the sample id) was removed from the data, so we can ignore it.
The feature values start from the 2nd attribute, i.e. clump thickness.
  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)
  """