University of Wisconsin Breast Cancer Diagnosis Dataset on UCI Machine Learning Repository
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

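Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. Note (to confirm): the columns loaded below (Clump Thickness, Uniformity of Cell Size, ..., Mitoses) appear to correspond to the repository's "Breast Cancer Wisconsin (Original)" dataset rather than the Diagnostic one linked above, so this description and the link may need updating.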

In [1]:
import pandas as pd
import numpy as np

In [2]:
from bokeh.plotting import figure, show, gridplot, ColumnDataSource
from bokeh.io import output_notebook
from bokeh.models import Span, Label
from bokeh.models import BoxAnnotation


from bokeh.transform import linear_cmap
from bokeh.util.hex import hexbin

In [3]:
# Route Bokeh output to the notebook so that show(...) renders figures inline.
output_notebook()

In [4]:
# Load the dataset from a CSV file given by a relative path, i.e. resolved
# against the notebook's current working directory.
df_bC = pd.read_csv('breast-cancer.csv')

In [5]:
# Preview the first five rows to check the column names and value ranges.
df_bC.head()

Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [6]:
# Remove the sample identifier so it is not picked up as a model feature
# (every column except 'Class' is later used as an input to the classifier).
# errors='ignore' keeps this cell idempotent: re-running it after 'ID' has
# already been dropped is a no-op instead of raising KeyError.
df_bC = df_bC.drop(['ID'], axis = 1, errors = 'ignore')

In [7]:
# Malignant samples: rows whose true label 'Class' equals 1.
class1 = df_bC.loc[df_bC['Class'] == 1]

In [8]:
# Benign samples: rows whose true label 'Class' equals 0.
class0 = df_bC.loc[df_bC['Class'] == 0]

In [9]:
# (rows, columns) of the malignant subset - shows how many malignant samples there are.
class1.shape

(239, 10)

In [10]:
# (rows, columns) of the benign subset - compare with the malignant count to see the class balance.
class0.shape

(444, 10)

In [11]:

# Scatter plot of the ground-truth classes for two of the features.
#
# The subsets are derived from df_bC inside this cell instead of reading the
# globals class1/class0: those names are rebound to *predicted* labels further
# down the notebook, so re-running this cell out of order would otherwise
# silently plot predictions instead of the true diagnoses.
true_malignant = df_bC.query('Class == 1')
true_benign = df_bC.query('Class == 0')

# NOTE(review): `legend=` and `plot_width`/`plot_height` are reported as
# deprecated in newer Bokeh releases (`legend_label=`, `width`/`height`).
# They are kept to match the Bokeh version this notebook was written against -
# confirm before upgrading.
p1 = figure(title = 'Breast Cancer Diagnoses (true labels)', plot_width=500, plot_height=500)

# The features are integer-valued, so samples can share exact coordinates;
# the low alpha makes stacked markers show up as darker spots.
p1.circle(true_malignant['Bland Chromatin'], true_malignant['Single Epithelial Cell Size'], size=17,
          color="red", alpha=0.2, legend = 'Malignant', muted_color='grey', muted_alpha=0.2)
p1.circle(true_benign['Bland Chromatin'], true_benign['Single Epithelial Cell Size'], size=17,
          color="green", alpha=0.2, legend = 'Benign', muted_color='grey', muted_alpha=0.2)

p1.xaxis.axis_label = 'Bland Chromatin'
p1.yaxis.axis_label = 'Single Epithelial Cell Size'
p1.legend.location='top_left'
# "mute" (rather than "hide") so that the muted_color/muted_alpha configured on
# the glyphs actually take effect when a legend entry is clicked; this also
# matches the behaviour of the predictions plot at the end of the notebook.
p1.legend.click_policy="mute"

show(p1)

In [12]:
# Check for missing values and column dtypes: info() lists the non-null count
# per column, which should equal the total number of rows if nothing is missing.
df_bC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 10 columns):
Clump Thickness                683 non-null int64
Uniformity of Cell Size        683 non-null int64
Uniformity of Cell Shape       683 non-null int64
Marginal Adhesion              683 non-null int64
Single Epithelial Cell Size    683 non-null int64
Bare Nuclei                    683 non-null int64
Bland Chromatin                683 non-null int64
Normal Nucleoli                683 non-null int64
Mitoses                        683 non-null int64
Class                          683 non-null int64
dtypes: int64(10)
memory usage: 53.5 KB


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Split data into training and test sets - 75% training and 25% testing.
#
# random_state pins the shuffle so the split - and therefore the fitted model,
# the accuracy and the plots that follow - is reproducible across kernel restarts.
# stratify keeps the benign/malignant proportions the same in both subsets,
# which matters because the two classes are not equally frequent.
X_train, X_test, y_train, y_test = train_test_split(
    df_bC.drop(['Class'], axis = 1), df_bC['Class'], train_size = 0.75,
    random_state = 42, stratify = df_bC['Class'])

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# k-nearest-neighbours classifier that considers the 9 nearest training samples;
# every other setting is left at the scikit-learn default.
kNN = KNeighborsClassifier(n_neighbors=9)

In [17]:
# Fit the classifier on the training features and their true labels.
kNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [18]:
# Predicted class label for every row of the held-out test set.
predict = kNN.predict(X_test)

In [19]:
# Sanity check: there should be one prediction per test sample.
predict.shape

(171,)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
# Accuracy on the held-out test set: the fraction of test samples whose
# predicted label matches the true label.
acc_9 = accuracy_score(y_test, kNN.predict(X_test))
acc_9

0.9649122807017544

In [22]:
# Work on a copy of the test features so that adding columns to `results`
# does not modify X_test.
results = X_test.copy()

In [23]:
# Attach the *predicted* label to each test row. Note that this 'Class' column
# holds the model's predictions, not the true labels (those are in y_test).
results['Class'] = predict.copy()

In [24]:
# Preview the test rows together with their predicted class.
results.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
31,10,7,7,3,8,5,7,4,3,1
172,4,1,1,1,2,1,3,1,1,0
240,8,4,4,1,2,9,3,3,1,1
60,9,10,10,1,10,8,3,3,1,1
23,1,1,1,1,2,1,3,1,1,0


In [30]:
# Test rows the model predicted as malignant (Class == 1).
# NOTE: this rebinds `class1`, which earlier in the notebook held the rows of
# df_bC whose *true* label is 1; cells that use `class1` therefore depend on
# which of the two assignments ran last.
class1 = results.query('Class == 1')

In [31]:
# Test rows the model predicted as benign (Class == 0).
# NOTE: this rebinds `class0`, which earlier in the notebook held the rows of
# df_bC whose *true* label is 0; cells that use `class0` therefore depend on
# which of the two assignments ran last.
class0 = results.query('Class == 0')

In [39]:
#Plot of predictions using Bokeh
#
# The predicted-class subsets are taken directly from `results` (test features
# plus the predicted 'Class') inside this cell. The globals class1/class0 are
# assigned twice in this notebook - once from the true labels and once from the
# predictions - so reading them here would make the plot depend on the order in
# which cells were executed.
pred_malignant = results.query('Class == 1')
pred_benign = results.query('Class == 0')

# NOTE(review): `legend=` and `plot_width`/`plot_height` are reported as
# deprecated in newer Bokeh releases (`legend_label=`, `width`/`height`).
# They are kept to match the Bokeh version this notebook was written against -
# confirm before upgrading.
p3 = figure(title = 'Breast Cancer Predictions', plot_width=700, plot_height=700)

# The malignant markers are drawn larger than the benign ones (30 vs 20) -
# presumably so both stay visible where the two classes share coordinates;
# confirm the intent.
p3.circle(pred_malignant['Bland Chromatin'], pred_malignant['Single Epithelial Cell Size'], size=30,
          color="red", alpha=0.2, legend = 'Malignant', muted_color='grey', muted_alpha=0.2)
p3.circle(pred_benign['Bland Chromatin'], pred_benign['Single Epithelial Cell Size'], size=20,
          color="green", alpha=0.2, legend = 'Benign', muted_color='grey', muted_alpha=0.2)

p3.xaxis.axis_label = 'Bland Chromatin'
p3.yaxis.axis_label = 'Single Epithelial Cell Size'
p3.legend.location='top_left'
# Clicking a legend entry greys out that class (using muted_color/muted_alpha)
# instead of removing it from the plot.
p3.legend.click_policy="mute"

show(p3)

