# Wisconsin Breast Cancer Data
## Tumor Prediction Using KNN

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load data
wdbc = pd.read_csv("Data/wdbc.data", header=None)
wdbc.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
61,858981,B,8.598,20.98,54.66,221.8,0.1243,0.08963,0.03,0.009259,...,9.565,27.04,62.06,273.9,0.1639,0.1698,0.09001,0.02778,0.2972,0.07712
276,8911230,B,11.33,14.16,71.79,396.6,0.09379,0.03872,0.001487,0.003333,...,12.2,18.99,77.37,458.0,0.1259,0.07348,0.004955,0.01111,0.2758,0.06386
148,86973702,B,14.44,15.18,93.97,640.1,0.0997,0.1021,0.08487,0.05532,...,15.85,19.85,108.6,766.9,0.1316,0.2735,0.3103,0.1599,0.2691,0.07683
46,85713702,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,...,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409
100,862717,M,13.61,24.98,88.05,582.7,0.09488,0.08511,0.08625,0.04489,...,16.99,35.27,108.6,906.5,0.1265,0.1943,0.3169,0.1184,0.2651,0.07397


In [3]:
# split data into dependant and independant variables
X=wdbc.iloc[:,2:32].values
y=wdbc.iloc[:,1:2].values
[X.shape, y.shape]

[(569, 30), (569, 1)]

In [4]:
# encode tumor type data
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y[0:5]

  y = column_or_1d(y, warn=True)


array([1, 1, 1, 1, 1], dtype=int64)

In [5]:
# test and train set split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)
[X_test.shape, y_test.shape]

[(114, 30), (114,)]

In [6]:
# standard scaling based on mean and variance
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
X_train[0:2,:]
# option2: use normalize to scale to [0,1]

array([[-1.15036482, -0.39064196, -1.12855021, -0.95876358,  0.3109837 ,
        -0.5959945 , -0.80259612, -0.80249002,  0.29453906,  0.0942515 ,
        -0.4950523 ,  1.48720153, -0.51448782, -0.49154005,  0.28149837,
        -0.60451206, -0.46900701, -0.61170002,  0.05798237, -0.35763702,
        -1.0431756 ,  0.21353282, -1.0360446 , -0.84880771,  0.34249851,
        -0.73009743, -0.81232053, -0.75798367, -0.01614761, -0.38503402],
       [-0.93798972,  0.68051405, -0.94820146, -0.82152548, -0.60963604,
        -0.90986721, -0.66066905, -0.89871612,  0.75493453, -0.42547082,
        -0.33381757,  0.75941203, -0.28751805, -0.42127695, -0.1620797 ,
        -0.20486693, -0.05029632, -0.20309076, -0.25469005, -0.39139463,
        -0.71565415,  1.06684183, -0.68992205, -0.66869703, -0.09553745,
        -0.53786647, -0.37504806, -0.60687023,  0.09669004, -0.38615797]])

In [7]:
# train the KNN classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [8]:
# predicting the tumor type on the test set
y_pred = knn.predict(X_test)

In [9]:
# making a confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[67,  0],
       [ 4, 43]], dtype=int64)

In [11]:
# accuracy of model
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9649122807017544
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        67
          1       1.00      0.91      0.96        47

avg / total       0.97      0.96      0.96       114



In [None]:
# 