In [2]:
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
names = ['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape', 'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']
df = pd.read_csv(url, names=names)

In [4]:
df.replace('?', -99999, inplace=True)
df.head()

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
df.drop(columns='id', axis=1, inplace=True)
df.head()

Unnamed: 0,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [6]:
print(df.shape)

(699, 10)


In [7]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
clump_thickness,699.0,4.41774,2.815741,1.0,2.0,4.0,6.0,10.0
uniform_cell_size,699.0,3.134478,3.051459,1.0,1.0,1.0,5.0,10.0
uniform_cell_shape,699.0,3.207439,2.971913,1.0,1.0,1.0,5.0,10.0
marginal_adhesion,699.0,2.806867,2.855379,1.0,1.0,1.0,4.0,10.0
single_epithelial_size,699.0,3.216023,2.2143,1.0,2.0,2.0,4.0,10.0
bland_chromatin,699.0,3.437768,2.438364,1.0,2.0,3.0,5.0,10.0
normal_nucleoli,699.0,2.866953,3.053634,1.0,1.0,1.0,4.0,10.0
mitoses,699.0,1.589413,1.715078,1.0,1.0,1.0,1.0,10.0
class,699.0,2.689557,0.951273,2.0,2.0,2.0,4.0,4.0


In [8]:
X = np.array(df.drop(columns='class', axis=1))
y = np.array(df['class'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

scoring='accuracy'

models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
models.append(('SVM', SVC(gamma='auto')))

In [9]:
results = []
names = []

for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

KNN: 0.967760 (0.029738)
SVM: 0.960617 (0.028622)


In [10]:
# prediction
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f'{name}: {accuracy_score(y_test, predictions)}')
    print(classification_report(y_test, predictions))

KNN: 0.9857142857142858
              precision    recall  f1-score   support

           2       0.99      0.99      0.99        99
           4       0.98      0.98      0.98        41

    accuracy                           0.99       140
   macro avg       0.98      0.98      0.98       140
weighted avg       0.99      0.99      0.99       140

SVM: 0.9571428571428572
              precision    recall  f1-score   support

           2       1.00      0.94      0.97        99
           4       0.87      1.00      0.93        41

    accuracy                           0.96       140
   macro avg       0.94      0.97      0.95       140
weighted avg       0.96      0.96      0.96       140



In [22]:
clf = SVC(gamma='auto')

clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

example = np.array([[4, 2, 1, 1, 2, 1, 2, 1, 1]])
example = example.reshape(len(example), -1)
prediction = clf.predict(example)
print(prediction)

0.9571428571428572
[2]


In [23]:
print(df.loc[9])

clump_thickness           4
uniform_cell_size         2
uniform_cell_shape        1
marginal_adhesion         1
single_epithelial_size    2
bare_nuclei               1
bland_chromatin           2
normal_nucleoli           1
mitoses                   1
class                     2
Name: 9, dtype: object


In [24]:
example1 = np.array([[5, 3, 3, 3, 2, 3, 4, 4, 1]])
example1 = example1.reshape(len(example1), -1)
prediction = clf.predict(example1)
print(prediction)
print(df.loc[9])

[4]
clump_thickness           4
uniform_cell_size         2
uniform_cell_shape        1
marginal_adhesion         1
single_epithelial_size    2
bare_nuclei               1
bland_chromatin           2
normal_nucleoli           1
mitoses                   1
class                     2
Name: 9, dtype: object


In [25]:
example2 = np.array([[7, 3, 2, 10, 5, 10, 5, 4, 4]])
example2 = example2.reshape(len(example2), -1)
prediction = clf.predict(example2)
print(prediction)
print(df.loc[9])

[4]
clump_thickness           4
uniform_cell_size         2
uniform_cell_shape        1
marginal_adhesion         1
single_epithelial_size    2
bare_nuclei               1
bland_chromatin           2
normal_nucleoli           1
mitoses                   1
class                     2
Name: 9, dtype: object


In [29]:
example3 = np.array([[4, 2, 1, 1, 2, 1, 2, 1, 1]])
example3 = example3.reshape(len(example3), -1)
prediction = clf.predict(example3)
print(prediction)
print(df.loc[9])

[2]
clump_thickness           4
uniform_cell_size         2
uniform_cell_shape        1
marginal_adhesion         1
single_epithelial_size    2
bare_nuclei               1
bland_chromatin           2
normal_nucleoli           1
mitoses                   1
class                     2
Name: 9, dtype: object
