In [2]:
import numpy as np 
from sklearn import preprocessing, cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.metrics import classification_report, accuracy_score
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import pandas as pd



In [3]:
# Location of the breast-cancer-wisconsin data file in the UCI ML repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels, in file order: a sample id, nine features that breast cancer
# pathologists look at when examining a potentially cancerous cell, and
# finally the class label (11 columns in total, 9 of them predictive).
names = [
    'id',
    'clump_thickness',
    'uniform_cell_size',
    'uniform_cell_shape',
    'marginal_adhesion',
    'single_epithelial_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]

# Column names are supplied explicitly, so every row of the file is read as
# data (header=None is what pandas infers when `names` is given).
df = pd.read_csv(url, header=None, names=names)

In [4]:
# Replace the '?' placeholder entries with a large negative sentinel so that
# every cell holds a value the classifiers can consume.
df = df.replace('?', -99999)
print(df.axes)

# Drop the 'id' column so it is not used as a feature. `columns=` is used
# because newer pandas releases no longer accept `axis` positionally, and
# errors='ignore' lets this cell be re-run after 'id' has already been removed.
df = df.drop(columns=['id'], errors='ignore')

print(df.shape)

[RangeIndex(start=0, stop=699, step=1), Index(['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape',
       'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei',
       'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class'],
      dtype='object')]
(699, 10)


In [5]:
# Show the row labelled 1 (all columns) as a sanity check of the cleaned frame.
print(df.loc[1, :])

clump_thickness            5
uniform_cell_size          4
uniform_cell_shape         4
marginal_adhesion          5
single_epithelial_size     7
bare_nuclei               10
bland_chromatin            3
normal_nucleoli            2
mitoses                    1
class                      2
Name: 1, dtype: object


In [9]:
# Feature matrix: every column except the label. `columns=` replaces the
# positional axis argument, which newer pandas releases reject.
X = np.array(df.drop(columns=['class']))
# Target vector: the 'class' column.
y = np.array(df['class'])

# Hold out 20% of the rows for the final evaluation.
# - `sklearn.cross_validation` was removed from scikit-learn; the same
#   function lives in `model_selection`, which this notebook already imports.
# - random_state pins the shuffle so the split (and every score computed from
#   it) is reproducible; 8 matches the `seed` value set in the next cell.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=8
)

In [10]:
# Experiment settings used by the cross-validation cell below:
#   seed    - random_state handed to the K-fold splitter
#   scoring - metric name handed to cross_val_score
seed, scoring = 8, 'accuracy'

In [11]:
# (label, estimator) pairs to compare. The label is what gets printed next to
# each model's scores in the cells below.
# The original SVM line was missing its closing parenthesis (SyntaxError).
models = [
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('SVM', SVC()),
]

In [23]:
# Per-model arrays of fold scores, and the matching model labels (same order).
results = []
names = []

# One 10-fold splitter shared by all models so they are scored on identical
# folds. shuffle=True is required for random_state to have any effect; newer
# scikit-learn releases raise an error when random_state is given without it.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

for name, model in models:
    # Cross-validate on the training portion only; the test split stays unseen.
    cv_results = model_selection.cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    # Printed as "<label>: <mean score> (<standard deviation across folds>)".
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# In the recorded run below, K Nearest Neighbors narrowly beats the Support
# Vector Machine on mean cross-validated accuracy.


KNN: 0.971396 (0.021413)
SVM: 0.960649 (0.029659)


In [24]:
# Final check: train each model on the full training split and score it on
# the held-out test split.
for name, model in models:
    # fit() returns the fitted estimator, so fitting and predicting can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

# How to read the classification report (per class):
#   precision - of the samples predicted as this class, the fraction that truly
#               belong to it, TP / (TP + FP); false positives lower it
#   recall    - of the samples that truly belong to this class, the fraction the
#               model found, TP / (TP + FN); false negatives lower it
#   f1-score  - harmonic mean of precision and recall
#   support   - number of true samples of this class in y_test

KNN
0.957142857143
             precision    recall  f1-score   support

          2       0.96      0.98      0.97        87
          4       0.96      0.92      0.94        53

avg / total       0.96      0.96      0.96       140

SVM
0.957142857143
             precision    recall  f1-score   support

          2       0.99      0.94      0.96        87
          4       0.91      0.98      0.95        53

avg / total       0.96      0.96      0.96       140



KNN
0.957142857143
             precision    recall  f1-score   support

          2       0.96      0.98      0.97        87
          4       0.96      0.92      0.94        53

avg / total       0.96      0.96      0.96       140

SVM
0.957142857143
             precision    recall  f1-score   support

          2       0.99      0.94      0.96        87
          4       0.91      0.98      0.95        53

avg / total       0.96      0.96      0.96       140

