In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import plot as plt
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
# Load Dataset
names = ['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape',
       'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei',
       'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']
df = pd.read_csv('breast-cancer-wisconsin.csv',names=names)

In [3]:
# Preprocess the data
df.replace('?',-99999, inplace=True)
print(df.axes)
df.drop(['id'], axis = 1, inplace=True)

[RangeIndex(start=0, stop=699, step=1), Index(['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape',
       'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei',
       'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class'],
      dtype='object')]


In [4]:
# Let explore the dataset and do a few visualizations
print(df.loc[10])
print(df.iloc[1,:])
display(df)
# Print the shape of the dataset
print(df.shape)

clump_thickness           1
uniform_cell_size         1
uniform_cell_shape        1
marginal_adhesion         1
single_epithelial_size    1
bare_nuclei               1
bland_chromatin           3
normal_nucleoli           1
mitoses                   1
class                     2
Name: 10, dtype: object
clump_thickness            5
uniform_cell_size          4
uniform_cell_shape         4
marginal_adhesion          5
single_epithelial_size     7
bare_nuclei               10
bland_chromatin            3
normal_nucleoli            2
mitoses                    1
class                      2
Name: 1, dtype: object


Unnamed: 0,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


(699, 10)


In [5]:
# Describe the dataset
print(df.describe())

       clump_thickness  uniform_cell_size  uniform_cell_shape  \
count       699.000000         699.000000          699.000000   
mean          4.417740           3.134478            3.207439   
std           2.815741           3.051459            2.971913   
min           1.000000           1.000000            1.000000   
25%           2.000000           1.000000            1.000000   
50%           4.000000           1.000000            1.000000   
75%           6.000000           5.000000            5.000000   
max          10.000000          10.000000           10.000000   

       marginal_adhesion  single_epithelial_size  bland_chromatin  \
count         699.000000              699.000000       699.000000   
mean            2.806867                3.216023         3.437768   
std             2.855379                2.214300         2.438364   
min             1.000000                1.000000         1.000000   
25%             1.000000                2.000000         2.000000   


In [6]:
# Create X and Y datasets for training
X = np.array(df.drop(['class'], axis = 1))
y = np.array(df['class'])

X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.2)

In [7]:
# Testing Options
seed = 8
scoring = 'accuracy'

In [8]:
results = []
model = SVC()
skfolds = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train,cv=skfolds, scoring=scoring)
results.append(cv_results)
msg="%s:%f, %f" % ("SVM",cv_results.mean(),cv_results.std())
print(msg)

SVM:0.644026, 0.003506


In [12]:
# Make predictions on validation dataset
model.fit(X_train,Y_train)
predictions = model.predict(X_validation)
sklearn.metrics.f1_score(Y_validation, predictions, average='weighted', labels=np.unique(y_pred))
print(accuracy_score(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
# Accuracy - ratio of correctly predicted observation to the total observations. 
# Precision - (false positives) ratio of correctly predicted positive observations to the total predicted positive observations
# Recall (Sensitivity) - (false negatives) ratio of correctly predicted positive observations to the all observations in actual class - yes.
# F1 score - F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false 

NameError: name 'sklearn' is not defined

In [10]:
clf = SVC()

clf.fit(X_train, Y_train)
accuracy = clf.score(X_validation, Y_validation)
print(accuracy)

example_measures = np.array([[4,2,1,1,1,2,3,2,1]])
example_measures = example_measures.reshape(len(example_measures), -1)
prediction = clf.predict(example_measures)
print(prediction)

0.7
[2]
