## Performing breast cancer analysis using Support Vector Machines (SVMs) ##
![Breast cancer analysis](Breast-Cancer.jpg)

In [2]:
import numpy as np
from sklearn import cross_validation, svm
import pandas as pd

In [3]:
#reading in the dataset from a csv file
df = pd.read_csv('breast-cancer-wisconsin.data.csv')
#getting some statistics before playing with the data
df.describe()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marinal Adhension,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [4]:
#some values in 'Bland Chromatin' column are missing, we'll replace them with the column's mean
df.replace('?', 3, inplace=True)

In [5]:
#displaying first five examples of our dataset
df.head()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marinal Adhension,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marinal Adhension              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [7]:
#getting rid of the id column since it provides no information
df.drop(['id'], 1, inplace=True)

In [8]:
#let's see what it looks like now
df.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marinal Adhension,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [9]:
#setting X to the feature matrix
X = np.array(df.drop(['Class'],1))
#setting Y to the labels vector
y = np.array(df['Class'])

In [11]:
#making sure we obtained correct arrays
print(X.shape, y.shape)
print("----\nX:")
print(X)
print("----\ny:")
print(y[:10])

(699, 9) (699,)
----
X:
[[5 1 1 ..., 3 1 1]
 [5 4 4 ..., 3 2 1]
 [3 1 1 ..., 3 1 1]
 ..., 
 [5 10 10 ..., 8 10 2]
 [4 8 6 ..., 10 6 1]
 [4 8 8 ..., 10 4 1]]
----
y:
[2 2 2 2 2 4 2 2 2 2]


In [12]:
#dividing our dataset into training and cross-validation set
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

In [13]:
#creating Support Vector Clasifier
clf = svm.SVC()
#fitting it on our dataset
clf.fit(X_train, y_train)

#now, after we've trained our SVM classifier, we can display the accuracy
accuracy = clf.score(X_test, y_test)
print("Accuracy: " + str(round(accuracy*100,2)) + "%")

Accuracy: 97.14%
