# Support Vector Machine

In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [59]:
# Load the dataset from a local file. read_csv is called with its defaults,
# so the file is parsed as comma-separated with the first row used as header.
df = pd.read_csv('breast-cancer-wisconsin.data')

## Data Exploration

In [60]:
# Preview the first rows to check the column names and typical values.
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marge_adhesion,single_epith_cell,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Data Cleaning

#### There are many values in the dataset that are marked as '?'. We are going to replace them with an *outlier* value (-99999).
We could also have dropped those rows altogether.

In [61]:
# Swap every '?' placeholder for the sentinel -99999 (rebinding `df` rather
# than mutating it in place), then preview the result.
df = df.replace(to_replace='?', value=-99999)
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marge_adhesion,single_epith_cell,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


#### Now, we need to drop the id column from the DataFrame, as its presence would severely lower the accuracy of the Support Vector Machine classifier.

In [62]:
# Remove the `id` column. `columns=` replaces the positional axis argument
# (`df.drop(['id'], 1)`), which was deprecated in pandas 1.3 and raises a
# TypeError on pandas 2.0+. `df` is rebound instead of using inplace=True.
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marge_adhesion,single_epith_cell,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


## Building the Model


### Creating features and Labels

In [63]:
# Feature matrix: every column except the target `class`.
# `columns=` is used because passing the axis positionally
# (`df.drop(['class'], 1)`) is no longer accepted by pandas 2.0+.
X = np.array(df.drop(columns=['class']))
# Label vector: the `class` column.
y = np.array(df['class'])

### Preprocessing/Scaling
The `sklearn.preprocessing` package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators.

*Standardization of datasets* is a common requirement for many machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like standard normally distributed data: *Gaussian with zero mean and unit variance.*

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

### Splitting the dataset

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=99)

### Preprocessing

### Importing the Model

In [66]:
from sklearn.svm import SVC

### Create and fit the SVM Classifier

In [79]:
# Support vector classifier with a linear kernel; every other hyperparameter
# is left at its scikit-learn default.
svm = SVC(kernel='linear')
# Train on the training split. fit() returns the estimator, so as the last
# expression of the cell it also displays the model's parameters.
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Prediction

In [80]:
# Predict a class label for every sample in the held-out test set.
pred = svm.predict(X_test)
print(pred)

[4 4 4 2 2 2 4 2 2 2 4 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 4 2 2 2 2 4 2 2 2 4
 4 2 2 2 2 2 4 2 2 2 4 4 2 4 2 2 2 2 2 2 4 2 2 2 2 2 2 2 4 2 2 4 2 2 2 2 2
 4 4 4 2 4 4 4 2 2 2 2 4 2 2 2 2 4 2 2 2 4 4 2 2 2 4 2 2 4 2 2 2 4 4 4 2 2
 2 4 2 2 4 2 4 2 4 4 2 4 2 2 2 2 2 2 4 4 4 4 2 4 4 2 4 4 2]


## Evaluation

In [81]:
# Mean accuracy of the classifier on the held-out test set.
accuracy = svm.score(X_test, y_test)
accuracy

0.9642857142857143

In [82]:
from sklearn.metrics import classification_report, confusion_matrix

In [83]:
# Confusion matrix of true vs. predicted test labels; in scikit-learn's
# convention rows are the true classes and columns the predicted classes.
print(confusion_matrix(y_test,pred))

[[92  3]
 [ 2 43]]


In [78]:
# Three hand-made samples with nine feature values each (one per feature
# column used for training); the first and third rows are identical.
ex = np.array([
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [10, 4, 5, 2, 1, 2, 5, 2, 1],
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
])
# Force a 2-D (n_samples, n_features) layout before predicting.
ex = ex.reshape(ex.shape[0], -1)

ex_pred = svm.predict(ex)
print(ex_pred)

[2 4 2]
