# Support Vector Machine

In [77]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [78]:
df = pd.read_csv('breast-cancer-wisconsin.data')

## Data Exploration

In [79]:
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marge_adhesion,single_epith_cell,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Data Cleaning

#### There are many values in the dataset that are marked as '?'. We are going to replace them with a *Outlier*. 
We also could have dropped those rows altogether.

In [80]:
df.replace('?',-99999,inplace=True)
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marge_adhesion,single_epith_cell,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


#### Now, we need to drop the id column from the DataFrame, as it's existence would severly lower the accuracy of the K Nearest Neighbors algorithm.

In [81]:
df.drop(['id'], 1, inplace=True)
df.head()

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marge_adhesion,single_epith_cell,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


## Building the Model


### Creating features and Labels

In [82]:
X = np.array(df.drop(['class'],1))
y = np.array(df['class'])

### Preprocessing/Scaling
`sklearn.preprocessing` package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators.

*Standardization of datasets* is a common requirement for many machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like standard normally distributed data: *Gaussian with zero mean and unit variance.*

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

### Splitting the dataset

In [83]:
from sklearn.model_selection import train_test_split

In [84]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

### Preprocessing

In [85]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [86]:
sc = StandardScaler()
sc.fit(X_train, y_train)



StandardScaler(copy=True, with_mean=True, with_std=True)

### Importing the Model

In [87]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVR
from sklearn.svm import SVR

### Create and fit a Classifier
The LinearSVC class regularizes the bias term, so you should center the training set first by subtracting its mean. This is
automatic if you scale the data using the StandardScaler. Moreover, make sure you set the loss hyperparameter to "hinge", as
it is not the default value. Finally, for better performance you should set the dual hyperparameter to False, unless there are more features than training instances (we will discuss duality later in the chapter)

In [88]:
# SVC

svc_clf = SVC()
svc_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [89]:
# LinearSVC[LinearSVC(C=1, loss="hinge")]

linearsvc_clf = LinearSVC(C=1, loss="hinge")
linearsvc_clf.fit(X_train, y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [90]:
# Another option [SVC(kernel="linear", C=1)]

svc_lin_clf = SVC(kernel="linear", C=1)
svc_lin_clf.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [91]:
# Another option [SGDClassifier]

svm_sgd_clf = SGDClassifier(loss='hinge',max_iter=5, tol=None)
svm_sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [92]:
# Many datasets are not linearly separable. Nonlinear SVM Classification using 
# PolynomialFeatures transformer [LinearSVC(C=10, loss="hinge")]
X_t,y_t= X_train,y_train
pf = PolynomialFeatures(degree=3)
pf.fit(X_t, y_t)
polynomial_svm_clf = LinearSVC(C=10, loss="hinge")
polynomial_svm_clf.fit(X_t, y_t)

LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [93]:
# Polynomial Kernel: The 'kernel trick' makes it possible to get the same result as if you added many 
# polynomial features, even with very high-degree polynomials[SVC(kernel="poly", degree=3, coef0=1, C=5))]

poly_kernel_svm_clf = SVC(kernel="poly", degree=2, coef0=1, C=5)
poly_kernel_svm_clf.fit(X_train, y_train)

SVC(C=5, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [94]:
# Gaussian RBF Kernel
# Gamma acts like a regularization hyperparameter: if  overfitting, reduce & if underfitting,  increase (similar to the C).

rbf_kernel_svm_clf = SVC(kernel="rbf", gamma=1, C=0.001)
rbf_kernel_svm_clf.fit(X_train, y_train)

SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Prediction

In [95]:
pred_svc = svc_clf.predict(X_test)
pred_linearsvc = linearsvc_clf.predict(X_test)
pred_svc_lin = svc_lin_clf.predict(X_test)
pred_svm_sgd = svm_sgd_clf.predict(X_test)
pred_polynomial_svm = polynomial_svm_clf.predict(X_test)
pred_poly_kernel_svm = poly_kernel_svm_clf.predict(X_test)
pred_rbf_kernel_svm = rbf_kernel_svm_clf.predict(X_test)

## Evaluation

In [96]:
accuracy = svc_clf.score(X_test, y_test)
accuracy

0.93571428571428572

In [97]:
accuracy = linearsvc_clf.score(X_test, y_test)
accuracy

0.94999999999999996

In [98]:
accuracy = svc_lin_clf.score(X_test, y_test)
accuracy

0.94999999999999996

In [99]:
accuracy = svm_sgd_clf.score(X_test, y_test)
accuracy

0.65000000000000002

In [100]:
accuracy = polynomial_svm_clf.score(X_test, y_test)
accuracy

0.97142857142857142

In [101]:
accuracy = poly_kernel_svm_clf.score(X_test, y_test)
accuracy

0.32142857142857145

In [102]:
accuracy = rbf_kernel_svm_clf.score(X_test, y_test)
accuracy

0.66428571428571426

In [103]:
from sklearn.metrics import classification_report, confusion_matrix

In [104]:
print(confusion_matrix(y_test,pred_linearsvc))

[[88  5]
 [ 2 45]]


In [105]:
print(confusion_matrix(y_test,pred_svc_lin))

[[86  7]
 [ 0 47]]


In [106]:
print(confusion_matrix(y_test,pred_svc))

[[84  9]
 [ 0 47]]


In [107]:
print(confusion_matrix(y_test,pred_svm_sgd))

[[91  2]
 [47  0]]


In [108]:
print(confusion_matrix(y_test,pred_polynomial_svm))

[[89  4]
 [ 0 47]]


In [109]:
print(confusion_matrix(y_test,pred_poly_kernel_svm))

[[45 48]
 [47  0]]


In [110]:
print(confusion_matrix(y_test,pred_rbf_kernel_svm))

[[93  0]
 [47  0]]


In [111]:
ex = np.array([[4,2,1,1,1,2,3,2,1],[10,4,5,2,1,2,5,2,1],[4,2,1,1,1,2,3,2,1]])
ex = ex.reshape(len(ex),-1)

ex_pred = svm.predict(ex)
print(ex_pred)

NameError: name 'svm' is not defined