In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
# use seaborn plotting defaults
import seaborn as sns; sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC # "Support vector classifier"

# 6.2 SVM for Classification - Solution

Sources for Notebook:
- Andreas Mueller, Scipy 2016 Sklearn
- Jake VanderPlas, Python for Data Science Handbook

## 6.2.1 Example 1 - Linear Decision Boundaries

### 6.2.1.1 Make data

In [None]:
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; the generators are exposed directly by `sklearn.datasets`.
from sklearn.datasets import make_blobs

# 50 samples drawn around 2 centres; the fixed seed keeps the data identical
# from run to run.
X, y = make_blobs(n_samples=50, centers=2,
                  random_state=0, cluster_std=0.60)

### 6.2.1.2 Plot data

In [None]:
# Scatter the first feature against the second, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn');

### 6.2.1.3 Preprocess data for SVM: scaling features (0-1)

In [None]:
# Min-max scaling: map every feature onto the [0, 1] interval.
# NOTE: despite the `_on_training` names, these statistics are computed on the
# full data set X, because the train/test split only happens in a later cell.
min_on_training = X.min(axis=0)                       # per-feature minimum
range_on_training = X.max(axis=0) - min_on_training   # per-feature (max - min)

# Shift by the minimum, then divide by the range.
X_scaled = (X - min_on_training) / range_on_training

# Sanity check: the printed minima should be 0 and the maxima 1.
print("Minimum for each feature\n{}".format(X_scaled.min(axis=0)))
print("Maximum for each feature\n {}".format(X_scaled.max(axis=0)))

In [None]:
# Same scatter plot as before, now on the min-max scaled features.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, s=50, cmap='autumn');

### 6.2.1.4 Split data into train and test sets

In [None]:
# Hold out 25% of the scaled samples for testing. `stratify=y` stratifies the
# split on the class labels and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
                                                    test_size=0.25,
                                                    random_state=1234,
                                                    stratify=y)

### 6.2.1.5 Fit model: linear SVC 

In [None]:
#fit the model
model = SVC(kernel='linear', C=10000000000)
model.fit(X_train, y_train)

### 6.2.1.6 Accuracy on test set

In [None]:
# Score the fitted model on the held-out test split.
model.score(X_test, y_test)

What is happening? Let's plot the decision boundaries of the SVM model.

In [None]:
def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision function for a 2D SVC.

    Draws the contour levels -1, 0 and +1 of ``model.decision_function``
    (margin, boundary, margin) over the current extent of the axes, so the
    data should already be plotted on the axes when this is called. The axis
    limits are restored afterwards.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``decision_function`` and, when ``plot_support`` is
        true, a ``support_vectors_`` array with (at least) two columns.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to ``plt.gca()``.
    plot_support : bool, default True
        Whether to circle the support vectors.
    """
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # create a 30 x 30 grid spanning the current view to evaluate the model on
    x = np.linspace(xlim[0], xlim[1], 30)
    y = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(y, x)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)

    # plot decision boundary (solid) and margins (dashed)
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

    # plot support vectors as hollow circles; the edge colour has to be given
    # explicitly, otherwise the edges inherit the empty face colour and the
    # markers are invisible
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='none',
                   edgecolors='k')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

In [None]:
# Plot all scaled points first so the axes limits are set, then overlay the
# decision function of the fitted linear model.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(model);

This is the dividing line that maximizes the margin between the two sets of points.
Notice that a few of the training points just touch the margin: they are indicated by the black circles in this figure.
These points are the pivotal elements of this fit, and are known as the *support vectors*, and give the algorithm its name.
In Scikit-Learn, the identity of these points is stored in the ``support_vectors_`` attribute of the classifier.

In [None]:
# Coordinates of the support vectors found by the linear model.
model.support_vectors_

## 6.2.2 Example 2 - Non-Linear Decision Boundaries

### 6.2.2.1 Get the data and construct the model

In [None]:
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; the generators are exposed directly by `sklearn.datasets`.
from sklearn.datasets import make_circles

# Seed the generator so the noisy circles, and every figure derived from
# them, are reproducible across runs.
X, y = make_circles(1000, factor=.1, noise=.1, random_state=0)

# Min-max scale every feature to [0, 1].
# NOTE: despite the `_on_training` names, these statistics are computed on the
# full data set, before the train/test split below.
# Compute the minimum value per feature
min_on_training = X.min(axis=0)
# Compute the range of each feature (max - min)
range_on_training = (X - min_on_training).max(axis=0)

# subtract the min, divide by range
# afterward, min=0 and max=1 for each feature
X = (X - min_on_training) / range_on_training
print("Minimum for each feature\n{}".format(X.min(axis=0)))
print("Maximum for each feature\n {}".format(X.max(axis=0)))

# make training and test set (stratified 75/25 split)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=1234,
                                                    stratify=y)

The linear kernel of SVC can't handle non-linear boundaries, as the example below illustrates.

In [None]:
# Fit a linear-kernel SVC on the full circles data, then overlay its decision
# function on the scatter plot (support vectors are not drawn).
model2 = SVC(kernel='linear')
model2.fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(model2, plot_support=False);

We will need to find another kernel that can fit non-linear decision boundaries.

### 6.2.2.2 Fit model: SVC with rbf kernel

In [None]:
# RBF-kernel SVC with a large penalty parameter C (10**6), fitted on the
# full circles data.
clf = SVC(kernel='rbf', C=10 ** 6, gamma='auto')
clf.fit(X, y)

In [None]:
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
# The support vectors are drawn exactly once, by the explicit scatter below,
# so the helper is told not to draw them as well.
plot_svc_decision_function(clf, plot_support=False)
# Hollow markers need an explicit edge colour; without it the edges inherit the
# empty face colour and the circles are invisible.
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=300, lw=1, facecolors='none', edgecolors='k');

Using this kernelized support vector machine, we learn a suitable nonlinear decision boundary.
This kernel transformation strategy is used often in machine learning to turn fast linear methods into fast nonlinear methods, especially for models in which the kernel trick can be used.

## 6.2.3 Tuning the SVM Parameters: C and Gamma

Visualisation of how different C and Gamma values change the decision boundary.

- Large C / gamma: Lower bias, high variance.
- Small C / gamma: Higher bias, lower variance

from: http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html:

Intuitively, the gamma parameter defines how far the influence of a single training example reaches, with low values meaning ‘far’ and high values meaning ‘close’. The gamma parameter can be seen as the inverse of the radius of influence of samples selected by the model as support vectors.
The C parameter trades off misclassification of training examples against simplicity of the decision surface. A low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly by giving the model freedom to select more samples as support vectors.

In [None]:
# Import only the helper this notebook calls instead of `import *`, so the
# notebook namespace is not silently filled with (or overwritten by) other
# names from the module.
from plot_svm_interactive import plot_svm_interactive

In [None]:
# Run the helper imported from the local `plot_svm_interactive` module
# (presumably an interactive C / gamma explorer — its source is not shown here).
plot_svm_interactive()

## 6.2.4 Task 2

Load the data 'SVM.csv'.
- Look at your data. 
- Make a linear SVM model to predict y using x1 and x2. Before running it:
    - Do you think it will perform well?
    - Where will the margin be?
- Evaluate your model
- Make a non-linear SVM model. Use kernel='rbf' and try C=10 and C=10000.
- Evaluate the model

## 6.2.5 Solution

### 6.2.5.1 Explore the data

In [None]:
# Load the exercise data; the first CSV column is used as the index.
svm_df = pd.read_csv('data/SVM.csv', index_col=0)

In [None]:
# Peek at three randomly chosen rows.
svm_df.sample(3)

In [None]:
# Scatter x1 against x2, coloured by class. x and y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally.
sns.lmplot(x='x1', y='x2', data=svm_df, hue='y', fit_reg=False);

### 6.2.5.2 Create training and testing data

In [None]:
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is the replacement named by the deprecation and returns the same
# NumPy array.
svm_mx = svm_df.values

In [None]:
# Features: the first two columns of the matrix (presumably x1 and x2).
X_svm = svm_mx[:,:2]

In [None]:
# Check the dimensions of the feature matrix.
X_svm.shape

In [None]:
# Convert the dependent variable into a binary classification target: cast the
# third column to int and subtract 1 (presumably mapping labels {1, 2} to
# {0, 1} — verify against the data).
Y_svm = svm_mx[:,2].astype(int) - 1

In [None]:
# Inspect the first ten target values.
Y_svm[:10]

In [None]:
# Check the dimensions of the target vector.
Y_svm.shape

In [None]:
# Hold out 25% of the samples for testing, stratified on the target, with a
# fixed seed. Note that this rebinds X_train / X_test from the earlier examples.
X_train, X_test, Y_train, Y_test = train_test_split(X_svm, 
                                                    Y_svm,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=Y_svm)

In [None]:
# Fit the scaler on the training split only; the later cells apply it to both
# splits through `ss.transform`.
ss = StandardScaler()
ss.fit(X_train)

### 6.2.5.3 Test SVM with linear kernel

In [None]:
# Linear-kernel SVC with otherwise default settings.
svm_linear = SVC(kernel='linear')

In [None]:
# Train on the standardised training features.
svm_linear.fit(ss.transform(X_train), Y_train)

In [None]:
# Predict on the test split, standardised with the scaler fitted on the
# training split.
pred_linear = svm_linear.predict(ss.transform(X_test))

In [None]:
# Collect the (unscaled) test features together with the true labels and the
# linear model's predictions in one frame, then peek at three random rows.
test_df = (
    pd.DataFrame(X_test, columns=['x1', 'x2'])
    .assign(actual=Y_test, pred_lin=pred_linear)
)
test_df.sample(3)

In [None]:
# Draw the test points twice: coloured by the true label, then by the linear
# SVM's prediction. x and y are passed by keyword because seaborn >= 0.12 no
# longer accepts them positionally.
for hue_col in ('actual', 'pred_lin'):
    sns.lmplot(x='x1',
               y='x2',
               data=test_df,
               hue=hue_col,
               fit_reg=False)

**When looking at these two plots, it is apparent that the SVM classifier draws a line which does not effectively detect the outer ends of the 'semi-circles'.**

In [None]:
# Let's look at the accuracy score of the linear model on the test split
accuracy_score(Y_test, pred_linear)

### 6.2.5.4 Look at how a model with a non-linear kernel performs on this problem

In [None]:
# Let's use a radial basis function as the kernel, with C=10000 (one of the
# two values the task asks to try)
svm_nonlinear = SVC(kernel='rbf', C=10000)

In [None]:
# Train on the standardised training features.
svm_nonlinear.fit(ss.transform(X_train), Y_train)

In [None]:
# Predict on the test split, standardised with the scaler fitted on the
# training split.
pred_nonlinear = svm_nonlinear.predict(ss.transform(X_test))

In [None]:
# Store the RBF model's predictions next to the earlier columns of test_df.
test_df['pred_nonlin'] = pd.Series(pred_nonlinear)

In [None]:
# Draw the test points three times: coloured by the true label, by the linear
# SVM's prediction and by the RBF SVM's prediction. x and y are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally.
for hue_col in ('actual', 'pred_lin', 'pred_nonlin'):
    sns.lmplot(x='x1',
               y='x2',
               data=test_df,
               hue=hue_col,
               fit_reg=False)

Let's visualise the decision boundaries - note that this is in the standardised scale.

In [None]:
# Standardise the training features once and reuse the result for both axes.
X_train_std = ss.transform(X_train)
plt.scatter(X_train_std[:, 0], X_train_std[:, 1], c=Y_train, s=50, cmap='autumn')
# The support vectors are drawn exactly once, by the explicit scatter below,
# so the helper is told not to draw them as well.
plot_svc_decision_function(svm_nonlinear, plot_support=False)
# Hollow markers need an explicit edge colour; without it the edges inherit the
# empty face colour and the circles are invisible.
plt.scatter(svm_nonlinear.support_vectors_[:, 0], svm_nonlinear.support_vectors_[:, 1],
            s=300, lw=1, facecolors='none', edgecolors='k');

**This looks better: the points in the region (40, 60) seem to be classified correctly now!**

In [None]:
# Let's look at the accuracy score of the RBF model on the test split
accuracy_score(Y_test, pred_nonlinear)