# One-class SVM

>One-class SVM is an unsupervised algorithm that learns a decision function for novelty detection: classifying new data as similar or different to the training set.

[Scikit-learn documentation](https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html)

In [1]:
# Do the math ! Do the math !
import pandas as pd
import numpy as np

# Matplotlib imports
import matplotlib.pyplot as plt
import matplotlib.font_manager
from matplotlib.colors import ListedColormap

# IPython widgets and display
from IPython.display import display, clear_output
import ipywidgets as widgets

# Every scikit-learn import
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.pipeline import Pipeline

from Data import Data

In [2]:
data = Data()

columns = ["rrTime" + str(index) for index in range(15)]
columns.extend(["ppTime" + str(index) for index in range(15)])
columns.extend(["rpTime" + str(index) for index in range(15)])
columns.extend(["prTime" + str(index) for index in range(15)])

xx, yy = np.meshgrid(np.linspace(-10, 10, 500), np.linspace(-10, 10, 500))
Xpred = np.array([xx.ravel(), yy.ravel()] + [np.repeat(0, xx.ravel().size) for _ in range(58)]).T

## Extracting the data

Choose the user tested in the One-Class SVM. The user's index will be used to build a training and testing set solely containing samples from the chosen user.

The rest of the data will be used as outliers.

In [3]:
def getUserData(userId):
    # We select the data of the user
    studiedUserData = data.table.loc[(data.table["user_id"] == userId), columns].astype('float')
    # We select the rest of the data. It will be used as outliers.
    otherUsersData = data.table.loc[(data.table["user_id"] != userId), columns].astype('float')
    
    return [studiedUserData, otherUsersData]


def buildData(userId, trainSize):
    studiedUserData, otherUsersData = getUserData(userId)

    # Use a ShuffleSplit to select a random subset of the user's data
    splitter = ShuffleSplit(n_splits=1, train_size=trainSize, test_size=None)
    X_train_indices, X_test_indices = list(splitter.split(studiedUserData))[0]

    X_train = studiedUserData.iloc[X_train_indices]
    X_test = studiedUserData.iloc[X_test_indices]

    # Abnormal data (used only for prediction)
    X_outliers = otherUsersData
    
    return [X_train, X_test, X_outliers]

## Creating and training the SVM

This block will define a `learn(...)` function that can be used on built data from the `buildData(...)` function.

### The Pipeline

The pipeline used includes a MinMaxScaler and an OneClassSVM classifier.

### MinMaxScaler parameters

According to https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf data for SVM should be linearly scaled to the range \[0, 1\] or \[-1, 1\] Also, the scaler should be fitted on the training data only before transforming the rest of the data.

### OneClassSVM parameters

In the case where there is much more samples than features, rbf kernel is usually the way to go. Setting the kernel to linear raised A LOT of false positive on abnormal data (~97% !!)

In [4]:
def learn(train, nu=0.01, gamma=0.01):
    
    # Create the scaler and the classifier
    scaler = MinMaxScaler((0, 1))
    classifier = OneClassSVM(kernel="rbf")
    
    # Create the pipeline
    pipeline = Pipeline([('scaler', scaler), ('classifier', classifier)])
    pipeline.set_params(classifier__nu=nu, classifier__gamma=gamma)
    
    # Fit the data
    pipeline.fit(X_train)
    
    return pipeline
    
    
def scoreFull(pipeline, train, test, outliers):
    print(len(test))
    
    # Predict on all data
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    y_pred_outliers = pipeline.predict(X_outliers)
    
    TP = y_pred_test[y_pred_test == 1].size
    TN = y_pred_outliers[y_pred_outliers == -1].size
    FP = y_pred_outliers[y_pred_outliers == 1].size
    FN = y_pred_test[y_pred_test == -1].size
    
    return {
        "TP": TP,
        "TN": TN,
        "FP": FP,
        "FN": FN,
        "FNT": y_pred_train[y_pred_train == -1].size,
        "precision": TP / (TP + FP + 1),
        "recall": TP / (TP + FN + 1),
        "f1": (2 * TP) / (2 * TP + FP + FN),
        "clf": pipeline
    }


def score(pipeline, train, test):
    
    # Predict on all data
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    
    TP = y_pred_test[y_pred_test == 1].size
    FN = y_pred_test[y_pred_test == -1].size
    
    return {
        "TP": TP,
        "FN": FN,
        "FNT": y_pred_train[y_pred_train == -1].size,
        "recall": TP / (TP + FN + 1),
        "clf": pipeline
    }

## Learn and score one user

In [5]:
user = widgets.Dropdown(options=data.users, value=1, description='User:', disabled=False)
display(user)

Dropdown(description='User:', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, …

In [6]:
X_train, X_test, X_outliers = buildData(user.value, 10)
model = learn(X_train, nu=0.9500000000000001, gamma=1e-08)
results = scoreFull(model, X_train, X_test, X_outliers)

print("False negative on training data : {0}/{1}, {2:.3f}%".format(results["FNT"],
                                                                  X_train.shape[0],
                                                                  results["FNT"] / X_train.shape[0] * 100))
print("False negative on testing data : {0}/{1}, {2:.3f}%".format(results["FN"],
                                                                 X_test.shape[0],
                                                                 results["FN"] / X_test.shape[0] * 100))
print("False positive on abnormal data : {0}/{1}, {2:.3f}%".format(results["FP"],
                                                                  X_outliers.shape[0],
                                                                  results["FP"] / X_outliers.shape[0] * 100))

print("precision: {},\nrecall: {},\nf1: {}".format(results["precision"], results["recall"], results["f1"]))

45
False negative on training data : 10/10, 100.000%
False negative on testing data : 43/45, 95.556%
False positive on abnormal data : 0/3715, 0.000%
precision: 0.6666666666666666,
recall: 0.043478260869565216,
f1: 0.0851063829787234


## Plotting the decision function

Apparently, this is not very significative. When the 60-dimensional points are projected on a 2-dimensional plane, abnormal data overlap the area included inside the decision function.

Plus, I do have issues for plotting the frontiers.

To have a more significative representation, we should search for the wo most important features by using features selection techniques as PCA, ANOVA or factor analysis. Then get the decision function for these features and plot it. The overlap should be minimized and the plot more representative.

This cell is disabled by setting it to raw text. Turn it back to Python to use it.

## Searching the best parameters with cross-validation

To find the best hyperparameters of our SVM with cross-validation, we need to know what should be our scoring function. Accuracy is by default the most simple score for parameters optimization. But in some cases, we may resort to precision, recall or f1-score.

[This article](https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9) sums up the how and why of those differents scores.

In our case, a false negative does not bring any harm to our user, only some annoyances (of course, there is a limit in this annoyance). However, a false positive is to be avoided as it would means that the user's computer is compromised. At first glance, using the precision may be the way to go.

F-1 score is particularly useful when we need a balance between the precision and the recall, and when there is an uneven class distribution, which is our case when we make our predictions on all the impostor data.

### Simple cross-validation

**This only applies when we have impostor data and test data to feed to the model**

In [None]:
grid = {'gamma' : np.logspace(-9, 3, 13),
        'nu' : np.linspace(0.01, 0.99, 99)}

usersCounts = np.array(data.usersCounts)
resultsTable = pd.DataFrame(columns=["User", "Best f1", "Precision", "Recall", "Gamma", "Nu"])

clear_output(wait=False)
display(resultsTable)

for user in usersCounts[(usersCounts[:, 1] > 30)]:
    X_train, X_test, X_outliers = buildData(user[0], 10)
    
    smallestDiff = 1.0
    smallestDiffParameters = None

    biggestF1 = 0
    biggestF1Parameters = None
    bestRecall = 0
    bestPrecision = 0

    for hyperparams in ParameterGrid(grid):
        results = learn(X_train, X_test, X_outliers, nu=hyperparams["nu"], gamma=hyperparams["gamma"])

        if results["f1"] > biggestF1:
            biggestF1 = results["f1"]
            biggestF1Parameters = hyperparams
            bestRecall = results["recall"]
            bestPrecision = results["precision"]
    
    resultsTable.loc[resultsTable.size] = [
        user[0],
        biggestF1,
        bestRecall,
        bestPrecision,
        biggestF1Parameters["gamma"],
        biggestF1Parameters["nu"]
    ]
    
    resultsTable["User"] = resultsTable["User"].astype(np.int32)
    
    clear_output(wait=True)
    display(resultsTable)

### Better cross-validation

Reference for this part : *Eude, T & Chang, Chuan. (2017). One-class SVM for biometric authentication by keystroke dynamics for remote evaluation. Computational Intelligence. 34. 10.1111/coin.12122.*

![Optimization protocol](ocsvm-optimization.png)

The positive data is split (as we want, in our case we may limit it to ten samples to simulates our use case). The first part constitute our training data whereas the second is mixed with impostor data and is used later in the evaluation of the model.

In this figure, the Data 1 used for the cross-validation (made by a subset of the positive data) is folded 5 times and used in a classic cross-validation algorithm where the recall is used as the score.

In [8]:
studiedUserData, otherUsersData = getUserData(1)

# Use a ShuffleSplit to select 80% of the user's data
splitter = ShuffleSplit(n_splits=1, train_size=0.8, test_size=None)
dataOneIndices, dataTwoIndices = list(splitter.split(studiedUserData))[0]

dataOne = studiedUserData.iloc[dataOneIndices]
dataTwoPositive = studiedUserData.iloc[dataTwoIndices]

splitter = ShuffleSplit(n_splits=1, train_size=10, test_size=None)
trainIndices, testIndices = list(splitter.split(dataOne))[0]

train = dataOne.iloc[trainIndices]
test = dataOne.iloc[testIndices]

print("Train len: {}, test len: {}, data2 positive: {}, data2 negative: {}".format(
    len(train), len(test), len(dataTwoPositive), len(otherUsersData)
))

grid = {'gamma' : np.logspace(-9, 3, 13),
        'nu' : np.linspace(0.01, 0.99, 99)}

bestRecall = 0
bestRecallParameters = None

for hyperparams in ParameterGrid(grid):
    model = learn(train, nu=hyperparams["nu"], gamma=hyperparams["gamma"])
    results = score(model, train, test)

    if results["recall"] > bestRecall:
        bestRecall = results["recall"]
        bestRecallParameters = hyperparams
        
print(bestRecallParameters)

model = learn(train, nu=bestRecallParameters["nu"], gamma=bestRecallParameters["gamma"])
results = scoreFull(model, None, dataTwoPositive, otherUsersData)

print(results)

Train len: 10, test len: 34, data2 positive: 11, data2 negative: 3715
{'gamma': 1e-08, 'nu': 0.31}
11
{'TP': 35, 'TN': 3694, 'FP': 21, 'FN': 10, 'FNT': 0, 'precision': 0.6140350877192983, 'recall': 0.7608695652173914, 'f1': 0.693069306930693, 'clf': Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=1e-08, kernel='rbf',
      max_iter=-1, nu=0.31, random_state=None, shrinking=True, tol=0.001,
      verbose=False))])}


## Metrics

According to this page : https://stats.stackexchange.com/questions/192530/metrics-for-one-class-classification

According to Lee, Wee Sun, and Bing Liu. ["Learning with positive and unlabelled examples using weighted logistic regression."](https://www.aaai.org/Papers/ICML/2003/ICML03-060.pdf) ICML. Vol. 3. 2003. (creators of One-Class SVM)

The best formula may be :

\begin{equation*}
\frac{r^2}{\Pr(Y=1)}
\end{equation*}

What the hell this represents ? Who knows ?