# Scoring an Anomaly Detection Dataset

This is a proposal for scoring an anomaly detection dataset to measure how difficult it is for anomaly detection methods applied to it. It is largely based on the paper https://arxiv.org/pdf/1503.01158v2.pdf.

We focus on two major properties, namely:
1. Relative frequency: the ratio of anomalies to all data points in a dataset
2. Semantic variation / clusteredness

In [1]:
# Import necessary libraries and path relative to project
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import tabulate

In [2]:
# Setup the dataset

# Chunk size (rows per chunk) for reading data
chunksize = 10000

# Path to the dataset CSV. Change this to score a different dataset.
dataset_file = './data/iris_setosa_anomaly.csv'

print("Loading dataset '{}'...".format(dataset_file))

# Collect the chunks and concatenate once at the end. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and growing a frame chunk by
# chunk re-copies the accumulated rows on every iteration.
chunks = []
for i, chunk in enumerate(pd.read_csv(dataset_file, header=None, chunksize=chunksize)):
    print("Reading chunk %d" % (i + 1))
    chunks.append(chunk)

# pd.concat raises on an empty list, so fall back to an empty frame
data = pd.concat(chunks) if chunks else pd.DataFrame()

print("Done loading dataset...")

# The last column holds the class label; every other column is a feature
input_dim = len(data.columns) - 1
print("Input Dimensionality: %d" % (input_dim))

# Partition the feature columns by class label:
# label 1 rows go to positive_data, label -1 rows go to negative_data
positive_data = data[data[input_dim] == 1].iloc[:, :input_dim]
# Only the first 5 rows labelled -1 are kept here
negative_data = data[data[input_dim] == -1].iloc[:, :input_dim].head(5)

Loading dataset './data/iris_setosa_anomaly.csv'...
Reading chunk 1
Done loading dataset...
Input Dimensionality: 3


  data = data.append(chunk)


## Relative Frequency

This is simply the ratio of the number of anomalies to the total number of data points in the dataset, expressed as a percentage.

In [3]:
def score_relative_frequency(X):
    """Return the percentage of rows in ``X`` that are labelled as anomalies.

    The class label is read from the last column of ``X``; rows whose label
    equals -1 are counted as anomalies.

    Args:
        X: A pandas DataFrame whose last column is the class label.

    Returns:
        The number of anomaly rows divided by the total number of rows,
        multiplied by 100.

    Raises:
        ValueError: If ``X`` is empty, since the ratio is undefined.
    """
    if X.empty:
        raise ValueError("Cannot compute relative frequency of an empty dataset")

    # Select the label column by position so this also works when the column
    # labels are not the default 0..n-1 integers (e.g. a CSV read with a header).
    labels = X.iloc[:, -1]
    n_anomalies = int((labels == -1).sum())

    # Return the score in percentage format
    return (n_anomalies / len(X)) * 100

# Report the anomaly percentage of the loaded dataset
rf_percent = score_relative_frequency(data)
print("Relative Frequency: %0.5f" % rf_percent)

Relative Frequency: 33.33333


## Semantic Variation

A normalized clusteredness measure given by the following equation:

$$\log(\frac{\sigma^2_{n}}{\sigma^2_{a}})$$

where:

* $\sigma^2_{n}$ is the variance of normal data
* $\sigma^2_{a}$ is the variance of anomaly data

To deal with multi-dimensional data, we compute $\sigma^2$ from the covariance matrix of the data $X$, given by:

$$
\mathbf{Var}(X) = \mathbf{E}[(X - \mathbf{E}(X))(X - \mathbf{E}(X))^{T}]
\\
=
\begin{bmatrix}
\mathbf{Var}(X_1) & \cdots &
\mathbf{Cov}(X_1, X_p) \\
\vdots & \ddots & \vdots \\
\mathbf{Cov}(X_p, X_1) & \cdots &
\mathbf{Var}(X_p)
\end{bmatrix} 
\\
=
\frac{1}{n - 1}
\begin{bmatrix}
\sum_{i=1}^n(X_{i1} - \hat{X}_{1})^2 & \cdots &
\sum_{i=1}^n(X_{i1} - \hat{X}_{1})(X_{ip} - \hat{X}_{p}) \\
\vdots & \ddots & \vdots \\
\sum_{i=1}^n(X_{ip} - \hat{X}_{p})(X_{i1} - \hat{X}_{1})  & \cdots &
\sum_{i=1}^n(X_{ip} - \hat{X}_{p})^2
\end{bmatrix}
$$

We then take the trace of the covariance matrix to give us the overall variance:

$$
\sigma^2 = \operatorname{tr}({\mathbf{Var}(X)})
$$

In [4]:
def score_semantic_variation(X):
    """Return the log ratio of normal-data variance to anomaly-data variance.

    Rows are split on the class label in the last column of ``X`` (1 for
    normal data, -1 for anomalies). For each group the overall variance is
    the trace of the covariance matrix of the feature columns, and the score
    is ``log(var_normal / var_anomaly)``.

    Args:
        X: A pandas DataFrame whose last column is the class label and whose
            remaining columns are the features (expected to be numeric).

    Returns:
        The natural logarithm of the variance ratio. A group with fewer than
        two rows has no sample variance, so the score is not meaningful in
        that case (expected to be NaN).
    """
    # Split by position rather than by column label so this also works when
    # the column labels are not the default 0..n-1 integers.
    labels = X.iloc[:, -1]
    features = X.iloc[:, :-1]

    # Partition the features into normal rows and anomaly rows
    normal = features[(labels == 1).values]
    anomalies = features[(labels == -1).values]

    # Trace of the covariance matrix = sum of the per-feature variances
    var_n = np.trace(normal.cov().values)
    var_a = np.trace(anomalies.cov().values)

    return np.log(var_n / var_a)
    
# Report the semantic variation score of the loaded dataset
sv_score = score_semantic_variation(data)
print("Semantic variation: %0.5f" % sv_score)

Semantic variation: 0.71245


## Test against public datasets

In [5]:
# Datasets to score. Each entry gives a display name ("name") and the
# location of its CSV file ("location").
# NOTE(review): the locations are absolute paths under /home/ralampay,
# presumably specific to the original author's machine; adjust them before
# running this elsewhere.
datasets = [
    { "name": "annthyroid", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/annthyroid-training-3251546f-bb22-48d8-ad40-13d51e75bf5f.csv" },
    { "name": "pageblocks", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/pageblocks-training-19a7bd3b-9640-4a5b-9ca2-32eb4dc08194.csv" },
    { "name": "creditcardfraud", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/creditcardfraud-training-1b5b3362-8ae5-419f-8a93-bec611aae5ac.csv" },
    { "name": "kddcup99", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/kddcup99-training-1ee21411-1cae-43a4-9575-f94c4bdf7ae4.csv" },
    { "name": "shuttle", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/shuttle-training-eed45643-1cf0-4cd3-9ac9-b7da0538565e.csv" },
    { "name": "backdoor", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/backdoor-training-155cbadd-b98b-480e-b3ae-fc172b620dba.csv" },
    { "name": "donors", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/donors-training-b2881566-edd9-4d15-a710-b0927d951d52.csv" },
    { "name": "magic04", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/magic04-training-ba90e6a7-3fd6-405f-9aed-54e39864582f.csv" },
    { "name": "mammography", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/mammography-training-ea58423d-2f5d-4c25-b5dd-4e524b1ad202.csv" },
    { "name": "waveform", "location": "/home/ralampay/workspace/pyneural/notebooks/partitions/waveform-training-7aa452b3-6387-4fd2-842a-696065e975d7.csv" }
]

# Result table: the first row holds the column titles, followed by one
# [name, relative frequency, semantic variation] row per dataset
scores = [["Dataset", "Relative Frequency", "Semantic Variation"]]

# Chunk size (rows per chunk) for reading data; the same for every dataset
chunksize = 10000

for o in datasets:
    # Path of the CSV file for this dataset
    dataset_file = o["location"]

    print("Loading dataset '{}'...".format(dataset_file))

    # Read all chunks, then concatenate once. Concatenating onto a growing
    # frame inside the loop re-copies the accumulated rows for every chunk,
    # and starting from an empty DataFrame relies on pandas ignoring empty
    # entries when determining result dtypes, which is deprecated.
    chunks = list(pd.read_csv(dataset_file, header=None, chunksize=chunksize))
    # pd.concat raises on an empty list, so fall back to an empty frame
    data = pd.concat(chunks) if chunks else pd.DataFrame()

    # Number of feature columns (the last column is the class label).
    # Not used by the scoring below; kept so the notebook-level name stays
    # in sync with the most recently loaded dataset.
    input_dim = len(data.columns) - 1

    score_rf = score_relative_frequency(data)
    score_sv = score_semantic_variation(data)
    scores.append([
        o["name"],
        score_rf,
        score_sv,
    ])

Loading dataset '/home/ralampay/workspace/pyneural/notebooks/partitions/annthyroid-training-3251546f-bb22-48d8-ad40-13d51e75bf5f.csv'...
Loading dataset '/home/ralampay/workspace/pyneural/notebooks/partitions/pageblocks-training-19a7bd3b-9640-4a5b-9ca2-32eb4dc08194.csv'...
Loading dataset '/home/ralampay/workspace/pyneural/notebooks/partitions/creditcardfraud-training-1b5b3362-8ae5-419f-8a93-bec611aae5ac.csv'...
Loading dataset '/home/ralampay/workspace/pyneural/notebooks/partitions/kddcup99-training-1ee21411-1cae-43a4-9575-f94c4bdf7ae4.csv'...
Loading dataset '/home/ralampay/workspace/pyneural/notebooks/partitions/shuttle-training-eed45643-1cf0-4cd3-9ac9-b7da0538565e.csv'...
Loading dataset '/home/ralampay/workspace/pyneural/notebooks/partitions/backdoor-training-155cbadd-b98b-480e-b3ae-fc172b620dba.csv'...
Loading dataset '/home/ralampay/workspace/pyneural/notebooks/partitions/donors-training-b2881566-edd9-4d15-a710-b0927d951d52.csv'...
Loading dataset '/home/ralampay/workspace/pyneu

In [6]:
# Display result in tabular format. The first row of `scores` holds the
# column titles, so render it as the table header instead of as a data row.
tabulate.tabulate(scores, headers="firstrow", tablefmt='html')

0,1,2
Dataset,Relative Frequency,Semantic Variation
annthyroid,6.2,0.08383434074608195
pageblocks,7.806476484194294,4.53706272063149
creditcardfraud,0.13773378729265268,-0.05594329526420958
kddcup99,0.20037988686885552,11.002328556599126
shuttle,6.918779544477868,5.143372055024244
backdoor,2.3431340600658053,-0.26974687256027857
donors,5.913174378074899,-0.001947262023485022
magic04,35.00531349628056,1.2236408577475006
mammography,1.456796867886734,-0.03327060424053471
