# Scoring an Anomaly Detection Dataset

This notebook proposes a score for an anomaly detection dataset that measures how difficult the dataset is for anomaly detection methods. It is largely based on the paper https://arxiv.org/pdf/1503.01158v2.pdf.

We focus on two major properties, namely:
1. Relative frequency / ratio of anomalies to data points in a dataset
2. Semantic variation / clusteredness

In [None]:
# Import necessary libraries and path relative to project
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import tabulate

In [None]:
# Setup the dataset

# Chunk size (rows per chunk) for reading data
chunksize = 10000

# The reference to the dataset. Change this to point at another CSV file
# whose last column is the class label (1 = normal, -1 = anomaly).
dataset_file = '../data/creditcardfraud.csv'

print("Loading dataset '{}'...".format(dataset_file))

# Read the file chunk by chunk and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per chunk
# re-copied the whole accumulated frame on every iteration.
chunks = []
for i, chunk in enumerate(pd.read_csv(dataset_file, header=None, chunksize=chunksize)):
    print("Reading chunk %d" % (i + 1))
    chunks.append(chunk)
data = pd.concat(chunks)

print("Done loading dataset...")

# Check for proper value of input dimensionality to be used by model
# (every column except the trailing class-label column).
input_dim = len(data.columns) - 1
print("Input Dimensionality: %d" % (input_dim))

# Partition the data into positive_data (label 1) and negative_data (label -1),
# keeping only the feature columns.
positive_data = data[data[input_dim] == 1].iloc[:,:input_dim]
negative_data = data[data[input_dim] == -1].iloc[:,:input_dim]

## Relative Frequency

This is simply the ratio of the number of anomalies to the total number of data points in the dataset, expressed as a percentage.

In [None]:
# Divide the number of anomalies vs the entire length of the dataset
# X: A pandas data frame
def score_relative_frequency(X):
    """Return the percentage of rows in ``X`` that are labelled as anomalies.

    The class label is read from the last column by position, so the frame
    may use integer column labels (``header=None``) or named columns.
    A label of ``-1`` marks an anomaly.

    Args:
        X: A pandas DataFrame whose last column holds the class label.

    Returns:
        The share of anomalous rows as a float percentage, or ``nan`` when
        ``X`` has no rows (the ratio is undefined in that case).
    """
    total = len(X)
    if total == 0:
        # An empty frame has no defined ratio; avoid dividing by zero.
        return float("nan")

    # Class column is always the last one; select it positionally.
    labels = X.iloc[:, -1]
    n_anomalies = int((labels == -1).sum())

    # Return the score in percentage format
    return (n_anomalies / total) * 100

# Report the relative-frequency score (in percent) of the loaded dataset
print("Relative Frequency: %0.5f" % score_relative_frequency(data))

## Semantic Variation

A normalized clusteredness measure given by the following equation:

$$\log(\frac{\sigma^2_{n}}{\sigma^2_{a}})$$

where:

* $\sigma^2_{n}$ is the variance of normal data
* $\sigma^2_{a}$ is the variance of anomaly data

To deal with multi-dimensional data, we compute $\sigma^2$ from the covariance matrix of the data $X$, which is given by:

$$
\mathbf{Var}(X) = \mathbf{E}[(X - \mathbf{E}(X))(X - \mathbf{E}(X))^{T}]
\\
=
\begin{bmatrix}
\mathbf{Var}(X_1) & \cdots &
\mathbf{Cov}(X_1, X_p) \\
\vdots & \ddots & \vdots \\
\mathbf{Cov}(X_p, X_1) & \cdots &
\mathbf{Var}(X_p)
\end{bmatrix} 
\\
=
\frac{1}{n - 1}
\begin{bmatrix}
\sum_{i=1}^n(X_{i1} - \hat{X}_{1})^2 & \cdots &
\sum_{i=1}^n(X_{i1} - \hat{X}_{1})(X_{ip} - \hat{X}_{p}) \\
\vdots & \ddots & \vdots \\
\sum_{i=1}^n(X_{ip} - \hat{X}_{p})(X_{i1} - \hat{X}_{1})  & \cdots &
\sum_{i=1}^n(X_{ip} - \hat{X}_{p})^2
\end{bmatrix}
$$

We then take the trace of the covariance matrix (the sum of the per-feature variances) to give us the overall variance:

$$
\sigma^2 = \operatorname{tr}({\mathbf{Var}(X)})
$$

In [None]:
def score_semantic_variation(X):
    """Return the log-ratio of normal-data variance to anomaly-data variance.

    The overall variance of each class is the trace of the covariance matrix
    of its feature columns. The class label is read from the last column by
    position, so the frame may use integer column labels (``header=None``) or
    named columns. Label ``1`` marks normal rows and ``-1`` marks anomalies.

    Args:
        X: A pandas DataFrame whose last column holds the class label and
            whose remaining columns are numeric features.

    Returns:
        ``log(var_n / var_a)`` as a NumPy float. Degenerate variances (zero,
        or undefined because a class has too few rows) are not special-cased
        and propagate as ``inf``/``-inf``/``nan`` under NumPy floating-point
        rules.
    """
    # Class column is always the last one; select it positionally.
    labels = X.iloc[:, -1]

    # Partition the data into normal and anomalous rows, features only
    normal_data = X[labels == 1].iloc[:, :-1]
    anomaly_data = X[labels == -1].iloc[:, :-1]

    # Trace of the covariance matrix = sum of the per-feature variances
    var_n = np.trace(normal_data.cov().values)
    var_a = np.trace(anomaly_data.cov().values)

    return np.log(var_n / var_a)
    
# Report the semantic-variation score of the loaded dataset
print("Semantic variation: %0.5f" % score_semantic_variation(data))

## Test against public datasets

In [None]:
# Public benchmark datasets to score; each CSV has no header row and keeps
# the class label (1 = normal, -1 = anomaly) in its last column.
datasets = [
    { "name": "annthyroid", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/annthyroid.csv" },
    { "name": "backdoor", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/backdoor.csv" },
    { "name": "bald", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/bald.csv" },
    { "name": "bank", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/bank.csv" },
    { "name": "cover", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/cover.csv" },
    { "name": "creditcardfraud", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/creditcardfraud.csv" },
    { "name": "donors", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/donors.csv" },
    { "name": "kddcup99", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/kddcup99.csv" },
    { "name": "magic04", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/magic04.csv" },
    { "name": "mammography", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/mammography.csv" },
    { "name": "musk", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/musk.csv" },
    { "name": "pageblocks", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/pageblocks.csv" },
    { "name": "seismic", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/seismic.csv" },
    { "name": "shuttle", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/shuttle.csv" },
    { "name": "speech", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/speech.csv" },
    { "name": "synthetic", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/synthetic.csv" },
    { "name": "waveform", "location": "https://happy-research.s3.ap-southeast-1.amazonaws.com/waveform.csv" }
]

# Result table; the first row is the header.
scores = [["Dataset", "Relative Frequency", "Semantic Variation"]]

# Chunk size (rows per chunk) for reading data
chunksize = 10000

# Fraction of the anomalous rows removed from each dataset before scoring
percent_drop = 0.95

for o in datasets:
    # The reference (URL) of the dataset to load
    dataset_file = o["location"]

    print("Loading dataset '{}'...".format(dataset_file))

    # Read the file chunk by chunk and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, and calling it per chunk
    # re-copied the whole accumulated frame on every iteration.
    chunks = []
    for i, chunk in enumerate(pd.read_csv(dataset_file, header=None, chunksize=chunksize)):
        print("Reading chunk %d" % (i + 1))
        chunks.append(chunk)
    data = pd.concat(chunks)

    print("Done loading dataset %s..." % (o["name"]))

    # Randomly drop `percent_drop` of the anomalies (label -1 in the last
    # column) so only a small share of them remains in the scored data.
    # NOTE(review): the sample is unseeded, so scores vary between runs;
    # pass random_state to .sample() if reproducible numbers are needed.
    input_dim = len(data.columns) - 1
    df_subset = data[data[input_dim] == -1].sample(frac=percent_drop)
    data = data.drop(df_subset.index)

    score_rf = score_relative_frequency(data)
    score_sv = score_semantic_variation(data)
    scores.append([
        o["name"],
        score_rf,
        score_sv
    ])

In [None]:
# Display result in tabular format.
# The first row of `scores` holds the column titles, so tell tabulate to
# render it as a header row instead of an ordinary data row.
tabulate.tabulate(scores, headers="firstrow", tablefmt='html')