In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from sklearn.ensemble import IsolationForest
import seaborn as sns

from anomaly_detection_robustness.graph import Node, Graph

In [None]:
def plot_rocauc(data, x_col, y_col, title=None, xlabel=None, ylabel=None):
    """Draw a box plot of ``y_col`` grouped by ``x_col`` and return its axes.

    Parameters
    ----------
    data
        Table handed to ``sns.boxplot`` (the notebook passes a DataFrame).
    x_col, y_col
        Names of the columns plotted on the x- and y-axis.
    title, xlabel, ylabel
        Optional texts. Any falsy value (``None``, ``''``) falls back to a
        default: a title built from ``x_col``, and the column names
        themselves for the axis labels.

    Returns
    -------
    The matplotlib axes the box plot was drawn on.
    """
    _, axes = plt.subplots(1, 1)
    sns.boxplot(x=x_col, y=y_col, data=data, ax=axes)

    axes.set_title(title or f'Performance while varying {x_col}')
    axes.set_xlabel(xlabel or x_col)
    axes.set_ylabel(ylabel or y_col)
    return axes

# Check the robustness of an Isolation Forest when varying the number of features

## Define an example network

In [None]:
# Only for visualizing: the same parent -> child structure that is built
# with Node/Graph objects in the next cell.
edges = [
    ('r1', 'c1'),
    ('r2', 'c1'),
    ('r1', 'c2'),
    ('r2', 'c2'),
    ('c1', 'gc1'),
    ('gc1', 'ggc1'),
]
G = nx.DiGraph()
G.add_edges_from(edges)

fig, ax = plt.subplots(1, 1)
nx.draw_networkx(G, node_size=1000, ax=ax)
ax.set_title('Graphical model used to generate synthetic data');

In [None]:
# Actual definition
CONTAMINATION = 0.005  # used for IF and when generating data
REPEAT_TIMES = 10      # number of repetitions per setting in the experiments below

# Two roots without parents.
r1 = Node(256, 'root 1')
r2 = Node(10, 'root 2')

# Both children depend on both roots.
c1 = Node(50, 'child 1')
for parent in (r1, r2):
    c1.add_parent(parent)

c2 = Node(8, 'child 2')
for parent in (r1, r2):
    c2.add_parent(parent)

# A chain hanging off child 1: c1 -> gc1 -> ggc1.
gc1 = Node(5, 'grandchild 1')
gc1.add_parent(c1)

ggc1 = Node(20, 'grandgrandchild 1')
ggc1.add_parent(gc1)

nodes = [r1, r2, c1, c2, gc1, ggc1]
graph = Graph(nodes=nodes, contamination=CONTAMINATION)

## Vary the number of features on which the anomalies differ

In [None]:
title = 'The more different the anomalies\nthe better the Isolation Forest\'s performance'
xlabel = (
    'Nr. of features on which anomalies differ from normal data points\n'
    f'(out of {len(nodes)} features in total)'
)
ylabel = 'ROC AUC score'

# 0 .. len(nodes) features changed, REPEAT_TIMES runs per setting.
values = range(len(nodes) + 1)

# Collect one record per run and build the DataFrame once, instead of
# enlarging it row by row with `.loc` (slow, and it upcasts the integer
# x values to float).
records = []
for nr_features_to_change in values:
    for _ in range(REPEAT_TIMES):
        # label() is called before every score(), exactly as before;
        # presumably it re-labels the data for the given setting — confirm in Graph.
        graph.label(nr_features_to_change=nr_features_to_change)
        records.append({ylabel: graph.score(), xlabel: nr_features_to_change})

scores = pd.DataFrame(records, columns=[ylabel, xlabel])

plot_rocauc(data=scores, x_col=xlabel, y_col=ylabel, title=title);

Repeat with uncorrelated nodes

In [None]:
# Same number of features as the example network, but none of the nodes
# has a parent.
NR_UNCORRELATED_NODES = 6
uncorrelated_nodes = [Node(10, f'{i}') for i in range(NR_UNCORRELATED_NODES)]
graph_uncorrelated = Graph(uncorrelated_nodes, contamination=CONTAMINATION)

title = 'Isolation Forest performance\nwhile varying nr. features changed for anomalies'
xlabel = f'Nr. of features changed out of {len(uncorrelated_nodes)}'
ylabel = 'ROC AUC score'

# Take the range from this graph's own node list; the original used
# `len(nodes)` of the correlated graph, which matched only by coincidence.
values = range(len(uncorrelated_nodes) + 1)

# Collect one record per run and build the DataFrame once instead of
# enlarging it row by row with `.loc`.
records = []
for nr_features_to_change in values:
    for _ in range(REPEAT_TIMES):
        graph_uncorrelated.label(nr_features_to_change=nr_features_to_change)
        records.append({ylabel: graph_uncorrelated.score(), xlabel: nr_features_to_change})

scores = pd.DataFrame(records, columns=[ylabel, xlabel])

plot_rocauc(data=scores, x_col=xlabel, y_col=ylabel, title=title);

As expected, the performance increases when the anomalies differ from the normal data points on more features.

# Check IF performance when adding irrelevant features

### Keep anomalies constant (i.e. `nr_features_to_change`)  and add irrelevant features 
(similar values for all data points; both anomalies and normal data points)

In [None]:
# Plot texts for the irrelevant-features experiment in the next cells.
title = (
    "Isolation Forests's performance drops significantly\n"
    "when adding noise (irrelevant features)"
)
xlabel = "Nr. of irrelevant features added (to the original 6 features)"
ylabel = "ROC AUC score"

In [None]:
values = [0, 100, 250, 500, 750, 1000]

# Collect one record per run and build the DataFrame once instead of
# enlarging it row by row with `.loc`.
records = []
for nr_externals in values:
    for _ in range(REPEAT_TIMES):
        # NOTE: `graph` is deliberately rebound here. After the loop it holds
        # the last graph built (nr_externals == values[-1]), and the PCA cells
        # further down read `graph.X` / `graph.y` from it.
        graph = Graph(nodes=nodes, contamination=CONTAMINATION, nr_externals=nr_externals)
        graph.label(nr_features_to_change=len(nodes))
        records.append({ylabel: graph.score(), xlabel: nr_externals})

scores = pd.DataFrame(records, columns=[ylabel, xlabel])

In [None]:
# Box plot of the ROC AUC scores per number of irrelevant features.
plot_rocauc(scores, xlabel, ylabel, title)

Performance decreases clearly.

## Performing PCA shows that it should be easy to separate the anomalies

In [None]:
# Axis labels shared by the two PCA scatter plots below.
xlabel, ylabel = 'First PCA component', 'Second PCA component'

In [None]:
from sklearn.decomposition import PCA

# Project the data of the current `graph` onto its principal components;
# only the first two are plotted.
pca = PCA()
X_r = pca.fit_transform(graph.X)

# Anomalies (y == 1) are drawn first, normal points (y == 0) second.
true_classes = (
    (1, 'r', 'True anomaly'),
    (0, 'b', 'True normal'),
)
for class_value, colour, legend_text in true_classes:
    mask = graph.y == class_value
    plt.scatter(X_r[mask, 0], X_r[mask, 1], c=colour, label=legend_text)

plt.title('First two PCA components show that\nanomalies are `few and different`')
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.legend();

In [None]:
# Fit an Isolation Forest on the same data and colour the PCA projection
# by its predictions (1 = predicted normal, -1 = predicted anomaly).
model = IsolationForest(contamination=CONTAMINATION, random_state=42, n_jobs=-1)
predictions = model.fit(graph.X).predict(graph.X)

predicted_classes = (
    (1, 'b', 'Predicted normal'),
    (-1, 'orange', 'Predicted anomaly'),
)
for class_value, colour, legend_text in predicted_classes:
    mask = predictions == class_value
    plt.scatter(X_r[mask, 0], X_r[mask, 1], c=colour, label=legend_text)

plt.title('Visual representation (on first two PCA components)\nof low quality predictions with Isolation Forest')
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.legend();

## Can we tune the model to increase performance?

In [None]:
# One fixed graph for all tuning experiments below: the example network
# plus 500 external features, with anomalies changed on every node.
graph = Graph(
    nodes=nodes,
    contamination=CONTAMINATION,
    nr_externals=500,
)
graph.label(nr_features_to_change=len(nodes))

## Changing the `max_samples` parameter

In [None]:
# Plot texts for the max_samples experiment in the next cells.
title = (
    'Performance steadily increases\n'
    'when increasing `max_samples` parameter'
)
xlabel = 'Value for `max_sample` parameter'
ylabel = 'ROC AUC score'

In [None]:
values = ['auto', .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]

# Collect one record per run and build the DataFrame once instead of
# enlarging it row by row with `.loc`.
records = []
for max_samples in values:
    for _ in range(REPEAT_TIMES):
        # random_state=None: every repetition gets a differently seeded forest.
        model = IsolationForest(
            max_samples=max_samples,
            contamination=CONTAMINATION,
            random_state=None,
            n_jobs=-1
        )
        score, runtime = graph.score(model=model, get_fit_predict_time=True)
        records.append({ylabel: score, 'Runtime': runtime, xlabel: max_samples})

# The x column mixes the string 'auto' with floats, so it has object dtype.
scores = pd.DataFrame(records, columns=[ylabel, 'Runtime', xlabel])

In [None]:
# Box plot of the ROC AUC scores per `max_samples` setting.
plot_rocauc(scores, xlabel, ylabel, title)

Increasing the number of samples improves the performance significantly.
For the remaining experiments, `max_samples` is set to `0.25`.

In [None]:
# Regression plot of runtime against `max_samples`.
# The 'auto' setting is not numeric, so those rows are dropped and the
# remaining values are converted to float in a helper column.
runtime_data = (
    scores
    .loc[lambda df: df[xlabel] != 'auto']
    .assign(max_samples=lambda df: df[xlabel].astype(float))
)

# The timing column filled by the max_samples loop is named 'Runtime';
# the previous `y='Runtime (sec)'` referred to a column that does not exist.
fg = sns.lmplot(x='max_samples', y='Runtime', data=runtime_data)
fg.ax.set_title('Runtime also steadily increases\nwhen increasing `max_samples` parameter')
fg.ax.set_xlabel(xlabel)
# Unit taken from the label text the cell used before — confirm against Graph.score.
fg.ax.set_ylabel('Runtime (sec)');

## Changing the `max_features` parameter

In [None]:
title = 'Isolation Forest performance\nwhile varying max_features parameter'
xlabel = 'Value for max_features parameter'
ylabel = 'ROC AUC score'

values = [.1, .25, .5, .75, .9, 1.]

# Collect one record per run and build the DataFrame once instead of
# enlarging it row by row with `.loc`.
records = []
for max_features in values:
    for _ in range(REPEAT_TIMES):
        # max_samples is fixed at 0.25; random_state=None gives every
        # repetition a differently seeded forest.
        model = IsolationForest(
            max_samples=0.25,
            max_features=max_features,
            contamination=CONTAMINATION,
            random_state=None,
            n_jobs=-1
        )
        score, runtime = graph.score(model=model, get_fit_predict_time=True)
        records.append({ylabel: score, 'time': runtime, xlabel: max_features})

scores = pd.DataFrame(records, columns=[ylabel, 'time', xlabel])

plot_rocauc(data=scores, x_col=xlabel, y_col=ylabel, title=title)
plot_rocauc(data=scores, x_col=xlabel, y_col='time', title='Runtime for different settings of max_features');

The maximum number of features doesn't have a big impact.
For the remaining experiments, `max_features` is kept at its default.

## Changing the `n_estimators` parameter

In [None]:
title = 'Isolation Forest performance\nwhile varying n_estimators parameter'
xlabel = 'Value for n_estimators parameter'
ylabel = 'ROC AUC score'

# 100, 200, 300, 400 (the stop value 500 is excluded by np.arange).
values = np.arange(100, 500, 100)

# Collect one record per run and build the DataFrame once instead of
# enlarging it row by row with `.loc`; this also keeps the x values integer.
records = []
for n_estimators in values:
    for _ in range(REPEAT_TIMES):
        # max_samples is fixed at 0.25; random_state=None gives every
        # repetition a differently seeded forest.
        model = IsolationForest(
            n_estimators=n_estimators,
            max_samples=0.25,
            contamination=CONTAMINATION,
            random_state=None,
            n_jobs=-1
        )
        score, runtime = graph.score(model=model, get_fit_predict_time=True)
        records.append({ylabel: score, 'time': runtime, xlabel: n_estimators})

scores = pd.DataFrame(records, columns=[ylabel, 'time', xlabel])

plot_rocauc(data=scores, x_col=xlabel, y_col=ylabel, title=title)
plot_rocauc(data=scores, x_col=xlabel, y_col='time', title='Runtime for different settings of n_estimators');

Increasing the number of estimators seems to improve performance a bit.