In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from sklearn.ensemble import IsolationForest
import seaborn as sns

from anomaly_detection_robustness.graph import Node, Graph

In [None]:
def plot_rocauc(name, values, scores):
    """Draw a box plot of the scores, grouped by the parameter that was varied.

    Parameters
    ----------
    name : str
        Column of ``scores`` used for the x-axis grouping; also shown in the title.
    values : iterable
        Accepted from the callers but not used by the body.
    scores : pandas.DataFrame
        Frame holding a ``'score'`` column and a column called ``name``.

    Returns
    -------
    The matplotlib axes that the box plot was drawn on.
    """
    _, axes = plt.subplots(nrows=1, ncols=1)
    axes.set_title(f'Varying {name}')
    sns.boxplot(data=scores, x=name, y='score', ax=axes)
    return axes

# Check the robustness of an Isolation Forest when varying the number of features

## Define a network

In [None]:
# Only for visualizing: directed edges run from parent to child.
G = nx.DiGraph()
G.add_edges_from([
    ('r1', 'c1'),
    ('r2', 'c1'),
    ('r1', 'c2'),
    ('r2', 'c2'),
    ('c1', 'gc1'),
    ('gc1', 'ggc1'),
])
nx.draw_networkx(G, node_size=1000)

In [None]:
# Two roots without parents.
r1 = Node(256, 'root 1')
r2 = Node(10, 'root 2')

# Both children hang off both roots.
c1 = Node(50, 'child 1')
for parent in (r1, r2):
    c1.add_parent(parent)

c2 = Node(8, 'child 2')
for parent in (r1, r2):
    c2.add_parent(parent)

# A single chain continues below child 1.
gc1 = Node(5, 'grandchild 1')
gc1.add_parent(c1)

ggc1 = Node(20, 'grandgrandchild 1')
ggc1.add_parent(gc1)

# Every node of the network, collected for the Graph constructor.
nodes = [
    r1, r2,
    c1, c2,
    gc1,
    ggc1,
]

## Vary the number of features on which the anomalies differ

In [None]:
# Shared by the Graph built here and by the IsolationForest models further down.
contamination = 0.005
graph = Graph(
    nodes=nodes,
    contamination=contamination,
)

In [None]:
# Sweep the number of features on which the anomalies differ and score each setting.
name = 'nr_features_to_change'
values = range(len(nodes)+1)  # 0 up to and including len(nodes)
repeat_times = 10  # repetitions per setting, so the box plot shows the spread

# Collect one record per run and build the frame once, instead of enlarging
# a DataFrame cell by cell through .loc with a manual row counter.
records = []
for nr_features_to_change in values:
    for _ in range(repeat_times):
        graph.label(nr_features_to_change=nr_features_to_change)
        records.append({'score': graph.score(), name: nr_features_to_change})
scores = pd.DataFrame(records, columns=['score', name])

# Second argument: the swept values expressed as a percentage of len(nodes).
# The trailing semicolon suppresses the Axes repr in the cell output.
plot_rocauc(name, round(pd.Series(values)/len(nodes)*100, 1), scores);

As expected, performance increases when the anomalies are defined to differ on more features.

# Add more features

### Keep the anomalies constant (i.e. fixed `nr_features_to_change`) and add more features with similar values for all data points (both anomalies and normal data points)

In [None]:
# Sweep the number of extra chains (each of length chain_length) added to the graph.
chain_length = 20
name = f'nr_chains_of_length_{chain_length}'
values = range(5)
repeat_times = 10  # repetitions per setting, so the box plot shows the spread

# Collect one record per run and build the frame once, instead of enlarging
# a DataFrame cell by cell through .loc with a manual row counter.
records = []
for nr_chains in values:
    for _ in range(repeat_times):
        # A fresh graph per run; the anomalies always differ on all base nodes.
        graph = Graph(nodes=nodes, contamination=contamination, nr_chains=nr_chains, chain_length=chain_length)
        graph.label(nr_features_to_change=len(nodes))
        records.append({'score': graph.score(), name: nr_chains})
scores = pd.DataFrame(records, columns=['score', name])

# Second argument: number of chains times the chain length.
plot_rocauc(name, pd.Series(values)*chain_length, scores);

Performance decreases when more features are added.

## Adding even more features

In [None]:
# Sweep the nr_externals argument of Graph.
name = 'nr_additional_features'  # plain string: there is nothing to interpolate
values = np.arange(0, 750, 250)  # 0, 250, 500
repeat_times = 10  # repetitions per setting, so the box plot shows the spread

# Collect one record per run and build the frame once, instead of enlarging
# a DataFrame cell by cell through .loc with a manual row counter.
records = []
for nr_externals in values:
    for _ in range(repeat_times):
        # A fresh graph per run; the anomalies always differ on all base nodes.
        graph = Graph(nodes=nodes, contamination=contamination, nr_externals=nr_externals)
        graph.label(nr_features_to_change=len(nodes))
        records.append({'score': graph.score(), name: nr_externals})
scores = pd.DataFrame(records, columns=['score', name])

# NOTE: `graph` from the last iteration (nr_externals=500) is reused by the PCA cell below.
# The trailing semicolon suppresses the Axes repr in the cell output.
plot_rocauc(name, values, scores);

Performance decreases even further.

## Comparing the predictions against a simple PCA shows that it should be easy to separate the anomalies

In [None]:
from sklearn.decomposition import PCA

# Project the feature matrix of the most recently built graph onto its
# principal components; only the first two are plotted.
pca = PCA()
X_r = pca.fit_transform(graph.X)

# One scatter call per label value of graph.y: 1 is drawn as anomaly, 0 as normal.
for target, colour, label in ((1, 'r', 'Anomaly'), (0, 'b', 'Normal')):
    mask = graph.y == target
    plt.scatter(X_r[mask, 0], X_r[mask, 1], c=colour, label=label)
plt.title('PCA first two components')
plt.legend();

In [None]:
# Fit an Isolation Forest on the same feature matrix and colour the PCA
# projection (X_r) by the model's predictions instead of the true labels.
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=contamination,
    random_state=42,
    n_jobs=-1,
)
predictions = model.fit(graph.X).predict(graph.X)

# Predictions equal to 1 are drawn as normal, predictions equal to -1 as anomaly.
for flag, colour, label in ((1, 'b', 'Normal'), (-1, 'r', 'Anomaly')):
    selected = predictions == flag
    plt.scatter(X_r[selected, 0], X_r[selected, 1], c=colour, label=label)
plt.title('IF predictions')
plt.legend();

## Can we tune the model to increase performance?

In [None]:
# One fixed graph, reused by all hyper-parameter sweeps below.
graph = Graph(
    nodes=nodes,
    contamination=contamination,
    nr_externals=500,
)
graph.label(
    nr_features_to_change=len(nodes),
)

## Max samples

In [None]:
# Sweep IsolationForest's max_samples on the fixed graph.
name = 'max_samples'
values = ['auto', .1, .25, .5, .99]
repeat_times = 5  # repetitions per setting; random_state=None makes every run differ

# Collect one record per run and build the frame once, instead of enlarging
# a DataFrame cell by cell through .loc with a manual row counter.
records = []
for max_samples in values:
    for _ in range(repeat_times):
        model = IsolationForest(
            n_estimators=100,
            max_samples=max_samples,
            contamination=contamination,
            random_state=None,
            n_jobs=-1
        )
        records.append({'score': graph.score(model=model), name: max_samples})
scores = pd.DataFrame(records, columns=['score', name])

# The trailing semicolon suppresses the Axes repr in the cell output.
plot_rocauc(name, values, scores);

Increasing the number of samples improves performance significantly.
From here on, `max_samples` is set to `0.25`.

## Max features

In [None]:
# Sweep IsolationForest's max_features on the fixed graph, with max_samples at .25.
name = 'max_features'
values = [.1, .25, .5, .75, .9, 1.]
repeat_times = 10  # repetitions per setting; random_state=None makes every run differ

# Collect one record per run and build the frame once, instead of enlarging
# a DataFrame cell by cell through .loc with a manual row counter.
records = []
for max_features in values:
    for _ in range(repeat_times):
        model = IsolationForest(
            n_estimators=100,
            max_samples=.25,
            max_features=max_features,
            contamination=contamination,
            random_state=None,
            n_jobs=-1
        )
        records.append({'score': graph.score(model=model), name: max_features})
scores = pd.DataFrame(records, columns=['score', name])

# The trailing semicolon suppresses the Axes repr in the cell output.
plot_rocauc(name, values, scores);

The maximum number of features doesn't have a big impact.
From here on, it is kept at the default.

## Nr. estimators

In [None]:
# Sweep IsolationForest's n_estimators on the fixed graph, with max_samples at .25.
name = 'n_estimators'
values = np.arange(100, 500, 100)  # 100, 200, 300, 400
repeat_times = 5  # repetitions per setting; random_state=None makes every run differ

# Collect one record per run and build the frame once, instead of enlarging
# a DataFrame cell by cell through .loc with a manual row counter.
records = []
for n_estimators in values:
    for _ in range(repeat_times):
        model = IsolationForest(
            n_estimators=n_estimators,
            max_samples=.25,
            contamination=contamination,
            random_state=None,
            n_jobs=-1
        )
        records.append({'score': graph.score(model=model), name: n_estimators})
scores = pd.DataFrame(records, columns=['score', name])

# The trailing semicolon suppresses the Axes repr in the cell output.
plot_rocauc(name, values, scores);

Increasing the number of estimators does not seem to have a significant effect.