In [1]:
from coniferest.isoforest import IsolationForest
from coniferest.aadforest import AADForest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time


def load_single(oid_filename, feature_filename):
    """Memory-map object IDs and their feature matrix from two raw binary files.

    Parameters
    ----------
    oid_filename : path-like
        Flat binary file read as ``uint64`` values, one per object.
    feature_filename : path-like
        Flat binary file read as ``float32`` values; it is reshaped to one
        row per entry of the ID file, the number of columns being inferred.

    Returns
    -------
    (oid, feature) : tuple of np.memmap
        1-D array of IDs and 2-D array of shape ``(len(oid), n_features)``.
        Both are opened in copy-on-write mode (``mode='c'``).

    Raises
    ------
    ValueError
        From ``reshape`` when the feature file size is not a multiple of the
        number of IDs.
    """
    object_ids = np.memmap(oid_filename, dtype=np.uint64, mode='c')
    n_objects = object_ids.shape[0]

    # The feature file carries no header, so the row count comes from the ID
    # file and the column count is left for reshape to infer.
    flat_features = np.memmap(feature_filename, dtype=np.float32, mode='c')
    return object_ids, flat_features.reshape(n_objects, -1)

# Plot config

In [2]:
# Fonts and base line weights applied to every figure drawn afterwards.
plt.rcParams.update({
    "font.family": "DejaVu Serif",
    "mathtext.fontset": 'dejavuserif',
    "font.size": 22,
    'axes.linewidth': 1.2,
    'lines.linewidth': 2.2,
})

# Both axes use the same inward-pointing tick geometry; the y settings are an
# independent copy so either dict can be tweaked without touching the other.
xtick_param = {
    'direction': 'in',
    'major.size': 8,
    'major.width': 2,
    'minor.size': 5,
    'minor.width': 1.5,
}
ytick_param = dict(xtick_param)
plt.rc('xtick', **xtick_param)
plt.rc('ytick', **ytick_param)

# Dashed, half-transparent grid lines.
grid_param = {'linestyle': '--', 'alpha': 0.5}
plt.rc('grid', **grid_param)

In [3]:
# Memory-map the object IDs and the matching expanded feature matrix.
oids, features_art = load_single(
    oid_filename='snad4_features/sid_snad4_r_100.dat',
    feature_filename='expanded_features/exp_feature_snad4_r_100.dat',
)

In [4]:

def build_if(n_estimators, n_subsamples, data, return_n_out=10, log=False,
             object_ids=None, n_jobs=40, random_seed=None):
    """Fit an isolation forest on ``data`` and rank its rows by score.

    Parameters
    ----------
    n_estimators : int
        Number of trees, passed to ``IsolationForest`` as ``n_trees``.
    n_subsamples : int
        Subsample size per tree, passed to ``IsolationForest``.
    data : array-like, 2-D
        Feature matrix; the forest is both fitted and scored on it.
    return_n_out : int, default 10
        How many of the lowest-scoring objects to return IDs for.
    log : bool, default False
        When true, print the wall-clock time (in minutes) of each stage.
    object_ids : array-like, optional
        IDs aligned row-by-row with ``data``. Defaults to the notebook-level
        ``oids`` array, which is what this function has always used.
    n_jobs : int, default 40
        Worker count passed to ``IsolationForest``.
    random_seed : int, optional
        Forwarded to ``IsolationForest`` only when given, so the default call
        is unchanged and every run stays independently random.

    Returns
    -------
    if_out : array
        IDs of the ``return_n_out`` rows with the lowest scores (presumably
        the strongest outlier candidates -- confirm the score convention).
    ind_sorted : np.ndarray
        Row indices of ``data`` ordered by ascending score.

    Raises
    ------
    ValueError
        If the ID array and ``data`` do not have the same number of rows.
    """
    ids = oids if object_ids is None else object_ids
    # Check the alignment up front: a mismatch would otherwise only surface
    # (or silently yield wrong IDs) after the expensive fit has finished.
    if len(ids) != len(data):
        raise ValueError(
            f'object_ids has {len(ids)} entries but data has {len(data)} rows'
        )

    forest_kwargs = dict(n_jobs=n_jobs, n_trees=n_estimators, n_subsamples=n_subsamples)
    if random_seed is not None:
        forest_kwargs['random_seed'] = random_seed

    t = time.monotonic()
    iforest = IsolationForest(**forest_kwargs).fit(data)
    t = (time.monotonic() - t) / 60
    if log:
        print(f'Forest is fitted in {t:.0f} m')

    t = time.monotonic()
    scores = iforest.score_samples(data)
    t = (time.monotonic() - t) / 60
    if log:
        print(f'Scores are computed in {t:.0f} m')

    t = time.monotonic()
    ind_sorted = np.argsort(scores)
    t = (time.monotonic() - t) / 60
    if log:
        print(f'Scores are sorted in {t:.0f} m')
    if_out = ids[ind_sorted[:return_n_out]]

    return if_out, ind_sorted

In [None]:
# Forest size for the repeatability experiment.
n_estimators = 10000
n_subsamples = 256*2**5

# Build ten independent forests on every feature column but the last and keep
# the top-outlier IDs of each run; only the first run reports its timings.
outs = []
for i in range(10):
    if_10out, if_all_ind = build_if(
        n_estimators,
        n_subsamples,
        features_art[:, :-1],
        log=(i == 0),
    )
    outs.append(if_10out)

Forest is fitted in 13 m


In [6]:
# IDs that appear in the top-outlier list of every one of the ten runs.
total = set(outs[0]).intersection(*(outs[i] for i in range(1, 10)))
len(total)

8

In [21]:
# Position-by-position comparison of the top-outlier IDs from the first and the last run.
outs[0] == outs[9]

array([False, False, False, False, False, False, False, False, False,
        True])

In [11]:
# Scratch arithmetic: 2**13 * 10000 -- presumably subsample size times tree count; TODO confirm.
2**13*10000

81920000

In [7]:
# log2 of (number of objects // 10000); presumably used to choose a power-of-two
# subsample size -- TODO confirm the intent.
np.log2(len(oids) // 10000)

12.71789058398728

In [8]:
# Scratch arithmetic: 2**13, the same value as 256*2**5.
2**13

8192

In [15]:
# Scratch arithmetic: same literals as n_subsamples (256*2**5) times n_estimators (10000) used above.
256*2**5*10000

81920000