# Random Histogram Forest

In [2]:
from timeit import timeit
from sklearn.metrics import average_precision_score
import math
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node

## Execution time with satellite, cardio, and musk datasets

### Execution time for fixed parameters

In [3]:
# Benchmark on musk.mat: build a forest and score every row of the dataset,
# repeated ten times. Each timeit() call re-runs `setup_code` first; the setup
# is not included in the reported time.
setup_code='''
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node
# set the number of trees and max height
H = 5
T = 100
mat_contents = sio.loadmat("../datasets/musk.mat")
dataset = mat_contents['X'] 
labels = mat_contents['y']
dataset = dataset.astype('float32') 
'''

# Timed statement: forest construction followed by one anomaly_score call per
# row. Scores are written in place into the preallocated array; np.append
# returns a new array and would leave `scores` untouched while copying it on
# every iteration.
# NOTE(review): dataset.size is the total element count (rows * columns) of the
# array; confirm that anomaly_score expects this rather than the row count.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)
scores = np.empty(labels.size)
for i, x in enumerate(dataset):
    scores[i] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

# The printed label says "(train)", but the measured time covers both building
# the forest and scoring the rows.
for _ in range(10):
    print("Total time for rhf-cython (train) = ", timeit(setup=setup_code, stmt=code, number=1))

Total time for rhf-cython (train) =  2515.1521688880166
Total time for rhf-cython (train) =  2484.7391208059853
Total time for rhf-cython (train) =  2558.155892419978
Total time for rhf-cython (train) =  2419.7553646920132
Total time for rhf-cython (train) =  2435.1043213850353
Total time for rhf-cython (train) =  2352.413767109043
Total time for rhf-cython (train) =  2462.460835060978
Total time for rhf-cython (train) =  2340.7785142989596
Total time for rhf-cython (train) =  2350.3405260470463
Total time for rhf-cython (train) =  2263.1618289710023


In [None]:
musk.mat

Total time for rhf-python (train) =  2244.9498698560055
Total time for rhf-python (train) =  2348.7104356749915
Total time for rhf-python (train) =  2348.4431248309556
Total time for rhf-python (train) =  2358.9754647139926
Total time for rhf-python (train) =  2355.7764436089783
Total time for rhf-python (train) =  2412.258353453013
Total time for rhf-python (train) =  2338.0248104759958
Total time for rhf-python (train) =  2620.6636921860045
Total time for rhf-python (train) =  2438.444901470968
Total time for rhf-python (train) =  2247.217430861958

avg (python) = 2371.346


Total time for rhf-python (train) =  3907.5885492800007
Total time for rhf-python (train) =  3879.408482304003
Total time for rhf-python (train) =  3560.124946171003
Total time for rhf-python (train) =  3573.552410616998
Total time for rhf-python (train) =  3638.5123434520065
Total time for rhf-python (train) =  3770.0237400339975
Total time for rhf-python (train) =  3826.4306275380077

In [4]:
# Run times (seconds) printed by the ten rhf-cython timing repetitions on
# musk.mat, copied from that cell's output.
res = np.array([
    2515.1521688880166, 2484.7391208059853,
    2558.155892419978, 2419.7553646920132,
    2435.1043213850353, 2352.413767109043,
    2462.460835060978, 2340.7785142989596,
    2350.3405260470463, 2263.1618289710023,
])

# Average run time, rounded to three decimals; displayed as the cell result.
round(res.mean(), 3)

2418.206

### Number of elements vs. execution time

In [None]:
# Execution time as a function of the number of rows: time the forest build
# plus per-row scoring on growing prefixes of satellite.mat.
mat_contents = sio.loadmat("../datasets/satellite.mat")
ds = mat_contents['X'] 
ls = mat_contents['y']

# Forest parameters (maximum height and number of trees) are the same for
# every prefix size.
H = 5
T = 100

# Timed statement. It reads dataset, labels, T and H from the notebook
# namespace via globals=globals(). Scores are stored in place: np.append
# returns a new array, so its result would be lost and `scores` never filled.
# NOTE(review): dataset.size is rows * columns, not the number of rows;
# confirm that anomaly_score expects this value.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)

scores = np.empty(labels.size)
for i, x in enumerate(dataset):
    scores[i] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

# Prefix sizes start at 2250 rows and grow by 250 while strictly below the
# full dataset length.
for i in range(2250, ds.shape[0], 250):
    print("n=", i)

    dataset = ds[:i, :].astype('float32')
    labels = ls[:i]

    # number=10: the reported value is the total over ten executions.
    print("Total time for rhf-cython (train) = ", timeit(stmt=code, number=10, globals=globals()))

### Height vs. execution time

In [8]:
# Execution time as a function of the maximum tree height, on satellite.mat.
mat_contents = sio.loadmat("../datasets/satellite.mat")
# The float32 conversion does not depend on the height, so it is done once.
dataset = mat_contents['X'].astype('float32')
labels = mat_contents['y']

# Number of trees is fixed while the height varies.
T_h = 100

# Timed statement. It reads dataset, labels, T_h and H_h from the notebook
# namespace via globals=globals(). Scores are stored in place: np.append
# returns a new array, so its result would be lost and `scores` never filled.
# No printing happens inside the timed code; the current height is reported
# once by the loop below.
# NOTE(review): dataset.size is rows * columns, not the number of rows;
# confirm that anomaly_score expects this value.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T_h, nd=0, h=H_h)

scores = np.empty(labels.size)
for i, x in enumerate(dataset):
    scores[i] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

for H_h in range(1, 11):
    print("BC, H_h=", H_h)

    # number=10: the reported value is the total over ten executions.
    print("Total time for rhf-cython (train) = ", timeit(stmt=code, number=10, globals=globals()))

BC, H_h= 1
H_h 1


KeyboardInterrupt: 

### Number of trees vs. execution time

In [None]:
# Execution time as a function of the number of trees, on cardio.mat.
mat_contents = sio.loadmat("../datasets/cardio.mat")
# The float32 conversion does not depend on the tree count, so it is done once.
dataset = mat_contents['X'].astype('float32')
labels = mat_contents['y']

# Maximum height is fixed while the number of trees varies.
H_t = 5

# Timed statement. It reads dataset, labels, T_t and H_t from the notebook
# namespace via globals=globals(). Scores are stored in place: np.append
# returns a new array, so its result would be lost and `scores` never filled.
# NOTE(review): dataset.size is rows * columns, not the number of rows;
# confirm that anomaly_score expects this value.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T_t, nd=0, h=H_t)

scores = np.empty(labels.size)
for i, x in enumerate(dataset):
    scores[i] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

# Odd tree counts 1, 3, ..., 103.
for T_t in range(1, 105, 2):
    print("BC, T_t=", T_t)
    # number=10: the reported value is the total over ten executions.
    print("Total time for rhf-cython (train) = ", timeit(stmt=code, number=10, globals=globals()))

BC, T_t= 1
Total time for rhf-cython (train) =  3.105881569965277
BC, T_t= 3
Total time for rhf-cython (train) =  11.537520741054323
BC, T_t= 5
Total time for rhf-cython (train) =  20.355849726998713
BC, T_t= 7
Total time for rhf-cython (train) =  26.592776047997177
BC, T_t= 9
Total time for rhf-cython (train) =  35.743615761981346
BC, T_t= 11
Total time for rhf-cython (train) =  42.31217603298137
BC, T_t= 13
Total time for rhf-cython (train) =  51.12538918899372
BC, T_t= 15
Total time for rhf-cython (train) =  59.70853758294834
BC, T_t= 17
Total time for rhf-cython (train) =  67.73922391899396
BC, T_t= 19
Total time for rhf-cython (train) =  77.96840173500823
BC, T_t= 21
Total time for rhf-cython (train) =  82.68279686296592
BC, T_t= 23
Total time for rhf-cython (train) =  91.23252107598819
BC, T_t= 25
Total time for rhf-cython (train) =  99.95301215199288
BC, T_t= 27
Total time for rhf-cython (train) =  112.10268352599815
BC, T_t= 29


## Results

In [None]:
 #dataset = np.array([[1,2],[70,3],[80,18],[90,2],[100,23],[110,12],[111,22],[112,19],[113,20],[114,21],[115,22],[116,34]])
#Node.Node.printNode(test_rhf[0], 0)


## Average Precision Calculation

In [30]:
# Average precision of the forest's anomaly scores on musk.mat, measured over
# ten independently built forests.
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node
# set the number of trees and max height
H = 5
T = 100
mat_contents = sio.loadmat("../datasets/musk.mat")
dataset = mat_contents['X'] 
labels = mat_contents['y']
dataset = dataset.astype('float32')

# The repetition counter has its own name so the row index of the inner loop
# does not shadow it.
for run in range(10):
    test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)

    # One score per row, written in place into a preallocated array.
    # NOTE(review): dataset.size is rows * columns, not the number of rows;
    # confirm that anomaly_score expects this value.
    scores = np.empty(labels.size)
    for i, x in enumerate(dataset):
        scores[i] = a_s.anomaly_score(test_rhf, dataset.size, x)

    print("AP=", average_precision_score(labels, scores))
    

AP= 0.9979303616198283
AP= 0.9999999999999999
AP= 0.9999999999999998
AP= 0.9999999999999999
AP= 0.9997896065642753
AP= 0.9997896065642751
AP= 0.9999999999999999
AP= 0.9985562690323885
AP= 0.9777108769488685
AP= 0.9999999999999999


In [33]:
# Recorded average-precision values, one array per dataset.
# presumably copied from runs of the Average Precision cell with the matching
# .mat file loaded -- verify against the run logs.

# satellite: 10 values
results_satellite = np.array([
    0.6170157550859576,
    0.663753486615899,
    0.6636068896731623,
    0.6095129668453847,
    0.6425214156445306,
    0.6399171649455052,
    0.6726283457778135,
    0.6404746861089395,
    0.6511386966950713,
    0.6434209656334753,
])

# cardio: 10 values
results_cardio = np.array([
    0.5181553327242993,
    0.5926435236520571,
    0.5322619213333626,
    0.6142247044086294,
    0.59205904109385,
    0.6346077183945367,
    0.5941951278338284,
    0.5467206851013799,
    0.6132809570352109,
    0.5515181125400918,
])

# musk: 20 values; the last ten are the "AP=" lines printed by the
# Average Precision cell in this notebook.
results_musk = np.array([
    0.9999999999999999,
    1.0,
    0.9920679390186478,
    0.9999999999999999,
    1.0,
    0.9969025164739402,
    0.9999999999999999,
    0.9997896065642751,
    0.9999999999999999,
    0.9996822408419207,
    0.9979303616198283,
    0.9999999999999999,
    0.9999999999999998,
    0.9999999999999999,
    0.9997896065642753,
    0.9997896065642751,
    0.9999999999999999,
    0.9985562690323885,
    0.9777108769488685,
    0.9999999999999999,
])

In [34]:
def mean_confidence_interval(data, conf=0.95):
    """Return the mean of `data` and the half-width of its Student-t interval.

    Parameters:
        data: one-dimensional sequence or array of numeric observations.
        conf: two-sided confidence level, e.g. 0.95.

    Returns:
        (mean, half_width), where half_width = t* * s / sqrt(n), t* is the
        (1 + conf) / 2 quantile of the t distribution with n - 1 degrees of
        freedom and s is the sample standard deviation.

    Raises:
        ValueError: if fewer than two observations are given, since the sample
        standard deviation is undefined in that case.
    """
    n = len(data)
    if n < 2:
        raise ValueError("at least two observations are required")
    mean = np.mean(data)
    # ddof=1 gives the sample standard deviation, which is what a t interval
    # on n - 1 degrees of freedom requires; np.std defaults to ddof=0.
    std = np.std(data, ddof=1)
    tstar = sstats.t.ppf((1 + conf) / 2., n - 1)
    return mean, tstar * std / math.sqrt(n)


data = results_musk
conf = 0.95

mean, h = mean_confidence_interval(data, conf)
print("{0} +/- {1}".format(round(mean,3), round(h,3)))


0.998 +/- 0.002
