# Random Histogram Forest

In [2]:
from timeit import timeit
from sklearn.metrics import average_precision_score
import math
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node

## Execution time with satellite, cardio, and musk datasets

### Execution time for fixed parameters

In [7]:
# Setup executed by timeit before the timed statement (excluded from the timing):
# imports the project modules, fixes the forest parameters and loads the musk dataset.
setup_code='''
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node
# set the number of trees and max height
H = 5
T = 100
mat_contents = sio.loadmat("../datasets/musk.mat")
dataset = mat_contents['X'] 
labels = mat_contents['y']
dataset = dataset.astype('float32') 
'''
# Timed statement: builds the forest, then scores every row of the dataset.
# Scores are written by index; np.append returns a new array (which was being
# discarded) and copies the whole buffer on every call, skewing the timing.
# NOTE(review): dataset.size is the total element count (rows * columns) of the
# array — confirm this is the value anomaly_score expects.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)
scores = np.empty(labels.size)
for idx, x in enumerate(dataset):
    scores[idx] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''
# Three independent measurements; each call re-runs the setup and times one execution.
for _ in range(3):
    print("Total time for rhf-cython (train) = ", timeit(setup=setup_code, stmt=code, number=1))

KeyboardInterrupt: 

### Number of elements vs. execution time

In [None]:
# Measure execution time as a function of the number of rows taken from satellite.mat.
mat_contents = sio.loadmat("../datasets/satellite.mat")
ds = mat_contents['X'] 
ls = mat_contents['y']

# Number of trees and max height stay fixed; only the row count varies.
H = 5
T = 100

# Timed statement (reads dataset, labels, T and H from the notebook globals):
# builds the forest, then scores every row. Scores are written by index;
# np.append returns a new array (which was being discarded) and copies the
# whole buffer on every call, skewing the timing.
# NOTE(review): dataset.size is the total element count (rows * columns) of the
# array — confirm this is the value anomaly_score expects.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)

scores = np.empty(labels.size)
for idx, x in enumerate(dataset):
    scores[idx] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

# Prefix sizes start at 2250 rows and grow by 250 while below the full dataset length.
for i in range(2250, ds.shape[0], 250):
    print("n=", i)

    dataset = ds[:i, :].astype('float32')
    labels = ls[:i]

    # number=10: the printed value is the total time for ten executions.
    print("Total time for rhf-cython (train) = ", timeit(stmt=code, number=10, globals=globals()))

### Height vs. execution time

In [None]:
# Measure execution time as a function of the maximum tree height on satellite.mat.
mat_contents = sio.loadmat("../datasets/satellite.mat")
# The float32 conversion does not depend on H, so it is done once up front.
dataset = mat_contents['X'].astype('float32')
labels = mat_contents['y']

# Number of trees stays fixed; only the height varies.
T = 100

# Timed statement (reads dataset, labels, T and H from the notebook globals):
# builds the forest, then scores every row. Scores are written by index;
# np.append returns a new array (which was being discarded) and copies the
# whole buffer on every call, skewing the timing.
# NOTE(review): dataset.size is the total element count (rows * columns) of the
# array — confirm this is the value anomaly_score expects.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)

scores = np.empty(labels.size)
for idx, x in enumerate(dataset):
    scores[idx] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

for H in range(1, 11):
    # Printed once per height, outside the timed statement, so the console
    # write is neither repeated ten times nor included in the measurement.
    print("H=", H)

    # number=10: the printed value is the total time for ten executions.
    print("Total time for rhf-cython (train) = ", timeit(stmt=code, number=10, globals=globals()))

### Number of trees vs. execution time

In [None]:
# Measure execution time as a function of the number of trees on cardio.mat.
mat_contents = sio.loadmat("../datasets/cardio.mat")
# The float32 conversion does not depend on T, so it is done once up front.
dataset = mat_contents['X'].astype('float32')
labels = mat_contents['y']

# Max height stays fixed; only the number of trees varies.
H = 5

# Timed statement (reads dataset, labels, T and H from the notebook globals):
# builds the forest, then scores every row. The string literal starts on the
# same line as the assignment; a bare "code =" followed by a newline is a
# SyntaxError. Scores are written by index; np.append returns a new array
# (which was being discarded) and copies the whole buffer on every call.
# NOTE(review): dataset.size is the total element count (rows * columns) of the
# array — confirm this is the value anomaly_score expects.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)

scores = np.empty(labels.size)
for idx, x in enumerate(dataset):
    scores[idx] = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

for T in range(1, 101):
    print("T=", T)

    # number=10: the printed value is the total time for ten executions.
    print("Total time for rhf-cython (train) = ", timeit(stmt=code, number=10, globals=globals()))

## Results

In [None]:
 #dataset = np.array([[1,2],[70,3],[80,18],[90,2],[100,23],[110,12],[111,22],[112,19],[113,20],[114,21],[115,22],[116,34]])
#Node.Node.printNode(test_rhf[0], 0)


## Average Precision Calculation

In [30]:
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node
# set the number of trees and max height
H = 5
T = 100
mat_contents = sio.loadmat("../datasets/musk.mat")
dataset = mat_contents['X'] 
labels = mat_contents['y']
dataset = dataset.astype('float32')

# Number of independent forests to build; one AP value is printed per forest.
N_RUNS = 10

# The run counter is deliberately distinct from the row index used below, so
# the scoring loop no longer overwrites the outer loop variable.
for run in range(N_RUNS):
    test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)

    # One anomaly score per row, stored at the row's index.
    # NOTE(review): dataset.size is the total element count (rows * columns) of
    # the array — confirm this is the value anomaly_score expects.
    scores = np.empty(labels.size)
    for i, x in enumerate(dataset):
        scores[i] = a_s.anomaly_score(test_rhf, dataset.size, x)

    print("AP=", average_precision_score(labels, scores))
    

AP= 0.9979303616198283
AP= 0.9999999999999999
AP= 0.9999999999999998
AP= 0.9999999999999999
AP= 0.9997896065642753
AP= 0.9997896065642751
AP= 0.9999999999999999
AP= 0.9985562690323885
AP= 0.9777108769488685
AP= 0.9999999999999999


In [33]:
# Hard-coded average-precision values used by the confidence-interval cell.
# Presumably the satellite and cardio values were collected from the same
# experiment run on those datasets; the runs that produced them are not
# recorded in this notebook — TODO confirm.

# Ten values for the satellite dataset.
results_satellite = np.array([
    0.6170157550859576,
    0.663753486615899,
    0.6636068896731623,
    0.6095129668453847,
    0.6425214156445306,
    0.6399171649455052,
    0.6726283457778135,
    0.6404746861089395,
    0.6511386966950713,
    0.6434209656334753,
])

# Ten values for the cardio dataset.
results_cardio = np.array([
    0.5181553327242993,
    0.5926435236520571,
    0.5322619213333626,
    0.6142247044086294,
    0.59205904109385,
    0.6346077183945367,
    0.5941951278338284,
    0.5467206851013799,
    0.6132809570352109,
    0.5515181125400918,
])

# Twenty values for the musk dataset.
results_musk = np.array([
    # first ten values
    0.9999999999999999,
    1.0,
    0.9920679390186478,
    0.9999999999999999,
    1.0,
    0.9969025164739402,
    0.9999999999999999,
    0.9997896065642751,
    0.9999999999999999,
    0.9996822408419207,
    # last ten values: identical to the "AP=" lines printed by the
    # average-precision cell of this notebook
    0.9979303616198283,
    0.9999999999999999,
    0.9999999999999998,
    0.9999999999999999,
    0.9997896065642753,
    0.9997896065642751,
    0.9999999999999999,
    0.9985562690323885,
    0.9777108769488685,
    0.9999999999999999,
])

In [34]:
# Two-sided confidence interval for the mean of the musk AP values,
# based on Student's t distribution with n-1 degrees of freedom.
data = results_musk
conf = 0.95

n = len(data)
mean = np.mean(data)
# ddof=1 gives the sample standard deviation (Bessel's correction), which is
# what the t-based interval requires; np.std defaults to the population
# estimate (ddof=0) and would make the interval slightly too narrow.
std = np.std(data, ddof=1)
# Critical t value that leaves (1 - conf) / 2 probability in each tail.
tstar = sstats.t.ppf((1 + conf) / 2., n-1)
# Half-width of the interval: critical value times the standard error of the mean.
h = tstar*std/math.sqrt(n)
print("{0} +/- {1}".format(round(mean,3), round(h,3)))


0.998 +/- 0.002
