# Random Histogram Forest

In [1]:
from timeit import timeit

### Testing: timing the Cython RHF (forest construction + scoring) on the satellite dataset

In [2]:
# Source text executed once by timeit before each measurement (not included in
# the reported time): imports, forest parameters and the satellite data.
setup_code='''
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node
# set the number of trees and max height
H = 5
T = 100
mat_contents = sio.loadmat("../datasets/satellite.mat")
dataset = mat_contents['X'] 
labels = mat_contents['y']
dataset = dataset.astype('float32') 
'''

# Source text that is actually timed: build the forest, then score every row.
# NOTE(review): the printed label says "(train)" but the scoring loop is timed too.
# NOTE(review): dataset.size is the total element count (rows * columns), not the
# number of rows — confirm this is what anomaly_score expects.
code = '''
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)
for i, x in enumerate(dataset):
    score = a_s.anomaly_score(test_rhf, dataset.size, x)
'''

# Ten independent measurements, each executing the timed statement exactly once.
for trial in range(10):
    elapsed = timeit(stmt=code, setup=setup_code, number=1)
    print("Total time for rhf-cython (train) = ", elapsed)

# Disabled scoring/printing loop kept as a bare string literal; because it is the
# cell's final expression, the notebook echoes its repr as the cell output.
'''
for i, x in enumerate(dataset):
    score = anomaly_score(test_rhf, dataset.size, x)
    np.append(scores, [score, labels[i]])
    print("score = " + str(score) + " | anomaly? " + str(labels[i]))
'''


Total time for rhf-cython (train) =  239.46417132100032
Total time for rhf-cython (train) =  233.30278345999977
Total time for rhf-cython (train) =  237.83752649799862
Total time for rhf-cython (train) =  240.80996020999737
Total time for rhf-cython (train) =  237.6489033339967
Total time for rhf-cython (train) =  234.7397418949986
Total time for rhf-cython (train) =  233.13837992299523
Total time for rhf-cython (train) =  237.30533448100323
Total time for rhf-cython (train) =  231.1335203540002
Total time for rhf-cython (train) =  238.0386776029991


'\nfor i, x in enumerate(dataset):\n    score = anomaly_score(test_rhf, dataset.size, x)\n    np.append(scores, [score, labels[i]])\n    print("score = " + str(score) + " | anomaly? " + str(labels[i]))\n'

### Testing Python vs Cython speed

In [5]:
import scipy.io as sio

import kurtosis_sum

# Load the satellite features in this cell instead of relying on a `dataset`
# variable left behind in the kernel by another cell: on a fresh kernel no
# earlier cell defines it (the timing cell only creates it inside timeit's own
# setup namespace), so the call below would raise NameError.
# A separate name is used so that a `dataset` defined elsewhere is not clobbered.
# The float32 cast mirrors how the other cells prepare the data for the
# compiled modules.
# NOTE(review): path copied from the RHF timing cell — confirm it is the
# intended location of satellite.mat.
satellite_X = sio.loadmat("../datasets/satellite.mat")['X'].astype('float32')

# Kurtosis sum over every attribute (column indices 0 .. n_columns - 1 inclusive).
kurtosis_sum.kurtosis_sum(satellite_X, satellite_X.shape[1]-1)

47.71512985229492

In [None]:
 #dataset = np.array([[1,2],[70,3],[80,18],[90,2],[100,23],[110,12],[111,22],[112,19],[113,20],[114,21],[115,22],[116,34]])
#Node.Node.printNode(test_rhf[0], 0)


In [38]:

# Source text executed by timeit before each measurement (not included in the
# reported time): imports, the pure-Python reference implementation and the data.
setup_code = '''
import scipy.io as sio
import scipy.stats as sstats
import random
import numpy as np
import kurtosis_sum as ks_cy
from timeit import timeit

def kurtosis_sum(X, d):
    """Return the sum of log(Kurtosis(X[:, a]) + 1) over attributes a = 0..d inclusive."""
    # named `total` so the builtin sum() is not shadowed
    total = 0

    # walk the matrix column by column: each column holds one attribute
    for a in range(0, d+1):
        # scipy's kurtosis defaults to the Fisher definition (Pearson - 3),
        # so + 4 gives Pearson kurtosis + 1.
        # Use the public scipy.stats.kurtosis; the scipy.stats.stats namespace
        # is deprecated.
        total += np.log(sstats.kurtosis(X[:,a])+4)

    return total
mat_contents = sio.loadmat("satellite.mat")


test_data = mat_contents['X']

'''



# Python

# Timed statement: the pure-Python kurtosis sum over every column of the data.
code = '''
kurtosis_sum(test_data, test_data.shape[1]-1)
'''

# Ten measurements; each one reports the total time of 100 calls.
for trial in range(10):
    elapsed = timeit(stmt=code, setup=setup_code, number=100)
    print("Total time for kurtosis sum (python) = ", elapsed)

Total time for kurtosis sum (python) =  1.11729780800124
Total time for kurtosis sum (python) =  1.0739590100001806
Total time for kurtosis sum (python) =  1.0495012139999744
Total time for kurtosis sum (python) =  1.0497420669998974
Total time for kurtosis sum (python) =  1.0502796979999403
Total time for kurtosis sum (python) =  1.0653134689982835
Total time for kurtosis sum (python) =  1.0392168799990031
Total time for kurtosis sum (python) =  1.0634308279986726
Total time for kurtosis sum (python) =  1.0469607429986354
Total time for kurtosis sum (python) =  1.1195489530000486


In [39]:
# Cython

# Source text executed by timeit before each measurement (not included in the
# reported time): imports (the compiled module is bound to ks_cy) and the data.
setup_code = '''
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import kurtosis_sum as ks_cy
from timeit import timeit

mat_contents = sio.loadmat("satellite.mat")


test_data = mat_contents['X']

'''

# Timed statement: ks_cy.kurtosis_sum2 called with the last column index.
code = "ks_cy.kurtosis_sum2(test_data, test_data.shape[1]-1)"

# Ten measurements; each one reports the total time of 100 calls.
for trial in range(10):
    elapsed = timeit(stmt=code, setup=setup_code, number=100)
    print("Total time for kurtosis sum (cython) = ", elapsed)

Total time for kurtosis sum (cython) =  1.0919095469998865
Total time for kurtosis sum (cython) =  1.1623525440008962
Total time for kurtosis sum (cython) =  1.0357378009994136
Total time for kurtosis sum (cython) =  1.1084858289996191
Total time for kurtosis sum (cython) =  1.1147826469987194
Total time for kurtosis sum (cython) =  1.084498797001288
Total time for kurtosis sum (cython) =  1.0787436030004756
Total time for kurtosis sum (cython) =  1.1252668859997357
Total time for kurtosis sum (cython) =  1.1269218009983888
Total time for kurtosis sum (cython) =  1.08067842499986


In [1]:
import scipy.io as sio
import scipy.stats as sstats
import random 
import numpy as np
import rht
import rhf
import anomaly_score as a_s
import Node
# Forest parameters: number of trees (T) and maximum tree height (H).
T = 100
H = 5

# The satellite file is still read, but only its label vector is kept; the
# feature matrix (mat_contents['X']) is not used in this cell.
# NOTE(review): `labels` therefore does not correspond row-for-row to the
# 12-point sample below — confirm it is not consumed together with `dataset`.
mat_contents = sio.loadmat("../datasets/satellite.mat")
labels = mat_contents['y']

# Small hand-written two-column sample, stored directly as float32.
dataset = np.array(
    [
        [1, 2],
        [70, 3],
        [80, 18],
        [90, 2],
        [100, 23],
        [110, 12],
        [111, 22],
        [112, 19],
        [113, 20],
        [114, 21],
        [115, 22],
        [116, 34],
    ],
    dtype='float32',
)

# Build the forest on the sample with the parameters above.
test_rhf = rhf.rhf(X=dataset, t=T, nd=0, h=H)


In [3]:
import importlib

# Reload the Node module in the running kernel before scoring.
importlib.reload(Node)

# Second argument handed to anomaly_score for every point.
# NOTE(review): dataset.size is the total element count (rows * columns), not
# the number of rows — confirm this is what anomaly_score expects.
n_values = dataset.size

# Echo each point with its index, then score it against the forest.
# The score itself is computed but not displayed.
for idx, point in enumerate(dataset):
    print("i=", idx)
    print("x=", point)
    score = a_s.anomaly_score(test_rhf, n_values, point)

i= 0
x= [1. 2.]
i= 1
x= [70.  3.]
i= 2
x= [80. 18.]
i= 3
x= [90.  2.]
i= 4
x= [100.  23.]
i= 5
x= [110.  12.]
i= 6
x= [111.  22.]
i= 7
x= [112.  19.]
i= 8
x= [113.  20.]
i= 9
x= [114.  21.]
i= 10
x= [115.  22.]
i= 11
x= [116.  34.]


b'\x00\x00\x00\x00'