In [1]:
%load_ext autoreload
%autoreload 2

In [88]:
import sys
from timeit import timeit
from tf.app import use
from pack import deepSize
from sparse import PlainString, PlainInt, SparseString, SparseInt

In [98]:
# Small hand-made samples of (index, value) pairs used to exercise the stores below.
# The int sample includes 0 and negative values; the string sample includes
# non-ASCII text (accented Latin and cuneiform signs).
intItems = [(5, 500), (7, -2), (9, -1), (10, 0), (12, 1000)]
stringItems = [(5, "foo ür bår"), (10, "𒉡𒊌=𒊍𒋫𒆷𒀠𒇷𒈬")]

In [102]:
# Exercise PlainInt on the int sample: look up every index 0..14.
# Indices that are not in intItems come back as None (see the recorded output).
# NOTE(review): the list printed first in the recorded output looks like a debug
# print coming from PlainInt itself — confirm and remove it in sparse.py.
df = PlainInt(intItems)
for i in range(15):
    print(f"df[{i}] = {df.get(i)}")

[-1, -1, -1, -1, -1, 500, -1, -3, -1, -2, 0, -1, 1000]
df[0] = None
df[1] = None
df[2] = None
df[3] = None
df[4] = None
df[5] = 500
df[6] = None
df[7] = -2
df[8] = None
df[9] = -1
df[10] = 0
df[11] = None
df[12] = 1000
df[13] = None
df[14] = None


In [104]:
# Same lookups as for PlainInt, now through SparseInt;
# the recorded output is identical for all indices 0..14.
df = SparseInt(intItems)
for i in range(15):
    print(f"df[{i}] = {df.get(i)}")

df[0] = None
df[1] = None
df[2] = None
df[3] = None
df[4] = None
df[5] = 500
df[6] = None
df[7] = -2
df[8] = None
df[9] = -1
df[10] = 0
df[11] = None
df[12] = 1000
df[13] = None
df[14] = None


In [106]:
# Exercise PlainString on the string sample: look up every index 0..14.
# The non-ASCII values at 5 and 10 come back unchanged, all other indices give None.
df = PlainString(stringItems)
for i in range(15):
    print(f"df[{i}] = {df.get(i)}")

df[0] = None
df[1] = None
df[2] = None
df[3] = None
df[4] = None
df[5] = foo ür bår
df[6] = None
df[7] = None
df[8] = None
df[9] = None
df[10] = 𒉡𒊌=𒊍𒋫𒆷𒀠𒇷𒈬
df[11] = None
df[12] = None
df[13] = None
df[14] = None


In [105]:
# Same lookups as for PlainString, now through SparseString;
# the recorded output is identical for all indices 0..14.
df = SparseString(stringItems)
for i in range(15):
    print(f"df[{i}] = {df.get(i)}")

df[0] = None
df[1] = None
df[2] = None
df[3] = None
df[4] = None
df[5] = foo ür bår
df[6] = None
df[7] = None
df[8] = None
df[9] = None
df[10] = 𒉡𒊌=𒊍𒋫𒆷𒀠𒇷𒈬
df[11] = None
df[12] = None
df[13] = None
df[14] = None


In [107]:
# Load the BHSA dataset through the Text-Fabric app API.
# `A` is read as a global by testPerformance() below.
# NOTE(review): presumably ':clone' selects a local clone of the app and
# silent='deep' suppresses all loading messages — confirm against the TF docs.
A = use('bhsa:clone', silent='deep')

In [111]:
def testPerformance(feat, check=True):
    api = A.api
    maxNode = api.F.otype.maxNode
    Fs = api.Fs
    featObj = Fs(feat)
    items = sorted(featObj.items())
    firstI = items[0][0]
    halfI = items[len(items) // 2][0]
    lastI = items[-1][0]
    nonI = lastI + 100
    
    dataP = (PlainString if featObj.meta["valueType"] == 'str' else PlainInt)(items)
    dataS = (SparseString if featObj.meta["valueType"] == 'str' else SparseInt)(items)
    
    tfLookup = featObj.v
    ppLookup = dataP.get
    spLookup = dataS.get
    
    print(f"{len(items)} items")
    print(f"Some items:")
    print(f"\tfirst: {firstI:>7} = {tfLookup(firstI)}")
    print(f"\thalf : {halfI:>7} = {tfLookup(halfI)}")
    print(f"\tlast : {lastI:>7} = {tfLookup(lastI)}")
    print(f"\tnon  : {nonI:>7} = {tfLookup(nonI)}")
    print(f"Size (TF compiled) = {deepSize(featObj.data):>8}")
    print(f"Size (Plain)       = {dataP.size()}")
    print(f"Size (Sparse)      = {dataS.size()}")
    
    if check:
        print("checking correctness ...")
        errors = []
        for i in range(maxNode + 5):
            tv = tfLookup(i)
            sv = spLookup(i)
            pv = spLookup(i)
            if tv != sv or tv != pv:
                errors.append(i)
        if errors:
            print(f"{len(errors)} errors")
            for i in errors[0:10]:
                print(f'''{i:>7} TF: "{tfLookup(i)}" PP: "{ppLookup(i)}" SP: "{spLookup(i)}''')
        else:
            print("all values correct")
    else:
        print("correctness not checked")
    
    def execute(method, v):
        upperIndex = 430000
        key0 = 739 # not in the data of nametype
        key1 = 740 # in the data of nametype
        fI = firstI
        hI = halfI
        lI = lastI
        nI = nonI
        
        def doall():
            n = 0
            for i in range(fI - 10, lI + 10):
                if v(i) is not None:
                    n += 1
            
        times1 = 1000000
        times2 = max((1, 1000000 // (lI - fI)))
        
        print(method)
        for task in (
            ("first", "v(fI)", times1),
            ("half", "v(hI)", times1),
            ("last", "v(lI)", times1),
            ("non", "v(nI)", times1),
            ("all", "doall()", times2),
        ):
            (label, code, times) = task
            sys.stdout.write(f"\t{label:<5} {times:>7}x")
            sys.stdout.flush()
            t = timeit(code, globals=locals(), number=times)
            sys.stdout.write(f"  {t:>.4f}\n")
    
    execute("TF", tfLookup)
    execute("PP", ppLookup)
    execute("SP", spLookup)

In [127]:
# nametype: string feature with few items (38184) spread over node numbers up to 1446794
testPerformance('nametype')

38184 items
Some items:
	first:     740 = pers
	half :  210677 = topo
	last : 1446794 = pers
	non  : 1446894 = None
Size (TF compiled) =  2380461
Size (Plain)       =  7395867
	offsets : I:  5787184
	values  : x:  1608683

Size (Sparse)      =   418826
	indices : I:   279176
	offsets : B:    69795
	bounds  : B:    69794
	values  : x:       61

checking correctness ...
all values correct
TF
	first 1000000x  0.1926
	half  1000000x  0.1879
	last  1000000x  0.1990
	non   1000000x  0.1495
	all         1x  0.2454
PP
	first 1000000x  0.5935
	half  1000000x  0.5922
	last  1000000x  0.5876
	non   1000000x  0.1618
	all         1x  0.6786
SP
	first 1000000x  1.1868
	half  1000000x  1.2123
	last  1000000x  1.1702
	non   1000000x  0.8637
	all         1x  1.3198


In [128]:
# sp: string feature with 435817 items; the Sparse store needs only 56 bytes of values
testPerformance('sp')

435817 items
Some items:
	first:       1 = prep
	half :  217909 = adjv
	last : 1446799 = adjv
	non  : 1446899 = None
Size (TF compiled) = 33175225
Size (Plain)       =  8511068
	offsets : I:  5787204
	values  : x:  2723864

Size (Sparse)      =  2355537
	indices : I:  1570320
	offsets : B:   392581
	bounds  : B:   392580
	values  : x:       56

checking correctness ...
all values correct
TF
	first 1000000x  0.1873
	half  1000000x  0.1857
	last  1000000x  0.1938
	non   1000000x  0.1543
	all         1x  0.2862
PP
	first 1000000x  0.5532
	half  1000000x  0.5921
	last  1000000x  0.6009
	non   1000000x  0.1708
	all         1x  0.7388
SP
	first 1000000x  1.1303
	half  1000000x  1.3025
	last  1000000x  1.2033
	non   1000000x  0.9306
	all         1x  1.6315


In [129]:
# g_word_utf8: string feature with 426584 items on nodes 1..426584;
# here the Sparse store comes out slightly larger than the Plain one
testPerformance('g_word_utf8')

426584 items
Some items:
	first:       1 = בְּ
	half :  213293 = 
	last :  426584 = יָֽעַל
	non  :  426684 = None
Size (TF compiled) = 41387185
Size (Plain)       =  6671649
	offsets : I:  1706344
	values  : x:  4965305

Size (Sparse)      =  6747070
	indices : I:  1706328
	offsets : I:  1706332
	bounds  : I:  1706328
	values  : x:  1628082

checking correctness ...
all values correct
TF
	first 1000000x  0.1929
	half  1000000x  0.1875
	last  1000000x  0.1895
	non   1000000x  0.1512
	all         2x  0.2199
PP
	first 1000000x  0.6235
	half  1000000x  0.5071
	last  1000000x  0.6589
	non   1000000x  0.1646
	all         2x  0.6270
SP
	first 1000000x  1.1826
	half  1000000x  1.1837
	last  1000000x  1.3549
	non   1000000x  0.9052
	all         2x  1.2940


In [130]:
# voc_lex_utf8: string feature with 435817 items on node numbers up to 1446799
testPerformance('voc_lex_utf8')

435817 items
Some items:
	first:       1 = בְּ
	half :  217909 = רַב
	last : 1446799 = יָשֵׁשׁ
	non  : 1446899 = None
Size (TF compiled) = 33881856
Size (Plain)       = 10411046
	offsets : I:  5787204
	values  : x:  4623842

Size (Sparse)      =  5319347
	indices : I:  1737052
	offsets : I:  1737056
	bounds  : I:  1737052
	values  : x:   108187

checking correctness ...
all values correct
TF
	first 1000000x  0.1887
	half  1000000x  0.1853
	last  1000000x  0.1988
	non   1000000x  0.1483
	all         1x  0.2746
PP
	first 1000000x  0.6282
	half  1000000x  0.6556
	last  1000000x  0.6691
	non   1000000x  0.1636
	all         1x  0.7889
SP
	first 1000000x  1.2105
	half  1000000x  1.3767
	last  1000000x  1.3605
	non   1000000x  0.9455
	all         1x  1.7392


In [131]:
# otype: a value on every node (1446799 items); the Sparse store shrinks to 183 bytes
testPerformance('otype')

1446799 items
Some items:
	first:       1 = word
	half :  723400 = phrase
	last : 1446799 = lex
	non  : 1446899 = None
Size (TF compiled) =  8162441
Size (Plain)       = 16457200
	offsets : I:  5787204
	values  : x: 10669996

Size (Sparse)      =      183
	indices : I:       56
	offsets : B:       15
	bounds  : B:       14
	values  : x:       98

checking correctness ...
all values correct
TF
	first 1000000x  0.2143
	half  1000000x  0.4172
	last  1000000x  0.4202
	non   1000000x  0.3470
	all         1x  0.5754
PP
	first 1000000x  0.5491
	half  1000000x  0.5909
	last  1000000x  0.5922
	non   1000000x  0.1618
	all         1x  0.8993
SP
	first 1000000x  0.7655
	half  1000000x  0.7843
	last  1000000x  0.7562
	non   1000000x  0.4478
	all         1x  1.1638


In [132]:
# code: integer feature with 90688 items on the node range 515674..606361
testPerformance('code')

90688 items
Some items:
	first:  515674 = 0
	half :  561018 = 201
	last :  606361 = 410
	non  :  606461 = None
Size (TF compiled) =  8877392
Size (Plain)       =  1212724
	values  : h:  1212724

Size (Sparse)      =   494082
	indices : I:   329388
	values  : h:   164694

checking correctness ...
all values correct
TF
	first 1000000x  0.1917
	half  1000000x  0.1837
	last  1000000x  0.1859
	non   1000000x  0.1490
	all        11x  0.2558
PP
	first 1000000x  0.2764
	half  1000000x  0.2548
	last  1000000x  0.2742
	non   1000000x  0.1652
	all        11x  0.3187
SP
	first 1000000x  0.8770
	half  1000000x  0.9618
	last  1000000x  0.8997
	non   1000000x  0.8550
	all        11x  1.0174
