# Select the selection algorithms

The idea here is that we can simplify the decision logic, reduce the binary size,
and shorten compilation time by only including a subset of the selection algorithms.
We're aiming to get algorithms that perform well in different situations and complement
each other. To do this, we iteratively remove the worst-performing algorithm,
after which the algorithms are re-evaluated on their speedups relative to the remaining
algorithms. This leaves a small set of complementary selection algorithms that performs
well over diverse inputs.

In [11]:
from select_k_dataset import load_dataframe, get_dataset

# Load the recorded select_k benchmark timings. Judging by the table displayed
# below, each row is one timed run of one algorithm (`algo`) for a given
# key_type / index_type / row / col / k / use_index_input / use_memory_pool
# combination, with the measurement in `time` (units not stated here - TODO confirm).
df = load_dataframe("select_k_times.json")
# a bare expression at the end of a cell makes the notebook display the dataframe
df

Unnamed: 0,key_type,index_type,algo,row,col,k,use_index_input,use_memory_pool,time
0,float,int64_t,kRadix8bits,1,1024,1,0,1,0.000017
1,float,int64_t,kRadix11bits,1,1024,1,0,1,0.000012
2,float,int64_t,kRadix11bitsExtraPass,1,1024,1,0,1,0.000012
3,float,int64_t,kWarpImmediate,1,1024,1,0,1,0.000009
4,float,int64_t,kWarpFiltered,1,1024,1,0,1,0.000010
...,...,...,...,...,...,...,...,...,...
21405,float,int64_t,kRadix11bits,7,1254,7696,0,1,0.000015
21406,float,int64_t,kRadix11bitsExtraPass,7,1254,7696,0,1,0.000015
21407,float,int64_t,kRadix8bits,7,2189,7960,0,1,0.000030
21408,float,int64_t,kRadix11bits,7,2189,7960,0,1,0.000019


In [12]:
from collections import Counter

def rank_algos(df, use_relative_speedup=False):
    """Rank algorithms by how much they are credited in ``get_dataset(df)``.

    ``get_dataset`` (defined in ``select_k_dataset``, not shown here) returns
    three values; only the last two are used: a sequence of algorithm labels
    and a parallel sequence of weights. Per the notebook's own comments each
    label is presumably the fastest algorithm for one benchmark configuration
    and each weight its speedup - confirm against ``get_dataset``.

    Args:
        df: benchmark dataframe, passed to ``get_dataset`` unchanged.
        use_relative_speedup: when true, every label adds its weight to the
            algorithm's score; otherwise every label adds exactly 1.

    Returns:
        A list of ``(algorithm, score)`` tuples ordered from highest to lowest
        score. Algorithms that never occur among the labels are not listed.
    """
    _, labels, speedups = get_dataset(df)
    scores = Counter()
    # zip() pairs labels with weights in both modes, so the number of tallied
    # entries does not depend on the scoring mode
    for name, gain in zip(labels, speedups):
        if use_relative_speedup:
            scores[name] += gain
        else:
            scores[name] += 1
    # most_common() with no argument returns every entry, highest score first,
    # with ties kept in first-encountered order
    return scores.most_common()

In [13]:
# Show, over the whole dataset, the number of times each algorithm is fastest for a
# given k / # of rows / # of cols / dtype / memory pool etc.
# The list is sorted from the most wins to the fewest.
rank_algos(df)

[('kWarpDistributedShm', 1157),
 ('kRadix11bits', 1064),
 ('kWarpImmediate', 447),
 ('kRadix11bitsExtraPass', 369),
 ('kFaissBlockSelect', 302),
 ('kWarpDistributed', 42),
 ('kWarpFiltered', 23),
 ('kRadix8bits', 4)]

In [15]:
# kRadix8bits seems to have a performance issue with 64 bit index types: it is one
# of the worst performing algorithms for 64 bit indices, but one of the top 3 for 32 bit.
# NOTE(review): the ranking recorded for this cell is identical to the ranking of
# the full dataframe, and the uint32_t ranking in the next cell is empty, so the
# loaded file appears to contain only int64_t rows - the 32 bit half of this claim
# cannot be reproduced from the data shown here; confirm with a dataset that
# includes uint32_t runs.
rank_algos(df[df.index_type == "int64_t"])

[('kWarpDistributedShm', 1157),
 ('kRadix11bits', 1064),
 ('kWarpImmediate', 447),
 ('kRadix11bitsExtraPass', 369),
 ('kFaissBlockSelect', 302),
 ('kWarpDistributed', 42),
 ('kWarpFiltered', 23),
 ('kRadix8bits', 4)]

In [16]:
# Same ranking restricted to 32 bit indices, for comparison with the int64_t cell.
# NOTE(review): the recorded result is an empty list, which suggests the loaded
# file has no rows with index_type == "uint32_t" - confirm before relying on it.
rank_algos(df[df.index_type == "uint32_t"])

[]

In [20]:
# do an algorithm selection pass, repeatedly remove the lowest performing algorithm
#
# The idea here is that we can simplify the decision logic, reduce the binary size
# and speed up the compilation time by only including a subset of selection algorithms.
# we're aiming to get algorithms that perform well in different situations, and complement
# each other - so to do this, we're iteratively removing the worst performing algorithm,
# after which algorithms are re-evaluated on their speedups relative to the remaining
# algorithms. This gets us a minimum spanning set of selection algorithms that performs
# well over diverse inputs.
#
# note: the lowest performing algorithm here might actually be pretty good, but
# just not provide much benefit over another similar algorithm. 
# As an example, kWarpDistributed  is an excellent selection algorithm, but in testing 
# kWarpDistributedShm is slightly faster than it in situations where it does well, 
# meaning that it gets removed early on in this loop
# Only benchmark runs that used the memory pool take part in the selection.
current = df[df.use_memory_pool == True]
algos = set(df.algo)

# we're arbitrarily getting this down to 4 selection algorithms
num_selected = 4
while len(algos) > num_selected:
    times = rank_algos(current, use_relative_speedup=False)
    # rank_algos only lists algorithms that were tallied at least once, so a
    # candidate that is missing from the ranking scored nothing at all and has
    # to be dropped before anything that still appears in it. Sorting makes the
    # choice deterministic; this also avoids indexing into an empty ranking.
    unranked = sorted(algos.difference(name for name, _ in times))
    algo = unranked[0] if unranked else times[-1][0]
    algos.remove(algo)
    # Narrow `current` instead of re-filtering `df`: going back to `df` here
    # would silently drop the use_memory_pool restriction applied above.
    current = current[current.algo.isin(algos)]

print("selected", algos)
# ranking of the surviving algorithms among themselves (displayed by the notebook)
rank_algos(current)

selected {'kWarpImmediate', 'kRadix11bitsExtraPass', 'kRadix11bits', 'kWarpDistributedShm'}


[('kWarpDistributedShm', 1266),
 ('kRadix11bits', 1156),
 ('kWarpImmediate', 577),
 ('kRadix11bitsExtraPass', 409)]

In [10]:
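# Experimenting with different subsets of index type / dtype / use_memory_pool seems
# to fairly consistently show that kRadix11bits / kWarpDistributedShm / kFaissBlockSelect
# all get selected here.
# NOTE(review): the run recorded above (4 algorithms kept) selected kWarpImmediate and
# kRadix11bitsExtraPass rather than kFaissBlockSelect - re-check this claim.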