# Likelihood computation benchmarks
In this notebook we compare the run time of the various likelihood functions implemented in the recombulator-x module.

In [None]:
#debug this
import numpy
import numba
#print(numba.

def inheritance_vectors(n: int):
    """Yield every binary inheritance vector of length n.

    Vectors come out in counting order: element j of the i-th vector is
    bit j of i, so 2**n vectors are produced in total.

    A single int8 array is overwritten in place and yielded each time, so
    callers must copy it if a vector has to outlive the next iteration step.
    """
    bits = numpy.zeros(n, dtype=numpy.int8)
    n_vectors = 1 << n
    for pattern in range(n_vectors):
        for pos in range(n):
            bits[pos] = (pattern >> pos) & 1
        yield bits  # shared buffer, deliberately not a copy
    return bits  # has no effect for callers; kept because numba needs it

# Wrap the generator with numba.jit and print every vector it yields for n = 3
# (2**3 = 8 vectors are expected).
jif = numba.jit(inheritance_vectors)
for x in jif(3):
    print(x)

[0 0 0]
[1 0 0]
[0 1 0]
[1 1 0]
[0 0 1]
[1 0 1]
[0 1 1]
[1 1 1]


In [1]:
import numba

In [7]:
# Show the installed numba version, read from numba's private _version module.
numba._version.get_versions()

{'version': '0.56.0', 'full': 'f75c45a8dd7c7acfdfa8b41bea18695a88447c3e'}

In [1]:
import time
import numpy
import pandas
import seaborn
import matplotlib.pyplot as plt
from tqdm import tqdm
import itertools

In [2]:
import recombulatorx
from recombulatorx import compute_family_likelihood
from recombulatorx.testing import generate_processed_family, generate_random_rates

Retrieve the available implementations, which vary depending on the availability of the numba module.

In [3]:
# Names of the likelihood implementations registered in
# recombulatorx.likelihood.implementations (a dict keys view).
available_implementations = recombulatorx.likelihood.implementations.keys()
available_implementations

dict_keys(['dynamic', 'direct-loop', 'direct-numpy', 'dynamic-numba', 'direct-loop-numba'])

In [4]:
# FIXME: 'direct-loop-numba' is excluded from the benchmark below.
# NOTE(review): the reason for the exclusion is not recorded here - confirm.
available_implementations = [i for i in available_implementations if i != 'direct-loop-numba']

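Define the benchmarking parameters: the number of families of each type, and the maximum mean time (in seconds) allowed for a single family likelihood computation. An implementation whose mean time exceeds this limit for a family type is dropped from all subsequent runs with more markers.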

In [5]:
# Mean elapsed time (seconds) per likelihood call above which an
# (implementation, family type) pair is excluded from later marker counts.
max_time = 1
# Number of 'FAM_I_*' families generated for each marker count.
n_fam_I = 100
# Number of 'FAM_II_*' families generated for each marker count.
n_fam_II = 100

In [6]:
# Benchmark every available implementation on generated families of
# increasing size. For each marker count, new rates and n_fam_I + n_fam_II
# families are generated and every likelihood call is timed. Any
# (implementation, family type) pair whose mean time exceeds max_time is
# skipped for all later marker counts.
time_df_acc = {}    # n_markers -> DataFrame of per-family likelihoods and timings
slow_funcs = set()  # (implementation, family type) pairs found to be too slow
first_run = True
# Materialise the marker counts so that tqdm knows the total and can show an ETA.
marker_counts = list(itertools.chain(range(2, 101), range(10**4 - 50, 10**4 + 1)))
for n_markers in tqdm(marker_counts):

    rates = generate_random_rates(n_markers)

    fams_I = [generate_processed_family(f'FAM_I_{i}', 2, True, *rates) for i in range(n_fam_I)]
    fams_II = [generate_processed_family(f'FAM_II_{i}', 2, False, *rates) for i in range(n_fam_II)]
    time_acc = {}
    if first_run: # run every function once, this is mainly for numba since it needs to compile!
        for impl in available_implementations:
            compute_family_likelihood(fams_I[0], *rates, implementation=impl)
            compute_family_likelihood(fams_II[0], *rates, implementation=impl)
        first_run = False

    for fams in fams_I, fams_II:
        for impl in available_implementations:
            for fam in fams:
                ftype = 'type I' if fam.is_mother_phased else 'type II'
                if (impl, ftype) in slow_funcs: continue
                # perf_counter is a monotonic, high-resolution clock meant for
                # measuring short durations; time.time() can jump and is coarser.
                t0 = time.perf_counter()
                lh = compute_family_likelihood(fam, *rates, implementation=impl)
                t1 = time.perf_counter()
                time_acc[(impl, ftype, fam.fid)] = [lh, t1 - t0]
    # Nothing was timed: every (implementation, family type) pair has been dropped.
    if len(time_acc) == 0:
        break

    # One row per (implementation, family type, family id).
    df = pandas.DataFrame(time_acc, index=['Family Likelihood', 'Elapsed Time']).T
    df.index.names = ['Implementation', 'Family Type', 'Family ID']

    # check if any function is taking too long
    mean_dt = df.reset_index().groupby(['Implementation', 'Family Type'])['Elapsed Time'].mean()
    new_slow = set(mean_dt.index[mean_dt > max_time])
    slow_funcs |= new_slow
    if new_slow:
        print('dropped', new_slow, 'at', n_markers, 'markers')
    time_df_acc[n_markers] = df.reset_index()

7it [03:35, 62.63s/it]

dropped {('direct-loop', 'type II')} at 8 markers


10it [06:23, 68.35s/it]

dropped {('direct-numpy', 'type II')} at 11 markers


14it [11:52, 96.59s/it]

dropped {('direct-loop', 'type I')} at 15 markers


15it [13:52, 103.66s/it]

dropped {('dynamic', 'type II')} at 16 markers


17it [15:24, 75.65s/it] 

leaf monotonicity failure 2.2587785400506548e-33 2.2587785400506572e-33 -2.3947971802426073e-48 -1.0602177848692062e-15
leaf monotonicity failure 2.2587785400506548e-33 2.2587785400506572e-33 -2.3947971802426073e-48 -1.0602177848692062e-15


18it [17:37, 92.71s/it]

dropped {('direct-numpy', 'type I')} at 19 markers


21it [20:10, 60.69s/it]

dropped {('dynamic-numba', 'type II')} at 21 markers


150it [41:38, 16.66s/it]


In [7]:
# Stack the per-marker-count frames into one long table: the dict keys become
# the '# of markers' column and the per-frame row index ('row') is discarded.
full_df = pandas.concat(time_df_acc, names=['# of markers', 'row']).reset_index('row', drop=True).reset_index()
full_df

Unnamed: 0,# of markers,Implementation,Family Type,Family ID,Family Likelihood,Elapsed Time
0,2,dynamic,type I,FAM_I_0,0.006449,0.000352
1,2,dynamic,type I,FAM_I_1,0.006474,0.000175
2,2,dynamic,type I,FAM_I_2,0.153094,0.000173
3,2,dynamic,type I,FAM_I_3,0.152492,0.000158
4,2,dynamic,type I,FAM_I_4,0.155309,0.000157
...,...,...,...,...,...,...
38395,10000,dynamic-numba,type I,FAM_I_95,0.000000,0.000485
38396,10000,dynamic-numba,type I,FAM_I_96,0.000000,0.000432
38397,10000,dynamic-numba,type I,FAM_I_97,0.000000,0.000470
38398,10000,dynamic-numba,type I,FAM_I_98,0.000000,0.000555


In [8]:
# Write the benchmark results to a tab-separated file, without the row index.
full_df.to_csv('benchmarks.tsv', index=False, sep='\t')

In [9]:
# Reload the results from benchmarks.tsv; the plotting cells below read this full_df.
full_df = pandas.read_csv('benchmarks.tsv', sep='\t')

## Plotting times

In [None]:
# Elapsed time versus number of markers over the whole benchmark range, one line
# per implementation (colour) and family type (line style), on a log time axis.
fig = plt.figure(figsize=(15, 8))
seaborn.lineplot(data=full_df, x='# of markers', y='Elapsed Time', hue='Implementation', style='Family Type')
plt.yscale('log')

In [None]:
# Zoom on the small-marker range. The cut-off is 35 so that the figure matches
# the 'likelihood_benchmarks_1-35.pdf' file name it is saved under.
fig = plt.figure(figsize=(10, 6))
seaborn.lineplot(data=full_df.loc[full_df['# of markers'] <= 35], x='# of markers', y='Elapsed Time', hue='Implementation', style='Family Type')
plt.yscale('log')

In [None]:
# Save the figure currently bound to `fig` (created in the previous cell) as a PDF.
fig.savefig('likelihood_benchmarks_1-35.pdf')

In [None]:
# Label each row with the block of the marker axis it belongs to:
# 'LEFT' for fewer than 26 markers, 'RIGHT' for more than 100, NaN otherwise.
plot_df = full_df.copy()
# Create the column with object dtype up front: assigning strings into a float
# (NaN-filled) column is deprecated in recent pandas versions.
plot_df['BLOCK'] = pandas.Series(float('nan'), index=plot_df.index, dtype=object)
plot_df.loc[plot_df['# of markers'] < 26, 'BLOCK'] = 'LEFT'
plot_df.loc[plot_df['# of markers'] > 100, 'BLOCK'] = 'RIGHT'

In [None]:
# Plot only the rows labelled 'LEFT' (fewer than 26 markers) on a log time axis.
fig = plt.figure(figsize=(10, 6))
seaborn.lineplot(
    data=plot_df.query('BLOCK == "LEFT"'), x='# of markers', y='Elapsed Time', 
    hue='Implementation', style='Family Type', 
)
plt.yscale('log')

In [None]:
# Save the figure currently bound to `fig` (created in the previous cell) as a PDF.
fig.savefig('likelihood_benchmarks_1-25.pdf')

In [None]:
# Line plots faceted by BLOCK (one column per block value), with a shared y axis
# and independent x axes. The object returned by relplot is kept in `fig` so the
# next cell can save it.
fig = seaborn.relplot(data=plot_df, x='# of markers', y='Elapsed Time', 
                hue='Implementation', style='Family Type', 
                col='BLOCK', kind='line', facet_kws={'sharey': True, 'sharex': False},
)
# NOTE(review): plt.yscale acts on the current axes only; with sharey=True the
# log scale presumably propagates to the other facet - confirm in the output.
plt.yscale('log')

In [None]:
# Save the faceted plot bound to `fig` (created in the previous cell) as a PDF.
fig.savefig('likelihood_benchmarks_split.pdf')

In [None]:
# Deliberate stop: this always raises, so "Run All" halts here.
# NOTE(review): the cells below use column names ('n_markers', 'fam', 'rep',
# 'lh', 'func') that the full_df built above does not have - presumably leftover
# from an earlier version of the benchmark; confirm before re-enabling them.
assert False

In [None]:
# Here we run some tests to check whether the approximate version works.

In [None]:
# Standard deviation of the likelihood within each (n_markers, fam, rep) group.
# NOTE(review): these column names do not match the columns of the full_df
# produced by the benchmark above - confirm which table this cell expects.
full_df.groupby(['n_markers', 'fam', 'rep'])['lh'].std()

In [None]:
# For every (n_markers, fam, rep) group, express each likelihood relative to the
# largest likelihood among the rows of that group whose func is not "emp".
acc = {}
for l, sdf in full_df.groupby(['n_markers', 'fam', 'rep']):
    if len(sdf):
        reference_lh = sdf.query('func != "emp"')['lh'].max()
        # assign() builds a new frame instead of writing a column into the
        # group sub-frame, which avoids pandas' SettingWithCopyWarning.
        sdf = sdf.assign(rel_lh=sdf['lh'] / reference_lh)
        acc[l] = sdf
full_df_r = pandas.concat(acc)

In [None]:
# Smallest relative likelihood over all groups.
full_df_r['rel_lh'].min()

In [None]:

# For every (n_markers, fam, rep) group, express each likelihood relative to the
# largest likelihood among the rows of that group whose func is not "emp".
acc = {}
for l, sdf in full_df.groupby(['n_markers', 'fam', 'rep']):
    if len(sdf):
        reference_lh = sdf.query('func != "emp"')['lh'].max()
        # assign() builds a new frame instead of writing a column into the
        # group sub-frame, which avoids pandas' SettingWithCopyWarning.
        sdf = sdf.assign(rel_lh=sdf['lh'] / reference_lh)
        acc[l] = sdf
full_df_r = pandas.concat(acc)

# Relative likelihood versus number of markers, one line per func and fam.
fig = plt.figure(figsize=(10, 5))
seaborn.lineplot(data=full_df_r, x='n_markers', y='rel_lh', hue='func', style='fam')
plt.ylim((0, 1.1))
plt.title('likelihood accuracy')
# NOTE(review): `pdf` is not defined anywhere in the visible notebook; it
# presumably was a multi-page PDF writer opened in a removed cell - confirm.
pdf.savefig()
plt.close()