# Benchmark of recombination and mutation rate estimation: error and computation time

In [1]:
import numpy
import pandas
import seaborn
import xstr_recomb
from xstr_recomb import estimate_rates
from xstr_recomb.testing import generate_processed_family, generate_random_rates

In [2]:
try:
    from tqdm import tqdm
except ModuleNotFoundError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: return ``iterable`` as is.

        Extra positional and keyword arguments (e.g. ``desc=``, ``total=``)
        are accepted and ignored, so calls written for the real ``tqdm``
        keep working without progress bars.
        """
        return iterable

    print('tqdm module not found, no progress bars!')

In [3]:
from time import time

def random_estimation_test(n_markers, n_fam_I, n_fam_II, seed, **kwargs):
    """Simulate families from random rates, re-estimate the rates, and time it.

    Parameters
    ----------
    n_markers
        Passed to ``generate_random_rates`` to draw the rates used for the
        simulation.
    n_fam_I
        Number of families named ``FAM_I_<i>``, generated with the third
        argument of ``generate_processed_family`` set to ``True``.
    n_fam_II
        Number of families named ``FAM_II_<i>``, generated with that
        argument set to ``False``.
    seed
        If not ``None``, NumPy's global random state is seeded with it first
        (presumably the simulation helpers draw from that state, which would
        make a run reproducible for a given seed -- confirm).
    **kwargs
        Forwarded unchanged to ``estimate_rates``.

    Returns
    -------
    tuple
        ``(simulated_rates, estimated_rates, elapsed_time)``, where
        ``elapsed_time`` is the duration in seconds of the ``estimate_rates``
        call only; the simulation itself is not timed.

    Notes
    -----
    A later cell wraps this function with ``joblib.Memory.cache``.
    NOTE(review): joblib is understood to key its cache on the function
    source, so editing this function may invalidate stored results -- verify
    before relying on an existing cache.
    """
    # perf_counter is monotonic and high-resolution; unlike the wall clock it
    # cannot be skewed by system clock adjustments during a long run.
    from time import perf_counter

    if seed is not None:
        numpy.random.seed(seed)
    simulated_rates = generate_random_rates(n_markers)
    # NOTE(review): the literal 2 is interpreted by generate_processed_family;
    # its meaning is not visible here -- confirm.
    fams = [generate_processed_family(f'FAM_I_{i}', 2, True, *simulated_rates) for i in range(n_fam_I)]
    fams += [generate_processed_family(f'FAM_II_{i}', 2, False, *simulated_rates) for i in range(n_fam_II)]
    # NOTE(review): the two 0.1 arguments are presumably initial rate guesses
    # for the optimiser -- confirm against estimate_rates.
    start = perf_counter()
    estimated_rates = estimate_rates(fams, 0.1, 0.1, **kwargs)
    elapsed_time = perf_counter() - start
    return simulated_rates, estimated_rates, elapsed_time

In [4]:
# Optional on-disk memoisation of random_estimation_test (individual runs take
# minutes, see the timings reported further down).
try:
    import joblib
    # Cache directory is relative to the current working directory.
    mem = joblib.Memory('estimation_testing_cache', verbose=0)
    # Rebind the name so every later call transparently goes through the cache.
    random_estimation_test = mem.cache(random_estimation_test)
except ModuleNotFoundError:
    # Without joblib the plain, uncached function stays bound to the name.
    print('joblib module not found, cannot cache results')

In [5]:
def random_estimation_tests(n_markers, n_fam_I, n_fam_II, n_tests):
    """Run ``random_estimation_test`` for seeds ``0 .. n_tests - 1`` and tabulate.

    Every run is started with ``estimate_mutation_rates='all'``.

    Returns
    -------
    pandas.DataFrame
        One row per seed (index named ``TEST``). Columns form a three-level
        index ``(SOURCE, RATE, MARKER)``: for each of ``SIMULATED`` and
        ``ESTIMATED`` the ``n_markers - 1`` recombination columns
        (``M1-2``, ``M2-3``, ...) followed by the ``n_markers`` mutation
        columns (``M1``, ``M2``, ...), and a final
        ``('ELAPSED TIME', None, None)`` column.
        NOTE(review): this layout assumes each rates result unpacks into a
        recombination array followed by a mutation array -- confirm against
        ``generate_random_rates`` / ``estimate_rates``.
    """
    results = []
    for seed in tqdm(range(n_tests)):
        results.append(
            random_estimation_test(n_markers, n_fam_I, n_fam_II, seed, estimate_mutation_rates='all')
        )

    # Column labels: recombination between adjacent markers, then per-marker mutation.
    recombination_labels = [('RECOMBINATION', f'M{i}-{i+1}') for i in range(1, n_markers)]
    mutation_labels = [('MUTATION', f'M{i}') for i in range(1, n_markers + 1)]
    rate_labels = recombination_labels + mutation_labels

    column_tuples = []
    for source in ('SIMULATED', 'ESTIMATED'):
        column_tuples.extend((source, rate, marker) for rate, marker in rate_labels)
    column_tuples.append(('ELAPSED TIME', None, None))
    columns = pandas.MultiIndex.from_tuples(column_tuples, names=['SOURCE', 'RATE', 'MARKER'])

    # Flatten each run into a single row matching the column order above.
    rows = []
    for simulated, estimated, seconds in results:
        rows.append(numpy.concatenate([*simulated, *estimated, [seconds]]))

    table = pandas.DataFrame(rows, columns=columns)
    table.index.name = 'TEST'
    return table

In [6]:
#random_estimation_test(200, 1000, 0, 0, optimization_method='Nelder-Mead', maxiter=10000)

In [7]:
#random_estimation_test(200, 1000, 0, 0, optimization_method='L-BFGS-B', maxiter=10000)

In [8]:
#random_estimation_test(200, 1000, 0, 0, optimization_method='Powell', maxiter=10000)

In [9]:
#random_estimation_test(200, 1000, 0, 0, optimization_method='TNC', maxiter=10000)

## Tests comparing to literature

In [10]:
# First literature-sized scenario (marker and family counts presumably taken
# from a published study -- TODO cite the source).
lit1 = random_estimation_tests(
    n_markers=12,
    n_fam_I=216,
    n_fam_II=185,
    n_tests=10,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 817.67it/s]


In [11]:
# Elapsed times are recorded in seconds; report them in minutes.
elapsed_col = ('ELAPSED TIME', None, None)
lit1_minutes = lit1.loc[:, elapsed_col] / 60
mean_minutes, std_minutes = lit1_minutes.mean(), lit1_minutes.std()
print(f'average time (minutes): {mean_minutes}, std: {std_minutes}')

average time (minutes): 4.602970682779948, std: 0.88745879190279


In [12]:
# Second literature-sized scenario (marker and family counts presumably taken
# from a published study -- TODO cite the source).
lit2 = random_estimation_tests(
    n_markers=15,
    n_fam_I=54,
    n_fam_II=104,
    n_tests=10,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 830.01it/s]


In [13]:
# Elapsed times are recorded in seconds; report them in minutes.
elapsed_col = ('ELAPSED TIME', None, None)
lit2_minutes = lit2.loc[:, elapsed_col] / 60
mean_minutes, std_minutes = lit2_minutes.mean(), lit2_minutes.std()
print(f'average time (minutes): {mean_minutes}, std: {std_minutes}')

average time (minutes): 21.65960879007975, std: 2.4175920800151385


## Tests varying the number of markers

In [14]:
# Fixed data-set size; only the number of markers changes between runs.
n_fam_I, n_fam_II = 1000, 0
marker_tests = {}  # n_markers -> DataFrame returned by random_estimation_tests
for n_markers in (12, 15, 20, 30, 100):
    print(f'{n_markers=}')
    marker_tests[n_markers] = random_estimation_tests(n_markers, n_fam_I, n_fam_II, n_tests=100)

n_markers=12


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 899.84it/s]


n_markers=15


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 890.84it/s]


n_markers=20


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 909.53it/s]


n_markers=30


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 874.15it/s]


n_markers=100


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 903.01it/s]


In [15]:
# Summarise the signed estimation error (estimated - simulated) and the run
# time for every marker count.
acc = {}      # n_markers -> mean/std of the signed error per rate type, plus TIME
mae_acc = {}  # n_markers -> mean absolute error per (RATE, MARKER) column
for n_markers, tests_df in marker_tests.items():
    errors = tests_df['ESTIMATED'] - tests_df['SIMULATED']
    mae_acc[n_markers] = errors.abs().mean()
    # stack() moves the innermost column level (MARKER) into the rows, so all
    # markers of one rate type are pooled into a single column.
    summary = errors.stack().apply(['mean', 'std'])
    summary['TIME'] = tests_df[('ELAPSED TIME', None, None)].apply(['mean', 'std'])
    acc[n_markers] = summary
df = pandas.concat(acc, names=['n_markers'])
df

Unnamed: 0_level_0,RATE,MUTATION,RECOMBINATION,TIME
n_markers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,mean,0.000696,-0.00084,3.318523
12,std,0.002994,0.006164,0.553825
15,mean,0.000665,-0.000769,4.51091
15,std,0.002949,0.006517,0.793646
20,mean,0.000536,-0.000463,6.834724
20,std,0.003046,0.005966,0.973003
30,mean,0.000273,-0.000215,10.402235
30,std,0.002975,0.004429,1.611994
100,mean,3.4e-05,-4.4e-05,45.736237
100,std,0.002842,0.002362,1.877282


In [16]:
# Mean absolute error per marker count and rate type, averaged over markers.
# Group on the index levels directly: resetting the index first would turn
# the MARKER labels into a string column, which GroupBy.mean() refuses to
# average on pandas >= 2.0 (older versions silently dropped it).
# to_frame() keeps the result a one-column table, as before.
pandas.concat(mae_acc, names=['n_markers']).groupby(level=['n_markers', 'RATE']).mean().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
n_markers,RATE,Unnamed: 2_level_1
12,MUTATION,0.002066
12,RECOMBINATION,0.004408
15,MUTATION,0.002087
15,RECOMBINATION,0.004602
20,MUTATION,0.002105
20,RECOMBINATION,0.004187
30,MUTATION,0.002075
30,RECOMBINATION,0.003073
100,MUTATION,0.001953
100,RECOMBINATION,0.001634


In [17]:
# Keep only the 'mean' rows for every marker count. Selecting the label
# explicitly avoids the open-ended slice(None, 'mean'), which only returned
# the right rows because 'mean' happens to sort before 'std'.
df.loc[(slice(None), 'mean'), :]

Unnamed: 0_level_0,RATE,MUTATION,RECOMBINATION,TIME
n_markers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,mean,0.000696,-0.00084,3.318523
15,mean,0.000665,-0.000769,4.51091
20,mean,0.000536,-0.000463,6.834724
30,mean,0.000273,-0.000215,10.402235
100,mean,3.4e-05,-4.4e-05,45.736237


In [18]:
# Disabled cell (never runs because of `if False`): exports the signed errors
# of the 12-marker runs to CSV and saves a per-marker box plot as PDF.
if False:
    n_markers = 12
    # Signed estimation error: estimated minus simulated, one row per test.
    delta = marker_tests[n_markers]['ESTIMATED'] - marker_tests[n_markers]['SIMULATED']
    # Written before the columns are flattened, so the CSV keeps both header levels.
    delta.to_csv('delta12.csv')
    # Reduce the column index to the MARKER labels before plotting.
    delta.columns = delta.columns.get_level_values('MARKER')
    delta.boxplot(figsize=(8, 4))
    import matplotlib.pyplot as plt
    plt.xticks(rotation=90);
    plt.savefig('deltas12.pdf')

In [19]:
# Print the per-marker-count summary table as LaTeX source.
latex_table = pandas.concat(acc, names=['n_markers']).to_latex()
print(latex_table)

\begin{tabular}{llrrr}
\toprule
    & RATE &  MUTATION &  RECOMBINATION &       TIME \\
n\_markers & {} &           &                &            \\
\midrule
12  & mean &  0.000696 &      -0.000840 &   3.318523 \\
    & std &  0.002994 &       0.006164 &   0.553825 \\
15  & mean &  0.000665 &      -0.000769 &   4.510910 \\
    & std &  0.002949 &       0.006517 &   0.793646 \\
20  & mean &  0.000536 &      -0.000463 &   6.834724 \\
    & std &  0.003046 &       0.005966 &   0.973003 \\
30  & mean &  0.000273 &      -0.000215 &  10.402235 \\
    & std &  0.002975 &       0.004429 &   1.611994 \\
100 & mean &  0.000034 &      -0.000044 &  45.736237 \\
    & std &  0.002842 &       0.002362 &   1.877282 \\
\bottomrule
\end{tabular}



  print(pandas.concat(acc, names=['n_markers']).to_latex())


## Tests varying the number of families

In [None]:
# Fixed marker count; the number of FAM_I families doubles from 2 up to 1024.
n_markers, n_fam_II, n_tests = 15, 0, 100
fam_I_acc = {}  # n_fam_I -> DataFrame returned by random_estimation_tests
# A plain loop (not a comprehension) so results gathered so far stay in
# fam_I_acc if the run is interrupted.
for n_fam_I_log2 in range(1, 11):
    n_fam_I = 1 << n_fam_I_log2
    fam_I_acc[n_fam_I] = random_estimation_tests(n_markers, n_fam_I, n_fam_II, n_tests)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 814.34it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:20<00:00,  4.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:16<00:00,  1.31it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:25<00:00,  1.45s/it]
100%|███████████████████████████████████████████████████████████████████████

In [None]:
# NOTE(review): unconditional failure -- presumably a deliberate stop so that
# "Run All" halts here, before the exploratory cells that follow; confirm.
assert False

In [None]:
# Signed estimation error per test and rate column: estimated minus simulated.
# NOTE(review): an earlier cell rebinds `df` to the per-n_markers summary
# table, which has no 'ESTIMATED'/'SIMULATED' columns. This cell presumably
# expects `df` to be a single result of random_estimation_tests -- confirm
# before running.
delta = df['ESTIMATED'] - df['SIMULATED']
delta.head()

In [None]:
# Mean and standard deviation of the signed error for every column of `delta`.
delta.apply(['mean', 'std'])

In [None]:
# Box plot of the signed errors, one box per column of `delta`.
delta.plot.box()

In [None]:
# log2 of estimated over simulated rate: 0 means an exact estimate, +1 a
# twofold over-estimate, -1 a twofold under-estimate.
ratio = numpy.log2(df['ESTIMATED']/df['SIMULATED'])
ratio.head()

In [None]:
# Long format: move the RATE and MARKER column levels into the row index so
# that the log-ratios (r) and the simulated rates (s) can be aligned.
r = ratio.stack(['RATE', 'MARKER'])
s = df.loc[:, 'SIMULATED'].stack(['RATE', 'MARKER'])
# Box plot of the log-ratios, restricted to simulated rates of at least 1e-3.
r.loc[s >= 1e-3].unstack(['RATE', 'MARKER']).plot.box()

In [None]:
# Standard deviation of the log-ratios whose simulated rate is at least 1e-3.
r.loc[s >= 1e-3].std()

In [None]:
# Median and 2.5% / 97.5% quantiles of the log-ratio per column, restricted
# to simulated rates of at least 1e-2.
r.loc[s >= 1e-2].unstack(['RATE', 'MARKER']).quantile([0.025, 0.5, 0.975])

In [None]:
# Distribution plot of the log-ratios for simulated rates of at least 1e-2.
seaborn.displot(r.loc[s >= 1e-2])

In [None]:
# Long-format table with the stacked rate values of `df`, the log-ratio (r)
# and the simulated rate (s) side by side, ordered by increasing log-ratio.
pandas.concat([
    df.drop('ELAPSED TIME', axis=1).stack(['RATE', 'MARKER']), 
    pandas.DataFrame({'r': r, 's': s})
], axis=1).sort_values('r')

In [None]:
# Boolean mask of simulated rates at or above 1e-4. The original literal was
# `1e04` (i.e. 10000), which looks like a typo for a negative exponent given
# the 1e-3 / 1e-2 thresholds applied to `s` in the neighbouring cells.
s >= 1e-4

In [None]:
# Rows (tests) in which some column reaches its largest absolute error; a
# test is repeated once for every column whose maximum it holds.
delta.loc[sorted(delta.abs().idxmax())]

In [None]:
# Simulated and estimated recombination rates for the tests in which some
# column of `delta` reaches its largest absolute error.
df.loc[sorted(delta.abs().idxmax()), (['SIMULATED', 'ESTIMATED'], 'RECOMBINATION', )]

In [None]:
# Simulated recombination rates only, for the tests in which some column of
# `delta` reaches its largest absolute error.
df.loc[sorted(delta.abs().idxmax()), [('SIMULATED', 'RECOMBINATION')]]

In [None]:
# Box plot of the signed errors in `delta`, drawn with seaborn.
seaborn.boxplot(data=delta)

In [None]:
# Distribution plot of the elapsed-time column of `df`.
seaborn.displot(df[('ELAPSED TIME', None, None)])