In [None]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import pickle
import os

#### Load data

In [None]:
# load data generated in Notebooks 1 and 2
# Can be skipped if running demo
# NOTE: pd.read_pickle executes arbitrary code embedded in the file; only load
# pickles produced by your own runs of the earlier notebooks.
# All paths are relative to the current working directory.

# Repeat-length count tables; downstream code indexes them as counts['A'|'B'][motif].
CHM13_counts = pd.read_pickle('repeat_distributions/CHM13_counts.pickle')
random_counts = pd.read_pickle('repeat_distributions/random_counts.pickle')
subonly_counts = pd.read_pickle('repeat_distributions/subonly_counts.pickle')

# De novo expansion / contraction / non-expansion rate tables, indexed by motif.
denovo_exp_rate = pd.read_pickle('denovo/denovo_exp_rate.pickle')
denovo_con_rate = pd.read_pickle('denovo/denovo_con_rate.pickle')
denovo_nonexp_rate = pd.read_pickle('denovo/denovo_nonexp_rate.pickle')

# "_poisson" variants of the same tables; downstream code selects one replicate
# with table[motif][boot].
denovo_exp_rate_poisson = pd.read_pickle('denovo/denovo_exp_rate_poisson.pickle')
denovo_con_rate_poisson = pd.read_pickle('denovo/denovo_con_rate_poisson.pickle')
denovo_nonexp_rate_poisson = pd.read_pickle('denovo/denovo_nonexp_rate_poisson.pickle')

# Substitution rates; rows are selected with .loc[motif] (or .loc[boot].loc[motif]
# for the poisson table) and then read by event-name key (e.g. 'Afission').
denovo_substitution_context_rate = pd.read_pickle('denovo/denovo_mut_freq_triplets.pickle')
denovo_substitution_context_rate_poisson = pd.read_pickle('denovo/denovo_mut_freq_triplets_poisson.pickle')

#### Define functions

In [None]:
# evolve function
def mut_evolve_dist_AB(A_count_input, B_count_input, starting_conditions, boot = None, input_nuc = 'A', mut = True, mutonly = False, speedup_multiplier = 1, output_components = False, stochastics = None, reflective = True):
    """Advance the A and B repeat-length count distributions by one update step.

    Bin ``i`` of each count array holds the number of tracts of length ``i + 1``
    (in motif units). The function accumulates, per bin, the counts flowing in
    and out through substitution events (``mut``) and indel events
    (``not mutonly``), scales them by ``speedup_multiplier`` and applies them.

    Parameters
    ----------
    A_count_input, B_count_input : 1-D numpy arrays of per-length counts.
        They are copied before use and padded locally with two zero bins.
    starting_conditions : 4-sequence
        ``(exp_rate_A_AA, con_rate_A_AA, nonexp_rate_A_AB, B_indel_rates)``.
        Only read when ``mutonly`` is False. ``B_indel_rates`` is indexed
        ``[0]``, ``[1]``, ``[2]`` below.
    boot : None or key
        None reads the module-level ``denovo_substitution_context_rate``;
        otherwise ``denovo_substitution_context_rate_poisson.loc[boot]``.
    input_nuc : str
        Row key into the substitution table; ``len(input_nuc)`` converts
        motif units to bases.
    mut : bool
        When True the substitution terms are computed.
    mutonly : bool
        When False the indel (expansion / contraction / fusion / fission)
        terms are computed.
    speedup_multiplier : number
        Factor applied to every in/out change vector.
    output_components : bool
        When True, return the individual A-side component vectors instead of
        the updated counts. These vectors are only defined when ``mut`` is
        True, so ``output_components=True`` with ``mut=False`` raises NameError.
    stochastics : None or anything
        If not None, total in/out changes are replaced by Poisson draws with
        those totals as means (the value itself is not otherwise used).
    reflective : bool
        When True, contributions that fall beyond the last bin are added to
        the last bin instead of being dropped.

    Returns
    -------
    ``(A_count_output, B_count_output, flag, boundary_flag)``, or a tuple of
    component arrays when ``output_components`` is True. ``flag`` is True when
    the scaled outflow of some A bin (excluding the last 10 bins) exceeds that
    bin's count; ``boundary_flag`` is True when either last bin exceeds 1000.
    """
    exp_rate_A_AA, con_rate_A_AA, nonexp_rate_A_AB, B_indel_rates = starting_conditions
    # keep unpadded copies of the inputs; these are what the final update is applied to
    A_count_output = A_count_input.copy(); B_count_output = B_count_input.copy()
    A_bins = len(A_count_input)
    B_bins = len(B_count_input)
    # lengths 1..bins+2 in motif units, and the same lengths expressed in bases
    A_length_array = np.array(range(1,A_bins+3))
    A_length_array_bases = np.array(range(1,A_bins+3)) * len(input_nuc) ### including motif length
    B_length_array = np.array(range(1,B_bins+3))
    B_length_array_bases = np.array(range(1,B_bins+3)) * len(input_nuc) ### including motif length
    # append two empty bins so shifted slices such as [2:bins+2] stay in range
    A_count_input = np.insert(A_count_input, A_bins, [0,0])
    B_count_input = np.insert(B_count_input, B_bins, [0,0])
    #A_count_input = A_count_input.astype('int64')
    #B_count_input = B_count_input.astype('int64')

    # substitution rates for this motif (point estimate, or one replicate selected by `boot`)
    if boot is None:
        denovo_sub = denovo_substitution_context_rate.loc[input_nuc]
    else:
        denovo_sub = denovo_substitution_context_rate_poisson.loc[boot].loc[input_nuc]

    # distribution info
    # total bases held in B tracts, and the fractions of those bases that sit in
    # length-1 tracts, in tract interiors (L-2 units per tract of length L>=3),
    # and at tract ends (2 units per tract of length >=2)
    total_B_bases = (B_count_input[:B_bins] * B_length_array_bases[:B_bins]).sum()
    B_L1_base_portion = ((B_count_input[0] * len(input_nuc)) / (B_count_input[:B_bins]* B_length_array_bases[:B_bins]).sum()) ### including motif length
    B_nonflank_base_portion = (B_count_input[2:B_bins+2] * B_length_array_bases[:B_bins]).sum() / total_B_bases  ### include portion of triplets 1nt away???
    B_flank_base_portion = (B_count_input[1:B_bins] * 2 * len(input_nuc)).sum() / total_B_bases ### including motif length
    
    # same quantities for A tracts
    total_A_bases = (A_count_input[:A_bins] * A_length_array_bases[:A_bins]).sum()
    A_nonflank_base_portion = (A_count_input[2:A_bins+2] * A_length_array_bases[:A_bins]).sum() / total_A_bases
    A_flank_base_portion = (A_count_input[1:A_bins] * 2 * len(input_nuc)).sum() / total_A_bases ### including motif length
    
    # per-bin accumulators: inflow is kept separate from outflow so that each can be
    # Poisson-sampled independently when `stochastics` is set
    total_A_change_in = np.array([0.0]*A_bins); total_B_change_in = np.array([0.0]*B_bins)
    total_A_change_out = np.array([0.0]*A_bins); total_B_change_out = np.array([0.0]*B_bins)

    if mut == True:
        # A>B which adds to the A count locally. add these to A
        # (the [1:] shift feeds bin L from the count of bin L+1)
        A_mut_in_local_A_B = 2 * len(input_nuc) * denovo_sub['Acontraction'] * A_count_input[1:]
        A_mut_out_local_A_B = -2 * len(input_nuc) * denovo_sub['Acontraction'] * A_count_input
        # length-1 tracts leave at the 'A10' rate instead
        A_mut_out_local_A_B[0] = -1 * len(input_nuc) * A_count_input[0] * denovo_sub['A10']
        #A_mut_in_local_A_B = A_mut_out_local_A_B[1:]

        # total number of A>B fission events
        A_mut_out_fission = np.insert((-denovo_sub['Afission'] * A_count_input[2:] * A_length_array_bases[:A_bins]), 0, [0, 0]) # used to subtract from A_count, starting from L=3 (with 0 for L=1,2)
        # each fission creates 2 As. add these to A
        A_mut_in_fission =  np.array([np.sum(((2/A_length_array[:A_bins]) * -A_mut_out_fission[2:A_bins+2])[L-1:]) for L in A_length_array[:A_bins]]) ### use length_array_bases???
 
        # B>A which adds to the A count locally (which must come from B_L>1)
        # A from B>A leaving the -1 bin
        A_len_freq = (A_count_input / A_count_input.sum())[:A_bins]
        A_mut_out_local_B_A = -denovo_sub['Aexpansion'] * B_flank_base_portion * total_B_bases * A_len_freq
        # B>A creating A_L=1 from B_L>2
        B_A_into_L1 = total_B_bases * B_nonflank_base_portion * denovo_sub['A01']
        # what leaves bin L arrives in bin L+1; the new length-1 tracts are prepended as bin 0
        A_mut_in_local_B_A = np.insert(-A_mut_out_local_B_A, 0, B_A_into_L1)
        
        # fusion process for A
        A_len_freq = (A_count_input / A_count_input.sum())[:A_bins]
        # weight of every (Li, Lj) pair is freq[Li]*freq[Lj], binned at length Li+Lj+1;
        # dropping bincount index 0 makes array index k correspond to length k+1
        A_fusion_freq_in = np.bincount((np.add.outer(A_length_array[:A_bins], A_length_array[:A_bins])+1).ravel(), weights = np.outer(A_len_freq, A_len_freq).ravel())[1:]
        A_mut_in_fusion_A_B = A_fusion_freq_in * denovo_sub['Afusion'] * B_L1_base_portion * total_B_bases
        # every fusion removes two existing tracts
        A_mut_out_fusion_A_B = (-2) *A_len_freq * denovo_sub['Afusion'] * B_L1_base_portion * total_B_bases
        
        
        # total B>A
        # B>A which adds to the B count locally. add these to B
        B_mut_in_local_B_A = 2 * len(input_nuc) * denovo_sub['Aexpansion'] * B_count_input[1:]
        B_mut_out_local_B_A = -2 * len(input_nuc) * denovo_sub['Aexpansion'] * B_count_input
        # length-1 B tracts leave at the 'Afusion' rate instead
        B_mut_out_local_B_A[0] = -1 * B_L1_base_portion * total_B_bases * denovo_sub['Afusion']

        # total number of B>A fission events
        B_mut_out_fission = np.insert((-denovo_sub['A01'] * B_count_input[2:] * B_length_array_bases[:B_bins]), 0, [0, 0]) # used to subtract from B_count, starting from L=3 (with 0 for L=1,2)
        # each fission creates 2 Bs. add these to B
        B_mut_in_fission =  np.array([np.sum(((2/B_length_array[:B_bins]) * -B_mut_out_fission[2:B_bins+2])[L-1:]) for L in B_length_array[:B_bins]]) ### use length_array_bases???

        # A>B which adds to the B count locally (which must come from A_L>1)
        # B from A>B leaving the -1 bin
        B_len_freq = (B_count_input / B_count_input.sum())[:B_bins]
        B_mut_out_local_A_B = -denovo_sub['Acontraction'] * A_flank_base_portion * total_A_bases * B_len_freq
        # A>B creating B_L=1 from A_L>2
        A_B_into_L1 = total_A_bases * A_nonflank_base_portion * denovo_sub['Afission']
#        A_B_into_L1 = -A_mut_out_fission.sum()
        B_mut_in_local_A_B = np.insert(-B_mut_out_local_A_B, 0, A_B_into_L1)
        
        # fusion process for B
        B_len_freq = (B_count_input / B_count_input.sum())[:B_bins]
        B_fusion_freq_in = np.bincount((np.add.outer(B_length_array[:B_bins], B_length_array[:B_bins])+1).ravel(), weights = np.outer(B_len_freq, B_len_freq).ravel())[1:]
        B_mut_in_fusion_B_A = B_fusion_freq_in * denovo_sub['A10'] * A_count_input[0] * len(input_nuc)
        B_mut_out_fusion_B_A = (-2) * B_len_freq * denovo_sub['A10'] * A_count_input[0] * len(input_nuc)

        # update counts for next round (with absorbing boundary)
        # (anything at index >= bins is dropped by the [:bins] slices)
        total_A_change_in += A_mut_in_local_A_B[:A_bins] + A_mut_in_local_B_A[:A_bins] + A_mut_in_fission[:A_bins] + A_mut_in_fusion_A_B[:A_bins]
        total_B_change_in += B_mut_in_local_B_A[:B_bins] + B_mut_in_local_A_B[:B_bins] + B_mut_in_fission[:B_bins] + B_mut_in_fusion_B_A[:B_bins]
        total_A_change_out += A_mut_out_local_A_B[:A_bins] + A_mut_out_local_B_A[:A_bins] + A_mut_out_fission[:A_bins] + A_mut_out_fusion_A_B[:A_bins]
        total_B_change_out += B_mut_out_local_B_A[:B_bins] + B_mut_out_local_A_B[:B_bins] + B_mut_out_fission[:B_bins] + B_mut_out_fusion_B_A[:B_bins]

        # apply reflecting boundary
        # (the overflow dropped above is folded back into the last bin)
        if reflective == True:
            total_A_change_in[A_bins-1] += A_mut_in_local_A_B[A_bins:].sum() + A_mut_in_local_B_A[A_bins:].sum() + A_mut_in_fission[A_bins:].sum() + A_mut_in_fusion_A_B[A_bins:].sum()
            total_B_change_in[B_bins-1] += B_mut_in_local_B_A[B_bins:].sum() + B_mut_in_local_A_B[B_bins:].sum() + B_mut_in_fission[B_bins:].sum() + B_mut_in_fusion_B_A[B_bins:].sum()
            total_A_change_out[A_bins-1] += A_mut_out_local_A_B[A_bins:].sum() + A_mut_out_local_B_A[A_bins:].sum() + A_mut_out_fission[A_bins:].sum() + A_mut_out_fusion_A_B[A_bins:].sum()
            total_B_change_out[B_bins-1] += B_mut_out_local_B_A[B_bins:].sum() + B_mut_out_local_A_B[B_bins:].sum() + B_mut_out_fission[B_bins:].sum() + B_mut_out_fusion_B_A[B_bins:].sum()
           
            
    if mutonly == False:
        # A expansions in and out
        A_exp_out = A_count_input[:A_bins] * -exp_rate_A_AA[:A_bins]
        # expansions out of bin L arrive in bin L+1; bin 0 is fed from B bases at rate B_indel_rates[0]
        A_exp_in = np.insert(-A_exp_out, 0, B_indel_rates[0]*total_B_bases)

        # A contractions in and out
        A_con_out = A_count_input[:A_bins+1] * -con_rate_A_AA[:A_bins+2]
        # contractions out of bin L+1 arrive in bin L
        A_con_in = -A_con_out[1:]

        # A fusions from B1>B0 deletions
        # (the pair-length frequencies are already available when the substitution block ran)
        if (mut != True):
            A_len_freq = (A_count_input / A_count_input.sum())[:A_bins]
            A_fusion_freq_in = np.bincount((np.add.outer(A_length_array[:A_bins], A_length_array[:A_bins])+1).ravel(), weights = np.outer(A_len_freq, A_len_freq).ravel())[1:]
        # the extra [1:] shift places a fused pair at length Li+Lj rather than Li+Lj+1
        A_mut_in_fusion_Bdel = A_fusion_freq_in[1:A_bins+1] * B_indel_rates[1] * B_L1_base_portion * total_B_bases
        A_mut_out_fusion_Bdel = (-2) *A_len_freq * B_indel_rates[1] * B_L1_base_portion * total_B_bases

        # A fission events from insertions
        A_nonexp_fissions_out = -A_count_input * nonexp_rate_A_AB # used to calculate fission_in, starting with L=2 going to 2x L=1
        # each fission creates 2 As. add these to A
        A_nonexp_in_fission =  np.array([np.sum(((2/A_length_array[:A_bins]) * -A_nonexp_fissions_out[1:A_bins+1])[L-1:]) for L in A_length_array[:A_bins]])


        # B expansions in and out
        B_exp_out = B_count_input[:B_bins] * -B_indel_rates[2] * B_length_array[:B_bins] # B>BB rates are flat, per base
        # bin 0 is fed by the summed A fission term (a negative quantity as written)
        B_exp_in = np.insert(-B_exp_out, 0, A_nonexp_fissions_out.sum())

        # B contractions in and out
        B_con_out = B_count_input[:B_bins+1] * -B_indel_rates[1] * B_length_array[:B_bins+1] # B>_ rates are flat, per base
        B_con_in = -B_con_out[1:]

        # B fusions from A1>A0 deletions
        if (mut != True):
            B_len_freq = (B_count_input / B_count_input.sum())[:B_bins]
            B_fusion_freq_in = np.bincount((np.add.outer(B_length_array[:B_bins], B_length_array[:B_bins])+1).ravel(), weights = np.outer(B_len_freq, B_len_freq).ravel())[1:]
        B_mut_in_fusion_Adel = B_fusion_freq_in[1:B_bins+1] * A_count_input[0] * con_rate_A_AA[0]
        B_mut_out_fusion_Adel = 2 *B_len_freq * A_count_input[0] * -con_rate_A_AA[0]
                    
        # B fission events from insertions
        B_nonexp_fissions_out = -B_count_input * B_indel_rates[0] * B_length_array # used to calculate fission_in, starting with L=2 going to 2x L=1
        # each fission creates 2 Bs. add these to B
        B_nonexp_in_fission =  np.array([np.sum(((2/B_length_array[:B_bins]) * -B_nonexp_fissions_out[1:B_bins+1])[L-1:]) for L in B_length_array[:B_bins]])
            
        # update counts for next round (with absorbing boundary)
        total_A_change_in += A_exp_in[:A_bins] + A_con_in[:A_bins] + A_mut_in_fusion_Bdel[:A_bins] + A_nonexp_in_fission[:A_bins]
        total_B_change_in += B_exp_in[:B_bins] + B_con_in[:B_bins] + B_mut_in_fusion_Adel[:B_bins] + B_nonexp_in_fission[:B_bins]
        total_A_change_out += A_exp_out[:A_bins] + A_con_out[:A_bins] + A_mut_out_fusion_Bdel[:A_bins] + A_nonexp_fissions_out[:A_bins]
        total_B_change_out += B_exp_out[:B_bins] + B_con_out[:B_bins] + B_mut_out_fusion_Adel[:B_bins] + B_nonexp_fissions_out[:B_bins]

        # apply reflecting boundary
        if reflective == True:
            total_A_change_in[A_bins-1] += A_exp_in[A_bins:].sum() + A_con_in[A_bins:].sum() + A_mut_in_fusion_Bdel[A_bins:].sum() + A_nonexp_in_fission[A_bins:].sum()
            total_B_change_in[B_bins-1] += B_exp_in[B_bins:].sum() + B_con_in[B_bins:].sum() + B_mut_in_fusion_Adel[B_bins:].sum() + B_nonexp_in_fission[B_bins:].sum()
            total_A_change_out[A_bins-1] += A_exp_out[A_bins:].sum() + A_con_out[A_bins:].sum() + A_mut_out_fusion_Bdel[A_bins:].sum() + A_nonexp_fissions_out[A_bins:].sum()
            total_B_change_out[B_bins-1] += B_exp_out[B_bins:].sum() + B_con_out[B_bins:].sum() + B_mut_out_fusion_Adel[B_bins:].sum() + B_nonexp_fissions_out[B_bins:].sum()

    
    # flag to stop the simulation if more repeats are removed from a bin than exist in that bin (excluding the last 10 noisy bins)
    # NOTE: only the A distribution is checked here; B outflow is not tested
    flag = ((np.abs(total_A_change_out[:A_bins-10]) * speedup_multiplier > A_count_output[:A_bins-10]).sum()) > 0

    # apply speedup
    total_A_change_in *= speedup_multiplier; total_A_change_out *= speedup_multiplier
    total_B_change_in *= speedup_multiplier; total_B_change_out *= speedup_multiplier
    
    if stochastics is not None:
        # the sum of poisson random variables is poisson-distributed. not necessary to run n poisson samples
        # inflow is clipped at 0 from below and outflow at 0 from above so the Poisson means are non-negative;
        # draws come from numpy's global RNG, so seed it externally for reproducible runs
        total_A_change_in = np.random.poisson(total_A_change_in.clip(0))
        total_A_change_out = -1 * np.random.poisson(np.abs(total_A_change_out.clip(max=0)))
        total_B_change_in = np.random.poisson(total_B_change_in.clip(0))
        total_B_change_out = -1 * np.random.poisson(np.abs(total_B_change_out.clip(max=0)))   
    
    total_A_change = total_A_change_in + total_A_change_out
    total_B_change = total_B_change_in + total_B_change_out
    
    # update counts for next round
    A_count_output = A_count_output[:A_bins] + total_A_change[:A_bins]
    B_count_output = B_count_output[:B_bins] + total_B_change[:B_bins]

    # remove negative values
    A_count_output[A_count_output <0] = 0            
    B_count_output[B_count_output <0] = 0

    # True when more than 1000 tracts have piled up in the last bin of either distribution
    boundary_flag = ((A_count_output[A_bins-1] > 1000) | (B_count_output[B_bins-1] > 1000))
    
    if output_components == True:
        # diagnostic mode: return the individual A-side terms (requires mut == True, see docstring)
        if mutonly == False:
            return  A_mut_in_local_A_B[:A_bins], A_mut_out_local_A_B[:A_bins], A_mut_in_local_B_A[:A_bins], A_mut_out_local_B_A[:A_bins], A_mut_in_fission[:A_bins], A_mut_out_fission[:A_bins], A_mut_in_fusion_A_B[:A_bins], A_mut_out_fusion_A_B[:A_bins], A_exp_in[:A_bins], A_exp_out[:A_bins], A_con_in[:A_bins], A_con_out[:A_bins], A_mut_in_fusion_Bdel[:A_bins], A_mut_out_fusion_Bdel[:A_bins], A_nonexp_in_fission[:A_bins], A_nonexp_fissions_out[:A_bins]
        else:
            return  A_mut_in_local_A_B[:A_bins], A_mut_out_local_A_B[:A_bins], A_mut_in_local_B_A[:A_bins], A_mut_out_local_B_A[:A_bins], A_mut_in_fission[:A_bins], A_mut_out_fission[:A_bins], A_mut_in_fusion_A_B[:A_bins], A_mut_out_fusion_A_B[:A_bins]
    else:
        return A_count_output, B_count_output, flag, boundary_flag

In [None]:
def extend_power_law(power, start_rate, start_len, end_len=100):
    """Extrapolate a rate beyond ``start_len`` along a power law.

    Returns a Series indexed by the integer lengths ``start_len + 1`` through
    ``end_len`` (inclusive). The value at length ``L`` is
    ``L**power / (start_len**power / start_rate)``, i.e. the curve that would
    equal ``start_rate`` at ``L == start_len``.
    """
    lengths = list(range(start_len + 1, end_len + 1))
    # normaliser that pins the curve to start_rate at start_len
    anchor = (start_len ** power) / start_rate
    unscaled = pd.Series([length ** power for length in lengths], index = lengths)
    return unscaled / anchor

def multiply_then_powerlaw(exp_power, con_power, mult, A_bins = 100, boot = None, L_mult = 9, L_mult_nonexp = 9, motif = 'A', fill = False, nonexp_factor = False):
    """Build expansion, contraction and non-expansion rate curves for one motif.

    Each curve keeps the observed rates for lengths ``0 .. L-1``, sets the rate
    at length ``L`` to ``mult`` times the observed rate at ``L-1``, and extends
    from ``L+1`` to ``A_bins+3`` with ``extend_power_law`` anchored on that
    multiplied value. ``L`` is ``L_mult`` for expansion/contraction and
    ``L_mult_nonexp`` for non-expansion.

    Parameters
    ----------
    exp_power, con_power : exponents of the expansion / contraction extensions.
        The non-expansion extension also uses ``exp_power``.
    mult : multiplier applied at the anchor length.
    boot : None for the point-estimate tables, otherwise the replicate key
        used as ``table[motif][boot]`` on the ``_poisson`` tables.
    fill : when True, zeros in the observed rates are replaced by NaN and
        interpolated (``method='from_derivatives'``) before use.
    nonexp_factor : when False the non-expansion curve is built from its own
        observed rates; otherwise it is ``exp * nonexp_factor``.

    Returns
    -------
    ``(name, exp, con, nonexp)`` where ``name`` encodes the settings and the
    other three are Series indexed by length. ``nonexp`` at length 1 is
    always forced to 0.
    """
    # observed rates: point estimates, or one replicate when `boot` is given
    if boot is not None:
        bootname = '_boot'+str(boot)
        observed = [denovo_exp_rate_poisson[motif][boot],
                    denovo_con_rate_poisson[motif][boot],
                    denovo_nonexp_rate_poisson[motif][boot]]
    else:
        bootname = ''
        observed = [denovo_exp_rate[motif],
                    denovo_con_rate[motif],
                    denovo_nonexp_rate[motif]]

    fillname = ''
    if fill == True:
        fillname = '_fill'
        observed = [rate.replace(0, np.nan).interpolate(method = 'from_derivatives') for rate in observed]
    exp_observed, con_observed, nonexp_observed = observed

    def _stitch(rate, power, anchor_len):
        # observed head + multiplied anchor point + power-law tail
        anchor_rate = rate[anchor_len-1] * mult
        head = rate.reindex(range(anchor_len))
        anchor = pd.Series(anchor_rate, index = [anchor_len])
        tail = extend_power_law(power, anchor_rate, anchor_len, A_bins+3)
        return pd.concat([head, anchor, tail])

    exp = _stitch(exp_observed, exp_power, L_mult)
    con = _stitch(con_observed, con_power, L_mult)

    if nonexp_factor == False:
        nonexpname = ''
        nonexp = _stitch(nonexp_observed, exp_power, L_mult_nonexp)
    else:
        nonexpname = '_nonexp_x' + str(nonexp_factor)
        nonexp = exp * nonexp_factor
    nonexp.loc[1] = 0

    name = ''.join(['mult_L', str(L_mult), '_x', str(mult), '_extend_pl_', str(exp_power), '_', str(con_power), nonexpname, fillname, bootname])
    return name, exp, con, nonexp

In [None]:
# setup function
def setup_evolve(exp_power=1, con_power=1, mult=1, boot = None, stochastics = None, L_mult = 9, L_mult_nonexp = 9, fill = False, nonexp_factor = False, A_bins = 100, B_bins = 100, input_nuc = 'A', mutonly = False, exp_zero = False, con_zero = False, nonexp_zero = False, different_input = False, random_start = False, subonly_start = False, ceiling = None, custom_rates = False):
    """Assemble starting counts, per-tract rates and a run name for a simulation.

    Parameters
    ----------
    exp_power, con_power, mult, boot, L_mult, L_mult_nonexp, fill, nonexp_factor :
        Forwarded to ``multiply_then_powerlaw`` (unless ``custom_rates`` is given).
    stochastics : None or value; only used to tag the run name.
    A_bins, B_bins : number of length bins kept for the A and B distributions.
    input_nuc : motif key used to select counts and rates.
    mutonly : when True no indel rates are built and the four rate slots of
        the return value are None.
    exp_zero, con_zero, nonexp_zero : zero out the corresponding rate array.
    different_input : False, or a 2-sequence ``(A_series, B_series)`` of
        length-indexed counts that replaces the stored start distributions.
    random_start, subonly_start : choose ``random_counts`` / ``subonly_counts``
        instead of ``CHM13_counts`` (``random_start`` takes precedence).
    ceiling : None (or False) for no ceiling; otherwise the per-tract rate at
        which exp/con/nonexp rates are capped from the first offending length on.
    custom_rates : False, or ``(exp_rate, con_rate, name)`` with length-indexed
        Series. In that case the non-expansion rate is ``exp_rate * nonexp_factor``
        (all zeros when ``nonexp_factor`` is left at False).

    Returns
    -------
    ``(A_count_input, B_count_input, exp_rate, con_rate, nonexp_rate,
    B_indel_rate, name)``. The rate arrays are per tract (per-unit rate times
    length) with lengths ``A_bins``, ``A_bins+1`` and ``A_bins+2`` respectively.
    """
    def _binned(series, n_bins):
        # counts for lengths 1..n_bins; lengths missing from the table become 0
        return np.nan_to_num(series.reindex(range(1, n_bins+1)).values)

    def _tag(base):
        # append the start-distribution and stochastic tags used in file names
        if random_start:
            base = base + '_randomstart'
        elif subonly_start:
            base = base + '_subonlystart'
        else:
            base = base + '_CHM13start'
        if stochastics is not None:
            base = base + '_stochastics_' + str(stochastics)
        return base

    # set up counts
    if different_input is False:
        if random_start:
            source_counts = random_counts
        elif subonly_start:
            source_counts = subonly_counts
        else:
            source_counts = CHM13_counts
        A_count_input = _binned(source_counts['A'][input_nuc], A_bins)
        B_count_input = _binned(source_counts['B'][input_nuc], B_bins)
    else:
        A_count_input = _binned(different_input[0], A_bins)
        B_count_input = _binned(different_input[1], B_bins)

    # substitution-only runs need no indel rates
    if mutonly:
        name = 'mutonly' if boot is None else 'mutonly_boot' + str(boot)
        return A_count_input, B_count_input, None, None, None, None, _tag(name)

    # set up rates
    if custom_rates is False:
        # L_mult / L_mult_nonexp are passed through (they used to be pinned to 9 here,
        # which silently ignored the caller's values)
        name, exp_rate, con_rate, nonexp_rate = multiply_then_powerlaw(exp_power = exp_power, con_power = con_power, A_bins = A_bins, mult = mult, boot = boot, L_mult = L_mult, L_mult_nonexp = L_mult_nonexp, motif = input_nuc, fill = fill, nonexp_factor = nonexp_factor)
        B_indel_rate = np.array([exp_rate[0], con_rate[0], nonexp_rate[0]])
    else:
        exp_rate, con_rate, name = custom_rates
        nonexp_rate = exp_rate * nonexp_factor
        B_indel_rate = np.array([denovo_exp_rate[input_nuc][0], denovo_con_rate[input_nuc][0], denovo_nonexp_rate[input_nuc][0]])

    # change rates from per unit to per STR (multiplication yields fresh arrays,
    # so the in-place edits below never touch the source Series)
    exp_rate = exp_rate.values[1:A_bins+1] * np.array(range(1, A_bins+1))
    con_rate = con_rate.values[1:A_bins+2] * np.array(range(1, A_bins+2))
    nonexp_rate = nonexp_rate.values[1:A_bins+3] * np.array(range(1, A_bins+3))

    # ceiling=False is treated like None: comparing rates against False would cap
    # every positive rate at 0
    if ceiling is not None and ceiling is not False:
        first_over = []
        if (exp_rate > ceiling).sum() > 0:
            first_over.append(pd.Series(exp_rate > ceiling).idxmax())
        if (con_rate > ceiling).sum() > 0:
            first_over.append(pd.Series(con_rate > ceiling).idxmax())
        if len(first_over) > 0:
            # cap all three rates from the first index where exp or con exceeds the ceiling
            ceiling_loc = min(np.array(first_over))
            print( '\r' + 'rate ceiling reached at L=' + str(ceiling_loc), end = ' ')
            exp_rate[ceiling_loc:] = ceiling
            con_rate[ceiling_loc:] = ceiling
            nonexp_rate[ceiling_loc:] = ceiling
        name = name + '_ceiling_' + str(ceiling)

    if exp_zero:
        exp_rate *= 0
    if con_zero:
        con_rate *= 0
    if nonexp_zero:
        nonexp_rate *= 0

    return A_count_input, B_count_input, exp_rate, con_rate, nonexp_rate, B_indel_rate, _tag(name)

## Constant speedup

In [None]:
def run_simulation(exp_power=1, con_power=1, mult=1, boot = None, L_mult = 9, L_mult_nonexp = 9, fill = False, nonexp_factor = False, A_bins = 100, B_bins = 100, input_nuc = 'A', mutonly = False, speedup = 3, rounds = 5, overwrite = False, stochastics = None, random_start = False, subonly_start = False, ceiling = False, reflective = True, sim_dir = 'grid_output/'):
    """Run a constant-speedup simulation and pickle the recorded time series.

    The model is stepped ``10**rounds`` times with a step multiplier of
    ``10**speedup``; the counts are recorded at step 0 and every
    ``max(1, 1e6 / 10**speedup)`` steps. The run stops early when
    ``mut_evolve_dist_AB`` raises its numerical-error flag.

    Parameters mirror ``setup_evolve`` plus:
    speedup, rounds : base-10 exponents of the step multiplier / step count.
    overwrite : when False and the A output file already exists in
        ``sim_dir``, nothing is run and None is returned.
    ceiling : rate ceiling forwarded to ``setup_evolve``; False (the default
        here) and None both mean "no ceiling".
    reflective : forwarded to ``mut_evolve_dist_AB``.
    sim_dir : output directory prefix (must end with a path separator).

    Returns
    -------
    ``(A_counts_timeseries, B_counts_timeseries)`` DataFrames with one column
    per recorded step, or None when the run was skipped.
    """
    # False would otherwise be compared against the rates as a ceiling of 0
    if ceiling is False:
        ceiling = None
    starting_conditions = setup_evolve(exp_power=exp_power, con_power=con_power, mult=mult, boot = boot, stochastics = stochastics, L_mult = L_mult, L_mult_nonexp = L_mult_nonexp, fill = fill, nonexp_factor = nonexp_factor, A_bins = A_bins, B_bins = B_bins, input_nuc = input_nuc, mutonly = mutonly, random_start = random_start, subonly_start = subonly_start, ceiling = ceiling)
    run_name = starting_conditions[6]

    # Both output files share everything after the 'Adist'/'Bdist' prefix.
    # (The B file name uses A_bins too; kept so existing outputs stay addressable.)
    file_tail = '_'+input_nuc+'_bins'+str(A_bins)+'_sp1e'+str(speedup)+'_rounds1e'+str(rounds)+'_'+run_name+'.pickle'
    A_path = sim_dir + 'Adist' + file_tail
    B_path = sim_dir + 'Bdist' + file_tail

    # Skip finished runs. The file is looked up under the exact name it is saved
    # with, and directly on disk, so no pre-built `finished` listing is needed.
    if not overwrite and os.path.exists(A_path):
        print('already done: ' + run_name)
        return None

    print('\r' + '         ' + run_name, end = '     ')
    A_counts_current, B_counts_current = starting_conditions[0], starting_conditions[1]
    A_counts_timeseries = {0: A_counts_current}
    B_counts_timeseries = {0: B_counts_current}
    flag = False
    step_multiplier = 10**speedup
    record_every = int(max(1, 1e6/step_multiplier))
    for rep in range(1, 1 + 10**rounds):
        if flag:
            print('\r' + 'ending due to numerical error at round '+str(rep))
            break
        A_counts_current, B_counts_current, flag, boundary_flag = mut_evolve_dist_AB(A_counts_current, B_counts_current, starting_conditions[2:6], boot=boot, input_nuc = input_nuc, mutonly=mutonly, speedup_multiplier=step_multiplier, stochastics = stochastics, reflective = reflective)
        if rep % record_every == 0:
            print('\r' + str(rep), end = '   ')
            A_counts_timeseries[rep], B_counts_timeseries[rep] = A_counts_current, B_counts_current

    A_counts_timeseries = pd.DataFrame(A_counts_timeseries)
    B_counts_timeseries = pd.DataFrame(B_counts_timeseries)
    A_counts_timeseries.to_pickle(A_path)
    B_counts_timeseries.to_pickle(B_path)
    return A_counts_timeseries, B_counts_timeseries

In [None]:
import argparse

# Command-line entry point of the constant-speedup script: parse the options,
# list the output directory, then run a single simulation.
parser = argparse.ArgumentParser(description='repeat distribution simulation')

# where results are written / looked up
parser.add_argument('--dir', default='simulations/grid_output/', type=str)

# rate-model parameters (no defaults: the attributes are None when omitted)
parser.add_argument('--mult', type=float)
parser.add_argument('--exp', type=float)
parser.add_argument('--con', type=float)
parser.add_argument('--L_mult', default=9, type=int)
parser.add_argument('--L_mult_nonexp', default=9, type=int)
parser.add_argument('--motif', default='A', type=str)

# run length and resolution
parser.add_argument('--speedup', default=3, type=int)
parser.add_argument('--rounds', default=5, type=int)
parser.add_argument('--A_bins', default=200, type=int)
parser.add_argument('--B_bins', default=200, type=int)

# optional modes
parser.add_argument('--boot', type=int)
parser.add_argument('--mutonly', action='store_true')
parser.add_argument('--random_start', action='store_true')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--stochastics', type=int)
parser.add_argument('--ceiling', default=None, type=float)
# NOTE: passing --reflective turns the reflecting boundary OFF (store_false, default True)
parser.add_argument('--reflective', action='store_false')

args = parser.parse_args()
# run_simulation reads this module-level listing for its "already done" check
finished = os.listdir(args.dir)
run_simulation(
    mult=args.mult,
    exp_power=args.exp,
    con_power=args.con,
    boot=args.boot,
    L_mult=args.L_mult,
    L_mult_nonexp=args.L_mult_nonexp,
    input_nuc=args.motif,
    A_bins=args.A_bins,
    B_bins=args.B_bins,
    mutonly=args.mutonly,
    random_start=args.random_start,
    speedup=args.speedup,
    rounds=args.rounds,
    sim_dir=args.dir,
    overwrite=args.overwrite,
    stochastics=args.stochastics,
    ceiling=args.ceiling,
    reflective=args.reflective,
)

##### - remove all lines below to generate simulation_script.py

## Progressive speedup

In [None]:
def run_simulation_custom(mult, exp_power, con_power, max_speedup = 5, rounds_mult = 0, fill = False, nonexp_factor = False, A_bins = 100, B_bins = 100, input_nuc = 'A', mutonly = False, stochastics = None, boot = None, overwrite = False, random_start = False, subonly_start = True, reflective = True, sim_dir = 'simulations/test/'):
    """Run a progressive-speedup simulation and pickle the recorded time series.

    Starting at ``max_speedup`` and stepping the exponent down to 0, each level
    runs ``10**rounds`` steps with ``rounds = max(3, 5 - speedup) + rounds_mult``
    and a step multiplier of ``10**speedup``. At each level the per-tract rates
    are capped at ``10**-(speedup+1)``. Counts are recorded at step 0 and every
    ``max(1, 10**(rounds-2))`` steps.

    A level ends early when the boundary flag is raised (and a lower level
    exists); the whole run ends early on the numerical-error flag, or when the
    rates did not reach the ceiling at the level just finished.

    Parameters mirror ``setup_evolve`` plus:
    max_speedup : highest base-10 exponent of the step multiplier.
    rounds_mult : added to the base-10 exponent of the per-level step count.
    overwrite : when False and the A output file already exists in
        ``sim_dir``, nothing is run and None is returned.
    reflective : forwarded to ``mut_evolve_dist_AB``.
    sim_dir : output directory prefix (must end with a path separator).

    Returns
    -------
    ``(A_counts_timeseries, B_counts_timeseries)`` DataFrames with one column
    per recorded step, or None when the run was skipped.
    """
    def _fresh_conditions():
        # Rebuilt for every level because the rate arrays are capped in place below.
        # `fill` and `stochastics` are forwarded so the run name matches the settings.
        return setup_evolve(exp_power, con_power, mult, fill = fill, nonexp_factor = nonexp_factor, A_bins = A_bins, B_bins = B_bins, input_nuc = input_nuc, mutonly = mutonly, random_start = random_start, subonly_start = subonly_start, ceiling = None, boot = boot, stochastics = stochastics)

    starting_conditions = _fresh_conditions()
    run_name = starting_conditions[6]
    # (the B file name uses A_bins too; kept so existing outputs stay addressable)
    file_tail = '_'+input_nuc+'_bins'+str(A_bins)+'_sp1e0_prospeedup_'+run_name+'.pickle'
    A_path = sim_dir + 'Adist' + file_tail
    B_path = sim_dir + 'Bdist' + file_tail

    # Skip finished runs; checked on disk under the exact name used when saving.
    if not overwrite and os.path.exists(A_path):
        print('already done: ' + run_name)
        return None

    A_counts_current, B_counts_current = starting_conditions[0], starting_conditions[1]
    A_counts_timeseries = {0: A_counts_current}
    B_counts_timeseries = {0: B_counts_current}
    flag = False; boundary_flag = False

    def _save():
        # write both time series and hand them back to the caller
        A_frame = pd.DataFrame(A_counts_timeseries)
        B_frame = pd.DataFrame(B_counts_timeseries)
        A_frame.to_pickle(A_path)
        B_frame.to_pickle(B_path)
        return A_frame, B_frame

    current_rep = 1
    while max_speedup >= 0:
        speedup = max_speedup
        rounds = max(3, 5 - max_speedup) + rounds_mult
        ceiling = 10**-(speedup+1)
        starting_conditions = _fresh_conditions()
        if mutonly != True:
            exp_rate, con_rate, nonexp_rate, B_indel_rate = starting_conditions[2:6]
            exp_rate[exp_rate > ceiling] = ceiling
            con_rate[con_rate > ceiling] = ceiling
            nonexp_rate[nonexp_rate > ceiling] = ceiling
            conditions_ceiling = (exp_rate, con_rate, nonexp_rate, B_indel_rate)
            # after capping, the sum stays below the ceiling only if nothing was capped
            ceiling_unreached = exp_rate.max() + con_rate.max() < ceiling
        else:
            conditions_ceiling = (None, None, None, None)
            # NOTE(review): with no indel rates there is nothing to compare against the
            # ceiling (this used to raise NameError); every level is run. Confirm intent.
            ceiling_unreached = False

        record_every = int(max(1, 10**(rounds-2)))
        for rep in range(current_rep, current_rep + 10**rounds):
            if flag == True:
                print('\r' + 'ending due to numerical error at round '+str(rep))
                return _save()
            if (boundary_flag == True) & (max_speedup > 0): # move to next lower speedup if boundary is hit
                break
            A_counts_current, B_counts_current, flag, boundary_flag = mut_evolve_dist_AB(A_counts_current, B_counts_current, conditions_ceiling, input_nuc = input_nuc, mutonly=mutonly, speedup_multiplier=10**speedup, stochastics = stochastics, reflective = reflective, boot=boot)
            if rep % record_every == 0:
                print('\r' + str(rep), end = '   ')
                A_counts_timeseries[rep], B_counts_timeseries[rep] = A_counts_current, B_counts_current
        current_rep += 10**rounds
        max_speedup -= 1
        # forgo next speedup round if rate ceiling has not been hit at current speedup
        if ceiling_unreached:
            break

    return _save()

In [None]:
import argparse

# Command-line entry point for the progressive-speedup simulation script.
# Every option carries a help string so that `--help` documents the interface.
parser = argparse.ArgumentParser(description='repeat distribution simulation')

parser.add_argument('--dir', action="store", dest='dir', default = 'simulations/grid_output/', type=str,
                    help='directory where output of simulation will be stored')
parser.add_argument('--mult', action="store", dest='mult', type=float,
                    help='multiplier controlling instability rates')
parser.add_argument('--exp', action="store", dest='exp', type=float,
                    help='power law exponent controlling expansion and non-motif insertion rates')
parser.add_argument('--con', action="store", dest='con', type=float,
                    help='power law exponent controlling contraction rate')
parser.add_argument('--motif', action="store", dest='motif', default = 'A', type=str,
                    help='STR sequence motif')
parser.add_argument('--speedup', action="store", dest='speedup', default = 5, type=int,
                    help='constant or maximum speedup factor applied to all mutation rates')
parser.add_argument('--rounds', action="store", dest='rounds', default = 0, type=int,
                    help='controls number of iterations (10^n)')
parser.add_argument('--A_bins', action="store", dest='A_bins', default = 200, type=int,
                    help='number of length bins 1-n for the A (repeat) motifs')
parser.add_argument('--B_bins', action="store", dest='B_bins', default = 200, type=int,
                    help='number of length bins 1-n for the B (non-repeat) strings')
parser.add_argument('--boot', action="store", dest='boot', type = int,
                    help='selects from a pre-generated list of mutation rates subject to Poisson noise')
parser.add_argument('--mutonly', default=False, action="store_true",
                    help='turn off all indel and instability processes, running substitution processes only')
parser.add_argument('--random_start', default=False, action="store_true",
                    help='use counts from a randomized genome as the starting distribution')
parser.add_argument('--subonly_start', default=False, action="store_true",
                    help='use counts from a substitution-only simulation as the starting distribution')
parser.add_argument('--overwrite', default=False, action="store_true",
                    help='ignore check for whether the conditions will produce a file that already exists in the given directory')
parser.add_argument('--stochastics', action="store", dest='stochastics', type = int,
                    help='turn on stochastic (Poisson) sampling of mutation counts during each iteration')
# NOTE: this is a store_false flag, so passing --reflective DISABLES the reflecting boundary.
parser.add_argument('--reflective', default=True, action="store_false",
                    help='turn off handling of the reflecting boundary at L=A_bins and B_bins, turning it into an absorbing boundary')

args = parser.parse_args()
# Create the output directory on first use; os.listdir would otherwise raise
# FileNotFoundError before the simulation even starts.
os.makedirs(args.dir, exist_ok=True)
# Names of the files already present in the output directory
# (presumably consulted by the --overwrite check inside run_simulation_custom; confirm there).
finished = os.listdir(args.dir)
run_simulation_custom(mult=args.mult, exp_power=args.exp, con_power=args.con, boot = args.boot, input_nuc = args.motif, A_bins = args.A_bins, B_bins = args.B_bins, mutonly = args.mutonly, random_start = args.random_start, subonly_start = args.subonly_start, max_speedup = args.speedup, rounds_mult = args.rounds, sim_dir = args.dir, overwrite = args.overwrite, stochastics = args.stochastics, reflective = args.reflective)

##### Remove all lines below to generate simulation_script_prospeedup.py
- Generating the complete dataset requires running simulation_script_prospeedup.py with jobs_prospeedup.sh (and simulation_script.py with jobs_sp1_long.sh and jobs_sp2_long.sh) on a Slurm-based cluster, so that all points in the grid are populated more quickly.

##### Arguments:
- --dir -> directory where output of simulation will be stored
- --mult -> multiplier controlling instability rates
- --L_mult -> length at which the multiplier is applied to the de novo instability rates (note: not defined by the argument parser in this notebook)
- --exp -> power law exponent controlling expansion and non-motif insertion rates
- --con -> power law exponent controlling contraction rate
- --motif -> STR sequence motif
- --speedup -> controls constant or maximum speedup factor applied to all mutation rates
- --rounds -> controls number of iterations (10^n)
- --A_bins -> number of length bins 1-n for the A (repeat) motifs
- --B_bins -> number of length bins 1-n for the B (non-repeat) strings
- --boot -> selects from a pre-generated list of mutation rates subject to Poisson noise
- --mutonly -> turns off all indel and instability processes in the simulation, running substitution processes only
- --random_start -> uses counts from a randomized genome as the starting distribution
- --subonly_start -> uses counts from a substitution-only simulation as the starting distribution
- --overwrite -> ignore check for whether the conditions will produce a file that already exists in the given directory
- --stochastics -> turns on stochastic (Poisson) sampling of mutation counts during each iteration
- --ceiling -> set upper rate limit for any mutation process (note: not defined by the argument parser in this notebook)
- --reflective -> turn off handling of the reflecting boundary at L=A_bins and B_bins, turning it into an absorbing boundary

## Run demo
- For a demo of the computational model, the code below can be run; it requires values for the "mult", "exp_power" and "con_power" parameters. See Methods for further description, including default values for optional parameters.
- Demo on a "normal" desktop computer with given parameters is expected to take ~5 minutes.

In [None]:
# Load minimal data for demo (hard-coded values for motif 'A' only, used in place of the pickles read in the "Load data" cell):
CHM13_counts = dict()
CHM13_counts['A'] = pd.DataFrame([861201679, 234254484, 88812020, 32801187, 12627047, 3780464, 1581788, 567751, 326510, 207206, 127161, 97987, 86634, 77081, 68340, 58903, 47652, 38435, 32238, 27356, 24095, 21886, 19548, 16572, 13746, 10757, 8409, 6853, 5435, 3908, 2740, 1824, 1408, 952, 912, 786, 809, 803, 693, 636, 497, 341, 241, 200, 163, 157, 167, 119, 116, 85, 94, 60, 42, 63, 26, 29, 20, 14, 7, 3, 3, 7, 4, 4], index = range(1,65), columns = ['A'])
CHM13_counts['B'] = pd.DataFrame([391005710, 234634738, 173991271, 128644266, 82379768, 57163482, 44845809, 31123715, 23724537, 16138191, 12638350, 9637450, 6485630, 4960219, 3700614, 2978004, 2541592, 2199129, 1529461, 1307551, 948882, 703894, 583040, 484866, 436960, 322284, 281835, 207162, 183167, 146508, 125876, 104629, 89550, 80680, 65894, 60237, 50645, 43932, 38951, 33281, 29366, 26809, 23398, 20929, 18711, 16401, 15401, 13201, 11954, 11180, 9209, 8217, 7411, 7940, 6446, 6543, 5351, 5363, 4614, 4920, 3642, 3733, 3536, 2936, 2545, 2346, 1576, 1148, 793, 1602, 833, 2980, 2732, 2347, 2175, 1894, 1721, 1663, 1550, 1846, 1274, 1270, 1200, 1184, 863, 883, 726, 971, 667, 771, 1036, 924, 545, 982, 568, 536, 461, 578, 506, 660, 637, 453, 394, 553, 599, 371, 515, 433, 343, 501, 390, 526, 359, 333, 285, 317, 372, 327, 302, 539, 560, 518, 311, 277, 231, 297, 267, 588, 367, 361, 215, 230, 216, 402, 205, 511, 331, 215, 183, 184, 164, 340, 254, 344, 158, 401, 136, 170, 162, 187, 142, 175, 270, 235, 162, 366, 142, 351, 128, 127, 122, 111, 128, 117, 116, 136, 124, 342, 94, 112, 110, 98, 99, 134, 103, 128, 100, 149, 135, 158, 92, 83, 82, 82, 131, 191, 85, 309, 172, 396, 68, 70, 139, 278, 83, 100, 76, 149, 92, 282], index = range(1,201), columns = ['A'])
CHM13_counts = pd.concat(CHM13_counts, axis=1)

subonly_counts = dict()
subonly_counts['A'] = pd.DataFrame([916707580.2033024, 340290716.258333, 128678308.62519115, 48838020.7874859, 18548753.855557196, 7045762.038399432, 2676403.5444367407, 1016663.2352091095, 386191.7452785688, 146699.59581817046, 55725.61361521049, 21168.04760300643, 8040.938639205142, 3054.447695047391, 1160.2688617171016, 440.7421458419924, 167.42122927793446, 63.59697677514248, 24.15808002594318, 9.176738583082912, 3.4858950269147133, 1.3241593436114223, 0.5029979255644212, 0.19106983939869215, 0.07258018705917339, 0.027570460990195812, 0.010472972721773027, 0.003978285225988993, 0.0015111997099370791, 0.0005740474685914903, 0.00021805886675958894, 8.283229519180028e-05, 3.146484813344478e-05, 1.1952302731299332e-05, 4.540226603820093e-06, 1.7246599318518845e-06, 6.551329129767819e-07, 2.4886015250820107e-07, 9.453253573385976e-08, 3.5909325869190544e-08, 1.3640591298746398e-08, 5.18154341457788e-09, 1.968271871001872e-09, 7.476718514560248e-10, 2.840121863729806e-10, 1.0788546051489958e-10, 4.098159568134373e-11, 1.55673542715905e-11, 5.9134476095456306e-12, 2.246294522548187e-12, 8.532821147996443e-13, 3.2412952091919334e-13, 1.2312451475204588e-13, 4.677033455618555e-14, 1.776627667449281e-14, 6.74873485233351e-15, 2.563588473914226e-15, 9.738100558675044e-16, 3.69913515588853e-16, 1.405161183033852e-16, 5.3376745295775186e-17, 2.0275801614578305e-17, 7.702008221664194e-18, 2.925700880991615e-18, 1.1113628288474581e-18, 4.2216459835948847e-19, 1.6036432340719404e-19, 6.091632581647316e-20, 2.3139802370857326e-20, 8.789933512660019e-21, 3.3389624474187368e-21, 1.2683452280060184e-21, 4.817962593886852e-22, 1.830161303368957e-22, 6.952088836470106e-23, 2.6408349417727886e-23, 1.0031530599987096e-23, 3.810598102390866e-24, 1.4475017299912813e-24, 5.49851021290328e-25, 2.0886755390297206e-25, 7.93408639505333e-26, 3.013858579147787e-26, 1.1448505955972263e-26, 4.348853311273552e-27, 1.6519644740485116e-27, 6.275186646154749e-28, 2.383705462510538e-28, 
9.054793169731035e-29, 3.439572553323957e-29, 1.306563158508199e-29, 4.963137692955658e-30, 1.8853075015279892e-30, 7.161565258914736e-31, 2.7204041120994567e-31, 1.0333754823401774e-31, 3.9253740392858455e-32, 1.4911021301878577e-32, 5.682823327959757e-33, 3.428759707012527e-33], index = range(1,101), columns = ['A'])
subonly_counts['B'] = pd.DataFrame([550498794.3648516, 346320080.1269246, 215422559.61719304, 133909901.19805124, 83238781.58012784, 51741505.78915495, 32162697.296232127, 19992442.949264973, 12427371.10493236, 7724896.5005180165, 4801822.158505596, 2984829.122353852, 1855380.0194100519, 1153310.58339157, 716901.8141016357, 445628.6263764576, 277004.282511156, 172186.81203998503, 107031.91290661399, 66531.40414603188, 41356.148997399345, 25707.124054394466, 15979.636478957083, 9932.99684007114, 6174.384902609636, 3838.018831515179, 2385.725668777034, 1482.9752579443784, 921.8225064420567, 573.0080315439691, 356.1837576315802, 221.4050453336747, 137.6261355238746, 85.54887785276219, 53.177475877009144, 33.05530138591516, 20.547288709994625, 12.772265132387137, 7.939283810843722, 4.935086045880151, 3.0676664117959866, 1.9068719626311865, 1.185318151897783, 0.7367978284602383, 0.45799605713836933, 0.28469191988886655, 0.17696547379996758, 0.11000234544581367, 0.06837783519997773, 0.0425038968731665, 0.02642056807035858, 0.01642311572615223, 0.01020866506111151, 0.006345741214257145, 0.003944534502529505, 0.0024519361751922765, 0.0015241319358117495, 0.0009474056385579074, 0.0005889105942086762, 0.0003660688451243766, 0.0002275496496217057, 0.00014144564262323002, 8.79230965653398e-05, 5.4653298371514975e-05, 3.3972677710072244e-05, 2.111753297938136e-05, 1.3126730925983014e-05, 8.159620963840785e-06, 5.072048375865068e-06, 3.152802665849103e-06, 1.9597929501391797e-06, 1.218214019233847e-06, 7.572460124180416e-07, 4.707067184168981e-07, 2.9259291053287115e-07, 1.8187675668200792e-07, 1.1305521572933016e-07, 7.027550983852459e-08, 4.3683497936869776e-08, 2.7153812137189442e-08, 1.6878902752873576e-08, 1.0491983840116927e-08, 6.521853138974554e-09, 4.054006278938213e-09, 2.5199842068591176e-09, 1.5664308256776053e-09, 9.736987735694173e-10, 6.052544971083951e-10, 3.762282712209035e-10, 2.338647837267046e-10, 1.453711516416722e-10, 9.036320643437254e-11, 
5.617007903485778e-11, 3.4915513772451004e-11, 2.1703603109364986e-11, 1.3491034128803281e-11, 8.386073084150692e-12, 5.212811790503936e-12, 3.2403016871595538e-12, 2.014182641877817e-12, 1.2520228381568401e-12, 7.782616902133947e-13, 4.837701358111617e-13, 3.007131755882505e-13, 1.8692434129019585e-13, 1.161928116332889e-13, 7.222585021331858e-14, 4.489583620284987e-14, 2.79074057612331e-14, 1.734733913415051e-14, 1.0783165501297115e-14, 6.702852658218828e-15, 4.166516200867617e-15, 2.58992076020143e-15, 1.6099036270939265e-15, 1.000721616026816e-15, 6.220519886591289e-16, 3.8666964957855806e-16, 2.403551803242204e-16, 1.4940560442655428e-16, 9.287103612226439e-17, 5.772895456985953e-17, 3.5884516151424185e-17, 2.230593831148531e-17, 1.3865447756247434e-17, 8.61880987908215e-18, 5.357481780441848e-18, 3.3302290490741228e-18, 2.070081798464368e-18, 1.2867699456062506e-18, 7.998606113747827e-19, 4.971960992820773e-19, 3.0905880052828883e-19, 1.9211201037559625e-19, 1.1941748452872465e-19, 7.423031794465943e-20, 4.6141820219298135e-20, 2.8681913699161395e-20, 1.782877592466713e-20, 1.1082428261447667e-20, 6.88887541629852e-21, 4.282148585285124e-21, 2.661798246935351e-21, 1.6545829193632123e-21, 1.0284944173362715e-21, 6.393156572044061e-22, 3.9740080515485094e-22, 2.4702570343467985e-22, 1.5355202446965533e-22, 9.54484650418752e-23, 5.933109322599955e-23, 3.6880411034874887e-23, 2.292498998662099e-23, 1.425025240064973e-23, 8.858005765792488e-24, 5.506166763982025e-24, 3.4226521447819294e-24, 2.1275323117363145e-24, 1.322481381691934e-24, 8.2205896252379515e-25, 5.1099467048914545e-25, 3.1763603971503663e-25, 1.9744365167110993e-25, 1.2273165104374322e-25, 7.629041521686596e-26, 4.742238374913899e-26, 2.9477916381210484e-26, 1.832357392184888e-26, 1.1389996393485771e-26, 7.080060822028102e-27, 4.400990089187962e-27, 2.7356705333475412e-27, 1.700502185953998e-27, 1.057037990936665e-27, 6.570584404492206e-28, 4.0842978007157435e-28, 2.5388135206857206e-28, 
1.5781351917303963e-28, 9.809742476498505e-29, 6.097769567492104e-29, 3.7903944764414215e-29, 2.3561222063277495e-29, 1.4645736441560355e-29, 9.10384000199909e-30, 5.658978168336365e-30, 3.517640237820048e-30, 2.186577236145866e-30, 1.3591839092090395e-30, 8.448733795056932e-31, 5.251761903308292e-31, 3.2645132108644712e-31, 2.0292326080501292e-31, 1.2613779487458502e-31, 7.840768590403876e-32, 4.8738486469808327e-32, 3.0296010346166e-32, 1.8832103936251097e-32, 1.1706100394524177e-32, 7.276552153203449e-33, 4.52312977454541e-33], index = range(1,201), columns = ['A'])
subonly_counts = pd.concat(subonly_counts, axis=1)

denovo_exp_rate = pd.DataFrame([4.562984199878992e-12,  1.0467617654842797e-10,  5.807374246629577e-11,  8.005268166171157e-11,  1.3679705360645337e-10,  3.0556157928111613e-10,  2.289030948565612e-09,  4.854045051592342e-09,  1.1364615491677555e-08,  1.419922964171028e-08,  1.1828716797973438e-08,  4.1291073073549976e-10,  1.447422661312361e-10,  5.0615435868574976e-11, 1.6412002445322716e-10],  index = [0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0,  11.0,  12.0,  13.0,  25.0], columns = ['A'])
denovo_con_rate = pd.DataFrame([2.885531044935673e-10,  1.871202448033845e-10,  1.7746255255979686e-10,  2.498754225278858e-10,  3.7299996616692954e-10,  4.540853190555286e-10,  1.0654492863651945e-09,  3.20580338022747e-09,  7.138318574137602e-09,  1.3303065078067975e-08,  2.0285696564748838e-08,  3.1794126266633476e-09,  7.237113306561804e-10,  3.036926152114499e-10,  5.25055043000813e-11,  1.1059236545578058e-10,  1.3900672767207106e-10], [0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0,  11.0,  12.0,  13.0,  14.0,  20.0,  24.0], columns = ['A'])
denovo_nonexp_rate = pd.DataFrame([1.4427655224678364e-10,  5.807374246629577e-12,  8.791913014870057e-12,  6.383862501634492e-12,  1.0406121895022531e-11,  3.3700342391113206e-11,  2.1336461765241066e-11,  1.1596546420079138e-10,  1.3940337656658055e-10,  2.763718878031177e-10,  2.890375115148498e-10,  3.377319543062175e-10,  3.036926152114499e-10,  2.625275215004065e-10,  1.6588854818367085e-10,  6.036395505019818e-11,  6.996035254753307e-11,  8.281119434384756e-11,  9.221806429413932e-11, 1.1676038583818848e-10, 1.6412002445322716e-10, 1.0262244516086404e-09], index = [0.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0,  11.0,  12.0,  13.0,  14.0,  15.0,  16.0,  17.0,  18.0,  19.0, 22.0,  25.0, 32.0], columns = ['A'])
denovo_substitution_context_rate = pd.DataFrame([4.541573240083719e-09,  8.077760235763368e-09,  6.447333857760922e-09,  2.865765641003364e-09,  4.7677112600042175e-09,  3.967453912936841e-09], index = ['Afission', 'Acontraction', 'A10', 'Afusion', 'Aexpansion', 'A01'], columns = ['A']).transpose()

In [None]:
# directory to store simulation results (created here if it does not exist, can be changed as needed)
sim_dir = 'simulations/'
# os.listdir raises FileNotFoundError on a missing directory, so make sure it exists first
os.makedirs(sim_dir, exist_ok=True)
# names of the files already present in sim_dir
finished = os.listdir(sim_dir)

#### with constant speedup

In [None]:
# Demo run with a constant speedup, starting from the substitution-only counts.
test = run_simulation(
    mult = 1,
    exp_power = 1,
    con_power = 1,
    input_nuc = 'A',
    mutonly = False,
    speedup = 4,
    rounds = 4,
    fill = True,
    A_bins = 100,
    B_bins = 200,
    subonly_start = True,
)

#### with progressive speedup

In [None]:
# Demo run with progressive speedup; the result is stored under key 0 of a fresh dict.
# The three positional arguments are presumably mult, exp_power and con_power
# (confirm against the run_simulation_custom signature).
test = {}
test[0] = run_simulation_custom(
    1, 0, 0.5,
    max_speedup = 5,
    rounds_mult = 1,
    subonly_start = True,
)

## Substitution-only simulations
- note: run this set of computational models first, and process the data to generate the "subonly_counts.pickle" file, which is used to initiate runs of the computational model for all other parameter combinations

In [None]:
# directory to store simulation results (created here if it does not exist, can be changed as needed)
sim_dir = 'simulations/subonly_output/'
# os.listdir raises FileNotFoundError on a missing directory, so make sure it exists first
# (makedirs also creates the parent 'simulations/' directory when needed)
os.makedirs(sim_dir, exist_ok=True)
# names of the files already present in sim_dir
finished = os.listdir(sim_dir)

In [None]:
# Substitution-only run (mutonly = True) for motif 'A', started with random_start = True.
test = run_simulation(
    input_nuc = 'A',
    B_bins = 200,
    mutonly = True,
    speedup = 5,
    random_start = True,
    rounds = 5,
    ceiling = 1e-5,
    sim_dir = 'simulations/subonly_output/',
)

In [None]:
# Substitution-only run (mutonly = True) for motif 'AC', started with random_start = True.
test = run_simulation(
    input_nuc = 'AC',
    B_bins = 500,
    mutonly = True,
    speedup = 5,
    random_start = True,
    rounds = 5,
    ceiling = 1e-5,
    sim_dir = 'simulations/subonly_output/',
)

In [None]:
# Substitution-only run (mutonly = True) for motif 'AAC', started with random_start = True.
test = run_simulation(
    input_nuc = 'AAC',
    B_bins = 500,
    mutonly = True,
    speedup = 5,
    random_start = True,
    rounds = 5,
    ceiling = 1e-5,
    sim_dir = 'simulations/subonly_output/',
)

In [None]:
# Substitution-only run (mutonly = True) for motif 'AAAC', started with random_start = True.
test = run_simulation(
    input_nuc = 'AAAC',
    B_bins = 2000,
    mutonly = True,
    speedup = 5,
    random_start = True,
    rounds = 5,
    ceiling = 1e-5,
    sim_dir = 'simulations/subonly_output/',
)

#### Process data

In [None]:
# Every file written to the substitution-only output directory ...
grid_files_mutonly = os.listdir('simulations/subonly_output/')
# ... of which only those whose name starts with 'Adist' are kept.
grid_files_A_mutonly = [
    fname
    for fname in grid_files_mutonly
    if fname.startswith('Adist')
]

In [None]:
# Recover the run parameters encoded in each file name. The names are expected to look like
#   Adist_<motif>_bins<N>_sp1e<S>_rounds1e<R>_mutonly_<start>.pickle
# so tokens 1, 2, 3, 4 and 6 of the '_' split carry the information.
parameter_info_mutonly = pd.Series(grid_files_A_mutonly).str.split('_', expand = True)[[1,2,3,4,6]]
parameter_info_mutonly.index = grid_files_A_mutonly
parameter_info_mutonly.columns = ['motif', 'bins', 'speedup (1e)', 'rounds (1e)', 'start']
# Strip the textual prefix from each numeric token and convert what follows it to an integer.
for _column, _prefix in [('bins', 'bins'), ('speedup (1e)', 'sp1e'), ('rounds (1e)', 'rounds1e')]:
    parameter_info_mutonly[_column] = parameter_info_mutonly[_column].str.split(_prefix, expand = True)[1].astype(int)
# Drop the file extension from the last token.
parameter_info_mutonly['start'] = parameter_info_mutonly['start'].str.split('.pickle', expand = True)[0]

In [None]:
# Unique (motif, bins, speedup, rounds, start) combinations found among the parsed file names.
parameter_info_mutonly_cols = parameter_info_mutonly.groupby(['motif', 'bins', 'speedup (1e)', 'rounds (1e)', 'start']).size().index
# Load the A-distribution time series of every combination, keyed by motif
# (a later combination with the same motif replaces an earlier one).
substitutions_only = {}
for motif, bins, speedup, rounds, start in parameter_info_mutonly_cols:
    file = f'Adist_{motif}_bins{bins}_sp1e{speedup}_rounds1e{rounds}_mutonly_{start}.pickle'
    motif_timeseries_A = pd.read_pickle('simulations/subonly_output/' + file)
    # rescale the column labels (presumably recorded iteration numbers) by the speedup factor
    motif_timeseries_A.columns = motif_timeseries_A.columns * (10**speedup)
    substitutions_only[motif] = motif_timeseries_A
# One wide frame with (motif, column label) columns, sorted.
substitutions_only = pd.concat(substitutions_only, axis=1).sort_index(axis=1)

In [None]:
# Load the B-distribution time series of every parameter combination, keyed by motif
# (a later combination with the same motif replaces an earlier one).
substitutions_only_B = {}
for motif, bins, speedup, rounds, start in parameter_info_mutonly_cols:
    file = f'Bdist_{motif}_bins{bins}_sp1e{speedup}_rounds1e{rounds}_mutonly_{start}.pickle'
    motif_timeseries_B = pd.read_pickle('simulations/subonly_output/' + file)
    # rescale the column labels (presumably recorded iteration numbers) by the speedup factor
    motif_timeseries_B.columns = motif_timeseries_B.columns * (10**speedup)
    substitutions_only_B[motif] = motif_timeseries_B
# One wide frame with (motif, column label) columns, sorted.
substitutions_only_B = pd.concat(substitutions_only_B, axis=1).sort_index(axis=1)

In [None]:
# Collect, for every motif, the column labelled 1e10 of the substitution-only runs:
# the first 100 rows of the A distribution and the first 200 rows of the B distribution.
starting_subonlydist = {'A': {}, 'B': {}}
for motif in ['A', 'AC', 'AAC', 'AAAC']:
    starting_subonlydist['A'][motif] = substitutions_only[motif][1e10].iloc[:100]
    starting_subonlydist['B'][motif] = substitutions_only_B[motif][1e10].iloc[:200]
# Join the per-motif series into one frame per distribution, then both frames side by side,
# giving ('A' | 'B', motif) columns.
starting_subonlydist = pd.concat(
    {
        dist_label: pd.concat(per_motif, axis=1)
        for dist_label, per_motif in starting_subonlydist.items()
    },
    axis=1,
)
# shift the row labels up by one
starting_subonlydist.index = starting_subonlydist.index + 1

In [None]:
# Save the substitution-only starting distributions; this is the path that the
# "Load data" cell reads back into `subonly_counts`.
starting_subonlydist.to_pickle('repeat_distributions/subonly_counts.pickle')