In [1]:
import pandas as pd
import numpy as np
import re
import os
import pickle as pkl
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Model labels.
# Each label is a simulation "version" name used to build file paths below.

# Models processed as the non-AdapTrack group.  Note that
# 'AdapTrack_stablespace20' is deliberately listed here, not in AdapTrack_list.
nonAdapTrack_list = [
    'Pseudo',
    'Neutral',
    'Adaptive',
    'Neutral_dom',
    'Adaptive_dom',
    'AdapTrack_stablespace20',
]

# AdapTrack variants, one line per label family (the suffix presumably encodes
# the parameter that was varied -- confirm against the simulation scripts).
AdapTrack_list = [
    # dominance
    'AdapTrack_dom',
    # env*
    'AdapTrack_env2', 'AdapTrack_env5', 'AdapTrack_env10', 'AdapTrack_env20',
    # int*
    'AdapTrack_int2', 'AdapTrack_int25', 'AdapTrack_int100', 'AdapTrack_int200',
    # pop* (the number is parsed as the effective population size downstream)
    'AdapTrack_pop500', 'AdapTrack_pop1000', 'AdapTrack_pop5000',
    'AdapTrack_pop10000', 'AdapTrack_pop50000',
    # neufrac*
    'AdapTrack_neufrac0.05', 'AdapTrack_neufrac0.15',
    'AdapTrack_neufrac0.2', 'AdapTrack_neufrac0.25',
    # lambda*
    'AdapTrack_lambda2', 'AdapTrack_lambda4', 'AdapTrack_lambda6',
    'AdapTrack_lambda8', 'AdapTrack_lambda10',
    # AP*
    'AdapTrack_AP0.15', 'AdapTrack_AP0.35', 'AdapTrack_AP0.45', 'AdapTrack_AP0.5',
    # These four are read with a different column layout in get_mean_spectrum
    'AdapTrack',
    'AdapTrack_quasineu',
    'AdapTrack_adapt_fluc',
    'AdapTrack_pop_fluc',
]

## Get allele frequency spectrum

In [3]:
def get_spectrum(mut_df, Ne):
    """
    Get the cumulative allele-frequency spectrum.

    mut_df: dataframe containing a column 'AF' with mutation allele counts.
            The dataframe is NOT modified (frequencies are computed on a copy).
    Ne: effective population size; counts are divided by 2 * Ne to obtain
        frequencies.
    Return: a list of 118 cumulative fractions, one per bin.  Entry i is the
            fraction of rows whose frequency is strictly below the upper edge
            of bin i.  Bins are 0.001 wide for the first 20 entries and 0.01
            wide for the remaining 98.
            If mut_df has no rows every entry is NaN.
    """
    # Derive frequencies into a new Series instead of overwriting
    # mut_df['AF']: writing back would make a second call on the same frame
    # divide by 2 * Ne twice.
    freq = mut_df['AF'] / (2 * Ne)
    total_alleles = len(mut_df)

    # Upper (exclusive) edge of every bin, kept as `start + width` so the
    # floating-point thresholds are the same as in the original loops.
    fine_edges = np.linspace(0, 0.019, 20) + 0.001     # fine-grained at very low frequencies
    coarse_edges = np.linspace(0.02, 0.99, 98) + 0.01  # coarser across the remaining range
    upper_edges = np.concatenate([fine_edges, coarse_edges])

    # An empty table has no defined fractions; make the 0/0 result explicit.
    if total_alleles == 0:
        return [np.nan] * len(upper_edges)

    return [(freq < edge).sum() / total_alleles for edge in upper_edges]  # cumulative

In [4]:
def get_mean_spectrum(rep, version):
    """
    Compute the time-averaged AF spectrum of one replicate.

    Twenty sampled generations (every 1000 generations, ending at the final
    generation) are read from
    ./data/Simulation_log/rep{rep}/{version}_20samples/, each is converted to
    a cumulative spectrum with get_spectrum, and the spectra are averaged.

    rep: replicate index (used in the directory name).
    version: model version label.
    Returns: numpy array, element-wise mean spectrum across the sampled
             generations.
    Raises: FileNotFoundError if a mutation table for a sampled generation is
            missing.
    """
    target_dir = f'./data/Simulation_log/rep{rep}/{version}_20samples/'
    is_pop_fluc = version == 'AdapTrack_pop_fluc'

    # Determine number of generations sampled and Ne
    if version == 'AdapTrack_pop50000':
        N_gen_total = 160000
    else:
        N_gen_total = 200000

    if ('pop' in version) and not is_pop_fluc:
        Ne = int(re.search(r'pop(\d+)', version)[1])
    else:
        # Default; for 'AdapTrack_pop_fluc' it is replaced per file below.
        Ne = 10000

    # These four versions carry the ID/s/SG/AF fields one column further right.
    if version in ['AdapTrack', 'AdapTrack_quasineu', 'AdapTrack_adapt_fluc', 'AdapTrack_pop_fluc']:
        column_names = {5:'ID', 8:'s', 11:'SG', 12:'AF'}
    else:
        column_names = {4:'ID', 7:'s', 10:'SG', 11:'AF'}

    # The directory listing does not change between generations: read it once.
    file_list = os.listdir(target_dir) if is_pop_fluc else []

    # Read mutation tables
    spectrum_list = []
    for N_gen in range(N_gen_total - 19000, N_gen_total + 1, 1000):

        if is_pop_fluc:
            # The file name carries extra text after the generation, so match
            # by substring and take the first hit in listing order.
            file = next((name for name in file_list if f'{version}_{N_gen}' in name), None)
            if file is None:
                # Raised explicitly (not via assert) so that an empty directory
                # is caught too and the check survives `python -O`.
                raise FileNotFoundError(f"{version}_{N_gen}.txt not found in {target_dir}")
            mut_df = pd.read_table(target_dir + file, header=None, sep=r'\s+')
            # Ne for this sample is the last '_'-separated token of the file
            # name, up to the first '.'.
            Ne = int(file.split('_')[-1].split('.')[0])
        else:
            mut_df = pd.read_table(target_dir + f'{version}_{N_gen}.txt', header=None, sep=r'\s+')

        mut_df = mut_df.rename(columns=column_names)

        if version == 'AdapTrack_stablespace20':
            # Collapse duplicate IDs by summing AF (first-appearance order kept)
            mut_df = mut_df.groupby('ID', sort=False, as_index=False)['AF'].sum()

        spectrum = get_spectrum(mut_df, Ne)
        spectrum_list.append(spectrum)

    # get average
    spectrum_mean = np.mean(spectrum_list, axis=0)

    return spectrum_mean

In [None]:
# Average spectrum per model: first over sampled generations within each
# replicate (get_mean_spectrum), then over replicates 1..30.
spectrum_final_dict = {}
for version in nonAdapTrack_list + AdapTrack_list:
    print(version)
    spectrum_mean_list = []
    for rep in range(1, 31):
        # '\r' keeps the replicate counter on a single, overwritten line
        print(rep, end='\r', flush=True)
        spectrum_mean_list.append(get_mean_spectrum(rep, version))
    spectrum_final_dict[version] = np.mean(spectrum_mean_list, axis=0)

In [13]:
# Uncomment and run to overwrite the result file
# with open('./data/Allele_spectrum.pkl','wb') as file:
#     pkl.dump(spectrum_final_dict,file)

## Get summary statistics for each model

In [14]:
# Build summary_dict: one list per column, one entry per model version.
# Columns are 'Type', then for every replicate the deleterious/beneficial/
# neutral counts (mean & inst selection coefficient, P & S) followed by
# dNdneutral_{rep} and pNpneutral_{rep}.
summary_dict = {}
selcoeff_cutoff = 0 # 1/(2*Ne)

for version in nonAdapTrack_list + AdapTrack_list:
    print(version)
    # setdefault creates each column on first use, so the cell does not depend
    # on 'Pseudo' being the first version in the list.
    summary_dict.setdefault('Type', []).append(version)

    # Get effective population size (same parsing as get_mean_spectrum)
    if ('pop' in version) and (version != 'AdapTrack_pop_fluc'):
        Ne = int(re.search(r'pop(\d+)', version)[1])
    else:
        Ne = 10000

    # Get summary statistics for each replicate
    for rep in range(1, 31):
        print(rep, end='\r', flush=True)
        mut_df = pd.read_csv(
            f'./data/Simulation_log/rep{rep}/Mutation_summary_{version}.txt')

        # Filter mutations that still exist after the population reaches the equilibrium state,
        # typically 80000 generations, but 40000 generations when Ne == 50000.
        # Gen_E == -1 rows are kept as well.
        equilibrium_gen = 40000 if Ne == 50000 else 80000
        mut_df_sub = mut_df[(mut_df['Gen_E'] >= equilibrium_gen) | (mut_df['Gen_E'] == -1)]

        # P for polymorphism (isFixed == 'F'), S for substitution (isFixed == 'T')
        for mut_type, fixed_flag in (('P', 'F'), ('S', 'T')):
            idx = mut_df_sub['isFixed'] == fixed_flag

            # mean: use mean selection coefficient (geometric mean fitness -1) during the mutation's lifetime
            # inst: instant, use selection coefficient in the current timepoint
            for fitness_type, key in (('mean', 'mean_selcoef'), ('inst', 'cur_selcoef')):
                selcoef = mut_df_sub.loc[idx, key]

                # Count number of deleterious/beneficial/neutral mutations/substitutions
                counts = (
                    ('deleterious', (selcoef < -selcoeff_cutoff).sum()),
                    ('beneficial', (selcoef > selcoeff_cutoff).sum()),
                    ('neutral', (selcoef.abs() <= selcoeff_cutoff).sum()),
                )
                for effect, n_mut in counts:
                    summary_dict.setdefault(
                        f'{effect}_{fitness_type}_{mut_type}_{rep}', []).append(n_mut)

        # Get dN/dneutral and pN/pneutral; theoretical expectations of dneutral and pneutral are calculated.
        # 1920 is the hard-coded expected number of neutral substitutions
        # (presumably 1e-7 * 1.6e5 * 120000 generations -- TODO confirm);
        # 4*Ne*1e-7*1.6e5 is the hard-coded expected neutral polymorphism.
        n_substitutions = (mut_df_sub['isFixed'] == 'T').sum()
        if 'asexual' in version:
            dN_dneutral = n_substitutions / (1920 * 2)
            # pN/pneutral is not computed for asexual models; the candidate
            # formula was left disabled in the original cell:
            #   mut_df_sub.loc[idx_PM, 'AF'].sum() * 2 / (4*Ne*1e-7*1.6e5)
            # Store NaN so every column keeps one entry per version; without
            # it pd.DataFrame(summary_dict) fails on unequal column lengths.
            pN_pneutral = np.nan
        else:
            dN_dneutral = n_substitutions / 1920
            idx_PM = mut_df_sub['isFixed'] == 'F'
            pN_pneutral = mut_df_sub.loc[idx_PM, 'AF'].sum() / (4*Ne*1e-7*1.6e5)

        summary_dict.setdefault(f'dNdneutral_{rep}', []).append(dN_dneutral)
        summary_dict.setdefault(f'pNpneutral_{rep}', []).append(pN_pneutral)

Pseudo
Neutral
Adaptive
Neutral_dom
Adaptive_dom
AdapTrack_stablespace20
AdapTrack_dom
AdapTrack_env2
AdapTrack_env5
AdapTrack_env10
AdapTrack_env20
AdapTrack_int2
AdapTrack_int25
AdapTrack_int100
AdapTrack_int200
AdapTrack_pop500
AdapTrack_pop1000
AdapTrack_pop5000
AdapTrack_pop10000
AdapTrack_pop50000
AdapTrack_neufrac0.05
AdapTrack_neufrac0.15
AdapTrack_neufrac0.2
AdapTrack_neufrac0.25
AdapTrack_lambda2
AdapTrack_lambda4
AdapTrack_lambda6
AdapTrack_lambda8
AdapTrack_lambda10
AdapTrack_AP0.15
AdapTrack_AP0.35
AdapTrack_AP0.45
AdapTrack_AP0.5
AdapTrack
AdapTrack_quasineu
AdapTrack_adapt_fluc
AdapTrack_pop_fluc
30

In [15]:
# Turn the column -> list mapping into a table with one row per model version
summary_df = pd.DataFrame(data=summary_dict)

In [16]:
# Add per-type means across replicates (beneficial/neutral/deleterious × inst/mean × P/S).
# Each new column averages the 30 per-replicate count columns of the same name.
# (No outer loop over replicates is needed: the list comprehension already
# spans replicates 1..30, so looping again only repeated the same assignment.)
for meanORinst in ['inst', 'mean']:
    for Type in ['P', 'S']:
        for mut_type in ['beneficial', 'neutral', 'deleterious']:
            rep_cols = [f'{mut_type}_{meanORinst}_{Type}_{rep}' for rep in range(1,31)]
            summary_df[f'{mut_type}_{meanORinst}_{Type}'] = summary_df.loc[:, rep_cols].mean(axis=1)


In [17]:
# Uncomment and run to overwrite the result file
# summary_df.to_csv('./data/SLiM_summary_raw.csv', index=False, na_rep='NA')

## Prepare a table for plotting

In [18]:
# Per-replicate proportions: every count is divided by the sum of the
# beneficial, neutral and deleterious counts of the same replicate
# (inst/mean × P/S).  Columns are collected in a dict and the frame is built
# once at the end.
proportion_cols = {'Type': summary_df['Type']}
for meanORinst in ['inst', 'mean']:
    for rep in range(1, 31):
        for Type in ['P', 'S']:
            suffix = f'{meanORinst}_{Type}_{rep}'
            # Denominator shared by the three mutation classes
            class_total = (summary_df[f'deleterious_{suffix}'] + summary_df[f'beneficial_{suffix}'] +
                           summary_df[f'neutral_{suffix}'])
            for mut_type in ['beneficial', 'neutral', 'deleterious']:
                proportion_cols[f'{mut_type}_{suffix}'] = summary_df[f'{mut_type}_{suffix}'] / class_total
tmp_df = pd.DataFrame(proportion_cols)


In [19]:
# Mean proportion across the 30 replicates for every combination of
# beneficial/neutral/deleterious × inst/mean × P/S, plus the 'Type' label.
mean_proportion_cols = {'Type': summary_df['Type']}
for meanORinst in ['inst', 'mean']:
    for Type in ['P', 'S']:
        for mut_type in ['beneficial', 'neutral', 'deleterious']:
            replicate_cols = [f'{mut_type}_{meanORinst}_{Type}_{rep}' for rep in range(1,31)]
            mean_proportion_cols[f'{mut_type}_{meanORinst}_{Type}'] = tmp_df[replicate_cols].mean(axis=1)
plt_df = pd.DataFrame(mean_proportion_cols)


In [20]:
# Append dN/dneutral and pN/pneutral information to plt_df
# (all dNdneutral_{rep} columns first, then all pNpneutral_{rep} columns).
ratio_cols = [f'dNdneutral_{rep}' for rep in range(1,31)] + [f'pNpneutral_{rep}' for rep in range(1,31)]
# Drop any copies left by a previous run of this cell so that re-running it
# does not append duplicate columns; on a first run nothing is dropped.
plt_df = pd.concat(
    [plt_df.drop(columns=ratio_cols, errors='ignore'), summary_df.loc[:, ratio_cols]],
    axis=1,
)

In [21]:
# Uncomment and run to overwrite the result file
# plt_df.to_csv('./data/Simulation_summary.csv',index=False)