In [1]:
import pandas as pd
import numpy as np
import re
import pickle as pkl
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Model versions analysed throughout the notebook. Membership in
# `nonquasi_list` is tested later when choosing which selection-coefficient
# column to read; note that 'quasi_stablespace20' is deliberately kept in this
# list even though its name starts with 'quasi'.
nonquasi_list = [
    'Pseudo',
    'Neutral',
    'Adaptive',
    'Neutral_dom',
    'Adaptive_dom',
    'quasi_stablespace20',
]

# 'quasi' model variants, grouped by the parameter encoded in the name.
# NOTE(review): apart from 'pop' (parsed later as the population size Ne), the
# meaning of each suffix is not documented in this notebook - confirm against
# the simulation scripts.
quasi_list = (
    ['quasi_dom']
    # env
    + ['quasi_env2', 'quasi_env5', 'quasi_env10', 'quasi_env20']
    # int
    + ['quasi_int2', 'quasi_int25', 'quasi_int100', 'quasi_int200']
    # pop (population size)
    + ['quasi_pop500', 'quasi_pop1000', 'quasi_pop5000', 'quasi_pop10000', 'quasi_pop50000']
    # neufrac
    + ['quasi_neufrac0.05', 'quasi_neufrac0.15', 'quasi_neufrac0.2', 'quasi_neufrac0.25']
    # lambda
    + ['quasi_lambda2', 'quasi_lambda4', 'quasi_lambda6', 'quasi_lambda8', 'quasi_lambda10']
    # AP
    + ['quasi_AP0.15', 'quasi_AP0.35', 'quasi_AP0.45', 'quasi_AP0.5']
)
# 

## Get allele frequency spectrum

In [3]:
def get_spectrum(mut_df,Ne):
    """Return the cumulative allele frequency spectrum of ``mut_df``.

    Parameters
    ----------
    mut_df : pandas.DataFrame
        Table with an ``'AF'`` column. Each value is divided by ``2*Ne`` to
        obtain a frequency (presumably ``AF`` is an allele count in a diploid
        population - confirm against the simulation output). The frame is
        NOT modified.
    Ne : int or float
        Population size used for the ``2*Ne`` normalisation.

    Returns
    -------
    list
        118 values. Entry ``k`` is the fraction of rows whose frequency is
        strictly below the ``k``-th threshold; thresholds run from 0.001 to
        0.020 in steps of 0.001, then from 0.03 to 1.00 in steps of 0.01.
    """
    # Work on a local Series: assigning back into mut_df['AF'] would rescale
    # the caller's data, so a second call on the same frame would divide by
    # 2*Ne twice and silently return a different spectrum.
    freq = mut_df['AF'] / (2 * Ne)
    total_alleles = len(mut_df)

    # Upper bin edges: fine bins for rare alleles, coarse bins for the rest.
    thresholds = np.concatenate([
        np.linspace(0, 0.019, 20) + 0.001,
        np.linspace(0.02, 0.99, 98) + 0.01,
    ])

    # Cumulative: each entry counts every allele below the edge, not just
    # those falling inside a single bin.
    return [(freq < upper).sum() / total_alleles for upper in thresholds]

In [4]:
def get_mean_spectrum(rep,version):
    """Average the cumulative allele frequency spectrum over 20 sampled generations.

    For replicate ``rep`` of model ``version`` this reads the whitespace
    separated files
    ``./data/Simulation_log/rep{rep}/{version}_20samples/{version}_{N_gen}.txt``
    for every 1000th generation among the last 19000 generations of the run
    (20 files), computes ``get_spectrum`` on each, and returns the
    element-wise mean.

    Parameters
    ----------
    rep : int
        Replicate number used in the directory name.
    version : str
        Model name. A ``pop<digits>`` suffix sets the population size ``Ne``;
        otherwise ``Ne`` is 10000.

    Returns
    -------
    numpy.ndarray
        Mean spectrum over the sampled generations.
    """
    target_dir = f'./data/Simulation_log/rep{rep}/{version}_20samples/'
    # The pop50000 run is read up to generation 160000, all others to 200000.
    if version == 'quasi_pop50000':
        N_gen_total = 160000
    else:
        N_gen_total = 200000
    if 'pop' in version:
        Ne = int(re.search(r'pop(\d+)',version)[1])
    else:
        Ne = 10000
    spectrum_list = []
    for N_gen in range(N_gen_total-19000,N_gen_total+1,1000):
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True.
        mut_df = pd.read_table(target_dir+f'{version}_{N_gen}.txt',header=None,sep=r'\s+')
        mut_df = mut_df.rename(columns={4:'ID',7:'s',10:'SG',11:'AF'})

        if version == 'quasi_stablespace20':
            # In this model one ID can appear on several rows; collapse them
            # into a single row holding the summed AF. sort=False keeps the
            # IDs in order of first appearance.
            # NOTE(review): why IDs repeat for this model is not visible
            # here - confirm against the simulation script.
            mut_df = mut_df.groupby('ID', sort=False)['AF'].sum().reset_index()

        spectrum_list.append(get_spectrum(mut_df,Ne))
    return np.mean(spectrum_list,axis=0)

In [None]:
# Mean cumulative spectrum for every model version: first averaged over the
# sampled generations inside get_mean_spectrum, then over replicates 1-30 here.
spectrum_final_dict = {}
for version in nonquasi_list + quasi_list:
    print(version)
    spectrum_mean_list = []
    for rep in range(1, 31):
        # '\r' rewrites the same output line so the counter acts as a progress indicator
        print(rep, end='\r', flush=True)
        spectrum_mean_list.append(get_mean_spectrum(rep, version))
    spectrum_final_dict[version] = np.mean(spectrum_mean_list, axis=0)

In [10]:
# Optional: uncomment to save the averaged spectra (spectrum_final_dict) as a pickle.
# with open('./data/Allele_spectrum_v3.pkl','wb') as file:
#     pkl.dump(spectrum_final_dict,file)

## Get summary statistics for each model

In [10]:
# Count deleterious / beneficial / neutral mutations for every model version
# and replicate. summary_dict maps a column name to a list holding one entry
# per version, in the order of nonquasi_list + quasi_list.
summary_dict = {}
selcoeff_cutoff = 0 # 1/(2*Ne)

for version in nonquasi_list + quasi_list:
    print(version)
    # setdefault(...).append(...) creates each column the first time it is
    # needed, so the loop no longer requires 'Pseudo' to be the first version
    # (the previous special-casing raised KeyError otherwise).
    summary_dict.setdefault('Type', []).append(version)

    if 'pop' in  version:
        Ne = int(version.split('pop')[1])
    else:
        Ne = 10000

    # Rows are kept when Gen_E is at least this value (or equals -1).
    # NOTE(review): presumably Gen_E is the generation a mutation left the
    # population and -1 marks mutations still present - confirm.
    gen_cutoff = 40000 if Ne == 50000 else 80000

    for rep in range(1,31):
        print(rep,end='\r',flush=True)
        mut_df = pd.read_csv(
            f'./data/Simulation_log/rep{rep}/Mutation_summary_{version}.txt')

        mut_df_sub = mut_df[(mut_df['Gen_E']>=gen_cutoff) | (mut_df['Gen_E']==-1)]

        for mut_type in ['P','S']: # P for polymorphism, S for substitution
            if mut_type == 'P':
                idx = mut_df_sub['isFixed'] == 'F'
            elif mut_type == 'S':
                idx = mut_df_sub['isFixed'] == 'T'
            for fitness_type in ['effective','inst']:
                if fitness_type == 'effective':
                    # For versions in nonquasi_list the 'effective' counts are
                    # also taken from cur_selcoef.
                    if version in nonquasi_list:
                        key = 'cur_selcoef'
                    else:
                        key = 'effective_selcoef_1'
                elif fitness_type == 'inst':
                    key = 'cur_selcoef'

                # Select once and reuse for the three sign classes.
                selcoef = mut_df_sub.loc[idx,key]
                counts = {
                    'deleterious': (selcoef < -selcoeff_cutoff).sum(),
                    'beneficial': (selcoef > selcoeff_cutoff).sum(),
                    'neutral': (selcoef.abs() <= selcoeff_cutoff).sum(),
                }
                for label, count in counts.items():
                    summary_dict.setdefault(
                        f'{label}_{fitness_type}_{mut_type}_{rep}', []).append(count)

        # NOTE(review): 1920 is an unexplained normalising constant - confirm
        # what it represents before changing the simulation setup.
        summary_dict.setdefault(f'dNdneutral_{rep}', []).append(
            (mut_df_sub['isFixed'] == 'T').sum()/1920)

Pseudo
Neutral
Adaptive
Neutral_dom
Adaptive_dom
quasi_stablespace20
quasi_dom
quasi_env2
quasi_env5
quasi_env10
quasi_env20
quasi_int2
quasi_int25
quasi_int100
quasi_int200
quasi_pop500
quasi_pop1000
quasi_pop5000
quasi_pop10000
quasi_pop50000
quasi_neufrac0.05
quasi_neufrac0.15
quasi_neufrac0.2
quasi_neufrac0.25
quasi_lambda2
quasi_lambda4
quasi_lambda6
quasi_lambda8
quasi_lambda10
quasi_AP0.15
quasi_AP0.35
quasi_AP0.45
quasi_AP0.5
30

In [11]:
# One row per model version (named in the 'Type' column); every other key of
# summary_dict becomes a column.
summary_df = pd.DataFrame(summary_dict)

In [49]:
# Add, for every (fitness definition, P/S, sign class) combination, a column
# holding the mean count over the 30 replicate columns '<name>_1' .. '<name>_30'.
# The replicates are enumerated inside the comprehension, so no outer loop over
# `rep` is needed (the previous one recomputed each mean 30 times).
for meanORinst in ['inst','effective']:
    for Type in ['P','S']:
        for mut_type in ['beneficial','neutral','deleterious']:
            summary_df[f'{mut_type}_{meanORinst}_{Type}'] = \
                summary_df.loc[:,[f'{mut_type}_{meanORinst}_{Type}_{rep}' for rep in range(1,31)]].mean(axis=1)


In [None]:
# Optional: uncomment to export the per-replicate counts and their replicate means to CSV.
#summary_df.to_csv('./data/SLiM_summary_raw.csv', index=False, na_rep='NA')

## Prepare a table for plotting

In [50]:
# Convert the per-replicate counts into fractions: each count is divided by the
# total (deleterious + beneficial + neutral) of the same fitness definition,
# P/S class and replicate.
# The columns are collected in a dict and the frame is built once, instead of
# inserting 360 columns one at a time into an existing DataFrame (which
# fragments it and is what pandas' PerformanceWarning complains about).
fraction_columns = {'Type': summary_df['Type']}
for meanORinst in ['inst','effective']:
    for rep in range(1,31):
        for Type in ['P','S']:
            # The denominator is shared by the three sign classes.
            replicate_total = (
                summary_df[f'deleterious_{meanORinst}_{Type}_{rep}']
                + summary_df[f'beneficial_{meanORinst}_{Type}_{rep}']
                + summary_df[f'neutral_{meanORinst}_{Type}_{rep}']
            )
            for mut_type in ['beneficial','neutral','deleterious']:
                fraction_columns[f'{mut_type}_{meanORinst}_{Type}_{rep}'] = \
                    summary_df[f'{mut_type}_{meanORinst}_{Type}_{rep}'] / replicate_total
tmp_df = pd.DataFrame(fraction_columns)


In [51]:
# Plotting table: one row per model version, one column per
# (sign class, fitness definition, P/S) combination holding the mean of the
# per-replicate fractions in tmp_df over replicates 1-30.
mean_fraction_columns = {'Type': summary_df['Type']}
for meanORinst in ['inst','effective']:
    for Type in ['P','S']:
        for mut_type in ['beneficial','neutral','deleterious']:
            column_stem = f'{mut_type}_{meanORinst}_{Type}'
            replicate_columns = [f'{column_stem}_{rep}' for rep in range(1,31)]
            mean_fraction_columns[column_stem] = tmp_df[replicate_columns].mean(axis=1)
plt_df = pd.DataFrame(mean_fraction_columns)


In [52]:
# Append the 30 per-replicate dNdneutral columns to the plotting table.
# Any dNdneutral columns already present are dropped first, so re-running this
# cell does not add a second copy of them.
dnd_cols = [f'dNdneutral_{rep}' for rep in range(1,31)]
plt_df = pd.concat(
    [plt_df.drop(columns=dnd_cols, errors='ignore'), summary_df.loc[:,dnd_cols]],
    axis=1)

In [18]:
# Optional: uncomment to export the plotting table to CSV.
#plt_df.to_csv('Simulation_summary.csv',index=False)