In [1]:
import numpy as np
import numpy.random as nrand
from scipy.stats import norm, rv_continuous, pearsonr, spearmanr
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import time
from statsmodels.distributions.empirical_distribution import ECDF, monotone_fn_inverter

In [2]:
def average_PCC(env_mut_fit_list):
    """Mean pairwise Pearson correlation between the rows of a 2-D array.

    Parameters
    ----------
    env_mut_fit_list : numpy.ndarray
        2-D array; every row is treated as one variable by ``np.corrcoef``.

    Returns
    -------
    float or bool
        Mean of the strictly-upper-triangular entries of the row-by-row
        correlation matrix, or ``False`` when the array has a single row.
    """
    n_rows = env_mut_fit_list.shape[0]
    if n_rows == 1:
        # With one row there is no pair to correlate.
        return False
    corr = np.corrcoef(env_mut_fit_list)
    # k=1 skips the diagonal so each unordered pair is counted exactly once.
    pair_rows, pair_cols = np.triu_indices(n_rows, k=1)
    return corr[pair_rows, pair_cols].mean()

def average_CV(env_mut_fit_list):
    """Average coefficient of variation over the columns of a 2-D array.

    For every column the sample standard deviation (``ddof=1``) across rows
    is divided by the column mean; the mean of these ratios is returned.

    Parameters
    ----------
    env_mut_fit_list : numpy.ndarray
        2-D array whose rows are compared column by column.

    Returns
    -------
    float or bool
        The mean per-column CV, or ``False`` when the array has a single row.
    """
    if env_mut_fit_list.shape[0] == 1:
        # A sample standard deviation needs at least two rows.
        return False
    column_sd = np.std(env_mut_fit_list, axis=0, ddof=1)
    column_mean = np.mean(env_mut_fit_list, axis=0)
    return np.mean(column_sd / column_mean)
    
def average_std(env_mut_fit_list):
    """Mean sample standard deviation over columns that exceed 1 somewhere.

    Parameters
    ----------
    env_mut_fit_list : numpy.ndarray
        2-D array whose rows are compared column by column.

    Returns
    -------
    float or bool
        Mean of the per-column sample standard deviations (``ddof=1``),
        restricted to columns holding at least one value greater than 1, or
        ``False`` when the array has a single row.
    """
    if env_mut_fit_list.shape[0] == 1:
        return False
    # Keep only the columns in which at least one row is above 1.
    has_value_above_one = (env_mut_fit_list > 1).any(axis=0)
    kept_columns = env_mut_fit_list[:, has_value_above_one]
    return kept_columns.std(axis=0, ddof=1).mean()

In [3]:
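## mut_fit_list_rev_cdf: inverse CDF of the fitness distribution (maps uniform draws to fitness values)
## N_mut: number of mutations
## N_env: number of environments
## env_variance: scale (standard deviation) of the normal noise that models fitness change across environments
## se_list: standard error(s) of the fitness measurement, used as the scale of the measurement noise
# The function first draws a fitness list and sorts it, so the index of each element in the list
# corresponds to its rank in the list. For each environment it then adds normally distributed
# noise to each fitness value and takes the resulting rank of each mutation. A freshly drawn,
# sorted fitness sample is assigned to the mutations according to these ranks, which gives the
# fitness list in a new environment while maintaining the same shape of distribution.
# Measurement error is then added to every fitness value.
# This process is repeated N_env times; the function returns the per-mutation minimum fitness
# across environments, the average CV, and the full environment-by-mutation fitness array.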

def average_env_fit(mut_fit_list_rev_cdf,N_mut,N_env,env_variance,se_list):
    """Simulate the fitness of ``N_mut`` mutations in ``N_env`` environments.

    Parameters
    ----------
    mut_fit_list_rev_cdf : callable
        Maps an array of uniform(0, 1) draws to fitness values.
    N_mut : int
        Number of mutations (length of every simulated fitness vector).
    N_env : int
        Number of environments to simulate.
    env_variance : float
        Passed as ``scale`` to the normal noise that perturbs the baseline
        fitness before ranking.
    se_list : array-like or float
        Passed as ``scale`` to the normal noise that models measurement error.

    Returns
    -------
    lowest : numpy.ndarray
        Per-mutation minimum fitness across the simulated environments.
    CV : float or bool
        Result of ``average_CV`` on the environment-by-mutation array.
    env_mut_fit_list : numpy.ndarray
        Array with one row per environment and one column per mutation.
    """
    # Baseline fitness values in ascending order, so position equals rank.
    baseline = mut_fit_list_rev_cdf(nrand.rand(N_mut)) # using continuous distribution
    baseline.sort()

    per_env_fitness = []
    for _ in range(N_env):
        perturbed = baseline + nrand.normal(scale=env_variance,size=N_mut)
        # argsort applied twice turns values into their ranks.
        ranks = perturbed.argsort().argsort()
        # Draw a fresh sorted sample and hand each mutation the value that
        # sits at its perturbed rank; the marginal distribution is unchanged.
        resampled = mut_fit_list_rev_cdf(nrand.rand(N_mut))
        resampled.sort()
        #### introduce measurement error ####
        observed = nrand.normal(loc=resampled[ranks], scale=se_list)
        per_env_fitness.append(observed)

    env_mut_fit_list = np.array(per_env_fitness)
    CV = average_CV(env_mut_fit_list)
    lowest = env_mut_fit_list.min(axis=0) # use the minimum
    return lowest, CV, env_mut_fit_list

In [4]:
# get_env_given_CV runs an automatic search that adjusts env_variance until the simulated CV matches the target CV,
# and returns, for a preset number of replicates, the per-mutation minimum fitness across environments.

def get_env_given_CV(mut_fit_list_rev_cdf,N_mut,N_env,target_CV,mut_type,cutoff = 0.0001, rep=1):
    """Search for an environmental-noise scale whose simulated CV hits ``target_CV``.

    ``average_env_fit`` is called repeatedly. Whenever the simulated CV lies
    within ``cutoff`` of ``target_CV`` the simulation is kept as one
    replicate; otherwise ``env_variance`` is moved towards the target and the
    step is halved each time the search direction flips.

    Parameters
    ----------
    mut_fit_list_rev_cdf : callable
        Inverse CDF handed through to ``average_env_fit``.
    N_mut : int
        Number of mutations.
    N_env : int
        Number of environments per simulation.
    target_CV : float
        CV the simulation should reproduce.
    mut_type : {'Nonsynonymous', 'Synonymous'}
        Selects which rows of the global ``df_empirical`` supply the
        measurement standard errors (via the global ``non_idx`` / ``syn_idx``).
    cutoff : float, optional
        Accepted absolute deviation between simulated and target CV.
    rep : int, optional
        Number of accepted simulations to collect before returning.

    Returns
    -------
    numpy.ndarray
        One row per accepted replicate: the first return value of
        ``average_env_fit`` (the per-mutation minimum across environments).
    list
        Simulated CV of every accepted replicate.
    list
        Environment-by-mutation fitness array of every accepted replicate.

    Raises
    ------
    ValueError
        If ``mut_type`` is not one of the two supported labels.
    """
    if mut_type == 'Nonsynonymous':
        mut_idx = non_idx
    elif mut_type == 'Synonymous':
        mut_idx = syn_idx
    else:
        # Previously fell through with mut_idx unbound and failed later with
        # an unrelated-looking NameError.
        raise ValueError(
            f"mut_type must be 'Nonsynonymous' or 'Synonymous', got {mut_type!r}"
        )

    # The standard errors do not change during the search; look them up once.
    se_list = df_empirical[mut_idx]['YPD_fitness_se']

    env_variance = 0.001 # initial value of env_variance; updated during the search
    step_size = 0.0001 # step size of the search
    # +1: CV below target (raise env_variance); -1: CV above target (lower it);
    #  0: CV within the cutoff (replicate accepted).
    direction_flag_list = []
    rep_list = []
    CV_list = []
    env_mut_fit_list_list = []
    # NOTE(review): nothing bounds env_variance from below or limits the number
    # of iterations; an unreachable target_CV can push the scale negative or
    # keep the loop running — confirm targets are attainable.
    while True:
        lowest, CV, env_mut_fit_list = average_env_fit(
            mut_fit_list_rev_cdf,N_mut,N_env,env_variance,
            se_list
        )
        deviation = CV - target_CV
        if deviation > cutoff:
            # calculated CV is greater than target CV
            direction_flag_list.append(-1)
        elif deviation < -cutoff:
            # calculated CV is lower than target CV
            direction_flag_list.append(1)
        else:
            # calculated CV is within the cutoff: keep this replicate and
            # simulate again with the same env_variance
            direction_flag_list.append(0)
            rep_list.append(lowest)
            env_mut_fit_list_list.append(env_mut_fit_list)
            CV_list.append(CV)
            if len(rep_list) == rep:
                break
            continue

        if len(direction_flag_list) >= 2 and direction_flag_list[-1]*direction_flag_list[-2] == -1:
            # the last two moves went in opposite directions: refine the step
            step_size = step_size/2

        # update env_variance according to step_size and direction flag
        env_variance += step_size*direction_flag_list[-1]

    return np.array(rep_list), CV_list, env_mut_fit_list_list
        


In [5]:
def subsample_env(env_mut_fit_list_all,N_env,rep):
    """Draw ``rep`` random subsets of ``N_env`` environments and summarise each.

    Parameters
    ----------
    env_mut_fit_list_all : numpy.ndarray
        Array indexed by environment along its first axis.
    N_env : int
        Number of environments drawn (without replacement) per subset.
    rep : int
        Number of subsets to draw.

    Returns
    -------
    mean_list : numpy.ndarray
        Per-subset geometric mean over the drawn environments (axis 0).
    CV_list : numpy.ndarray
        ``average_CV`` of every subset.
    env_mut_fit_list_list : numpy.ndarray
        The drawn subsets themselves, stacked along a new first axis.
    """
    n_available = len(env_mut_fit_list_all)
    subsets, geometric_means, cvs = [], [], []
    for _ in range(rep):
        chosen = nrand.choice(range(n_available),N_env,replace=False)
        subset = env_mut_fit_list_all[chosen]
        subsets.append(subset)
        # exp(mean(log(x))) is the geometric mean across environments.
        geometric_means.append(np.exp(np.log(subset).mean(axis=0)))
        cvs.append(average_CV(subset))
    return np.array(geometric_means), np.array(cvs), np.array(subsets)
    

In [5]:
# Load the empirical per-mutation fitness table. Other cells of this notebook
# read its 'Mutation_type', 'YPD_fitness' and 'YPD_fitness_se' columns.
df_empirical  = pd.read_csv('All_mutations_four_env_SNF6_two_replicates_se.csv')

In [6]:
# Convert the empirical discrete DFE into a smooth DFE: build an empirical CDF
# per mutation class and invert it, so uniform draws map to fitness values.
def _inverse_ecdf(ecdf):
    # The first support point of the ECDF is replaced by 0 before inversion
    # (presumably because ECDF prepends -inf there — TODO confirm).
    support = np.concatenate([[0],ecdf.x[1:]])
    return monotone_fn_inverter(ecdf,support)


# Boolean row masks for the two mutation classes.
non_idx = df_empirical['Mutation_type'] == 'Nonsynonymous_mutation'
syn_idx = df_empirical['Mutation_type'] == 'Synonymous_mutation'

# Observed fitness values of each class as flat arrays.
non_E_distribution = df_empirical[non_idx]['YPD_fitness'].to_numpy().flatten()
syn_E_distribution = df_empirical[syn_idx]['YPD_fitness'].to_numpy().flatten()

non_cdf = ECDF(non_E_distribution)
syn_cdf = ECDF(syn_E_distribution)

non_fit_list_rev_cdf = _inverse_ecdf(non_cdf)
syn_fit_list_rev_cdf = _inverse_ecdf(syn_cdf)

In [None]:
#### parameters need to tune ####

SEED = 0 # seed of numpy's global RNG (used through `nrand`) so a fresh "Run All" reproduces the same draws
replication = 1000
N_env_list = list(range(1,10))+list(range(10,40,2))+list(range(40,100,10))+list(range(100,201,50)) # number of environments
N_non = len(non_E_distribution)
N_syn = len(syn_E_distribution)
target_CV_list = [[0.008,0.003],[0.008,0.004],[0.008,0.005]] # target CV as [nonsynonymous, synonymous]
cutoff_list = [0.98,0.99] # fitness cutoff for purging mutations
CV_tolerance = 0.0001 # accepted |simulated CV - target CV| in get_env_given_CV
res_dict = {
    'N_env':[],'target_non_CV':[],'target_syn_CV':[],'rep':[],'cutoff':[],'dN':[],'dS':[],
    'non_CV':[], 'syn_CV':[]
}

##################################

# Seed once, before any simulation, so the whole sweep is reproducible.
nrand.seed(SEED)

for target_CV in target_CV_list:
    print(target_CV)
    # Skip a target whose synonymous CV already has rows in res_dict.
    # NOTE(review): the check is keyed on the synonymous CV only, so two
    # targets sharing it would collapse into one — fine for the list above.
    if target_CV[1] in res_dict['target_syn_CV']:
        continue

    for N_env in N_env_list:
        print(N_env)
        if N_env == 1:
            # A single environment needs no simulation: reuse the empirical
            # distributions for every replicate; the CV is undefined.
            non_mean_list = np.array([non_E_distribution]*replication)
            syn_mean_list = np.array([syn_E_distribution]*replication)
            non_CV_list = [np.nan]*replication
            syn_CV_list = [np.nan]*replication
        else:
            # get_env_given_CV searches for the noise scale that reproduces the
            # target CV and returns, per replicate, the per-mutation minimum
            # fitness across environments.
            non_mean_list, non_CV_list, non_env_mut_fit_list_list = get_env_given_CV(
                non_fit_list_rev_cdf,
                N_non, N_env, target_CV[0],'Nonsynonymous',
                cutoff = CV_tolerance,
                rep=replication
            )

            syn_mean_list, syn_CV_list, syn_env_mut_fit_list_list = get_env_given_CV(
                syn_fit_list_rev_cdf,
                N_syn, N_env,target_CV[1],'Synonymous',
                cutoff = CV_tolerance,
                rep=replication
            )

        for cutoff in cutoff_list:
            # Fraction of mutations whose fitness stays above the cutoff,
            # computed per replicate.
            dN_list = (non_mean_list>cutoff).sum(axis=1)/N_non
            dS_list = (syn_mean_list>cutoff).sum(axis=1)/N_syn
            res_dict['N_env'] += [N_env]*replication
            res_dict['target_non_CV'] += [target_CV[0]]*replication
            res_dict['target_syn_CV'] += [target_CV[1]]*replication
            res_dict['cutoff'] += [cutoff]*replication
            res_dict['rep'] += list(range(replication))
            res_dict['dN'] += list(dN_list)
            res_dict['dS'] += list(dS_list)
            res_dict['non_CV'] += list(non_CV_list)
            res_dict['syn_CV'] += list(syn_CV_list)

res_df = pd.DataFrame(res_dict)
res_df['dNdS'] = res_df.dN/res_df.dS

In [12]:
# Collapse the replicate-level results in res_df into one summary row per
# (target CV pair, number of environments, fitness cutoff) combination.
plot_dict = {
    'N_env':[],'target_non_CV':[],'target_syn_CV':[],'cutoff':[],'dNdS_mean':[],'dNdS_std':[],
    'dNdS_se':[],'non_CV_mean':[],'syn_CV_mean':[],'non_CV_std':[],'syn_CV_std':[],
    'non_CV_se':[],'syn_CV_se':[],'dN':[],'dS':[]
}
# product() iterates its last argument fastest, i.e. the same order as three
# nested loops over these lists.
for target_CV, N_env, cutoff in itertools.product(target_CV_list, N_env_list, cutoff_list):
    idx = (res_df.target_syn_CV == target_CV[1]) & (res_df.cutoff == cutoff)  & (res_df.N_env == N_env)
    # Filter once and reuse the slice for every statistic below.
    replicates = res_df[idx]
    dN_list = replicates.dN
    dS_list = replicates.dS
    dNdS = dN_list/dS_list
    dNdS_mean = dNdS.mean()
    dNdS_std = dNdS.std()
    dNdS_se = dNdS.sem()
    summary_row = {
        'N_env': N_env,
        'target_non_CV': target_CV[0],
        'target_syn_CV': target_CV[1],
        'cutoff': cutoff,
        'non_CV_mean': replicates.non_CV.mean(),
        'syn_CV_mean': replicates.syn_CV.mean(),
        'non_CV_std': replicates.non_CV.std(),
        'syn_CV_std': replicates.syn_CV.std(),
        'non_CV_se': replicates.non_CV.sem(),
        'syn_CV_se': replicates.syn_CV.sem(),
        'dN': dN_list.mean(),
        'dS': dS_list.mean(),
        'dNdS_mean': dNdS_mean,
        'dNdS_std': dNdS_std,
        'dNdS_se': dNdS_se,
    }
    for column, value in summary_row.items():
        plot_dict[column].append(value)
plot_df = pd.DataFrame(plot_dict)

In [None]:
# Plot the expected dN/dS against the number of environments: one figure per
# target-CV pair, one curve per fitness cutoff, with a band of +/- 1.96
# standard deviations of the replicate dN/dS values around each curve.
plt.rcParams.update({'mathtext.default': 'regular'})

for target_CV in target_CV_list:
    fig,ax = plt.subplots(figsize=[5.2,5])
    ax.set_title(f'Nonsynonymous $CV$={target_CV[0]}\n Synonymous $CV$={target_CV[1]}',size=15)
    for cutoff in cutoff_list:
        idx = (plot_df.target_syn_CV == target_CV[1]) & (plot_df.cutoff == cutoff)
        curve = plot_df[idx]
        x = curve.N_env
        y = curve.dNdS_mean
        half_width = curve.dNdS_std*1.96
        ax.plot(x, y, lw=2, label = f'Fitness cutoff = {cutoff}')
        ax.fill_between(x, y-half_width, y+half_width, alpha=.3)
    ax.set_xlabel("Number of different environments",size=15)
    # Raw string: "\m" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r"Expected $\mathit{d}_{N}/\mathit{d}_{S}$",size=15)
    ax.legend(fontsize='large')
    ax.set_xlim(-1,200)
    ax.set_ylim(-0.1,1.1)
    ax.tick_params(axis='both', labelsize=12)
    #plt.savefig(f'lowest_with_se_{int(target_CV[0]*1000)}{int(target_CV[1]*1000)}.pdf')
    