In [1]:
import pandas as pd
import numpy as np
import warnings
import scipy.stats as stats
from scipy.integrate import quad
import pickle as pkl
warnings.filterwarnings('ignore')

In [2]:
# Top-level container filled by the loading cells below:
# DFE_dict[dataset][env] -> DataFrame with columns f'{env}_fitness' and f'{env}_P'.
DFE_dict = dict()

In [3]:
# Load DFEs of yeast genes.
# Result: DFE_dict['Yeast21genes'][env] is a two-column frame
# (f'{env}_fitness', f'{env}_P') restricted to nonsynonymous mutations.
# YPD is built from the per-gene files (4 replicates); the other three
# environments are built from the combined CSV (3 replicates each).
DFE_dict_tmp = {}

Gene_list = ['ADA2','PRS3','ASC1','RAD6','BFR1','RPL29',
             'BUD23','RPL39','CCW12','RPS7A','EOS1','SNF6','GET1',
             'TSR2','GIM5','VMA21','IES6','VMA7','LSM1','EST1','PAF1']

# Read every per-gene table first and concatenate once: growing a frame with
# pd.concat inside the loop re-copies all previously loaded rows on each pass.
gene_frames = []
for Gene in Gene_list:
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    df_gene = pd.read_csv(f'./data/Fitness_landscapes/Xukang/{Gene}.txt',
                          sep=r'\s+')
    df_gene['Gene'] = Gene
    gene_frames.append(df_gene)
df_YPD = pd.concat(gene_frames, ignore_index=True)
# .copy() so the column assignments below write to an independent frame, not a slice.
df_YPD = df_YPD[df_YPD['Mutation_type'] != 'Nonsense_mutation'].copy()

#df_YPD.iloc[:,2:6] = np.log(df_YPD.iloc[:,2:6])
ypd_rep_cols = [f'Fitness_from_YPD_replicate_{i}' for i in range(1,5)]
df_YPD['YPD_fitness'] = df_YPD[ypd_rep_cols].mean(axis=1)
# Row-wise one-sample t-test of the replicate fitness values against 1.
_,P = stats.ttest_1samp(df_YPD[ypd_rep_cols],1,axis=1)
df_YPD['YPD_P'] = P

# NOTE(review): absolute, user-specific path -- this cell only runs on the original
# machine. Consider moving the file under ./data and reading it via a relative path.
df_all  = pd.read_csv('/home/siliang/Xukang/Fitness_Distribution/All_mutations_four_env_SNF6_two_replicates_se.csv')
df_all = df_all[df_all['Mutation_type'] != 'Nonsense_mutation'].copy()

DFE_dict_tmp['YPD'] = \
    df_YPD.loc[df_YPD['Mutation_type'] == 'Nonsynonymous_mutation',['YPD_fitness','YPD_P']]

for env in ['SC_37','YPD_H2O2','YPE']:
    env_rep_cols = [f'Fitness_from_{env}_replicate_{i}' for i in range(1,4)]
    df_all[f'{env}_fitness'] = df_all[env_rep_cols].mean(axis=1)
    _,P = stats.ttest_1samp(df_all[env_rep_cols],1,axis=1)
    df_all[f'{env}_P'] = P
    DFE_dict_tmp[env] = \
        df_all.loc[df_all['Mutation_type'] == 'Nonsynonymous_mutation',[f'{env}_fitness',f'{env}_P']]

DFE_dict['Yeast21genes'] = DFE_dict_tmp

In [4]:
# Build DFE_dict['HSP90']: one (fitness, P) frame per environment, restricted to
# rows that are nonsynonymous and carry exactly one mutation (N_mut == 1).
DFE_dict_tmp = {}
hsp_df = pd.read_csv('./data/Fitness_landscapes/HSP90/combined_fitness_2.csv')
idx = (hsp_df['N_mut'] == 1) & (hsp_df['mut_type'] == 'Nonsynonymous')
subkeys = []  # not used anywhere in the visible notebook
hsp90_envs = ('standard_rep1','standard_rep2','diamide','ethanol','nitrogen_depletion','salt','37C')
for env in hsp90_envs:
    # Source columns are named fitness_<env>/P_<env>; the rest of the notebook
    # expects <env>_fitness/<env>_P.
    renamed_cols = {f'fitness_{env}': f'{env}_fitness', f'P_{env}': f'{env}_P'}
    tmp_df = hsp_df.loc[idx, list(renamed_cols)].rename(columns=renamed_cols)
    DFE_dict_tmp[env] = tmp_df
DFE_dict['HSP90'] = DFE_dict_tmp

In [5]:
# Build DFE_dict['Ubiquitin'] with a single environment, 'standard', from the
# nonsynonymous single-mutation rows of the ubiquitin table.
DFE_dict_tmp = {}
ubi_df = pd.read_csv('./data/Fitness_landscapes/Ubiquitin/Ubiquitin_fitness.csv')
env = 'standard'
single_nonsyn = (ubi_df['mut_type'] == 'Nonsynonymous') & (ubi_df['N_mut'] == 1)
tmp_df = ubi_df.loc[single_nonsyn, ['fitness','P']]
# Rename to the <env>_fitness/<env>_P convention used by the analysis functions.
tmp_df = tmp_df.rename(columns={'fitness': f'{env}_fitness', 'P': f'{env}_P'})
DFE_dict_tmp[env] = tmp_df
DFE_dict['Ubiquitin'] = DFE_dict_tmp

In [6]:
# Build DFE_dict['DHFR'] for the 'no_Lon' and 'Lon' result files.
DFE_dict_tmp = {}
# Fitness is derived as exp(growth_rate * (1/0.23)).
# NOTE(review): the meaning of the 0.23 constant is not visible here -- confirm.
growth_rate_scale = 1/0.23
for env in ['no_Lon','Lon']:
    result_df = pd.read_csv(f'./data/Fitness_landscapes/DHFR/DHFR_{env}_result.csv')
    result_df['fitness'] = np.exp(result_df['growth_rate']*growth_rate_scale)
    # Keep nonsynonymous rows whose derived fitness is not missing.
    keep_rows = (result_df['mut_type'] == 'Nonsynonymous') & result_df['fitness'].notna()
    tmp_df = result_df.loc[keep_rows, ['fitness','P']]
    tmp_df = tmp_df.rename(columns={'fitness': f'{env}_fitness', 'P': f'{env}_P'})
    DFE_dict_tmp[env] = tmp_df
DFE_dict['DHFR'] = DFE_dict_tmp


In [7]:
def downsample_ben(df_rep,env,Ne,N_sample=10000,ben_frac=0.1,random_state=None):
    """Resample a DFE so that beneficial mutations are down-weighted.

    Two bootstrap samples (drawn with replacement) of ``N_sample`` rows are built:

    * ``df_combined``: ``int(N_sample*F_ben*ben_frac)`` rows with fitness > 1 plus
      rows with fitness <= 1 filling the remainder, where ``F_ben`` is the
      observed fraction of rows with fitness > 1.
    * ``df_combined_sig``: the same construction after every mutation with
      P > 0.05 has had its fitness set to 1, so only mutations with fitness > 1
      and P <= 0.05 count as beneficial.

    Parameters
    ----------
    df_rep : pandas.DataFrame
        Must contain the columns ``f'{env}_fitness'`` and ``f'{env}_P'``.
    env : str
        Environment name used to build the column names.
    Ne : float
        Not used by this function; kept so existing calls keep working.
    N_sample : int, default 10000
        Number of rows in each returned sample.
    ben_frac : float, default 0.1
        Factor applied to the observed beneficial fraction.
    random_state : int or numpy.random.Generator, optional
        Seed/generator for reproducible sampling. ``None`` (default) keeps the
        previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    (df_combined, df_combined_sig) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If no row has a fitness value greater than -1 (nothing to sample from).
    """
    fit_col = f'{env}_fitness'
    p_col = f'{env}_P'
    # One shared generator for all four draws so a single seed gives independent
    # but reproducible samples; None falls through to pandas' default.
    rng = None if random_state is None else np.random.default_rng(random_state)

    fitness = df_rep[fit_col]
    idx_ben = fitness > 1
    idx_sig = df_rep[p_col] <= 0.05
    # Denominator of the fractions: rows with fitness > -1 (NaN rows are not counted).
    n_measured = (fitness > -1).sum()
    if n_measured == 0:
        raise ValueError(f"no rows with {fit_col} > -1 to sample from")
    F_ben = idx_ben.sum()/n_measured
    F_ben_sig = (idx_ben & idx_sig).sum()/n_measured

    N_ben = int(N_sample*F_ben*ben_frac)
    N_nonben = N_sample - N_ben
    df_ben = df_rep[idx_ben].sample(N_ben,replace=True,random_state=rng)
    df_nonben = df_rep[fitness<=1].sample(N_nonben,replace=True,random_state=rng)
    df_combined = pd.concat([df_nonben,df_ben])

    # Non-significant mutations are treated as neutral (fitness 1) on a copy,
    # leaving the caller's frame untouched.
    df_tmp = df_rep.copy()
    idx = df_tmp[p_col] > 0.05
    df_tmp.loc[idx,fit_col] = 1

    N_ben_sig = int(N_sample*F_ben_sig*ben_frac)
    N_nonben_sig = N_sample - N_ben_sig
    df_ben_sig = df_tmp[idx_ben & idx_sig].sample(N_ben_sig,replace=True,random_state=rng)
    df_nonben_sig = df_tmp[df_tmp[fit_col]<=1].sample(N_nonben_sig,replace=True,random_state=rng)
    df_combined_sig = pd.concat([df_nonben_sig,df_ben_sig])

    return df_combined, df_combined_sig

In [8]:
def get_fix_prob(mean_list,Ne):
    """Fixation probability for each fitness value.

    With ``s = mean_list - 1`` this evaluates
    ``(1 - exp(-2*s)) / (1 - exp(-4*Ne*s))`` element-wise, and assigns the
    neutral value ``1/(2*Ne)`` wherever ``|s| < 1/(2*Ne)`` or ``s == 0``.

    Parameters
    ----------
    mean_list : pandas.Series or numpy.ndarray
        Fitness values, where 1 means no fitness effect.
    Ne : float
        Population size used in the formula.

    Returns
    -------
    Same container type as ``mean_list`` (a Series keeps its index).
    The input is not modified.
    """
    #mean_list = np.exp(mean_list)
    s = mean_list-1
    # expm1 avoids the cancellation in 1-exp(-x) for small s; the two minus signs
    # of (-expm1)/(-expm1) cancel. Overflow for strongly deleterious s (-> 0) and
    # 0/0 at s == 0 (replaced below) are expected, so silence them locally
    # instead of relying on a global warnings filter.
    with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
        fix_prob = np.expm1(-2*s)/np.expm1(-4*Ne*s)
    idx = (np.abs(s) < 1/(2*Ne)) | (s == 0)
    fix_prob[idx] = 1/(2*Ne)
    #fix_prob[fix_prob>1/(2*Ne)] = 1/2
    return fix_prob

def cal_lambda(mean_list,Ne):
    """Return the mean fixation probability of ``mean_list`` multiplied by ``2*Ne``.

    Fixation probabilities come from ``get_fix_prob(mean_list, Ne)``.
    """
    mean_fix_prob = get_fix_prob(mean_list,Ne).mean()
    return mean_fix_prob*2*Ne

def cal_alpha(mean_list,Ne):
    """Share of the total fixation probability carried by non-beneficial mutations.

    A mutation counts as beneficial when ``mean_list - 1 > 1/(2*Ne)``; the result
    is the summed fixation probability (from ``get_fix_prob``) of all other
    mutations divided by the sum over every mutation.

    NOTE(review): as written this is the non-beneficial share, not the
    beneficial one -- confirm this is the intended definition of "alpha".
    """
    fix_prob = get_fix_prob(mean_list,Ne)
    is_beneficial = (mean_list-1) > 1/(2*Ne)
    non_beneficial_total = fix_prob[~is_beneficial].sum()
    return non_beneficial_total/fix_prob.sum()

In [9]:
def L(s,a,u,Ne):
    """Evaluate ``u/s * Ne * ln(Ne) * exp(-a*s) * 2 * (s + 1/a)``.

    Used as the exponent term in ``fixP`` and ``mean_fixP``. Works element-wise
    when ``s`` is a NumPy array. ``s`` must be non-zero (it is a divisor).
    """
    rate_term = u/s*Ne*np.log(Ne)
    decay_term = np.exp(-a*s)
    return rate_term*decay_term*2*(s+1/a)

def fixP(s,a,u,Ne):
    """Return ``2*s*exp(-L(s, a, u, Ne))``."""
    exponent = L(s,a,u,Ne)
    return 2*s*np.exp(-exponent)

def mean_fixP(a,u,Ne):
    """Integrate ``a*2*x*exp(-L(x, a, u, Ne) - a*x)`` over x from 0 to infinity.

    Returns the tuple produced by ``scipy.integrate.quad``; the integral
    estimate is element ``[0]``.
    """
    def integrand(x):
        return a*2*x*np.exp(-L(x,a,u,Ne)-a*x)

    return quad(integrand, 0, np.inf)

In [10]:
def cal_asexual(df_rep,env,Ne,u=1.2e-4):
    """Lambda and Alpha where beneficial mutations share one fixation probability.

    With ``s = df_rep[f'{env}_fitness'] - 1``:

    * mutations with ``s > 1/(2*Ne)`` all receive ``mean_fixP(a, u, Ne)[0]``
      (called ``P_CI`` in the original code), where ``a`` is the reciprocal of
      the mean of those ``s`` values;
    * all other mutations receive ``(1 - exp(-2*s)) / (1 - exp(-4*Ne*s))``
      (called ``P_Kimura`` in the original code), with exactly neutral
      mutations (``s == 0``) given the limiting value ``1/(2*Ne)``.

    Parameters
    ----------
    df_rep : pandas.DataFrame
        Must contain the column ``f'{env}_fitness'``.
    env : str
        Environment name used to build the column name.
    Ne : float
        Population size.
    u : float, default 1.2e-4
        Passed through to ``mean_fixP``.

    Returns
    -------
    (Lambda, Alpha)
        ``Lambda`` is the mean fixation probability times ``2*Ne``; ``Alpha`` is
        the summed fixation probability of mutations with ``s < 1/(2*Ne)``
        divided by the total. Rows with missing fitness stay NaN and are
        skipped by the pandas reductions.
    """
    s = df_rep[f'{env}_fitness']-1
    threshold = 1/(2*Ne)
    idx_ben = s > threshold

    # expm1 keeps precision for small |s|; the signs of (-expm1)/(-expm1) cancel.
    # Overflow for strongly deleterious s (-> 0) and 0/0 at s == 0 are expected.
    with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
        P_fix = np.expm1(-2*s)/np.expm1(-4*Ne*s)
    # s == 0 gives 0/0 = NaN, which pandas would silently drop from the mean and
    # sums below; use the s -> 0 limit of the formula instead.
    P_fix[s == 0] = threshold

    # Only run the integral when there is something to assign it to; otherwise
    # `a` would be NaN and the integration would be wasted.
    if idx_ben.any():
        a = 1/s[idx_ben].mean()
        P_CI = mean_fixP(a,u,Ne)[0]
        P_fix[idx_ben] = P_CI

    Lambda = P_fix.mean()*(2*Ne)
    Alpha = P_fix[s<threshold].sum()/P_fix.sum()
    return Lambda,Alpha

In [11]:
# Compute Lambda/Alpha for every population size, dataset and environment.
# Each stored list holds, in order:
#   [all mutations, non-significant set to neutral, cal_asexual on all mutations,
#    mean over downsampled replicates, mean over downsampled "sig" replicates,
#    mean of cal_asexual over downsampled replicates]

# downsample_ben draws through pandas' .sample(), which uses NumPy's global
# random state when no random_state is given; seed it so a fresh
# Restart & Run All reproduces the same numbers.
RANDOM_SEED = 0
N_DOWNSAMPLE_REPS = 100  # downsampling replicates averaged per environment
np.random.seed(RANDOM_SEED)

alpha_Ne_dict = {}
lambda_Ne_dict = {}
for Ne in [1e4,1e6,1e7,1e8]:
    print(Ne)
    Ne_key = f'{Ne:.0e}'  # e.g. '1e+06'; the key later cells use for lookup
    alpha_Ne_dict[Ne_key] = {}
    lambda_Ne_dict[Ne_key] = {}
    for gene in ['Yeast21genes','HSP90','Ubiquitin','DHFR']:#
        alpha_Ne_dict[Ne_key][gene] = {}
        lambda_Ne_dict[Ne_key][gene] = {}
        print(gene)
        df_env_dict = DFE_dict[gene]
        for env,df_rep in df_env_dict.items():
            fitness_col = f'{env}_fitness'

            Lambda = cal_lambda(df_rep[fitness_col], Ne)
            Alpha = cal_alpha(df_rep[fitness_col], Ne)
            Lambda_asexual,Alpha_asexual = cal_asexual(df_rep,env,Ne,u=1.2e-4)

            # Treat non-significant mutations (P > 0.05) as neutral on a copy.
            df_tmp = df_rep.copy()
            idx = df_tmp[f'{env}_P'] > 0.05
            df_tmp.loc[idx,fitness_col] = 1
            Lambda_sig = cal_lambda(df_tmp[fitness_col], Ne)
            Alpha_sig = cal_alpha(df_tmp[fitness_col], Ne)

            # One row per replicate:
            # (Lambda_ds, Lambda_ds_sig, Lambda_ds_asexual, Alpha_ds, Alpha_ds_sig, Alpha_ds_asexual)
            ds_rows = []
            for i in range(N_DOWNSAMPLE_REPS):
                df_ds_ben, df_ds_ben_sig = downsample_ben(df_rep,env,Ne)

                # The downsampled data use u=1.2e-5, ten-fold below the 1.2e-4 used
                # above -- presumably mirroring downsample_ben's default ten-fold
                # reduction of the beneficial fraction (confirm).
                Lambda_ds_asexual,Alpha_ds_asexual = cal_asexual(df_ds_ben,env,Ne,u=1.2e-5)

                ds_rows.append((
                    cal_lambda(df_ds_ben[fitness_col], Ne),
                    cal_lambda(df_ds_ben_sig[fitness_col], Ne),
                    Lambda_ds_asexual,
                    cal_alpha(df_ds_ben[fitness_col], Ne),
                    cal_alpha(df_ds_ben_sig[fitness_col], Ne),
                    Alpha_ds_asexual,
                ))
            ds_mean = np.mean(ds_rows, axis=0)

            lambda_Ne_dict[Ne_key][gene][env] = \
                [Lambda, Lambda_sig, Lambda_asexual,
                 ds_mean[0], ds_mean[1], ds_mean[2]]
            alpha_Ne_dict[Ne_key][gene][env] = \
                [Alpha, Alpha_sig, Alpha_asexual,
                 ds_mean[3], ds_mean[4], ds_mean[5]]


10000.0
Yeast21genes
HSP90
Ubiquitin
DHFR
1000000.0
Yeast21genes
HSP90
Ubiquitin
DHFR
10000000.0
Yeast21genes
HSP90
Ubiquitin
DHFR
100000000.0
Yeast21genes
HSP90
Ubiquitin
DHFR


In [15]:
# Walk the stored results for the three larger population sizes. Only the
# population-size key and dataset name are printed; the per-environment table
# print is kept commented out below.
for Ne in ('1e+06','1e+07','1e+08'):
    print(Ne)
    lambda_gene_dict, alpha_gene_dict = lambda_Ne_dict[Ne], alpha_Ne_dict[Ne]
    for gene in ('Yeast21genes', 'HSP90', 'Ubiquitin', 'DHFR'):
        print(gene)
        lambda_env_dict, alpha_env_dict = lambda_gene_dict[gene], alpha_gene_dict[gene]
        for env in lambda_env_dict:
            lambda_list = lambda_env_dict[env]
            alpha_list = alpha_env_dict[env]
            #print(env)
            #print(
            #    lambda_list[0],lambda_list[1],lambda_list[2],lambda_list[3],lambda_list[4],lambda_list[5],
            #    alpha_list[0],alpha_list[1],alpha_list[2],alpha_list[3],alpha_list[4],alpha_list[5],
            #    sep='\t'
            #)

1e+06
Yeast21genes
HSP90
Ubiquitin
DHFR
1e+07
Yeast21genes
HSP90
Ubiquitin
DHFR
1e+08
Yeast21genes
HSP90
Ubiquitin
DHFR


In [67]:
# with open('./data/lambda_Ne_dict.pkl','wb') as f:
#     pkl.dump(lambda_Ne_dict,f)
# with open('./data/alpha_Ne_dict.pkl','wb') as f:
#     pkl.dump(alpha_Ne_dict,f)