In [2]:
import pandas as pd
import numpy as np
import numpy.random as nrand
from statsmodels.distributions.empirical_distribution import ECDF, monotone_fn_inverter
import matplotlib as mpl
import matplotlib.pyplot as plt
#from brokenaxes import brokenaxes
import warnings
import scipy.stats as stats
from scipy.misc import derivative
from scipy.integrate import quad
import pickle as pkl
warnings.filterwarnings('ignore')

In [3]:
# Registry filled by the loading cells below: DFE_dict[dataset][env] is a DataFrame
# with the two columns '{env}_fitness' and '{env}_P'.
DFE_dict = {}

In [4]:
# Load DFEs of yeast genes.
# Builds DFE_dict['Yeast21genes'] = {env: DataFrame[['{env}_fitness', '{env}_P']]}:
#   * 'YPD' comes from the per-gene mutant tables (four replicates),
#   * 'SC_37', 'YPD_H2O2' and 'YPE' come from the combined CSV (three replicates).
# Nonsense mutations are dropped and only nonsynonymous mutations are stored.
DFE_dict_tmp = {}

Gene_list = ['ADA2','PRS3','ASC1','RAD6','BFR1','RPL29', \
             'BUD23','RPL39','CCW12','RPS7A','EOS1','SNF6','GET1',\
             'TSR2','GIM5','VMA21','IES6','VMA7','LSM1','EST1','PAF1']

# Read every per-gene table first and concatenate once: calling pd.concat inside
# the loop re-copies all previously loaded rows on every iteration.
gene_frames = []
for Gene in Gene_list:
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
    df_gene = pd.read_csv(f'../../Xukang/Mutants/{Gene}.txt', sep=r'\s+')
    df_gene['Gene'] = Gene
    gene_frames.append(df_gene)
df_YPD = pd.concat(gene_frames, ignore_index=True)
# .copy() so the column assignments below write to an independent frame instead
# of a filtered slice of the concatenated table.
df_YPD = df_YPD[df_YPD['Mutation_type'] != 'Nonsense_mutation'].copy()

#df_YPD.iloc[:,2:6] = np.log(df_YPD.iloc[:,2:6])
YPD_rep_cols = [f'Fitness_from_YPD_replicate_{i}' for i in range(1,5)]
df_YPD['YPD_fitness'] = df_YPD[YPD_rep_cols].mean(axis=1)
# Row-wise one-sample t-test of the replicate fitness values against 1.
_,P = stats.ttest_1samp(df_YPD[YPD_rep_cols],1,axis=1)
df_YPD['YPD_P'] = P

# NOTE(review): absolute, user-specific path; the notebook only runs on a machine
# that has this exact location. Consider a configurable data directory.
df_all  = pd.read_csv('/home/siliang/Xukang/Fitness_Distribution/All_mutations_four_env_SNF6_two_replicates_se.csv')
df_all = df_all[df_all['Mutation_type'] != 'Nonsense_mutation'].copy()

DFE_dict_tmp['YPD'] = \
    df_YPD.loc[df_YPD['Mutation_type'] == 'Nonsynonymous_mutation',['YPD_fitness','YPD_P']]

for env in ['SC_37','YPD_H2O2','YPE']:
    env_rep_cols = [f'Fitness_from_{env}_replicate_{i}' for i in range(1,4)]
    df_all[f'{env}_fitness'] = df_all[env_rep_cols].mean(axis=1)
    _,P = stats.ttest_1samp(df_all[env_rep_cols],1,axis=1)
    df_all[f'{env}_P'] = P
    DFE_dict_tmp[env] = \
        df_all.loc[df_all['Mutation_type'] == 'Nonsynonymous_mutation',[f'{env}_fitness',f'{env}_P']]

DFE_dict['Yeast21genes'] = DFE_dict_tmp

In [8]:
# HSP90: one DFE table per environment, restricted to rows flagged as a single
# (N_mut == 1) nonsynonymous mutation. Columns are renamed to the
# '{env}_fitness' / '{env}_P' convention used by the other datasets.
DFE_dict_tmp = {}
hsp_df = pd.read_csv('./HSP90/combined_fitness_2.csv')
idx = (hsp_df['N_mut'] == 1) & (hsp_df['mut_type'] == 'Nonsynonymous')
subkeys = []
for env in ['standard_rep1','standard_rep2','diamide','ethanol','nitrogen_depletion','salt','37C']:
    # Source column -> harmonised column name for this environment.
    col_map = {f'fitness_{env}': f'{env}_fitness', f'P_{env}': f'{env}_P'}
    tmp_df = hsp_df.loc[idx, list(col_map)].rename(columns=col_map)
    DFE_dict_tmp[env] = tmp_df
DFE_dict['HSP90'] = DFE_dict_tmp

In [11]:
# Ubiquitin: a single environment ('standard'); keep rows flagged as a single
# (N_mut == 1) nonsynonymous mutation and rename the columns to the shared
# '{env}_fitness' / '{env}_P' convention.
DFE_dict_tmp = {}
ubi_df = pd.read_csv('./Ubiquitin/Ubiquitin_fitness.csv')
env = 'standard'
single_nonsyn = (ubi_df['mut_type'] == 'Nonsynonymous') & (ubi_df['N_mut'] == 1)
tmp_df = ubi_df.loc[single_nonsyn, ['fitness','P']].rename(
    columns={'fitness': f'{env}_fitness', 'P': f'{env}_P'}
)
DFE_dict_tmp['standard'] = tmp_df
DFE_dict['Ubiquitin'] = DFE_dict_tmp

In [28]:
# DHFR: one result file per environment ('no_Lon', 'Lon'). Fitness is derived
# from the growth rate, then nonsynonymous rows with a defined fitness are kept.
DFE_dict_tmp = {}
for env in ['no_Lon','Lon']:
    result_df = pd.read_csv(f'./DHFR/DHFR_{env}_result.csv')
    # NOTE(review): 0.23 is an undocumented scaling constant -- confirm its origin.
    scaled_growth = result_df['growth_rate']*(1/0.23)
    result_df['fitness'] = np.exp(scaled_growth)
    idx_non = result_df['mut_type'] == 'Nonsynonymous'
    idx_notna = result_df['fitness'].notna()
    tmp_df = result_df.loc[idx_non & idx_notna, ['fitness','P']].rename(
        columns={'fitness': f'{env}_fitness', 'P': f'{env}_P'}
    )
    DFE_dict_tmp[env] = tmp_df
DFE_dict['DHFR'] = DFE_dict_tmp


In [49]:
# YPD DFE of a six-gene subset, built from the per-gene mutant tables
# (four replicates, nonsense mutations removed, nonsynonymous mutations kept).
DFE_dict_tmp = {}
Gene_list = ['ADA2', 'ASC1', 'RPL39', 'TSR2', 'EOS1', 'IES6']

# Read every per-gene table first and concatenate once instead of re-copying
# the growing frame on each loop iteration.
gene_frames = []
for Gene in Gene_list:
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
    df_gene = pd.read_csv(f'../../Xukang/Mutants/{Gene}.txt', sep=r'\s+')
    df_gene['Gene'] = Gene
    gene_frames.append(df_gene)
df_YPD = pd.concat(gene_frames, ignore_index=True)
# .copy() so the column assignments below write to an independent frame.
df_YPD = df_YPD[df_YPD['Mutation_type'] != 'Nonsense_mutation'].copy()
YPD_rep_cols = [f'Fitness_from_YPD_replicate_{i}' for i in range(1,5)]
df_YPD['YPD_fitness'] = df_YPD[YPD_rep_cols].mean(axis=1)
# Row-wise one-sample t-test of the replicate fitness values against 1.
_,P = stats.ttest_1samp(df_YPD[YPD_rep_cols],1,axis=1)
df_YPD['YPD_P'] = P

DFE_dict_tmp['YPD'] = \
    df_YPD.loc[df_YPD['Mutation_type'] == 'Nonsynonymous_mutation',['YPD_fitness','YPD_P']]


# Stored under its own key. This cell used to write to DFE_dict['Yeast21genes'],
# which silently replaced the 21-gene / four-environment entry with this
# six-gene YPD-only table and made downstream results depend on which cells
# had been executed.
# NOTE(review): confirm that overwriting 'Yeast21genes' was not intentional.
DFE_dict['Yeast6genes'] = DFE_dict_tmp

In [64]:
# Load DFEs of yeast genes.
# Builds DFE_dict['Yeast17genes']['YPD'] by stacking
#   * the YPD DFE of six genes read from the per-gene mutant tables, and
#   * the table in Xukang_new_data.csv (presumably 11 further genes, going by the
#     variable names -- confirm),
# each reduced to nonsynonymous mutations with 'YPD_fitness' / 'YPD_P' columns.
DFE_dict_tmp = {}
env = 'YPD'

Gene_list = ['ADA2', 'ASC1', 'RPL39', 'TSR2', 'EOS1', 'IES6']

# Read every per-gene table first and concatenate once instead of re-copying
# the growing frame on each loop iteration.
gene_frames = []
for Gene in Gene_list:
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
    df_gene = pd.read_csv(f'../../Xukang/Mutants/{Gene}.txt', sep=r'\s+')
    df_gene['Gene'] = Gene
    gene_frames.append(df_gene)
df_YPD = pd.concat(gene_frames, ignore_index=True)
# .copy() so the column assignments below write to an independent frame.
df_YPD = df_YPD[df_YPD['Mutation_type'] != 'Nonsense_mutation'].copy()
YPD_rep_cols = [f'Fitness_from_YPD_replicate_{i}' for i in range(1,5)]
df_YPD['YPD_fitness'] = df_YPD[YPD_rep_cols].mean(axis=1)
# Row-wise one-sample t-test of the replicate fitness values against 1.
_,P = stats.ttest_1samp(df_YPD[YPD_rep_cols],1,axis=1)
df_YPD['YPD_P'] = P

DFE_dict_6gene = \
    df_YPD.loc[df_YPD['Mutation_type'] == 'Nonsynonymous_mutation',['YPD_fitness','YPD_P']]

Yeast11gene_df = pd.read_csv('./Xukang/Xukang_new_data.csv')
new_rep_cols = ['Fitness1', 'Fitness2', 'Fitness3', 'Fitness4']
Yeast11gene_df['fitness'] = Yeast11gene_df.loc[:, new_rep_cols].mean(axis=1)
_,P = stats.ttest_1samp(Yeast11gene_df[new_rep_cols],1,axis=1)
Yeast11gene_df['P'] = P

tmp_df = \
    Yeast11gene_df.loc[(Yeast11gene_df['Mutation_type'] == 'Nonsynonymous_mutation'),['fitness','P']]
DFE_dict_11gene = tmp_df.rename(columns={'fitness':f'{env}_fitness','P':f'{env}_P'})

# Despite the DFE_dict_* names these three objects are DataFrames, not dicts.
DFE_dict_17gene = pd.concat([DFE_dict_6gene,DFE_dict_11gene],ignore_index=True)

DFE_dict_tmp[env] = DFE_dict_17gene
DFE_dict['Yeast17genes'] = DFE_dict_tmp

In [76]:
# Rows of the 17-gene table with YPD_P < 0.05 and YPD_fitness > 1.
idx = (DFE_dict_17gene['YPD_fitness'] > 1) & (DFE_dict_17gene['YPD_P'] < 0.05)

# Fraction of such rows; not the last expression of the cell, so it is computed
# but not displayed.
idx.sum() / len(DFE_dict_17gene)

# Mean fitness of those rows (this is the value the cell displays).
DFE_dict_17gene.loc[idx, 'YPD_fitness'].mean()

1.0055086226625

In [77]:
# Number of rows (nonsynonymous mutations kept by the loading cell) in the 17-gene YPD table.
len(DFE_dict_17gene)

5386

In [65]:
def downsample_ben(df_rep,env,Ne,n_sample=10000,ben_scale=0.1,random_state=None):
    """Resample a DFE table so that beneficial mutations are down-weighted.

    Two bootstrap samples (drawn with replacement) of ``n_sample`` rows are built:

    * ``df_combined``: beneficial rows (``{env}_fitness > 1``) make up
      ``int(n_sample * F_ben * ben_scale)`` rows, where ``F_ben`` is their
      fraction in ``df_rep``; the remaining rows are drawn from rows with
      ``{env}_fitness <= 1``.
    * ``df_combined_sig``: same construction on a copy of ``df_rep`` in which
      every row with ``{env}_P > 0.05`` has its fitness set to 1, counting only
      rows that are both beneficial and significant (``{env}_P <= 0.05``).

    Parameters
    ----------
    df_rep : pandas.DataFrame
        Must contain the columns ``'{env}_fitness'`` and ``'{env}_P'``.
        It is not modified.
    env : str
        Prefix selecting the fitness / P-value columns.
    Ne : number
        Unused; kept so existing calls ``downsample_ben(df, env, Ne)`` still work.
    n_sample : int, default 10000
        Number of rows in each returned frame.
    ben_scale : float, default 0.1
        Factor applied to the observed beneficial fraction.
    random_state : None, int, or numpy.random.Generator, default None
        ``None`` keeps the previous behaviour (pandas draws from the global
        NumPy random state). Any other value seeds one generator that is shared
        by the four draws, making the result reproducible.

    Returns
    -------
    (df_combined, df_combined_sig) : tuple of pandas.DataFrame
        Non-beneficial rows first, then beneficial rows; original index labels
        are kept (and therefore repeat).

    Raises
    ------
    ValueError
        If no row has ``{env}_fitness > -1`` (the fractions are undefined), or,
        from pandas, if rows must be drawn from an empty subset.
    """
    fitness = df_rep[f'{env}_fitness']
    idx_ben = fitness > 1
    idx_sig = df_rep[f'{env}_P'] <= 0.05

    # Denominator of both fractions: rows with fitness > -1 (NaN fitness is excluded).
    n_valid = (fitness > -1).sum()
    if n_valid == 0:
        raise ValueError(f"no rows with {env}_fitness > -1; cannot compute beneficial fractions")
    F_ben = idx_ben.sum()/n_valid
    F_ben_sig = (idx_ben & idx_sig).sum()/n_valid

    # One shared generator so the four draws are reproducible but not identical.
    rng = None if random_state is None else np.random.default_rng(random_state)

    N_ben = int(n_sample*F_ben*ben_scale)
    N_nonben = n_sample - N_ben
    df_ben = df_rep[idx_ben].sample(N_ben,replace=True,random_state=rng)
    df_nonben = df_rep[fitness<=1].sample(N_nonben,replace=True,random_state=rng)
    df_combined = pd.concat([df_nonben,df_ben])

    # Significance-filtered variant: non-significant effects are treated as neutral.
    df_tmp = df_rep.copy()
    idx = df_tmp[f'{env}_P'] > 0.05
    df_tmp.loc[idx,f'{env}_fitness'] = 1

    N_ben_sig = int(n_sample*F_ben_sig*ben_scale)
    N_nonben_sig = n_sample - N_ben_sig
    df_ben_sig = df_tmp[idx_ben & idx_sig].sample(N_ben_sig,replace=True,random_state=rng)
    df_nonben_sig = df_tmp[df_tmp[f'{env}_fitness']<=1].sample(N_nonben_sig,replace=True,random_state=rng)
    df_combined_sig = pd.concat([df_nonben_sig,df_ben_sig])

    return df_combined, df_combined_sig

In [58]:
def get_fix_prob(mean_list,Ne):
    """Return a fixation probability for each fitness value.

    With ``s = mean_list - 1`` the value is
    ``(1 - exp(-2*s)) / (1 - exp(-4*Ne*s))``; entries with
    ``|s| < 1/(2*Ne)`` or ``s == 0`` are set to ``1/(2*Ne)`` instead.

    Parameters
    ----------
    mean_list : array-like supporting arithmetic and boolean-mask assignment
        Fitness values (e.g. a pandas Series or NumPy array). Not modified.
    Ne : number
        Population-size parameter of the formula.

    Returns
    -------
    Same container type as ``mean_list - 1``, holding the probabilities.
    """
    neutral_prob = 1/(2*Ne)
    s = mean_list-1
    numerator = 1-np.exp(-2*s)
    denominator = 1-np.exp(-4*Ne*s)
    # s == 0 yields 0/0 here; it is overwritten with the neutral value just below.
    fix_prob = numerator/denominator
    effectively_neutral = (np.abs(s) < neutral_prob) | (s == 0)
    fix_prob[effectively_neutral] = neutral_prob
    return fix_prob

def cal_lambda(mean_list,Ne):
    """Return the mean of ``get_fix_prob(mean_list, Ne)`` multiplied by ``2*Ne``.

    Because ``get_fix_prob`` assigns ``1/(2*Ne)`` to effectively neutral
    entries, a fully neutral input gives 1.
    """
    mean_fix_prob = get_fix_prob(mean_list,Ne).mean()
    return mean_fix_prob*2*Ne

def cal_alpha(mean_list,Ne):
    """Return the share of total fixation probability carried by non-beneficial entries.

    An entry counts as beneficial when ``mean_list - 1 > 1/(2*Ne)``; every
    other entry contributes to the numerator. The denominator is the sum of
    ``get_fix_prob(mean_list, Ne)`` over all entries.
    """
    fix_prob = get_fix_prob(mean_list,Ne)
    is_beneficial = (mean_list-1) > 1/(2*Ne)
    non_beneficial_total = fix_prob[~is_beneficial].sum()
    return non_beneficial_total/fix_prob.sum()

In [59]:
def L(s,a,u,Ne):
    """Evaluate ``u/s * Ne*log(Ne) * exp(-a*s) * 2*(s + 1/a)``.

    ``fixP`` and ``mean_fixP`` use ``exp(-L(...))`` as a damping factor.
    NOTE(review): presumably the clonal-interference term of the asexual model;
    confirm against the reference the formula was taken from.
    Undefined at ``s == 0`` (division by ``s``).
    """
    prefactor = u/s*Ne*np.log(Ne)
    return prefactor*np.exp(-a*s)*2*(s+1/a)

def fixP(s,a,u,Ne):
    """Return ``2*s * exp(-L(s, a, u, Ne))``.

    Not called by any active code in the visible notebook cells (only referenced
    from a commented-out line).
    """
    exponent = L(s,a,u,Ne)
    return 2*s*np.exp(-exponent)

def mean_fixP(a,u,Ne):
    """Integrate ``a * 2*x * exp(-L(x, a, u, Ne) - a*x)`` over ``x`` from 0 to infinity.

    Returns
    -------
    The raw result of ``scipy.integrate.quad``, whose first element is the
    integral estimate -- callers index ``[0]`` to get the value.
    """
    def integrand(x):
        return a*2*x*np.exp(-L(x,a,u,Ne)-a*x)

    return quad(integrand, 0, np.inf)

In [60]:
def cal_asexual(df_rep,env,Ne,u=1.2e-4):
    """Compute (Lambda, Alpha) with a separate fixation probability for beneficial mutations.

    With ``s = df_rep['{env}_fitness'] - 1``:

    * entries with ``s > 1/(2*Ne)`` all receive the same probability
      ``mean_fixP(a, u, Ne)[0]``, where ``a = 1 / mean(s over those entries)``;
    * every other entry receives ``(1 - exp(-2*s)) / (1 - exp(-4*Ne*s))``,
      with ``s == 0`` mapped to ``1/(2*Ne)``, the limit of that expression.

    Parameters
    ----------
    df_rep : pandas.DataFrame
        Must contain the column ``'{env}_fitness'``. Not modified.
    env : str
        Prefix selecting the fitness column.
    Ne : number
        Population-size parameter.
    u : float, default 1.2e-4
        Passed through to ``mean_fixP``.

    Returns
    -------
    (Lambda, Alpha)
        ``Lambda`` is the mean probability times ``2*Ne``; ``Alpha`` is the sum
        of probabilities over entries with ``s < 1/(2*Ne)`` divided by the
        total sum.
    """
    s = df_rep[f'{env}_fitness']-1
    neutral_prob = 1/(2*Ne)
    beneficial = s > neutral_prob

    # exp(-4*Ne*s) overflows to inf for strongly deleterious s (giving 0) and
    # s == 0 gives 0/0; both are expected, so silence the floating-point warnings.
    with np.errstate(over='ignore', divide='ignore', invalid='ignore'):
        P_fix = (1-np.exp(-2*s))/(1-np.exp(-4*Ne*s))
    # Without this, exactly neutral rows stay NaN and are silently skipped by
    # the pandas mean()/sum() below, biasing both Lambda and Alpha.
    P_fix[s == 0] = neutral_prob

    # Only integrate when there is at least one beneficial mutation; otherwise
    # `a` would be NaN and there would be nothing to assign.
    if beneficial.any():
        a = 1/s[beneficial].mean()
        P_fix[beneficial] = mean_fixP(a,u,Ne)[0]

    Lambda = P_fix.mean()*(2*Ne)
    Alpha = P_fix[s<neutral_prob].sum()/P_fix.sum()
    return Lambda,Alpha

In [43]:
# NOTE(review): df_tmp is not defined in this cell. In the visible notebook the only
# top-level assignment to it is inside the Ne/gene/env loop cell, so this cell shows
# whatever table that loop handled last (and needs 'YPD_*' columns) -- hidden state.
# The first expression (fraction of rows with YPD_P < 0.05 and YPD_fitness > 1) is
# computed but not displayed; only the row selection on the last line is shown.
((df_tmp['YPD_P'] < 0.05) & (df_tmp['YPD_fitness'] > 1)).sum()/len(df_tmp)
df_tmp[((df_tmp['YPD_P'] < 0.05) & (df_tmp['YPD_fitness'] > 1))]

Unnamed: 0,YPD_fitness,YPD_P
2730,1.01615,0.002244


In [66]:
# Compute Lambda and Alpha for every (Ne, dataset, environment) combination.
# Result layout: lambda_Ne_dict[Ne_key][gene][env] and alpha_Ne_dict[Ne_key][gene][env]
# are six-element lists
#   [full data, significant-only, asexual,
#    mean over down-samples, mean over down-samples (significant-only),
#    mean over down-samples (asexual)]
# where Ne_key is f'{Ne:.0e}' (e.g. '1e+04').
# NOTE(review): downsample_ben draws random samples and no seed is set in this
# cell, so the three down-sampled entries differ between runs.
alpha_Ne_dict = {}
lambda_Ne_dict = {}
for Ne in [1e4,1e6,1e7,1e8]:
    print(Ne)
    alpha_Ne_dict[f'{Ne:.0e}'] = {}
    lambda_Ne_dict[f'{Ne:.0e}'] = {}
    for gene in ['Yeast21genes','HSP90','Ubiquitin','DHFR', 'Yeast17genes']:#
        alpha_Ne_dict[f'{Ne:.0e}'][gene] = {}
        lambda_Ne_dict[f'{Ne:.0e}'][gene] = {}
        print(gene)
        df_env_dict = DFE_dict[gene]
        for env,df_rep in df_env_dict.items():
            # Estimates on the full table.
            Lambda = cal_lambda(df_rep[f'{env}_fitness'], Ne)
            Alpha = cal_alpha(df_rep[f'{env}_fitness'], Ne)
            Lambda_asexual,Alpha_asexual = cal_asexual(df_rep,env,Ne,u=1.2e-4)

            # Significant-only variant: rows with P > 0.05 get fitness 1 on a copy.
            # df_tmp and idx stay in the kernel namespace after the loop finishes.
            df_tmp = df_rep.copy()
            idx = df_tmp[f'{env}_P'] > 0.05
            df_tmp.loc[idx,f'{env}_fitness'] = 1
            Lambda_sig = cal_lambda(df_tmp[f'{env}_fitness'], Ne)
            Alpha_sig = cal_alpha(df_tmp[f'{env}_fitness'], Ne)
            
            # Accumulators for the 100 down-sampling replicates.
            Lambda_ds_list = []
            Alpha_ds_list = []
            Lambda_ds_sig_list = []
            Alpha_ds_sig_list = []
            Lambda_ds_asexual_list = []
            Alpha_ds_asexual_list = []
            for i in range(100):
                df_ds_ben, df_ds_ben_sig = downsample_ben(df_rep,env,Ne)

                Lambda_ds = cal_lambda(df_ds_ben[f'{env}_fitness'], Ne)
                Alpha_ds = cal_alpha(df_ds_ben[f'{env}_fitness'], Ne)
                # NOTE(review): u=1.2e-5 here versus u=1.2e-4 on the full table --
                # presumably mirroring the 0.1 down-sampling of beneficial mutations; confirm.
                Lambda_ds_asexual,Alpha_ds_asexual = cal_asexual(df_ds_ben,env,Ne,u=1.2e-5)

                Lambda_ds_sig = cal_lambda(df_ds_ben_sig[f'{env}_fitness'], Ne)
                Alpha_ds_sig = cal_alpha(df_ds_ben_sig[f'{env}_fitness'], Ne)
                
                Lambda_ds_list.append(Lambda_ds)
                Alpha_ds_list.append(Alpha_ds)
                Lambda_ds_sig_list.append(Lambda_ds_sig)
                Alpha_ds_sig_list.append(Alpha_ds_sig)
                Lambda_ds_asexual_list.append(Lambda_ds_asexual)
                Alpha_ds_asexual_list.append(Alpha_ds_asexual)
                        
            # Store the three direct estimates followed by the three replicate means.
            lambda_Ne_dict[f'{Ne:.0e}'][gene][env] = \
                [Lambda, Lambda_sig, Lambda_asexual,
                 np.mean(Lambda_ds_list), np.mean(Lambda_ds_sig_list), np.mean(Lambda_ds_asexual_list)]
            alpha_Ne_dict[f'{Ne:.0e}'][gene][env] = \
                [Alpha, Alpha_sig, Alpha_asexual,
                 np.mean(Alpha_ds_list), np.mean(Alpha_ds_sig_list), np.mean(Alpha_ds_asexual_list)]


10000.0
Yeast21genes
HSP90
Ubiquitin
DHFR
Yeast17genes
1000000.0
Yeast21genes
HSP90
Ubiquitin
DHFR
Yeast17genes
10000000.0
Yeast21genes
HSP90
Ubiquitin
DHFR
Yeast17genes
100000000.0
Yeast21genes
HSP90
Ubiquitin
DHFR
Yeast17genes


In [30]:
# Print, per Ne / dataset / environment, one tab-separated row with elements 4 and 5
# of the stored lambda list followed by elements 4 and 5 of the stored alpha list.
# Only the datasets listed below are printed.
for Ne in ['1e+06','1e+07','1e+08']:
    print(Ne)
    lambda_gene_dict = lambda_Ne_dict[Ne]
    alpha_gene_dict = alpha_Ne_dict[Ne]
    for gene in ['Yeast21genes','HSP90','Ubiquitin','DHFR']:
        print(gene)
        lambda_env_dict = lambda_gene_dict[gene]
        alpha_env_dict = alpha_gene_dict[gene]
        for env in lambda_env_dict:
            lambda_list = lambda_env_dict[env]
            alpha_list = alpha_env_dict[env]
            row = (lambda_list[4], lambda_list[5], alpha_list[4], alpha_list[5])
            print(*row, sep='\t')

1e+04
Yeast21genes
1.2216321945722954	0.7240896126897672	0.19334864138031707	0.00132791792686488
1.19190346785737	1.3846654654086352	0.38248959247459646	0.0009976373805259215
1.9477718899110588	1.3414384878894925	0.22226603602472145	0.0010850900601160616
2.788000382800978	2.0486099962773294	0.13096976137174848	0.0002643011001551194
HSP90
1.5771178212746149	3.7260029647059967	0.5271711602972211	0.0003557525961202186
1.3076639163162493	6.982604294469933	0.5894915282970835	0.0001848214368044196
15.629546125766915	41.00316125749884	0.05854639603084429	1.52920978431084e-05
6.3257697678438936	21.91175874837404	0.14451653153144536	4.973808298391447e-05
4.37551489862243	12.70005901127572	0.20753322079778452	8.30078326782747e-05
4.282891951420176	11.616567137569886	0.21196934406722445	5.728661480756493e-05
15.708641295017303	16.561964108129022	0.024848222447865945	1.6823555410636917e-05
Ubiquitin
5.166397471191485	11.042089041217118	0.10933563445585168	2.3749979575623503e-05
DHFR
176.1066904678

In [67]:
# with open('lambda_Ne_dict_2024Oct.pkl','wb') as f:
#     pkl.dump(lambda_Ne_dict,f)
# with open('alpha_Ne_dict_2024Oct.pkl','wb') as f:
#     pkl.dump(alpha_Ne_dict,f)