In [20]:
import numpy as np
import numpy.random as nrand
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import itertools
from itertools import combinations
import copy
from matplotlib.ticker import FormatStrFormatter
from sklearn.linear_model import Ridge

In [21]:
# Constants of the rescaling applied to log-fitness by `tolog`:
#     (log(fitness) - Y_MIN) / Y_MAX
# Y_MIN is numerically ln(0.5); presumably 0.5 is the lowest fitness value
# present in the data -- TODO confirm.
Y_MIN = -0.6931471805599453
# NOTE(review): the derivation of Y_MAX is not shown in this notebook;
# presumably it is the largest shifted log-fitness -- confirm.
Y_MAX = 0.942657031435126

In [22]:
def load_sequence(sequences):
    """One-hot encode an array of single-character bases.

    Parameters
    ----------
    sequences : numpy.ndarray
        Array indexed as (genotype, position) whose entries are compared
        against the letters 'A', 'T', 'C', 'G'.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape (n_genotypes, n_positions * 4). Each
        consecutive group of four columns belongs to one position and flags,
        in the order A, T, C, G, which letter is present. An entry matching
        none of the four letters yields four False values.
    """
    alphabet = np.asarray(['A', 'T', 'C', 'G'])
    n_genotypes, n_positions = sequences.shape[0], sequences.shape[1]
    # A trailing axis lets every entry broadcast against the whole alphabet.
    is_base = sequences[..., None] == alphabet
    return is_base.reshape(n_genotypes, n_positions * alphabet.size)

def tolog(y,Y_MIN,Y_MAX):
    """Return the natural log of ``y``, shifted by ``Y_MIN`` and scaled by ``Y_MAX``.

    Computes ``(log(y) - Y_MIN) / Y_MAX`` element-wise, so an input whose log
    equals ``Y_MIN`` maps to 0.
    """
    log_fitness = np.log(y)
    return (log_fitness - Y_MIN) / Y_MAX

In [23]:
# Load the full dataset from a local pickle file. Later cells index it by the
# columns 'Seq', 'Fit' and 'FitS1'..'FitS6' (presumably a pandas DataFrame).
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open("All_data_df.pkl",'rb') as f:
    data = pickle.load(f)

In [24]:
# Keep only genotypes for which fewer than 5 of the 6 replicate fitness
# values equal 0.5, i.e. at least 2 replicates differ from 0.5
# (presumably 0.5 is the lowest reportable fitness -- TODO confirm).
data_filtered = data[
    (data[['FitS1','FitS2','FitS3','FitS4','FitS5','FitS6']] == 0.5).sum(axis=1) < 5
]

In [25]:
# Split every sequence string into a row of single characters.
sequences = np.array([list(seq) for seq in data_filtered['Seq']])
# One-hot encode every position over the four bases A, T, C, G.
x = load_sequence(sequences)
# Rescaled log-fitness taken from the 'Fit' column.
y = tolog(np.asarray(data_filtered['Fit']),Y_MIN,Y_MAX)

In [38]:
# Change this parameter to select a different ruggedness measure.
# Names handled by the cells below: 'N_max', 'epi', 'r_s', 'open_ratio',
# 'gamma', 'adptwalk_steps', 'adptwalk_probs'.
metric = 'r_s' # 'N_max','epi','r_s','open_ratio','gamma','adptwalk_steps','adptwalk_probs'


In [39]:
# Load only the precomputed index files that the selected metric needs
# ('r_s' needs none of them).
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.

# neighbor_list[i] is used below to index the fitness array with the
# neighbours of genotype i.
if metric in ['N_max','gamma','adptwalk_steps','adptwalk_probs']:
    with open('../../index_file/trna_neighbor_list.pkl','rb') as f:
        neighbor_list = pickle.load(f)

# res_set is used below as rows of four genotype indices (columns 0..3).
if metric in ['epi','gamma']:
    with open('../../index_file/trna_epi_square_list.pkl','rb') as f:
        res_set = pickle.load(f)
    
# pathway_list is used below as a 2-D array of genotype indices, one path per row.
if metric == 'open_ratio':
    with open('../../index_file/trna_pathway_list_4steps_300000.pkl','rb') as f:
        pathway_list = pickle.load(f)

In [40]:
def get_N_max(y, neighbors=None):
    """Count the local fitness maxima of a landscape.

    A genotype counts as a maximum when no neighbour has a fitness greater
    than or equal to its own (ties disqualify it). A genotype with an empty
    neighbour list therefore also counts.

    Parameters
    ----------
    y : numpy.ndarray
        1-D fitness array, one entry per genotype.
    neighbors : sequence of index sequences, optional
        ``neighbors[i]`` holds the indices into ``y`` of the neighbours of
        genotype ``i``. Defaults to the notebook-level ``neighbor_list``.

    Returns
    -------
    int
        Number of local maxima.
    """
    if neighbors is None:
        # Fall back to the index loaded at notebook level for this metric.
        neighbors = neighbor_list
    return sum(
        1 for i in range(len(y)) if not np.any(y[i] <= y[neighbors[i]])
    )

def cal_epi(y, squares=None):
    """Fraction of index quadruples whose outer pair beats, or loses to, the inner pair.

    For each row ``(i0, i1, i2, i3)`` the fitness values at columns 0 and 3
    are compared with those at columns 1 and 2. The row is counted when both
    of the former are strictly greater than both of the latter, or both are
    strictly smaller. Presumably columns 0/3 are the wild type and double
    mutant and columns 1/2 the two single mutants, which would make this the
    fraction of reciprocal sign epistasis -- TODO confirm against the index file.

    Parameters
    ----------
    y : numpy.ndarray
        1-D fitness array, one entry per genotype.
    squares : array-like of shape (n, 4), optional
        Genotype indices into ``y``. Defaults to the notebook-level ``res_set``.

    Returns
    -------
    float
        Counted rows divided by the total number of rows.
    """
    if squares is None:
        # Fall back to the index loaded at notebook level for this metric.
        squares = res_set
    square_fit = y[squares]
    outer = square_fit[:, [0, 0, 3, 3]]
    inner = square_fit[:, [1, 2, 1, 2]]
    # All four pairwise comparisons must agree in direction.
    n_counted = np.sum(np.all(outer > inner, axis=1))
    n_counted += np.sum(np.all(outer < inner, axis=1))
    return n_counted / len(square_fit)

def cal_r_s(y):
    """Roughness-to-slope ratio of a fitness vector under a ridge-regression fit.

    Roughness is the root-mean-square residual of a ``Ridge`` model (with
    intercept) fitted on the global design matrix ``x``. Slope is the mean
    absolute value of the fitted coefficients.

    Parameters
    ----------
    y : numpy.ndarray
        1-D fitness array aligned row by row with the global ``x``.

    Returns
    -------
    float
        ``roughness / slope``.
    """
    # x is a global variable: one one-hot encoded row per genotype.
    additive_model = Ridge(fit_intercept=True)
    additive_model.fit(x, y)
    residuals = y - additive_model.predict(x)
    rms_residual = np.sqrt(np.mean(np.square(residuals)))
    mean_abs_coef = np.mean(np.abs(additive_model.coef_))
    return rms_residual / mean_abs_coef

def cal_open_ratio(y, pathways=None):
    """Fraction of paths along which fitness is monotonic.

    A path is counted when fitness never increases from one step to the next
    (descending) or never decreases (ascending).

    Parameters
    ----------
    y : numpy.ndarray
        1-D fitness array, one entry per genotype.
    pathways : numpy.ndarray of shape (n_paths, path_length), optional
        Genotype indices into ``y``, one path per row. Defaults to the
        notebook-level ``pathway_list``.

    Returns
    -------
    float
        Counted paths divided by the number of paths.
    """
    if pathways is None:
        # Fall back to the index loaded at notebook level for this metric.
        pathways = pathway_list
    # Fitness of each genotype minus the fitness of its successor on the path.
    drops = y[pathways[:, :-1]] - y[pathways[:, 1:]]
    n_descending = np.sum(np.all(drops >= 0, axis=1))
    n_ascending = np.sum(np.all(drops <= 0, axis=1))
    # NOTE(review): a completely flat path satisfies both conditions and is
    # therefore counted twice -- confirm this is intended.
    return (n_descending + n_ascending) / len(pathways)

def cal_gamma(y, squares=None, neighbors=None):
    """Ratio of paired fitness-effect products to the variance of single-step effects.

    For every index quadruple ``(i0, i1, i2, i3)`` the products
    ``(y1 - y0) * (y3 - y2)`` and ``(y2 - y0) * (y3 - y1)`` are summed and
    divided by twice the number of quadruples. The result is divided by the
    variance of ``y[j] - y[i]`` taken over every genotype ``i`` and each of
    its neighbours ``j``.

    Parameters
    ----------
    y : numpy.ndarray
        1-D fitness array, one entry per genotype.
    squares : array-like of shape (n, 4), optional
        Genotype indices into ``y``. Defaults to the notebook-level ``res_set``.
    neighbors : sequence of index sequences, optional
        ``neighbors[i]`` holds the indices of the neighbours of genotype
        ``i``. Defaults to the notebook-level ``neighbor_list``.

    Returns
    -------
    float
        The ratio described above.
    """
    if squares is None:
        squares = res_set
    if neighbors is None:
        neighbors = neighbor_list

    # Index the fitness array once instead of once per column access.
    square_fit = y[squares]
    f0, f1, f2, f3 = (square_fit[:, k] for k in range(4))
    cov = np.sum((f1 - f0) * (f3 - f2)) + np.sum((f2 - f0) * (f3 - f1))
    cov = cov / (2 * square_fit.shape[0])

    # Iterate over the genotypes of `y` itself, like the sibling metrics,
    # rather than over the unrelated global `sequences`.
    effects = []
    for i in range(len(y)):
        effects.extend(y[neighbors[i]] - y[i])
    var = np.var(effects)
    return cov / var

def _adaptive_walk(y, start, neighbors):
    """Follow a greedy walk from ``start``; return ``(end_index, n_steps)``.

    At every step the walk moves to the fittest neighbour of the current
    genotype. It stops when the current genotype has no neighbours or when
    its fittest neighbour is not strictly fitter than itself.
    """
    current = start
    n_steps = 0
    while True:
        nbrs = neighbors[current]
        if len(nbrs) == 0:
            break
        nbr_fitness = y[nbrs]
        # Locate the best neighbour once and reuse it for value and index.
        best = np.argmax(nbr_fitness)
        if nbr_fitness[best] <= y[current]:
            break
        current = nbrs[best]
        n_steps += 1
    return current, n_steps


def cal_adptwalk_steps(y, neighbors=None):
    """Mean length of the greedy adaptive walks started from every genotype.

    Walks that never move (zero steps) are excluded from the mean.

    Parameters
    ----------
    y : numpy.ndarray
        1-D fitness array, one entry per genotype.
    neighbors : sequence of index sequences, optional
        ``neighbors[i]`` holds the indices of the neighbours of genotype
        ``i``. Defaults to the notebook-level ``neighbor_list``.

    Returns
    -------
    float
        Mean number of steps, or NaN when no walk makes a step.
    """
    if neighbors is None:
        neighbors = neighbor_list
    step_counts = []
    for start in range(len(y)):
        _, n_steps = _adaptive_walk(y, start, neighbors)
        if n_steps > 0:
            step_counts.append(n_steps)
    if not step_counts:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return np.nan
    return np.mean(step_counts)


def cal_adptwalk_probs(y, neighbors=None):
    """Fraction of moving greedy adaptive walks that end on the fittest genotype.

    Walks that never move (zero steps) are excluded from both numerator and
    denominator. A walk counts as reaching the optimum whenever its final
    genotype is ``argmax(y)``, regardless of how the walk terminated.

    Parameters
    ----------
    y : numpy.ndarray
        1-D fitness array, one entry per genotype.
    neighbors : sequence of index sequences, optional
        ``neighbors[i]`` holds the indices of the neighbours of genotype
        ``i``. Defaults to the notebook-level ``neighbor_list``.

    Returns
    -------
    float
        Reaching walks divided by moving walks, or NaN when no walk makes a
        step (instead of raising ZeroDivisionError).
    """
    if neighbors is None:
        neighbors = neighbor_list
    global_optimum = np.argmax(y)
    n_reach = 0
    n_total = 0
    for start in range(len(y)):
        end, n_steps = _adaptive_walk(y, start, neighbors)
        if n_steps == 0:
            continue
        n_total += 1
        if end == global_optimum:
            n_reach += 1
    if n_total == 0:
        return np.nan
    return n_reach / n_total

# Map every supported metric name to the function that computes it.
RUGGEDNESS_FUNCTIONS = {
    'N_max': get_N_max,
    'epi': cal_epi,
    'r_s': cal_r_s,
    'open_ratio': cal_open_ratio,
    'gamma': cal_gamma,
    'adptwalk_steps': cal_adptwalk_steps,
    'adptwalk_probs': cal_adptwalk_probs,
}

# Fail fast on an unknown name: otherwise `get_ruggedness` would stay
# undefined, or keep the function selected by an earlier run of this cell.
if metric not in RUGGEDNESS_FUNCTIONS:
    raise ValueError(
        f"Unknown metric {metric!r}; choose one of {sorted(RUGGEDNESS_FUNCTIONS)}"
    )
get_ruggedness = RUGGEDNESS_FUNCTIONS[metric]

In [41]:
# Replicate fitness columns that are pooled in every possible combination.
duplicates_list = ['FitS1','FitS2','FitS3','FitS4','FitS5','FitS6']
n_duplicates = len(duplicates_list)
# res_dict[k] collects the ruggedness of every landscape built from k replicates.
res_dict = {i:[] for i in range(1, n_duplicates + 1)}
for replication in range(1, n_duplicates + 1):
    print(replication,end='\r')
    # combinations(..., 1) yields 1-tuples and summing a single column returns
    # that column, so the single-replicate case needs no separate branch.
    for duplicate in combinations(duplicates_list, replication):
        # NOTE(review): the selected replicates are summed, not averaged,
        # before the log transform -- confirm this is intended.
        pooled = np.sum(np.asarray(data_filtered[list(duplicate)]),axis=1)
        y = tolog(pooled,Y_MIN,Y_MAX)
        res_dict[replication].append(get_ruggedness(y))
        

6

In [42]:
res_dict

{1: [3.6076349719826695,
  4.191800096702262,
  5.410619674104377,
  3.319183308711098,
  3.220879997662329,
  3.3480987156529283],
 2: [3.6715146647406716,
  4.0874707120474785,
  3.290055105935143,
  3.2304506777269952,
  3.2981215172196268,
  4.380631864694454,
  3.5086288065296856,
  3.4312855005074026,
  3.5169721573473796,
  3.8930959246911043,
  3.783737666384117,
  3.890518535476146,
  3.1260880122046473,
  3.186011119399684,
  3.132316872051421],
 3: [3.919322704098412,
  3.4107399948395964,
  3.3622455739419377,
  3.413438284993295,
  3.635683711950389,
  3.572520760218308,
  3.6353941588450924,
  3.153936813369648,
  3.19562236125844,
  3.1540607312961533,
  3.789240109932419,
  3.7176500981176033,
  3.7899093278200513,
  3.2713199760974687,
  3.319483655015171,
  3.272552688743619,
  3.471259928552284,
  3.5307627857266333,
  3.470337398631295,
  3.092132416012372],
 4: [3.629403057219945,
  3.581807419742048,
  3.627491745720768,
  3.2609074112843475,
  3.2969807171487555,

In [43]:
# Uncomment only if you want to overwrite the saved results file in the trna_raw_data folder.
# with open(f'./trna_Li_{metric}_plot.pkl','wb') as f:
#     pickle.dump(res_dict,f)