In [None]:
import numpy as np
import numpy.random as nrand
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import itertools
from itertools import combinations
import copy
from matplotlib.ticker import FormatStrFormatter
from sklearn.linear_model import Ridge

In [None]:
# Normalisation constants passed to tolog(): the log-fitness is shifted by Y_MIN
# and the result is divided by Y_MAX.
# Y_MIN is numerically ln(0.5).
# NOTE(review): presumably the log of the 0.5 fitness value used in the replicate filter — confirm.
Y_MIN = -0.6931471805599453
# NOTE(review): the origin of Y_MAX is not shown in this notebook; presumably the
# largest shifted log-fitness of the full data set — confirm.
Y_MAX = 0.942657031435126

In [None]:
def load_sequence(sequences):
    """One-hot encode a 2-D array of single-character bases.

    Parameters
    ----------
    sequences : np.ndarray
        Array of shape (n_sequences, n_sites) holding one character per entry.

    Returns
    -------
    np.ndarray
        Boolean array of shape (n_sequences, n_sites * 3). Each site contributes
        three consecutive columns that flag 'A', 'C' and 'G' in that order; any
        other character yields three False values.
    """
    alphabet = np.asarray(['A','C','G'])
    n_rows, n_sites = sequences.shape[0], sequences.shape[1]
    # The added trailing axis broadcasts every site against all bases at once.
    indicator = sequences[..., np.newaxis] == alphabet
    return indicator.reshape(n_rows, n_sites * alphabet.size)

def tolog(y,Y_MIN,Y_MAX):
    """Log-transform fitness values and rescale them.

    Computes ``(log(y) - Y_MIN) / Y_MAX`` element-wise. The input is not
    modified; a new array is returned.

    Parameters
    ----------
    y : array-like
        Fitness values (natural logarithm is taken, so they should be positive).
    Y_MIN : float
        Offset subtracted from the log values.
    Y_MAX : float
        Divisor applied after the shift.
    """
    shifted = np.log(y) - Y_MIN
    return shifted / Y_MAX

In [None]:
# Load the pickled data table. Later cells read its 'Seq', 'Fit' and
# 'FitS1'..'FitS6' columns.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open("All_data_df.pkl",'rb') as f:
    data = pickle.load(f)

In [None]:
# Keep genotypes for which fewer than 5 of the 6 replicate fitness values are
# exactly 0.5, i.e. at least 2 replicate measurements differ from 0.5.
# NOTE(review): presumably 0.5 is the lowest value a fitness measurement can take,
# so "differs from 0.5" means "higher than 0.5" — confirm.
data_filtered = data[np.sum(data[['FitS1','FitS2','FitS3','FitS4','FitS5','FitS6']] == 0.5,axis=1) < 5]

In [None]:
# Split every 'Seq' entry into its individual characters: one row per genotype,
# one column per site (assumes all sequences have the same length — TODO confirm).
sequences = np.array(list(map(list,data_filtered['Seq'])))
# x is read as a global by cal_r_s.
x = load_sequence(sequences)  # one-hot encoding for A,C,G.
# Normalised log-fitness of the 'Fit' column. The name y is reassigned by the
# replicate-combination loop further down.
y = tolog(np.asarray(data_filtered['Fit']),Y_MIN,Y_MAX)

In [None]:
# Ruggedness measure to compute; change this value to select a different one.
# It decides which index file is loaded and which function is bound to
# get_ruggedness in the cells below.
metric = 'N_max' # 'N_max','epi','r_s','open_ratio'

In [None]:
# Load only the precomputed index file that the selected metric needs.
# NOTE: pickle.load can execute arbitrary code; only open index files from a trusted source.
if metric == 'N_max':
    # neighbor_list[i] is used by get_N_max to select the fitness values that
    # genotype i is compared against.
    with open('../index_file/trna_neighbor_list.pkl','rb') as f:
        neighbor_list = pickle.load(f)

elif metric == 'epi':
    # res_set is used by cal_epi as an index array into the fitness vector;
    # columns 0-3 of each row select four fitness values.
    with open('../index_file/trna_epi_square_list.pkl','rb') as f:
        res_set = pickle.load(f)
    
elif metric == 'open_ratio':
    # pathway_list is used by cal_open_ratio as a 2-D index array into the
    # fitness vector, one pathway per row.
    with open('../index_file/trna_pathway_list_4steps_300000.pkl','rb') as f:
        pathway_list = pickle.load(f)

# 'r_s' needs no index file: cal_r_s only uses the one-hot matrix x.

In [None]:
def get_N_max(y):
    """Count the entries of ``y`` that are strictly greater than all their neighbours.

    Entry ``i`` is counted when no value in ``y[neighbor_list[i]]`` is greater
    than or equal to ``y[i]``; a tie with any neighbour therefore disqualifies it.
    ``neighbor_list`` is a global loaded from the neighbour index file.

    Parameters
    ----------
    y : np.ndarray
        1-D array of fitness values.

    Returns
    -------
    int
        Number of strict local maxima.
    """
    return sum(
        1
        for idx in range(len(y))
        if not np.any(y[idx] <= y[neighbor_list[idx]])
    )

def cal_epi(y):
    """Return the fraction of rows of ``res_set`` showing a strict two-sided ordering.

    ``res_set`` is a global index array; each row selects four values of ``y``.
    A row is counted when the values in columns 0 and 3 are both strictly
    greater than the values in columns 1 and 2, or both strictly smaller.
    NOTE(review): presumably columns 0/3 are the two opposite corners of a
    two-mutation square and columns 1/2 the intermediate genotypes — confirm
    against the index file.

    Parameters
    ----------
    y : np.ndarray
        1-D array of fitness values.

    Returns
    -------
    float
        Number of counted rows divided by the number of rows in ``res_set``.
    """
    square_fits = y[res_set]
    lhs = square_fits[:, [0, 0, 3, 3]]
    rhs = square_fits[:, [1, 2, 1, 2]]
    # All four pairwise comparisons must agree for a row to be counted.
    all_greater = np.all(lhs > rhs, axis=1)
    all_smaller = np.all(lhs < rhs, axis=1)
    return (np.sum(all_greater) + np.sum(all_smaller)) / len(square_fits)

def cal_r_s(y):
    """Return the roughness-to-slope ratio of ``y`` under a ridge-regression fit.

    A ``Ridge`` model with intercept is fitted on the global ``x`` (each row
    being the one-hot encoded sequence of the corresponding genotype) to predict
    ``y``. Roughness is the root-mean-square residual of that fit and slope is
    the mean absolute regression coefficient.

    Parameters
    ----------
    y : np.ndarray
        1-D array of fitness values, aligned with the rows of ``x``.

    Returns
    -------
    float
        ``roughness / slope``.
    """
    model = Ridge(fit_intercept=True)
    model.fit(x, y)
    residuals = y - model.predict(x)
    rms_residual = np.sqrt(np.mean(np.square(residuals)))
    mean_abs_coef = np.mean(np.abs(model.coef_))
    return rms_residual / mean_abs_coef

def cal_open_ratio(y):
    """Return the fraction of pathways along which ``y`` changes monotonically.

    ``pathway_list`` is a global 2-D index array with one pathway per row. A
    pathway is counted once if ``y`` never increases from one step to the next
    and once if it never decreases.
    NOTE(review): both tests are non-strict, so a pathway whose steps are all
    equal is counted twice and the ratio can exceed 1 — confirm this is intended.

    Parameters
    ----------
    y : np.ndarray
        1-D array of fitness values.

    Returns
    -------
    float
        Number of counted pathways divided by the number of rows in ``pathway_list``.
    """
    # Fitness drop between consecutive positions of every pathway.
    step_drop = y[pathway_list[:, :-1]] - y[pathway_list[:, 1:]]
    never_increasing = np.all(step_drop >= 0, axis=1)
    never_decreasing = np.all(step_drop <= 0, axis=1)
    n_open = np.sum(never_increasing) + np.sum(never_decreasing)
    return n_open / len(pathway_list)

# Bind get_ruggedness to the function that implements the selected metric.
# An unrecognised `metric` raises immediately: otherwise get_ruggedness would
# stay undefined, or keep the function bound by an earlier run of this cell.
_RUGGEDNESS_FUNCTIONS = {
    'N_max': get_N_max,
    'epi': cal_epi,
    'r_s': cal_r_s,
    'open_ratio': cal_open_ratio,
}
if metric not in _RUGGEDNESS_FUNCTIONS:
    raise ValueError(
        f"Unknown metric {metric!r}; expected one of {sorted(_RUGGEDNESS_FUNCTIONS)}"
    )
get_ruggedness = _RUGGEDNESS_FUNCTIONS[metric]

In [None]:
# For every subset size k = 1..6, evaluate the selected ruggedness metric on each
# combination of k replicate columns. res_dict[k] collects one value per combination.
duplicates_list = ['FitS1','FitS2','FitS3','FitS4','FitS5','FitS6']
res_dict = {i:[] for i in range(1,7)}
for replication in range(1,7):
    print(replication,end='\r')
    # combinations(..., 1) yields 1-tuples; summing a single column over axis=1
    # returns that column unchanged, so k = 1 needs no separate branch.
    iter_list = combinations(duplicates_list,replication)
    for duplicate in iter_list:
        # Replicate fitness values are summed (not averaged) before the log
        # transform, which shifts the log values by the constant log(k).
        y = tolog(
            np.sum(np.asarray(data_filtered[list(duplicate)]),axis=1),
            Y_MIN,
            Y_MAX,
        )
        res_dict[replication].append(get_ruggedness(y))
        

In [None]:
# Uncomment only if you want to overwrite the saved result file for this metric in the trna_raw_data folder.
# with open(f'./trna_raw_data/trna_{metric}_plot.pkl','wb') as f:
#     pickle.dump(res_dict,f)