In [2]:
import numpy as np
import numpy.random as nrand
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import itertools
from itertools import combinations
import copy
from matplotlib.ticker import FormatStrFormatter
from sklearn.linear_model import Ridge
from scipy.linalg import hadamard

In [3]:
def load_sequence(sequences, bases=('A', 'T', 'C', 'G')):
    """
    Convert nucleotide sequences to a flattened one-hot encoding.

    Parameters
    ----------
    sequences : np.ndarray
        2D array of single characters, one row per sequence and one
        column per site.
    bases : str or sequence of str, optional
        Alphabet to encode, in output-column order. Defaults to
        ('A', 'T', 'C', 'G'). A character that is not in `bases`
        (for example 'U' under the default alphabet) is encoded as an
        all-False group for that site; pass e.g. "AUCG" to encode RNA.

    Returns
    -------
    np.ndarray of bool
        Array of shape (n_sequences, n_sites * len(bases)); the columns for
        a site are contiguous and follow the order of `bases`.
    """
    # list() lets a plain string such as "AUCG" be split into characters.
    alphabet = np.asarray(list(bases))
    shape = sequences.shape
    # Broadcasting against the alphabet adds a trailing axis of length len(bases).
    one_hot = sequences[..., None] == alphabet
    return one_hot.reshape(shape[0], shape[1] * alphabet.size)

In [4]:
# Read the raw measurement table from disk
df_all = pd.read_csv('SD_seq_arti_data.csv')

# Pull out the genotype strings and split each one into its characters,
# so that every site can be addressed as a column of a numpy array
seqs_raw = df_all['seq'].to_numpy()
seqs = np.array([list(genotype) for genotype in seqs_raw])

In [5]:
# One-hot encode the genotype characters (one row per genotype)
x = load_sequence(seqs)

# Drop every column that is identical to the first row for all genotypes,
# i.e. keep only the columns that actually vary
x = x[:, (x != x[0]).any(axis=0)]

In [6]:
# Alias (not a copy) of the full table; later cells read the fitness columns through this name.
fitness_df = df_all

In [7]:
# Select the genotypes lying between the wild type and the antipodal genotype:
# at every site the base must match one of the two reference sequences
N = 9
gt_0 = "UAAGGAGGU" # wild type
gt_1 = "AUUCUUCUC"
idx_sub = True

for i in range(N):
    matches_gt_0 = seqs[:, i] == gt_0[i]
    matches_gt_1 = seqs[:, i] == gt_1[i]
    idx_tmp = np.logical_or(matches_gt_0, matches_gt_1)
    idx_sub = np.logical_and(idx_sub, idx_tmp)
idx_sub.sum()

452

In [None]:
# Recode the selected genotypes as 0/1 vectors: 0 where the site matches the
# wild type (gt_0), 1 where it differs
seqs_sub = (seqs[idx_sub] != list(gt_0)).astype(int)

# Rows of the fitness table belonging to the same selected genotypes
fitness_df_sub = fitness_df.loc[idx_sub]

# Genotype string (e.g. '010010000') -> list of the values in positional
# columns 1..3 of that row, for fast lookup.
# NOTE(review): columns 1..3 are presumably the three replicate measurements
# (they are unpacked as P1/P2/P3 further down) - confirm against the CSV layout.
seq_fit_dict = {
    ''.join(bits.astype('str')): fitness_df_sub.iloc[row, 1:4].to_list()
    for row, bits in enumerate(seqs_sub)
}

In [2]:
# Scratch check of bin()'s output format: it returns a string with a '0b' prefix.
bin(2)

'0b10'

In [9]:
# Build the complete binary landscape used for the E (Bayesian-Walsh epistasis) calculation.
# Row k holds the N binary digits of k, zero-padded on the left, so the rows
# enumerate all 2**N genotypes in counting order.
landscape_list_b = np.array(
    [[int(bit) for bit in format(genotype_idx, f'0{N}b')] for genotype_idx in range(2**N)]
)

fitness_df_sub_ordered = {'seq': [], 'P1': [], 'P2': [], 'P3': []}

# Attach the replicate fitness measurements to every genotype, in landscape order
for seq in landscape_list_b:
    seq_str = ''.join(seq.astype('str'))
    # there are 60 missing genotype. Assign them 0 fitness.
    r1, r2, r3 = seq_fit_dict[seq_str] if seq_str in seq_fit_dict else (0, 0, 0)
    for column, value in zip(('seq', 'P1', 'P2', 'P3'), (seq_str, r1, r2, r3)):
        fitness_df_sub_ordered[column].append(value)

fitness_df_sub_ordered = pd.DataFrame(fitness_df_sub_ordered)

In [24]:
# Change this value to select which ruggedness measure is computed.
# Names handled by the dispatch code further down: 'N_max', 'epi', 'r_s',
# 'open_ratio', 'E', 'gamma', 'adptwalk_steps', 'adptwalk_probs'
metric = 'r_s'

In [25]:
# Load the precomputed index structures that the selected metric needs.
# NOTE: pickle.load executes arbitrary code from the file - only use trusted index files.

if metric in ('N_max', 'gamma', 'adptwalk_steps', 'adptwalk_probs'):
    with open('../../index_file/SD_seq_neighbor_list.pkl', 'rb') as handle:
        neighbor_list = pickle.load(handle)

if metric in ('epi', 'gamma'):
    with open('../../index_file/SD_seq_epi_square_list.pkl', 'rb') as handle:
        epi_square_list = pickle.load(handle)

elif metric == 'open_ratio':
    with open('../../index_file/SD_seq_pathway_list_4steps_25000000.pkl', 'rb') as handle:
        pathway_list = pickle.load(handle)
    _y = fitness_df['mean'].to_numpy()
    # Keep only the pathways that connect the two tails of the mean-fitness
    # distribution: start at or below the 20th percentile and end at or above
    # the 80th percentile, or the reverse
    y20, y80 = np.percentile(_y, [20, 80])
    start_is_low = _y[pathway_list[:, 0]] <= y20
    start_is_high = _y[pathway_list[:, 0]] >= y80
    end_is_low = _y[pathway_list[:, -1]] <= y20
    end_is_high = _y[pathway_list[:, -1]] >= y80
    filtered_ascend = start_is_low & end_is_high
    filtered_descend = start_is_high & end_is_low
    pathway_list = pathway_list[filtered_ascend | filtered_descend, :]

In [26]:
# Functions to calculate ruggedness measures

def get_N_max(y, neighbors=None):
    """
    Count the local fitness maxima of a landscape.

    A genotype counts as a local maximum when none of its neighbors has a
    fitness greater than or equal to its own (ties therefore disqualify it).
    A genotype with no neighbors counts as a maximum.

    Parameters
    ----------
    y : np.ndarray
        Fitness value of each genotype.
    neighbors : sequence of index sequences, optional
        `neighbors[i]` holds the indices (into `y`) of the neighbors of
        genotype `i`. Defaults to the module-level `neighbor_list`.

    Returns
    -------
    int
        Number of local maxima.
    """
    if neighbors is None:
        # Fall back to the notebook-level index loaded from disk.
        neighbors = neighbor_list
    n_peaks = 0
    for idx, fit in enumerate(y):
        if not np.any(fit <= y[neighbors[idx]]):
            n_peaks += 1
    return n_peaks

def cal_epi(y, squares=None):
    """
    Fraction of genotype squares in which corners 0 and 3 are both strictly
    fitter, or both strictly less fit, than corners 1 and 2.

    Parameters
    ----------
    y : np.ndarray
        Fitness value of each genotype.
    squares : np.ndarray of int, shape (n_squares, 4), optional
        Each row holds four genotype indices into `y`. Defaults to the
        module-level `epi_square_list`.
        NOTE(review): columns 0 and 3 are presumably the two opposite corners
        (e.g. wild type and double mutant) and columns 1 and 2 the two
        intermediates - confirm against the code that built the index file.

    Returns
    -------
    float
        Number of qualifying squares divided by the total number of squares.
    """
    if squares is None:
        # Fall back to the notebook-level index loaded from disk.
        squares = epi_square_list
    square_fit = y[squares]
    # Pair every "outer" corner (0, 3) with every "inner" corner (1, 2).
    outer = square_fit[:, [0, 0, 3, 3]]
    inner = square_fit[:, [1, 2, 1, 2]]
    outer_above = np.all(outer > inner, axis=1)
    outer_below = np.all(outer < inner, axis=1)
    # The two conditions are mutually exclusive, so the counts simply add.
    return (np.sum(outer_above) + np.sum(outer_below)) / len(square_fit)

def cal_r_s(y):
    """
    Roughness-to-slope ratio of a landscape.

    A Ridge regression (with intercept) of `y` on the module-level feature
    matrix `x` is fitted; "roughness" is the root-mean-square residual of
    that fit and "slope" is the mean absolute regression coefficient.

    Parameters
    ----------
    y : np.ndarray
        Fitness value of each genotype, aligned with the rows of `x`.

    Returns
    -------
    float
        roughness / slope.
    """
    # `x` is a global: one row per genotype, holding its one-hot encoded sequence.
    model = Ridge(fit_intercept=True)
    model.fit(x, y)
    residuals = y - model.predict(x)
    rms_residual = np.sqrt(np.mean(np.square(residuals)))
    mean_abs_coef = np.mean(np.abs(model.coef_))
    return rms_residual / mean_abs_coef

def cal_open_ratio(y, pathways=None):
    """
    Fraction of pathways along which fitness changes monotonically.

    A pathway is "open" when fitness never increases along it (descending) or
    never decreases along it (ascending). A pathway satisfying both conditions
    (all steps equal in fitness) is counted once, so the result is always
    within [0, 1].

    Parameters
    ----------
    y : np.ndarray
        Fitness value of each genotype.
    pathways : np.ndarray of int, shape (n_pathways, n_nodes), optional
        Each row lists the genotype indices visited by one pathway, in order.
        Defaults to the module-level `pathway_list`.

    Returns
    -------
    float
        Number of open pathways divided by the total number of pathways.
    """
    if pathways is None:
        # Fall back to the notebook-level index loaded (and filtered) earlier.
        pathways = pathway_list
    # Fitness drop at each step: positive when the next node is less fit.
    step_drop = y[pathways[:, :-1]] - y[pathways[:, 1:]]
    open_descend = np.all(step_drop >= 0, axis=1)
    open_ascend = np.all(step_drop <= 0, axis=1)
    # Logical OR (rather than adding the two counts) avoids counting a
    # completely flat pathway twice.
    return np.sum(open_descend | open_ascend) / len(pathways)

def cal_E(y, walsh=None, first_order=None):
    """
    Share of the non-constant squared Walsh coefficients that is not
    first-order.

    The coefficients are obtained as `walsh.dot(y) / len(y)` in float32.
    The first coefficient (index 0) is excluded from the total; the
    coefficients selected by `first_order` are then subtracted from it.

    Parameters
    ----------
    y : np.ndarray
        Fitness of every genotype of the complete landscape, ordered
        consistently with the rows of `walsh`.
    walsh : np.ndarray, shape (len(y), len(y)), optional
        Transform matrix. Defaults to the module-level `phi` (a Hadamard
        matrix built by the metric-selection code).
    first_order : np.ndarray of bool, shape (len(y),), optional
        Mask selecting the first-order coefficients. Defaults to the
        module-level `idx_1`.

    Returns
    -------
    float
        (total - first_order) / total of the squared coefficients, where the
        total excludes the coefficient at index 0.
    """
    if walsh is None:
        walsh = phi
    if first_order is None:
        first_order = idx_1
    W = y.astype('float32')
    # len(W) equals 2**N for a complete N-site binary landscape, which is the
    # only length the matrix product accepts.
    E = walsh.dot(W) / len(W)
    E_square = np.square(E)
    # Exclude the index-0 coefficient from the total.
    E_sum = E_square.sum() - E_square[0]
    E_1 = E_square[first_order].sum()
    return (E_sum - E_1) / E_sum

def cal_gamma(y, squares=None, neighbors=None):
    """
    Ratio of the covariance of paired fitness effects (taken over genotype
    squares) to the variance of all single-step fitness effects.

    For each square with corner fitnesses (f0, f1, f2, f3) the products
    (f1 - f0) * (f3 - f2) and (f2 - f0) * (f3 - f1) are accumulated and
    averaged over 2 * n_squares terms. The denominator is the variance of
    `y[j] - y[i]` over every genotype `i` and each of its neighbors `j`.

    Parameters
    ----------
    y : np.ndarray
        Fitness value of each genotype.
    squares : np.ndarray of int, shape (n_squares, 4), optional
        Genotype indices of each square. Defaults to the module-level
        `epi_square_list`.
    neighbors : sequence of index sequences, optional
        `neighbors[i]` holds the neighbor indices of genotype `i`. Defaults
        to the module-level `neighbor_list`.

    Returns
    -------
    float
        covariance / variance.
    """
    if squares is None:
        squares = epi_square_list
    if neighbors is None:
        neighbors = neighbor_list

    # Index the fitness array once instead of once per term.
    square_fit = y[squares]
    f0, f1, f2, f3 = (square_fit[:, col] for col in range(4))
    cov = np.sum((f1 - f0) * (f3 - f2)) + np.sum((f2 - f0) * (f3 - f1))
    cov = cov / (2 * square_fit.shape[0])

    # Fitness effect of every single step i -> j between neighbors.
    effects = [y[j] - y[i] for i in range(len(y)) for j in neighbors[i]]
    var = np.var(effects)
    return cov / var

def _greedy_walk(y, start, neighbors):
    """Walk from `start`, always moving to the fittest neighbor while it is strictly fitter; return (end index, step count)."""
    current = start
    n_steps = 0
    while True:
        candidates = neighbors[current]
        if len(candidates) == 0:
            break
        best = candidates[np.argmax(y[candidates])]
        # `not (a > b)` instead of `a <= b`: identical for ordinary numbers,
        # but it also stops the walk when a NaN is involved, which would
        # otherwise let the loop run forever.
        if not (y[best] > y[current]):
            break
        current = best
        n_steps += 1
    return current, n_steps


def cal_adptwalk_steps(y, neighbors=None):
    """
    Mean length of greedy adaptive walks started from every genotype.

    From each genotype the walk repeatedly moves to the fittest neighbor as
    long as that neighbor is strictly fitter. Walks that never move (zero
    steps) are left out of the average.

    Parameters
    ----------
    y : np.ndarray
        Fitness value of each genotype.
    neighbors : sequence of index sequences, optional
        `neighbors[i]` holds the neighbor indices of genotype `i`. Defaults
        to the module-level `neighbor_list`.

    Returns
    -------
    float
        Mean number of steps, or NaN when no walk takes a step.
    """
    if neighbors is None:
        neighbors = neighbor_list
    step_counts = []
    for start in range(len(y)):
        _, n_steps = _greedy_walk(y, start, neighbors)
        if n_steps > 0:
            step_counts.append(n_steps)
    if not step_counts:
        return np.nan
    return np.mean(step_counts)


def cal_adptwalk_probs(y, neighbors=None):
    """
    Fraction of greedy adaptive walks that end on the global optimum.

    Walks are started from every genotype (see `cal_adptwalk_steps` for the
    walk rule); walks that never move are ignored. Every remaining walk is
    checked against the global optimum, including a walk that stops because
    its end point has no neighbors.

    Parameters
    ----------
    y : np.ndarray
        Fitness value of each genotype.
    neighbors : sequence of index sequences, optional
        `neighbors[i]` holds the neighbor indices of genotype `i`. Defaults
        to the module-level `neighbor_list`.

    Returns
    -------
    float
        reached / total over the walks with at least one step, or NaN when
        no walk takes a step.
    """
    if neighbors is None:
        neighbors = neighbor_list
    idx_GO = np.argmax(y)  # index of the global optimum
    n_reach = 0
    n_total = 0
    for start in range(len(y)):
        end, n_steps = _greedy_walk(y, start, neighbors)
        if n_steps > 0:
            n_total += 1
            if end == idx_GO:
                n_reach += 1
    if n_total == 0:
        # Mirrors cal_adptwalk_steps instead of raising ZeroDivisionError.
        return np.nan
    return n_reach / n_total


# Map every supported metric name to the function that computes it.
_RUGGEDNESS_FUNCS = {
    'N_max': get_N_max,
    'epi': cal_epi,
    'r_s': cal_r_s,
    'open_ratio': cal_open_ratio,
    'E': cal_E,
    'gamma': cal_gamma,
    'adptwalk_steps': cal_adptwalk_steps,
    'adptwalk_probs': cal_adptwalk_probs,
}

# Fail loudly on an unsupported name. Otherwise `get_ruggedness` would stay
# undefined, or keep the function selected by an earlier run of this cell,
# and the results would silently belong to a different metric.
if metric not in _RUGGEDNESS_FUNCS:
    raise ValueError(
        f"Unknown metric {metric!r}; expected one of {sorted(_RUGGEDNESS_FUNCS)}"
    )
get_ruggedness = _RUGGEDNESS_FUNCS[metric]

if metric == 'E':
    # cal_E reads the module-level `phi` and `idx_1` prepared here.
    N = 9
    # Row k holds the N binary digits of k (zero-padded on the left).
    landscape_list_b = np.array(
        [[int(bit) for bit in format(genotype_idx, f'0{N}b')] for genotype_idx in range(2**N)]
    )
    phi = hadamard(2**N, dtype='float32')
    # Rows with exactly one 1, i.e. the first-order terms.
    idx_1 = landscape_list_b.sum(axis=1) == 1

In [27]:
# Calculate the ruggedness of the FL from 1, 2, or all 3 fitness measurement replicates.
# res_dict[k] collects one value per way of choosing k replicates; for k > 1
# the chosen replicates are averaged row-wise before the metric is computed.

duplicates_list = ['P1','P2','P3']
res_dict = {n_reps: [] for n_reps in range(1, len(duplicates_list) + 1)}
# The 'E' metric works on the complete, ordered binary sub-landscape;
# every other metric uses the full fitness table.
source_df = fitness_df_sub_ordered if metric == 'E' else fitness_df
for replication in range(1, len(duplicates_list) + 1):
    print(replication,end='\r')
    for duplicate in combinations(duplicates_list, replication):
        if replication == 1:
            # A single replicate is used as-is, without averaging
            y = source_df[duplicate[0]].to_numpy()
        else:
            y = source_df[list(duplicate)].mean(axis=1).to_numpy()
        res_dict[replication].append(get_ruggedness(y))
        

3

In [28]:
# Uncomment only if you want to overwrite.
# with open(f'./SD_seq_{metric}_plot.pkl','wb') as f:
#     pickle.dump(res_dict,f)