In [2]:
import numpy as np
import numpy.random as nrand
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import itertools
from itertools import combinations
import copy
from matplotlib.ticker import FormatStrFormatter
from sklearn.linear_model import Ridge

In [3]:
# Normalization constants handed to tolog(): Y_MIN is subtracted from the
# log fitness and the result is then divided by Y_MAX.
Y_MIN = -0.6931471805599453 # fitness minimum on the log scale (numerically equal to ln(0.5))
Y_MAX = 0.942657031435126 # fitness maximum used as the divisor in tolog(); presumably the maximum of the shifted log fitness -- TODO confirm

In [13]:
def load_sequence(sequences):
    """
    Convert ATCG sequences to a flattened one-hot encoding.

    Parameters
    ----------
    sequences : array-like of single-character strings, shape (n_seq, seq_len)
        One row per genotype, one column per sequence position.

    Returns
    -------
    numpy.ndarray of bool, shape (n_seq, seq_len * 4)
        For every position there are four consecutive indicator columns in
        the order A, T, C, G; exactly one of them is True when the base at
        that position is one of those four letters.
    """
    # Every base, including 'T', must be a single character so that it can
    # match the single-character entries of `sequences`.
    BASES = np.asarray(['A', 'T', 'C', 'G'])
    sequences = np.asarray(sequences)  # also accept nested lists
    shape = sequences.shape
    # Broadcast (n_seq, seq_len, 1) against (4,) -> (n_seq, seq_len, 4)
    data = sequences[..., None] == BASES
    return data.reshape(shape[0], shape[1] * BASES.size)

def tolog(y,Y_MIN,Y_MAX):
    """
    Log-transform fitness values and rescale them.

    Returns (log(y) - Y_MIN) / Y_MAX, computed element-wise; the input is
    not modified.
    """
    log_fitness = np.log(y)
    return (log_fitness - Y_MIN) / Y_MAX

In [95]:
def get_idx_from_seq(seq:str, seq_fit_dict):
    """
    Look up the index stored for a genotype sequence.

    Returns the value stored under `seq` in `seq_fit_dict`, or -1 when the
    sequence is not a key of the dictionary.
    """
    return seq_fit_dict.get(seq, -1)

def get_pathways(seq_start,seq_end,seq_fit_dict):
    """
    Enumerate the direct mutational pathways from a start to an end genotype.

    Every ordering of the positions at which the two genotypes differ is
    tried; a pathway applies those mutations one at a time and is kept only
    if every genotype along the way is a key of `seq_fit_dict`.

    Parameters
    ----------
    seq_start, seq_end : sequence of single-character strings
        Genotypes of equal length (1-D numpy array, list of characters or str).
    seq_fit_dict : dict
        Maps a genotype string to its index; genotypes that are not keys
        (or whose stored value is -1) are treated as unmeasured.

    Returns
    -------
    list of list
        One list of indices per accessible pathway, beginning with the index
        of `seq_start` and ending with the index of `seq_end`. Identical
        start and end genotypes give the single zero-step pathway
        ``[[idx_start]]``. An unmeasured start genotype gives ``[]``.

    Raises
    ------
    ValueError
        If the two genotypes do not have the same length.
    """
    # Work on private character arrays so the caller's inputs are never
    # modified and plain strings / lists are accepted as well.
    start = np.array(list(seq_start))
    end = np.array(list(seq_end))
    if start.shape != end.shape:
        raise ValueError("seq_start and seq_end must have the same length")

    # Same contract as get_idx_from_seq: -1 marks a genotype without an index.
    idx_start = seq_fit_dict.get(''.join(start), -1)
    if idx_start == -1:
        # No pathway can begin at an unmeasured genotype; returning -1 as a
        # pathway element would later be read as "last row" by indexing code.
        return []

    mut_positions = np.where(start != end)[0]

    pathways_list = []
    for mut_order in itertools.permutations(mut_positions):
        pathway_idx_list = [idx_start]
        seq_tmp = start.copy() # Record current genotype
        for pos in mut_order:
            seq_tmp[pos] = end[pos] # mutation on genotype
            tmp_idx = seq_fit_dict.get(''.join(seq_tmp), -1)
            if tmp_idx == -1:
                # Intermediate genotype has no measurement: abandon this ordering
                break
            pathway_idx_list.append(tmp_idx)
        else:
            # Reached only when the inner loop finished without a break,
            # which also covers the zero-mutation case.
            pathways_list.append(pathway_idx_list)

    return pathways_list
            
                

In [5]:
# Read the pickled raw table.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("All_data_df.pkl",'rb') as f:
    data = pickle.load(f)

# Keep genotypes for which fewer than 5 of the 6 replicate fitness values
# are exactly 0.5 (i.e. at least 2 replicates differ from 0.5).
data_filtered = data[
    (data[['FitS1','FitS2','FitS3','FitS4','FitS5','FitS6']] == 0.5).sum(axis=1) < 5
]

# Genotype space: one row per genotype, one character per column.
sequences = np.array([list(seq) for seq in data_filtered['Seq']])

# Map every genotype string to its row index in `sequences`.
seq_fit_dict = {''.join(chars): row for row, chars in enumerate(sequences)}

# One-hot encode the genotypes.
x = load_sequence(sequences)
# Log-transform and rescale the fitness values.
y = tolog(np.asarray(data_filtered['Fit']),Y_MIN,Y_MAX)

In [47]:
# Fitness cut-offs at the 20th and 80th percentile of y.
percentile20, percentile80 = np.percentile(y, (20, 80))
# Start genotypes: fitness at or below the 20th percentile.
seq20_list = sequences[np.less_equal(y, percentile20)]
# End genotypes: fitness at or above the 80th percentile.
seq80_list = sequences[np.greater_equal(y, percentile80)]

In [140]:
# Collect the pathways used for the open_ratio calculation.

pathway_idx_list = []
for seq20 in seq20_list:
    # Running pathway count, overwritten in place on one output line.
    print(len(pathway_idx_list),end='\r')
    # Number of positions at which each end genotype differs from this start.
    diff = np.sum(seq80_list != seq20, axis=1)
    # Only end genotypes exactly 4 mutations away are considered.
    seq80_filtered = seq80_list[diff == 4]
    # Starts with fewer than 15 such end genotypes contribute nothing.
    if len(seq80_filtered) >= 15:
        # Draw 15 distinct end genotypes at random for this start.
        seq80_rand_idx = nrand.choice(np.arange(len(seq80_filtered)),size=15,replace=False)
        seq80_filtered = seq80_filtered[seq80_rand_idx]
        for seq80 in seq80_filtered:
            pathway_idx_list.extend(get_pathways(seq20,seq80,seq_fit_dict))

325123

In [146]:
# Subsample the pathway pool down to exactly 300,000 pathways.
# Row indices are drawn without replacement, so no pathway is picked twice.
idx_300K = nrand.choice(np.arange(len(pathway_idx_list)),size=300000,replace=False)
# Turn the list of pathways into a 2-D array (one pathway per row) ...
pathway_idx_list = np.array(pathway_idx_list)
# ... and keep only the sampled rows.
pathway_idx_list_300K = pathway_idx_list[idx_300K, :]

In [149]:
#with open('../../index_file/trna_pathway_list_4steps_300000.pkl','wb') as f:
#    pickle.dump(pathway_idx_list_300K,f)