In [13]:
import os
import itertools
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sys
sys.path.append("../")
from models.Ground_truth_oracles.RNA_landscape_models import RNA_landscape_constructor
from models.Ground_truth_oracles.TF_binding_landscape_models import *

In [29]:
def is_sequence_a_peak(model, sequence, peak_dict, alphabet="AGTC"):
    """Return 1 if no single-substitution neighbor of `sequence` is fitter, else 0.

    Args:
        model: Object exposing `get_fitness(sequence_string)`; results are
            compared with `<` and `>`.
        sequence: The candidate sequence, as a string.
        peak_dict: Cache mapping sequence -> 0/1. It is read for an early
            answer and updated in place: `sequence` gets its verdict, and any
            neighbor found to be strictly less fit than `sequence` is recorded
            as 0 (it cannot be a peak, since `sequence` beats it).
        alphabet: Iterable of single-character symbols to substitute at each
            position.

    Returns:
        The cached value if `sequence` is already in `peak_dict`; otherwise 0
        as soon as a strictly fitter neighbor is found, or 1 if none exists.
        Neighbors with equal fitness do not disqualify a sequence.
    """
    if sequence in peak_dict:
        return peak_dict[sequence]

    sequence_fitness = model.get_fitness(sequence)
    neighbor = list(sequence)
    for position, original_symbol in enumerate(sequence):
        for symbol in alphabet:
            if symbol == original_symbol:
                continue
            neighbor[position] = symbol
            neighbor_string = "".join(neighbor)
            # Query the oracle once per neighbor and reuse the value for both
            # comparisons; the oracle call is the expensive part of this loop.
            neighbor_fitness = model.get_fitness(neighbor_string)
            if sequence_fitness < neighbor_fitness:
                peak_dict[sequence] = 0
                return 0
            if sequence_fitness > neighbor_fitness:
                peak_dict[neighbor_string] = 0
        # Restore this position before mutating the next one.
        neighbor[position] = original_symbol

    peak_dict[sequence] = 1
    return 1

def get_all_peaks(landscape, alphabet='AGTC', seq_length=14):
    """Exhaustively enumerate sequences and collect those that are local peaks.

    Args:
        landscape: Fitness oracle passed through to `is_sequence_a_peak`
            (must expose `get_fitness`).
        alphabet: Symbols used both to enumerate sequences and to generate
            single-substitution neighbors.
        seq_length: Length of the sequences to enumerate. Defaults to 14,
            the value that was previously hard-coded.

    Returns:
        A set of sequence strings for which `is_sequence_a_peak` returned a
        truthy value.

    Note:
        The search visits len(alphabet) ** seq_length sequences, and the
        shared peak cache keeps an entry for every sequence classified so far,
        so memory use grows throughout the run.
    """
    peaks = set()
    peak_dict = {}  # shared cache of 0/1 verdicts, filled by is_sequence_a_peak
    for ind, symbols in enumerate(itertools.product(alphabet, repeat=seq_length)):
        if ind % 10000 == 0:
            # Progress report: sequences visited, peaks found, cache size.
            print('Processed {} sequences and found {} peaks'.format(ind, len(peaks)))
            print(len(peak_dict))
        seq = ''.join(symbols)
        if is_sequence_a_peak(landscape, seq, peak_dict, alphabet):
            peaks.add(seq)
    return peaks

# Make sure the output directory for the pickled peak sets exists;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('../peaks', exist_ok=True)

In [30]:
# Both landscapes are read from the same YAML config; only the selected index differs.
_rna_landscape_config = "../data/RNA_landscapes/RNA_landscape_config.yaml"


def _load_single_rna_landscape(landscape_index):
    """Create a constructor limited to one landscape index.

    Returns the constructor together with the first item yielded by
    `generate_from_loaded_landscapes()`.
    """
    constructor = RNA_landscape_constructor()
    constructor.load_landscapes(_rna_landscape_config,
                                landscapes_to_test=[landscape_index])
    first_landscape = next(constructor.generate_from_loaded_landscapes())
    return constructor, first_landscape


rna_landscape_constructor_1, landscape1 = _load_single_rna_landscape(0)
rna_landscape_constructor_2, landscape2 = _load_single_rna_landscape(12)

B1L14RNA1 loaded
B2L14RNA1+2 loaded


In [31]:
# Find all local peaks of the first landscape's oracle.
# NOTE(review): 'UGTC' contains both U and T and no A — confirm this is the intended alphabet.
peaks_1 = get_all_peaks(landscape1["landscape_oracle"], 'UGTC')
# pickle.dump needs a binary-writable file; the context manager also closes the handle.
with open('../peaks/peaks_B1L14RNA1.pkl', 'wb') as peaks_file:
    pickle.dump(peaks_1, peaks_file)

Processed 0 sequences and found 0 peaks
0
Processed 10000 sequences and found 0 peaks
10000
Processed 20000 sequences and found 0 peaks
20000
Processed 30000 sequences and found 0 peaks
30000
Processed 40000 sequences and found 0 peaks
40000
Processed 50000 sequences and found 0 peaks
50000
Processed 60000 sequences and found 0 peaks
60000
Processed 70000 sequences and found 0 peaks
70328
Processed 80000 sequences and found 0 peaks
81077
Processed 90000 sequences and found 0 peaks
91512
Processed 100000 sequences and found 0 peaks
101663
Processed 110000 sequences and found 0 peaks
112195
Processed 120000 sequences and found 0 peaks
123078
Processed 130000 sequences and found 0 peaks
133190
Processed 140000 sequences and found 0 peaks
143190
Processed 150000 sequences and found 0 peaks
153190
Processed 160000 sequences and found 0 peaks
163190
Processed 170000 sequences and found 0 peaks
173190
Processed 180000 sequences and found 0 peaks
183190
Processed 190000 sequences and found 0 p

Processed 1560000 sequences and found 0 peaks
2100009
Processed 1570000 sequences and found 0 peaks
2121544
Processed 1580000 sequences and found 0 peaks
2136744
Processed 1590000 sequences and found 0 peaks
2152439
Processed 1600000 sequences and found 0 peaks
2163858
Processed 1610000 sequences and found 0 peaks
2177378
Processed 1620000 sequences and found 0 peaks
2192211
Processed 1630000 sequences and found 0 peaks
2207598
Processed 1640000 sequences and found 0 peaks
2224191
Processed 1650000 sequences and found 0 peaks
2240180
Processed 1660000 sequences and found 0 peaks
2252431
Processed 1670000 sequences and found 0 peaks
2262815
Processed 1680000 sequences and found 0 peaks
2278130
Processed 1690000 sequences and found 0 peaks
2293400
Processed 1700000 sequences and found 0 peaks
2310929
Processed 1710000 sequences and found 0 peaks
2325129
Processed 1720000 sequences and found 0 peaks
2340840
Processed 1730000 sequences and found 0 peaks
2352412
Processed 1740000 sequences 

Processed 3080000 sequences and found 0 peaks
4024109
Processed 3090000 sequences and found 0 peaks
4034109
Processed 3100000 sequences and found 0 peaks
4044109
Processed 3110000 sequences and found 0 peaks
4054109
Processed 3120000 sequences and found 0 peaks
4064109
Processed 3130000 sequences and found 0 peaks
4074109
Processed 3140000 sequences and found 0 peaks
4084109
Processed 3150000 sequences and found 0 peaks
4094109
Processed 3160000 sequences and found 0 peaks
4104109
Processed 3170000 sequences and found 0 peaks
4114109
Processed 3180000 sequences and found 0 peaks
4124109
Processed 3190000 sequences and found 0 peaks
4134109
Processed 3200000 sequences and found 0 peaks
4144109
Processed 3210000 sequences and found 0 peaks
4154109
Processed 3220000 sequences and found 0 peaks
4164626
Processed 3230000 sequences and found 0 peaks
4176159
Processed 3240000 sequences and found 0 peaks
4186159
Processed 3250000 sequences and found 0 peaks
4196567
Processed 3260000 sequences 

Processed 4600000 sequences and found 0 peaks
5805984
Processed 4610000 sequences and found 0 peaks
5817882
Processed 4620000 sequences and found 0 peaks
5828799
Processed 4630000 sequences and found 0 peaks
5840313
Processed 4640000 sequences and found 0 peaks
5853266
Processed 4650000 sequences and found 0 peaks
5865577
Processed 4660000 sequences and found 0 peaks
5878879
Processed 4670000 sequences and found 0 peaks
5892019
Processed 4680000 sequences and found 0 peaks
5904775
Processed 4690000 sequences and found 0 peaks
5916349
Processed 4700000 sequences and found 0 peaks
5929668
Processed 4710000 sequences and found 0 peaks
5942647
Processed 4720000 sequences and found 0 peaks
5957053
Processed 4730000 sequences and found 0 peaks
5970638
Processed 4740000 sequences and found 0 peaks
5984691
Processed 4750000 sequences and found 0 peaks
5996396
Processed 4760000 sequences and found 0 peaks
6009184
Processed 4770000 sequences and found 0 peaks
6026725
Processed 4780000 sequences 

Processed 6120000 sequences and found 0 peaks
8091148
Processed 6130000 sequences and found 0 peaks
8102458
Processed 6140000 sequences and found 0 peaks
8117818
Processed 6150000 sequences and found 0 peaks
8133372
Processed 6160000 sequences and found 0 peaks
8148614
Processed 6170000 sequences and found 0 peaks
8166655
Processed 6180000 sequences and found 0 peaks
8183327
Processed 6190000 sequences and found 0 peaks
8198989
Processed 6200000 sequences and found 0 peaks
8216463
Processed 6210000 sequences and found 0 peaks
8234093
Processed 6220000 sequences and found 0 peaks
8252273
Processed 6230000 sequences and found 0 peaks
8269754
Processed 6240000 sequences and found 0 peaks
8286260
Processed 6250000 sequences and found 0 peaks
8300630
Processed 6260000 sequences and found 0 peaks
8316974
Processed 6270000 sequences and found 0 peaks
8333706
Processed 6280000 sequences and found 0 peaks
8350791
Processed 6290000 sequences and found 0 peaks
8367650
Processed 6300000 sequences 

SystemError: <built-in function duplexfold> returned a result with an error set

In [None]:
# Find all local peaks of the second landscape's oracle. The oracle lives in the
# generated landscape (landscape2), not in the constructor object.
# NOTE(review): 'UGTC' contains both U and T and no A — confirm this is the intended alphabet.
peaks_2 = get_all_peaks(landscape2["landscape_oracle"], 'UGTC')
# pickle.dump needs a binary-writable file; the context manager also closes the handle.
with open('../peaks/peaks_B2L14RNA1+2.pkl', 'wb') as peaks_file:
    pickle.dump(peaks_2, peaks_file)