In [24]:
import arnie
from arnie.utils import *
from arnie.mfe_bootstrap import mfe_bootstrap
from arnie.bpps import bpps
from arnie.mea.mea import MEA
from arnie.mea.mea_utils import *
from arnie.utils import _group_into_non_conflicting_bp
from Bio import SeqIO
import pandas as pd

def get_seq(seq_filename):
    """Return the sequence stored in a FASTA file as a plain string.

    Args:
        seq_filename: Path handed to ``SeqIO.read`` with the ``"fasta"`` format.

    Returns:
        ``str`` of the parsed record's ``seq`` attribute.
    """
    return str(SeqIO.read(seq_filename, "fasta").seq)

def get_shape_data(filename):
    """Read one reactivity value per line from a text file.

    Every line is stripped of surrounding whitespace and parsed with ``float``.
    Any value that parses to NaN is replaced by the sentinel ``-1``. Testing
    the parsed number (rather than comparing the text to ``'nan'``) means
    spellings such as ``NaN`` or ``NAN`` are also mapped to the sentinel
    instead of leaking a real NaN into the returned list.

    Args:
        filename: Path of the file to read; one value per line.

    Returns:
        List with one entry per line: the parsed ``float``, or ``-1`` where
        the value was NaN.

    Raises:
        ValueError: If a line (including a blank line) cannot be parsed by
            ``float``.
    """
    import math

    shape = []
    with open(filename) as f:
        for line in f:
            value = float(line.strip())
            # -1 is the missing-data marker this notebook passes downstream.
            shape.append(-1 if math.isnan(value) else value)
    return shape

def get_sliding_windows(full_seq, shape, step, window):
    """Cut a sequence and its parallel shape track into fixed-size windows.

    Window start offsets run from 0 in increments of ``step`` and stop at the
    last offset where a full ``window``-long slice still fits, so a trailing
    remainder shorter than ``window`` is not covered.

    Args:
        full_seq: Sliceable sequence (its ``len`` fixes the start offsets).
        shape: Sliceable object sliced with the same offsets as ``full_seq``.
        step: Distance between consecutive window starts.
        window: Length of each slice.

    Returns:
        Tuple ``(seq_windows, shape_windows, coords)`` of three lists, where
        ``coords`` holds the 0-based start offset of each window.
    """
    last_start = len(full_seq) - window
    coords = list(range(0, last_start + 1, step))
    seq_windows = [full_seq[start:start + window] for start in coords]
    shape_windows = [shape[start:start + window] for start in coords]
    return seq_windows, shape_windows, coords

def run_probknot(bpp, theta=0.3):
    """Build an ``MEA`` object with the ProbKnot heuristic enabled and unpack it.

    Args:
        bpp: Passed straight through as the first argument of ``MEA``
            (presumably a base-pair probability matrix; confirm against arnie).
        theta: Forwarded to ``MEA`` as ``theta``.

    Returns:
        Tuple ``(bp_list, structure)`` taken from the object's ``MEA_bp_list``
        and ``structure`` attributes.
    """
    mea_result = MEA(bpp, run_probknot_heuristic=True, theta=theta)
    return mea_result.MEA_bp_list, mea_result.structure

def get_bp_list(dotbracket):
    """Convert ``dotbracket`` to a base-pair list with pseudoknots allowed.

    Thin wrapper around ``convert_dotbracket_to_bp_list`` called with
    ``allow_pseudoknots=True``; the result is returned unchanged.
    """
    return convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)

def get_groups(bp_list):
    """Return ``_group_into_non_conflicting_bp(bp_list)`` unchanged.

    Public-looking alias for the private arnie helper imported at the top of
    the notebook.
    """
    return _group_into_non_conflicting_bp(bp_list)
    
def is_probable_PK(groups):
    """Decide whether enough sizeable groups exist beyond the first one.

    The first element of ``groups`` is ignored. Among the remaining elements,
    those with more than two members are counted.

    Args:
        groups: Sequence of sized collections (in this notebook, the output of
            ``_group_into_non_conflicting_bp``).

    Returns:
        ``True`` when at least two of the non-first groups have more than two
        members, otherwise ``False``.
    """
    sizeable = sum(1 for group in groups[1:] if len(group) > 2)
    return sizeable >= 2
    
def get_shape_pseudoknots(seq_filename, shape_filename, step, window, num_bootstrap, theta=0.3):
    """Scan a sequence in sliding windows and tabulate the windows flagged as pseudoknots.

    The sequence and its shape values are loaded from disk and cut into
    windows. Each window is passed to ``mfe_bootstrap`` and then
    ``run_probknot``. A window is kept when ``is_PK`` accepts the resulting
    structure and ``is_probable_PK`` accepts the grouped base pairs.

    Args:
        seq_filename: FASTA path read by ``get_seq``.
        shape_filename: Path read by ``get_shape_data`` (one value per line).
        step: Offset between consecutive window starts.
        window: Window length.
        num_bootstrap: Forwarded to ``mfe_bootstrap`` as ``num_bootstrap``.
        theta: Forwarded to ``run_probknot``.

    Returns:
        ``pandas.DataFrame`` with columns ``start``, ``end``, ``sequence``,
        ``fold_structure`` and ``threshknot_structure``, one row per kept
        window. ``start`` is the window offset plus one and ``end`` is the
        offset plus ``window`` (1-based, inclusive). The frame has no rows when
        nothing is kept.
    """
    full_seq = get_seq(seq_filename)
    full_shape = get_shape_data(shape_filename)

    seq_windows, shape_windows, coords = get_sliding_windows(full_seq, full_shape, step, window)

    # One tuple per hit. Building the rows here keeps the coordinates aligned
    # with the structures; previously the start/end lists were never filled,
    # so the resulting frame was always empty.
    records = []
    for offset, seq, shape in zip(coords, seq_windows, shape_windows):
        mfe_struct, bpp = mfe_bootstrap(seq, num_bootstrap=num_bootstrap, shape_signal=shape, pk=False)
        bp_list, threshknot_struct = run_probknot(bpp, theta=theta)
        groups = _group_into_non_conflicting_bp(bp_list)
        if is_PK(threshknot_struct) and is_probable_PK(groups):
            records.append((offset + 1, offset + window, seq, mfe_struct, threshknot_struct))
        print('finished one seq')

    return pd.DataFrame(
        records,
        columns=['start', 'end', 'sequence', 'fold_structure', 'threshknot_structure'],
    )

In [8]:
seq = 'AAAUUGGGGCCCCUUUAAAAAAGGCUUU'

# get_sliding_windows takes a per-position shape track as its second argument
# and returns three lists. A placeholder track of zeros is enough here because
# this cell only inspects the sequence windows and their start offsets.
seq_windows, shape_windows, coords = get_sliding_windows(seq, [0.0] * len(seq), 2, 5)
print(seq_windows)
print(coords)

# 1-based inclusive start and end of the first window (window length 5).
print(coords[0]+1)
print(coords[0]+5)

['AAAUU', 'AUUGG', 'UGGGG', 'GGGCC', 'GCCCC', 'CCCUU', 'CUUUA', 'UUAAA', 'AAAAA', 'AAAAG', 'AAGGC', 'GGCUU']
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]
1
5


In [9]:
# Load the reference sequence (FASTA) and the matching reactivity values.
# NOTE(review): both paths are absolute and specific to one machine; they must
# be adjusted before this cell can run anywhere else.
full_SARS = get_seq('/home/gnye8/Desktop/PK_research/SSRP_work/fasta_files/SARS_CoV2_reference.fasta')
full_shape = get_shape_data('/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/incarnato_invivo_reactivity-Copy1.csv')

In [28]:
# Take the same 1000-element slice (indices 13000-13999) from the sequence and
# from the shape values so the two stay position-aligned for the trial run.
test_SARS = full_SARS[13000:14000]
test_shape = full_shape[13000:14000]

def test_get_shape_pseudoknots(full_seq, full_shape, step, window, num_bootstrap, theta=0.3):
    """Scan an in-memory sequence in sliding windows and tabulate pseudoknot hits.

    Same pipeline as the file-based variant, but the sequence and shape values
    are passed in directly. Each window is passed to ``mfe_bootstrap`` and then
    ``run_probknot``. A window is kept when ``is_PK`` accepts the resulting
    structure and ``is_probable_PK`` accepts the grouped base pairs.

    Args:
        full_seq: Sequence to scan.
        full_shape: Per-position values sliced with the same offsets as
            ``full_seq``.
        step: Offset between consecutive window starts.
        window: Window length.
        num_bootstrap: Forwarded to ``mfe_bootstrap`` as ``num_bootstrap``.
        theta: Forwarded to ``run_probknot``.

    Returns:
        ``pandas.DataFrame`` with columns ``start``, ``end``, ``sequence``,
        ``fold_structure`` and ``threshknot_structure``, one row per kept
        window. ``start`` is the window offset plus one and ``end`` is the
        offset plus ``window`` (1-based, inclusive, relative to ``full_seq``).
        The frame has no rows when nothing is kept.
    """
    seq_windows, shape_windows, coords = get_sliding_windows(full_seq, full_shape, step, window)

    # One tuple per hit. Building the rows here keeps the coordinates aligned
    # with the structures; previously the start/end lists were never filled,
    # so the resulting frame was always empty.
    records = []
    for offset, seq, shape in zip(coords, seq_windows, shape_windows):
        mfe_struct, bpp = mfe_bootstrap(seq, num_bootstrap=num_bootstrap, shape_signal=shape, pk=False)
        bp_list, threshknot_struct = run_probknot(bpp, theta=theta)
        groups = _group_into_non_conflicting_bp(bp_list)
        if is_PK(threshknot_struct) and is_probable_PK(groups):
            records.append((offset + 1, offset + window, seq, mfe_struct, threshknot_struct))
        print('finished one seq')

    return pd.DataFrame(
        records,
        columns=['start', 'end', 'sequence', 'fold_structure', 'threshknot_structure'],
    )

In [29]:
# Trial run on the 1000-position slice: step=40, window=120, num_bootstrap=100
# (theta left at its default).
test = test_get_shape_pseudoknots(test_SARS, test_shape, 40, 120, 100)

[0.167, 3.354, 2.384, 0.383, 0.1, 0.02, 0.049, 0.043, 0.039, 0.101, 0.057, 0.133, 0.072, 0.057, 0.204, 0.168, 0.162, 0.056, 0.064, 0.136, 0.373, 0.148, 0.439, 0.446, 0.084, 0.208, 0.157, 0.171, 0.295, 0.317, 0.436, 1.334, 0.66, 0.19, 0.407, 0.632, 0.601, 0.047, 1.655, 0.391, 0.285, 0.129, 0.323, 0.105, 0.118, 0.002, 0.153, 0.171, 0.497, 0.158, 0.335, 0.205, 0.883, 0.656, 1.229, 0.443, 1.235, 1.155, 0.454, 0.322, 0.995, 1.556, 1.698, 1.023, 0.757, 0.913, 2.262, 0.484, 0.318, 1.372, 0.243, 0.785, 0.237, 1.493, 0.106, 0.535, 0.082, 0.076, 0.27, 0.291, 0.827, 0.097, 0.221, 0.088, 0.495, 0.825, 2.169, 1.18, 5.491, 0.939, 0.255, 0.248, 0.036, 0.0, 0.177, 0.553, 0.094, 0.031, 0.538, 0.035, 0.047, 0.155, 0.264, 0.564, 0.544, 0.386, 0.361, 0.351, 0.396, 0.264, 0.401, 0.544, 0.563, 1.543, 0.449, 0.13, 0.513, 0.68, 0.133, 0.129]
finished one seq
[0.285, 0.129, 0.323, 0.105, 0.118, 0.002, 0.153, 0.171, 0.497, 0.158, 0.335, 0.205, 0.883, 0.656, 1.229, 0.443, 1.235, 1.155, 0.454, 0.322, 0.995, 1.556

finished one seq
[0.803, 0.528, 0.144, 0.404, 2.867, 1.042, 0.077, 0.033, 0.895, 0.442, 0.253, 0.369, 0.867, 0.315, 0.143, 0.254, 0.061, 0.141, 0.301, 0.526, 0.461, 0.839, 0.894, 0.568, 0.787, 1.617, 1.459, 2.578, 0.836, 1.991, 0.758, 0.42, 0.686, 1.276, 1.614, 5.372, 0.13, 0.048, 0.191, 0.769, 0.262, 1.267, 0.071, 0.063, 0.148, 0.283, 0.113, 0.106, 0.773, 0.159, 0.206, 1.029, 0.698, 0.206, 0.239, 0.433, 0.19, 0.173, 0.068, 0.669, 0.189, nan, 0.022, 0.002, 0.099, 0.0, 0.249, 0.093, 0.18, 0.034, 0.015, 0.255, 1.985, 0.193, 0.118, 0.057, 0.196, 0.197, 0.045, 1.564, 1.727, 0.102, 0.35, 2.449, 1.766, 0.473, 0.016, 0.027, 0.259, 0.067, 0.451, 0.38, 0.339, 1.373, 1.89, 3.354, 1.082, 0.7, 2.483, 0.121, 0.088, 0.129, 0.592, 5.672, 2.384, 2.276, 3.539, 1.381, 0.087, 0.041, 0.016, 0.104, 0.023, 0.229, 0.1, 0.06, 0.0, 0.115, 0.125, 0.121]
finished one seq
[0.262, 1.267, 0.071, 0.063, 0.148, 0.283, 0.113, 0.106, 0.773, 0.159, 0.206, 1.029, 0.698, 0.206, 0.239, 0.433, 0.19, 0.173, 0.068, 0.669, 0.1

finished one seq
[0.532, 0.874, 0.005, 0.738, 0.221, 1.022, 2.064, 1.353, 0.679, 1.8, 1.786, 0.071, 0.01, 1.291, 0.949, 1.435, 0.154, 0.173, 0.385, 0.49, 0.273, 0.252, 0.0, 0.253, 0.335, 0.257, 0.0, 0.146, 1.039, 0.12, 0.245, 1.382, 1.119, 0.713, 0.726, 1.324, 0.407, 0.278, 0.0, 0.069, 0.273, 0.492, 0.591, 0.324, 0.0, 0.0, 0.0, 0.058, 0.008, 0.0, 0.0, 0.0, 0.459, 0.215, 0.895, 0.956, 1.408, 1.535, 1.837, 0.0, 0.161, 0.0, 0.998, 0.012, 0.0, 0.0, 0.009, 0.099, 0.0, 0.0, 1.037, 0.171, 0.386, 0.094, 0.199, 0.384, 0.668, 0.301, 0.0, 0.263, 0.228, 0.178, 0.615, 0.878, 0.512, 0.331, 0.0, 0.228, 0.252, 0.0, 0.0, 0.0, 0.131, 0.0, 0.0, 0.115, 0.099, 0.026, 0.025, 0.0, 0.071, 0.122, 0.0, 0.028, 0.0, 0.0, 0.0, 0.0, 0.191, 0.488, 0.705, 3.063, 0.761, 0.774, 0.041, 0.12, 0.0, 0.0, 0.259, 0.0]
finished one seq
[0.273, 0.492, 0.591, 0.324, 0.0, 0.0, 0.0, 0.058, 0.008, 0.0, 0.0, 0.0, 0.459, 0.215, 0.895, 0.956, 1.408, 1.535, 1.837, 0.0, 0.161, 0.0, 0.998, 0.012, 0.0, 0.0, 0.009, 0.099, 0.0, 0.0, 1.037,

In [27]:
# Display the DataFrame returned by the trial run.
test

Unnamed: 0,start,end,sequence,fold_structure,threshknot_structure
