In [8]:
import arnie
from arnie.utils import *
from arnie.mfe_bootstrap import mfe_bootstrap
from arnie.bpps import bpps
from arnie.mea.mea import MEA
from arnie.mea.mea_utils import *
from arnie.utils import _group_into_non_conflicting_bp
from arnie.pk_predictors import pk_predict_from_bpp
from Bio import SeqIO
import pandas as pd

def get_seq(seq_filename):
    """Read a FASTA file and return its sequence as a plain string.

    Args:
        seq_filename: Path handed to ``SeqIO.read`` with format ``"fasta"``.
            NOTE(review): ``SeqIO.read`` is expected to require exactly one
            record in the file -- confirm against the Biopython docs.

    Returns:
        str: ``str()`` of the record's ``seq`` attribute.
    """
    fasta_record = SeqIO.read(seq_filename, "fasta")
    sequence = fasta_record.seq
    return str(sequence)

def get_shape_data(filename):
    """Load reactivity values from a text file holding one value per line.

    Each line is stripped of surrounding whitespace and parsed with
    ``float``. Any value that parses to NaN is replaced by the sentinel
    ``-1``, so ``nan``, ``NaN`` and ``NAN`` are all treated as missing
    (previously only the exact lowercase string ``'nan'`` was caught and the
    other spellings leaked through as real NaN floats).

    Args:
        filename: Path of the file to read.

    Returns:
        list: One entry per line, in file order; a float, or ``-1`` for a
        missing value.

    Raises:
        ValueError: If a line (including a blank one) cannot be parsed by
            ``float``. Lines are deliberately not skipped so that list
            positions keep matching line numbers.
    """
    import math  # local import: only this function needs it

    shape = []
    with open(filename) as f:
        for line in f:
            value = float(line.strip())
            # -1 marks "no data" for this position.
            shape.append(-1 if math.isnan(value) else value)

    return shape

def get_sliding_windows(full_seq, shape, step, window):
    """Cut a sequence and its per-position data into aligned fixed-size windows.

    Window starts are ``0, step, 2*step, ...`` up to and including
    ``len(full_seq) - window``, so every sequence window has exactly
    ``window`` characters. ``shape`` is sliced with the same bounds.

    Args:
        full_seq: Sequence to slice (its length sets the start positions).
        shape: Sliceable object cut with the same start/stop as ``full_seq``.
        step: Distance between consecutive window starts.
        window: Length of each window.

    Returns:
        tuple: ``(seq_windows, shape_windows, coords)`` where ``coords`` is
        the list of 0-based start offsets, one per window.
    """
    last_start = len(full_seq) - window
    coords = list(range(0, last_start + 1, step))

    seq_windows = [full_seq[start:start + window] for start in coords]
    shape_windows = [shape[start:start + window] for start in coords]

    return seq_windows, shape_windows, coords

def predict_pks(bpp, heuristic='threshknot', theta=0.3):
    """Forward to ``pk_predict_from_bpp`` and return its result unchanged.

    Args:
        bpp: Passed through as the first positional argument
            (presumably a base-pair probability matrix -- TODO confirm).
        heuristic: Passed through as the second positional argument.
        theta: Passed through as the third positional argument.

    Returns:
        Whatever ``pk_predict_from_bpp(bpp, heuristic, theta)`` returns.
    """
    return pk_predict_from_bpp(bpp, heuristic, theta)

def get_bp_list(dotbracket):
    """Convert a dot-bracket string to a base-pair list, pseudoknots allowed.

    Thin wrapper around ``convert_dotbracket_to_bp_list`` called with
    ``allow_pseudoknots=True``.
    """
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    return bp_list

def get_groups(bp_list):
    """Return the result of ``_group_into_non_conflicting_bp`` for ``bp_list``."""
    groups = _group_into_non_conflicting_bp(bp_list)
    return groups
    
def is_probable_PK(groups):
    """Decide whether base-pair groups look like a well-supported pseudoknot.

    The first group is ignored. Of the remaining groups, those with more
    than two members are counted; the verdict is positive when at least two
    such groups exist.

    Args:
        groups: Sequence of sized groups (e.g. lists of base pairs).

    Returns:
        bool: ``True`` if two or more groups after the first have length > 2.
    """
    sizeable_groups = [group for group in groups[1:] if len(group) > 2]
    return len(sizeable_groups) >= 2
    
def get_shape_pseudoknots(seq_filename, shape_filename, step, window, num_bootstrap, theta=0.3):
    """Scan a sequence in sliding windows and tabulate probable pseudoknot windows.

    The sequence and its reactivity profile are loaded from disk and cut
    into windows with ``get_sliding_windows``. For each window,
    ``mfe_bootstrap`` is run with the window's reactivities as
    ``shape_signal`` and ``pk=False``; the returned ``bpp`` is passed to
    ``predict_pks`` with the ``'threshknot'`` heuristic. A window is kept
    when ``is_PK`` accepts that structure and ``is_probable_PK`` accepts its
    base-pair groups.

    Args:
        seq_filename: FASTA file read by ``get_seq``.
        shape_filename: Reactivity file read by ``get_shape_data``.
        step: Offset between consecutive window starts.
        window: Window length.
        num_bootstrap: Forwarded to ``mfe_bootstrap``.
        theta: Threshold forwarded to ``predict_pks``. It was previously
            accepted but ignored in favour of a hard-coded 0.3.

    Returns:
        pandas.DataFrame: One row per retained window with columns
        ``start`` (1-based), ``end`` (1-based, inclusive), ``sequence``,
        ``fold_structure`` (first value returned by ``mfe_bootstrap``) and
        ``threshknot_structure``.
    """
    full_seq = get_seq(seq_filename)
    full_shape = get_shape_data(shape_filename)

    seq_windows, shape_windows, coords = get_sliding_windows(full_seq, full_shape, step, window)

    # One tuple per hit, already in column order; avoids keeping five
    # parallel lists in sync and a second pass to compute coordinates.
    records = []
    for offset, seq, shape in zip(coords, seq_windows, shape_windows):
        mfe_struct, bpp = mfe_bootstrap(seq, num_bootstrap=num_bootstrap, shape_signal=shape, pk=False)
        # Honour the caller-supplied threshold.
        threshknot_struct = predict_pks(bpp, heuristic='threshknot', theta=theta)
        bp_list = convert_dotbracket_to_bp_list(threshknot_struct, allow_pseudoknots=True)
        groups = _group_into_non_conflicting_bp(bp_list)
        if is_PK(threshknot_struct) and is_probable_PK(groups):
            print('found one')
            # offset is 0-based; report 1-based inclusive coordinates.
            records.append((offset + 1, offset + window, seq, mfe_struct, threshknot_struct))
        print('finished one seq')

    columns = ['start', 'end', 'sequence', 'fold_structure', 'threshknot_structure']
    return pd.DataFrame(records, columns=columns)

In [None]:
# Full-genome scan: 120-nt windows every 40 nt, 100 bootstrap rounds per window.
zhang_invivo = get_shape_pseudoknots(
    seq_filename='/home/gnye8/Desktop/PK_research/SSRP_work/fasta_files/SARS_CoV2_reference.fasta',
    shape_filename='/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/zhang_invivo_reactivity-Copy1.csv',
    step=40,
    window=120,
    num_bootstrap=100,
)

Reached max iteration, stopping before converged.
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
found one
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
Reached max iteration, stopping before converged.
found one
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
found one
finished one seq
finished one seq
Reached max iteration, stopping before converged.
found one
finished one seq
finished one seq
finished one seq


In [2]:
# Load the reference genome and its matching reactivity profile.
full_SARS = get_seq(
    seq_filename='/home/gnye8/Desktop/PK_research/SSRP_work/fasta_files/SARS_CoV2_reference.fasta'
)
full_shape = get_shape_data(
    filename='/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/zhang_invivo_reactivity-Copy1.csv'
)

In [3]:
# 1000-element test region (0-based offsets 13000..13999), cut identically
# from the sequence and the reactivity list so they stay aligned.
test_SARS, test_shape = full_SARS[13000:14000], full_shape[13000:14000]

def test_get_shape_pseudoknots(full_seq, full_shape, step, window, num_bootstrap, theta=0.3):
    """In-memory variant of the pseudoknot window scan, with debug output.

    Takes an already-loaded sequence and reactivity list instead of file
    names. Each window from ``get_sliding_windows`` is folded with
    ``mfe_bootstrap`` (``pk=False``), the resulting ``bpp`` is passed to
    ``predict_pks`` with the ``'threshknot'`` heuristic, and the window is
    kept when both ``is_PK`` and ``is_probable_PK`` accept it. The hit
    starts and threshknot structures are printed before returning.

    Args:
        full_seq: Sequence to scan.
        full_shape: Per-position reactivities, sliced alongside ``full_seq``.
        step: Offset between consecutive window starts.
        window: Window length.
        num_bootstrap: Forwarded to ``mfe_bootstrap``.
        theta: Threshold forwarded to ``predict_pks``. It was previously
            accepted but ignored in favour of a hard-coded 0.3.

    Returns:
        pandas.DataFrame: One row per retained window with columns
        ``start`` and ``end`` (1-based, inclusive, relative to the start of
        ``full_seq`` as passed in), ``sequence``, ``fold_structure`` and
        ``threshknot_structure``.
    """
    seq_windows, shape_windows, coords = get_sliding_windows(full_seq, full_shape, step, window)

    # One tuple per hit, already in column order.
    records = []
    for offset, seq, shape in zip(coords, seq_windows, shape_windows):
        mfe_struct, bpp = mfe_bootstrap(seq, num_bootstrap=num_bootstrap, shape_signal=shape, pk=False)
        # Honour the caller-supplied threshold.
        threshknot_struct = predict_pks(bpp, heuristic='threshknot', theta=theta)
        bp_list = convert_dotbracket_to_bp_list(threshknot_struct, allow_pseudoknots=True)
        groups = _group_into_non_conflicting_bp(bp_list)
        if is_PK(threshknot_struct) and is_probable_PK(groups):
            print('found one')
            # offset is 0-based; report 1-based inclusive coordinates.
            records.append((offset + 1, offset + window, seq, mfe_struct, threshknot_struct))
        print('finished one seq')

    # Debug output: start coordinates and threshknot structures of the hits.
    print([record[0] for record in records])
    print([record[4] for record in records])

    columns = ['start', 'end', 'sequence', 'fold_structure', 'threshknot_structure']
    return pd.DataFrame(records, columns=columns)

In [4]:
# Scan the test region: 120-nt windows every 40 nt, 100 bootstrap rounds each.
test = test_get_shape_pseudoknots(
    full_seq=test_SARS,
    full_shape=test_shape,
    step=40,
    window=120,
    num_bootstrap=100,
)

finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
found one lol
finished one seq
finished one seq
finished one seq
finished one seq
found one lol
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
finished one seq
found one lol
finished one seq
finished one seq
finished one seq
found one lol
finished one seq
[321, 481, 761, 881]
['....(((....(([...))[[[[[)))]]]]]...]..........((((.(((((((.((((((...))))))..))).))))))))(((((....)))))..(((......)))....', '........((((((........(((......((.........))[[[[[[[))))).))))...]]]]]]].(((((.......(((.................))).......))))).', '((.((((.(((((.......))))).)))).))((((........((((((.....((((((((.(((((....)))))...)))).)))).))))))[[[.[[[..))))]]]...]]]', '..(((((..((((((.(((((...))))).)))))).[[[[....))))).((((....))))..............(((((((((...((....))...)))))))))....]]]]...']


In [5]:
# Display the hit table; start/end are 1-based positions within the
# sequence that was passed to the scan (here the test_SARS slice).
test

Unnamed: 0,start,end,sequence,fold_structure,threshknot_structure
0,321,440,ACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAAC...,...(((((((.(((.....(((((...))))).............)...,....(((....(([...))[[[[[)))]]]]]...].............
1,481,600,TAAGTGCAGCCCGTCTTACACCGTGCGGCACAGGCACTAGTACTGA...,.......(((((((........(((((((((((((....)).)))....,........((((((........(((......((.........))[[...
2,761,880,CGGTGACATGGTACCACATATATCACGTCAACGTCTTACTAAATAC...,((.((((.(((((.......))))).)))).))((((........(...,((.((((.(((((.......))))).)))).))((((........(...
3,881,1000,CACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGG...,.........((((((.(((((...))))).)))))).((((.((((...,..(((((..((((((.(((((...))))).)))))).[[[[....)...
