In [17]:
# read fasta sequence and convert to string

from Bio import SeqIO

def get_seq(seq_filename):
    record = SeqIO.read(seq_filename, "fasta")
    full_seq = str(record.seq)
    return full_seq

# divide string into list of sliding windows of appropriate length

def get_sliding_windows(full_seq, step, window):
    windows = []
    for i in range(0, len(full_seq)+1, step):
        new_windows = full_seq[i:i+window]
        if len(new_windows) == window:
            windows.append(new_windows)
    return windows

# for loop to run bpps on each item in the list

import arnie
from arnie.bpps import bpps
from arnie.pfunc import pfunc 

def get_bpps_list(windows):
    bpps_list = []
    for i in windows:
        new_bpps = bpps(i, package='contrafold', linear=True, threshknot=True)
        bpps_list.append(new_bpps)
    return bpps_list

# for loop to run MEA on each item to convert bpps to bp_list

from arnie.mea.mea import MEA
from arnie.mea.mea_utils import *

def get_bp_list(bpps_list):
    bp_list = []
    for i in bpps_list:
        new_bp = MEA(i, run_probknot_heuristic=True, theta=0.3)
        new_bp_list = new_bp.MEA_bp_list
        bp_list.append(new_bp_list)
    return bp_list

# function to go over each bp_list and check whether or not it is a pseudoknot

def is_PK(bp_list):
    '''checks if a given bp_list represents a PK
    Args:
        bp_list: of list of tuples where the tuples are the indeces of the bp in increasing order (bp[0]<bp[1])
    
    returns:
        True if it is a psuedoknot
    '''
    for i in bp_list:
        if bp_list == []:
            return False
        else:
            current_bp = bp_list[0]
            for bp in bp_list[1:]:
                if ((current_bp[0] < bp[0] and bp[0] < current_bp[1] and current_bp[1] < bp[1])
                    or (current_bp[0] > bp[0] and current_bp[0] < bp[1] and bp[1] < current_bp[1])):
                    return True
            return is_PK(bp_list[1:])


In [20]:
def get_dotbracket_list(bpps_list):
    dotbracket_list = []
    for i in bpps_list:
        new_dotbracket = MEA(i, run_probknot_heuristic=True, theta=0.3)
        new_structure = new_dotbracket.structure
        dotbracket_list.append(new_structure)
    return dotbracket_list

In [30]:
from Bio import SeqIO
import arnie
from arnie.bpps import bpps
from arnie.pfunc import pfunc
from arnie.mea.mea import MEA
from arnie.mea.mea_utils import *

def get_pseudoknots(seq_filename, step, window):
        
        full_seq = get_seq(seq_filename)
        
        windows = get_sliding_windows(full_seq, step=step, window=window)
        
        bpps_list = get_bpps_list(windows)
        
        bp_list = get_bp_list(bpps_list)
        
        PK_list = is_PK(bp_list)
        
        return PK_list

In [31]:
get_pseudoknots("SARS_last_1000.fasta", step = 100, window = 300)

True