In [4]:
from Bio import SeqIO
import arnie
from arnie.pk_predictors import pk_predict
from arnie.utils import *
from arnie.utils import _group_into_non_conflicting_bp
import pandas as pd

def get_seq(seq_filename):
    """Load a FASTA file with ``SeqIO.read`` and return its sequence as a ``str``.

    Args:
        seq_filename: path (or handle) passed straight to ``SeqIO.read``.

    Returns:
        ``str(record.seq)`` for the record that was read.
    """
    return str(SeqIO.read(seq_filename, "fasta").seq)

def get_sliding_windows(full_seq, step, window):
    """Cut ``full_seq`` into fixed-length, possibly overlapping windows.

    Args:
        full_seq: any sliceable sequence with a length (string, list, ...).
        step: distance between consecutive window start positions.
        window: length of each window.

    Returns:
        ``(windows, coords)`` where ``coords`` holds the 0-based start index of
        each window and ``windows[k] == full_seq[coords[k]:coords[k] + window]``.
        Only full-length windows are produced; a trailing remainder shorter
        than ``window`` is dropped.
    """
    last_start = len(full_seq) - window
    starts = [pos for pos in range(0, last_start + 1, step)]
    slices = [full_seq[pos:pos + window] for pos in starts]
    return slices, starts

def stable_helices(dotbracket):
    """Return whether every helix group after the first has more than one base pair.

    The dot-bracket string is converted to a base-pair list (pseudoknots
    allowed) and split into groups of mutually non-conflicting pairs. The
    first group is exempt from the check (presumably the main nested
    structure -- confirm against arnie); any later group that consists of a
    single base pair makes the structure count as unstable.

    Args:
        dotbracket: structure string in dot-bracket notation.

    Returns:
        ``False`` if any group after the first contains exactly one base pair,
        ``True`` otherwise.
    """
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    groups = _group_into_non_conflicting_bp(bp_list)
    for idx, pairs in enumerate(groups):
        if idx > 0 and len(pairs) == 1:
            return False
    # Previously the function fell off the end here and returned None, so the
    # caller's truthiness test rejected every structure.
    return True
            
def get_pseudoknots(seq_windows, coords, pk_predictor):
    """Predict a structure for every window and keep the stable pseudoknots.

    Args:
        seq_windows: iterable of window sequences.
        coords: start coordinate of each window, parallel to ``seq_windows``.
        pk_predictor: predictor name forwarded to ``pk_predict``.

    Returns:
        pandas DataFrame with columns ``start``, ``end``, ``sequence`` and
        ``struct``; one row per window whose predicted structure passes both
        ``is_PK`` and ``stable_helices``. ``end`` is ``start`` plus the length
        of that window's sequence.
    """
    PK_list = []
    for seq, coord in zip(seq_windows, coords):
        dotbracket = pk_predict(seq, pk_predictor)
        if is_PK(dotbracket) and stable_helices(dotbracket):
            # The end coordinate is derived from the window itself; the
            # original read an undefined/global name `window` here.
            PK_list.append([coord, coord + len(seq), seq, dotbracket])
    return pd.DataFrame(PK_list, columns=["start", "end", "sequence", "struct"])

# Input is a list of SHAPE values as floats; missing values are numpy nan objects.
def normalize_shape(shape_reacs):
    """Scale SHAPE reactivities by a scale factor computed after outlier removal.

    The scale factor is computed from the non-nan values only:
      1. sort them and take the entries at the 25% and 75% positions;
      2. filter 1 is the index of the first value above 1.5 * |q3 - q1|,
         filter 2 is the index at the 95% position;
      3. values below the entry at the larger of the two indices are kept;
      4. the scale factor is the mean of the kept values that lie above the
         entry at the 90% position of the kept values.

    Args:
        shape_reacs: sequence of floats, nan marking missing data.

    Returns:
        numpy array with the same length as the input, every entry divided by
        the scale factor (nan entries stay nan).
    """
    reactivities = np.array(shape_reacs)

    # Only the non-nan entries take part in choosing the cutoff and scale factor.
    ordered = np.sort(reactivities[~np.isnan(reactivities)])
    n_values = len(ordered)

    # Filter 1: first position whose value exceeds 1.5 * inter-quartile range.
    lower_quartile = ordered[int(0.25 * n_values)]
    upper_quartile = ordered[int(0.75 * n_values)]
    iqr_threshold = 1.5 * abs(upper_quartile - lower_quartile)
    iqr_index = next(
        position for position, value in enumerate(list(ordered))
        if value > iqr_threshold
    )

    # Filter 2: position of the 95% value.
    pct95_index = int(0.95 * n_values)

    # The larger of the two positions sets the cutoff; keep what lies below it.
    cutoff_value = ordered[max(iqr_index, pct95_index)]
    kept = ordered[ordered < cutoff_value]

    # Scale factor: mean of the kept values above the 90% position.
    top_decile_floor = kept[int(0.9 * len(kept))]
    scale = np.mean(kept[kept > top_decile_floor])

    # Scale the full data set (nan entries included).
    return reactivities / scale

# Input is a text file of any SHAPE data set (one value per line); output is the
# normalized values as a list (missing values are np.nan objects).
def retrieve_shape_data(shape_filename):
    """Read one-value-per-line SHAPE reactivities and return them normalized.

    Blank lines are ignored. The value ``-999`` and the tokens ``nan``/``NaN``
    are treated as missing data and become ``np.nan``. Every other line must
    parse as a float.

    Args:
        shape_filename: path of the text file to read.

    Returns:
        list of floats produced by ``normalize_shape`` (nan for missing data).

    Raises:
        ValueError: if a non-blank line cannot be parsed as a float.
    """
    shape_reacs = []
    # The context manager closes the file even if a line fails to parse.
    with open(str(shape_filename), "r") as shape_file:
        for line in shape_file:
            token = line.strip()
            if not token:
                # Skip blank lines without touching the sequence being
                # iterated; the original removed items from the list it was
                # looping over, which dropped the value after a blank line.
                continue
            value = float(token)  # also parses "nan" / "NaN" to nan
            # -999 marks missing data in the input file.
            shape_reacs.append(np.nan if value == -999 else value)

    # normalize shape data
    normalized_shape_data = normalize_shape(shape_reacs).tolist()
    return normalized_shape_data

#add in this capability later
#def get_pseudoknots_with_shapeknots()

def viral_knots(seq_filename, shapeknots=False, shape_rankings=False, shape_data_folder=None,
                shape_data_sets=None, *, pk_predictors, step, window):
    """Predict pseudoknots in sliding windows along a viral genome.

    ``pk_predictors``, ``step`` and ``window`` are required keyword-only
    arguments (the original signature placed them after defaulted parameters,
    which is a SyntaxError).

    Args:
        seq_filename: fasta file containing RNA sequence for viral genome.
        shapeknots: if shapeknots is one of the predictors desired to use
            (must include shape data). Running shapeknots is not implemented yet.
        shape_rankings: if shape reactivity values are used to score the
            pseudoknots (must include shape data). Scoring is not implemented yet.
        shape_data_folder: folder containing csv's with shape reactivity.
        shape_data_sets: names of the reactivity files (no .csv necessary).
        pk_predictors: list of desired predictors for use: currently
            implemented are threshknots, knotty, pknots, spotrna.
        step: desired number of nucleotides to slide each window.
        window: desired size of window to divide genome into.

    Returns:
        list of DataFrames from ``get_pseudoknots``, one per entry of
        ``pk_predictors`` and in the same order.

    Raises:
        ValueError: if ``shapeknots`` or ``shape_rankings`` is set but
            ``shape_data_folder`` or ``shape_data_sets`` is missing.
    """
    ## first retrieve viral sequence; T is replaced by U to get the RNA alphabet
    seq = get_seq(seq_filename)
    RNA_seq = seq.replace("T", "U")

    ## sort sequence and normalized shape data into windows
    seq_windows, coords = get_sliding_windows(RNA_seq, step=step, window=window)

    # NOTE: the shape windows are prepared for the shapeknots / ranking steps
    # below, which are not implemented yet, so nothing consumes them so far.
    all_shape_windows = []
    if shapeknots or shape_rankings:
        if shape_data_folder is None or shape_data_sets is None:
            raise ValueError(
                "shape_data_folder and shape_data_sets are required when "
                "shapeknots or shape_rankings is enabled"
            )
        for name in shape_data_sets:
            track = retrieve_shape_data(shape_data_folder+'/'+name+'.csv')
            shape_windows, _ = get_sliding_windows(track, step=step, window=window)
            all_shape_windows.append(shape_windows)

    ## run necessary data through each predictor and sort output into individual csvs
    pk_dfs = [
        get_pseudoknots(seq_windows, coords, pk_predictor=name)
        for name in pk_predictors
    ]

    ##run shapeknots - once implemented

    #shapeknots_dfs = []
    #if shapeknots:
        #shapeknots_dfs.append(run_shapeknots(...))

    ##run csv's through ranking function - output is sorted in csv containing program, location,
        #sequence, structure, consensus, shape score, and free energy

    #output is single csv containing: location, sequence, average consensus for that window
        #average consensus for pseudoknotted base pairs
        #list of predictors and corresponding structures
        #z_scores for each structure (average across all predicted structures?)
            #to use arnie function, must use either nupack or threshknot+contrafold+linearpartition

    # Until the ranking/consolidation step exists, hand the per-predictor
    # results back instead of discarding them.
    return pk_dfs


In [10]:
# Load a previously saved results table (presumably the spotrna predictor's
# pseudoknot windows, judging by the file name) and display it.
# NOTE(review): the path is absolute and machine-specific; the cell only runs
# on a machine that has this exact file.
# NOTE(review): the displayed frame carries "Unnamed: 0.1" / "Unnamed: 0"
# columns, which look like row indexes written into the CSV on earlier saves --
# confirm, and consider saving with index=False.
spotrna = pd.read_csv('/home/gnye8/Desktop/PK_research/pipeline_results/direct_output/spotrna.csv')
spotrna

Unnamed: 0.1,Unnamed: 0,start,end,sequence,struct
0,0,40,160,UUUCGAUCUCUUGUAGAUCUGUUCUCUAAACGAACUUUAAAAUCUG...,....(((((.....))))).........................(....
1,1,120,240,CACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGA...,.((....((..(((.......)))))[[[[[)).]]]]]...((((...
2,2,360,480,AGACUCCGUGGAGGAGGUCUUAUCAGAGGCACGUCAACAUCUUAAA...,((((((([[[..)))).))).........]]].....((((((.))...
3,3,400,520,CUUAAAGAUGGCACUUGUGGCUUAGUAGAAGUUGAAAAAGGCGUUU...,......((.....(....((((......((((((....(((((......
4,4,440,560,GCGUUUUGCCUCAACUUGAACAGCCCUAUGUGUUCAUCAAACGUUC...,.((((.[[[[.[[[[[((((((........))))))...))))......
...,...,...,...,...,...
334,334,29480,29600,UCUCCAAACAAUUGCAACAAUCCAUGAGCAGUGCUGACUCAACUCA...,............(.........[[[[[[[)...(((((....))))...
335,335,29520,29640,AACUCAGGCCUAAACUCAUGCAGACCACACAAGGCAGAUGGGCUAU...,...........................(((((((....((((((((...
336,336,29560,29680,GGCUAUAUAAACGUUUUCGCUUUUCCGUUUACGAUAUAUAGUCUAC...,((((((((((([[[.............)).]]].)))))))))(.....
337,337,29600,29720,GUCUACUCUUGUGCAGAAUGAAUUCUCGUAACUACAUAGCACAAGU...,..((((..((((((.(((....)))...[[[[[[[[[.))))))))...
