In [10]:
import pandas as pd
from arnie.utils import *

# retrieve data from csv for pseudoknot predictions to be compared 

def get_csv(csv_loc):
    df = pd.read_csv(csv_loc)
    return df 

# store lists of start locations and structures

def get_info(df):
    
    starts = df['start'].to_list()
    ends = df['end'].to_list()
    sequences = df['sequence'].to_list()
    dotbrackets = df['struct'].to_list()
    
    return starts, ends, sequences, dotbrackets

# convert dotbracket structures to bp lists

def get_bp_list(dotbracket):
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    return bp_list
    
# get a list of all structures from the list you are comparing with that have the same start location

def get_scores(start_locs1, bp_lists1, start_locs2, bp_lists2):
    scores = []
    for i, loc1 in enumerate(start_locs1): 
        for idx, loc2 in enumerate(start_locs2):
            if loc1 == loc2:
                bp_list1 = bp_lists1[i]
                bp_list2 = bp_lists2[idx]
                bp_list_score = compare_bp_lists(bp_list1, bp_list2)
                scores.append(bp_list_score)
        else:
            scores.append(0)
    return scores

# compare bps in the bp lists - simple point system, add a point for every shared bp

def compare_bp_lists(bp_list1, bp_list2):
    bp_list_score = 0
    for bp1 in bp_list1: 
        for bp2 in bp_list2: 
            if bp1 == bp2: 
                bp_list_score += 1
        # divide by total number of base pairs in bp_list1 to normalize results
    return bp_list_score/len(bp_list1)

# add scores into final dataframe
def get_df(starts, ends, sequences, dotbrackets, scores):
    PK_list = zip(starts, ends, sequences, dotbrackets, scores)
    df = pd.DataFrame(PK_list, columns = ['start', 'end', 'sequence', 'structure', 'score'])
    ranked_df = df.sort_values('score', ascending=False)
    return ranked_df


def get_consensus_scores(csv1, csv2):
    df1 = get_csv(csv1)
    df2 = get_csv(csv2)
    
    starts1, ends1, sequences1, dotbrackets1 = get_info(df1)
    starts2, ends2, sequences2, dotbrackets2 = get_info(df2)
    
    bp_lists1 = []
    for dotbracket in dotbrackets1: 
        bp_list = get_bp_list(dotbracket)
        bp_lists1.append(bp_list)
        
    bp_lists2 = []
    for dotbracket in dotbrackets2: 
        bp_list = get_bp_list(dotbracket)
        bp_lists2.append(bp_list)
        
    scores = get_scores(starts1, bp_lists1, starts2, bp_lists2)
    
    df = get_df(starts1, ends1, sequences1, dotbrackets1, scores)
    return df

In [11]:
knotty_scores = get_consensus_scores('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv')

In [12]:
knotty_scores

Unnamed: 0,start,end,sequence,structure,score
169,7640,7760,GUGCUGGUAGUACAUUUAUUAGUGAUGAAGUUGCGAGAGACUUGUC...,..((((((((((.(((((((((((((((.(((......))))))))...,1.000000
324,14280,14400,AAACUCUUUGACCGUUAUUUUAAAUAUUGGGAUCAGACAUACCACC...,.............((((.....(((.(((((.[[[[[[[.....))...,0.973684
328,14440,14560,UUGUUGAUGGUGUUCCAUUUGUAGUUUCAACUGGAUACCACUUCAG...,...(((((((((((([[[..[[[[[[[[[[[[))))))))..[[.....,0.934783
553,23920,24040,CAAAUUUACAAAACACCACCAAUUAAAGAUUUUGGUGGUUUUAAUU...,..............((((((((..[[[[[[[))))))))...]]]]...,0.897436
175,7880,8000,AUGUUAUAGUUUUUGAUGGUAAAUCAAAAUGUGAAGAAUCAUCUGC...,....(((((.((((((({{{{{))))))).(((([[..))))..((...,0.885714
...,...,...,...,...,...
259,11520,11640,GUUUUUGGCCAGAGGUAUUGUUUUUAUGUGUGUUGAGUAUUGCCCU...,.....((((((((((((.((...........(([[[[[[[.))......,0.000000
261,11600,11720,UAAUGCUAGUUUAUUGUUUCUUAGGCUAUUUUUGUACUUGUUACUU...,......(((((((..((.....((((((..[[[[[....[..[[[....,0.000000
263,11680,11800,CUGACUCUUGGUGUUUAUGAUUACUUAGUUUCUACACAGGAGUUUA...,..((((((([[[[[[[[[[....[[[[[[[[[[....))))))).....,0.000000
264,11720,11840,AGUUUAGAUAUAUGAAUUCACAGGGACUACUCCCACCCAAGAAUAG...,.((((((...((((.(((.[[[[[[.[[[[[......[[[[))).....,0.000000


In [13]:
pknots_scores = get_consensus_scores('/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv')

In [14]:
pknots_scores

Unnamed: 0,start,end,sequence,structure,score
213,18800,18920,AAGCAACCAUGAUCUGUAUUGUCAAGUCCAUGGUAAUGCACAUGUA...,..(((((((((.................))))))..)))((((((....,1.000000
110,10000,10120,UCUGAUGUUCUUUACCAACCACCACAAACCUCUAUCACCUCAGCUG...,.........(((((((.[[[[..[[[[.[[[[.((((((((((......,1.000000
211,18640,18760,UAGGACCUGAGCGCACCUGUUGUCUAUGUGAUAGACGUGCCACAUG...,((((..[[[[[.[[[)))).((((((.....)))))).((.....)...,1.000000
114,10280,10400,GGGUUAUUGGACAUUCUAUGCAAAAUUGUGUACUUAAGCUUAAGGU...,..(((....)))....((((((....)))))).....(((((.(((...,0.911765
292,25880,26000,AACUUCUUCAAUUGUCAUUACUUCAGGUGAUGGCACAACAAGUCCU...,......((((..(((((((((.....)))))))))[[[[[[[[[[[...,0.909091
...,...,...,...,...,...
141,12880,13000,UAUACAGAACUGGAACCACCUUGUAGGUUUGUUACAGACACACCUA...,..(((...[[[...[[[[((((.(((((.(((.....))).)))))...,0.000000
142,12960,13080,AGGAUUAAACAACCUAAAUAGAGGUAUGGUACUUGGUAGUUUAGCU...,.........((((((......)))).))(((((((((((.....))...,0.000000
144,13040,13160,AAGUGCCUGCCAAUUCAACUGUAUUAUCUUUCUGUGCUUUUGCUGU...,.(((.............)))......((((..(((.[[[[[[[[[[...,0.000000
146,13160,13280,GUACACACACUGGUACUGGUCAGGCAAUAACAGUUACACCGGAAGC...,((((........))))(((((.((((((.((((...((((....((...,0.000000
