In [1]:
import pandas as pd
import arnie
from arnie.utils import *
from arnie.utils import _group_into_non_conflicting_bp

# import csv for pseudoknot predictions

def get_csv(csv_loc):
    df = pd.read_csv(csv_loc)
    return df 

# extract locations for each pseudoknot along with dotbracket structures

def get_info(df):
    
    starts = df['start'].to_list()
    ends = df['end'].to_list()
    sequences = df['sequence'].to_list()
    dotbrackets = df['struct'].to_list()
    
    return starts, ends, sequences, dotbrackets

# import shapeknots data and convert to list

def get_shape_data(filename):
    shape = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            shape.append(line)
            
    for i in range(len(shape)):
        shape[i] = (-1) if shape[i] == 'nan' else float(shape[i])
        
    return shape

# use Rachael's function to compare shape and dotbracket structure and return ranking

def evaluate_L1_shape_score(s,shape):
    score = 0
    for c,react in zip(s,shape):
        if (c=="." and react>0.25) or (c!="." and react<0.5):
            score += 1
    return score/len(s)

# rank PKs using my old functions

def get_groups(dotbracket):
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    groups = _group_into_non_conflicting_bp(bp_list)
    return groups

def get_bracket_count(struct):
    bracket_count = 0
    for char in struct:
        if char == '[':
            bracket_count+=1
    return bracket_count

def get_PK_rank(struct):
    PK_rank = 0
    if get_bracket_count(struct) > 1:
        for pos,char in enumerate(struct):
            if (char == '[' and struct[pos+1] == '['):
                PK_rank += 1
    for pos,char in enumerate(struct):
        if char == '[':
            if (pos == 0) and (struct[pos+1] != '[') or (struct[pos+1] != '.'):
                PK_rank -= 1
            elif (pos != 0) and (struct[pos-1] != '[') or (struct[pos-1] != '.'):
                PK_rank-= 1
            elif (pos != 0) and (struct[pos+1] != '[') or (struct[pos+1] != '.'):
                PK_rank -= 1
            elif (pos == len(pos)) and (struct[pos-1] != '[') or (struct[pos-1] != '.'):
                PK_rank -= 1
        elif char == ']':
            if (pos == 0) and ((struct[pos+1] != ']') or (struct[pos+1] != '.')):
                PK_rank -= 1
            elif (pos != 0) and (struct[pos-1] != ']') or (struct[pos-1] != '.'):
                PK_rank-= 1
            elif (pos != 0) and (struct[pos+1] != ']') or (struct[pos+1] != '.'):
                PK_rank -= 1
            elif (pos == len(pos)) and (struct[pos-1] != ']') or (struct[pos-1] != '.'):
                PK_rank -= 1
    return PK_rank

# rank PKs on consensus with other predictions

def get_bp_list(dotbracket):
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    return bp_list

def compare_bp_lists(bp_list1, bp_list2):
    bp_list_score = 0
    for bp1 in bp_list1: 
        for bp2 in bp_list2: 
            if bp1 == bp2: 
                bp_list_score += 1
        # divide by total number of base pairs in bp_list1 to normalize results
    return bp_list_score/len(bp_list1)

def get_consensus_scores(start_locs1, bp_lists1, start_locs2, bp_lists2):
    scores = []
    for i, loc1 in enumerate(start_locs1): 
        for idx, loc2 in enumerate(start_locs2):
            if loc1 == loc2:
                bp_list1 = bp_lists1[i]
                bp_list2 = bp_lists2[idx]
                bp_list_score = compare_bp_lists(bp_list1, bp_list2)
                scores.append(bp_list_score)
        else:
            scores.append(0)
    return scores

# create new dataframe with rankings

def get_df(starts, ends, sequences, dotbrackets, shape_scores, ranks, consensus_scores):
    PK_list = zip(starts, ends, sequences, dotbrackets, shape_scores, ranks, consensus_scores)
    df = pd.DataFrame(PK_list, columns = ['start', 'end', 'sequence', 'structure', 'shape_score', 'rank', 'consensus_score'])
    ranked_df = df.sort_values('rank', ascending=False)
    return ranked_df

# put it all together

def score_pk_consensus_with_shape(csv, shape_file, csv2):
    df = get_csv(csv)
    starts, ends, sequences, dotbrackets = get_info(df)
    
    # get rough score for consensus with shape data
    
    full_shape = get_shape_data(shape_file)
    shapes = []
    for i, start in enumerate(starts):
        end = ends[i]
        shape_window = full_shape[start:end]
        shapes.append(shape_window)
    
    shape_scores = []
    for i, struct in enumerate(dotbrackets):
        shape = shapes[i]
        score = evaluate_L1_shape_score(struct, shape)
        shape_scores.append(score)
        
    # get rough ranking for likelihood of PK
        
    ranks = []
    for struct in dotbrackets: 
        rank = get_PK_rank(struct)
        ranks.append(rank)
        
    # get consensus score with other predictions
    
    df2 = get_csv(csv2)
    starts2, ends2, sequences2, dotbrackets2 = get_info(df2)
    
    bp_lists1 = []
    for dotbracket in dotbrackets: 
        bp_list = get_bp_list(dotbracket)
        bp_lists1.append(bp_list)
        
    bp_lists2 = []
    for dotbracket in dotbrackets2: 
        bp_list = get_bp_list(dotbracket)
        bp_lists2.append(bp_list)
        
    consensus_scores = get_consensus_scores(starts, bp_lists1, starts2, bp_lists2)
        
    df = get_df(starts, ends, sequences, dotbrackets, shape_scores, ranks, consensus_scores)
    return df

In [2]:
knotty_df = score_pk_consensus_with_shape('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv', '/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/incarnato_invivo_reactivity-Copy1.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv')

In [3]:
knotty_df.to_csv('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_analysis_scores.csv')

In [4]:
pknots_df = score_pk_consensus_with_shape('/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv', '/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/incarnato_invivo_reactivity-Copy1.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv')

In [5]:
pknots_df.to_csv('/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pknots_analysis_scores.csv')

In [6]:
knotty_df

# pseudoknot specific shape ranking
# pseudoknot specific consensus score
# visualize interesting pseudoknots

Unnamed: 0,start,end,sequence,structure,shape_score,rank,consensus_score
109,5000,5120,UCCACACGCAAGUUGUGGACAUGUCAAUGACAUAUGGACAACAGUU...,(((((((....).)))))).(((((.[[)))))]].((((.(((((...,0.808333,-3,0.000000
379,16560,16680,AUUGCAACAUGUGACUGGACAAAUGCUGGUGAUUACAUUUUAGCUA...,.(((((.[[))))).]]......(((.((((.(((.........))...,0.775000,-3,0.000000
62,2720,2840,CACCAACAAAGGUUACUUUUGGUGAUGACACUGUGAUAGAAGUGCA...,((((((..(((....)))))))))....((((........))))((...,0.850000,-3,0.000000
45,1960,2080,GCUAUAACAAUACUAGAUGGAAUUUCACAGUAUUCACUGAGACUCA...,.............((((((((((.(((.((((.(((.((([[.)))...,0.791667,-3,0.000000
132,6160,6280,GAUGUGGUGGCUAUUGAUUAUAAACACUACACACCCUCUUUUAAGA...,..(((((((...............)))))))....(((((((.[[)...,0.716667,-3,0.000000
...,...,...,...,...,...,...,...
141,6520,6640,AUUACAGAAGAGGUUGGCCACACAGAUCUAAUGGCUGCUUAUGUAG...,.....(([[[.[[[[(((((..[[[[[[[[[))))).......[[[...,0.833333,-38,0.000000
273,12080,12200,CAACCUUACAAGCUAUAGCCUCAGAGUUUAGUUCCCUUCCAUCAUA...,.........((((([[[[[.[[[[[[[............[[[[[[....,0.758333,-38,0.181818
472,20560,20680,UUUCUAAGGUUGUCAAAGUGACUAUUGACUAUACAGAAAUUUCAUU...,((((((.[[[[[[[[((((((...[[[[[[[....[.[[[.)))))...,0.516667,-39,0.166667
372,16280,16400,AUGUGGUGCUUGCAUACGUAGACCAUUCUUAUGUUGUAAAUGCUGU...,.((((([[[..[[[[[[[[[[[[....[[.[[..[[[..[[[.[[....,0.741667,-40,0.000000


In [7]:
pknots_df

Unnamed: 0,start,end,sequence,structure,shape_score,rank,consensus_score
237,21200,21320,UAAGCUCAUGGGACACUUCGCAUGGUGGACAGCCUUUGUUACUAAU...,...........(((...................................,0.541667,-2,0.0
255,22520,22640,UCCAACCAACAGAAUCUAUUGUUAGAUUUCCUAAUAUUACAAACUU...,...........(((((((....)))))))........((((....)...,0.691667,-3,0.0
222,20120,20240,AAUUGGAGAAGCCGUAAAAACACAGUUCAAUUAUUAUAAGAAAGUU...,...(((.....)))....(((((...((((((..........))))...,0.508333,-3,0.0
115,10320,10440,UAAGGUUGAUACAGCCAAUCCUAAGACACCUAAGUAUAAGUUUGUU...,...((((.....))))........(((((..(((...(((((((((...,0.750000,-3,0.0
1,120,240,CACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGA...,.((((((((..((((.....))))))))).)))((.(((((.((((...,0.700000,-3,0.0
...,...,...,...,...,...,...,...
4,440,560,GCGUUUUGCCUCAACUUGAACAGCCCUAUGUGUUCAUCAAACGUUC...,((.....))((((((.((((((........)))))).....[[[.....,0.716667,-23,0.2
92,8800,8920,ACUAAUGACAAAGCUUGCCCAUUGAUUGCUGCAGUCAUAACAAGAG...,....((((([[[[.[[[[[.[[[[.[[[.[[[.))))).....(((...,0.683333,-24,0.0
30,3000,3120,UGAGUUUAAAUUGGCUUCACAUAUGUAUUGUUCUUUCUACCCUCCA...,...........((.[[[[[[[....[[[[.[[[[[[....[[[[))...,0.750000,-25,0.0
49,4480,4600,UAUAAGGGUAUUAAAAUACAAGAGGGUGUGGUUGAUUAUGGUGCUA...,..((((((((...((((..[[[[[[[[[[[.[[[[[..[[[[[[[[...,0.791667,-27,0.0
