In [105]:
import pandas as pd
import arnie
from arnie.utils import *
from arnie.utils import _group_into_non_conflicting_bp

# import csv for pseudoknot predictions

def get_csv(csv_loc):
    df = pd.read_csv(csv_loc)
    return df 

# extract locations for each pseudoknot along with dotbracket structures

def get_info(df):
    
    starts = df['start'].to_list()
    ends = df['end'].to_list()
    sequences = df['sequence'].to_list()
    dotbrackets = df['struct'].to_list()
    
    return starts, ends, sequences, dotbrackets

# import shapeknots data and convert to list

def get_shape_data(filename):
    shape = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            shape.append(line)
            
    for i in range(len(shape)):
        shape[i] = (-1) if shape[i] == 'nan' else float(shape[i])
        
    return shape

# use Rachael's function to compare shape and dotbracket structure and return ranking

def evaluate_L1_shape_score(s,shape):
    score = 0
    for c,react in zip(s,shape):
        if (c=="." and react>0.25) or (c!="." and react<0.5):
            score += 1
    return score/len(s)

# rank PKs using my old functions

def get_groups(dotbracket):
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    groups = _group_into_non_conflicting_bp(bp_list)
    return groups

def get_bracket_count(struct):
    bracket_count = 0
    for char in struct:
        if char == '[':
            bracket_count+=1
    return bracket_count

def get_PK_rank(struct):
    PK_rank = 0
    if get_bracket_count(struct) > 1:
        for pos,char in enumerate(struct):
            if (char == '[' and struct[pos+1] == '['):
                PK_rank += 1
    for pos,char in enumerate(struct):
        if char == '[':
            if (pos == 0) and (struct[pos+1] != '[') or (struct[pos+1] != '.'):
                PK_rank -= 1
            elif (pos != 0) and (struct[pos-1] != '[') or (struct[pos-1] != '.'):
                PK_rank-= 1
            elif (pos != 0) and (struct[pos+1] != '[') or (struct[pos+1] != '.'):
                PK_rank -= 1
            elif (pos == len(pos)) and (struct[pos-1] != '[') or (struct[pos-1] != '.'):
                PK_rank -= 1
        elif char == ']':
            if (pos == 0) and ((struct[pos+1] != ']') or (struct[pos+1] != '.')):
                PK_rank -= 1
            elif (pos != 0) and (struct[pos-1] != ']') or (struct[pos-1] != '.'):
                PK_rank-= 1
            elif (pos != 0) and (struct[pos+1] != ']') or (struct[pos+1] != '.'):
                PK_rank -= 1
            elif (pos == len(pos)) and (struct[pos-1] != ']') or (struct[pos-1] != '.'):
                PK_rank -= 1
    return PK_rank

# rank PKs on consensus with other predictions

def get_bp_list(dotbracket):
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    return bp_list

def compare_bp_lists(bp_list1, bp_list2):
    bp_list_score = 0
    for bp1 in bp_list1: 
        for bp2 in bp_list2: 
            if bp1 == bp2: 
                bp_list_score += 1
        # divide by total number of base pairs in bp_list1 to normalize results
    return bp_list_score/len(bp_list1)

def get_consensus_scores(start_locs1, bp_lists1, start_locs2, bp_lists2):
    scores = []
    for i, loc1 in enumerate(start_locs1): 
        for idx, loc2 in enumerate(start_locs2):
            if loc1 == loc2:
                bp_list1 = bp_lists1[i]
                bp_list2 = bp_lists2[idx]
                bp_list_score = compare_bp_lists(bp_list1, bp_list2)
                scores.append(bp_list_score)
        else:
            scores.append(0)
    return scores

# create new dataframe with rankings

def get_df(starts, ends, sequences, dotbrackets, shape_scores, pk_shape_scores, pk_bp_shape_scores, ranks, consensus_scores, pk_consensus_scores, bp_pk_consensus_scores):
    PK_list = zip(starts, ends, sequences, dotbrackets, shape_scores, pk_shape_scores, pk_bp_shape_scores, ranks, consensus_scores, pk_consensus_scores, bp_pk_consensus_scores)
    df = pd.DataFrame(PK_list, columns = ['start', 'end', 'sequence', 'structure', 'shape_score', 'pk_shape_score','pk_bp_shape_score', 'rank', 'consensus_score', 'pk_consensus_score', 'bp_pk_consensus_score'])
    ranked_df = df.sort_values('rank', ascending=False)
    return ranked_df

# get location of regions containing pseudoknots

def get_pk_locs(dotbracket, start):
    list = []
    all_pk_starts = []
    all_pk_ends = []
    for i, char in enumerate(dotbracket):
        if char == '[':
            list.append(i)
            pk_start = start + i
            all_pk_starts.append(pk_start)
        elif char == ']':
            list.append(i)
            pk_end = start + i 
            all_pk_ends.append(pk_end)   
    
    pk_start = min(all_pk_starts)
    pk_end = max(all_pk_ends)
    
    window_locs = []
    window_locs.append(min(list))
    window_locs.append(max(list))
    
    pk_dotbracket = dotbracket[window_locs[0]:window_locs[1]+1]
    
    return pk_start, pk_end, window_locs, pk_dotbracket

#get shape data for pk locations

def get_pk_shape(full_shape, pk_start, pk_end):
    pk_shape = full_shape[pk_start-1:pk_end]
    return pk_shape
        
# get structure and location for only pk bps

def get_bp_pk_locs(dotbracket, start):
    locs = []
    struct = []
    for i, char in enumerate(dotbracket):
        if char == '[':
            locs.append(i)
            struct.append(char)
        elif char == ']':
            locs.append(i)
            struct.append(char)
    return locs, struct

# get shape data for only pk bps

def get_bp_pk_shape(full_shape, locs):
    shapes = []
    for loc in locs: 
        shape = full_shape[loc]
        shape = float(shape)
        shapes.append(shape)
    return shapes

# get consensus score for only pk bps

def compare_bp_pks(struct1, struct2):
    bp_count = 0
    score = 0
    for i, char in enumerate(struct1):
        if char == '[' or char == ']':
            bp_count += 1
            if char == struct2[i]:
                score += 1
    return score/bp_count

def get_bp_pk_consensus_scores(starts, dotbrackets, starts2, dotbrackets2):
    bp_pk_consensus_scores = []
    for i, start in enumerate(starts):
        for i2, start2 in enumerate(starts2):
            if start == start2: 
                consensus_score = compare_bp_pks(dotbrackets[i], dotbrackets2[i2])
                bp_pk_consensus_scores.append(consensus_score)
        else: 
            bp_pk_consensus_scores.append(0)
    return bp_pk_consensus_scores
    
# put it all together

def score_pk_overall(csv, shape_file, csv2):
    df = get_csv(csv)
    starts, ends, sequences, dotbrackets = get_info(df)
    
    # get rough score for consensus with shape data for entire window
    
    full_shape = get_shape_data(shape_file)
    shapes = []
    for i, start in enumerate(starts):
        end = ends[i]
        shape_window = full_shape[start:end]
        shapes.append(shape_window)
    
    shape_scores = []
    for i, struct in enumerate(dotbrackets):
        shape = shapes[i]
        score = evaluate_L1_shape_score(struct, shape)
        shape_scores.append(score)
        
    # get rough score for consensus with shape data for pk region
    
    pk_starts = []
    pk_ends = []
    window_pk_locs = []
    pk_dotbrackets = []
    
    for i, struct in enumerate(dotbrackets):
        start = starts[i]
        pk_start, pk_end, window_pk_loc, pk_dotbracket = get_pk_locs(struct, start)
        pk_starts.append(pk_start)
        pk_ends.append(pk_end)
        window_pk_locs.append(window_pk_loc)
        pk_dotbrackets.append(pk_dotbracket)
    
    pk_shapes = []
    for i, start in enumerate(pk_starts):
        end = pk_ends[i] + 1
        pk_shape = get_pk_shape(full_shape, start, end)
        pk_shapes.append(pk_shape)
        
    pk_shape_scores = []
    for i, struct in enumerate(pk_dotbrackets):
        shape = pk_shapes[i]
        score = evaluate_L1_shape_score(struct, shape)
        pk_shape_scores.append(score)
    
    # get score for shape consensus with only pk bps
    
    bp_locs_in_window = []
    bps_in_window = []
    for i, struct in enumerate(dotbrackets): 
        locs, struct = get_bp_pk_locs(struct, starts[i])
        bp_locs_in_window.append(locs)
        bps_in_window.append(struct)
        
    bp_pk_shape = []
    for locs in bp_locs_in_window:
        shape = get_bp_pk_shape(full_shape, locs)
        bp_pk_shape.append(shape)
        
    bp_pk_shape_scores = []
    for i, struct in enumerate(bps_in_window):
        shape = bp_pk_shape[i]
        score = evaluate_L1_shape_score(struct, shape)
        bp_pk_shape_scores.append(score)
    
    # get rough ranking for likelihood of PK
        
    ranks = []
    for struct in dotbrackets: 
        rank = get_PK_rank(struct)
        ranks.append(rank)
        
    # get consensus score with other predictions
    
    df2 = get_csv(csv2)
    starts2, ends2, sequences2, dotbrackets2 = get_info(df2)
    
    bp_lists1 = []
    for dotbracket in dotbrackets: 
        bp_list = get_bp_list(dotbracket)
        bp_lists1.append(bp_list)
        
    bp_lists2 = []
    for dotbracket in dotbrackets2: 
        bp_list = get_bp_list(dotbracket)
        bp_lists2.append(bp_list)
        
    consensus_scores = get_consensus_scores(starts, bp_lists1, starts2, bp_lists2)
    
    # get consensus score for pk regions specifically 
    
    pk_starts2 = []
    pk_ends2 = []
    pk_dotbrackets2 = []
    for i, struct in enumerate(dotbrackets2):
        start = starts2[i]
        pk_start, pk_end, window_pk_loc, pk_dotbracket = get_pk_locs(struct, start)
        pk_starts2.append(pk_start)
        pk_ends2.append(pk_end)
        pk_dotbrackets2.append(pk_dotbracket)
        
    pk_bp_lists1 = []
    for dotbracket in pk_dotbrackets:
        bp_list = get_bp_list(dotbracket)
        pk_bp_lists1.append(bp_list)
        
    pk_bp_lists2 = []
    for dotbracket in pk_dotbrackets2: 
        bp_list = get_bp_list(dotbracket)
        pk_bp_lists2.append(bp_list)
        
    pk_consensus_scores = get_consensus_scores(pk_starts, pk_bp_lists1, pk_starts2, pk_bp_lists2)
    
    # get consensus scores for pk bps only 
    
    bp_pk_consensus_scores = get_bp_pk_consensus_scores(starts, dotbrackets, starts2, dotbrackets2)
    
    # put it all together into a dataframe
        
    df = get_df(starts, ends, sequences, dotbrackets, shape_scores, pk_shape_scores, bp_pk_shape_scores, ranks, consensus_scores, pk_consensus_scores, bp_pk_consensus_scores)
    return df

In [106]:
knotty_df = score_pk_overall('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv', '/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/incarnato_invivo_reactivity-Copy1.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv')

In [110]:
knotty_df.to_csv('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_analysis_scores.csv')

In [111]:
pknots_df = score_pk_overall('/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv', '/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/incarnato_invivo_reactivity-Copy1.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv')

In [112]:
pknots_df.to_csv('/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pknots_analysis_scores.csv')

In [108]:
knotty_df

# pseudoknot specific shape ranking
# pseudoknot specific consensus score
# visualize interesting pseudoknots

Unnamed: 0,start,end,sequence,structure,shape_score,pk_shape_score,pk_bp_shape_score,rank,consensus_score,pk_consensus_score,bp_pk_consensus_score
109,5000,5120,UCCACACGCAAGUUGUGGACAUGUCAAUGACAUAUGGACAACAGUU...,(((((((....).)))))).(((((.[[)))))]].((((.(((((...,0.808333,0.666667,0.000000,-3,0.000000,0.368421,0.000000
379,16560,16680,AUUGCAACAUGUGACUGGACAAAUGCUGGUGAUUACAUUUUAGCUA...,.(((((.[[))))).]]......(((.((((.(((.........))...,0.775000,0.800000,0.000000,-3,0.000000,0.000000,0.000000
62,2720,2840,CACCAACAAAGGUUACUUUUGGUGAUGACACUGUGAUAGAAGUGCA...,((((((..(((....)))))))))....((((........))))((...,0.850000,0.818182,1.000000,-3,0.000000,0.000000,0.000000
45,1960,2080,GCUAUAACAAUACUAGAUGGAAUUUCACAGUAUUCACUGAGACUCA...,.............((((((((((.(((.((((.(((.((([[.)))...,0.791667,0.757576,0.250000,-3,0.000000,0.000000,0.000000
132,6160,6280,GAUGUGGUGGCUAUUGAUUAUAAACACUACACACCCUCUUUUAAGA...,..(((((((...............)))))))....(((((((.[[)...,0.716667,0.818182,0.750000,-3,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
141,6520,6640,AUUACAGAAGAGGUUGGCCACACAGAUCUAAUGGCUGCUUAUGUAG...,.....(([[[.[[[[(((((..[[[[[[[[[))))).......[[[...,0.833333,0.846154,0.530303,-38,0.000000,0.000000,0.000000
273,12080,12200,CAACCUUACAAGCUAUAGCCUCAGAGUUUAGUUCCCUUCCAUCAUA...,.........((((([[[[[.[[[[[[[............[[[[[[....,0.758333,0.663265,0.468750,-38,0.181818,0.000000,0.000000
472,20560,20680,UUUCUAAGGUUGUCAAAGUGACUAUUGACUAUACAGAAAUUUCAUU...,((((((.[[[[[[[[((((((...[[[[[[[....[.[[[.)))))...,0.516667,0.486486,0.578125,-39,0.166667,0.000000,0.285714
372,16280,16400,AUGUGGUGCUUGCAUACGUAGACCAUUCUUAUGUUGUAAAUGCUGU...,.((((([[[..[[[[[[[[[[[[....[[.[[..[[[..[[[.[[....,0.741667,0.683168,0.387097,-40,0.000000,0.000000,0.000000


In [109]:
pknots_df

Unnamed: 0,start,end,sequence,structure,shape_score,pk_shape_score,pk_bp_shape_score,rank,consensus_score,pk_consensus_score,bp_pk_consensus_score
237,21200,21320,UAAGCUCAUGGGACACUUCGCAUGGUGGACAGCCUUUGUUACUAAU...,...........(((...................................,0.541667,0.857143,0.000000,-2,0.0,0.454545,0.428571
255,22520,22640,UCCAACCAACAGAAUCUAUUGUUAGAUUUCCUAAUAUUACAAACUU...,...........(((((((....)))))))........((((....)...,0.691667,1.000000,0.750000,-3,0.0,0.000000,0.000000
222,20120,20240,AAUUGGAGAAGCCGUAAAAACACAGUUCAAUUAUUAUAAGAAAGUU...,...(((.....)))....(((((...((((((..........))))...,0.508333,0.833333,0.750000,-3,0.0,0.000000,0.000000
115,10320,10440,UAAGGUUGAUACAGCCAAUCCUAAGACACCUAAGUAUAAGUUUGUU...,...((((.....))))........(((((..(((...(((((((((...,0.750000,0.538462,0.750000,-3,0.0,0.000000,0.000000
1,120,240,CACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGA...,.((((((((..((((.....))))))))).)))((.(((((.((((...,0.700000,0.741935,1.000000,-3,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
4,440,560,GCGUUUUGCCUCAACUUGAACAGCCCUAUGUGUUCAUCAAACGUUC...,((.....))((((((.((((((........)))))).....[[[.....,0.716667,0.684211,0.700000,-23,0.2,0.000000,0.000000
92,8800,8920,ACUAAUGACAAAGCUUGCCCAUUGAUUGCUGCAGUCAUAACAAGAG...,....((((([[[[.[[[[[.[[[[.[[[.[[[.))))).....(((...,0.683333,0.626263,0.473684,-24,0.0,0.000000,0.000000
30,3000,3120,UGAGUUUAAAUUGGCUUCACAUAUGUAUUGUUCUUUCUACCCUCCA...,...........((.[[[[[[[....[[[[.[[[[[[....[[[[))...,0.750000,0.698413,0.404762,-25,0.0,0.000000,0.000000
49,4480,4600,UAUAAGGGUAUUAAAAUACAAGAGGGUGUGGUUGAUUAUGGUGCUA...,..((((((((...((((..[[[[[[[[[[[.[[[[[..[[[[[[[[...,0.791667,0.782178,0.520833,-27,0.0,0.000000,0.000000


In [97]:
dotbracket1 = '...(((..[[..)))..]]...'
shape1 = '1110001100110001100111'
start1 = 0

def get_sp_pk_locs(dotbracket, start):
    locs = []
    struct = []
    for i, char in enumerate(dotbracket):
        if char == '[':
            locs.append(i)
            struct.append(char)
        elif char == ']':
            locs.append(i)
            struct.append(char)
    return locs, struct

locs1, struct1 = get_pk_locs(dotbracket1, start1)
print(locs1)
print(struct1)

def get_sp_pk_shape(full_shape, locs):
    shapes = []
    for loc in locs: 
        shape = full_shape[loc]
        shape = float(shape)
        shapes.append(shape)
    return shapes

pk_shape = get_pk_shape(shape1, locs1)
print(pk_shape)

score = evaluate_L1_shape_score(struct1, pk_shape)
print(score)

[8, 9, 17, 18]
['[', '[', ']', ']']
[0.0, 0.0, 0.0, 0.0]
1.0


In [100]:
struct1 = '...(((..[[..)))..]]...)))'
struct2 = '...(((..[...)))...]...)))'
def compare_sp_pks(struct1, struct2):
    bp_count = 0
    score = 0
    for i, char in enumerate(struct1):
        if char == '[' or char == ']':
            bp_count += 1
            if char == struct2[i]:
                score += 1
    return score/bp_count

compare_pks(struct1, struct2)

0.5