In [1]:
# Pipeline outline:
# 1. Read the CSVs of pseudoknots (pks) predicted by each program.
# 2. Build lists holding every field (name of program, structure, sequence, start, end).
# 3. Get the shape score for each whole structure and for each pk.
# 4. Get the rank of each pk.
# 5. Compare all programs to all programs:
#    for each base pair in a window, add a point if every other program also predicted
#    the same base pair (or also predicted that the base is not paired).
# 6. Weighted mean: get the consensus scores for each pk, then find the weighted average.

In [21]:
##### compare all to all
import pandas as pd 

# Toy example: one sequence and three dot-bracket predictions for the same window.
seq = 'AAAAGGGAACCAACCCAAGGAAAA'
struct1 = '....(((..[[..)))..]]....'
struct2 = '....(((..[...)))...]....'
struct3 = '....(((...[..)))..].....'

# Base pairs of each prediction as [i, j] index pairs into the dot-bracket strings.
bp_list1 = [[4, 15], [5, 14], [6, 13], [9, 19], [10, 18]]
bp_list2 = [[4, 15], [5, 14], [6, 13], [9, 19]]
bp_list3 = [[4, 15], [5, 14], [6, 13], [10, 18]]

# Window start recorded for each prediction (not used further in this cell).
start1 = 40
start2 = 40
start3 = 40

# One entry per prediction ...
bp_lists = [bp_list1, bp_list2, bp_list3]
# ... and the same base pairs flattened into a single list, in prediction order.
all_bp_lists = [bp for bp_list in bp_lists for bp in bp_list]

print(all_bp_lists)
print(bp_lists)

[[4, 15], [5, 14], [6, 13], [9, 19], [10, 18], [4, 15], [5, 14], [6, 13], [9, 19], [4, 15], [5, 14], [6, 13], [10, 18]]
[[[4, 15], [5, 14], [6, 13], [9, 19], [10, 18]], [[4, 15], [5, 14], [6, 13], [9, 19]], [[4, 15], [5, 14], [6, 13], [10, 18]]]


In [3]:
#### this function takes as input a list of dotbracket structures and bp_lists for pseudoknots predicted in the same window
#### returns as output a score for consensus of all to all for each structure

def all_to_all(structs_list, bp_lists):
    """Score how well each structure agrees with every structure in the window.

    For each structure, one point is earned for every unpaired position ('.')
    that is also '.' in every structure of ``structs_list``, and one point for
    every base pair of that structure that occurs in every list of
    ``bp_lists``. The points are divided by the structure's own number of
    unpaired positions plus its own number of base pairs.

    Parameters
    ----------
    structs_list : list of str
        Dot-bracket strings predicted for the same window.
    bp_lists : list of list
        ``bp_lists[k]`` holds the base pairs of ``structs_list[k]``; pairs are
        compared with ``==``.

    Returns
    -------
    list of float
        One score per structure, in input order. A structure with neither
        unpaired positions nor base pairs scores 0.0 instead of raising
        ZeroDivisionError.
    """
    scores = []
    for idx, struct in enumerate(structs_list):
        own_bps = bp_lists[idx]

        # A position beyond the end of a shorter structure cannot agree, so it
        # simply earns no point (rather than raising IndexError).
        unpaired_agree = sum(
            1
            for i, char in enumerate(struct)
            if char == '.'
            and all(i < len(other) and other[i] == '.' for other in structs_list)
        )

        # Membership is tested per list, so a pair repeated inside one list
        # can neither fake nor hide agreement with the other lists.
        paired_agree = sum(
            1
            for bp in own_bps
            if all(bp in other_bps for other_bps in bp_lists)
        )

        total = struct.count('.') + len(own_bps)
        scores.append((unpaired_agree + paired_agree) / total if total else 0.0)

    return scores

In [4]:
# `structs` was never defined in this notebook, so this cell failed on a fresh
# kernel. Build it from the toy structures, in the same order as `bp_lists`.
structs = [struct1, struct2, struct3]
all_to_all(structs, bp_lists)

[0.8947368421052632, 0.85, 0.85]

In [17]:
## getting info from lists of csvs 

def get_csv(csv_loc):
    """Read the CSV at ``csv_loc`` with pandas' default settings and return the DataFrame."""
    return pd.read_csv(csv_loc)

def get_info(df):
    """Pull the per-row fields out of a results DataFrame as plain lists.

    Reads the 'start', 'end', 'sequence' and 'struct' columns and returns them,
    in that order, as a 4-tuple of lists.
    """
    wanted_columns = ('start', 'end', 'sequence', 'struct')
    starts, ends, sequences, dotbrackets = (
        df[column].to_list() for column in wanted_columns
    )
    return starts, ends, sequences, dotbrackets

def get_info_from_shapeknots(df):
    """Pull the per-row fields out of a ShapeKnots-style results DataFrame.

    Same return shape as ``get_info`` (starts, ends, sequences, dotbrackets),
    with two differences: every 'start' value is reduced by 1, and the
    structure is read from the 'threshknot_structure' column.
    """
    # NOTE(review): the -1 presumably aligns these starts with the other
    # programs' CSVs — confirm the indexing convention against the data.
    shifted_starts = df['start'].sub(1)
    return (
        shifted_starts.to_list(),
        df['end'].to_list(),
        df['sequence'].to_list(),
        df['threshknot_structure'].to_list(),
    )

def check_if_shapeknots(name, shapeknots_names):
    """Return True if ``name`` equals any entry of ``shapeknots_names``, else False.

    Always returns a real bool; previously a miss fell off the end of the
    function and returned None.
    """
    return any(name == program for program in shapeknots_names)

# Programs whose predictions are loaded; each is read from '<name>.csv' in RESULTS_DIR.
names = ['knotty', 'threshknot', 'pknots', 'incarnato_invitro', 'incarnato_invivo', 'zhang_invivo', 'zhang_invitro']
# Subset of `names` whose CSVs are parsed with get_info_from_shapeknots.
shapeknots_names = ['incarnato_invitro', 'incarnato_invivo', 'zhang_invivo', 'zhang_invitro']

# NOTE(review): absolute, machine-specific location — change this one line to
# point at your own copy of the pipeline output.
RESULTS_DIR = '/home/gnye8/Desktop/PK_research/pipeline_results/direct_output'

# Parallel lists: entry k of each list describes the same predicted row.
all_programs = []
all_starts = []
all_ends = []
all_sequences = []
all_dotbrackets = []

for name in names:
    df = get_csv('{}/{}.csv'.format(RESULTS_DIR, name))

    if check_if_shapeknots(name, shapeknots_names):
        starts, ends, sequences, dotbrackets = get_info_from_shapeknots(df)
    else:
        starts, ends, sequences, dotbrackets = get_info(df)

    # Tag every row loaded from this CSV with the program that produced it.
    all_programs.extend([name] * len(starts))
    all_starts.extend(starts)
    all_ends.extend(ends)
    all_sequences.extend(sequences)
    all_dotbrackets.extend(dotbrackets)

print(len(all_programs))
print(len(all_starts))

1323
1323


In [101]:
def compare_bp_lists(bp_list1, bp_list2):
    """Return the fraction of base pairs in ``bp_list1`` that also occur in ``bp_list2``.

    Pairs are compared with ``==``. The count is normalised by
    ``len(bp_list1)``, so the result is a float between 0.0 and 1.0.

    An empty ``bp_list1`` yields 0.0 instead of raising ZeroDivisionError.
    Each pair of ``bp_list1`` is counted at most once, so repeated pairs in
    ``bp_list2`` cannot push the score above 1.
    """
    if not bp_list1:
        return 0.0
    shared = sum(1 for bp1 in bp_list1 if bp1 in bp_list2)
    # divide by total number of base pairs in bp_list1 to normalize results
    return shared / len(bp_list1)

def check_if_same_window(start1, program2_starts):
    """Return True if ``start1`` equals any value in ``program2_starts``, else False.

    Always returns a real bool; previously a miss fell off the end of the
    function and returned None.
    """
    return any(start1 == start2 for start2 in program2_starts)

def get_consensus_scores(program1_starts, bp_lists1, program2_starts, bp_lists2):
    """Score each window of program 1 against the same-start window of program 2.

    Parameters
    ----------
    program1_starts, program2_starts : list
        Window start values of each program; windows are matched by ``==`` on
        these values.
    bp_lists1, bp_lists2 : list
        ``bp_lists1[k]`` / ``bp_lists2[k]`` are the base-pair lists belonging
        to ``program1_starts[k]`` / ``program2_starts[k]``.

    Returns
    -------
    list
        Exactly one score per entry of ``program1_starts``, in order: the
        ``compare_bp_lists`` score against the matching program-2 window, or 0
        when program 2 has no window with that start.

    If program 2 has several windows with the same start, the best score among
    them is used. Previously one score was appended per match, which made the
    result longer than ``program1_starts`` and misaligned it with the windows.
    """
    scores = []
    for i, start1 in enumerate(program1_starts):
        matching = [idx for idx, start2 in enumerate(program2_starts) if start1 == start2]
        if matching:
            scores.append(max(compare_bp_lists(bp_lists1[i], bp_lists2[idx]) for idx in matching))
        else:
            # program 2 predicted nothing in this window
            scores.append(0)
    return scores

def get_weighted_consensus(starts_by_program, bp_lists_by_program):
    """Average each window's consensus score over all other programs.

    Parameters
    ----------
    starts_by_program : list of list
        ``starts_by_program[p]`` holds the window starts of program ``p``.
    bp_lists_by_program : list of list
        ``bp_lists_by_program[p][k]`` is the base-pair list for window ``k`` of
        program ``p``.

    Returns
    -------
    list of float
        For every program in turn, one value per window: the plain mean of the
        ``get_consensus_scores`` results against each other program. The
        per-program results are concatenated into a single flat list.

    When only one program is supplied there is nothing to compare against, so
    each of its windows scores 0.0 (previously this raised IndexError).
    """
    weighted_avgs = []
    for idx1, program1_starts in enumerate(starts_by_program):
        program1_bp_lists = bp_lists_by_program[idx1]

        # One score list per *other* program, each aligned with program1_starts.
        scores_vs_others = [
            get_consensus_scores(program1_starts, program1_bp_lists,
                                 program2_starts, bp_lists_by_program[idx2])
            for idx2, program2_starts in enumerate(starts_by_program)
            if idx1 != idx2
        ]

        if not scores_vs_others:
            weighted_avgs.extend(0.0 for _ in program1_starts)
            continue

        for i in range(len(scores_vs_others[0])):
            window = [program_scores[i] for program_scores in scores_vs_others]
            weighted_avgs.append(sum(window) / len(window))

    return weighted_avgs

In [102]:
# Toy data for get_weighted_consensus: three programs, windows identified by start.
# NOTE: this cell rebinds `starts` and `bp_lists`, which earlier cells also define.
program1_starts = [40, 50, 60]
program2_starts = [40, 50, 60]
program3_starts = [40, 50]

# Program 1: one base-pair list per window.
program1_bp_list1 = [[1, 5], [2, 4]]
program1_bp_list2 = [[1, 9], [3, 7]]
program1_bp_list3 = [[3, 6], [2, 7]]

# Program 2: differs from program 1 only in the second window.
program2_bp_list1 = [[1, 5], [2, 4]]
program2_bp_list2 = [[1, 9], [3, 8]]
program2_bp_list3 = [[3, 6], [2, 7]]

# Program 3: has no window starting at 60.
program3_bp_list1 = [[1, 5], [2, 4]]
program3_bp_list2 = [[1, 9], [3, 7]]

starts = [program1_starts, program2_starts, program3_starts]

program1_bp_lists = [program1_bp_list1, program1_bp_list2, program1_bp_list3]
program2_bp_lists = [program2_bp_list1, program2_bp_list2, program2_bp_list3]
program3_bp_lists = [program3_bp_list1, program3_bp_list2]

bp_lists = [program1_bp_lists, program2_bp_lists, program3_bp_lists]

print(starts)
print(bp_lists)

[[40, 50, 60], [40, 50, 60], [40, 50]]
[[[[1, 5], [2, 4]], [[1, 9], [3, 7]], [[3, 6], [2, 7]]], [[[1, 5], [2, 4]], [[1, 9], [3, 8]], [[3, 6], [2, 7]]], [[[1, 5], [2, 4]], [[1, 9], [3, 7]]]]


In [103]:
# Score the toy data: one mean consensus value per window, with the windows of
# program 1 first, then program 2, then program 3.
get_weighted_consensus(starts, bp_lists)

[1.0, 0.75, 0.5, 1.0, 0.5, 0.5, 1.0, 0.75]