In [None]:
## get_consensus_scores takes a list of structures and their location in the genome
    #compares to a separate list of structures and their location within the same genome
    #returns a list of scores for each structure based on 
        #how many base pairs it shares with the corresponding structure on the second list

def get_consensus_scores(program1_starts, bp_lists1, program2_starts, bp_lists2):
    """Score each structure from program 1 against program 2's structures.

    program1_starts[i] is the start location of the structure whose base
    pairs are bp_lists1[i]; program2_starts / bp_lists2 are paired up the
    same way for the second program.

    Returns a list of scores built in program1_starts order:
      * for every entry of program2_starts equal to the current start, the
        value of compare_bp_lists(bp_lists1[i], bp_lists2[idx]) is appended
        (so a start that occurs several times in program2_starts contributes
        several scores);
      * if no entry of program2_starts equals the current start, a single 0
        is appended.
    """
    scores = []
    for i, start1 in enumerate(program1_starts):
        # One scan of the second list both answers "is there a structure at
        # this location?" and collects the indices needed for the comparison.
        matching_indices = [
            idx for idx, start2 in enumerate(program2_starts) if start1 == start2
        ]
        if not matching_indices:
            # no corresponding structure in the second list
            scores.append(0)
            continue
        for idx in matching_indices:
            scores.append(compare_bp_lists(bp_lists1[i], bp_lists2[idx]))
    return scores

# check_if_same_window is a simple boolean that returns True 
    #if there is a structure at a given location within a list of locations
    #I'm sure there is a simpler way to do this within the get_consensus_scores function
        #but this was the solution I came up with first, and I am open to better ways to do this
def check_if_same_window(start1, program2_starts):
    """Return True if start1 equals any entry of program2_starts, else False.

    Always returns a real bool (never None), so the result is safe to store,
    compare, or print as well as to use in an `if`.
    """
    return any(start1 == start2 for start2 in program2_starts)
        
# compare_bp_lists takes two proposed bp_lists corresponding to different structures for the same seq
# returns a score: the number of matching base pairs divided by the number of base pairs in bp_list1
def compare_bp_lists(bp_list1, bp_list2):
    """Return the fraction of bp_list1's base pairs that also occur in bp_list2.

    Every (bp1, bp2) combination that compares equal counts once, and the
    count is divided by len(bp_list1) to normalize the result.

    An empty bp_list1 has nothing to share, so the score is 0.0 rather than a
    ZeroDivisionError.
    """
    total = len(bp_list1)
    if total == 0:
        return 0.0
    # Equality-based counting (rather than sets) so base pairs stored as
    # unhashable objects such as lists are still supported.
    shared = sum(1 for bp1 in bp_list1 for bp2 in bp_list2 if bp1 == bp2)
    return shared / total

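# average_consensus takes as input a dataframe of pseudoknots and returns a dataframe holding,
    #for each start location that appears more than once, the average of its consensus scores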
def average_consensus(df):
    """Average the consensus scores of every location predicted more than once.

    df must provide the columns 'start', 'weighted_consensus_score' and
    'weighted_pk_bp_consensus_score'.

    Returns a new dataframe with the columns 'location',
    'average_consensus_score' and 'average_pk_bp_consensus_score'. It holds
    one row per distinct 'start' value that occurs in more than one row of
    df, in order of first appearance. Starts that occur only once are
    left out.
    """
    rows = []
    # drop_duplicates keeps the first occurrence, so the original row order
    # of the start locations is preserved.
    for start in df['start'].drop_duplicates().to_list():
        # Read from the df argument (not a notebook-level global) so the
        # function works on whatever dataframe the caller passes in.
        window = df[df['start'] == start]
        n_predictions = len(window)
        if n_predictions < 2:
            # only locations predicted multiple times are averaged
            continue
        scores = window['weighted_consensus_score'].to_list()
        pk_bp_scores = window['weighted_pk_bp_consensus_score'].to_list()
        rows.append(
            (start, sum(scores) / n_predictions, sum(pk_bp_scores) / n_predictions)
        )

    return pd.DataFrame(
        rows,
        columns=['location', 'average_consensus_score', 'average_pk_bp_consensus_score'],
    )