# Extensions

The purpose of this notebook is to test everything related to generating extensions among hits

In [1]:
# Standard-library modules used to extend the import search path.
import os
import sys

# Make the parent directory importable (presumably where testing_utils lives — confirm).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Also expose ../hypedsearch/src (presumably where the database module lives — confirm).
module_path = os.path.abspath(os.path.join('..', 'hypedsearch', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Project-local modules; presumably resolved through the sys.path entries added above.
import testing_utils
import database

import operator

# Tolerance passed to testing_utils.preprocess_input_spectra and
# testing_utils.append_correct_hits (presumably parts-per-million — confirm).
ppm_tolerance = 20
# Length limit passed to testing_utils.modified_match_masses.
max_peptide_length = 20

import matplotlib.pyplot as plt

In [2]:
# All datasets known to the testing helpers.
datasets = testing_utils.define_data()

# Only the first dataset is used in this notebook.
dataset = datasets[0]

# Element 0 of the dataset is used as the path of the input spectra.
input_spectra_path = dataset[0]
input_spectra, boundaries = testing_utils.preprocess_input_spectra(input_spectra_path, ppm_tolerance)

# Expected sequences for the same dataset; indexed by spectrum number in later cells.
correct_sequences = testing_utils.generate_truth_set(datasets[0])

# Element 2 of the dataset is used as the path handed to database.build.
path = dataset[2]
db = database.build(path)

Loading spectra...
Done


In [None]:
# Match the spectrum boundaries against the protein database.
# The third return value is bound to `matched_database` instead of `database`
# so that the imported `database` module is not shadowed; shadowing it makes
# re-running the earlier `database.build(path)` cell fail.
matched_masses_b, matched_masses_y, matched_database = testing_utils.modified_match_masses(boundaries, db, max_peptide_length)
# matched_masses_b, matched_masses_y, matched_database = match_masses(boundaries, db, max_peptide_length)
print('Finished matching masses')

On protein 279/279 [100%]
Sorting the set of protein masses...
Sorting the set of protein masses done
Performing Merge


# Getting initial hits

In [None]:
# Index of the spectrum examined in the cells below.
spectrum_num = 19

# Expected sequence for that spectrum, printed for reference.
correct_sequence = correct_sequences[spectrum_num]
print(correct_sequence)

# The input spectrum at the same index.
input_spectrum = input_spectra[spectrum_num]

In [None]:
# Collect the b- and y-side hits for the selected spectrum from the matched masses.
b_hits, y_hits, b_set, y_set, misses = testing_utils.find_hits(boundaries, input_spectrum, spectrum_num, matched_masses_b, matched_masses_y)
# NOTE(review): correct_hits is not referenced again in the visible cells.
correct_hits = testing_utils.append_correct_hits(correct_sequence, input_spectrum, ppm_tolerance)
# Hand both hit lists to write_hits (presumably persisted for create_clusters to read — confirm).
testing_utils.write_hits(b_hits, y_hits)
# Build the b-side clusters, then rank them.
ion = 'b'
testing_utils.create_clusters(ion)
b_sorted_clusters = testing_utils.sort_clusters_by_post_prob(ion, boundaries, matched_masses_b, matched_masses_y)
# Same two steps for the y side.
ion = 'y'
testing_utils.create_clusters(ion)
y_sorted_clusters = testing_utils.sort_clusters_by_post_prob(ion, boundaries, matched_masses_b, matched_masses_y)

# Printing hits

In [None]:
# Rank the b clusters by score, using post_prob, pid and prior as tie-breakers (all descending).
# b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('post_prob', 'score', 'pid', 'prior'), reverse = True)
b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('score', 'post_prob', 'pid', 'prior'), reverse = True)
testing_utils.write_b_sorted_cluster(b_sorted_clusters)
# Slice rather than index over range(0, 50): with fewer than 50 clusters the
# indexed loop raised IndexError, while a slice simply yields what is there.
for b_cluster in b_sorted_clusters[:50]:
    print(b_cluster.post_prob, b_cluster.score, b_cluster.seq)

In [None]:
# NOTE(review): the y clusters are written BEFORE the re-sort below, whereas the
# b-side cell writes after sorting — confirm which order is intended.
testing_utils.write_y_sorted_cluster(y_sorted_clusters)
# Rank the y clusters by score, using post_prob, pid and prior as tie-breakers (all descending).
# y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('post_prob', 'score', 'pid', 'prior'), reverse = True)
y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('score', 'post_prob', 'pid', 'prior'), reverse = True)
# Slice rather than index over range(0, 50): with fewer than 50 clusters the
# indexed loop raised IndexError, while a slice simply yields what is there.
for y_cluster in y_sorted_clusters[:50]:
    print(y_cluster.post_prob, y_cluster.score, y_cluster.seq)

# Filtering data by parent protein

I want to be able to only view the b or y hits from a certain protein

In [None]:
# b_hits
# Keep only the b clusters whose pid matches the protein of interest.
target_pid = 274
b_target_clusters = [cluster for cluster in b_sorted_clusters if cluster.pid == target_pid]

# One tab-separated row per cluster: score, post_prob, pid, seq, assessment,
# followed by every entry of cluster.indices.
for cluster in b_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'b', correct_sequence)
    summary_fields = [str(cluster.score), str(cluster.post_prob), str(cluster.pid), cluster.seq, str(assessment)]
    index_fields = [str(o) for o in cluster.indices]
    print('\t'.join(summary_fields) + '\t' + '\t'.join(index_fields))


In [None]:
# y_hits
# Keep only the y clusters whose pid matches target_pid (set in the b_hits cell).
y_target_clusters = [cluster for cluster in y_sorted_clusters if cluster.pid == target_pid]

# One tab-separated row per cluster: score, post_prob, pid, seq, assessment,
# followed by every entry of cluster.indices.
for cluster in y_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'y', correct_sequence)
    summary_fields = [str(cluster.score), str(cluster.post_prob), str(cluster.pid), cluster.seq, str(assessment)]
    index_fields = [str(o) for o in cluster.indices]
    print('\t'.join(summary_fields) + '\t' + '\t'.join(index_fields))

# Finding non-hybrid interesting Combos

In [None]:
def get_top(b_clusters, y_clusters, top_num):
    """Return the leading entries of both cluster sequences.

    At most ``top_num`` items are taken from the front of each sequence; a
    sequence shorter than ``top_num`` is returned in full. New lists are
    built, so the inputs are left untouched.

    Returns:
        A ``(filtered_b, filtered_y)`` tuple of lists.
    """
    b_count = min(top_num, len(b_clusters))
    y_count = min(top_num, len(y_clusters))
    filtered_b = [b_clusters[pos] for pos in range(b_count)]
    filtered_y = [y_clusters[pos] for pos in range(y_count)]
    return filtered_b, filtered_y

In [None]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_target_clusters, y_target_clusters, 50)
target_precursor = input_spectrum.precursor_mass
# Start with printing overlapping. Then will incorporate boundary overlaps between last of b and first of y
for b_cluster in filtered_b:
    # Pair this b cluster with every y cluster that passes the start/end check,
    # then print the pairings for this b cluster on one line.
    interesting_combos = [
        b_cluster.seq + '-' + y_cluster.seq
        for y_cluster in filtered_y
        if b_cluster.start <= y_cluster.end
    ]
    print(interesting_combos)
# # Calculating start and end indices for each interval
# for b_cluster in filtered_b:
#     interesting_combos = []
#     b_index_set = set()
#     for index in b_cluster.indices:
#         index = index.replace('(', '')
#         index = index.replace(')', '')
#         A = index.rstrip().split(',')
#         b_index_set.add(int(A[0]))
#         b_index_set.add(int(A[1]))
    
#     b_start_pos = min(b_index_set)
#     b_end_pos = max(b_index_set)
    
#     for y_cluster in filtered_y:
#         y_index_set = set()
#         for index in y_cluster.indices:
#             index = index.replace('(', '')
#             index = index.replace(')', '')
#             A = index.rstrip().split(',')
#             y_index_set.add(int(A[0]))
#             y_index_set.add(int(A[1]))
            
#         y_start_pos = min(y_index_set)
#         y_end_pos = max(y_index_set)
        
#         if (b_start_pos <= y_start_pos) and (b_end_pos == y_start_pos - 1):
#             interesting_combos.append(b_cluster)
#             interesting_combos.append(y_cluster)
    
#     print(interesting_combos)

# Finding optimal "hybrid" combos

* "Hybrid" is in quotation marks because every output is built as a hybrid; afterwards we can check whether it is actually a non-hybrid

In [None]:
def parse_indices(index_set):
    """Turn index entries into ``(start, end, seq, mz)`` tuples.

    Each entry is converted to its string form and split on commas. The first
    four fields are stripped of spaces, parentheses and single quotes, then
    converted to ``int``, ``int``, ``str`` and ``float`` respectively. Any
    further fields are ignored.

    Returns:
        A list with one tuple per entry, in iteration order.
    """
    strip_table = str.maketrans('', '', " ()'")
    parsed = []
    for entry in index_set:
        fields = str(entry).rstrip().split(',')
        start, end, seq, mz = (fields[pos].translate(strip_table) for pos in range(4))
        parsed.append((int(start), int(end), seq, float(mz)))
    return parsed

def calc_combined_score(b_indices, y_indices, b_score, y_score, overlap_penalty=2):
    """Score a b/y pairing by counting distinct masses and penalizing repeats.

    The b entries are visited first, then the y entries. The mass of an entry
    is ``float(entry[3])``. The first occurrence of a mass adds 1 to the
    score; every later occurrence of that mass (on either side) subtracts
    ``overlap_penalty``.

    Args:
        b_indices: Entries for the b side; element 3 of each must be convertible to float.
        y_indices: Entries for the y side, same layout.
        b_score: Unused; kept so existing callers keep working.
        y_score: Unused; kept so existing callers keep working.
        overlap_penalty: Amount subtracted per repeated mass. Defaults to 2,
            the value that was previously hard-coded.

    Returns:
        The combined score.
    """
    # A set gives constant-time membership checks; the previous list scan was
    # linear per entry.
    seen_masses = set()
    combined_score = 0
    for side in (b_indices, y_indices):
        for index in side:
            current_mass = float(index[3])
            if current_mass in seen_masses:
                combined_score -= overlap_penalty
            else:
                seen_masses.add(current_mass)
                combined_score += 1
    return combined_score

In [None]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_sorted_clusters, y_sorted_clusters, 50)
target_precursor = input_spectrum.precursor_mass
# Parse every y cluster once up front instead of once per (b, y) pair.
parsed_y = [(y_cluster, parse_indices(y_cluster.indices)) for y_cluster in filtered_y]
# Start with printing overlapping. Then will incorporate boundary overlaps between last of b and first of y
interesting_combos = []
for b_cluster in filtered_b:
    # The b-side parse does not depend on the y cluster, so do it once per b cluster.
    b_indices = parse_indices(b_cluster.indices)
    for y_cluster, y_indices in parsed_y:
        if b_cluster.start <= y_cluster.end:
            seq = b_cluster.seq + '-' + y_cluster.seq
            score = calc_combined_score(b_indices, y_indices, b_cluster.score, y_cluster.score)
            interesting_combos.append((seq, score))
# Highest combined score first.
interesting_combos.sort(key=lambda a: a[1], reverse=True)
# A plain loop instead of `[print(x) for x in ...]`: as the last expression of
# the cell, that comprehension also echoed a list of None values.
for combo in interesting_combos:
    print(combo)
            
            
# # Calculating start and end indices for each interval
# for b_cluster in filtered_b:
#     interesting_combos = []
#     b_index_set = set()
#     for index in b_cluster.indices:
#         index = index.replace('(', '')
#         index = index.replace(')', '')
#         A = index.rstrip().split(',')
#         b_index_set.add(int(A[0]))
#         b_index_set.add(int(A[1]))
    
#     b_start_pos = min(b_index_set)
#     b_end_pos = max(b_index_set)
    
#     for y_cluster in filtered_y:
#         y_index_set = set()
#         for index in y_cluster.indices:
#             index = index.replace('(', '')
#             index = index.replace(')', '')
#             A = index.rstrip().split(',')
#             y_index_set.add(int(A[0]))
#             y_index_set.add(int(A[1]))
            
#         y_start_pos = min(y_index_set)
#         y_end_pos = max(y_index_set)
        
#         if (b_start_pos <= y_start_pos) and (b_end_pos == y_start_pos - 1):
#             interesting_combos.append(b_cluster)
#             interesting_combos.append(y_cluster)
    
#     print(interesting_combos)

# Filtering overlap

Q: What happens if we completely filter out all cases where any mass overlaps?

In [None]:
def _index_mass(index):
    """Return the mass (fourth comma-separated field) of one index entry as a float."""
    # Go through str() and strip " ()'" exactly as parse_indices does, so both
    # tuple entries and their string form are handled the same way.
    fields = str(index).rstrip().split(',')
    return float(fields[3].translate(str.maketrans('', '', " ()'")))


def filter_by_validity(b_cluster, y_cluster):
    """Return True when the two clusters have no mass in common.

    The previous version compared ``b[3] == y[3]`` on the raw entries of
    ``cluster.indices``. For string entries that compares the fourth
    *character*, not the mass, so the mass is now extracted explicitly.
    Tuple entries give the same answer as before.

    Args:
        b_cluster: Object whose ``indices`` attribute holds the b-side entries.
        y_cluster: Object whose ``indices`` attribute holds the y-side entries.

    Returns:
        False if any b mass equals any y mass, True otherwise (including when
        either side has no entries).
    """
    b_masses = {_index_mass(index) for index in b_cluster.indices}
    y_masses = {_index_mass(index) for index in y_cluster.indices}
    return b_masses.isdisjoint(y_masses)

def brutal_calc_combined_score(b_indices, y_indices, b_score, y_score):
    """Score a b/y pairing with a very large penalty for repeated masses.

    The b entries are visited first, then the y entries. The mass of an entry
    is ``float(entry[3])``. The first occurrence of a mass adds 1; every later
    occurrence subtracts 2000. ``b_score`` and ``y_score`` are accepted but
    not used.

    Returns:
        The combined score.
    """
    seen = []
    total = 0
    for side in (b_indices, y_indices):
        for entry in side:
            mass = float(entry[3])
            if mass in seen:
                total -= 2000  # Little bit of a hack
                continue
            seen.append(mass)
            total += 1
    return total

In [None]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_sorted_clusters, y_sorted_clusters, 50)
target_precursor = input_spectrum.precursor_mass
# Start with printing overlapping. Then will incorporate boundary overlaps between last of b and first of y
interesting_combos = []
for b_cluster in filtered_b:
    for y_cluster in filtered_y:
        # Alternative kept for reference: score every pair with
        # brutal_calc_combined_score(parse_indices(b_cluster.indices),
        #                            parse_indices(y_cluster.indices),
        #                            b_cluster.score, y_cluster.score)
        # instead of dropping pairs that share a mass.
        # The parsed indices were only needed by that alternative, so they are
        # no longer computed here.
        if b_cluster.start <= y_cluster.end and filter_by_validity(b_cluster, y_cluster):
            seq = b_cluster.seq + '-' + y_cluster.seq
            interesting_combos.append((seq, b_cluster.score + y_cluster.score))
# Highest summed score first.
interesting_combos.sort(key=lambda a: a[1], reverse=True)
# A plain loop instead of `[print(x) for x in ...]`: as the last expression of
# the cell, that comprehension also echoed a list of None values.
for combo in interesting_combos:
    print(combo)

# Over all datasets

* Want to know:
    * What do good hits look like when put through extensions?
    * Only factor in the hits for which there exists a good hit from both the b and the y side. It doesn't make sense to talk about extensions for which we don't have good initial hits on both sides
        * How often does this happen?
        * Do we lose out on good combinations after penalizing double counting?