# Extensions

The purpose of this notebook is to test everything related to generating extensions among hits

In [1]:
import os
import sys

# Make the repository root and the hypedsearch sources importable, in that order.
for _relative_parts in (('..',), ('..', 'hypedsearch', 'src')):
    module_path = os.path.abspath(os.path.join(*_relative_parts))
    if module_path not in sys.path:
        sys.path.append(module_path)
    
import testing_utils
import database

import operator

ppm_tolerance = 20
max_peptide_length = 20

import matplotlib.pyplot as plt

In [2]:
# Load the dataset definitions and work with the first one.
datasets = testing_utils.define_data()

dataset = datasets[0]

# Element 0 of the dataset is used as the path to the input spectra.
input_spectra_path = dataset[0]
input_spectra, boundaries = testing_utils.preprocess_input_spectra(input_spectra_path, ppm_tolerance)

# datasets[0] is the same object as `dataset` above.
# Presumably this yields the known-correct sequence for each spectrum — TODO confirm.
correct_sequences = testing_utils.generate_truth_set(datasets[0])

# Element 2 of the dataset is the path handed to database.build.
path = dataset[2]
db = database.build(path)

Loading spectra...
Done


In [3]:
# NOTE(review): the third return value is bound to `database`, which rebinds the name of the
# imported `database` module. After this cell has run, re-running the earlier
# `db = database.build(path)` cell will call `.build` on this return value instead of the
# module until the kernel is restarted — consider binding it to a different name.
matched_masses_b, matched_masses_y, database = testing_utils.modified_match_masses(boundaries, db, max_peptide_length)
# Previous call, kept for reference:
# matched_masses_b, matched_masses_y, database = match_masses(boundaries, db, max_peptide_length)
print('Finished matching masses')

On protein 279/279 [100%]
Sorting the set of protein masses...
Sorting the set of protein masses done
Performing Merge
Done
Finished matching masses


# Getting initial hits

In [14]:
# Index of the spectrum inspected by the cells below.
spectrum_num = 0

# Entry of the truth set at that index, printed for reference.
correct_sequence = correct_sequences[spectrum_num]
print(correct_sequence)

# Preprocessed input spectrum at the same index.
input_spectrum = input_spectra[spectrum_num]

DPQVEQLEL


In [15]:
# Find the b/y hits for the selected spectrum and hand them to write_hits.
b_hits, y_hits, b_set, y_set, misses = testing_utils.find_hits(
    boundaries, input_spectrum, spectrum_num, matched_masses_b, matched_masses_y
)
correct_hits = testing_utils.append_correct_hits(correct_sequence, input_spectrum, ppm_tolerance)
testing_utils.write_hits(b_hits, y_hits)

# Create and rank the clusters one ion type at a time: 'b' first, then 'y'.
# NOTE(review): create_clusters presumably consumes what write_hits produced — TODO confirm.
sorted_clusters_by_ion = {}
for ion in ('b', 'y'):
    testing_utils.create_clusters(ion)
    sorted_clusters_by_ion[ion] = testing_utils.sort_clusters_by_post_prob(
        ion, boundaries, matched_masses_b, matched_masses_y
    )
b_sorted_clusters = sorted_clusters_by_ion['b']
y_sorted_clusters = sorted_clusters_by_ion['y']

Done


# Printing hits

In [16]:
# Rank the b clusters by score first, then post_prob, pid and prior (all descending).
# Alternative ordering (post_prob first), kept for reference:
# b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('post_prob', 'score', 'pid', 'prior'), reverse = True)
b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('score', 'post_prob', 'pid', 'prior'), reverse = True)
testing_utils.write_b_sorted_cluster(b_sorted_clusters)
# Show at most the top 50. Slicing (instead of indexing 0..49) avoids an IndexError
# when fewer than 50 clusters are available.
for x in b_sorted_clusters[:50]:
    print(x.post_prob, x.score, x.seq)

0.8674851509837346 7 DPQVEQLE
0.5832415059687787 4 TVFSDFL
0.5689800742992233 4 TQAGVEELDPENKIP
0.5348891652083142 4 PAGDQKDV
0.45824150596877866 4 DPEVQQI
0.42904238878594025 4 GTYFEVKIPSDTFYDN
0.42446275789051985 4 HSLMPMLE
0.381750637069786 4 PDAGAPTSASGLSGHTTL
0.6258241758241758 3 TTFV
0.6258241758241758 3 TTFV
0.5721812218122181 3 TTPGPD
0.5676756625334306 3 KKEECP
0.5570921985815603 3 TTSTRTY
0.549407674478496 3 TTYNSIMK
0.5399705014749262 3 KKDLEEWNQ
0.5390044108380593 3 TTYRTALTY
0.5390044108380593 3 KKEEREAEA
0.5379931876241839 3 TTGFSTEVWQ
0.5300595238095238 3 TTVESNSSWWTN
0.5280657395701643 3 TTSPNLGTREN
0.5277367773677737 3 TTPATSTTCTAT
0.5265676567656766 3 KKCGWFHPPANE
0.525892857142857 3 TTNPHVFPEGSEP
0.5183733670459334 3 QPDV
0.5178571428571428 3 KHHMY
0.5170879922216821 3 KKEKKSLDSDESED
0.5154565456545654 3 TTQPPAQPASQGSGS
0.5111301625167736 3 TTESVKEQEMKWTDLA
0.5084985835694051 3 TTTPCMLRDSDSILETL
0.4917237934479313 3 KPEGRPGT
0.4917237934479313 3 TLSFSSIS
0.4900793650

In [17]:
# NOTE(review): unlike the b-ion cell, the clusters are written out *before* the re-sort
# below, so they are written in whatever order the list currently has — confirm this is intended.
testing_utils.write_y_sorted_cluster(y_sorted_clusters)  
# Rank the y clusters by score first, then post_prob, pid and prior (all descending).
# Alternative ordering (post_prob first), kept for reference:
# y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('post_prob', 'score', 'pid', 'prior'), reverse = True)
y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('score', 'post_prob', 'pid', 'prior'), reverse = True)
# Show at most the top 50. Slicing (instead of indexing 0..49) avoids an IndexError
# when fewer than 50 clusters are available.
for x in y_sorted_clusters[:50]:
    print(x.post_prob, x.score, x.seq)

0.7857723577235773 5 LDSFSEI
0.6975386779184247 4 CGLYEL
0.6975386779184247 4 HPDSEL
0.5330623306233063 4 LESYGLE
0.483033033033033 4 SSNWVGKGFFAVYEAIC
0.42394212394212394 4 EISSIDEF
0.42394212394212394 4 EIPHSELD
0.41172391549750037 4 FVDLTMPYSV
0.3574670382889561 4 ICLFRLVDDQQLHLNAED
0.6505494505494506 3 HICL
0.6102990033222592 3 ITCI
0.6102990033222592 3 LTCI
0.6102990033222592 3 TLCI
0.5978738652651696 3 FSCL
0.5978738652651696 3 SFCL
0.5978738652651696 3 AYCL
0.5844339622641509 3 NNPEL
0.5734567901234567 3 DPLEEL
0.5734567901234567 3 HVMSEI
0.5721001221001221 3 PENPEI
0.5721001221001221 3 YNVSCI
0.5692411924119242 3 HWEPEI
0.5654639175257732 3 TEEKMEL
0.5606761565836299 3 YQASLEL
0.5606761565836299 3 KPCLFCL
0.5581300813008131 3 ESYGLEL
0.5581300813008131 3 SFSEVEL
0.5581300813008131 3 YGDLTEI
0.5581300813008131 3 FSESVEL
0.5563730084348641 3 SDIAMTEL
0.5515852474927209 3 GYISAAEL
0.549017199017199 3 PIDHLCEL
0.5454301075268817 3 YENLNDQEL
0.5454301075268817 3 IMTERDMEL
0.54543010

# Filtering hits by parent protein

I want to be able to view only the b or y hits that come from a particular parent protein.

In [8]:
# b_hits: keep only the b clusters whose pid matches the protein of interest.
target_pid = 274
b_target_clusters = [candidate for candidate in b_sorted_clusters if candidate.pid == target_pid]

# One tab-separated row per cluster: score, post_prob, pid, seq, assessment, then every index entry.
for cluster in b_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'b', correct_sequence)
    non_indices = '\t'.join(
        [str(cluster.score), str(cluster.post_prob), str(cluster.pid), cluster.seq, str(assessment)]
    )
    print(non_indices + '\t' + '\t'.join(map(str, cluster.indices)))


1	0.20083892617449667	274	GS	False	(70, 71, 'GS', '145.0601806640625')
1	0.14363173619564013	274	LCGP	False	(30, 33, 'LCGP', '186.08746337890625')
1	0.11385083713850837	274	EPKPTQ	False	(18, 23, 'EPKPTQ', '341.1816101074219')
1	0.07366570789389582	274	LGGSPGDLQTL	False	(68, 78, 'LGGSPGDLQTL', '1039.521484375')
1	0.048765803732691154	274	RREVEDPQVEQLELGGSPG	False	(55, 73, 'RREVEDPQVEQLELGGSPG', '1039.0203857421875')
1	0.04772727272727273	274	DPQVEQLELGGSPGDLQTL	False	(60, 78, 'DPQVEQLELGGSPGDLQTL', '989.4848022460938')


In [9]:
# y_hits: keep only the y clusters whose pid matches target_pid.
y_target_clusters = [candidate for candidate in y_sorted_clusters if candidate.pid == target_pid]

# One tab-separated row per cluster: score, post_prob, pid, seq, assessment, then every index entry.
for cluster in y_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'y', correct_sequence)
    non_indices = '\t'.join(
        [str(cluster.score), str(cluster.post_prob), str(cluster.pid), cluster.seq, str(assessment)]
    )
    print(non_indices + '\t' + '\t'.join(map(str, cluster.indices)))

2	0.44019607843137254	274	FVK	False	(25, 27, 'FVK', '197.12884521484375')	(27, 27, 'K', '74.06013488769531')
1	0.25015087507543754	274	K	False	(20, 20, 'K', '74.06013488769531')
1	0.25015087507543754	274	K	False	(53, 53, 'K', '74.06013488769531')
1	0.25015087507543754	274	K	False	(86, 86, 'K', '74.06013488769531')
1	0.11453576864535768	274	ALYLVC	False	(38, 43, 'ALYLVC', '341.1816101074219')
1	0.06428801028608165	274	EALYLVCGERGFFY	False	(37, 50, 'EALYLVCGERGFFY', '833.8908081054688')
1	0.05867082035306334	274	EALYLVCGERGFFYT	False	(37, 51, 'EALYLVCGERGFFYT', '884.4120483398438')
1	0.054694835680751175	274	VEDPQVEQLELGGSPGD	False	(58, 74, 'VEDPQVEQLELGGSPGD', '884.9159545898438')
1	0.05128205128205128	274	LVEALYLVCGERGFFYTP	False	(35, 52, 'LVEALYLVCGERGFFYTP', '1039.0203857421875')


# Finding non-hybrid interesting Combos

In [10]:
def get_top(b_clusters, y_clusters, top_num):
    """Return the leading entries of each cluster list.

    Args:
        b_clusters: indexable sequence of b-ion clusters.
        y_clusters: indexable sequence of y-ion clusters.
        top_num: maximum number of entries to keep from each sequence.

    Returns:
        A tuple ``(filtered_b, filtered_y)`` of new lists holding the first
        ``top_num`` entries of each input, or all entries when an input is
        shorter than ``top_num``.
    """
    b_count = min(top_num, len(b_clusters))
    y_count = min(top_num, len(y_clusters))
    top_b = [b_clusters[position] for position in range(b_count)]
    top_y = [y_clusters[position] for position in range(y_count)]
    return top_b, top_y

In [11]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_target_clusters, y_target_clusters, 50)
target_precursor = input_spectrum.precursor_mass
# Start by printing overlapping pairs. Boundary overlaps between the last of b and the
# first of y will be incorporated later.
for b_cluster in filtered_b:
    # Pair this b cluster with every y cluster that does not end before the b cluster starts.
    interesting_combos = [
        b_cluster.seq + '-' + y_cluster.seq
        for y_cluster in filtered_y
        if b_cluster.start <= y_cluster.end
    ]
    print(interesting_combos)
# # Calculating start and end indices for each interval
# for b_cluster in filtered_b:
#     interesting_combos = []
#     b_index_set = set()
#     for index in b_cluster.indices:
#         index = index.replace('(', '')
#         index = index.replace(')', '')
#         A = index.rstrip().split(',')
#         b_index_set.add(int(A[0]))
#         b_index_set.add(int(A[1]))
    
#     b_start_pos = min(b_index_set)
#     b_end_pos = max(b_index_set)
    
#     for y_cluster in filtered_y:
#         y_index_set = set()
#         for index in y_cluster.indices:
#             index = index.replace('(', '')
#             index = index.replace(')', '')
#             A = index.rstrip().split(',')
#             y_index_set.add(int(A[0]))
#             y_index_set.add(int(A[1]))
            
#         y_start_pos = min(y_index_set)
#         y_end_pos = max(y_index_set)
        
#         if (b_start_pos <= y_start_pos) and (b_end_pos == y_start_pos - 1):
#             interesting_combos.append(b_cluster)
#             interesting_combos.append(y_cluster)
    
#     print(interesting_combos)

['GS-K', 'GS-VEDPQVEQLELGGSPGD']
['LCGP-K', 'LCGP-K', 'LCGP-ALYLVC', 'LCGP-EALYLVCGERGFFY', 'LCGP-EALYLVCGERGFFYT', 'LCGP-VEDPQVEQLELGGSPGD', 'LCGP-LVEALYLVCGERGFFYTP']
['EPKPTQ-FVK', 'EPKPTQ-K', 'EPKPTQ-K', 'EPKPTQ-K', 'EPKPTQ-ALYLVC', 'EPKPTQ-EALYLVCGERGFFY', 'EPKPTQ-EALYLVCGERGFFYT', 'EPKPTQ-VEDPQVEQLELGGSPGD', 'EPKPTQ-LVEALYLVCGERGFFYTP']
['LGGSPGDLQTL-K', 'LGGSPGDLQTL-VEDPQVEQLELGGSPGD']
['RREVEDPQVEQLELGGSPG-K', 'RREVEDPQVEQLELGGSPG-VEDPQVEQLELGGSPGD']
['DPQVEQLELGGSPGDLQTL-K', 'DPQVEQLELGGSPGDLQTL-VEDPQVEQLELGGSPGD']


# Finding optimal "hybrid" combos

* "Hybrid" is in quotation marks because every combination produced here is treated as a hybrid; afterwards we can check whether a given combination is actually a non-hybrid.

In [12]:
def parse_indices(index_set):
    """Convert index entries into ``(start, end, seq, mz)`` tuples.

    Each entry is turned into its string form, split on commas, and the first
    four fields are stripped of spaces, parentheses and single quotes.

    Args:
        index_set: iterable of entries whose string form looks like
            ``(start, end, 'SEQ', 'mz')``.

    Returns:
        A list of ``(int, int, str, float)`` tuples, in iteration order.

    Raises:
        IndexError: if an entry has fewer than four comma-separated fields.
        ValueError: if start/end are not integers or mz is not a float.
    """
    # Deletes every character in " ()'" from a field.
    strip_table = str.maketrans("", "", " ()\'")
    parsed = []
    for entry in index_set:
        fields = str(entry).rstrip().split(',')
        start, end, seq, mz = (fields[position].translate(strip_table) for position in range(4))
        parsed.append((int(start), int(end), seq, float(mz)))
    return parsed

def calc_combined_score(b_indices, y_indices, b_score, y_score):
    """Score a b/y pairing, penalizing masses that are counted more than once.

    The b entries are processed first, then the y entries. The first time a
    mass is seen it adds 1 to the score; every later occurrence of the same
    mass (on either side) subtracts 2.

    Args:
        b_indices: iterable of index entries for the b side; element 3 of each
            entry is the mass (anything accepted by ``float``).
        y_indices: iterable of index entries for the y side, same layout.
        b_score: unused; kept so existing callers keep working.
        y_score: unused; kept so existing callers keep working.

    Returns:
        The combined integer score.
    """
    # A set gives constant-time membership checks instead of scanning a list.
    seen_masses = set()
    combined_score = 0
    for ion_indices in (b_indices, y_indices):
        for index in ion_indices:
            current_mass = float(index[3])
            if current_mass in seen_masses:
                # Same mass already counted: penalize the double count.
                combined_score -= 2
            else:
                seen_masses.add(current_mass)
                combined_score += 1
    return combined_score

In [13]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_sorted_clusters, y_sorted_clusters, 50)
target_precursor = input_spectrum.precursor_mass
# Start by printing overlapping pairs. Boundary overlaps between the last of b and the
# first of y will be incorporated later.
scored_combos = []
for b_cluster in filtered_b:
    for y_cluster in filtered_y:
        # Skip pairs where the y cluster ends before the b cluster starts.
        if not (b_cluster.start <= y_cluster.end):
            continue
        seq = b_cluster.seq + '-' + y_cluster.seq
        b_indices = parse_indices(b_cluster.indices)
        y_indices = parse_indices(y_cluster.indices)
        score = calc_combined_score(b_indices, y_indices, b_cluster.score, y_cluster.score)
        tup = (seq, score)
        scored_combos.append(tup)
# Highest combined score first; ties keep their discovery order.
interesting_combos = sorted(scored_combos, key=operator.itemgetter(1), reverse=True)
print(interesting_combos)
            
            
# # Calculating start and end indices for each interval
# for b_cluster in filtered_b:
#     interesting_combos = []
#     b_index_set = set()
#     for index in b_cluster.indices:
#         index = index.replace('(', '')
#         index = index.replace(')', '')
#         A = index.rstrip().split(',')
#         b_index_set.add(int(A[0]))
#         b_index_set.add(int(A[1]))
    
#     b_start_pos = min(b_index_set)
#     b_end_pos = max(b_index_set)
    
#     for y_cluster in filtered_y:
#         y_index_set = set()
#         for index in y_cluster.indices:
#             index = index.replace('(', '')
#             index = index.replace(')', '')
#             A = index.rstrip().split(',')
#             y_index_set.add(int(A[0]))
#             y_index_set.add(int(A[1]))
            
#         y_start_pos = min(y_index_set)
#         y_end_pos = max(y_index_set)
        
#         if (b_start_pos <= y_start_pos) and (b_end_pos == y_start_pos - 1):
#             interesting_combos.append(b_cluster)
#             interesting_combos.append(y_cluster)
    
#     print(interesting_combos)

[('GSQFFICT-GLGNKTYEHFNAMGK', 6), ('GSQFFICT-VVWTSEYDPLASNPGWKK', 6), ('GSQFFICT-VTVTVLDVNDNRPEFTMK', 6), ('SGQADDERVREYHLL-GLGNKTYEHFNAMGK', 6), ('SGQADDERVREYHLL-VVWTSEYDPLASNPGWKK', 6), ('SGQADDERVREYHLL-VTVTVLDVNDNRPEFTMK', 6), ('GSGAVALCPE-GLGNKTYEHFNAMGK', 6), ('GSGAVALCPE-GRPSGEAFVELESEDEVK', 6), ('GSGAVALCPE-VVWTSEYDPLASNPGWKK', 6), ('GSGAVALCPE-VCPTEIIAFSDHAEDFRK', 6), ('GSGAVALCPE-VTVTVLDVNDNRPEFTMK', 6), ('ANSWNLDWGDNGFFKIL-VVWTSEYDPLASNPGWKK', 6), ('QGSPGAWAPLDPTSGSSA-GLGNKTYEHFNAMGK', 6), ('QGSPGAWAPLDPTSGSSA-VVWTSEYDPLASNPGWKK', 6), ('QGSPGAWAPLDPTSGSSA-VCPTEIIAFSDHAEDFRK', 6), ('QGSPGAWAPLDPTSGSSA-VTVTVLDVNDNRPEFTMK', 6), ('QGSPGAWAPLDPTSGSSA-TGLTEGQHGFHVHQYG', 6), ('GSAGLMLVEFFAPWCG-GRPSGEAFVELESEDEVK', 6), ('GSAGLMLVEFFAPWCG-VVWTSEYDPLASNPGWKK', 6), ('GSAGLMLVEFFAPWCG-VCPTEIIAFSDHAEDFRK', 6), ('GSAGLMLVEFFAPWCG-VTVTVLDVNDNRPEFTMK', 6), ('SNSQQAYQEAFEI-VVWTSEYDPLASNPGWKK', 6), ('SNSQQAYQEAFEI-VTVTVLDVNDNRPEFTMK', 6), ('GQVLPEMEIHLQTDAKKGT-GLGNKTYEHFNAMGK', 6), ('GQVLPEM

# Over all datasets

* Want to know:
    * What do good hits look like when put through extensions?
    * Only factor in the hits for which there exists a good hit from both the b and the y side. It doesn't make sense to talk about extensions for which we don't have good initial hits on both sides
        * How often does this happen?
        * Do we lose out on good combinations after penalizing double counting?