# Extensions

The purpose of this notebook is to test everything related to generating extensions among hits

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('..', 'hypedsearch', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import testing_utils
import database

import operator

ppm_tolerance = 20
max_peptide_length = 20

import matplotlib.pyplot as plt

In [2]:
datasets = testing_utils.define_data()

dataset = datasets[0]

input_spectra_path = dataset[0]
input_spectra, boundaries = testing_utils.preprocess_input_spectra(input_spectra_path, ppm_tolerance)

correct_sequences = testing_utils.generate_truth_set(datasets[0])

path = dataset[2]
db = database.build(path)

Loading spectra...
Done


In [3]:
matched_masses_b, matched_masses_y, database = testing_utils.modified_match_masses(boundaries, db, max_peptide_length)
# matched_masses_b, matched_masses_y, database = match_masses(boundaries, db, max_peptide_length)
print('Finished matching masses')

On protein 279/279 [100%]
Sorting the set of protein masses...
Sorting the set of protein masses done
Performing Merge
Done
Finished matching masses


# Getting initial hits

In [4]:
spectrum_num = 0
correct_sequence = correct_sequences[spectrum_num]
print(correct_sequence)

input_spectrum = input_spectra[spectrum_num]

DPQVEQLEL


In [5]:
b_hits, y_hits, b_set, y_set, misses = testing_utils.find_hits(boundaries, input_spectrum, spectrum_num, matched_masses_b, matched_masses_y)
correct_hits = testing_utils.append_correct_hits(correct_sequence, input_spectrum, ppm_tolerance)
testing_utils.write_hits(b_hits, y_hits)
ion = 'b'
testing_utils.create_clusters(ion)
b_sorted_clusters = testing_utils.sort_clusters_by_post_prob(ion, boundaries, matched_masses_b, matched_masses_y)
ion = 'y'
testing_utils.create_clusters(ion)
y_sorted_clusters = testing_utils.sort_clusters_by_post_prob(ion, boundaries, matched_masses_b, matched_masses_y)

Done


# Printing hits

In [6]:
# b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('post_prob', 'score', 'pid', 'prior'), reverse = True)
b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('score', 'post_prob', 'pid', 'prior'), reverse = True)
testing_utils.write_b_sorted_cluster(b_sorted_clusters)
for i in range(0, 50):
    x = b_sorted_clusters[i]
    post_prob = x.post_prob
    score = x.score
    seq = x.seq
    indices = x.indices
    print(post_prob, score, seq)

0.8674851509837346 7 DPQVEQLE
0.5832415059687787 4 TVFSDFL
0.5689800742992233 4 TQAGVEELDPENKIP
0.5348891652083142 4 PAGDQKDV
0.45824150596877866 4 DPEVQQI
0.42904238878594025 4 GTYFEVKIPSDTFYDN
0.42446275789051985 4 HSLMPMLE
0.381750637069786 4 PDAGAPTSASGLSGHTTL
0.6258241758241758 3 TTFV
0.6258241758241758 3 TTFV
0.5721812218122181 3 TTPGPD
0.5676756625334306 3 KKEECP
0.5570921985815603 3 TTSTRTY
0.549407674478496 3 TTYNSIMK
0.5428270042194093 3 TTQEPIWLT
0.5399705014749262 3 KKDLEEWNQ
0.5379931876241839 3 TTGFSTEVWQ
0.5364167478091529 3 TTLRIEGNQG
0.5309222423146475 3 KKAIPAGCGDE
0.5309222423146475 3 TTTGPRAQIGS
0.5300595238095238 3 TTVESNSSWWTN
0.5280657395701643 3 TTSPNLGTREN
0.5277367773677737 3 TTPATSTTCTAT
0.5265676567656766 3 KKCGWFHPPANE
0.5261603375527427 3 TTEAAPGTGRGA
0.525892857142857 3 TTNPHVFPEGSEP
0.5183733670459334 3 QPDV
0.5178571428571428 3 KHHMY
0.5170879922216821 3 KKEKKSLDSDESED
0.5154565456545654 3 TTQPPAQPASQGSGS
0.5111301625167736 3 TTESVKEQEMKWTDLA
0.50849858

In [7]:
testing_utils.write_y_sorted_cluster(y_sorted_clusters)  
# y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('post_prob', 'score', 'pid', 'prior'), reverse = True)
y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('score', 'post_prob', 'pid', 'prior'), reverse = True)
for i in range(0, 50):
    x = y_sorted_clusters[i]
    post_prob = x.post_prob
    score = x.score
    seq = x.seq
    indices = x.indices
    print(post_prob, score, seq)

0.7857723577235773 5 LDSFSEI
0.6975386779184247 4 CGLYEL
0.6975386779184247 4 HPDSEL
0.5330623306233063 4 LESYGLE
0.483033033033033 4 SSNWVGKGFFAVYEAIC
0.42394212394212394 4 EISSIDEF
0.42394212394212394 4 EIPHSELD
0.41172391549750037 4 FVDLTMPYSV
0.6505494505494506 3 HICL
0.6102990033222592 3 ITCI
0.6102990033222592 3 LTCI
0.6102990033222592 3 TLCI
0.5978738652651696 3 FSCL
0.5978738652651696 3 SFCL
0.5978738652651696 3 AYCL
0.5844339622641509 3 NNPEL
0.5734567901234567 3 DPLEEL
0.5734567901234567 3 HVMSEI
0.5721001221001221 3 PENPEI
0.5721001221001221 3 YNVSCI
0.5692411924119242 3 HWEPEI
0.5654639175257732 3 TEEKMEL
0.5606761565836299 3 YQASLEL
0.5606761565836299 3 KPCLFCL
0.5581300813008131 3 ESYGLEL
0.5581300813008131 3 SFSEVEL
0.5581300813008131 3 YGDLTEI
0.5581300813008131 3 FSESVEL
0.5563730084348641 3 SDIAMTEL
0.5515852474927209 3 GYISAAEL
0.549017199017199 3 PIDHLCEL
0.5475513428120063 3 DPQVEQLEL
0.5475513428120063 3 HGEIIYPEI
0.5454301075268817 3 YENLNDQEL
0.5454301075268817 

# To Filter data by parent prot

I want to be able to only view the b or y hits from a certain protein

In [8]:
# b_hits
target_pid = 274
b_target_clusters = []
for cluster in b_sorted_clusters:
    if cluster.pid == target_pid:
        b_target_clusters.append(cluster)

for cluster in b_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'b', correct_sequence)
    non_indices = str(cluster.score) + '\t' + str(cluster.post_prob) + '\t' + str(cluster.pid) + '\t' + cluster.seq + '\t' + str(assessment)
    print(non_indices + '\t'+ '\t'.join([str(o) for o in cluster.indices]))


7	0.8674851509837346	274	DPQVEQLE	True	(60, 62, 'DPQ', '341.1453857421875')	(60, 63, 'DPQV', '440.216064453125')	(60, 64, 'DPQVE', '569.2589721679688')	(60, 65, 'DPQVEQ', '349.1622619628906')	(60, 65, 'DPQVEQ', '697.3143310546875')	(60, 66, 'DPQVEQL', '810.3919677734375')	(60, 67, 'DPQVEQLE', '939.4456176757812')
2	0.2215748132886943	274	EDPQVEQL	False	(59, 63, 'EDPQV', '569.2589721679688')	(59, 66, 'EDPQVEQL', '939.4456176757812')
1	0.2501791793585379	274	T	False	(22, 22, 'T', '102.05448913574219')
1	0.2501791793585379	274	T	False	(51, 51, 'T', '102.05448913574219')
1	0.2501791793585379	274	T	False	(77, 77, 'T', '102.05448913574219')
1	0.2501791793585379	274	T	False	(95, 95, 'T', '102.05448913574219')
1	0.25013925637097895	274	K	False	(20, 20, 'K', '129.1007537841797')
1	0.25013925637097895	274	K	False	(27, 27, 'K', '129.1007537841797')
1	0.25013925637097895	274	K	False	(53, 53, 'K', '129.1007537841797')
1	0.25013925637097895	274	K	False	(86, 86, 'K', '129.1007537841797')
1	0.20128040

In [9]:
# y_hits
y_target_clusters = []
for cluster in y_sorted_clusters:
    if cluster.pid == target_pid:
        y_target_clusters.append(cluster)

for cluster in y_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'y', correct_sequence)
    non_indices = str(cluster.score) + '\t' + str(cluster.post_prob) + '\t' + str(cluster.pid) + '\t' + cluster.seq + '\t' + str(assessment)
    print(non_indices + '\t'+ '\t'.join([str(o) for o in cluster.indices]))

3	0.5475513428120063	274	DPQVEQLEL	True	(60, 68, 'DPQVEQLEL', '535.772527')	(67, 68, 'EL', '261.1429443359375')	(68, 68, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(3, 3, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(4, 4, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(8, 8, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(10, 10, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(11, 11, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(13, 13, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(14, 14, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(16, 16, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(30, 30, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(35, 35, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(39, 39, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(41, 41, 'L', '132.10121154785156')
1	0.2500691802144587	274	L	True	(66, 66, '

# Finding non-hybrid interesting Combos

In [10]:
def get_top(b_clusters, y_clusters, top_num):
    filtered_b = []
    filtered_y = []
    b_len = top_num if len(b_clusters) >= top_num else len(b_clusters)
    y_len = top_num if len(y_clusters) >= top_num else len(y_clusters)
    for x in range(0,b_len):
        filtered_b.append(b_clusters[x])
    for x in range(0,y_len):
        filtered_y.append(y_clusters[x])
    return filtered_b, filtered_y

In [11]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_target_clusters, y_target_clusters, 50)
target_precursor = 
#Start with printing overlapping. Then will incorportate boundary overlaps between last of b and first of y
for b_cluster in filtered_b:
    interesting_combos = []
    for y_cluster in filtered_y:
        if b_cluster.start <= y_cluster.end:
            interesting_combos.append(b_cluster.seq + '-' + y_cluster.seq)
    print(interesting_combos)
# # Calculating start and end indices for each interval
# for b_cluster in filtered_b:
#     interesting_combos = []
#     b_index_set = set()
#     for index in b_cluster.indices:
#         index = index.replace('(', '')
#         index = index.replace(')', '')
#         A = index.rstrip().split(',')
#         b_index_set.add(int(A[0]))
#         b_index_set.add(int(A[1]))
    
#     b_start_pos = min(b_index_set)
#     b_end_pos = max(b_index_set)
    
#     for y_cluster in filtered_y:
#         y_index_set = set()
#         for index in y_cluster.indices:
#             index = index.replace('(', '')
#             index = index.replace(')', '')
#             A = index.rstrip().split(',')
#             y_index_set.add(int(A[0]))
#             y_index_set.add(int(A[1]))
            
#         y_start_pos = min(y_index_set)
#         y_end_pos = max(y_index_set)
        
#         if (b_start_pos <= y_start_pos) and (b_end_pos == y_start_pos - 1):
#             interesting_combos.append(b_cluster)
#             interesting_combos.append(y_cluster)
    
#     print(interesting_combos)

SyntaxError: invalid syntax (<ipython-input-11-0cc46c058e3e>, line 3)