# Extensions

The purpose of this notebook is to test everything related to generating extensions among hits

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from testing_framework import testing_utils
import database
from preprocessing import merge_search
from identification import create_hits

import operator

ppm_tolerance = 20
max_peptide_length = 20

import matplotlib.pyplot as plt

In [2]:
datasets = testing_utils.define_data()

dataset = datasets[0]

input_spectra_path = dataset[0]
input_spectra, boundaries = testing_utils.preprocess_input_spectra(input_spectra_path, ppm_tolerance)

correct_sequences = testing_utils.generate_truth_set(datasets[0])

path = dataset[2]
db = database.build(path)

Loading spectra...
Done


In [3]:
write_path = os.path.abspath(os.path.join(module_path, 'intermediate_files'))
matched_masses_b, matched_masses_y, kmer_set = merge_search.modified_match_masses(boundaries, db, max_peptide_length, True, write_path)
print('Finished matching masses')

On protein 279/279 [100%]
Sorting the set of protein masses...
Sorting the set of protein masses done
Performing Merge
Done
Finished matching masses


# Getting initial hits

In [4]:
from gen_spectra import gen_spectrum
spectrum_num = 0

correct_sequence = correct_sequences[spectrum_num]
print(correct_sequence)

input_spectrum = input_spectra[spectrum_num]

DPQVEQLEL


In [5]:
unique_b,unique_y = testing_utils.get_unique_matched_masses(boundaries, matched_masses_b, matched_masses_y)

In [6]:
location = os.path.join(os.path.abspath(os.path.join('../..')), 'intermediate_files/')
b_hits, y_hits = create_hits(spectrum_num, input_spectrum, matched_masses_b, matched_masses_y, False, location)
correct_hits = testing_utils.append_correct_hits(correct_sequence, input_spectrum, ppm_tolerance)
ion = 'b'
clusters = testing_utils.create_clusters(ion, b_hits, y_hits)
b_sorted_clusters = testing_utils.Bayes_clusters(ion, clusters, kmer_set, unique_b)
ion = 'y'
clusters = testing_utils.create_clusters(ion, b_hits, y_hits)
y_sorted_clusters = testing_utils.Bayes_clusters(ion, clusters, kmer_set, unique_y)

ZeroDivisionError: division by zero

# Printing hits

In [13]:
b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('prob', 'score', 'pid'), reverse = True)
# b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('score', 'prob', 'pid'), reverse = True)
testing_utils.write_b_sorted_cluster(b_sorted_clusters)
for i in range(0, 50):
    x = b_sorted_clusters[i]
    post_prob = x.prob
    score = x.score
    seq = x.seq
    indices = x.indices
    print(post_prob, score, seq)

1.0 8 DPQVEQLEL
0.9999999999999902 4 PDAGAPTSASGLSGHTTL
0.9999999999999798 4 TQAGVEELDPENKIP
0.999999999999943 4 GTYFEVKIPSDTFYDN
0.9999999999997486 4 PAGDQKDV
0.9999999999990805 4 HSLMPMLE
0.9999999999987744 4 DPEVQQI
0.9999999999987744 4 TVFSDFL
0.9999999999924445 3 TTSPNLGTREN
0.9999999999899392 3 PQDHPRSQPQ
0.9999999999897169 4 IQEYYNKL
0.9999999999881277 3 GDEPGPQRSVEGWIL
0.9999999999881277 3 PQQPQPPPQQQAAPQ
0.9999999999861916 3 KKDLEEWNQ
0.9999999999853924 3 IEKNTDGVNFYNIL
0.9999999999774949 3 TTTPCMLRDSDSILETL
0.9999999999774949 3 TDMTKLEECVRSIQADG
0.9999999999754571 3 TTQPPAQPASQGSGS
0.9999999999729959 3 TTESVKEQEMKWTDLA
0.9999999999698007 3 PGGEEVLREQAGGD
0.9999999999665664 3 KYFDSGDYNMAKAK
0.9999999999665664 3 KKEKKSLDSDESED
0.9999999999665664 3 KYFDSGDYNMAKAK
0.9999999999605552 3 AGPQPAQTGA
0.9999999999598287 3 AGGATVEPAG
0.9999999999586807 3 TGAAGRNS
0.9999999999582204 3 AFVKSYENLAFYW
0.9999999999567284 3 KPEGRPGT
0.9999999999567284 3 TLSFSSIS
0.9999999999566352 3 TTPATSTTC

In [14]:
y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('prob', 'score', 'pid'), reverse = True)
# y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('score', 'prob', 'pid'), reverse = True)
testing_utils.write_y_sorted_cluster(y_sorted_clusters)
for i in range(0, 50):
    x = y_sorted_clusters[i]
    post_prob = x.prob
    score = x.score
    seq = x.seq
    indices = x.indices
    print(post_prob, score, seq)

0.9999999999999973 5 LDSFSEI
0.9999999999999695 4 SSNWVGKGFFAVYEAIC
0.9999999999998629 4 FVDLTMPYSV
0.9999999999993714 4 EISSIDEF
0.9999999999993714 4 EIPHSELD
0.9999999999993199 4 LESYGLE
0.9999999999970575 4 IEVLETDPH
0.9999999999859213 3 LSNPTGLQESISDVTTCL
0.9999999999826609 3 TAEIASLDSENIDEI
0.9999999999816636 3 AAGCKVEAFAVQGEEL
0.9999999999799413 3 LSIHQLVENTDETYCI
0.9999999999786653 3 LEDSDLKKSDIDEI
0.9999999999786653 3 LLMAASIYFHDQNP
0.999999999977738 3 AKDLDTVASDMMVLL
0.999999999977738 3 DTNAPAHQLIQTESP
0.999999999977738 3 WLKGQGVYLGMPGCL
0.999999999977738 3 DLNINMTSPMGTKSI
0.9999999999756466 3 QQQPPKQQQQQQQQQ
0.9999999999756466 3 QQPPKQQQQQQQQQQ
0.9999999999756466 3 YNYVWANCFEITLEL
0.9999999999733412 3 YFEEYGKIDTIEI
0.9999999999726071 3 LQCYSEAIKLDPQN
0.9999999999726071 3 EIEGEIKRDFMAAL
0.9999999999714403 4 CGLYEL
0.9999999999714403 4 HPDSEL
0.999999999945851 3 DFTFVCPTEI
0.999999999945851 3 DFTFVCPTEI
0.999999999945851 3 DFTFVCPTEI
0.999999999945851 3 TSQDARFYAL
0.99999999994

# To Filter data by parent prot

I want to be able to only view the b or y hits from a certain protein

In [16]:
# b_hits
target_pid = 274
b_target_clusters = []
for cluster in b_sorted_clusters:
    if cluster.pid == target_pid:
        b_target_clusters.append(cluster)

for cluster in b_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'b', correct_sequence)
    non_indices = str(cluster.score) + '\t' + str(cluster.prob) + '\t' + str(cluster.pid) + '\t' + cluster.seq + '\t' + str(assessment)
    print(non_indices + '\t'+ '\t'.join([str(o) for o in cluster.indices]))


8	1.0	274	DPQVEQLEL	True	(60, 62, 'DPQ', 341.1453857421875)	(60, 63, 'DPQV', 440.216064453125)	(60, 64, 'DPQVE', 569.2589721679688)	(60, 65, 'DPQVEQ', 349.1622619628906)	(60, 65, 'DPQVEQ', 697.3143310546875)	(60, 66, 'DPQVEQL', 810.3919677734375)	(60, 67, 'DPQVEQLE', 939.4456176757812)	(60, 68, 'DPQVEQLEL', 526.7672446181209)
2	0.9999999485970288	274	EDPQVEQL	False	(59, 63, 'EDPQV', 569.2589721679688)	(59, 66, 'EDPQVEQL', 939.4456176757812)
1	0.9999872198959185	274	LGGSP	False	(68, 72, 'LGGSP', 412.21746826171875)
1	0.9999791832993288	274	VEDPQ	False	(58, 62, 'VEDPQ', 569.2589721679688)
1	0.9998568688632525	274	LE	False	(66, 67, 'LE', 243.13357543945312)
1	0.9998568688632525	274	EL	False	(67, 68, 'EL', 243.13357543945312)
1	0.9998568688632525	274	LE	False	(80, 81, 'LE', 243.13357543945312)
1	0.9998568688632525	274	LE	False	(103, 104, 'LE', 243.13357543945312)
1	0.9995387453874539	274	PQ	False	(61, 62, 'PQ', 226.1183624267578)
1	0.33333333333333337	274	K	False	(20, 20, 'K', 129.10075378

In [17]:
# y_hits
y_target_clusters = []
for cluster in y_sorted_clusters:
    if cluster.pid == target_pid:
        y_target_clusters.append(cluster)

for cluster in y_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'y', correct_sequence)
    non_indices = str(cluster.score) + '\t' + str(cluster.prob) + '\t' + str(cluster.pid) + '\t' + cluster.seq + '\t' + str(assessment)
    print(non_indices + '\t'+ '\t'.join([str(o) for o in cluster.indices]))

3	0.9999999993294415	274	DPQVEQLEL	True	(60, 68, 'DPQVEQLEL', 535.7725269681209)	(67, 68, 'EL', 261.1429443359375)	(68, 68, 'L', 132.10121154785156)
1	0.999989057715208	274	EALYLVC	False	(37, 43, 'EALYLVC', 810.3919677734375)
1	0.9998721663541844	274	LE	False	(66, 67, 'LE', 261.1429443359375)
1	0.9998721663541844	274	LE	False	(80, 81, 'LE', 261.1429443359375)
1	0.9998721663541844	274	LE	False	(103, 104, 'LE', 261.1429443359375)
1	0.9996506346803308	274	LC	False	(30, 31, 'LC', 235.1075439453125)
1	0.9996506346803308	274	IC	False	(97, 98, 'IC', 235.1075439453125)
1	0.5	274	L	True	(3, 3, 'L', 132.10121154785156)
1	0.5	274	L	True	(4, 4, 'L', 132.10121154785156)
1	0.5	274	L	True	(8, 8, 'L', 132.10121154785156)
1	0.5	274	L	True	(10, 10, 'L', 132.10121154785156)
1	0.5	274	L	True	(11, 11, 'L', 132.10121154785156)
1	0.5	274	L	True	(13, 13, 'L', 132.10121154785156)
1	0.5	274	L	True	(14, 14, 'L', 132.10121154785156)
1	0.5	274	L	True	(16, 16, 'L', 132.10121154785156)
1	0.5	274	L	True	(30, 30, 'L',

# Finding non-hybrid interesting Combos

In [101]:
def get_top(b_clusters, y_clusters, top_num):
    filtered_b = []
    filtered_y = []
    b_len = top_num if len(b_clusters) >= top_num else len(b_clusters)
    y_len = top_num if len(y_clusters) >= top_num else len(y_clusters)
    for x in range(0,b_len):
        filtered_b.append(b_clusters[x])
    for x in range(0,y_len):
        filtered_y.append(y_clusters[x])
    return filtered_b, filtered_y

In [102]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_target_clusters, y_target_clusters, 50)
target_precursor = input_spectrum.precursor_mass
#Start with printing overlapping. Then will incorportate boundary overlaps between last of b and first of y
for b_cluster in filtered_b:
    interesting_combos = []
    for y_cluster in filtered_y:
        if b_cluster.start <= y_cluster.end:
            interesting_combos.append(b_cluster.seq + '-' + y_cluster.seq)
    print(interesting_combos)
# # Calculating start and end indices for each interval
# for b_cluster in filtered_b:
#     interesting_combos = []
#     b_index_set = set()
#     for index in b_cluster.indices:
#         index = index.replace('(', '')
#         index = index.replace(')', '')
#         A = index.rstrip().split(',')
#         b_index_set.add(int(A[0]))
#         b_index_set.add(int(A[1]))
    
#     b_start_pos = min(b_index_set)
#     b_end_pos = max(b_index_set)
    
#     for y_cluster in filtered_y:
#         y_index_set = set()
#         for index in y_cluster.indices:
#             index = index.replace('(', '')
#             index = index.replace(')', '')
#             A = index.rstrip().split(',')
#             y_index_set.add(int(A[0]))
#             y_index_set.add(int(A[1]))
            
#         y_start_pos = min(y_index_set)
#         y_end_pos = max(y_index_set)
        
#         if (b_start_pos <= y_start_pos) and (b_end_pos == y_start_pos - 1):
#             interesting_combos.append(b_cluster)
#             interesting_combos.append(y_cluster)
    
#     print(interesting_combos)

['RREVEDPQVEQLELGGSPG-VEDPQVEQLELGGSPGD', 'RREVEDPQVEQLELGGSPG-K']
['EPKPTQ-FVK', 'EPKPTQ-EALYLVCGERGFFY', 'EPKPTQ-VEDPQVEQLELGGSPGD', 'EPKPTQ-LVEALYLVCGERGFFYTP', 'EPKPTQ-EALYLVCGERGFFYT', 'EPKPTQ-ALYLVC', 'EPKPTQ-K', 'EPKPTQ-K', 'EPKPTQ-K']
['DPQVEQLELGGSPGDLQTL-VEDPQVEQLELGGSPGD', 'DPQVEQLELGGSPGDLQTL-K']
['LGGSPGDLQTL-VEDPQVEQLELGGSPGD', 'LGGSPGDLQTL-K']
['GS-VEDPQVEQLELGGSPGD', 'GS-K']
['LCGP-EALYLVCGERGFFY', 'LCGP-VEDPQVEQLELGGSPGD', 'LCGP-LVEALYLVCGERGFFYTP', 'LCGP-EALYLVCGERGFFYT', 'LCGP-ALYLVC', 'LCGP-K', 'LCGP-K']


# Finding optimal "hybrid" combos

* Hybrid is in quotation marks because all outputs will be a hybrid and then we can check if it is a non-hybrid

In [103]:
def parse_indices(index_set):
    indices = []
    for index in index_set:
        string = str(index)
        A = string.rstrip().split(',')
        start = A[0]
        end = A[1]
        seq = A[2]
        mz = A[3]
        disallowed_characters = " ()\'"
        for character in disallowed_characters:
            start = start.replace(character, "")
            end = end.replace(character, "")
            seq = seq.replace(character, "")
            mz = mz.replace(character, "")
        
        target_tuple = (int(start), int(end), seq, float(mz))
        indices.append(target_tuple)
    
    
    return indices

def calc_combined_score(b_indices, y_indices, b_score, y_score):
    masses = []
    combined_score = 0
    for index in b_indices:
        current_mass = float(index[3])
        if current_mass not in masses:
            masses.append(current_mass)
            combined_score = combined_score + 1
        else:
            combined_score = combined_score - 2
    for index in y_indices:
        current_mass = float(index[3])
        if current_mass not in masses:
            masses.append(current_mass)
            combined_score = combined_score + 1
        else:
            combined_score = combined_score - 2
    return combined_score

In [104]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_sorted_clusters, y_sorted_clusters, 50)
target_precursor = input_spectrum.precursor_mass
#Start with printing overlapping. Then will incorportate boundary overlaps between last of b and first of y
interesting_combos = []
for b_cluster in filtered_b:
    for y_cluster in filtered_y:
        if b_cluster.start <= y_cluster.end:
            seq = b_cluster.seq + '-' + y_cluster.seq
            b_indices = parse_indices(b_cluster.indices)
            y_indices = parse_indices(y_cluster.indices)
            score = calc_combined_score(b_indices, y_indices, b_cluster.score, y_cluster.score)
            tup = (seq, score)
            interesting_combos.append(tup)
interesting_combos.sort(key=lambda a: a[1], reverse=True)
[print(x) for x in interesting_combos]
            
            
# # Calculating start and end indices for each interval
# for b_cluster in filtered_b:
#     interesting_combos = []
#     b_index_set = set()
#     for index in b_cluster.indices:
#         index = index.replace('(', '')
#         index = index.replace(')', '')
#         A = index.rstrip().split(',')
#         b_index_set.add(int(A[0]))
#         b_index_set.add(int(A[1]))
    
#     b_start_pos = min(b_index_set)
#     b_end_pos = max(b_index_set)
    
#     for y_cluster in filtered_y:
#         y_index_set = set()
#         for index in y_cluster.indices:
#             index = index.replace('(', '')
#             index = index.replace(')', '')
#             A = index.rstrip().split(',')
#             y_index_set.add(int(A[0]))
#             y_index_set.add(int(A[1]))
            
#         y_start_pos = min(y_index_set)
#         y_end_pos = max(y_index_set)
        
#         if (b_start_pos <= y_start_pos) and (b_end_pos == y_start_pos - 1):
#             interesting_combos.append(b_cluster)
#             interesting_combos.append(y_cluster)
    
#     print(interesting_combos)

('SNSQQAYQEAFEI-VVWTSEYDPLASNPGWKK', 6)
('SNSQQAYQEAFEI-VTVTVLDVNDNRPEFTMK', 6)
('IFVGTPYYGYVYEDTLP-VVWTSEYDPLASNPGWKK', 6)
('QVNQSLASMPPMNPATTLP-VVWTSEYDPLASNPGWKK', 6)
('MAEVDAAMAARPHSIDGRVV-VVWTSEYDPLASNPGWKK', 6)
('GSAGLMLVEFFAPWCG-VVWTSEYDPLASNPGWKK', 6)
('GSAGLMLVEFFAPWCG-VCPTEIIAFSDHAEDFRK', 6)
('GSAGLMLVEFFAPWCG-VTVTVLDVNDNRPEFTMK', 6)
('GSAGLMLVEFFAPWCG-GRPSGEAFVELESEDEVK', 6)
('GQVLPEMEIHLQTDAKKGT-GLGNKTYEHFNAMGK', 6)
('GQVLPEMEIHLQTDAKKGT-VTVTVLDVNDNRPEFTMK', 6)
('VPEMKPSMFDVSRELGSSV-VTVTVLDVNDNRPEFTMK', 6)
('ANSWNLDWGDNGFFKIL-VVWTSEYDPLASNPGWKK', 6)
('GSQFFICT-GLGNKTYEHFNAMGK', 6)
('GSQFFICT-VVWTSEYDPLASNPGWKK', 6)
('GSQFFICT-VTVTVLDVNDNRPEFTMK', 6)
('SSNGFLIDGYPREVKQGE-GLGNKTYEHFNAMGK', 6)
('SSNGFLIDGYPREVKQGE-VTVTVLDVNDNRPEFTMK', 6)
('GSGAVALCPE-GLGNKTYEHFNAMGK', 6)
('GSGAVALCPE-VVWTSEYDPLASNPGWKK', 6)
('GSGAVALCPE-VCPTEIIAFSDHAEDFRK', 6)
('GSGAVALCPE-VTVTVLDVNDNRPEFTMK', 6)
('GSGAVALCPE-GRPSGEAFVELESEDEVK', 6)
('SGQADDERVREYHLL-GLGNKTYEHFNAMGK', 6)
('SGQADDERVREYHLL-VVWT

('SGQADDERVREYHLL-FVK', 5)
('SGQADDERVREYHLL-FVK', 5)
('SGQADDERVREYHLL-FVK', 5)
('SGQADDERVREYHLL-VFK', 5)
('SGQADDERVREYHLL-FVK', 5)
('SGQADDERVREYHLL-FVK', 5)
('QGSPVVQMAEDAVDGER-TEGGGSEALPCPGPPAG', 5)
('QGSPVVQMAEDAVDGER-LGSHTDEMLWHVLE', 5)
('QGSPVVQMAEDAVDGER-QGSFLAANMQDSRENT', 5)
('QGSPVVQMAEDAVDGER-FVK', 5)
('QGSPVVQMAEDAVDGER-FVK', 5)
('QGSPVVQMAEDAVDGER-FVK', 5)
('QGSPGAWAPLDPTSGSSA-TEGGGSEALPCPGPPAG', 5)
('QGSPGAWAPLDPTSGSSA-LGSHTDEMLWHVLE', 5)
('QGSPGAWAPLDPTSGSSA-NGTDPEDVIRNAFACF', 5)
('QGSPGAWAPLDPTSGSSA-NGTDPEDVIRNAFACF', 5)
('QGSPGAWAPLDPTSGSSA-QGSFLAANMQDSRENT', 5)
('QGSPGAWAPLDPTSGSSA-GKSSDNRSRGYRGGSAGG', 5)
('QGSPGAWAPLDPTSGSSA-DTDDAPVPAPAGDQK', 5)
('QGSPGAWAPLDPTSGSSA-YGFGSDRFGRDLNY', 5)
('QGSPGAWAPLDPTSGSSA-SGEMMGAPAVVAPQQPP', 5)
('QGSPGAWAPLDPTSGSSA-FVK', 5)
('QGSPGAWAPLDPTSGSSA-VFK', 5)
('QGSPGAWAPLDPTSGSSA-FVK', 5)
('QGSPGAWAPLDPTSGSSA-VFK', 5)
('QGSPGAWAPLDPTSGSSA-FVK', 5)
('QGSPGAWAPLDPTSGSSA-FVK', 5)
('QGSPGAWAPLDPTSGSSA-VFK', 5)
('QGSPGAWAPLDPTSGSSA-FVK', 5)


('QGVMVGMGQKDCYV-VFK', 4)
('QGVMVGMGQKDCYV-FVK', 4)
('QGVMVGMGQKDCYV-FVK', 4)
('NGQAACHSAQGRWE-YGFGSDRFGRDLNY', 4)
('NGQAACHSAQGRWE-SGEMMGAPAVVAPQQPP', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-VFK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-VFK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('NGQAACHSAQGRWE-FVK', 4)
('VLDEELEGVSPDELKD-TEGGGSEALPCPGPPAG', 4)
('VLDEELEGVSPDELKD-LGSHTDEMLWHVLE', 4)
('VLDEELEGVSPDELKD-GDCYSRLTTEQSH', 4)
('VLDEELEGVSPDELKD-DTDDAPVPAPAGDQK', 4)
('VLDEELEGVSPDELKD-YGFGSDRFGRDLNY', 4)
('VLDEELEGVSPDELKD-SGEMMGAPAVVAPQQPP', 4)
('VLDEELEGVSPDELKD-VFK', 4)
('VLDEELEGVSPDELKD-FVK', 4)
('VLDEELEGVSPDELKD-VFK', 4)
('VLDEELEGVSPDELKD-FVK', 4)
('VLDEELEGVSPDELKD-FVK', 4)
('VLDEELEGVSPDELKD-VFK', 4)
('VLDEELEGVSPDELKD-FVK', 4)
('VLDEELEGVSPDELKD-FVK', 4)
('VLDEELEGVSPDELK

('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-VFK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-VFK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('VNKSCYEDGWLIKMT-FVK', 4)
('KEDLLEERKEEMHT-LGSHTDEMLWHVLE', 4)
('KEDLLEERKEEMHT-FVK', 4)
('KEDLLEERKEEMHT-FVK', 4)
('KEDLLEERKEEMHT-FVK', 4)
('SNSQQAYQEAFEI-GLGNKTYEHFNAMGK', 3)
('SNSQQAYQEAFEI-YEAGEKRWGTDEVK', 3)
('IFVGTPYYGYVYEDTLP-VTVTVLDVNDNRPEFTMK', 3)
('QVNQSLASMPPMNPATTLP-VTVTVLDVNDNRPEFTMK', 3)
('MAEVDAAMAARPHSIDGRVV-GLGNKTYEHFNAMGK', 3)
('MAEVDAAMAARPHSIDGRVV-YEAGEKRWGTDEVK', 3)
('MAEVDAAMAARPHSIDGRVV-DVHPSDLKPKGDDKDPSK', 3)
('GSAGLMLVEFFAPWCG-TGLTEGQHGFHVHQYG', 3)
('GSAGLMLVEFFAPWCG-GLGNKTYEHFNAMGK

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

# Filtering overlap

Q: What happens if we completely filter out all cases where any mass overlaps?

In [109]:
from gen_spectra import get_precursor
from utils import ppm_to_da
precursor_tolerance = 10

def filter_by_validity(b_cluster, y_cluster):
    interesting_combos = []
    valid = True
    for b in b_cluster.indices:
        for y in y_cluster.indices:
            if b[3] == y[3]:
                valid = False
    return valid

def brutal_calc_combined_score(b_indices, y_indices, b_score, y_score):
    masses = []
    combined_score = 0
    for index in b_indices:
        current_mass = float(index[3])
        if current_mass not in masses:
            masses.append(current_mass)
            combined_score = combined_score + 1
        else:
            combined_score = combined_score - 2000 #Little bit of a hack
    for index in y_indices:
        current_mass = float(index[3])
        if current_mass not in masses:
            masses.append(current_mass)
            combined_score = combined_score + 1
        else:
            combined_score = combined_score - 2000
    return combined_score

def combine(b_cluster, y_cluster):
    b_start, b_end, y_start, y_end = b_cluster[4], b_cluster[5], y_cluster[4], y_cluster[5]
    if b_cluster.pid == y_cluster.pid:
        score_add = 2
        hybrid = False
        if  (b_end <= y_end) and (b_start <= y_start) and (b_end >= y_start): #overlap
            overlap = True
            seq = b_cluster.seq
            rem_chars = y_start - b_end
            while (rem_chars >= 0):
                seq = seq + y_cluster.seq[len(y_cluster.seq)-1 - rem_chars]
                rem_chars = rem_chars - 1
        else:                                                                #no overlap
            hybrid = False
            overlap = False
            score_add = 2
            seq = b_cluster.seq + '-' + y_cluster.seq
    else:                                                                    #hybrid
        hybrid = True
        overlap = False
        score_add = 0
        seq = b_cluster.seq + '-' + y_cluster.seq
    return seq, score_add, hybrid, overlap

def filter_by_precursor(seq, pc, overlap, obs_prec):
    new_seq = seq.replace("-", "") if overlap == False else seq
    tol = ppm_to_da(obs_prec, precursor_tolerance)
    if get_precursor(new_seq, charge=pc) > obs_prec + tol:
        return False
    else:
        return True

In [20]:
import collections
import operator


Cluster = collections.namedtuple('Cluster', 'score pid seq mass start end ion hits')

def load_fasta(fasta_file):
    f = []
    for l in open(fasta_file):
        if l[0] == '>':
            continue
        f.append(l.rstrip())
    return f

def get_seq(f, pid, start, end):
    return f[pid][start-1:end]

def min_info(cluster):
    return (cluster.pid, cluster.start, cluster.end, cluster.score, cluster.seq)

def bsearch(key, Y):
    lo = -1
    hi = len(Y)

    mid = -1
    while (hi - lo > 1):
        mid = int((hi+lo) / 2)
        if Y[mid].start < key:
            lo = mid
        else:
            hi = mid
    return hi

fasta_file = os.path.abspath(os.path.join('../../../data/database/sample_database.fasta'))
fa = load_fasta(fasta_file)
solution_array = []

B = {}
for l in b_sorted_clusters:
    if l.pid not in B:
        B[l.pid] = []

    B[l.pid].append(l)

Y = {}
for l in y_sorted_clusters:
    if l.pid not in Y:
        Y[l.pid] = []
    Y[l.pid].append(l)

for pid in B:
    if pid not in Y:
        continue

    sorted_B = sorted(B[pid], key=operator.attrgetter('pid', 'start', 'end'))
    sorted_Y = sorted(Y[pid], key=operator.attrgetter('pid', 'start', 'end'))

    for b in sorted_B:
        y_i = bsearch(b.start, sorted_Y)

        if y_i >= len(sorted_Y): break

        y = sorted_Y[y_i]

        while y_i < len(sorted_Y) and y.start - b.end < 10:
            y = sorted_Y[y_i]
            seq = get_seq(fa, b.pid, b.start, y.end)
            tup = (b.score + y.score, b.end - y.start, y.end - b.start, seq, min_info(b), min_info(y))
            solution_array.append(tup)
            y_i += 1
            
solution_array = sorted(solution_array, key = lambda x: x[0], reverse=True)
[print(x) for x in solution_array]

(5, 10, 18, 'GMLPANYVEAI', (233, 43, 58, 3, 'GSAGLMLVEFFAPWCG'), (233, 48, 61, 2, 'MLVEFFAPWCGHCK'))
(5, -9, 46, '', (199, 137, 155, 3, 'GQVLPEMEIHLQTDAKKGT'), (199, 164, 183, 2, 'GMTQEELVSNLGTIARSGSK'))
(5, -6, 40, 'ELQEQLYVRRAALAARSLLDVLPFDDNVCLREPCENYMKCV', (167, 12, 31, 2, 'DGPVQGTIHFEQKASGEPVV'), (167, 37, 52, 3, 'TGLTEGQHGFHVHQYG'))
(5, -2, 19, 'LPFDDNVCLREPCENYMKCV', (167, 33, 35, 2, 'SGQ'), (167, 37, 52, 3, 'TGLTEGQHGFHVHQYG'))
(4, -7, 19, 'LSQAGQKASAAFSSVGSVIT', (240, 2, 7, 1, 'SAEAAD'), (240, 14, 21, 3, 'SRPCTPPQ'))
(4, -3, 18, 'SQAGQKASAAFSSVGSVIT', (240, 3, 11, 1, 'AEAADREAA'), (240, 14, 21, 3, 'SRPCTPPQ'))
(4, 8, 15, 'GQKASAAFSSVGSVIT', (240, 6, 22, 1, 'ADREAATSSRPCTPPQT'), (240, 14, 21, 3, 'SRPCTPPQ'))
(4, -22, 53, '', (264, 338, 352, 2, 'VPYLEDLHGRAEEEE'), (264, 374, 391, 2, 'QVLEEEVGQNGQSHSLPK'))
(4, -70, 104, '', (159, 1204, 1221, 2, 'SQGEPSPMVSPAEASPQS'), (159, 1291, 1308, 2, 'MGQDAEICLLKSGELMIK'))
(4, 7, 11, '', (149, 853, 865, 2, 'SGSRGSQDFYPKW'), (149, 858, 864, 2,

(3, -4, 23, '', (233, 425, 444, 2, 'KDPNIVIAKMDATANDVPSP'), (233, 448, 448, 1, 'K'))
(3, -16, 35, '', (233, 425, 444, 2, 'KDPNIVIAKMDATANDVPSP'), (233, 460, 460, 1, 'K'))
(3, -5, 7, 'GSCWAFGA', (102, 36, 38, 2, 'QGS'), (102, 43, 43, 1, 'K'))
(3, -15, 17, 'GSCWAFGAVEAISDRTCI', (102, 36, 38, 2, 'QGS'), (102, 53, 53, 1, 'K'))
(3, -34, 62, '', (102, 117, 131, 1, 'KHTCMKFYARVCRSG'), (102, 165, 179, 2, 'DRQQSQVLDAMQDSF'))
(3, -13, 30, '', (102, 183, 184, 1, 'SG'), (102, 197, 213, 2, 'RELHDPHYFSPIGFPHK'))
(3, 0, 33, '', (102, 347, 364, 1, 'FQSKMLNTSSLLEQLNDQ'), (102, 364, 380, 2, 'QFNWVSQLANLTQGEDK'))
(3, 1, 32, '', (102, 348, 365, 1, 'QSKMLNTSSLLEQLNDQF'), (102, 364, 380, 2, 'QFNWVSQLANLTQGEDK'))
(3, -5, 29, '', (102, 351, 359, 1, 'MLNTSSLLE'), (102, 364, 380, 2, 'QFNWVSQLANLTQGEDK'))
(3, 9, 18, '', (49, 179, 197, 1, 'EAIDDIPFGITSNSGVFSK'), (49, 188, 197, 2, 'ITSNSGVFSK'))
(3, 4, 10, '', (49, 377, 391, 2, 'ANFEEVAFDEKKNVF'), (49, 387, 387, 1, 'K'))
(3, 3, 11, '', (49, 377, 391, 2, 'ANFEEVAFD

(2, -22, 41, '', (254, 738, 755, 1, 'GLEHDNLEAHSPEQPPRA'), (254, 777, 779, 1, 'HSQ'))
(2, -5, 24, '', (254, 755, 772, 1, 'ATDLTARQTEALQNQAQH'), (254, 777, 779, 1, 'HSQ'))
(2, -6, 30, '', (254, 755, 772, 1, 'ATDLTARQTEALQNQAQH'), (254, 778, 785, 1, 'SQVEELER'))
(2, -8, 12, '', (254, 767, 769, 1, 'QNQ'), (254, 777, 779, 1, 'HSQ'))
(2, -9, 18, '', (254, 767, 769, 1, 'QNQ'), (254, 778, 785, 1, 'SQVEELER'))
(2, -1, 8, 'PLTRDFWDN', (253, 19, 20, 1, 'QG'), (253, 21, 27, 1, 'DTDTARR'))
(2, -25, 29, 'PLTRDFWDNLEKETDWVRQEMNKDLEEVKQ', (253, 19, 20, 1, 'QG'), (253, 45, 48, 1, 'PAGP'))
(2, -9, 13, 'VRQEMNKDLEEVKQ', (253, 35, 36, 1, 'PV'), (253, 45, 48, 1, 'PAGP'))
(2, -116, 117, 'VRQEMNKDLEEVKQKVQPYLDEFQKKWKEDVELYRQ', (253, 35, 36, 1, 'PV'), (253, 152, 152, 1, 'K'))
(2, 3, 3, 'EVKQ', (253, 45, 48, 1, 'PAGP'), (253, 45, 48, 1, 'PAGP'))
(2, -104, 107, 'EVKQKVQPYLDEFQKKWKEDVELYRQ', (253, 45, 48, 1, 'PAGP'), (253, 152, 152, 1, 'K'))
(2, 8, 27, '', (253, 155, 172, 1, 'LEAGAIVDHHTPSGESPA'), (253, 164, 18

(2, 4, 10, '', (189, 661, 675, 1, 'PICEHIAENSKDLES'), (189, 671, 671, 1, 'K'))
(2, -7, 21, '', (189, 661, 675, 1, 'PICEHIAENSKDLES'), (189, 682, 682, 1, 'K'))
(2, -17, 31, '', (189, 661, 675, 1, 'PICEHIAENSKDLES'), (189, 692, 692, 1, 'K'))
(2, 13, 17, '', (189, 694, 709, 1, 'SFHYKSQLRNHEREQH'), (189, 696, 711, 1, 'HYKSQLRNHEREQHCL'))
(2, 11, 4, '', (189, 694, 709, 1, 'SFHYKSQLRNHEREQH'), (189, 698, 698, 1, 'K'))
(2, 10, 12, '', (189, 694, 709, 1, 'SFHYKSQLRNHEREQH'), (189, 699, 706, 1, 'SQLRNHER'))
(2, 0, 22, '', (189, 694, 709, 1, 'SFHYKSQLRNHEREQH'), (189, 709, 716, 1, 'HCLPNTLS'))
(2, -23, 38, '', (189, 694, 709, 1, 'SFHYKSQLRNHEREQH'), (189, 732, 732, 1, 'K'))
(2, 16, 9, '', (189, 707, 725, 1, 'EQHCLPNTLSVASNEPRIS'), (189, 709, 716, 1, 'HCLPNTLS'))
(2, -7, 25, '', (189, 707, 725, 1, 'EQHCLPNTLSVASNEPRIS'), (189, 732, 732, 1, 'K'))
(2, -10, 41, '', (189, 707, 725, 1, 'EQHCLPNTLSVASNEPRIS'), (189, 735, 748, 1, 'QEGNKPSTQKQYRC'))
(2, -4, 12, '', (189, 720, 728, 1, 'NEPRISRDA'), (189, 

(2, 13, 18, 'GEAGR', (152, 66, 81, 1, 'MKFSLYFLAYEDKNDI'), (152, 68, 84, 1, 'FSLYFLAYEDKNDIPKD'))
(2, 3, 12, 'GEAGR', (152, 66, 81, 1, 'MKFSLYFLAYEDKNDI'), (152, 78, 78, 1, 'K'))
(2, -2, 17, 'GEAGR', (152, 66, 81, 1, 'MKFSLYFLAYEDKNDI'), (152, 83, 83, 1, 'K'))
(2, -4, 19, 'GEAGR', (152, 66, 81, 1, 'MKFSLYFLAYEDKNDI'), (152, 85, 85, 1, 'K'))
(2, -7, 22, 'GEAGR', (152, 66, 81, 1, 'MKFSLYFLAYEDKNDI'), (152, 88, 88, 1, 'K'))
(2, -15, 30, 'GEAGR', (152, 66, 81, 1, 'MKFSLYFLAYEDKNDI'), (152, 96, 96, 1, 'K'))
(2, 1, 5, '', (152, 73, 79, 1, 'LAYEDKN'), (152, 78, 78, 1, 'K'))
(2, -4, 10, '', (152, 73, 79, 1, 'LAYEDKN'), (152, 83, 83, 1, 'K'))
(2, -6, 12, '', (152, 73, 79, 1, 'LAYEDKN'), (152, 85, 85, 1, 'K'))
(2, -9, 15, '', (152, 73, 79, 1, 'LAYEDKN'), (152, 88, 88, 1, 'K'))
(2, -17, 23, '', (152, 73, 79, 1, 'LAYEDKN'), (152, 96, 96, 1, 'K'))
(2, 1, 1, '', (152, 82, 84, 1, 'PKD'), (152, 83, 83, 1, 'K'))
(2, -1, 3, '', (152, 82, 84, 1, 'PKD'), (152, 85, 85, 1, 'K'))
(2, -4, 6, '', (152, 82, 84,

(2, -8, 28, '', (76, 599, 612, 1, 'FLDHKTLYYDVEPF'), (76, 620, 627, 1, 'NDVKGCHL'))
(2, -9, 39, '', (76, 599, 612, 1, 'FLDHKTLYYDVEPF'), (76, 621, 638, 1, 'DVKGCHLVGYFSKEKHCQ'))
(2, -11, 24, '', (76, 599, 612, 1, 'FLDHKTLYYDVEPF'), (76, 623, 623, 1, 'K'))
(2, 11, 2, '', (76, 601, 614, 1, 'DHKTLYYDVEPFLF'), (76, 603, 603, 1, 'K'))
(2, -4, 31, '', (76, 601, 614, 1, 'DHKTLYYDVEPFLF'), (76, 618, 632, 1, 'TQNDVKGCHLVGYFS'))
(2, -6, 26, '', (76, 601, 614, 1, 'DHKTLYYDVEPFLF'), (76, 620, 627, 1, 'NDVKGCHL'))
(2, -7, 37, '', (76, 601, 614, 1, 'DHKTLYYDVEPFLF'), (76, 621, 638, 1, 'DVKGCHLVGYFSKEKHCQ'))
(2, -9, 22, '', (76, 601, 614, 1, 'DHKTLYYDVEPFLF'), (76, 623, 623, 1, 'K'))
(2, -19, 32, '', (76, 601, 614, 1, 'DHKTLYYDVEPFLF'), (76, 633, 633, 1, 'K'))
(2, -8, 28, '', (76, 604, 610, 1, 'TLYYDVE'), (76, 618, 632, 1, 'TQNDVKGCHLVGYFS'))
(2, -10, 23, '', (76, 604, 610, 1, 'TLYYDVE'), (76, 620, 627, 1, 'NDVKGCHL'))
(2, 3, 17, '', (76, 615, 621, 1, 'YVLTQND'), (76, 618, 632, 1, 'TQNDVKGCHLVGYFS'))

(2, -34, 35, '', (27, 1916, 1917, 1, 'GS'), (27, 1951, 1951, 1, 'K'))
(2, -9, 10, '', (27, 1941, 1942, 1, 'SG'), (27, 1951, 1951, 1, 'K'))
(2, -11, 18, '', (27, 1941, 1942, 1, 'SG'), (27, 1953, 1959, 1, 'LWQTFSC'))
(2, 13, 16, '', (27, 1974, 1988, 1, 'CLLNPCQNQGSCRHL'), (27, 1975, 1990, 1, 'LLNPCQNQGSCRHLQG'))
(2, -22, 42, '', (27, 1974, 1988, 1, 'CLLNPCQNQGSCRHL'), (27, 2010, 2016, 1, 'HRVDQQC'))
(2, -8, 15, '', (27, 2001, 2002, 1, 'SG'), (27, 2010, 2016, 1, 'HRVDQQC'))
(2, -33, 34, '', (27, 2001, 2002, 1, 'SG'), (27, 2035, 2035, 1, 'K'))
(2, -4, 11, '', (27, 2005, 2006, 1, 'GQ'), (27, 2010, 2016, 1, 'HRVDQQC'))
(2, -29, 30, '', (27, 2005, 2006, 1, 'GQ'), (27, 2035, 2035, 1, 'K'))
(2, -5, 7, '', (27, 2044, 2046, 1, 'TNG'), (27, 2051, 2051, 1, 'K'))
(2, -11, 20, '', (27, 2044, 2046, 1, 'TNG'), (27, 2057, 2064, 1, 'PRGSDSCL'))
(2, -4, 5, '', (27, 2046, 2047, 1, 'GQ'), (27, 2051, 2051, 1, 'K'))
(2, -10, 18, '', (27, 2046, 2047, 1, 'GQ'), (27, 2057, 2064, 1, 'PRGSDSCL'))
(2, -5, 20, '', (

(2, -10, 16, '', (162, 602, 608, 1, 'PAEAKSP'), (162, 618, 618, 1, 'K'))
(2, 5, 5, '', (162, 607, 617, 1, 'SPAEAKSPAEA'), (162, 612, 612, 1, 'K'))
(2, -1, 11, '', (162, 607, 617, 1, 'SPAEAKSPAEA'), (162, 618, 618, 1, 'K'))
(2, -7, 17, '', (162, 607, 617, 1, 'SPAEAKSPAEA'), (162, 624, 624, 1, 'K'))
(2, -13, 23, '', (162, 607, 617, 1, 'SPAEAKSPAEA'), (162, 630, 630, 1, 'K'))
(2, 2, 4, '', (162, 608, 614, 1, 'PAEAKSP'), (162, 612, 612, 1, 'K'))
(2, -4, 10, '', (162, 608, 614, 1, 'PAEAKSP'), (162, 618, 618, 1, 'K'))
(2, -10, 16, '', (162, 608, 614, 1, 'PAEAKSP'), (162, 624, 624, 1, 'K'))
(2, 5, 5, '', (162, 613, 623, 1, 'SPAEAKSPAEA'), (162, 618, 618, 1, 'K'))
(2, -1, 11, '', (162, 613, 623, 1, 'SPAEAKSPAEA'), (162, 624, 624, 1, 'K'))
(2, -7, 17, '', (162, 613, 623, 1, 'SPAEAKSPAEA'), (162, 630, 630, 1, 'K'))
(2, -13, 23, '', (162, 613, 623, 1, 'SPAEAKSPAEA'), (162, 636, 636, 1, 'K'))
(2, 2, 4, '', (162, 614, 620, 1, 'PAEAKSP'), (162, 618, 618, 1, 'K'))
(2, -4, 10, '', (162, 614, 620, 1, '

(2, 3, 28, '', (179, 152, 166, 1, 'KAYSEAHEISKEHMQ'), (179, 163, 180, 1, 'EHMQPTHPIRLGLALNYS'))
(2, -7, 37, '', (179, 152, 166, 1, 'KAYSEAHEISKEHMQ'), (179, 173, 189, 1, 'LGLALNYSVFYYEIQNA'))
(2, -11, 40, '', (179, 152, 166, 1, 'KAYSEAHEISKEHMQ'), (179, 177, 192, 1, 'LNYSVFYYEIQNAPEQ'))
(2, -9, 10, '', (179, 188, 189, 1, 'NA'), (179, 198, 198, 1, 'K'))
(2, -28, 29, '', (179, 188, 189, 1, 'NA'), (179, 217, 217, 1, 'K'))
(2, 6, 2, '', (179, 215, 223, 1, 'SYKDSTLIM'), (179, 217, 217, 1, 'K'))
(2, 0, 24, '', (179, 215, 223, 1, 'SYKDSTLIM'), (179, 223, 239, 1, 'MQLLRDNLTLWTSDQQD'))
(2, -7, 8, 'AFVANNGTM', (172, 17, 18, 1, 'SG'), (172, 25, 25, 1, 'K'))
(2, -14, 15, 'AFVANNGTMAGCQAKS', (172, 17, 18, 1, 'SG'), (172, 32, 32, 1, 'K'))
(2, 15, 2, 'KDC', (172, 64, 81, 1, 'ESKPSESNYSSVDNLNLL'), (172, 66, 66, 1, 'K'))
(2, 15, 16, 'KDCRLTM', (172, 64, 81, 1, 'ESKPSESNYSSVDNLNLL'), (172, 66, 80, 1, 'KPSESNYSSVDNLNL'))
(2, -6, 23, 'KDCRLTM', (172, 64, 81, 1, 'ESKPSESNYSSVDNLNLL'), (172, 87, 87, 1, 'K')

(2, -28, 30, '', (211, 103, 105, 1, 'GTN'), (211, 133, 133, 1, 'K'))
(2, -1, 2, '', (211, 153, 154, 1, 'NA'), (211, 155, 155, 1, 'K'))
(2, -7, 8, '', (211, 153, 154, 1, 'NA'), (211, 161, 161, 1, 'K'))
(2, -12, 13, '', (211, 153, 154, 1, 'NA'), (211, 166, 166, 1, 'K'))
(2, 18, 1, '', (211, 165, 184, 1, 'PKLANVQLLDIDGGFVHSDG'), (211, 166, 166, 1, 'K'))
(2, 6, 22, '', (211, 165, 184, 1, 'PKLANVQLLDIDGGFVHSDG'), (211, 178, 187, 1, 'GFVHSDGAIS'))
(2, -20, 39, '', (211, 165, 184, 1, 'PKLANVQLLDIDGGFVHSDG'), (211, 204, 204, 1, 'K'))
(2, 6, 18, '', (211, 169, 184, 1, 'NVQLLDIDGGFVHSDG'), (211, 178, 187, 1, 'GFVHSDGAIS'))
(2, -20, 35, '', (211, 169, 184, 1, 'NVQLLDIDGGFVHSDG'), (211, 204, 204, 1, 'K'))
(2, 11, 12, '', (211, 175, 189, 1, 'IDGGFVHSDGAISCH'), (211, 178, 187, 1, 'GFVHSDGAIS'))
(2, -15, 29, '', (211, 175, 189, 1, 'IDGGFVHSDGAISCH'), (211, 204, 204, 1, 'K'))
(2, 8, 9, '', (211, 178, 186, 1, 'GFVHSDGAI'), (211, 178, 187, 1, 'GFVHSDGAIS'))
(2, -18, 26, '', (211, 178, 186, 1, 'GFVHSDGAI

(2, -15, 29, 'PDNTYEVKIDNSQVESGSLEDDWDFLPPKK', (143, 38, 52, 1, 'FVHRDTPENNPDTPF'), (143, 67, 67, 1, 'K'))
(2, 10, 11, '', (143, 105, 121, 1, 'QVPPMRVYEVATFYTMY'), (143, 111, 116, 1, 'VYEVAT'))
(2, -3, 19, '', (143, 105, 121, 1, 'QVPPMRVYEVATFYTMY'), (143, 124, 124, 1, 'K'))
(2, -7, 23, '', (143, 105, 121, 1, 'QVPPMRVYEVATFYTMY'), (143, 128, 128, 1, 'K'))
(2, -10, 41, '', (143, 105, 121, 1, 'QVPPMRVYEVATFYTMY'), (143, 131, 146, 1, 'IQVCTTTPCMLRDSDS'))
(2, -4, 10, '', (143, 106, 107, 1, 'VP'), (143, 111, 116, 1, 'VYEVAT'))
(2, -17, 18, '', (143, 106, 107, 1, 'VP'), (143, 124, 124, 1, 'K'))
(2, 11, 7, '', (143, 109, 122, 1, 'MRVYEVATFYTMYN'), (143, 111, 116, 1, 'VYEVAT'))
(2, -2, 15, '', (143, 109, 122, 1, 'MRVYEVATFYTMYN'), (143, 124, 124, 1, 'K'))
(2, -6, 19, '', (143, 109, 122, 1, 'MRVYEVATFYTMYN'), (143, 128, 128, 1, 'K'))
(2, -9, 37, '', (143, 109, 122, 1, 'MRVYEVATFYTMYN'), (143, 131, 146, 1, 'IQVCTTTPCMLRDSDS'))
(2, -10, 38, '', (143, 109, 122, 1, 'MRVYEVATFYTMYN'), (143, 132, 147

(2, 8, 15, '', (73, 32, 40, 1, 'RMFASFPTT'), (73, 32, 47, 1, 'RMFASFPTTKTYFPHF'))
(2, -1, 9, '', (73, 32, 40, 1, 'RMFASFPTT'), (73, 41, 41, 1, 'K'))
(2, -11, 22, '', (73, 32, 40, 1, 'RMFASFPTT'), (73, 51, 54, 1, 'HGSA'))
(2, 8, 4, '', (73, 50, 59, 1, 'SHGSAQVKGH'), (73, 51, 54, 1, 'HGSA'))
(2, 2, 7, '', (73, 50, 59, 1, 'SHGSAQVKGH'), (73, 57, 57, 1, 'K'))
(2, -2, 11, '', (73, 50, 59, 1, 'SHGSAQVKGH'), (73, 61, 61, 1, 'K'))
(2, -3, 12, '', (73, 50, 59, 1, 'SHGSAQVKGH'), (73, 62, 62, 1, 'K'))
(2, -32, 41, '', (73, 50, 59, 1, 'SHGSAQVKGH'), (73, 91, 91, 1, 'K'))
(2, -4, 5, '', (73, 52, 53, 1, 'GS'), (73, 57, 57, 1, 'K'))
(2, -8, 9, '', (73, 52, 53, 1, 'GS'), (73, 61, 61, 1, 'K'))
(2, -9, 10, '', (73, 52, 53, 1, 'GS'), (73, 62, 62, 1, 'K'))
(2, -38, 39, '', (73, 52, 53, 1, 'GS'), (73, 91, 91, 1, 'K'))
(2, -2, 7, '', (73, 93, 98, 1, 'RVDPVN'), (73, 100, 100, 1, 'K'))
(2, -18, 42, '', (73, 93, 98, 1, 'RVDPVN'), (73, 116, 135, 1, 'ADFTPAVHASLDKFLASVST'))
(2, -3, 4, '', (73, 96, 97, 1, 'PV'), 

(2, -12, 28, '', (7, 119, 135, 1, 'HKLATDKNDPHLCDFIE'), (7, 147, 147, 1, 'K'))
(2, -2, 8, '', (7, 136, 142, 1, 'TYYLSEQ'), (7, 144, 144, 1, 'K'))
(2, -5, 11, '', (7, 136, 142, 1, 'TYYLSEQ'), (7, 147, 147, 1, 'K'))
(2, -6, 19, '', (7, 136, 142, 1, 'TYYLSEQ'), (7, 148, 155, 1, 'ELGDHVTN'))
(2, -16, 22, '', (7, 136, 142, 1, 'TYYLSEQ'), (7, 158, 158, 1, 'K'))
(2, 14, 1, '', (7, 157, 172, 1, 'RKMGAPEAGMAEYLFD'), (7, 158, 158, 1, 'K'))
(2, -1, 16, '', (7, 157, 172, 1, 'RKMGAPEAGMAEYLFD'), (7, 173, 173, 1, 'K'))
(2, 13, 0, '', (7, 158, 171, 1, 'KMGAPEAGMAEYLF'), (7, 158, 158, 1, 'K'))
(2, -2, 15, '', (7, 158, 171, 1, 'KMGAPEAGMAEYLF'), (7, 173, 173, 1, 'K'))
(2, -1, 2, 'CLK', (277, 20, 21, 1, 'SG'), (277, 22, 22, 1, 'K'))
(2, -5, 6, 'CLKQNDT', (277, 20, 21, 1, 'SG'), (277, 26, 26, 1, 'K'))
(2, -10, 11, 'CLKQNDTYINGI', (277, 20, 21, 1, 'SG'), (277, 31, 31, 1, 'K'))
(2, -1, 2, 'YGF', (277, 45, 46, 1, 'GS'), (277, 47, 47, 1, 'K'))
(2, -18, 19, 'YGFLKDMGLKVFTNLNIRKP', (277, 45, 46, 1, 'GS'), (277

(2, 11, 19, '', (203, 174, 190, 1, 'AGWKPGSDTIKPNVDDS'), (203, 179, 193, 1, 'GSDTIKPNVDDSKEY'))
(2, 6, 10, '', (203, 174, 190, 1, 'AGWKPGSDTIKPNVDDS'), (203, 184, 184, 1, 'K'))
(2, -1, 17, '', (203, 174, 190, 1, 'AGWKPGSDTIKPNVDDS'), (203, 191, 191, 1, 'K'))
(2, -2, 21, '', (203, 174, 190, 1, 'AGWKPGSDTIKPNVDDS'), (203, 192, 195, 1, 'EYFS'))
(2, -6, 22, '', (203, 174, 190, 1, 'AGWKPGSDTIKPNVDDS'), (203, 196, 196, 1, 'K'))
(2, 1, 14, '', (203, 179, 180, 1, 'GS'), (203, 179, 193, 1, 'GSDTIKPNVDDSKEY'))
(2, -4, 5, '', (203, 179, 180, 1, 'GS'), (203, 184, 184, 1, 'K'))
(2, -11, 12, '', (203, 179, 180, 1, 'GS'), (203, 191, 191, 1, 'K'))
(2, -5, 6, '', (199, 32, 33, 1, 'PV'), (199, 38, 38, 1, 'K'))
(2, -20, 37, '', (199, 32, 33, 1, 'PV'), (199, 53, 69, 1, 'GISAGQLYSTQAAEDKE'))
(2, 14, 21, '', (199, 48, 67, 1, 'QSLASGISAGQLYSTQAAED'), (199, 53, 69, 1, 'GISAGQLYSTQAAEDKE'))
(2, -1, 20, '', (199, 48, 67, 1, 'QSLASGISAGQLYSTQAAED'), (199, 68, 68, 1, 'K'))
(2, 15, 20, '', (199, 49, 68, 1, 'SLASGI

(2, -19, 34, '', (96, 143, 158, 1, 'YMVEWYTKSHGLLIEQ'), (96, 177, 177, 1, 'K'))
(2, -3, 4, '', (96, 158, 159, 1, 'QG'), (96, 162, 162, 1, 'K'))
(2, -5, 6, '', (96, 158, 159, 1, 'QG'), (96, 164, 164, 1, 'K'))
(2, -7, 8, '', (96, 158, 159, 1, 'QG'), (96, 166, 166, 1, 'K'))
(2, -18, 19, '', (96, 158, 159, 1, 'QG'), (96, 177, 177, 1, 'K'))
(2, 5, 1, '', (96, 176, 182, 1, 'LKEGYEN'), (96, 177, 177, 1, 'K'))
(2, -4, 10, '', (96, 176, 182, 1, 'LKEGYEN'), (96, 186, 186, 1, 'K'))
(2, -16, 31, '', (96, 176, 182, 1, 'LKEGYEN'), (96, 198, 207, 1, 'SAGIGDVLEE'))
(2, -4, 14, '', (96, 193, 194, 1, 'PV'), (96, 198, 207, 1, 'SAGIGDVLEE'))
(2, -22, 37, '', (96, 193, 194, 1, 'PV'), (96, 216, 230, 1, 'HSNVKVVSNFMDFDE'))
(2, 8, 10, '', (96, 197, 206, 1, 'FSAGIGDVLE'), (96, 198, 207, 1, 'SAGIGDVLEE'))
(2, -10, 33, '', (96, 197, 206, 1, 'FSAGIGDVLE'), (96, 216, 230, 1, 'HSNVKVVSNFMDFDE'))
(2, 5, 9, '', (96, 226, 240, 1, 'MDFDENGVLKGFKGE'), (96, 235, 235, 1, 'K'))
(2, 2, 12, '', (96, 226, 240, 1, 'MDFDENGVLKG

(2, 0, 16, '', (131, 726, 742, 1, 'SPAQRDEDAVLASGDEK'), (131, 742, 742, 1, 'K'))
(2, -4, 20, '', (131, 726, 742, 1, 'SPAQRDEDAVLASGDEK'), (131, 746, 746, 1, 'K'))
(2, -11, 34, '', (131, 726, 742, 1, 'SPAQRDEDAVLASGDEK'), (131, 753, 760, 1, 'FVDTNAPA'))
(2, -1, 15, '', (131, 728, 735, 1, 'AQRDEDAV'), (131, 736, 743, 1, 'LASGDEKD'))
(2, -6, 19, '', (131, 728, 735, 1, 'AQRDEDAV'), (131, 741, 747, 1, 'EKDEGKE'))
(2, -7, 14, '', (131, 728, 735, 1, 'AQRDEDAV'), (131, 742, 742, 1, 'K'))
(2, -11, 18, '', (131, 728, 735, 1, 'AQRDEDAV'), (131, 746, 746, 1, 'K'))
(2, -2, 9, '', (131, 738, 739, 1, 'SG'), (131, 741, 747, 1, 'EKDEGKE'))
(2, -3, 4, '', (131, 738, 739, 1, 'SG'), (131, 742, 742, 1, 'K'))
(2, -7, 8, '', (131, 738, 739, 1, 'SG'), (131, 746, 746, 1, 'K'))
(2, -14, 22, '', (131, 738, 739, 1, 'SG'), (131, 753, 760, 1, 'FVDTNAPA'))
(2, 2, 8, '', (131, 739, 743, 1, 'GDEKD'), (131, 741, 747, 1, 'EKDEGKE'))
(2, 1, 3, '', (131, 739, 743, 1, 'GDEKD'), (131, 742, 742, 1, 'K'))
(2, -3, 7, '', (131,

(2, 14, 21, '', (128, 90, 107, 1, 'YDVQELRRAMKGAGTDEG'), (128, 93, 111, 1, 'QELRRAMKGAGTDEGCLIE'))
(2, 7, 10, '', (128, 90, 107, 1, 'YDVQELRRAMKGAGTDEG'), (128, 100, 100, 1, 'K'))
(2, 1, 16, '', (128, 168, 185, 1, 'DALMKQDAQELYEAGEKR'), (128, 184, 184, 1, 'K'))
(2, -6, 25, '', (128, 168, 185, 1, 'DALMKQDAQELYEAGEKR'), (128, 191, 193, 1, 'VKF'))
(2, -28, 45, '', (128, 168, 185, 1, 'DALMKQDAQELYEAGEKR'), (128, 213, 213, 1, 'K'))
(2, 2, 14, '', (128, 170, 186, 1, 'LMKQDAQELYEAGEKRW'), (128, 184, 184, 1, 'K'))
(2, -5, 23, '', (128, 170, 186, 1, 'LMKQDAQELYEAGEKRW'), (128, 191, 193, 1, 'VKF'))
(2, -27, 43, '', (128, 170, 186, 1, 'LMKQDAQELYEAGEKRW'), (128, 213, 213, 1, 'K'))
(2, 3, 4, '', (128, 209, 216, 1, 'FDEYKRIS'), (128, 213, 213, 1, 'K'))
(2, -2, 9, '', (128, 209, 216, 1, 'FDEYKRIS'), (128, 218, 218, 1, 'K'))
(2, -9, 16, '', (128, 209, 216, 1, 'FDEYKRIS'), (128, 225, 225, 1, 'K'))
(2, -25, 32, '', (128, 209, 216, 1, 'FDEYKRIS'), (128, 241, 241, 1, 'K'))
(2, -4, 5, '', (128, 304, 305, 

(2, -14, 32, '', (25, 117, 127, 1, 'GIPDTGSASRP'), (25, 141, 149, 1, 'PKTQEQCGV'))
(2, 6, 19, '', (25, 119, 135, 1, 'PDTGSASRPDTPGTAQK'), (25, 129, 138, 1, 'TPGTAQKSAE'))
(2, 2, 21, '', (25, 119, 135, 1, 'PDTGSASRPDTPGTAQK'), (25, 133, 140, 1, 'AQKSAESN'))
(2, 0, 16, '', (25, 119, 135, 1, 'PDTGSASRPDTPGTAQK'), (25, 135, 135, 1, 'K'))
(2, -6, 30, '', (25, 119, 135, 1, 'PDTGSASRPDTPGTAQK'), (25, 141, 149, 1, 'PKTQEQCGV'))
(2, -7, 23, '', (25, 119, 135, 1, 'PDTGSASRPDTPGTAQK'), (25, 142, 142, 1, 'K'))
(2, -6, 16, '', (25, 122, 123, 1, 'GS'), (25, 129, 138, 1, 'TPGTAQKSAE'))
(2, -10, 18, '', (25, 122, 123, 1, 'GS'), (25, 133, 140, 1, 'AQKSAESN'))
(2, 6, 17, 'SQERLDTAPARLEARDRG', (192, 14, 22, 1, 'PPSLTDCIG'), (192, 16, 31, 1, 'SLTDCIGTVDSRAESI'))
(2, 0, 15, 'SQERLDTAPARLEARD', (192, 14, 22, 1, 'PPSLTDCIG'), (192, 22, 29, 1, 'GTVDSRAE'))
(2, -11, 19, 'SQERLDTAPARLEARDRGST', (192, 14, 22, 1, 'PPSLTDCIG'), (192, 33, 33, 1, 'K'))
(2, -7, 8, '', (192, 92, 93, 1, 'AN'), (192, 100, 100, 1, 'K'))


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [10]:
def make_cluster_dict(b_clusters, y_clusters):
    B = {}
    for l in b_clusters:
        if l.pid not in B:
            B[l.pid] = []

    B[l.pid].append(l)
    
    Y = {}
    for l in y_clusters:
        if l.pid not in Y:
            Y[l.pid] = []
    Y[l.pid].append(l)
    
    return B, Y

def bsearch(key, Y):
    lo = -1
    hi = len(Y)

    mid = -1
    while (hi - lo > 1):
        mid = int((hi+lo) / 2)
        if Y[mid].start < key:
            lo = mid
        else:
            hi = mid
    return hi

In [11]:
B, Y = make_cluster_dict(b_sorted_clusters, y_sorted_clusters)
for pid in B:
    if pid not in Y:
        continue
        
sorted_B = sorted(B[pid], key=operator.attrgetter('pid', 'start', 'end'))
sorted_Y = sorted(Y[pid], key=operator.attrgetter('pid', 'start', 'end'))

for b in sorted_B:
    y_i = bsearch(b.start, sorted_Y)

    if y_i >= len(sorted_Y): break

    y = sorted_Y[y_i]

    while y_i < len(sorted_Y) and y.start - b.end < 10:
        y = sorted_Y[y_i]
        seq = get_seq(fa, b.pid, b.start, y.end)
        print(b.score + y.score, b.end - y.start, y.end - b.start, seq, min_info(b), min_info(y))
        y_i += 1

{240: [], 267: [], 264: [], 261: [], 254: [], 253: [], 249: [], 247: [], 241: [], 235: [], 202: [], 191: [], 190: [], 189: [], 187: [], 178: [], 175: [], 174: [], 166: [], 159: [], 158: [], 152: [], 149: [], 140: [], 136: [], 127: [], 124: [], 111: [], 98: [], 90: [], 86: [], 78: [], 76: [], 75: [], 70: [], 68: [], 67: [], 58: [], 54: [], 51: [], 50: [], 36: [], 33: [], 27: [], 23: [], 6: [], 209: [], 85: [], 255: [], 122: [], 162: [], 153: [], 20: [], 278: [], 137: [], 57: [], 207: [], 270: [], 242: [], 220: [], 179: [], 172: [], 106: [], 104: [], 97: [], 1: [], 269: [], 252: [], 245: [], 239: [], 238: [], 237: [], 230: [], 227: [], 226: [], 222: [], 219: [], 216: [], 214: [], 211: [], 204: [], 198: [], 196: [], 194: [], 184: [], 180: [], 169: [], 160: [], 157: [], 156: [], 150: [], 146: [], 145: [], 143: [], 141: [], 139: [], 129: [], 118: [], 110: [], 109: [], 95: [], 77: [], 73: [], 64: [], 56: [], 55: [], 53: [], 45: [], 44: [], 40: [], 39: [], 26: [], 24: [], 22: [], 18: [], 16: 

In [127]:
# b side starting. Ideally, we would probably pick the higher scoring side to start
filtered_b, filtered_y = get_top(b_sorted_clusters, y_sorted_clusters, 150)
target_precursor = input_spectrum.precursor_mass
#Start with printing overlapping. Then will incorportate boundary overlaps between last of b and first of y
interesting_combos = []

for b_cluster in filtered_b:
    for y_cluster in filtered_y:
        if b_cluster.start <= y_cluster.end:
            seq = b_cluster.seq + '-' + y_cluster.seq
            b_indices = parse_indices(b_cluster.indices)
            y_indices = parse_indices(y_cluster.indices)
#             score = brutal_calc_combined_score(b_indices, y_indices, b_cluster.score, y_cluster.score)
#             tup = (seq, score)
#             interesting_combos.append(tup)
            evaluation = filter_by_validity(b_cluster, y_cluster)
            if evaluation == True:
                comb_seq, score_add, hybrid, overlap = combine(b_cluster, y_cluster)
                if filter_by_precursor(comb_seq, input_spectrum.precursor_charge, overlap, input_spectrum.precursor_mass):
                    if filter_by_dist(b_cluster, y_cluster:)
                        tup = (comb_seq, b_cluster.score + y_cluster.score + score_add)
                        interesting_combos.append(tup)

interesting_combos.sort(key=lambda a: a[1], reverse=True)
[print(x) for x in interesting_combos]

('GSGAVALCPE-VTVTVLDVNDNRPEFTMK', 8)
('IFVGTPYYGYVYEDTLP-TPSGDYSWSLQVQAK', 7)
('GSPGSAGLVQHLEEYAAT-VSSVSSARSGRAQDQDSQRG', 7)
('GSAGLMLVEFFAPWCG-VAENFDDIVNEEDKDVLI', 7)
('GQVLPEMEIHLQTDAKKGT-QFGVGFYSAFMVADK', 7)
('VPEMKPSMFDVSRELGSSV-TSLPDYASRMQAGTRN', 7)
('GSQYRVESMMLRIAKPM-FVK', 7)
('GQSTVPPCTASPEPVKAAEQ-TEGGGSEALPCPGPPAG', 7)
('GQVRLTYSTGESNTVVSPTV-VSSVSSARSGRAQDQDSQRG', 7)
('PVPVDCSDEAGNSALQLA-DRSHTCHNGKADPTK', 7)
('PVPVDCSDEAGNSALQLA-DEPELALDSTMRAPPQ', 7)
('GSGAVALCPE-NYMKCVSVLRFDSSAPFL', 7)
('GSGAVALCPE-VSSVSSARSGRAQDQDSQRG', 7)
('QGSPVVQMAEDAVDGER-DVPDHKDLNMDVSFHLP', 7)
('QGSPGAWAPLDPTSGSSA-VEFLPVYQPSLEESKDPT', 7)
('GQGQSPSIRQ-TLATMLACLQACAGSVSQ', 7)
('DGPVQGTIHFEQKASGEPVV-TGLTEGQHGFHVHQYG', 7)
('SNSQQAYQEAFEI-VVWTSEYDPLASNPGWKK', 6)
('SNSQQAYQEAFEI-VTVTVLDVNDNRPEFTMK', 6)
('SNSQQAYQEAFEI-YEAGEKRWGTDEVK', 6)
('IFVGTPYYGYVYEDTLP-VVWTSEYDPLASNPGWKK', 6)
('QVNQSLASMPPMNPATTLP-VVWTSEYDPLASNPGWKK', 6)
('MAEVDAAMAARPHSIDGRVV-VVWTSEYDPLASNPGWKK', 6)
('MAEVDAAMAARPHSIDGRVV-YEAGEKRWGTDEVK

('MAEVDAAMAARPHSIDGRVV-TSLPSQEHVDPQATGDS', 5)
('MAEVDAAMAARPHSIDGRVV-EHVDPQATGDSERGLSA', 5)
('MAEVDAAMAARPHSIDGRVV-TDDTSDPTSKEFEALI', 5)
('MAEVDAAMAARPHSIDGRVV-TKWEMAAQLREYQD', 5)
('MAEVDAAMAARPHSIDGRVV-THSSFSSTVKDKAASES', 5)
('MAEVDAAMAARPHSIDGRVV-TPTGRLMNRFSKDMD', 5)
('MAEVDAAMAARPHSIDGRVV-TYMEEMLGNVAGARQV', 5)
('MAEVDAAMAARPHSIDGRVV-TQERFGDKDSKMLVD', 5)
('MAEVDAAMAARPHSIDGRVV-TTSSDIRAMSPLDSSNS', 5)
('MAEVDAAMAARPHSIDGRVV-TEGSEFLRLQVEGGGCS', 5)
('MAEVDAAMAARPHSIDGRVV-DRSHTCHNGKADPTKT', 5)
('MAEVDAAMAARPHSIDGRVV-DRQQSQVLDAMQDSF', 5)
('MAEVDAAMAARPHSIDGRVV-LTCVPEHTHPFKVGTC', 5)
('MAEVDAAMAARPHSIDGRVV-ASEALPSEGKGELEHSQ', 5)
('MAEVDAAMAARPHSIDGRVV-YFDSGDYNMAKAKMK', 5)
('MAEVDAAMAARPHSIDGRVV-DEAASAPAIPEGVPTDTK', 5)
('MAEVDAAMAARPHSIDGRVV-GSFLAANMQDSRENTK', 5)
('MAEVDAAMAARPHSIDGRVV-YFDSGDYNMAKAKMK', 5)
('MAEVDAAMAARPHSIDGRVV-LPGAPPQQLQYGQQQPMVP', 5)
('MAEVDAAMAARPHSIDGRVV-EFFNGKEPSRGINPDEAVA', 5)
('MAEVDAAMAARPHSIDGRVV-LQVDHMNLLKQFEHLDP', 5)
('MAEVDAAMAARPHSIDGRVV-MKWTDLALQGLHENVPPAG', 5)

('GSGAVALCPE-VFK', 5)
('GSGAVALCPE-FVK', 5)
('GSGAVALCPE-FVK', 5)
('GSGAVALCPE-TSLPSQEHVDPQATGDS', 5)
('GSGAVALCPE-EHVDPQATGDSERGLSA', 5)
('GSGAVALCPE-TDDTSDPTSKEFEALI', 5)
('GSGAVALCPE-TKWEMAAQLREYQD', 5)
('GSGAVALCPE-THSSFSSTVKDKAASES', 5)
('GSGAVALCPE-TPTGRLMNRFSKDMD', 5)
('GSGAVALCPE-TYMEEMLGNVAGARQV', 5)
('GSGAVALCPE-TQERFGDKDSKMLVD', 5)
('GSGAVALCPE-TTSSDIRAMSPLDSSNS', 5)
('GSGAVALCPE-TEGSEFLRLQVEGGGCS', 5)
('GSGAVALCPE-DRSHTCHNGKADPTKT', 5)
('GSGAVALCPE-DRQQSQVLDAMQDSF', 5)
('GSGAVALCPE-EVTFNSLLCPTGAEVS', 5)
('GSGAVALCPE-RRNTGGKGGDYALAPGSQSS', 5)
('GSGAVALCPE-LTCVPEHTHPFKVGTC', 5)
('GSGAVALCPE-TSLPDYASRMQAGTRN', 5)
('GSGAVALCPE-TLATMLACLQACAGSVSQ', 5)
('GSGAVALCPE-TNTSHVMQYGNKSIST', 5)
('GSGAVALCPE-TEKSHPSEEELLSQPG', 5)
('GSGAVALCPE-TTDKNGLARFSINTDD', 5)
('GSGAVALCPE-ASEALPSEGKGELEHSQ', 5)
('GSGAVALCPE-DRSHTCHNGKADPTK', 5)
('GSGAVALCPE-QFGVGFYSAFMVADK', 5)
('GSGAVALCPE-TPSGDYSWSLQVQAK', 5)
('GSGAVALCPE-KEDALEDTRDSEMK', 5)
('GSGAVALCPE-SKDINAYNGETPTEK', 5)
('GSGAVALCPE-EFGNETWGVT

('GSQANS-VLVSLSAAGRDEGNYLDDAL', 5)
('GSQANS-VNFHFILFNNVDGHLYE', 5)
('GSQANS-VEFLPVYQPSLEESKDPT', 5)
('GSQANS-VDGDGGLNNRLVKLSQDFM', 5)
('GSQANS-VMEMMSQKIQQLTALGAAQ', 5)
('GSQANS-VSSVSSARSGRAQDQDSQRG', 5)
('GSQANS-VRALNEQACRDGSSIQIAF', 5)
('GSQANS-NPGVHEPGEPEFKYIG', 5)
('GSQANS-CDLCGVKF', 5)
('PPQLNVMNQMQQEK-VVWTSEYDPLASNPGWKK', 5)
('PPQLNVMNQMQQEK-VTVTVLDVNDNRPEFTMK', 5)
('PPQLNVMNQMQQEK-YEAGEKRWGTDEVK', 5)
('SFVDQYGQRDDGKIG-VVWTSEYDPLASNPGWKK', 5)
('SFVDQYGQRDDGKIG-VCPTEIIAFSDHAEDFRK', 5)
('SFVDQYGQRDDGKIG-VTVTVLDVNDNRPEFTMK', 5)
('SFVDQYGQRDDGKIG-YEAGEKRWGTDEVK', 5)
('SFVDQYGQRDDGKIG-GRPSGEAFVELESEDEVK', 5)
('SFVDQYGQRDDGKIG-DVHPSDLKPKGDDKDPSK', 5)
('TPGSGKNYAGVFMDAGL-VTVTVLDVNDNRPEFTMK', 5)
('PQPEHPLRADLAEEYSKD-GLGNKTYEHFNAMGK', 5)
('PQPEHPLRADLAEEYSKD-VTVTVLDVNDNRPEFTMK', 5)
('PQPEHPLRADLAEEYSKD-YEAGEKRWGTDEVK', 5)
('NGTDPEGDPISYHI-VVWTSEYDPLASNPGWKK', 5)
('NGTDPEGDPISYHI-VCPTEIIAFSDHAEDFRK', 5)
('NGTDPEGDPISYHI-VTVTVLDVNDNRPEFTMK', 5)
('NGTDPEGDPISYHI-YEAGEKRWGTDEVK', 5)
('NGTDPEGD

('PPQLNVMNQMQQEK-INNRIADKAFYQQPDAD', 4)
('PPQLNVMNQMQQEK-GLEHDNLEAHSPEQPP', 4)
('PPQLNVMNQMQQEK-DEPELALDSTMRAPPQ', 4)
('PPQLNVMNQMQQEK-VHRQRQPYQCPICEHIA', 4)
('PPQLNVMNQMQQEK-VQTPLPGAPPQQLQYGQQQ', 4)
('PPQLNVMNQMQQEK-VAPFMGFIQERAWLREQ', 4)
('PPQLNVMNQMQQEK-VYNPFQKEMLTYLLDGF', 4)
('PPQLNVMNQMQQEK-VCGLQKDLNSLPYGDLTEI', 4)
('PPQLNVMNQMQQEK-VLVSLSAAGRDEGNYLDDAL', 4)
('PPQLNVMNQMQQEK-VNFHFILFNNVDGHLYE', 4)
('PPQLNVMNQMQQEK-VEFLPVYQPSLEESKDPT', 4)
('PPQLNVMNQMQQEK-VDGDGGLNNRLVKLSQDFM', 4)
('PPQLNVMNQMQQEK-VSSVSSARSGRAQDQDSQRG', 4)
('PPQLNVMNQMQQEK-NPGVHEPGEPEFKYIG', 4)
('PPQLNVMNQMQQEK-CDLCGVKF', 4)
('SFVDQYGQRDDGKIG-VFK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-VFK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-VFK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK', 4)
('SFVDQYGQRDDGKIG-FVK

('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-VFK', 4)
('NGTDPEGDPISYHI-VFK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-VFK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-FVK', 4)
('NGTDPEGDPISYHI-LSDPGNYHEFCRFLA', 4)
('NGTDPEGDPISYHI-TSLPSQEHVDPQATGDS', 4)
('NGTDPEGDPISYHI-EHVDPQATGDSERGLSA', 4)
('NGTDPEGDPISYHI-TDDTSDPTSKEFEALI', 4)
('NGTDPEGDPISYHI-TKWEMAAQLREYQD', 4)
('NGTDPEGDPISYHI-THSSFSSTVKDKAASES', 4)
('NGTDPEGDPISYHI-TPTGRLMNRFSKDMD', 4)
('NGTDPEGDPISYHI-TYMEEMLGNVAGARQV', 4)
('NGTDPEGDPISYHI-TQERFGDKDSKMLVD', 4)
('NGTDPEGDPISYHI-TTSSDIRAMSPLDSSNS', 4)
('NGTDPEGDPISYHI-TEGSEFLRLQVEGGGCS', 4)
('NGTDPEGDPISYHI-EVTFNSLLCPTGAEVS', 4)
('NGTDPEGDPISYHI-RRNTGGKGGDYALAPGSQSS', 4)
('NGTDPEGDPISYHI-LTCVPEHTHPFKVGTC', 4)
('NGTDPEGDPISYHI-TSLPDYASRMQAGTRN', 4)
('NGTDPEGDPISYHI-TLATMLACLQACAGSVSQ', 4)
('NGTD

('NGQAACHSAQGRWE-VVFQETEDTPLDQCKVLI', 4)
('NGQAACHSAQGRWE-VKFTRSPESYFSPGKAFE', 4)
('NGQAACHSAQGRWE-VKDVIEQYSAGHLDMLCR', 4)
('NGQAACHSAQGRWE-VRSMNNTNIQWSAILSWG', 4)
('NGQAACHSAQGRWE-SSFLVYLVEKDANKEFST', 4)
('NGQAACHSAQGRWE-EMLWHVLERTFMRDTI', 4)
('NGQAACHSAQGRWE-NYMKCVSVLRFDSSAPFL', 4)
('NGQAACHSAQGRWE-GLEHDNLEAHSPEQPP', 4)
('NGQAACHSAQGRWE-DEPELALDSTMRAPPQ', 4)
('NGQAACHSAQGRWE-VHRQRQPYQCPICEHIA', 4)
('NGQAACHSAQGRWE-VQTPLPGAPPQQLQYGQQQ', 4)
('NGQAACHSAQGRWE-VAPFMGFIQERAWLREQ', 4)
('NGQAACHSAQGRWE-VYNPFQKEMLTYLLDGF', 4)
('NGQAACHSAQGRWE-VCGLQKDLNSLPYGDLTEI', 4)
('NGQAACHSAQGRWE-VEFLPVYQPSLEESKDPT', 4)
('NGQAACHSAQGRWE-VSSVSSARSGRAQDQDSQRG', 4)
('NGQAACHSAQGRWE-NPGVHEPGEPEFKYIG', 4)
('NGQAACHSAQGRWE-CDLCGVKF', 4)
('VLDEELEGVSPDELKD-TEGGGSEALPCPGPPAG', 4)
('VLDEELEGVSPDELKD-LGSHTDEMLWHVLE', 4)
('VLDEELEGVSPDELKD-GDCYSRLTTEQSH', 4)
('VLDEELEGVSPDELKD-DTDDAPVPAPAGDQK', 4)
('VLDEELEGVSPDELKD-YGFGSDRFGRDLNY', 4)
('VLDEELEGVSPDELKD-SGEMMGAPAVVAPQQPP', 4)
('VLDEELEGVSPDELKD-VFK', 4)
('VLDEELEGV

('YSEAHEISKEHMQPT-EFGNETWGVTKAAEK', 4)
('YSEAHEISKEHMQPT-SSFSHYSGLKHEDKRGGS', 4)
('YSEAHEISKEHMQPT-PPPPQPQQQQQQQQQPP', 4)
('YSEAHEISKEHMQPT-QQQQQQPPPPQQQPQPP', 4)
('YSEAHEISKEHMQPT-LPGAPPQQLQYGQQQPMVP', 4)
('YSEAHEISKEHMQPT-EFFNGKEPSRGINPDEAVA', 4)
('YSEAHEISKEHMQPT-LHSMMQRRMSQEHP', 4)
('YSEAHEISKEHMQPT-VAENFDDIVNEEDKDVLI', 4)
('YSEAHEISKEHMQPT-VWQMTLHNFQIQAFNVT', 4)
('YSEAHEISKEHMQPT-VVFQETEDTPLDQCKVLI', 4)
('YSEAHEISKEHMQPT-VKFTRSPESYFSPGKAFE', 4)
('YSEAHEISKEHMQPT-VKDVIEQYSAGHLDMLCR', 4)
('YSEAHEISKEHMQPT-VRSMNNTNIQWSAILSWG', 4)
('YSEAHEISKEHMQPT-VPKEPSSTVNTEVYPKNST', 4)
('YSEAHEISKEHMQPT-SSFLVYLVEKDANKEFST', 4)
('YSEAHEISKEHMQPT-EMLWHVLERTFMRDTI', 4)
('YSEAHEISKEHMQPT-NYMKCVSVLRFDSSAPFL', 4)
('YSEAHEISKEHMQPT-INNRIADKAFYQQPDAD', 4)
('YSEAHEISKEHMQPT-FRQMDTNNDGKLSLE', 4)
('YSEAHEISKEHMQPT-GLEHDNLEAHSPEQPP', 4)
('YSEAHEISKEHMQPT-DEPELALDSTMRAPPQ', 4)
('YSEAHEISKEHMQPT-VHRQRQPYQCPICEHIA', 4)
('YSEAHEISKEHMQPT-VQTPLPGAPPQQLQYGQQQ', 4)
('YSEAHEISKEHMQPT-VAPFMGFIQERAWLREQ', 4)
('YSEAHEIS

('IYKMVSSVMKMPEDE-FVK', 4)
('IYKMVSSVMKMPEDE-FVK', 4)
('IYKMVSSVMKMPEDE-VFK', 4)
('IYKMVSSVMKMPEDE-FVK', 4)
('IYKMVSSVMKMPEDE-FVK', 4)
('IYKMVSSVMKMPEDE-FVK', 4)
('IYKMVSSVMKMPEDE-FVK', 4)
('IYKMVSSVMKMPEDE-DRSHTCHNGKADPTKT', 4)
('IYKMVSSVMKMPEDE-DRQQSQVLDAMQDSF', 4)
('IYKMVSSVMKMPEDE-EVTFNSLLCPTGAEVS', 4)
('IYKMVSSVMKMPEDE-RRNTGGKGGDYALAPGSQSS', 4)
('IYKMVSSVMKMPEDE-GTYFEVKIPSDTFY', 4)
('IYKMVSSVMKMPEDE-TSLPDYASRMQAGTRN', 4)
('IYKMVSSVMKMPEDE-TLATMLACLQACAGSVSQ', 4)
('IYKMVSSVMKMPEDE-TNTSHVMQYGNKSIST', 4)
('IYKMVSSVMKMPEDE-TEKSHPSEEELLSQPG', 4)
('IYKMVSSVMKMPEDE-TTDKNGLARFSINTDD', 4)
('IYKMVSSVMKMPEDE-CGMQEMLDSVPEKRF', 4)
('IYKMVSSVMKMPEDE-CCYFPSAAQLPALWAE', 4)
('IYKMVSSVMKMPEDE-DRSHTCHNGKADPTK', 4)
('IYKMVSSVMKMPEDE-QFGVGFYSAFMVADK', 4)
('IYKMVSSVMKMPEDE-TPSGDYSWSLQVQAK', 4)
('IYKMVSSVMKMPEDE-EFGNETWGVTKAAEK', 4)
('IYKMVSSVMKMPEDE-SSFSHYSGLKHEDKRGGS', 4)
('IYKMVSSVMKMPEDE-PPPPQPQQQQQQQQQPP', 4)
('IYKMVSSVMKMPEDE-QQQQQQPPPPQQQPQPP', 4)
('IYKMVSSVMKMPEDE-LFNGAKIGSQEAFFLYAC', 4)
('IYKMV

('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-VFK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-FVK', 4)
('FENQSPDAKHRDAAAE-LSDPGNYHEFCRFLA', 4)
('FENQSPDAKHRDAAAE-TDDTSDPTSKEFEALI', 4)
('FENQSPDAKHRDAAAE-TKWEMAAQLREYQD', 4)
('FENQSPDAKHRDAAAE-THSSFSSTVKDKAASES', 4)
('FENQSPDAKHRDAAAE-TPTGRLMNRFSKDMD', 4)
('FENQSPDAKHRDAAAE-TYMEEMLGNVAGARQV', 4)
('FENQSPDAKHRDAAAE-TQERFGDKDSKMLVD', 4)
('FENQSPDAKHRDAAAE-TTSSDIRAMSPLDSSNS', 4)
('FENQSPDAKHRDAAAE-EVTFNSLLCPTGAEVS', 4)
('FENQSPDAKHRDAAAE-RRNTGGKGGDYALAPGSQSS', 4)
('FENQSPDAKHRDAAAE-LTCVPEHTHPFKVGTC', 4)
('FENQSPDAKHRDAAAE-GTYFEVKIPSDTFY', 4)
('FENQSPDAKHRDAAAE-CGMQEMLDSVPEKRF', 4)
('FENQSPDAKHRDAAAE-CCYFPSAAQLPALWAE', 4)
('FENQSPDAKHRDAAAE-DRSHTCHNGKADPTK', 4)
('FENQSPDAKHRDAAAE-QFGVGFYSAFMVADK', 4)
('FENQSPDAKHRDAAAE-TPSGDYSWSLQVQAK', 4)


('ANHYITPMMELKPNAG-EVTFNSLLCPTGAEVS', 4)
('ANHYITPMMELKPNAG-RRNTGGKGGDYALAPGSQSS', 4)
('ANHYITPMMELKPNAG-GTYFEVKIPSDTFY', 4)
('ANHYITPMMELKPNAG-TSLPDYASRMQAGTRN', 4)
('ANHYITPMMELKPNAG-TLATMLACLQACAGSVSQ', 4)
('ANHYITPMMELKPNAG-TNTSHVMQYGNKSIST', 4)
('ANHYITPMMELKPNAG-TEKSHPSEEELLSQPG', 4)
('ANHYITPMMELKPNAG-TTDKNGLARFSINTDD', 4)
('ANHYITPMMELKPNAG-CGMQEMLDSVPEKRF', 4)
('ANHYITPMMELKPNAG-CCYFPSAAQLPALWAE', 4)
('ANHYITPMMELKPNAG-DRSHTCHNGKADPTK', 4)
('ANHYITPMMELKPNAG-QFGVGFYSAFMVADK', 4)
('ANHYITPMMELKPNAG-TPSGDYSWSLQVQAK', 4)
('ANHYITPMMELKPNAG-EFGNETWGVTKAAEK', 4)
('ANHYITPMMELKPNAG-SSFSHYSGLKHEDKRGGS', 4)
('ANHYITPMMELKPNAG-PPPPQPQQQQQQQQQPP', 4)
('ANHYITPMMELKPNAG-QQQQQQPPPPQQQPQPP', 4)
('ANHYITPMMELKPNAG-LFNGAKIGSQEAFFLYAC', 4)
('ANHYITPMMELKPNAG-DVPDHKDLNMDVSFHLP', 4)
('ANHYITPMMELKPNAG-VLVFSEDGYFLRAWNY', 4)
('ANHYITPMMELKPNAG-LPGAPPQQLQYGQQQPMVP', 4)
('ANHYITPMMELKPNAG-EFFNGKEPSRGINPDEAVA', 4)
('ANHYITPMMELKPNAG-LQVDHMNLLKQFEHLDP', 4)
('ANHYITPMMELKPNAG-KIPASKEFNSDKEGHKYV', 4)
(

('ANFEEVAFDEKKNVF-NYMKCVSVLRFDSSAPFL', 4)
('ANFEEVAFDEKKNVF-GLEHDNLEAHSPEQPP', 4)
('ANFEEVAFDEKKNVF-DEPELALDSTMRAPPQ', 4)
('ANFEEVAFDEKKNVF-VHRQRQPYQCPICEHIA', 4)
('ANFEEVAFDEKKNVF-VQTPLPGAPPQQLQYGQQQ', 4)
('ANFEEVAFDEKKNVF-VAPFMGFIQERAWLREQ', 4)
('ANFEEVAFDEKKNVF-VYNPFQKEMLTYLLDGF', 4)
('ANFEEVAFDEKKNVF-VCGLQKDLNSLPYGDLTEI', 4)
('ANFEEVAFDEKKNVF-VSSVSSARSGRAQDQDSQRG', 4)
('ANFEEVAFDEKKNVF-NPGVHEPGEPEFKYIG', 4)
('ANFEEVAFDEKKNVF-CDLCGVKF', 4)
('ASNRYTGVPDRFTGSG-TEGGGSEALPCPGPPAG', 4)
('ASNRYTGVPDRFTGSG-NGTDPEDVIRNAFACF', 4)
('ASNRYTGVPDRFTGSG-NGTDPEDVIRNAFACF', 4)
('ASNRYTGVPDRFTGSG-QGSFLAANMQDSRENT', 4)
('ASNRYTGVPDRFTGSG-GKSSDNRSRGYRGGSAGG', 4)
('ASNRYTGVPDRFTGSG-GDCYSRLTTEQSH', 4)
('ASNRYTGVPDRFTGSG-DTDDAPVPAPAGDQK', 4)
('ASNRYTGVPDRFTGSG-VFK', 4)
('ASNRYTGVPDRFTGSG-FVK', 4)
('ASNRYTGVPDRFTGSG-FVK', 4)
('ASNRYTGVPDRFTGSG-FVK', 4)
('ASNRYTGVPDRFTGSG-FVK', 4)
('ASNRYTGVPDRFTGSG-FVK', 4)
('ASNRYTGVPDRFTGSG-VFK', 4)
('ASNRYTGVPDRFTGSG-FVK', 4)
('ASNRYTGVPDRFTGSG-FVK', 4)
('ASNRYTGVPDRFT

('CIGEYSLIVNTATETAT-ASEALPSEGKGELEHSQ', 4)
('CIGEYSLIVNTATETAT-SSFSHYSGLKHEDKRGGS', 4)
('CIGEYSLIVNTATETAT-PPPPQPQQQQQQQQQPP', 4)
('CIGEYSLIVNTATETAT-QQQQQQPPPPQQQPQPP', 4)
('CIGEYSLIVNTATETAT-DEAASAPAIPEGVPTDTK', 4)
('CIGEYSLIVNTATETAT-GSFLAANMQDSRENTK', 4)
('CIGEYSLIVNTATETAT-LFNGAKIGSQEAFFLYAC', 4)
('CIGEYSLIVNTATETAT-DVPDHKDLNMDVSFHLP', 4)
('CIGEYSLIVNTATETAT-LVPVVNNRLFDMSAFMAGP', 4)
('CIGEYSLIVNTATETAT-EFFNGKEPSRGINPDEAVA', 4)
('CIGEYSLIVNTATETAT-VAENFDDIVNEEDKDVLI', 4)
('CIGEYSLIVNTATETAT-VWQMTLHNFQIQAFNVT', 4)
('CIGEYSLIVNTATETAT-VVFQETEDTPLDQCKVLI', 4)
('CIGEYSLIVNTATETAT-VKFTRSPESYFSPGKAFE', 4)
('CIGEYSLIVNTATETAT-VKDVIEQYSAGHLDMLCR', 4)
('CIGEYSLIVNTATETAT-VRSMNNTNIQWSAILSWG', 4)
('CIGEYSLIVNTATETAT-VPKEPSSTVNTEVYPKNST', 4)
('CIGEYSLIVNTATETAT-SSFLVYLVEKDANKEFST', 4)
('CIGEYSLIVNTATETAT-EMLWHVLERTFMRDTI', 4)
('CIGEYSLIVNTATETAT-NYMKCVSVLRFDSSAPFL', 4)
('CIGEYSLIVNTATETAT-GLEHDNLEAHSPEQPP', 4)
('CIGEYSLIVNTATETAT-DEPELALDSTMRAPPQ', 4)
('CIGEYSLIVNTATETAT-VHRQRQPYQCPICEHIA', 4)

('SGRGGRDHGDWDVDR-FVK', 4)
('SGRGGRDHGDWDVDR-FVK', 4)
('SGRGGRDHGDWDVDR-LSDPGNYHEFCRFLA', 4)
('SGRGGRDHGDWDVDR-TSLPSQEHVDPQATGDS', 4)
('SGRGGRDHGDWDVDR-EHVDPQATGDSERGLSA', 4)
('SGRGGRDHGDWDVDR-TDDTSDPTSKEFEALI', 4)
('SGRGGRDHGDWDVDR-TKWEMAAQLREYQD', 4)
('SGRGGRDHGDWDVDR-TSLDKGENGTLSREDF', 4)
('SGRGGRDHGDWDVDR-THSSFSSTVKDKAASES', 4)
('SGRGGRDHGDWDVDR-TPTGRLMNRFSKDMD', 4)
('SGRGGRDHGDWDVDR-TEQGHELSNEERNLL', 4)
('SGRGGRDHGDWDVDR-TYMEEMLGNVAGARQV', 4)
('SGRGGRDHGDWDVDR-TQERFGDKDSKMLVD', 4)
('SGRGGRDHGDWDVDR-TTSSDIRAMSPLDSSNS', 4)
('SGRGGRDHGDWDVDR-TEGSEFLRLQVEGGGCS', 4)
('SGRGGRDHGDWDVDR-DRSHTCHNGKADPTKT', 4)
('SGRGGRDHGDWDVDR-DRQQSQVLDAMQDSF', 4)
('SGRGGRDHGDWDVDR-LTCVPEHTHPFKVGTC', 4)
('SGRGGRDHGDWDVDR-MAACMKSVTEQGAELSN', 4)
('SGRGGRDHGDWDVDR-ASEALPSEGKGELEHSQ', 4)
('SGRGGRDHGDWDVDR-SSFSHYSGLKHEDKRGGS', 4)
('SGRGGRDHGDWDVDR-PPPPQPQQQQQQQQQPP', 4)
('SGRGGRDHGDWDVDR-QQQQQQPPPPQQQPQPP', 4)
('SGRGGRDHGDWDVDR-YFDSGDYNMAKAKMK', 4)
('SGRGGRDHGDWDVDR-DEAASAPAIPEGVPTDTK', 4)
('SGRGGRDHGDWDVDR-YDL

('LSIGGSAAPHTQSMQGFPPN-TPTGRLMNRFSKDMD', 4)
('LSIGGSAAPHTQSMQGFPPN-TQERFGDKDSKMLVD', 4)
('LSIGGSAAPHTQSMQGFPPN-TTSSDIRAMSPLDSSNS', 4)
('LSIGGSAAPHTQSMQGFPPN-EVTFNSLLCPTGAEVS', 4)
('LSIGGSAAPHTQSMQGFPPN-LTCVPEHTHPFKVGTC', 4)
('LSIGGSAAPHTQSMQGFPPN-CCYFPSAAQLPALWAE', 4)
('LSIGGSAAPHTQSMQGFPPN-GSFLAANMQDSRENTK', 4)
('LSIGGSAAPHTQSMQGFPPN-VVFQETEDTPLDQCKVLI', 4)
('LSIGGSAAPHTQSMQGFPPN-VKFTRSPESYFSPGKAFE', 4)
('LSIGGSAAPHTQSMQGFPPN-VRSMNNTNIQWSAILSWG', 4)
('LSIGGSAAPHTQSMQGFPPN-EMLWHVLERTFMRDTI', 4)
('LSIGGSAAPHTQSMQGFPPN-NYMKCVSVLRFDSSAPFL', 4)
('LSIGGSAAPHTQSMQGFPPN-GLEHDNLEAHSPEQPP', 4)
('EGDRPEVTESINPGDRLI-FVK', 4)
('EGDRPEVTESINPGDRLI-FVK', 4)
('EGDRPEVTESINPGDRLI-VKFTRSPESYFSPGKAFE', 4)
('GDRPEVTESINPGDRLIE-FVK', 4)
('GDRPEVTESINPGDRLIE-FVK', 4)
('GDRPEVTESINPGDRLIE-VKFTRSPESYFSPGKAFE', 4)
('DGVANVSIEDRVISLSGEH-TEGGGSEALPCPGPPAG', 4)
('DGVANVSIEDRVISLSGEH-LGSHTDEMLWHVLE', 4)
('DGVANVSIEDRVISLSGEH-NGTDPEDVIRNAFACF', 4)
('DGVANVSIEDRVISLSGEH-NGTDPEDVIRNAFACF', 4)
('DGVANVSIEDRVISLSGEH-Q

('LGNAKSHLMSLYSACSSEV-TEGGGSEALPCPGPPAG', 4)
('LGNAKSHLMSLYSACSSEV-LGSHTDEMLWHVLE', 4)
('LGNAKSHLMSLYSACSSEV-QGSFLAANMQDSRENT', 4)
('LGNAKSHLMSLYSACSSEV-GDCYSRLTTEQSH', 4)
('LGNAKSHLMSLYSACSSEV-DTDDAPVPAPAGDQK', 4)
('LGNAKSHLMSLYSACSSEV-YGFGSDRFGRDLNY', 4)
('LGNAKSHLMSLYSACSSEV-SGEMMGAPAVVAPQQPP', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-VFK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-VFK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-FVK', 4)
('LGNAKSHLMSLYSACSSEV-LSDPGNYHEFCRFLA', 4)
('LGNAKSHLMSLYSACSSEV-TSLPSQEHVDPQATGDS', 4)
('LGNAKSHLMSLYSACSSEV

('PDKYPSKKECTWAISSTPG-VSSVSSARSGRAQDQDSQRG', 4)
('PDKYPSKKECTWAISSTPG-CDLCGVKF', 4)
('GRMPCIEVGDSIVHKARE-THSSFSSTVKDKAASES', 4)
('GRMPCIEVGDSIVHKARE-VKFTRSPESYFSPGKAFE', 4)
('GRMPCIEVGDSIVHKARE-VRSMNNTNIQWSAILSWG', 4)
('GRMPCIEVGDSIVHKARE-CDLCGVKF', 4)
('TTAPDGDYWRLLPPGSHIV-TEGGGSEALPCPGPPAG', 4)
('TTAPDGDYWRLLPPGSHIV-LGSHTDEMLWHVLE', 4)
('TTAPDGDYWRLLPPGSHIV-QGSFLAANMQDSRENT', 4)
('TTAPDGDYWRLLPPGSHIV-GDCYSRLTTEQSH', 4)
('TTAPDGDYWRLLPPGSHIV-FVK', 4)
('TTAPDGDYWRLLPPGSHIV-VFK', 4)
('TTAPDGDYWRLLPPGSHIV-FVK', 4)
('TTAPDGDYWRLLPPGSHIV-FVK', 4)
('TTAPDGDYWRLLPPGSHIV-FVK', 4)
('TTAPDGDYWRLLPPGSHIV-FVK', 4)
('TTAPDGDYWRLLPPGSHIV-TDDTSDPTSKEFEALI', 4)
('TTAPDGDYWRLLPPGSHIV-THSSFSSTVKDKAASES', 4)
('TTAPDGDYWRLLPPGSHIV-TPTGRLMNRFSKDMD', 4)
('TTAPDGDYWRLLPPGSHIV-TQERFGDKDSKMLVD', 4)
('TTAPDGDYWRLLPPGSHIV-TTSSDIRAMSPLDSSNS', 4)
('TTAPDGDYWRLLPPGSHIV-DRSHTCHNGKADPTKT', 4)
('TTAPDGDYWRLLPPGSHIV-TLATMLACLQACAGSVSQ', 4)
('TTAPDGDYWRLLPPGSHIV-TEKSHPSEEELLSQPG', 4)
('TTAPDGDYWRLLPPGSHIV-CCYFPSAAQLPAL

('RYWFAMERLEIHSNGSV-TTDKNGLARFSINTDD', 4)
('RYWFAMERLEIHSNGSV-CCYFPSAAQLPALWAE', 4)
('RYWFAMERLEIHSNGSV-DRSHTCHNGKADPTK', 4)
('RYWFAMERLEIHSNGSV-TPSGDYSWSLQVQAK', 4)
('RYWFAMERLEIHSNGSV-EFGNETWGVTKAAEK', 4)
('RYWFAMERLEIHSNGSV-PPPPQPQQQQQQQQQPP', 4)
('RYWFAMERLEIHSNGSV-QQQQQQPPPPQQQPQPP', 4)
('RYWFAMERLEIHSNGSV-GSFLAANMQDSRENTK', 4)
('RYWFAMERLEIHSNGSV-LFNGAKIGSQEAFFLYAC', 4)
('RYWFAMERLEIHSNGSV-DVPDHKDLNMDVSFHLP', 4)
('RYWFAMERLEIHSNGSV-LHSMMQRRMSQEHP', 4)
('RYWFAMERLEIHSNGSV-GLEHDNLEAHSPEQPP', 4)
('RYWFAMERLEIHSNGSV-DEPELALDSTMRAPPQ', 4)
('RYWFAMERLEIHSNGSV-VHRQRQPYQCPICEHIA', 4)
('RYWFAMERLEIHSNGSV-VQTPLPGAPPQQLQYGQQQ', 4)
('RYWFAMERLEIHSNGSV-VAPFMGFIQERAWLREQ', 4)
('RYWFAMERLEIHSNGSV-VYNPFQKEMLTYLLDGF', 4)
('RYWFAMERLEIHSNGSV-VCGLQKDLNSLPYGDLTEI', 4)
('RYWFAMERLEIHSNGSV-VSSVSSARSGRAQDQDSQRG', 4)
('RYWFAMERLEIHSNGSV-NPGVHEPGEPEFKYIG', 4)
('RYWFAMERLEIHSNGSV-CDLCGVKF', 4)
('DGPVQGTIHFEQKASGEPVV-TEGGGSEALPCPGPPAG', 4)
('DGPVQGTIHFEQKASGEPVV-LGSHTDEMLWHVLE', 4)
('DGPVQGTIHFEQKASGEPVV-N

('KLFVGGLSFDTNEQALEQV-VSSVSSARSGRAQDQDSQRG', 4)
('KLFVGGLSFDTNEQALEQV-VRALNEQACRDGSSIQIAF', 4)
('KLFVGGLSFDTNEQALEQV-PKNSEYF', 4)
('KLFVGGLSFDTNEQALEQV-NPGVHEPGEPEFKYIG', 4)
('KLFVGGLSFDTNEQALEQV-GMMNEAPGPINFTMFL', 4)
('KLFVGGLSFDTNEQALEQV-NPGVHEPGEPEFKYIG', 4)
('KLFVGGLSFDTNEQALEQV-LDLNCDGQLDFQEFL', 4)
('KLFVGGLSFDTNEQALEQV-LRTDYNASVSVPDSSG', 4)
('KLFVGGLSFDTNEQALEQV-ESSSRSAEKRSAEDE', 4)
('KLFVGGLSFDTNEQALEQV-CDLCGVKF', 4)
('EVDEPPQHALRVDYAGVTV-TEGGGSEALPCPGPPAG', 4)
('EVDEPPQHALRVDYAGVTV-LGSHTDEMLWHVLE', 4)
('EVDEPPQHALRVDYAGVTV-NGTDPEDVIRNAFACF', 4)
('EVDEPPQHALRVDYAGVTV-NGTDPEDVIRNAFACF', 4)
('EVDEPPQHALRVDYAGVTV-QGSFLAANMQDSRENT', 4)
('EVDEPPQHALRVDYAGVTV-GKSSDNRSRGYRGGSAGG', 4)
('EVDEPPQHALRVDYAGVTV-GDCYSRLTTEQSH', 4)
('EVDEPPQHALRVDYAGVTV-DTDDAPVPAPAGDQK', 4)
('EVDEPPQHALRVDYAGVTV-YGFGSDRFGRDLNY', 4)
('EVDEPPQHALRVDYAGVTV-SGEMMGAPAVVAPQQPP', 4)
('EVDEPPQHALRVDYAGVTV-FVK', 4)
('EVDEPPQHALRVDYAGVTV-VFK', 4)
('EVDEPPQHALRVDYAGVTV-FVK', 4)
('EVDEPPQHALRVDYAGVTV-VFK', 4)
('EVDEPPQHAL

('TGNDRKEAAENSLVAYKAAS-VMEMMSQKIQQLTALGAAQ', 4)
('TGNDRKEAAENSLVAYKAAS-VSSVSSARSGRAQDQDSQRG', 4)
('TGNDRKEAAENSLVAYKAAS-NPGVHEPGEPEFKYIG', 4)
('TGNDRKEAAENSLVAYKAAS-CDLCGVKF', 4)
('NEDVSIIPPLFTVSVDHRG-TEGGGSEALPCPGPPAG', 4)
('NEDVSIIPPLFTVSVDHRG-LGSHTDEMLWHVLE', 4)
('NEDVSIIPPLFTVSVDHRG-QGSFLAANMQDSRENT', 4)
('NEDVSIIPPLFTVSVDHRG-GDCYSRLTTEQSH', 4)
('NEDVSIIPPLFTVSVDHRG-DTDDAPVPAPAGDQK', 4)
('NEDVSIIPPLFTVSVDHRG-YGFGSDRFGRDLNY', 4)
('NEDVSIIPPLFTVSVDHRG-SGEMMGAPAVVAPQQPP', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-VFK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-VFK', 4)
('NEDVSIIPPLFTVSVDHRG-FVK', 4)
('NEDVSIIPPLFTVSVDHRG-F

('GSHLVEALYLVCGERGFF-NPGVHEPGEPEFKYIG', 4)
('GSHLVEALYLVCGERGFF-GMMNEAPGPINFTMFL', 4)
('GSHLVEALYLVCGERGFF-NPGVHEPGEPEFKYIG', 4)
('GSHLVEALYLVCGERGFF-LDLNCDGQLDFQEFL', 4)
('GSHLVEALYLVCGERGFF-LRTDYNASVSVPDSSG', 4)
('GSHLVEALYLVCGERGFF-ESSSRSAEKRSAEDE', 4)
('GSHLVEALYLVCGERGFF-CDLCGVKF', 4)
('SGAFGHLFRPDNFIFGQS-TEGGGSEALPCPGPPAG', 4)
('SGAFGHLFRPDNFIFGQS-LGSHTDEMLWHVLE', 4)
('SGAFGHLFRPDNFIFGQS-NGTDPEDVIRNAFACF', 4)
('SGAFGHLFRPDNFIFGQS-NGTDPEDVIRNAFACF', 4)
('SGAFGHLFRPDNFIFGQS-QGSFLAANMQDSRENT', 4)
('SGAFGHLFRPDNFIFGQS-GKSSDNRSRGYRGGSAGG', 4)
('SGAFGHLFRPDNFIFGQS-GDCYSRLTTEQSH', 4)
('SGAFGHLFRPDNFIFGQS-DTDDAPVPAPAGDQK', 4)
('SGAFGHLFRPDNFIFGQS-YGFGSDRFGRDLNY', 4)
('SGAFGHLFRPDNFIFGQS-SGEMMGAPAVVAPQQPP', 4)
('SGAFGHLFRPDNFIFGQS-VFK', 4)
('SGAFGHLFRPDNFIFGQS-FVK', 4)
('SGAFGHLFRPDNFIFGQS-FVK', 4)
('SGAFGHLFRPDNFIFGQS-FVK', 4)
('SGAFGHLFRPDNFIFGQS-FVK', 4)
('SGAFGHLFRPDNFIFGQS-FVK', 4)
('SGAFGHLFRPDNFIFGQS-VFK', 4)
('SGAFGHLFRPDNFIFGQS-FVK', 4)
('SGAFGHLFRPDNFIFGQS-FVK', 4)
('SGAFGHLFRPD

('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-VFK', 4)
('IKDCVTPNEYTAACSRLLV-VFK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-VFK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-FVK', 4)
('IKDCVTPNEYTAACSRLLV-LSDPGNYHEFCRFLA', 4)
('IKDCVTPNEYTAACSRLLV-TSLPSQEHVDPQATGDS', 4)
('IKDCVTPNEYTAACSRLLV-EHVDPQATGDSERGLSA', 4)
('IKDCVTPNEYTAACSRLLV-TDDTSDPTSKEFEALI', 4)
('IKDCVTPNEYTAACSRLLV-TKWEMAAQLREYQD', 4)
('IKDCVTPNEYTAACSRLLV-THSSFSSTVKDKAASES', 4)
('IKDCVTPNEYTAACSRLLV-TPTGRLMNRFSKDMD', 4)
('IKDCVTPNEYTAACSRLLV-TYMEEMLGNVAGARQV', 4)
('IKDCVTPNEYTAACSRLLV-TQERFGDKDSKMLVD', 4)
('IKDCVTPNEYTAACSRLLV-TTSSDIRAMSPLDSSNS', 4)
('IKDCVTPNEYTAACSRLLV-TEGSEFLRLQVEGGGCS', 4)
('IKDCVTPNEYTAACSRLLV-DRSHTCHNGKADPTKT', 4)
('IKDCVTPNEYTAACSRLLV-DRQQSQVLDAMQDSF', 4)
('IKDCVTPNEYTAACSRLLV-EVTFNSLLCPTGAEVS', 4)
('IKDCVTPNEYT

('EELSCEERNLLSVAYKNV-ESSSRSAEKRSAEDE', 4)
('EELSCEERNLLSVAYKNV-CDLCGVKF', 4)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

# How big does mass have to be to get matches from both b and y ions

In [4]:
from utils import ppm_to_da
ppm_tolerance = 20

m = 1000

print(ppm_to_da(m, ppm_tolerance))

0.02


# Occurance

Q: How does occurance scale with the length of the peptide

In [None]:
mz_array = []
occurance_array = []
for key in matched_masses_b:
    for tup in matched_masses_b[key]:
        mz = tup[]
        

# Over all datasets

* Want to know:
    * What do good hits look like when put through extensions?
    * Only factor in the hits for which there exist a good hit from the b and y side. It doesn't make sense to talk about extensions for which we don't have good initial hits on both sides
        * How often does this happen?
        * Do we lose out on good combinations after penalizing double counting?