# Extensions

The purpose of this notebook is to test everything related to generating extensions among hits

In [16]:
import os
import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from testing_framework import testing_utils
import database
from preprocessing import merge_search
from identification import create_hits
import utils

import operator

ppm_tolerance = 20
max_peptide_length = 20
precursor_tolerance = 10

import matplotlib.pyplot as plt

In [2]:
datasets = testing_utils.define_data()

dataset = datasets[0]

input_spectra_path = dataset[0]
input_spectra, boundaries = testing_utils.preprocess_input_spectra(input_spectra_path, ppm_tolerance)

correct_sequences = testing_utils.generate_truth_set(datasets[0])

path = dataset[2]
db = database.build(path)

Loading spectra...
Done


In [3]:
write_path = os.path.abspath(os.path.join(module_path, 'intermediate_files'))
matched_masses_b, matched_masses_y, kmer_set = merge_search.modified_match_masses(boundaries, db, max_peptide_length, True, write_path)
print('Finished matching masses')

On protein 279/279 [100%]
Sorting the set of protein masses...
Sorting the set of protein masses done
Performing Merge
Done
Finished matching masses


# Getting initial hits

In [4]:
from gen_spectra import gen_spectrum
spectrum_num = 0

correct_sequence = correct_sequences[spectrum_num]
print(correct_sequence)

input_spectrum = input_spectra[spectrum_num]

DPQVEQLEL


In [5]:
unique_b,unique_y = testing_utils.get_unique_matched_masses(boundaries, matched_masses_b, matched_masses_y)

In [6]:
location = os.path.join(os.path.abspath(os.path.join('../..')), 'intermediate_files/')
b_hits, y_hits = create_hits(spectrum_num, input_spectrum, matched_masses_b, matched_masses_y, False, location)
correct_hits = testing_utils.append_correct_hits(correct_sequence, input_spectrum, ppm_tolerance)
ion = 'b'
clusters = testing_utils.create_clusters(ion, b_hits, y_hits)
b_sorted_clusters = testing_utils.Bayes_clusters(ion, clusters, location, kmer_set, unique_b)
ion = 'y'
clusters = testing_utils.create_clusters(ion, b_hits, y_hits)
y_sorted_clusters = testing_utils.Bayes_clusters(ion, clusters, location, kmer_set, unique_y)

# Printing hits

In [7]:
b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('prob', 'score', 'pid'))
# b_sorted_clusters = sorted(b_sorted_clusters, key=operator.attrgetter('score', 'prob', 'pid'), reverse = True)
testing_utils.write_b_sorted_cluster(b_sorted_clusters)
for i in range(0, 50):
    x = b_sorted_clusters[i]
    post_prob = x.prob
    score = x.score
    seq = x.seq
    indices = x.indices
    print(post_prob, score, seq)

7.37632168378539e-24 8 DPQVEQLEL
4.4528149180897643e-17 4 PDAGAPTSASGLSGHTTL
7.703189449090993e-17 4 TQAGVEELDPENKIP
1.9373592979518995e-16 4 GTYFEVKIPSDTFYDN
5.108142991660681e-16 4 PAGDQKDV
1.56303178756188e-15 4 HSLMPMLE
1.833341688635198e-15 4 TVFSDFL
1.833341688635198e-15 4 DPEVQQI
9.091538461518476e-15 4 IQEYYNKL
2.1668429667773813e-14 3 TTSPNLGTREN
2.6230409769948193e-14 3 PQDHPRSQPQ
3.24007561622655e-14 3 KKDLEEWNQ
4.523368031382904e-14 3 PQQPQPPPQQQAAPQ
4.523368031382904e-14 3 GDEPGPQRSVEGWIL
5.1945340793898446e-14 3 IEKNTDGVNFYNIL
5.974174096356828e-14 3 TGAAGRNS
6.171904971593171e-14 3 KPEGRPGT
6.171904971593175e-14 3 TLSFSSIS
6.23782502402988e-14 3 AGPQPAQTGA
6.720993999005019e-14 3 TTQPPAQPASQGSGS
7.309947871977779e-14 3 QPSVSS
7.718635778598766e-14 3 PGGEEVLREQAGGD
8.129325868558074e-14 3 TDMTKLEECVRSIQADG
8.129325868558074e-14 3 TTTPCMLRDSDSILETL
8.229287530506726e-14 3 TTPATSTTCTAT
9.180659198563205e-14 3 TTESVKEQEMKWTDLA
1.0002839007505794e-13 3 KYFDSGDYNMAKAK
1.000283

In [8]:
y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('prob', 'score', 'pid'))
# y_sorted_clusters = sorted(y_sorted_clusters, key=operator.attrgetter('score', 'prob', 'pid'), reverse = True)
testing_utils.write_y_sorted_cluster(y_sorted_clusters)
for i in range(0, 50):
    x = y_sorted_clusters[i]
    post_prob = x.prob
    score = x.score
    seq = x.seq
    indices = x.indices
    print(post_prob, score, seq)

4.16818804282123e-18 5 LDSFSEI
1.1616080807536527e-16 4 SSNWVGKGFFAVYEAIC
2.512036051166044e-16 4 FVDLTMPYSV
1.05226703890205e-15 4 LESYGLE
1.12385997955871e-15 4 EIPHSELD
1.12385997955871e-15 4 EISSIDEF
3.347163330282923e-15 4 IEVLETDPH
1.2869227751022117e-14 4 HPDSEL
1.2869227751022117e-14 4 CGLYEL
5.663227040807141e-14 3 LSNPTGLQESISDVTTCL
5.7487143274934e-14 3 TAEIASLDSENIDEI
6.37719274049424e-14 3 AAGCKVEAFAVQGEEL
6.601894630036116e-14 3 LLMAASIYFHDQNP
6.601894630036116e-14 3 LEDSDLKKSDIDEI
7.172157438730102e-14 3 LSIHQLVENTDETYCI
7.258536525544357e-14 3 DLNINMTSPMGTKSI
7.258536525544357e-14 3 WLKGQGVYLGMPGCL
7.258536525544357e-14 3 DTNAPAHQLIQTESP
7.258536525544357e-14 3 AKDLDTVASDMMVLL
7.660124169797852e-14 3 YFEEYGKIDTIEI
8.163489183127015e-14 3 YNYVWANCFEITLEL
8.163489183127015e-14 3 QQQPPKQQQQQQQQQ
8.163489183127015e-14 3 QQPPKQQQQQQQQQQ
8.336059504994219e-14 3 EIEGEIKRDFMAAL
8.336059504994219e-14 3 LQCYSEAIKLDPQN
9.91871119161064e-14 3 TSQDARFYAL
9.91871119161064e-14 3 DFTFV

# To Filter data by parent prot

I want to be able to only view the b or y hits from a certain protein

In [9]:
# b_hits
target_pid = 274
b_target_clusters = []
for cluster in b_sorted_clusters:
    if cluster.pid == target_pid:
        b_target_clusters.append(cluster)

for cluster in b_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'b', correct_sequence)
    non_indices = str(cluster.score) + '\t' + str(cluster.prob) + '\t' + str(cluster.pid) + '\t' + cluster.seq + '\t' + str(assessment)
    print(non_indices + '\t'+ '\t'.join([str(o) for o in cluster.indices]))


8	7.37632168378539e-24	274	DPQVEQLEL	True	(60, 62, 'DPQ', 341.1453857421875)	(60, 63, 'DPQV', 440.216064453125)	(60, 64, 'DPQVE', 569.2589721679688)	(60, 65, 'DPQVEQ', 349.1622619628906)	(60, 65, 'DPQVEQ', 697.3143310546875)	(60, 66, 'DPQVEQL', 810.3919677734375)	(60, 67, 'DPQVEQLE', 939.4456176757812)	(60, 68, 'DPQVEQLEL', 526.7672446181209)
2	8.737764218909832e-11	274	EDPQVEQL	False	(59, 63, 'EDPQV', 569.2589721679688)	(59, 66, 'EDPQVEQL', 939.4456176757812)
1	1.139291460182141e-08	274	LGGSP	False	(68, 72, 'LGGSP', 412.21746826171875)
1	1.646011061622541e-08	274	VEDPQ	False	(58, 62, 'VEDPQ', 569.2589721679688)
1	2.0627639582379252e-08	274	LE	False	(66, 67, 'LE', 243.13357543945312)
1	2.0627639582379252e-08	274	EL	False	(67, 68, 'EL', 243.13357543945312)
1	2.0627639582379252e-08	274	LE	False	(80, 81, 'LE', 243.13357543945312)
1	2.0627639582379252e-08	274	LE	False	(103, 104, 'LE', 243.13357543945312)
1	3.719257593041041e-08	274	PQ	False	(61, 62, 'PQ', 226.1183624267578)
1	1.22115405163

In [10]:
# y_hits
y_target_clusters = []
for cluster in y_sorted_clusters:
    if cluster.pid == target_pid:
        y_target_clusters.append(cluster)

for cluster in y_target_clusters:
    assessment, _ = testing_utils.is_good_hit(cluster.seq, 'y', correct_sequence)
    non_indices = str(cluster.score) + '\t' + str(cluster.prob) + '\t' + str(cluster.pid) + '\t' + cluster.seq + '\t' + str(assessment)
    print(non_indices + '\t'+ '\t'.join([str(o) for o in cluster.indices]))

3	7.553986085145072e-13	274	DPQVEQLEL	True	(60, 68, 'DPQVEQLEL', 535.7725269681209)	(67, 68, 'EL', 261.1429443359375)	(68, 68, 'L', 132.10121154785156)
1	1.6930194586815518e-08	274	EALYLVC	False	(37, 43, 'EALYLVC', 810.3919677734375)
1	2.029622598916721e-08	274	LE	False	(66, 67, 'LE', 261.1429443359375)
1	2.029622598916721e-08	274	LE	False	(80, 81, 'LE', 261.1429443359375)
1	2.029622598916721e-08	274	LE	False	(103, 104, 'LE', 261.1429443359375)
1	3.15816271525945e-08	274	LC	False	(30, 31, 'LC', 235.1075439453125)
1	3.15816271525945e-08	274	IC	False	(97, 98, 'IC', 235.1075439453125)
1	6.105773986227816e-07	274	L	True	(3, 3, 'L', 132.10121154785156)
1	6.105773986227816e-07	274	L	True	(4, 4, 'L', 132.10121154785156)
1	6.105773986227816e-07	274	L	True	(8, 8, 'L', 132.10121154785156)
1	6.105773986227816e-07	274	L	True	(10, 10, 'L', 132.10121154785156)
1	6.105773986227816e-07	274	L	True	(11, 11, 'L', 132.10121154785156)
1	6.105773986227816e-07	274	L	True	(13, 13, 'L', 132.10121154785156)
1	6

# Finding optimal "hybrid" combos

* Hybrid is in quotation marks because all outputs will be a hybrid and then we can check if it is a non-hybrid

In [11]:
def get_top_X(b_clusters, y_clusters, top_num):
    filtered_b = []
    filtered_y = []
    b_len = top_num if len(b_clusters) >= top_num else len(b_clusters)
    y_len = top_num if len(y_clusters) >= top_num else len(y_clusters)
    for x in range(0,b_len):
        filtered_b.append(b_clusters[x])
    for x in range(0,y_len):
        filtered_y.append(y_clusters[x])
    return filtered_b, filtered_y

def combine(b_cluster, y_cluster):
    b_start, b_end, y_start, y_end = b_cluster.start, b_cluster.end, y_cluster.start, y_cluster.end
    if b_cluster.pid == y_cluster.pid:
        score_add = 2
        hybrid = False
        if  (b_end <= y_end) and (b_start <= y_start) and (b_end >= y_start): #overlap
            overlap = True
            score_add = 2
            seq = b_cluster.seq
            rem_chars = y_start - b_end
            while (rem_chars >= 0):
                seq = seq + y_cluster.seq[len(y_cluster.seq)-1 - rem_chars]
                rem_chars = rem_chars - 1
        else:                                                                #no overlap
            hybrid = False
            overlap = False
            score_add = 2
            seq = b_cluster.seq + '-' + y_cluster.seq
    else:                                                                    #hybrid
        hybrid = True
        overlap = False
        score_add = 0
        seq = b_cluster.seq + '-' + y_cluster.seq
    return seq, score_add, hybrid, overlap

def filter_by_validity(b_cluster, y_cluster):
    valid = True
    for b in b_cluster.indices:
        for y in y_cluster.indices:
            if b[3] == y[3]:
                valid = False
    return valid

def filter_by_precursor(seq, pc, overlap, obs_prec, precursor_tolerance):
    new_seq = seq.replace("-", "") if overlap == False else seq
    tol = ppm_to_da(obs_prec, precursor_tolerance)
    if get_precursor(new_seq, charge=pc) > obs_prec + tol:
        return False
    else:
        return True

def filter_by_dist(b, y, x):
    if y.start - b.end > x:
        return False
    else:
        return True

def merge_clusters(b_clusters, y_clusters, target_precursor, precursor_tolerance):
    # filtered_b, filtered_y = get_top_X(b_sorted_clusters, y_sorted_clusters, 50)
    #Start with printing overlapping. Then will incorportate boundary overlaps between last of b and first of y
    interesting_combos = []
    for b_cluster in b_sorted_clusters:
        for y_cluster in y_sorted_clusters:
            if b_cluster.start <= y_cluster.end:
                # b_indices = parse_indices(b_cluster.indices)
                # y_indices = parse_indices(y_cluster.indices)
                if filter_by_validity(b_cluster, y_cluster):
                    comb_seq, score_add, hybrid, overlap = combine(b_cluster, y_cluster)
                    if filter_by_precursor(comb_seq, 2, overlap, target_precursor, precursor_tolerance):
                        if filter_by_dist(b_cluster, y_cluster, 10):
                            tup = (comb_seq, b_cluster.score + y_cluster.score + score_add, overlap)
                            interesting_combos.append(tup)

    interesting_combos.sort(key=lambda a: a[1], reverse=True)
    return interesting_combos

In [12]:
print(len(b_sorted_clusters), len(y_sorted_clusters))
def get_unique_sorted_clusters(b_sorted_clusters, y_sorted_clusters):
    b_sorted_set = set()
    for b in b_sorted_clusters:
        seq = b.seq
        b_sorted_set.add(seq)
    y_sorted_set = set()
    for y in y_sorted_clusters:
        seq = y.seq
        y_sorted_set.add(seq)
    return b_sorted_set, y_sorted_set

b_sorted_set, y_sorted_set = get_unique_sorted_clusters(b_sorted_clusters, y_sorted_clusters)

19756 20184


In [15]:
merged_seqs = merge_clusters(b_sorted_clusters, y_sorted_clusters, input_spectrum.precursor_mass, precursor_tolerance)
[print(merged_seqs[x]) for x in range(0,50)]

NameError: name 'ppm_to_da' is not defined

In [None]:
print(len(b_sorted_set), len(y_sorted_set))

In [17]:
import sys
import collections
import operator

def min_info(cluster):
    return (cluster.pid, cluster.start, cluster.end, cluster.score, cluster.seq)

def bsearch(key, Y):
        lo = -1
        hi = len(Y)
        mid = -1
        while (hi - lo > 1):
            mid = int((hi+lo) / 2)
            if Y[mid].start < key:
                lo = mid
            else:
                hi = mid
        return hi
def Ryan_merge(b_sorted_clusters, y_sorted_clusters):
    Cluster = collections.namedtuple('Cluster', 'score pid seq mass start end ion hits')
    merge_seqs = []

    B = {}
    for c in b_sorted_clusters:
        if c.pid not in B:
            B[c.pid] = []
        B[c.pid].append(c)

    Y = {}
    for c in y_sorted_clusters:
        if c.pid not in Y:
            Y[c.pid] = []
        Y[c.pid].append(c)

    for pid in B:
        if pid not in Y:
            continue

        sorted_B = sorted(B[pid], key=operator.attrgetter('pid', 'start', 'end'))
        sorted_Y = sorted(Y[pid], key=operator.attrgetter('pid', 'start', 'end'))

        for b in sorted_B:
            y_i = bsearch(b.start, sorted_Y)

            if y_i >= len(sorted_Y): break

            y = sorted_Y[y_i]

            while y_i < len(sorted_Y) and y.start - b.end < 10:
                y = sorted_Y[y_i]
                merge_seqs.append((b.score + y.score, b.end - y.start, y.end-b.start,min_info(b), min_info(y)))
                y_i += 1
    return merge_seqs

In [18]:
m = Ryan_merge(b_sorted_clusters, y_sorted_clusters)
m.sort(key = lambda x: x[0], reverse = True) 
[print(m[x]) for x in range(0,50)]

(11, 8, 8, (274, 60, 68, 8, 'DPQVEQLEL'), (274, 60, 68, 3, 'DPQVEQLEL'))
(9, 2, 6, (274, 60, 68, 8, 'DPQVEQLEL'), (274, 66, 66, 1, 'L'))
(9, 2, 7, (274, 60, 68, 8, 'DPQVEQLEL'), (274, 66, 67, 1, 'LE'))
(9, -3, 12, (274, 60, 68, 8, 'DPQVEQLEL'), (274, 71, 72, 1, 'SP'))
(9, -7, 15, (274, 60, 68, 8, 'DPQVEQLEL'), (274, 75, 75, 1, 'L'))
(9, -10, 18, (274, 60, 68, 8, 'DPQVEQLEL'), (274, 78, 78, 1, 'L'))
(7, -6, 25, (201, 45, 56, 3, 'IEPPDTGLYYDE'), (201, 62, 70, 4, 'IEVLETDPH'))
(6, 8, 7, (112, 57, 71, 4, 'TQAGVEELDPENKIP'), (112, 63, 64, 2, 'EL'))
(6, 7, 7, (159, 1061, 1068, 4, 'IQEYYNKL'), (159, 1061, 1068, 2, 'IQEYYNKL'))
(6, 7, 6, (76, 640, 648, 3, 'KYNVSCIMI'), (76, 641, 646, 3, 'YNVSCI'))
(6, -6, 16, (76, 1399, 1402, 2, 'ELIE'), (76, 1408, 1415, 4, 'EIPHSELD'))
(6, -2, 22, (175, 46, 61, 3, 'TTESVKEQEMKWTDLA'), (175, 63, 68, 3, 'QGLHEN'))
(6, 1, 8, (259, 59, 62, 2, 'YYLE'), (259, 61, 67, 4, 'LESYGLE'))
(5, 6, 9, (274, 59, 66, 2, 'EDPQVEQL'), (274, 60, 68, 3, 'DPQVEQLEL'))
(5, 8, 13, (1

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [37]:
target_combos = merge_clusters(b_target_clusters, y_target_clusters, input_spectrum.precursor_mass, precursor_tolerance)
print(target_combos)
print(len(target_combos))

[('DPQVEQLEL', 13, True), ('EDPQVEQL', 7, True), ('VEDPQ', 6, True), ('EDPQVEQLL', 5, True), ('EDPQVEQL-L', 5, False), ('LGGSP-LE', 4, False), ('LGGSP-L', 4, False), ('LGGSP-L', 4, False), ('LGGSP-L', 4, False), ('LGGSP', 4, True), ('VEDPQ-LE', 4, False), ('VEDPQ-L', 4, False), ('VEDPQ-SP', 4, False), ('LE', 4, True), ('LE-L', 4, False), ('LE-L', 4, False), ('LE-SP', 4, False), ('EL-LE', 4, False), ('EL-L', 4, False), ('EL-L', 4, False), ('EL-SP', 4, False), ('LE', 4, True), ('LE-L', 4, False), ('LE-I', 4, False), ('LE', 4, True), ('LE-L', 4, False), ('PQ-LE', 4, False), ('PQ-L', 4, False), ('PQ-SP', 4, False), ('K-LC', 4, False), ('K-L', 4, False), ('T-LC', 4, False), ('T-L', 4, False), ('K-EALYLVC', 4, False), ('K-LC', 4, False), ('K-L', 4, False), ('K-L', 4, False), ('T-LE', 4, False), ('T-L', 4, False), ('T-L', 4, False), ('K-I', 4, False), ('T-LE', 4, False), ('T-IC', 4, False), ('T-I', 4, False), ('T-L', 4, False), ('T-L', 4, False)]
46


In [33]:
def modified_filter_by_precursor(b, y, pc, obs_prec, precursor_tolerance):
    new_seq = b + y
    tol = ppm_to_da(obs_prec, precursor_tolerance)
    if get_precursor(new_seq, charge=pc) > obs_prec + tol:
        return False
    else:
        return True

interesting_combos = []
for b in b_sorted_set:
    for y in y_sorted_set:
        if modified_filter_by_precursor(b, y, 2, input_spectrum.precursor_mass, precursor_tolerance):
            interesting_combos.append(b + '-' + y)
print(len(interesting_combos))

653528


In [22]:
print(merge_clusters(b_sorted_clusters, y_sorted_clusters, input_spectrum.precursor_mass, precursor_tolerance))

KeyboardInterrupt: 

# How big does mass have to be to get matches from both b and y ions

In [4]:
from utils import ppm_to_da
ppm_tolerance = 20

m = 1000

print(ppm_to_da(m, ppm_tolerance))

0.02


# Occurance

Q: How does occurance scale with the length of the peptide

In [None]:
mz_array = []
occurance_array = []
for key in matched_masses_b:
    for tup in matched_masses_b[key]:
        mz = tup[]
        

# Over all datasets

* Want to know:
    * What do good hits look like when put through extensions?
    * Only factor in the hits for which there exist a good hit from the b and y side. It doesn't make sense to talk about extensions for which we don't have good initial hits on both sides
        * How often does this happen?
        * Do we lose out on good combinations after penalizing double counting?