# General finder

* The purpose of this notebook is to try to turn the hybrid merger into a general merge algorithm
* We will also add non-hybrid and hybrid labelling after the alignments

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from testing_framework import testing_utils
import database
from preprocessing import preprocessing_utils, merge_search, clustering
from identification import create_hits
from gen_spectra import get_precursor, gen_spectrum
import utils

import operator

# Search parameters shared by the cells below.
# Passed to merge_search.modified_match_masses as the peptide length limit.
max_peptide_length = 23
# Passed to preprocessing_utils.load_spectra and testing_utils.append_correct_hits.
# presumably the fragment-mass tolerance in ppm — TODO confirm
ppm_tolerance = 20
# Passed to get_matches as the precursor tolerance; the get_matches defined in
# this notebook converts it to Daltons with utils.ppm_to_da, so it is in ppm.
precursor_tolerance = 10
# Forwarded to load_spectra as its peak_filter keyword.
# presumably the number of peaks kept per spectrum — TODO confirm
peak_filter = 25
# Forwarded to load_spectra as its relative_abundance_filter keyword.
relative_abundance_filter = 0.1

import matplotlib.pyplot as plt

In [2]:
# Work with the first dataset returned by the testing utilities.
datasets = testing_utils.define_data()
dataset = datasets[0]

# dataset[0] is joined with the mzML file name to locate the input spectra.
input_spectra_path = [os.path.join(dataset[0], 'NOD2_E3.mzML')]
input_spectra, boundaries = preprocessing_utils.load_spectra(
    input_spectra_path,
    ppm_tolerance,
    peak_filter=peak_filter,
    relative_abundance_filter=relative_abundance_filter,
)

# Truth set for the same dataset (dataset is datasets[0]).
correct_sequences = testing_utils.generate_truth_set(dataset)

# dataset[2] is the path handed to database.build.
path = dataset[2]
db = database.build(path)

In [3]:
# Directory under the project root that is handed to modified_match_masses
# as its last argument.
write_path = os.path.abspath(os.path.join(module_path, 'intermediate_files'))
# Produces the b- and y-ion matched-mass tables plus a k-mer set from the
# spectrum boundaries and the database.
# NOTE(review): the meaning of the positional `True` flag is not visible in this
# notebook — confirm against merge_search.modified_match_masses.
matched_masses_b, matched_masses_y, kmer_set = merge_search.modified_match_masses(boundaries, db, max_peptide_length, True, write_path)
print('Finished matching masses')

On protein 279/279 [100%]
Sorting the set of protein masses...
Sorting the set of protein masses done
Performing Merge
Done
Finished matching masses


In [4]:
# Index of the spectrum examined in the rest of the notebook.
spectrum_num = 0

correct_sequence = correct_sequences[spectrum_num]
print(correct_sequence)

input_spectrum = input_spectra[spectrum_num]
# Reuse the project root computed in the first cell instead of rebuilding it.
# The trailing slash is kept because `location` is passed on to create_hits as-is.
location = os.path.join(module_path, 'intermediate_files/')
b_hits, y_hits = create_hits(spectrum_num, input_spectrum, matched_masses_b, matched_masses_y, False, location)
correct_hits = testing_utils.append_correct_hits(correct_sequence, input_spectrum, ppm_tolerance)

# Build and score the clusters once per ion type.
ion = 'b'
clusters = testing_utils.create_clusters(ion, b_hits, y_hits)
b_sorted_clusters = clustering.Score_clusters(ion, clusters)
ion = 'y'
clusters = testing_utils.create_clusters(ion, b_hits, y_hits)
y_sorted_clusters = clustering.Score_clusters(ion, clusters)

# Dump the scored clusters, one per line. writelines with a generator avoids
# building a throwaway list of write() return values.
with open(os.path.join(location, 'b_sorted_clusters.txt'), 'w') as b_out:
    b_out.writelines(str(x) + '\n' for x in b_sorted_clusters)
with open(os.path.join(location, 'y_sorted_clusters.txt'), 'w') as y_out:
    y_out.writelines(str(x) + '\n' for x in y_sorted_clusters)

print(len(b_hits), len(y_hits), len(boundaries), len(input_spectra))
# `clusters` holds the y-ion clusters at this point (last assignment above).
print(len(clusters), len(b_sorted_clusters), len(y_sorted_clusters))

DPQVEQLEL
21653 22360 26331 1086
20187 19757 20187


# General Merge

In [5]:
from constants import WATER_MASS, PROTON_MASS
import collections

def min_info(cluster):
    """Return the compact ``(pid, start, end, score, seq)`` tuple for a cluster."""
    fields = ('pid', 'start', 'end', 'score', 'seq')
    return tuple(getattr(cluster, field) for field in fields)

def check_for_hybrid_overlap(b_seq, y_seq, ion):
    """Detect an overlap between the end of ``b_seq`` and the start of ``y_seq``.

    The longest suffix of ``b_seq`` that is also a prefix of ``y_seq`` is
    treated as the overlap.

    Parameters:
        b_seq: sequence string on the b (left) side.
        y_seq: sequence string on the y (right) side.
        ion: ``'b'`` to trim the overlap from ``b_seq``; any other value trims
            it from ``y_seq``.

    Returns:
        ``(match, modified_seq)``. ``match`` is True when an overlap exists.
        ``modified_seq`` is the selected sequence with the overlap removed, or
        that sequence unchanged when there is no overlap.

    NOTE(review): the previous ``'y'`` branch was a partially edited copy of the
    ``'b'`` branch; trimming ``y_seq`` in that case is the assumed intent —
    confirm before relying on it.
    """
    # Scanning start positions left to right yields the longest overlap first.
    for i in range(len(b_seq)):
        overlap = b_seq[i:]
        if y_seq.startswith(overlap):
            print('Match was true for', b_seq)
            if ion == 'b':
                return True, b_seq[:i]
            return True, y_seq[len(overlap):]

    # No overlap (this also covers empty inputs): return the sequence untouched.
    return False, (b_seq if ion == 'b' else y_seq)

def grab_matches(b, indexed_clusters, target_val, ion):
    """Pair cluster ``b`` with every indexed cluster whose key is <= ``target_val``.

    ``indexed_clusters`` is walked in its stored order and the walk stops at the
    first key that is not ``<= target_val``, so the mapping is expected to be
    ordered by ascending key.

    Each result is a tuple
    ``(b.score + y.score, b.end - y.start, y.end - b.start, info_1, info_2)``
    where ``info_1, info_2`` are ``min_info(b), min_info(y)`` when ``ion == 'b'``
    and ``min_info(y), min_info(b)`` otherwise.

    NOTE(review): get_matches passes a y cluster as ``b`` when ``ion == 'y'``,
    so elements 1 and 2 are computed with the roles reversed compared with the
    ``'b'`` case — confirm this is intended.
    """
    pairs = []
    for mass, partners in indexed_clusters.items():
        if not (mass <= target_val):
            # Keys are ordered, so nothing further can qualify.
            break
        for y in partners:
            combined_score = b.score + y.score
            first_gap = b.end - y.start
            second_gap = y.end - b.start
            if ion == 'b':
                infos = (min_info(b), min_info(y))
            else:
                infos = (min_info(y), min_info(b))
            pairs.append((combined_score, first_gap, second_gap) + infos)
    return pairs
    
def index_by_precursor_mass(sorted_clusters, pc):
    """Group clusters by the precursor value of their sequence.

    Parameters:
        sorted_clusters: iterable of clusters exposing a ``seq`` attribute.
        pc: second argument forwarded to ``get_precursor`` (get_matches passes
            the precursor charge here).

    Returns:
        ``collections.OrderedDict`` mapping ``get_precursor(cluster.seq, pc)`` to
        the list of clusters with that value, ordered by ascending key. Clusters
        keep their input order within each list.
    """
    by_mass = {}
    for cluster in sorted_clusters:
        # Compute the precursor once per cluster instead of up to three times.
        mass = get_precursor(cluster.seq, pc)
        by_mass.setdefault(mass, []).append(cluster)
    return collections.OrderedDict(sorted(by_mass.items(), key=lambda item: item[0]))
    
def get_matches(b_sorted_clusters, y_sorted_clusters, obs_prec, precursor_tol, charge, top_n=10):
    """Pair the leading b and y clusters with partners from the opposite ion type.

    Parameters:
        b_sorted_clusters: b-ion clusters; only the first ``top_n`` are used as seeds.
        y_sorted_clusters: y-ion clusters; only the first ``top_n`` are used as seeds.
        obs_prec: observed precursor value of the spectrum.
        precursor_tol: tolerance, converted with ``utils.ppm_to_da(obs_prec, precursor_tol)``.
        charge: precursor charge passed to ``get_precursor``.
        top_n: number of leading clusters of each ion type to seed from
            (default 10, the previously hard-coded value; ``None`` uses all).

    Returns:
        List of tuples produced by ``grab_matches``, sorted by their first
        element (the combined score) in descending order.
    """
    # The tolerance depends only on the observed precursor, so compute it once.
    tol = utils.ppm_to_da(obs_prec, precursor_tol)
    upper_bound = obs_prec + tol

    ind_b = index_by_precursor_mass(b_sorted_clusters, charge)
    ind_y = index_by_precursor_mass(y_sorted_clusters, charge)

    merged_seqs = []
    # b seeds search the y index and vice versa; b seeds are processed first.
    passes = (
        ('b', b_sorted_clusters, ind_y),
        ('y', y_sorted_clusters, ind_b),
    )
    for ion, seeds, partner_index in passes:
        for cluster in seeds[:top_n]:
            cluster_mass = get_precursor(cluster.seq, charge)
            if cluster_mass > upper_bound:
                # The seed alone already exceeds the allowed precursor.
                continue
            diff = upper_bound - cluster_mass + (charge * PROTON_MASS) + WATER_MASS
            merged_seqs.extend(grab_matches(cluster, partner_index, diff, ion))

    merged_seqs.sort(key=lambda a: a[0], reverse=True)
    return merged_seqs

#All inputs are the same
merged_seqs = clustering.get_matches(b_sorted_clusters, y_sorted_clusters, input_spectrum.precursor_mass, precursor_tolerance, input_spectrum.precursor_charge)
print(len(merged_seqs))
[print(x) for x in merged_seqs[:50]]
# with open(os.path.join(location, 'merged_seqs.txt'), 'w') as b:
#     [b.write(str(x) + '\n') for x in merged_seqs]

160994
(7, 2431, -2424, (278, 148, 149, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2378, -2371, (278, 201, 202, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2377, -2370, (278, 202, 203, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2302, -2295, (278, 277, 278, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2448, -2441, (275, 131, 132, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2430, -2423, (273, 149, 150, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2376, -2369, (273, 203, 204, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2275, -2268, (273, 304, 305, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2293, -2286, (269, 286, 287, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2042, -2035, (269, 537, 538, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2472, -2465, (266, 107, 108, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2395, -2388, (264, 184, 185, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2331, -2324, (264, 248, 249, 2, 'TT'), (158, 2573, 2579, 5, 'LDSFSEI'))
(7, 2

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]