In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import database
from testing_framework import testing_utils
from preprocessing import preprocessing_utils, merge_search, clustering
from identification import create_hits
from utils import ppm_to_da
from gen_spectra import get_precursor
import operator
import collections


# Assumptions / tunable parameters shared by the cells below.
max_peptide_length = 23  # passed to merge_search.modified_match_masses
ppm_tolerance = 20  # passed positionally to preprocessing_utils.load_spectra
peak_filter = 50  # forwarded to load_spectra(peak_filter=...)
relative_abundance_filter = 0.1  # forwarded to load_spectra(relative_abundance_filter=...)
precursor_tolerance = 10  # handed to ppm_to_da() by the precursor filters below, so presumably in ppm
DEBUG = False  # forwarded to create_hits

import matplotlib.pyplot as plt

In [3]:
# Pick the first dataset returned by the testing utilities and load
# everything the later cells need from it: spectra, truth set and database.
datasets = testing_utils.define_data()
dataset = datasets[0]

# dataset[0] is used as the directory holding the mzML file.
input_spectra_path = [os.path.join(dataset[0], 'NOD2_E3.mzML')]
input_spectra, boundaries = preprocessing_utils.load_spectra(
    input_spectra_path,
    ppm_tolerance,
    peak_filter=peak_filter,
    relative_abundance_filter=relative_abundance_filter,
)

correct_sequences = testing_utils.generate_truth_set(dataset)

# dataset[2] is the path handed to the database builder.
path = dataset[2]
db = database.build(path)

In [None]:
# Directory under the project root that is handed to the mass-matching step.
write_path = os.path.abspath(os.path.join(module_path, 'intermediate_files'))
# NOTE(review): the positional `True` is an opaque flag of modified_match_masses;
# its meaning is not visible from this notebook -- confirm and pass it by keyword.
matched_masses_b, matched_masses_y, kmer_set = merge_search.modified_match_masses(boundaries, db, max_peptide_length, True, write_path)
print('Finished matching masses')

On protein 279/279 [100%]

In [None]:
# Derive unique_b / unique_y from the boundaries and matched masses.
# Both are consumed by get_hybrid_matches further down (unique_b as an argument,
# unique_y as a notebook-level global), where grab_matches iterates them via
# .keys() and subtracts mass constants from each key.
unique_b,unique_y = testing_utils.get_unique_matched_masses(boundaries, matched_masses_b, matched_masses_y)

In [None]:
from gen_spectra import gen_spectrum
# Index of the spectrum examined by every following cell.
spectrum_num = 5

# Show the truth-set entry stored at the same index as the spectrum.
correct_sequence = correct_sequences[spectrum_num]
print(correct_sequence)

# Loaded spectrum at that index; later cells read its .precursor_mass.
input_spectrum = input_spectra[spectrum_num]

In [None]:
# Build b/y hits for the selected spectrum, cluster and score them once per
# ion type, then merge the scored b and y clusters into candidate sequences.
location = os.path.join(os.path.abspath(os.path.join('../..')), 'intermediate_files/')
b_hits, y_hits = create_hits(spectrum_num, input_spectrum, matched_masses_b, matched_masses_y, DEBUG, location)

sorted_clusters_by_ion = {}
for ion in "by":
    clusters = clustering.create_clusters(ion, b_hits, y_hits)
    sorted_clusters_by_ion[ion] = clustering.Score_clusters(ion, clusters)
b_sorted_clusters = sorted_clusters_by_ion['b']
y_sorted_clusters = sorted_clusters_by_ion['y']

merged_seqs = clustering.Ryan_merge(b_sorted_clusters, y_sorted_clusters)
# Order the merged candidates by their first element, largest first.
merged_seqs.sort(key=lambda merged: merged[0], reverse=True)

print(len(b_hits), len(y_hits), len(boundaries), len(input_spectra))
# `clusters` still refers to the y-ion clusters from the last loop pass.
print(len(clusters), len(b_sorted_clusters), len(y_sorted_clusters))

In [None]:
# Print score and sequence for the first 50 entries of b_sorted_clusters.
# Slicing (instead of indexing 0..49) keeps the cell from raising IndexError
# when fewer than 50 clusters were produced for this spectrum.
for cluster in b_sorted_clusters[:50]:
    print(cluster.score, cluster.seq)

In [None]:
# Print score and sequence for the first 50 entries of y_sorted_clusters.
# Slicing (instead of indexing 0..49) keeps the cell from raising IndexError
# when fewer than 50 clusters were produced for this spectrum.
for cluster in y_sorted_clusters[:50]:
    print(cluster.score, cluster.seq)

In [None]:
def filter_by_precursor(mseqs, obs_prec, precursor_tol, charge):
    """Keep merged candidates whose precursor does not exceed the observed one.

    Args:
        mseqs: iterable of merged entries. Each entry is indexed as
            ``entry[3][4]`` (b-side sequence) and ``entry[4][4]`` (y-side
            sequence).
        obs_prec: observed precursor value of the spectrum.
        precursor_tol: tolerance handed to ``ppm_to_da`` together with
            ``obs_prec``.
        charge: charge passed to ``get_precursor``.

    Returns:
        A list with the entries of ``mseqs`` (original order) for which
        ``get_precursor(candidate, charge)`` is not greater than
        ``obs_prec + ppm_to_da(obs_prec, precursor_tol)``.
    """
    # The upper bound depends only on the observed precursor, so it is
    # computed once instead of once per candidate.
    upper_bound = obs_prec + ppm_to_da(obs_prec, precursor_tol)

    filtered_seqs = []
    for comb_seq in mseqs:
        b_seq = comb_seq[3][4]
        y_seq = comb_seq[4][4]
        # Identical b and y sequences describe one peptide; otherwise the
        # candidate is the b sequence followed by the y sequence.
        candidate = b_seq if b_seq == y_seq else b_seq + y_seq
        if not (get_precursor(candidate, charge) > upper_bound):
            filtered_seqs.append(comb_seq)
    return filtered_seqs
        
# Run the precursor filter (charge 2) over the first 50 merged candidates
# and list every survivor.
m_seqs = filter_by_precursor(merged_seqs[:50], input_spectrum.precursor_mass, precursor_tolerance, 2)
for candidate in m_seqs:
    print(candidate)

In [None]:
from constants import WATER_MASS, PROTON_MASS
def grab_matches(unique_m, target_val):
    """Collect the non-empty values of ``unique_m`` up to a mass cut-off.

    Keys are visited in the mapping's iteration order. A key qualifies while
    ``key - WATER_MASS - 2*PROTON_MASS <= 2*target_val``; iteration stops at
    the first key that fails this test, so later keys are never inspected.

    Args:
        unique_m: mapping whose keys are compared numerically and whose
            values are the candidate groups to pair with.
        target_val: value that sets the cut-off.

    Returns:
        List of the qualifying values that are not equal to ``[]``.
    """
    collected = []
    for mass, candidates in unique_m.items():
        if not (mass - WATER_MASS - (2 * PROTON_MASS) <= 2 * target_val):
            # First key past the cut-off ends the scan.
            break
        if candidates != []:
            collected.append(candidates)
    return collected

def make_hybrids(pairs, seq, ion):
    """Join ``seq`` with every partner sequence using a '-' separator.

    Args:
        pairs: iterable of groups, each an iterable of partner sequences.
        seq: the fixed sequence.
        ion: 'b' places ``seq`` on the left of the dash; any other value
            places it on the right.

    Returns:
        Flat list of hybrid strings, in group order then partner order.
    """
    if ion == 'b':
        return [seq + '-' + partner for group in pairs for partner in group]
    return [partner + '-' + seq for group in pairs for partner in group]

def score_seqs(mseqs, prec, charge=2):
    """Score dash-separated sequences by how far they fall below ``prec``.

    The score of a sequence is ``prec - get_precursor(joined, charge)``,
    where ``joined`` is the sequence with trailing whitespace and every '-'
    removed. Only sequences with a strictly positive score are kept.

    Args:
        mseqs: iterable of sequence strings such as ``'LEFT-RIGHT'``. A
            string without a dash is scored as is instead of raising
            IndexError.
        prec: observed precursor value to compare against.
        charge: charge passed to ``get_precursor``. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        List of ``(score, seq)`` tuples sorted by ascending score, with
        ``seq`` being the unmodified input string.
    """
    scored_seqs = []
    for seq in mseqs:
        joined = seq.rstrip().replace('-', '')
        score = prec - get_precursor(joined, charge)
        if score > 0:
            scored_seqs.append((score, seq))
    # Smallest remaining gap to the observed precursor comes first.
    scored_seqs.sort(key=lambda item: item[0])
    return scored_seqs
    
def get_hybrid_matches(b_sorted_clusters, unique_b, obs_prec, precursor_tol, charge,
                       y_sorted_clusters=None, unique_y=None):
    """Build scored hybrid candidates from the leading b and y clusters.

    For each of the first 50 clusters per ion type, the cluster is skipped if
    ``get_precursor(cluster.seq, charge)`` exceeds
    ``obs_prec + ppm_to_da(obs_prec, precursor_tol)``. Otherwise the remaining
    gap to that bound is passed to ``grab_matches`` to collect partners, the
    partners are joined to the cluster sequence with ``make_hybrids``, and
    the result is scored with ``score_seqs``.

    Args:
        b_sorted_clusters: sliceable sequence of b clusters exposing ``.seq``.
        unique_b: mapping handed to ``grab_matches`` for b clusters.
        obs_prec: observed precursor value of the spectrum.
        precursor_tol: tolerance handed to ``ppm_to_da`` with ``obs_prec``.
        charge: charge used for the cluster precursor computation.
        y_sorted_clusters: y clusters. When omitted, the notebook-level
            global of the same name is used, which is what the function
            previously read implicitly.
        unique_y: mapping handed to ``grab_matches`` for y clusters. When
            omitted, the notebook-level global of the same name is used.

    Returns:
        List of ``(score, hybrid)`` tuples: all b-cluster hybrids first, then
        all y-cluster hybrids, each cluster's hybrids in ``score_seqs`` order.

    Raises:
        KeyError: if a y argument is omitted and no global of that name
            exists.
    """
    # Backward compatibility for callers that pass only the b-side inputs.
    if y_sorted_clusters is None:
        y_sorted_clusters = globals()['y_sorted_clusters']
    if unique_y is None:
        unique_y = globals()['unique_y']

    # The bound depends only on the observed precursor; compute it once.
    mass_limit = obs_prec + ppm_to_da(obs_prec, precursor_tol)

    # NOTE(review): grab_matches and score_seqs are called with their
    # two-argument form, which is written for charge 2 -- confirm before
    # using another charge here.
    merged_seqs = []
    per_ion_inputs = (
        ('b', b_sorted_clusters, unique_b),
        ('y', y_sorted_clusters, unique_y),
    )
    for ion, clusters, unique_m in per_ion_inputs:
        for cluster in clusters[:50]:
            cluster_seq = cluster.seq
            cluster_mass = get_precursor(cluster_seq, charge)
            if cluster_mass > mass_limit:
                # Already heavier than the spectrum allows; nothing can be added.
                continue
            diff = mass_limit - cluster_mass
            pairs = grab_matches(unique_m, diff)
            hybrids = make_hybrids(pairs, cluster_seq, ion)
            merged_seqs.extend(score_seqs(hybrids, obs_prec))
    return merged_seqs

# Build hybrid candidates for the selected spectrum at charge 2.
# NOTE: this rebinds `merged_seqs`, replacing the Ryan_merge result created in
# an earlier cell; re-running the precursor-filter cell afterwards would feed
# it these (score, hybrid) tuples instead.
merged_seqs = get_hybrid_matches(b_sorted_clusters[:50], unique_b, input_spectrum.precursor_mass, precursor_tolerance, 2)
# A plain loop avoids the cell echoing a list of None values, which the
# print-inside-a-list-comprehension form produced as its last expression.
for hybrid in merged_seqs[:50]:
    print(hybrid)