# 02-725 HW 4

In [151]:
pip install biopython

Note: you may need to restart the kernel to use updated packages.


### Imports

In [152]:
import pandas as pd 
import numpy as np 
from Bio import SeqIO

## Question 1 Database Search

In [153]:
def read_peak_list(file_path):
    """Read a peak-list file into a parent mass and a list of peak values.

    The file is read line by line. A line starting with ``PM=`` supplies the
    parent mass (if several are present, the last one wins); every other
    non-blank line is parsed as a single float and appended to the spectrum.

    Args:
        file_path: Path of the peak-list file to read.

    Returns:
        A ``(parent_mass, spectrum)`` tuple: the parent mass as a float and
        the peak values as a list of floats in file order.

    Raises:
        ValueError: If the file has no ``PM=`` line, or a non-blank line
            cannot be parsed as a float.
    """
    parent_mass = None
    spectrum = []
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no peak.
                continue
            if line.startswith('PM='):
                parent_mass = float(line.split('=', 1)[1])
            else:
                spectrum.append(float(line))
    if parent_mass is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(f"no 'PM=' line found in {file_path!r}")
    return parent_mass, spectrum
# Question 1 input: `pm` is the parent mass and `peak` the list of peak values
# read from the peak-list file.
pm, peak = read_peak_list("part_1.peaklist")

def read_fasta(f):
    """Return every sequence in a FASTA source as a plain string.

    Args:
        f: File name or handle passed straight to ``SeqIO.parse``.

    Returns:
        A list of sequence strings, one per FASTA record, in file order.
    """
    return [str(entry.seq) for entry in SeqIO.parse(f, "fasta")]
# Sequences from the FASTA database, as plain strings.
seq_list = read_fasta("sequence.fasta")


In [154]:
def trypsin_digestion(seq_fasta, cumulative=True):
    """Split each sequence after every K or R residue.

    Args:
        seq_fasta: Iterable of sequence strings.
        cumulative: When True (the default, and this notebook's original
            behaviour) the running peptide is NOT reset after a cleavage
            site, so each emitted entry is the prefix of the sequence up to
            and including that K/R, and residues after the last K/R are
            dropped. When False, the running peptide is reset after every
            K/R, giving non-overlapping peptides, and any residues left
            after the last K/R are emitted as a final peptide.

    Returns:
        A list with one entry per input sequence; each entry is the list of
        peptide strings produced for that sequence, in sequence order.

    NOTE(review): the cumulative default looks like a missing reset rather
    than an intended digestion. It is kept as the default because later
    cells index into this output by hard-coded position. Pass
    ``cumulative=False`` for a conventional digest.
    """
    all_digested_peptides = []
    for seq in seq_fasta:
        peptides = []
        current_peptide = ""
        for residue in seq:
            current_peptide += residue
            if residue in ("K", "R"):
                peptides.append(current_peptide)
                if not cumulative:
                    current_peptide = ""
        if not cumulative and current_peptide:
            # Tail after the last cleavage site.
            peptides.append(current_peptide)
        all_digested_peptides.append(peptides)
    return all_digested_peptides

# One list of peptide strings per database sequence (same order as seq_list).
trypsin_digested_fragments = trypsin_digestion(seq_list)

In [155]:
# Residue mass lookup keyed by one-letter amino-acid code.
amino_acid_masses = {
    'A': 71.037113787,
    'R': 156.101111026,
    'N': 114.042927446,
    'D': 115.026943031,
    'C': 103.009184477,
    'E': 129.042593095,
    'Q': 128.058577510,
    'G': 57.021463723,
    'H': 137.058911861,
    'I': 113.084063979,
    'L': 113.084063979,
    'K': 128.094963016,
    'M': 131.040484605,
    'F': 147.068413915,
    'P': 97.052763851,
    'S': 87.032028409,
    'T': 101.047678473,
    'W': 186.079312952,
    'Y': 163.063328537,
    'V': 99.068413915,
}


def calculate_mass(peptide):
    """Return the summed residue masses of ``peptide`` plus 1.00728.

    Args:
        peptide: String of one-letter residue codes found in
            ``amino_acid_masses``.

    Returns:
        The total as a float.

    Raises:
        KeyError: If ``peptide`` contains a code missing from the table.
    """
    # 1.00728 is presumably the proton mass (singly charged ion) - confirm.
    residue_total = sum(map(amino_acid_masses.__getitem__, peptide))
    return residue_total + 1.00728


def generate_ions(list_of_peptide_lists):
    """Build a sorted list of prefix and suffix ion masses for every peptide.

    For each peptide and each internal cut position ``1 .. len(peptide) - 1``
    the mass of the prefix (``calculate_mass``) and the mass of the suffix
    (``calculate_mass`` plus 18.01056) are collected.

    Args:
        list_of_peptide_lists: Iterable of lists of peptide strings.

    Returns:
        Dict mapping each peptide string to its ascending list of ion
        masses. A peptide that occurs more than once keeps a single entry.
    """
    spectrum_for_all_peptides = {}
    for peptide_list in list_of_peptide_lists:
        for peptide in peptide_list:
            cut_points = range(1, len(peptide))
            ion_masses = []
            # Prefix (b-type) ions first, then suffix (y-type) ions.
            ion_masses.extend(calculate_mass(peptide[:cut]) for cut in cut_points)
            ion_masses.extend(
                calculate_mass(peptide[cut:]) + 18.01056 for cut in cut_points
            )
            spectrum_for_all_peptides[peptide] = sorted(ion_masses)
    return spectrum_for_all_peptides
# Theoretical ion masses for every digested peptide, keyed by peptide string.
spectrum = generate_ions(trypsin_digested_fragments)

In [156]:
def find_best_match(peak_list, peptide_spectra, tolerance=0.02):
    """Pick the peptide whose theoretical spectrum explains the most peaks.

    A peak counts as explained when at least one theoretical mass lies
    within ``tolerance`` of it (inclusive).

    Args:
        peak_list: Iterable of observed peak values; each is converted with
            ``float``.
        peptide_spectra: Dict mapping peptide -> iterable of theoretical
            masses.
        tolerance: Maximum absolute difference for a peak to match.

    Returns:
        The peptide with the highest number of explained peaks. Ties go to
        the peptide seen first in ``peptide_spectra``. Returns ``None`` when
        no peptide explains any peak.
    """
    observed = [float(value) for value in peak_list]

    winner, top_score = None, 0
    for candidate, theoretical in peptide_spectra.items():
        score = sum(
            1
            for obs in observed
            if any(abs(obs - mz) <= tolerance for mz in theoretical)
        )
        if score <= top_score:
            # Strictly better scores only, so earlier candidates win ties.
            continue
        winner, top_score = candidate, score

    return winner

# Score every digested peptide against the Question 1 peaks and report the
# one with the most matched peaks (None if nothing matches).
best_match = find_best_match(peak, spectrum, tolerance=0.02)
print("The best match is", best_match)


The best match is IKRKAMAKEEFVRTKPHVNIGTIGHVDHGKTTLTAAISKVLNEKLGTSEAVKSFDQIDNAPEEKERGITINSAHIEYETEKRHYAHVDCPGHADYVKNMVTGAAQMDGAILVCAATDGPMPQTREHVLLARQVNVPRLVVFLNKCDMVDDEEMLELVEMELREILEQYGYEEDTPIVRGSALGALNGVEKWVKSVETLMDTVDEWIQEPEREIDKPFLMPIEDVFSITGRGTVATGRIETGRCKVGDEVQLLGLGEDKKSVITGVEMFRKILAEGEAGDNVGLLLRGIDKAEVKRGMVVVHPGAITPHDHFKASIYVLKKEEGGRHTPFGNKYRPQFYLRTMDCTGEIKLPEGVEMVMPGDNVEIEVELIYKVALNEGLRFAIREGGRTVGSGQITTILDDIK


## Question 2 Modification Discovery

In [158]:
def calculate_mass2(peptide):
    """Return the plain sum of residue masses for ``peptide``.

    Unlike ``calculate_mass``, no constant is added to the total.

    Args:
        peptide: String of one-letter codes present in ``amino_acid_masses``.

    Returns:
        The summed residue mass (``0`` for an empty string).
    """
    residue_total = sum(amino_acid_masses[residue] for residue in peptide)
    return residue_total

def get_possible_fragment_masses(fragments, mod_mass_loss=18):
    """Enumerate candidate masses for S/T modifications, one dict per fragment list.

    For a peptide, every combination of ``0..count('S')`` modified S residues
    and ``0..count('T')`` modified T residues is listed. Each modified
    residue lowers the unmodified mass (``calculate_mass2``) by
    ``mod_mass_loss``.

    Args:
        fragments: Iterable of peptide lists (one list per sequence).
        mod_mass_loss: Mass removed per modified S or T. Defaults to 18, the
            value originally hard-coded here.
            NOTE(review): the lowercase 's'/'t' residue masses used later in
            this notebook differ from S/T by 18.01056 - confirm which value
            is intended.

    Returns:
        A list with exactly one dict per entry of ``fragments``. Each dict
        maps a key ``"<s>-S-18_<t>-T-18"`` to the resulting mass.
        Only the LAST peptide of each list is represented, because the dict
        is rebuilt for every peptide and appended once per list; callers
        rely on this one-dict-per-list shape. A list with no peptides
        yields an empty dict.
    """
    all_possible_masses = []
    for fragment in fragments:
        # Start fresh for every fragment list, so a list with no peptides
        # gives {} rather than an unbound name or the previous list's dict.
        possible_masses = dict()
        for peptide in fragment:
            possible_masses = dict()
            max_mass = calculate_mass2(peptide)

            counts_S = peptide.count("S")
            counts_T = peptide.count("T")

            for s_mods in range(counts_S + 1):
                for t_mods in range(counts_T + 1):
                    mass_diff = (s_mods + t_mods) * mod_mass_loss
                    mod_key = f"{s_mods}-S-18_{t_mods}-T-18"
                    possible_masses[mod_key] = max_mass - mass_diff

        all_possible_masses.append(possible_masses)
    return all_possible_masses

# One dict of candidate modified masses per database sequence.
possible_masses = get_possible_fragment_masses(trypsin_digested_fragments)

In [159]:
# Re-reads the same FASTA file that was loaded for Question 1.
seq_list = read_fasta("sequence.fasta")
# NOTE: despite its name, `peaklist_file_path` holds the list of peak values
# returned by read_peak_list (second element of the tuple), not a path.
parent_mass, peaklist_file_path = read_peak_list("part_2.peaklist")

In [160]:
def find_mass_matches(mass_of_the_parent, modification_masses, tryp_peptides, tolerance=0.02):
    """Find candidate masses that lie within ``tolerance`` of the parent mass.

    Args:
        mass_of_the_parent: Observed parent mass to compare against.
        modification_masses: Sequence of dicts mapping a modification label
            to a candidate mass.
        tryp_peptides: Sequence indexed in parallel with
            ``modification_masses``; entry ``idx`` is returned alongside
            every hit found in dict ``idx``.
        tolerance: Maximum absolute mass difference (inclusive) for a hit.

    Returns:
        A tuple of two parallel lists:
        ``matching_fragments`` holds ``(modification, mass)`` pairs, and
        ``matching_fragments_indices`` holds ``(idx, tryp_peptides[idx])``
        pairs for the same hits.
    """
    matching_fragments = []
    matching_fragments_indices = []

    for idx, fragment_masses in enumerate(modification_masses):
        for modification, mass in fragment_masses.items():
            # Compare against the argument, not a notebook-level global.
            if abs(mass_of_the_parent - mass) <= tolerance:
                matching_fragments.append((modification, mass))
                matching_fragments_indices.append((idx, tryp_peptides[idx]))

    return matching_fragments, matching_fragments_indices


# Candidate (modification, mass) hits for the Question 2 parent mass, plus a
# parallel list `indices` of (sequence index, that sequence's peptide list).
matching_fragments, indices = find_mass_matches(parent_mass, possible_masses, trypsin_digested_fragments,tolerance=0.02)
# Drop the second hit. NOTE(review): hard-coded position, presumably chosen
# by inspecting the output - confirm. `indices` is NOT popped, so after this
# line matching_fragments[1] corresponds to indices[2].
matching_fragments.pop(1)
mass_matches = []
# Hard-coded picks: peptide at position 4 of the first hit's peptide list and
# peptide at position 2 of the third hit's peptide list. These positions
# depend on the exact output of trypsin_digestion for this data set.
mass_matches.append(indices[0][1][4])
mass_matches.append(indices[2][1][2])
print("Modifications that match the parent mass:", matching_fragments)
print("Fragments that match the parent mass with the modfications shown above", mass_matches)
# After printing, mark the modified residues of the first peptide in
# lowercase: the character at index 7 becomes 's' and every 'T' becomes 't'.
# The lowercase codes are looked up in amino_acid_masses_ques_2.
mass_matches[0] = mass_matches[0][:7] + 's' + mass_matches[0][8:]
mass_matches[0] = mass_matches[0].replace('T', 't')

Modifications that match the parent mass: [('1-S-18_3-T-18', 2213.09931742), ('0-S-18_0-T-18', 2213.0952611260004)]
Fragments that match the parent mass with the modfications shown above ['CPARRRCSATTCPRTPWWR', 'CRPWPDRSDPPISGPWPLR']


In [161]:
# Residue mass lookup for Question 2: the same uppercase one-letter codes as
# the Question 1 table, plus lowercase 't' and 's' entries with lower masses
# than 'T' and 'S'.
amino_acid_masses_ques_2 = {
    'A': 71.037113787,
    'R': 156.101111026,
    'N': 114.042927446,
    'D': 115.026943031,
    'C': 103.009184477,
    'E': 129.042593095,
    'Q': 128.058577510,
    'G': 57.021463723,
    'H': 137.058911861,
    'I': 113.084063979,
    'L': 113.084063979,
    'K': 128.094963016,
    'M': 131.040484605,
    'F': 147.068413915,
    'P': 97.052763851,
    'S': 87.032028409,
    'T': 101.047678473,
    'W': 186.079312952,
    'Y': 163.063328537,
    'V': 99.068413915,
    't': 83.0371184,
    's': 69.021468409,
}


def calculate_mass_ques_2(peptide):
    """Return the summed residue masses of ``peptide`` plus 1.00728.

    Args:
        peptide: String of codes found in ``amino_acid_masses_ques_2``
            (uppercase residues plus lowercase 's' / 't').

    Returns:
        The total as a float.

    Raises:
        KeyError: If ``peptide`` contains a code missing from the table.
    """
    residue_total = sum(map(amino_acid_masses_ques_2.__getitem__, peptide))
    return residue_total + 1.00728


def generate_ions_ques_2(peptide_list):
    """Build a sorted list of prefix and suffix ion masses for each peptide.

    For each internal cut position ``1 .. len(peptide) - 1`` the prefix mass
    (``calculate_mass_ques_2``) and the suffix mass
    (``calculate_mass_ques_2`` plus 18.01056) are collected.

    Args:
        peptide_list: Iterable of peptide strings (a flat list, unlike
            ``generate_ions`` which takes a list of lists).

    Returns:
        Dict mapping each peptide string to its ascending list of ion masses.
    """
    spectrum_for_all_peptides = {}
    for peptide in peptide_list:
        cut_points = range(1, len(peptide))
        ion_masses = []
        # Prefix (b-type) ions first, then suffix (y-type) ions.
        ion_masses.extend(calculate_mass_ques_2(peptide[:cut]) for cut in cut_points)
        ion_masses.extend(
            calculate_mass_ques_2(peptide[cut:]) + 18.01056 for cut in cut_points
        )
        spectrum_for_all_peptides[peptide] = sorted(ion_masses)
    return spectrum_for_all_peptides
# Theoretical ions for the two candidate peptides selected above.
spectrum_of_mass_matches = generate_ions_ques_2(mass_matches)

# `peaklist_file_path` is the list of Question 2 peak values (not a path).
# find_best_match returns None when no candidate matches any peak within the
# tolerance.
result = find_best_match(peaklist_file_path, spectrum_of_mass_matches, tolerance=0.02)
print(result)


None
