In [40]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils import ProtParam
import os

In [41]:
# Path to the input FASTA file; the digest helpers in this notebook pass it to
# `assignment`, which opens it and parses it with SeqIO in "fasta" format.
file_path = "files/sprot_human.fasta"

In [33]:
#sequences = [i for i in SeqIO.parse(file_path,'fasta')]
#len(sequences)

In [52]:
def assignment(file_path, enzyme_rule, NTT, missed_cleavages, peptide_length):
    """
    Enumerate peptide substrings of every protein in a FASTA file and keep the
    ones that satisfy the enzyme rule.

    A substring is kept when its first and its last residue are both members of
    ``enzyme_rule`` and its internal-site count (see ``NTT``) is within the limit.

    Args:
        file_path (str): Path to the FASTA file that contains the protein sequences.
        enzyme_rule (list): Collection of single-letter residues that define the rule.
            Each residue is tested with ``residue in enzyme_rule``; nested
            containers are NOT flattened, so an entry such as ``["R", "K"]``
            never matches a single residue.
        NTT (int): Upper bound on the number of positions inside a peptide where
            a rule residue is immediately followed by a non-rule residue.
            Peptides above this bound are dropped.
        missed_cleavages (int): Records are emitted with a
            ``peptide_missed_cleavages`` value from 0 up to this number.
        peptide_length (list): ``[minimum, maximum]`` peptide length, inclusive.

    Returns:
        list[dict]: One dictionary per emitted record with the keys
        ``protein_header``, ``protein_seq``, ``peptide_number`` (1-based running
        counter), ``peptide_sequence``, ``peptide_length``, ``peptide_NTT`` and
        ``peptide_missed_cleavages``.
    """
    min_len, max_len = peptide_length[0], peptide_length[1]
    # An empty slice has no first/last residue to test, so never go below 1.
    shortest = max(min_len, 1)
    peptides = []

    with open(file_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            prot_seq = str(record.seq)
            prot_header = record.description
            prot_len = len(prot_seq)

            # Evaluate the rule once per residue instead of once per candidate
            # peptide; every check below becomes an index lookup.
            in_rule = [residue in enzyme_rule for residue in prot_seq]

            for start in range(prot_len):
                # The first residue does not depend on the end position, so
                # reject the whole start position up front.
                if not in_rule[start]:
                    continue

                # `end` is an exclusive slice bound; it must be allowed to reach
                # prot_len, otherwise peptides that finish on the protein's last
                # residue are never produced.
                last_end = min(start + max_len, prot_len)
                for end in range(start + shortest, last_end + 1):
                    if not in_rule[end - 1]:
                        continue

                    pep_len = end - start
                    pep_NTT = sum(
                        1
                        for pos in range(start + 1, end)
                        if in_rule[pos - 1] and not in_rule[pos]
                    )
                    if pep_NTT > NTT:
                        continue

                    peptide_seq = prot_seq[start:end]

                    # NOTE(review): for each k, one record is emitted per rule
                    # residue found in the first (pep_len - k + 1) positions, so
                    # identical records (apart from peptide_number) can repeat.
                    # Kept as-is to preserve the output; confirm whether one
                    # record per peptide was intended.
                    for k in range(1, missed_cleavages + 2):
                        window = max(pep_len - k + 1, 0)
                        emit_count = sum(in_rule[start:start + window])
                        for _ in range(emit_count):
                            peptides.append({
                                "protein_header": prot_header,
                                "protein_seq": prot_seq,
                                "peptide_number": len(peptides) + 1,
                                "peptide_sequence": peptide_seq,
                                "peptide_length": pep_len,
                                "peptide_NTT": pep_NTT,
                                "peptide_missed_cleavages": k - 1
                            })

    return peptides

In [47]:
def print_peptide_of_interest(peptides,cleavage_rules,min_length,max_length):
    """
    Print one summary line per peptide record.

    Each line has the form
    ``<n>. (<first residue>)<inner residues>(<last residue>), <length>, <NTT>, <missed cleavages>``.

    Args:
        peptides (list | None): Peptide records as returned by ``assignment``
            (dictionaries with the keys ``peptide_sequence``, ``peptide_length``,
            ``peptide_NTT`` and ``peptide_missed_cleavages``). When ``None``,
            the records are produced by calling ``assignment`` on the
            notebook-level ``file_path``.
        cleavage_rules (list): Forwarded to ``assignment`` as ``enzyme_rule``
            (only used when ``peptides`` is ``None``).
        min_length, max_length: Forwarded positionally to ``assignment``
            (only used when ``peptides`` is ``None``).

    Returns:
        None
    """
    if peptides is None:
        # NOTE(review): min_length / max_length are passed in the positions of
        # assignment's NTT and missed_cleavages parameters, while the length
        # window is fixed at [7, 20] -- confirm this is the intended mapping.
        peptides = assignment(file_path,cleavage_rules,min_length,max_length,[7,20])

    for number, peptide in enumerate(peptides, start=1):
        # Key names match the records built by `assignment`.
        sequence = peptide['peptide_sequence']
        NTT = peptide['peptide_NTT']
        missed_cleavages = peptide['peptide_missed_cleavages']
        length = peptide['peptide_length']
        prefix = sequence[0]
        suffix = sequence[-1]
        print(f"{number}. ({prefix}){sequence[1:-1]}({suffix}), {length}, {NTT}, {missed_cleavages}")


In [53]:
# Settings for the digest run below.
# NOTE(review): `assignment` tests each residue with `in enzyme_rule`, so the
# nested ["R", "K"] entry can never equal a single-letter residue; as written
# only "C" matches. If "C" was meant as "C-terminal side" rather than the
# residue cysteine, the rule needs a different encoding -- confirm.
cleavage_rules = [["R", "K"], "C"]
NTT = 2  # passed as assignment's NTT limit
missed_cleavages = 1  # passed as assignment's missed_cleavages limit
peptide_length = [7,20]  # [minimum, maximum] peptide length

# Last expression of the cell: the full list of peptide records is displayed.
assignment(file_path, cleavage_rules, NTT, missed_cleavages,peptide_length)

[{'protein_header': 'sp|P61981|1433G_HUMAN 14-3-3 protein gamma OS=Homo sapiens OX=9606 GN=YWHAG PE=1 SV=2',
  'protein_seq': 'MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTSADGNEKKIEMVRAYREKIEKELEAVCQDVLSLLDNYLIKNCSETQYESKVFYLKMKGDYYRYLAEVATGEKRATVVESSEKAYSEAHEISKEHMQPTHPIRLGLALNYSVFYYEIQNAPEQACHLAKTAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDDDGGEGNN',
  'peptide_number': 1,
  'peptide_sequence': 'CQDVLSLLDNYLIKNC',
  'peptide_length': 16,
  'peptide_NTT': 1,
  'peptide_missed_cleavages': 0},
 {'protein_header': 'sp|P61981|1433G_HUMAN 14-3-3 protein gamma OS=Homo sapiens OX=9606 GN=YWHAG PE=1 SV=2',
  'protein_seq': 'MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTSADGNEKKIEMVRAYREKIEKELEAVCQDVLSLLDNYLIKNCSETQYESKVFYLKMKGDYYRYLAEVATGEKRATVVESSEKAYSEAHEISKEHMQPTHPIRLGLALNYSVFYYEIQNAPEQACHLAKTAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDDDGGEGNN',
  'peptide_number': 2,
  'peptide_sequence': 'CQDVLSLLDNYLIKNC',
  'peptide_length': 16,
  'peptide_NTT

In [38]:
# Save the peptide summaries to a FASTA file instead of only printing them.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
# Notebook-level path for the peptide FASTA output.
output_file = 'files/output.fasta'
def save_peptide_of_interest(protein, cleavage_rules, min_length, max_length, output_file):
    """
    Digest the notebook-level FASTA file and write the peptides to a FASTA file.

    Each peptide becomes one record whose id is ``<protein.id>_<n>`` (n is
    1-based) and whose description is
    ``(<first residue>)<inner residues>(<last residue>), <length>, <NTT>, <missed cleavages>``.

    Args:
        protein: Object exposing an ``id`` attribute; used as the record id prefix.
        cleavage_rules (list): Forwarded to ``assignment`` as ``enzyme_rule``.
        min_length, max_length: Forwarded positionally to ``assignment``.
        output_file (str): Destination path of the FASTA file.

    Returns:
        None
    """
    # NOTE(review): min_length / max_length are passed in the positions of
    # assignment's NTT and missed_cleavages parameters, while the length window
    # is fixed at [7, 20] -- confirm this is the intended mapping.
    peptides = assignment(file_path, cleavage_rules, min_length, max_length, [7,20])

    records = []
    for number, peptide in enumerate(peptides, start=1):
        # Key names match the records built by `assignment`.
        sequence = peptide['peptide_sequence']
        NTT = peptide['peptide_NTT']
        missed_cleavages = peptide['peptide_missed_cleavages']
        length = peptide['peptide_length']
        prefix = sequence[0]
        suffix = sequence[-1]
        description = f"({prefix}){sequence[1:-1]}({suffix}), {length}, {NTT}, {missed_cleavages}"
        records.append(
            SeqRecord(Seq(sequence), id=f"{protein.id}_{number}", description=description)
        )

    # Write to the caller-supplied path rather than a hard-coded location.
    SeqIO.write(records, output_file, "fasta")


In [39]:
#SeqIO.write(gene_of_interest,'Spring Semester/Proteomics Analysis/Home Work/output/output.xml','fasta')