In [3]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils import ProtParam
import os

In [4]:
# FASTA file of protein sequences that is handed to assignment() further down.
# The path is relative, so it is resolved against the notebook's working directory.
file_path = "files/sprot_human.fasta"

In [5]:
def _parse_enzyme_rule(enzyme_rule):
    """Split enzyme_rule into (set of cleavage residues, cleavage side 'C' or 'N')."""
    rule = list(enzyme_rule)
    if rule and isinstance(rule[0], (list, tuple, set, frozenset)):
        # Nested form: [residues, side], e.g. [['R', 'K'], 'C'].
        residues = set(rule[0])
        side = rule[1] if len(rule) > 1 else "C"
    else:
        # Flat form: every entry is a cleavage residue, e.g. ['R', 'K'].
        residues = set(rule)
        side = "C"
    side = str(side).upper()
    if side not in ("C", "N"):
        raise ValueError("enzyme_rule side must be 'C' or 'N', got {!r}".format(side))
    return residues, side


def _cleavage_boundaries(prot_seq, residues, side):
    """Return flags where flags[k] is True if a cut lies between prot_seq[k-1] and prot_seq[k]."""
    n = len(prot_seq)
    # A C-terminal cutter cuts after the residue, an N-terminal cutter before it.
    offset = 1 if side == "C" else 0
    flags = [False] * (n + 1)
    for pos, residue in enumerate(prot_seq):
        if residue in residues:
            flags[pos + offset] = True
    # Both ends of the protein always count as valid peptide termini.
    flags[0] = True
    flags[n] = True
    return flags


def assignment(file_path, enzyme_rule, NTT, missed_cleavages, peptide_length):
    """
    Generate peptides from every protein in a FASTA file by in-silico enzymatic digestion.

    Each peptide is reported exactly once, together with its flanking residues, the number
    of its termini that coincide with a cleavage boundary and the number of cleavage
    boundaries that lie inside it.

    Args:
        file_path (str): File path to the FASTA file that contains the protein sequences.
        enzyme_rule (list): Cleavage rule of the enzyme. Two forms are accepted:
            * a flat collection of cleavage residues, e.g. ['R', 'K']; the cut is then
              placed on the C-terminal side of each listed residue;
            * a pair [residues, side], e.g. [['R', 'K'], 'C'], where side is 'C' (cut
              after the residue) or 'N' (cut before the residue).
            NOTE(review): the second element of the nested form is interpreted as the
            cleavage side, not as an extra residue -- confirm against the assignment spec.
        NTT (int): Minimum number of peptide termini (0, 1 or 2) that must sit on a
            cleavage boundary. Protein start and end count as boundaries.
            NOTE(review): "minimum" is the usual search-engine meaning (2 = fully
            enzymatic, 1 = semi-enzymatic); confirm this is the intended meaning here.
        missed_cleavages (int): Maximum number of cleavage boundaries allowed inside a
            peptide, i.e. sites the enzyme could have cut but did not.
        peptide_length (list): Two integers, the minimum and maximum peptide length
            (both inclusive).

    Returns:
        peptides (list): One dictionary per peptide with the keys protein_header,
            protein_seq, peptide_number (1-based, running across the whole file),
            peptide_sequence, pre_residue and post_residue ("-" at a protein end),
            peptide_length, peptide_NTT and peptide_missed_cleavages.

    Raises:
        ValueError: If the cleavage side given in enzyme_rule is neither 'C' nor 'N'.
    """
    residues, side = _parse_enzyme_rule(enzyme_rule)
    min_len = max(1, peptide_length[0])  # an empty peptide is never meaningful
    max_len = peptide_length[1]
    peptides = []  # collected peptide records, in file order

    with open(file_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            prot_seq = str(record.seq)
            prot_header = record.description
            n = len(prot_seq)

            is_boundary = _cleavage_boundaries(prot_seq, residues, side)

            # sites_before[k] = number of boundaries at indices < k, so the number of
            # boundaries strictly inside prot_seq[i:j] is a constant-time difference.
            sites_before = [0] * (n + 2)
            for k in range(n + 1):
                sites_before[k + 1] = sites_before[k] + is_boundary[k]

            for i in range(n):
                start_ntt = int(is_boundary[i])
                # Even a perfect end terminus cannot lift this start to the required NTT.
                if start_ntt + 1 < NTT:
                    continue

                # j is the exclusive end index; "+ 1" lets a peptide end on the last residue.
                for j in range(i + min_len, min(i + max_len, n) + 1):
                    pep_missed = sites_before[j] - sites_before[i + 1]
                    if pep_missed > missed_cleavages:
                        # The count can only grow as j increases, so stop extending.
                        break

                    pep_NTT = start_ntt + int(is_boundary[j])
                    if pep_NTT < NTT:
                        continue

                    peptide_seq = prot_seq[i:j]
                    peptides.append({
                        "protein_header": prot_header,
                        "protein_seq": prot_seq,
                        "peptide_number": len(peptides) + 1,
                        "peptide_sequence": peptide_seq,
                        "pre_residue": prot_seq[i - 1] if i > 0 else "-",
                        "post_residue": prot_seq[j] if j < n else "-",
                        "peptide_length": len(peptide_seq),
                        "peptide_NTT": pep_NTT,
                        "peptide_missed_cleavages": pep_missed
                    })
    return peptides

In [6]:
# Run the digest over the FASTA file; arguments are passed by keyword so each
# setting is labelled at the call site.
results = assignment(
    file_path,
    enzyme_rule=[['R', 'K'], 'C'],
    NTT=2,
    missed_cleavages=1,
    peptide_length=[7, 20],
)

In [7]:
# Print one entry per peptide row. The format string must carry one "{}" per
# argument: str.format silently ignores surplus positional arguments, so with
# only six placeholders the seventh value (the missed-cleavage count) was
# never shown.
for result in results:
    print("({}\n,{}){}, {}, {}, {}, {}".format(
        result['protein_header'],
        result['protein_seq'],
        result['peptide_sequence'][0],
        result['peptide_sequence'][1:-1],
        result['peptide_sequence'][-1],
        result['peptide_length'],
        result['peptide_missed_cleavages']
    ))