## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
# Number of amino acids on either side of a site to analyze
analysis_threshold = 20

# Mass tags of the two label forms we are looking for, written exactly as they
# appear between the brackets of "M[...]". They are literal strings; the regex
# is built from them later.
light_modification = "649.3660"
heavy_modification = "655.3735"
modifications = [light_modification, heavy_modification]

## Load Dataset - Coverage

In [3]:
# Resolve the notebook directory and the shared data directory to absolute paths
curr_dir_path_str = "./"
global_data_path_str = "../global_data"

curr_dir_path, global_data_path = (
    os.path.abspath(relative) for relative in (curr_dir_path_str, global_data_path_str)
)

print(f"Current Directory: {curr_dir_path}")
print(f"global_data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/MsrKD
global_data Directory: /Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/global_data


In [4]:
# Read the "Coverage" sheet of the proteomics workbook located in the notebook directory
data_loc = os.path.join(curr_dir_path, "05_06_24_Combined_Proteomics.xlsx")
experimental_peptides = pd.read_excel(data_loc, sheet_name="Coverage")
experimental_peptides;  # trailing semicolon suppresses the cell output

In [5]:
# Canonicalize data - drop the extra column and normalise the bracket style
# used in the modifications column.
# errors="ignore" together with a non-inplace drop keeps this cell re-runnable:
# a second run no longer fails because "Unnamed: 14" is already gone.
experimental_peptides = experimental_peptides.drop(columns=["Unnamed: 14"], errors="ignore")

# Missing modifications become a single space so the .str accessors below (and
# the later .str.contains filter) never see NaN.
# regex=False: "(" and ")" are literal characters here, not regex groups; the
# default of `regex` differs between pandas versions, so it is stated explicitly.
experimental_peptides["Assigned Modifications"] = (
    experimental_peptides["Assigned Modifications"]
    .fillna(" ")
    .str.replace("(", "[", regex=False)
    .str.replace(")", "]", regex=False)
)
experimental_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,Protein ID,Entry Name,Gene,Protein Description
0,AAALVLQTIWGYK,AAALVLQTIWGYK,R,E,821,833,13,2,,sp|P30999|CTND1_MOUSE,P30999,CTND1_MOUSE,Ctnnd1,Catenin delta-1
1,AAATFNPELITHILDGSPENTRR,AAATFNPELITHILDGSPENTRR,R,R,10,32,23,4,,sp|Q9R0H0|ACOX1_MOUSE,Q9R0H0,ACOX1_MOUSE,Acox1,Peroxisomal acyl-coenzyme A oxidase 1
2,AAAVGIAQVVISR,AAAVGIAQVVISR,R,I,220,232,13,2,,sp|Q925N2|SFXN2_MOUSE,Q925N2,SFXN2_MOUSE,Sfxn2,Sideroflexin-2
3,AAAVGIAQVVISRITMAAPGMILLPVIMER,AAAVGIAQVVISRITMAAPGMILLPVIMER,R,L,220,249,30,34,,sp|Q925N2|SFXN2_MOUSE,Q925N2,SFXN2_MOUSE,Sfxn2,Sideroflexin-2
4,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,R,L,220,252,33,4,,sp|Q925N2|SFXN2_MOUSE,Q925N2,SFXN2_MOUSE,Sfxn2,Sideroflexin-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5447,STFRLALIQLQVSSIKSDNLTR,n[42.0106]STFRLALIQLQVSSIKSDNLTR,M,A,2,23,22,3,N-term[42.0106],sp|Q9JHW2|NIT2_MOUSE,Q9JHW2,NIT2_MOUSE,Nit2,Omega-amidase NIT2
5448,TDAAVSFAK,n[42.0106]TDAAVSFAK,M,D,2,10,9,2,N-term[42.0106],sp|P51881|ADT2_MOUSE,P51881,ADT2_MOUSE,Slc25a5,ADP/ATP translocase 2
5449,TDAAVSFAKDFLAGGVAAAISK,n[42.0106]TDAAVSFAKDFLAGGVAAAISK,M,T,2,23,22,23,N-term[42.0106],sp|P51881|ADT2_MOUSE,P51881,ADT2_MOUSE,Slc25a5,ADP/ATP translocase 2
5450,TDAAVSFAKDFLAGGVAAAISKTAVAPIER,n[42.0106]TDAAVSFAKDFLAGGVAAAISKTAVAPIER,M,V,2,31,30,234,N-term[42.0106],sp|P51881|ADT2_MOUSE,P51881,ADT2_MOUSE,Slc25a5,ADP/ATP translocase 2


In [6]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    """Return a regex matching a methionine that carries any of the given mass tags.

    Every entry of ``modifications`` is taken literally (it is escaped, never
    interpreted as a regex) and becomes one alternative of the form
    ``M[<entry>]`` with the brackets and the decimal point escaped. The
    alternatives are joined with ``|`` in the order given.

    Parameters
    ----------
    modifications : sequence of str
        Mass tags exactly as they appear between the brackets,
        e.g. ``"649.3660"``. Tags without a decimal point are accepted.

    Returns
    -------
    str
        The combined pattern, suitable for ``re`` / ``Series.str.contains``.

    Raises
    ------
    ValueError
        If ``modifications`` is empty. An empty pattern would match every
        string, so it is rejected instead of being returned.
    """
    if len(modifications) == 0:
        raise ValueError("modifications must contain at least one mass tag")

    # re.escape handles the "." (and any other metacharacter) so the tag only
    # ever matches itself.
    return "|".join(r"M\[{}\]".format(re.escape(mod)) for mod in modifications)

# Build the pattern once from the configured modifications and print it for inspection
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [7]:
# Keep only the peptides whose assigned modifications contain one of the target mass tags
has_target_modification = experimental_peptides["Assigned Modifications"].str.contains(modifications_pattern)
modified_peptides = experimental_peptides.loc[has_target_modification]
modified_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,Protein ID,Entry Name,Gene,Protein Description
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch..."
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,Q9CQ54,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,Q9CQR4,ACO13_MOUSE,Acot13,Acyl-coenzyme A thioesterase 13


In [8]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    """Download the FASTA entry for a UniProt accession and return its sequence.

    Parameters
    ----------
    cID : str
        UniProt accession, e.g. ``"P24270"``.

    Returns
    -------
    str
        Amino-acid sequence of the first record in the returned FASTA text.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an error status (e.g. unknown accession).
    ValueError
        If the response contains no FASTA record.
    """
    # Fetching a record is a read, so use GET (the original POSTed), over
    # HTTPS, against the current UniProt REST endpoint.
    url = "https://rest.uniprot.org/uniprotkb/" + cID + ".fasta"
    # The timeout keeps a stalled connection from hanging the notebook.
    response = r.get(url, timeout=30)
    response.raise_for_status()

    records = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not records:
        raise ValueError(f"No FASTA record returned for UniProt ID {cID!r}")

    return str(records[0].seq)

In [9]:
# Distinct protein IDs among the modified peptides, displayed together with their count
unique_uniprotIDs = modified_peptides["Protein ID"].unique()
unique_uniprotIDs, unique_uniprotIDs.size

(array(['Q8C196', 'Q9WUR2', 'Q9Z1P6', 'Q8K370', 'P48410', 'P51658',
        'Q8QZY2', 'Q8VC30', 'Q99LC5', 'P51881', 'Q91VS7', 'Q61335',
        'Q8BH95', 'Q8BGT5', 'Q920A5', 'Q8BWT1', 'Q9DB77', 'Q9QXF8',
        'Q91WU0', 'P97807', 'P30115', 'Q99JY0', 'P63038', 'Q8BM55',
        'P43024', 'P54116', 'Q9DBJ1', 'Q9CYV5', 'Q61102', 'Q9WTP6',
        'Q9JJW0', 'Q9CPQ8', 'Q8CGK3', 'P97872', 'P51660', 'Q60936',
        'P54869', 'Q61733', 'Q925I1', 'Q8CAQ8', 'P38647', 'Q9DBG1',
        'P63030', 'Q01853', 'Q9Z2I8', 'P26443', 'O35129', 'Q9D6J5',
        'P24270', 'P08226', 'Q9DCM2', 'Q05920', 'P19783', 'P52825',
        'Q9DBL7', 'Q9R0H0', 'Q8CC88', 'Q99MR8', 'Q8BGY7', 'P29758',
        'O35423', 'G3X982', 'Q03265', 'Q9CRB9', 'Q9CQN1', 'P61620',
        'Q91VR2', 'Q8VDN2', 'O35386', 'P33267', 'Q61425', 'P20029',
        'O88962', 'Q91WL5', 'Q9JKR6', 'P32020', 'Q8VI47', 'Q91YI0',
        'Q9CQZ5', 'O35490', 'Q8VCW8', 'P08249', 'P97450', 'Q64433',
        'Q7TNG8', 'Q8BUY5', 'Q80W54', 'P16460', 

In [10]:
# load known completed sequences; the saved "Unnamed: 0" column becomes an
# unnamed index again
path = os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv")
unique_IDs_to_sequence_df = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
unique_IDs_to_sequence_df;

In [11]:
# Attach the mapping's columns to every modified peptide via its protein ID.
# A left join keeps all peptide rows, including those without a mapping entry.
modified_peptides = pd.merge(
    modified_peptides,
    unique_IDs_to_sequence_df,
    how="left",
    on="Protein ID",
)
modified_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,Protein ID,Entry Name,Gene,Protein Description,Complete Sequence
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
2,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
4,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,Q9CQ54,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...
730,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
731,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
732,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,Q9CQR4,ACO13_MOUSE,Acot13,Acyl-coenzyme A thioesterase 13,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...


In [12]:
# 0-based offset of each peptide inside its protein's complete sequence.
# str.find returns the first occurrence, or -1 when the peptide is absent.
# The list is assigned directly (positionally). Wrapping it in pd.Series would
# give it a fresh 0..n-1 index and align on labels, which silently misplaces
# values or produces NaN whenever the frame's index is not exactly 0..n-1.
modified_peptides["Sequence Location"] = [
    complete_seq.find(peptide)
    for complete_seq, peptide in zip(
        modified_peptides["Complete Sequence"], modified_peptides["Peptide Sequence"]
    )
]
modified_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,Protein ID,Entry Name,Gene,Protein Description,Complete Sequence,Sequence Location
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292
2,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80
4,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,Q9CQ54,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,0
730,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1
731,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1
732,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,Q9CQR4,ACO13_MOUSE,Acot13,Acyl-coenzyme A thioesterase 13,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,1


In [13]:
# Length of each peptide, measured on the unmodified peptide string
modified_peptides["Sequence Length"] = modified_peptides["Peptide Sequence"].str.len()
modified_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,Protein ID,Entry Name,Gene,Protein Description,Complete Sequence,Sequence Location,Sequence Length
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9
2,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12
4,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,Q9CQ54,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,0,12
730,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15
731,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15
732,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,Q9CQR4,ACO13_MOUSE,Acot13,Acyl-coenzyme A thioesterase 13,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,1,8


In [14]:
# sanity check - ensure sequence indexing is correct: slicing each complete
# sequence at the computed location and length must give back the peptide
temp = [A[B:B+C] for A, B, C in zip(modified_peptides["Complete Sequence"], modified_peptides["Sequence Location"], modified_peptides["Sequence Length"])]
peptide_slice_matches = (temp == modified_peptides["Peptide Sequence"])

# Fail loudly instead of relying on someone reading the counts below.
assert peptide_slice_matches.all(), (
    f"{(~peptide_slice_matches).sum()} peptide(s) were not found at their computed Sequence Location"
)
peptide_slice_matches.value_counts()

Peptide Sequence
True    734
Name: count, dtype: int64

In [15]:
# Two capture groups: (1) everything to the left of a labeled methionine and
# (2) the labeled methionine itself.
# NOTE(review): group 1 is greedy, so for a peptide with several labeled
# methionines the match is anchored on the last one.
left_prefix_pattern = "(.*)({})".format(modifications_pattern)
print(left_prefix_pattern)

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [16]:
# extract left prefix of modified methionine (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Drop every character of ``string`` that is not listed in ``IUPACCodes``.

    This strips modification annotations (lowercase markers, digits, brackets,
    dots) and leaves only the uppercase residue letters.
    """
    return ''.join(char for char in string if char in IUPACCodes)


# Capture group 0 is the text to the left of the labeled methionine, still
# containing any annotations.
raw_left_prefix = modified_peptides["Modified Sequence"].str.extract(left_prefix_pattern)[0]
modified_peptides["Left Prefix"] = raw_left_prefix.map(filtering)
modified_peptides["Left Prefix Length"] = modified_peptides["Left Prefix"].str.len()

modified_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,Protein ID,Entry Name,Gene,Protein Description,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3
2,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6
4,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,Q9CQ54,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,0,12,M,1
730,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15,SDSRDPASDQ,10
731,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15,SDSRDPASDQ,10
732,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,Q9CQR4,ACO13_MOUSE,Acot13,Acyl-coenzyme A thioesterase 13,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,1,8,SS,2


In [17]:
# Position of the labeled methionine in the complete sequence: the peptide's
# offset in the protein plus the number of residues to the left of the
# methionine within the peptide
modified_peptides["Methionine Location"] = modified_peptides["Sequence Location"] + modified_peptides["Left Prefix Length"]
modified_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,Protein ID,Entry Name,Gene,Protein Description,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3,295
2,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,Q9WUR2,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3,295
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6,86
4,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,Q9Z1P6,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,Q9CQ54,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,0,12,M,1,1
730,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15,SDSRDPASDQ,10,11
731,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,P24270,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15,SDSRDPASDQ,10,11
732,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,Q9CQR4,ACO13_MOUSE,Acot13,Acyl-coenzyme A thioesterase 13,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,1,8,SS,2,3


In [18]:
# sanity check - ensure methionine location is correct: the residue found at
# every computed location of the complete sequence must be "M"
temp = [A[B] for A, B in zip(modified_peptides["Complete Sequence"], modified_peptides["Methionine Location"])]

# Fail loudly instead of relying on someone reading the boolean below.
assert all(residue == "M" for residue in temp), (
    f"{len(temp) - temp.count('M')} computed Methionine Location(s) do not point at an 'M'"
)
temp.count("M") == len(temp)

True

## Load Dataset - Full Mitochondrial Proteome

In [19]:
RvsS_data_path_str = "../RvsS"
RvsS_data_path = os.path.abspath(RvsS_data_path_str)

# Load the mitochondrial table with AlphaFold annotations; the saved
# "Unnamed: 0" column becomes an unnamed index again
path = os.path.join(RvsS_data_path, "RvsS_full_mitochondrial_with_alphafold.csv")
mitocarta3_0_methionines = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
mitocarta3_0_methionines

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,UniProt,UniProt-Primary,Complete Sequence
0,A2ADF7,1,M,0,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
1,A2ADF7,1,M,7,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,1.944444,1.944444,4.000000,6.555556,10.500000,0.111111,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
2,A2ADF7,1,M,11,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,2.476190,2.476190,5.000000,9.142857,17.666667,0.285714,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
3,A2ADF7,1,M,16,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,3.857143,3.904762,7.666667,17.238095,32.380952,1.047619,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
4,A2ADF7,1,M,96,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,7.619048,8.142857,19.047619,51.666667,104.095238,5.095238,0,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,S4R2K0,528,M,90,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,7.523810,8.571429,25.428571,66.619048,119.285714,7.761905,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4844,S4R2K0,528,M,132,98.22,14.774,14.306,13.644,15.422,-7.821,...,6.380952,7.666667,18.952381,49.142857,89.476190,4.380952,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4845,S4R2K0,528,M,203,98.79,0.508,1.302,2.674,1.468,3.585,...,7.619048,8.809524,28.714286,82.714286,134.619048,8.571429,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4846,S4R2K0,528,M,215,98.03,3.940,3.358,2.382,2.708,12.422,...,5.761905,6.714286,18.619048,46.714286,87.380952,4.428571,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...


# Coverage Analysis

In [20]:
# Site identifier of the form "<Protein ID>-<Methionine Location>"
methionine_location_text = modified_peptides["Methionine Location"].astype(str)
modified_peptides["Site ID"] = modified_peptides["Protein ID"].str.cat(methionine_location_text, sep="-")
modified_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,Entry Name,Gene,Protein Description,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Site ID
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,Q8C196-584
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3,295,Q9WUR2-295
2,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3,295,Q9WUR2-295
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6,86,Q9Z1P6-86
4,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6,86,Q9Z1P6-86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,0,12,M,1,1,Q9CQ54-1
730,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15,SDSRDPASDQ,10,11,P24270-11
731,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,CATA_MOUSE,Cat,Catalase,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,1,15,SDSRDPASDQ,10,11,P24270-11
732,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,ACO13_MOUSE,Acot13,Acyl-coenzyme A thioesterase 13,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,1,8,SS,2,3,Q9CQR4-3


In [21]:
# One row per site: rows sharing a Site ID are collapsed, keeping the first occurrence
modified_peptides_unique = modified_peptides.drop_duplicates(subset=["Site ID"])
modified_peptides_unique

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,Entry Name,Gene,Protein Description,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Site ID
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,Q8C196-584
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3,295,Q9WUR2-295
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6,86,Q9Z1P6-86
5,AAHLMDVAGNK,AAHLM[655.3735]DVAGNK,K,T,977,987,11,3,5M[655.3735],sp|Q8K370|ACD10_MOUSE,...,ACD10_MOUSE,Acad10,Acyl-CoA dehydrogenase family member 10,MLVRRLFQPSTLHWAWRTTALNHPLGRHQGGLRWTHSGGRSYRAVI...,976,11,AAHL,4,980,Q8K370-980
6,AALEMKEEELVSER,AALEM[655.3735]KEEELVSER,K,T,368,381,14,4,5M[655.3735],sp|P48410|ABCD1_MOUSE,...,ABCD1_MOUSE,Abcd1,ATP-binding cassette sub-family D member 1,MPVLSTPRPSRVTTLKRTAVVLALTAYGVHKIYPLVRQCLTPARGP...,367,14,AALE,4,371,P48410-371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,YPIEHGIITNWDDMEK,YPIEHGIITNWDDM[649.3660]EK,K,I,71,86,16,4,14M[649.3660],sp|P68033|ACTC_MOUSE,...,ACTC_MOUSE,Actc1,"Actin, alpha cardiac muscle 1",MCDDEETTALVCDNGSGLVKAGFAGDDAPRAVFPSIVGRPRHQGVM...,70,16,YPIEHGIITNWDD,13,83,P68033-83
725,AATASPGAGRMDGKPR,n[42.0106]AATASPGAGRM[649.3660]DGKPR,M,T,2,17,16,4,"N-term[42.0106],11M[649.3660]",sp|Q9CR62|M2OM_MOUSE,...,M2OM_MOUSE,Slc25a11,Mitochondrial 2-oxoglutarate/malate carrier pr...,MAATASPGAGRMDGKPRTSPKSVKFLFGGLAGMGATVFVQPLDLVK...,1,16,AATASPGAGR,10,11,Q9CR62-11
727,ASVLSYESLVHAVAGAVGSVTAMTVFFPLDTAR,n[42.0106]ASVLSYESLVHAVAGAVGSVTAM[649.3660]TVF...,M,L,2,34,33,4,"N-term[42.0106],23M[649.3660]",sp|O70579|PM34_MOUSE,...,PM34_MOUSE,Slc25a17,Peroxisomal membrane protein PMP34,MASVLSYESLVHAVAGAVGSVTAMTVFFPLDTARLRLQVDEKRKSK...,1,33,ASVLSYESLVHAVAGAVGSVTA,22,23,O70579-23
729,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,NDUC2_MOUSE,Ndufc2,NADH dehydrogenase [ubiquinone] 1 subunit C2,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,0,12,M,1,1,Q9CQ54-1


In [22]:
# Distinct values of the "UniProt" column of the mitochondrial table, displayed with their count
mitochondrial_proteinIDs = mitocarta3_0_methionines["UniProt"].unique()
mitochondrial_proteinIDs, len(mitochondrial_proteinIDs)

(array(['A2ADF7', 'A2AMZ4', 'A2ASZ8', 'A2ATU0', 'A6H611', 'B1AXP6',
        'B9EJ57', 'D3YTP3', 'D3Z7P3', 'E9QLB8', 'E9QPD7', 'F6W8I0',
        'G5E814', 'G5E8U5', 'H3BLL2', 'O08691', 'O08715', 'O08749',
        'O09111', 'O35129', 'O35143', 'O35435', 'O35459', 'O35465',
        'O35683', 'O35855', 'O35857', 'O35943', 'O35972', 'O55028',
        'O88396', 'O88441', 'O88696', 'O88741', 'O88967', 'P00158',
        'P00397', 'P03903', 'P08249', 'P09671', 'P0DJE0', 'P11725',
        'P19783', 'P26443', 'P29758', 'P35486', 'P36552', 'P38647',
        'P40630', 'P42125', 'P43024', 'P45952', 'P46656', 'P48962',
        'P50544', 'P50637', 'P51881', 'P52196', 'P52825', 'P53395',
        'P53702', 'P54071', 'P54869', 'P56135', 'P56213', 'P56379',
        'P56391', 'P56394', 'P56480', 'P58059', 'P58064', 'P58281',
        'P59017', 'P61922', 'P62073', 'P62075', 'P62897', 'P63030',
        'P63038', 'P67778', 'P70404', 'P70406', 'P70677', 'P85094',
        'P97287', 'P97363', 'P97450', 'P97493', 

In [23]:
# Restrict the unique labeled sites to proteins present in the mitochondrial table
is_mitochondrial = modified_peptides_unique["Protein ID"].isin(mitochondrial_proteinIDs)
mitochondrial_modified_peptides_unique = modified_peptides_unique.loc[is_mitochondrial]
mitochondrial_modified_peptides_unique

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,Entry Name,Gene,Protein Description,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Site ID
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,ECI2_MOUSE,Eci2,Enoyl-CoA delta isomerase 2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,292,9,AAE,3,295,Q9WUR2-295
3,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,NDUA7_MOUSE,Ndufa7,NADH dehydrogenase [ubiquinone] 1 alpha subcom...,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,80,12,AAESSA,6,86,Q9Z1P6-86
13,AAVDAGFVPNDMQVGQTGK,AAVDAGFVPNDM[649.3660]QVGQTGK,R,I,250,268,19,4,12M[649.3660],sp|Q99LC5|ETFA_MOUSE,...,ETFA_MOUSE,Etfa,"Electron transfer flavoprotein subunit alpha, ...",MFRAAAPGQLRRAASLLRFQSTLVIAEHANDSLAPITLNTITAAGR...,249,19,AAVDAGFVPND,11,260,Q99LC5-260
15,AAYFGIYDTAKGMLPDPK,AAYFGIYDTAKGM[649.3660]LPDPK,R,N,189,206,18,4,13M[649.3660],sp|P51881|ADT2_MOUSE,...,ADT2_MOUSE,Slc25a5,ADP/ATP translocase 2,MTDAAVSFAKDFLAGGVAAAISKTAVAPIERVKLLLQVQHASKQIT...,188,18,AAYFGIYDTAKG,12,200,P51881-200
17,ADLRQLMDNEVLMAFTSYATIILTKMMFMSSATAFQR,ADLRQLM[655.3735]DNEVLMAFTSYATIILTKMMFMSSATAFQR,M,I,2,38,37,5,7M[655.3735],sp|Q91VS7|MGST1_MOUSE,...,MGST1_MOUSE,Mgst1,Microsomal glutathione S-transferase 1,MADLRQLMDNEVLMAFTSYATIILTKMMFMSSATAFQRITNKVFAN...,1,37,ADLRQL,6,7,Q91VS7-7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,YAHMVDVGQVGVNVPIPVPLPMFSFTGSR,YAHMVDVGQVGVNVPIPVPLPM[649.3660]FSFTGSR,K,S,460,488,29,4,22M[649.3660],sp|Q9EQ20|MMSA_MOUSE,...,MMSA_MOUSE,Aldh6a1,Methylmalonate-semialdehyde/malonate-semialdeh...,MAAAVAAAAAMRSRILQVSSKVNATWYPASSFSSSSVPTVKLFIDG...,459,29,YAHMVDVGQVGVNVPIPVPLP,21,480,Q9EQ20-480
701,YAHMVDVGQVGVNVPIPVPLPMFSFTGSR,YAHM[649.3660]VDVGQVGVNVPIPVPLPMFSFTGSR,K,S,460,488,29,4,4M[649.3660],sp|Q9EQ20|MMSA_MOUSE,...,MMSA_MOUSE,Aldh6a1,Methylmalonate-semialdehyde/malonate-semialdeh...,MAAAVAAAAAMRSRILQVSSKVNATWYPASSFSSSSVPTVKLFIDG...,459,29,YAH,3,462,Q9EQ20-462
704,YAMAWGVVEK,YAM[649.3660]AWGVVEK,K,K,464,473,10,3,3M[649.3660],sp|Q9EP89|LACTB_MOUSE,...,LACTB_MOUSE,Lactb,"Serine beta-lactamase-like protein LACTB, mito...",MYRLLSSVTARAAATAGPAWDGGRRGAHRRPGLPVLGLGWAGGLGL...,463,10,YA,2,465,Q9EP89-465
715,YHPMDYYWWLR,YHPM[649.3660]DYYWWLR,R,M,315,325,11,4,4M[649.3660],sp|Q80XN0|BDH_MOUSE,...,BDH_MOUSE,Bdh1,"D-beta-hydroxybutyrate dehydrogenase, mitochon...",MLAARLSRPLSQLPGKALSVRDRENGTRHTLLFYPASFSPDTRRTY...,314,11,YHP,3,317,Q80XN0-317


In [24]:
# Summary counts: labeled sites (mitochondrial and overall) and the number of
# rows in the mitochondrial methionine table
n_mito_sites_labeled = len(mitochondrial_modified_peptides_unique['Site ID'])
n_sites_labeled = len(modified_peptides_unique['Site ID'])
n_mito_sites_total = len(mitocarta3_0_methionines)

print(f"Number of unique mitochondrial Met sites labeled: {n_mito_sites_labeled}")
print(f"Number of unique Met sites labeled: {n_sites_labeled}")
print(f"Number of total mitochondrial Met sites: {n_mito_sites_total}")

Number of unique mitochondrial Met sites labeled: 156
Number of unique Met sites labeled: 356
Number of total mitochondrial Met sites: 4848


# End