## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
analysis_threshold = 10 # number of amino acids either side to analyze
modifications = ["655.3735", "649.3660"] # which modifications we are looking for, as regex strings

## Load Chemoproteomics Data

In [3]:
# set correct pathing
curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("global_data Directory: " + global_data_path)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/jayde_og
global_data Directory: /Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/global_data


In [4]:
data_loc = os.path.join(curr_dir_path, "combined_modified_peptide.tsv")
peptides = pd.read_csv(data_loc, delimiter="\t")
peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,11 Match Type,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type
0,AAALVLQTIWGYK,AAALVLQTIWGYK,R,E,821,833,13,2,,sp|P30999|CTND1_MOUSE,...,unmatched,MS/MS,unmatched,unmatched,MS/MS,MS/MS,MS/MS,unmatched,unmatched,unmatched
1,AAATFNPELITHILDGSPENTRR,AAATFNPELITHILDGSPENTRR,R,R,10,32,23,4,,sp|Q9R0H0|ACOX1_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
2,AAAVGIAQVVISR,AAAVGIAQVVISR,R,I,220,232,13,2,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
3,AAAVGIAQVVISRITMAAPGMILLPVIMER,AAAVGIAQVVISRITMAAPGMILLPVIMER,R,L,220,249,30,34,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
4,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,R,L,220,252,33,4,,sp|Q925N2|SFXN2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5447,STFRLALIQLQVSSIKSDNLTR,n[42.0106]STFRLALIQLQVSSIKSDNLTR,M,A,2,23,22,3,N-term(42.0106),sp|Q9JHW2|NIT2_MOUSE,...,MS/MS,unmatched,unmatched,MS/MS,unmatched,unmatched,MS/MS,unmatched,MS/MS,MS/MS
5448,TDAAVSFAK,n[42.0106]TDAAVSFAK,M,D,2,10,9,2,N-term(42.0106),sp|P51881|ADT2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched
5449,TDAAVSFAKDFLAGGVAAAISK,n[42.0106]TDAAVSFAKDFLAGGVAAAISK,M,T,2,23,22,23,N-term(42.0106),sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
5450,TDAAVSFAKDFLAGGVAAAISKTAVAPIER,n[42.0106]TDAAVSFAKDFLAGGVAAAISKTAVAPIER,M,V,2,31,30,234,N-term(42.0106),sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS


In [5]:
# Canonicalize data
peptides["Assigned Modifications"] = peptides["Assigned Modifications"].str.replace("(", "[", regex=False)
peptides["Assigned Modifications"] = peptides["Assigned Modifications"].str.replace(")", "]", regex=False)
peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,11 Match Type,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type
0,AAALVLQTIWGYK,AAALVLQTIWGYK,R,E,821,833,13,2,,sp|P30999|CTND1_MOUSE,...,unmatched,MS/MS,unmatched,unmatched,MS/MS,MS/MS,MS/MS,unmatched,unmatched,unmatched
1,AAATFNPELITHILDGSPENTRR,AAATFNPELITHILDGSPENTRR,R,R,10,32,23,4,,sp|Q9R0H0|ACOX1_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
2,AAAVGIAQVVISR,AAAVGIAQVVISR,R,I,220,232,13,2,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
3,AAAVGIAQVVISRITMAAPGMILLPVIMER,AAAVGIAQVVISRITMAAPGMILLPVIMER,R,L,220,249,30,34,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
4,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,R,L,220,252,33,4,,sp|Q925N2|SFXN2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5447,STFRLALIQLQVSSIKSDNLTR,n[42.0106]STFRLALIQLQVSSIKSDNLTR,M,A,2,23,22,3,N-term[42.0106],sp|Q9JHW2|NIT2_MOUSE,...,MS/MS,unmatched,unmatched,MS/MS,unmatched,unmatched,MS/MS,unmatched,MS/MS,MS/MS
5448,TDAAVSFAK,n[42.0106]TDAAVSFAK,M,D,2,10,9,2,N-term[42.0106],sp|P51881|ADT2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched
5449,TDAAVSFAKDFLAGGVAAAISK,n[42.0106]TDAAVSFAKDFLAGGVAAAISK,M,T,2,23,22,23,N-term[42.0106],sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
5450,TDAAVSFAKDFLAGGVAAAISKTAVAPIER,n[42.0106]TDAAVSFAKDFLAGGVAAAISKTAVAPIER,M,V,2,31,30,234,N-term[42.0106],sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS


In [6]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    split_mod = modifications[0].split(".")
    whole = split_mod[0]
    mantissa = split_mod[1]
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        split_mod = modifications[i].split(".")
        whole = split_mod[0]
        mantissa = split_mod[1]
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[655\.3735\]|M\[649\.3660\]


In [7]:
# select rows corresponding to desired sequences
methionine_peptides = peptides[peptides["Assigned Modifications"].str.contains(modifications_pattern, na=False)]
methionine_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,11 Match Type,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,unmatched,unmatched,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,MS/MS,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,MS/MS,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS


In [8]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [9]:
# test - get a single amino acid sequence
#first_protein_ID = methionine_peptides["Protein ID"].iloc[0]
#test_sequence = get_full_protein_seq(first_protein_ID)
#print(test_sequence[575:587])
#print(methionine_peptides["Peptide Sequence"].iloc[0])

In [10]:
unique_uniprotIDs = methionine_peptides["Protein ID"].unique()
unique_uniprotIDs, unique_uniprotIDs.size

(array(['Q8C196', 'Q9WUR2', 'Q9Z1P6', 'Q8K370', 'P48410', 'P51658',
        'Q8QZY2', 'Q8VC30', 'Q99LC5', 'P51881', 'Q91VS7', 'Q61335',
        'Q8BH95', 'Q8BGT5', 'Q920A5', 'Q8BWT1', 'Q9DB77', 'Q9QXF8',
        'Q91WU0', 'P97807', 'P30115', 'Q99JY0', 'P63038', 'Q8BM55',
        'P43024', 'P54116', 'Q9DBJ1', 'Q9CYV5', 'Q61102', 'Q9WTP6',
        'Q9JJW0', 'Q9CPQ8', 'Q8CGK3', 'P97872', 'P51660', 'Q60936',
        'P54869', 'Q61733', 'Q925I1', 'Q8CAQ8', 'P38647', 'Q9DBG1',
        'P63030', 'Q01853', 'Q9Z2I8', 'P26443', 'O35129', 'Q9D6J5',
        'P24270', 'P08226', 'Q9DCM2', 'Q05920', 'P19783', 'P52825',
        'Q9DBL7', 'Q9R0H0', 'Q8CC88', 'Q99MR8', 'Q8BGY7', 'P29758',
        'O35423', 'G3X982', 'Q03265', 'Q9CRB9', 'Q9CQN1', 'P61620',
        'Q91VR2', 'Q8VDN2', 'O35386', 'P33267', 'Q61425', 'P20029',
        'O88962', 'Q91WL5', 'Q9JKR6', 'P32020', 'Q8VI47', 'Q91YI0',
        'Q9CQZ5', 'O35490', 'Q8VCW8', 'P08249', 'P97450', 'Q64433',
        'Q7TNG8', 'Q8BUY5', 'Q80W54', 'P16460', 

In [11]:
unique_IDs_to_sequence_df = pd.DataFrame({"Protein ID": unique_uniprotIDs})
unique_IDs_to_sequence_df

Unnamed: 0,Protein ID
0,Q8C196
1,Q9WUR2
2,Q9Z1P6
3,Q8K370
4,P48410
...,...
202,P47962
203,P68033
204,O70579
205,Q9CQ54


In [12]:
# get whole amino acid sequences for methionine peptides
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#tqdm.pandas()
#unique_IDs_to_sequence_df["Complete Sequence"] = unique_IDs_to_sequence_df["Protein ID"].progress_apply(get_full_protein_seq)
#unique_IDs_to_sequence_df

In [13]:
#unique_IDs_to_sequence_df.to_csv(os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv"))

In [14]:
path = os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv")
unique_IDs_to_sequence_df = pd.read_csv(path)
unique_IDs_to_sequence_df.set_index("Unnamed: 0", inplace=True)
unique_IDs_to_sequence_df.index.name = None
unique_IDs_to_sequence_df

Unnamed: 0,Protein ID,Complete Sequence
0,Q8C196,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,Q9WUR2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
2,Q9Z1P6,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
3,Q8K370,MLVRRLFQPSTLHWAWRTTALNHPLGRHQGGLRWTHSGGRSYRAVI...
4,P48410,MPVLSTPRPSRVTTLKRTAVVLALTAYGVHKIYPLVRQCLTPARGP...
...,...,...
202,P47962,MGFVKVVKNKAYFKRYQVRFRRRREGKTDYYARKRLVIQDKNKYNT...
203,P68033,MCDDEETTALVCDNGSGLVKAGFAGDDAPRAVFPSIVGRPRHQGVM...
204,O70579,MASVLSYESLVHAVAGAVGSVTAMTVFFPLDTARLRLQVDEKRKSK...
205,Q9CQ54,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...


In [15]:
methionine_peptides_completed_sequence = methionine_peptides.merge(unique_IDs_to_sequence_df, how="left", on="Protein ID")
methionine_peptides_completed_sequence.index = methionine_peptides.index
methionine_peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,unmatched,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...


In [16]:
#methionine_peptides_completed_sequence.to_csv(os.path.join(curr_dir_path, "methionine_peptides_completed_sequence.csv"))

In [17]:
path = os.path.join(curr_dir_path, "methionine_peptides_completed_sequence.csv")
methionine_peptides_completed_sequence = pd.read_csv(path)
methionine_peptides_completed_sequence.set_index("Unnamed: 0", inplace=True)
methionine_peptides_completed_sequence.index.name = None
methionine_peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,unmatched,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...


## Download Alphafold Data

In [18]:
# path for alphafold protein data
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [19]:
# set uniprot IDs to use
uniprotIDs = methionine_peptides_completed_sequence["Protein ID"].unique()
uniprotIDs, len(uniprotIDs)

(array(['Q8C196', 'Q9WUR2', 'Q9Z1P6', 'Q8K370', 'P48410', 'P51658',
        'Q8QZY2', 'Q8VC30', 'Q99LC5', 'P51881', 'Q91VS7', 'Q61335',
        'Q8BH95', 'Q8BGT5', 'Q920A5', 'Q8BWT1', 'Q9DB77', 'Q9QXF8',
        'Q91WU0', 'P97807', 'P30115', 'Q99JY0', 'P63038', 'Q8BM55',
        'P43024', 'P54116', 'Q9DBJ1', 'Q9CYV5', 'Q61102', 'Q9WTP6',
        'Q9JJW0', 'Q9CPQ8', 'Q8CGK3', 'P97872', 'P51660', 'Q60936',
        'P54869', 'Q61733', 'Q925I1', 'Q8CAQ8', 'P38647', 'Q9DBG1',
        'P63030', 'Q01853', 'Q9Z2I8', 'P26443', 'O35129', 'Q9D6J5',
        'P24270', 'P08226', 'Q9DCM2', 'Q05920', 'P19783', 'P52825',
        'Q9DBL7', 'Q9R0H0', 'Q8CC88', 'Q99MR8', 'Q8BGY7', 'P29758',
        'O35423', 'G3X982', 'Q03265', 'Q9CRB9', 'Q9CQN1', 'P61620',
        'Q91VR2', 'Q8VDN2', 'O35386', 'P33267', 'Q61425', 'P20029',
        'O88962', 'Q91WL5', 'Q9JKR6', 'P32020', 'Q8VI47', 'Q91YI0',
        'Q9CQZ5', 'O35490', 'Q8VCW8', 'P08249', 'P97450', 'Q64433',
        'Q7TNG8', 'Q8BUY5', 'Q80W54', 'P16460', 

In [20]:
# download cif data for proteins
# SLOW THE FIRST TIME
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 207/207 [00:00<00:00, 150341.29it/s]

2024-05-20 21:25:24> Valid proteins: 0
2024-05-20 21:25:24> Invalid proteins: 0
2024-05-20 21:25:24> Existing proteins: 207





In [21]:
# download pae data for proteins
# SLOW THE FIRST TIME
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 207/207 [00:00<00:00, 151205.32it/s]

2024-05-20 21:25:24> Valid proteins: 0
2024-05-20 21:25:24> Invalid proteins: 0
2024-05-20 21:25:24> Existing proteins: 207





## Construct Alphafold Dataframe (Calculate Accessibilities)

In [22]:
# format alphafold data into dataframe
alphafold_annotation = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=uniprotIDs)
alphafold_annotation

  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='i

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,G3X982,1,M,1,31.22,-50.603,-50.225,-49.844,-51.379,11.652,...,-0.598,-2.036,-0.594,unstructured,unstructured,0,0,0,0,1
1,G3X982,1,S,2,28.66,-49.214,-50.170,-49.735,-50.171,13.940,...,2.283,3.726,1.486,unstructured,unstructured,0,0,0,0,1
2,G3X982,1,P,3,32.45,-47.175,-48.539,-49.120,-49.460,15.988,...,1.141,1.476,1.704,unstructured,unstructured,0,0,0,0,1
3,G3X982,1,S,4,43.06,-44.370,-44.826,-43.908,-46.183,16.650,...,1.415,0.217,0.952,unstructured,unstructured,0,0,0,0,1
4,G3X982,1,K,5,58.81,-42.153,-43.494,-43.279,-44.188,17.835,...,4.260,5.723,3.496,HELX_LH_PP_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99429,Q9Z2I8,207,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-18.565,-18.413,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0
99430,Q9Z2I8,207,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-18.909,-17.814,-18.401,BEND,BEND,1,0,0,0,0
99431,Q9Z2I8,207,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-21.302,-22.660,-20.673,unstructured,unstructured,0,0,0,0,1
99432,Q9Z2I8,207,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-19.459,-19.112,-20.119,unstructured,unstructured,0,0,0,0,1


In [23]:
# calculate full sphere exposure
full_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure

100%|██████████| 207/207 [00:04<00:00, 46.36it/s]


Unnamed: 0,protein_id,AA,position,nAA_24_180_pae
0,G3X982,M,1,5
1,G3X982,S,2,5
2,G3X982,P,3,6
3,G3X982,S,4,9
4,G3X982,K,5,14
...,...,...,...,...
99429,Q9Z2I8,S,429,91
99430,Q9Z2I8,V,430,78
99431,Q9Z2I8,A,431,36
99432,Q9Z2I8,K,432,19


In [24]:
alphafold_accessibility = alphafold_annotation.merge(
    full_sphere_exposure, how='left', on=['protein_id','AA','position'])
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae
0,G3X982,1,M,1,31.22,-50.603,-50.225,-49.844,-51.379,11.652,...,-2.036,-0.594,unstructured,unstructured,0,0,0,0,1,5
1,G3X982,1,S,2,28.66,-49.214,-50.170,-49.735,-50.171,13.940,...,3.726,1.486,unstructured,unstructured,0,0,0,0,1,5
2,G3X982,1,P,3,32.45,-47.175,-48.539,-49.120,-49.460,15.988,...,1.476,1.704,unstructured,unstructured,0,0,0,0,1,6
3,G3X982,1,S,4,43.06,-44.370,-44.826,-43.908,-46.183,16.650,...,0.217,0.952,unstructured,unstructured,0,0,0,0,1,9
4,G3X982,1,K,5,58.81,-42.153,-43.494,-43.279,-44.188,17.835,...,5.723,3.496,HELX_LH_PP_P,HELX,0,1,0,0,0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99429,Q9Z2I8,207,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-18.413,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0,91
99430,Q9Z2I8,207,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-17.814,-18.401,BEND,BEND,1,0,0,0,0,78
99431,Q9Z2I8,207,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-22.660,-20.673,unstructured,unstructured,0,0,0,0,1,36
99432,Q9Z2I8,207,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-19.112,-20.119,unstructured,unstructured,0,0,0,0,1,19


In [25]:
part_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
part_sphere_exposure

100%|██████████| 207/207 [00:01<00:00, 132.03it/s]


Unnamed: 0,protein_id,AA,position,nAA_12_70_pae
0,G3X982,M,1,0
1,G3X982,S,2,0
2,G3X982,P,3,0
3,G3X982,S,4,0
4,G3X982,K,5,0
...,...,...,...,...
99429,Q9Z2I8,S,429,12
99430,Q9Z2I8,V,430,4
99431,Q9Z2I8,A,431,0
99432,Q9Z2I8,K,432,0


In [26]:
alphafold_accessibility = alphafold_accessibility.merge(
    part_sphere_exposure, how='left', on=['protein_id','AA','position'])
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae
0,G3X982,1,M,1,31.22,-50.603,-50.225,-49.844,-51.379,11.652,...,-0.594,unstructured,unstructured,0,0,0,0,1,5,0
1,G3X982,1,S,2,28.66,-49.214,-50.170,-49.735,-50.171,13.940,...,1.486,unstructured,unstructured,0,0,0,0,1,5,0
2,G3X982,1,P,3,32.45,-47.175,-48.539,-49.120,-49.460,15.988,...,1.704,unstructured,unstructured,0,0,0,0,1,6,0
3,G3X982,1,S,4,43.06,-44.370,-44.826,-43.908,-46.183,16.650,...,0.952,unstructured,unstructured,0,0,0,0,1,9,0
4,G3X982,1,K,5,58.81,-42.153,-43.494,-43.279,-44.188,17.835,...,3.496,HELX_LH_PP_P,HELX,0,1,0,0,0,14,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99429,Q9Z2I8,207,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0,91,12
99430,Q9Z2I8,207,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-18.401,BEND,BEND,1,0,0,0,0,78,4
99431,Q9Z2I8,207,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-20.673,unstructured,unstructured,0,0,0,0,1,36,0
99432,Q9Z2I8,207,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-20.119,unstructured,unstructured,0,0,0,0,1,19,0


In [27]:
alphafold_accessibility['high_acc_5'] = np.where(alphafold_accessibility.nAA_12_70_pae <= 5, 1, 0)
alphafold_accessibility['low_acc_5'] = np.where(alphafold_accessibility.nAA_12_70_pae > 5, 1, 0)
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5
0,G3X982,1,M,1,31.22,-50.603,-50.225,-49.844,-51.379,11.652,...,unstructured,0,0,0,0,1,5,0,1,0
1,G3X982,1,S,2,28.66,-49.214,-50.170,-49.735,-50.171,13.940,...,unstructured,0,0,0,0,1,5,0,1,0
2,G3X982,1,P,3,32.45,-47.175,-48.539,-49.120,-49.460,15.988,...,unstructured,0,0,0,0,1,6,0,1,0
3,G3X982,1,S,4,43.06,-44.370,-44.826,-43.908,-46.183,16.650,...,unstructured,0,0,0,0,1,9,0,1,0
4,G3X982,1,K,5,58.81,-42.153,-43.494,-43.279,-44.188,17.835,...,HELX,0,1,0,0,0,14,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99429,Q9Z2I8,207,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,HELX,0,1,0,0,0,91,12,0,1
99430,Q9Z2I8,207,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,BEND,1,0,0,0,0,78,4,1,0
99431,Q9Z2I8,207,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,unstructured,0,0,0,0,1,36,0,1,0
99432,Q9Z2I8,207,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,unstructured,0,0,0,0,1,19,0,1,0


In [28]:
alphafold_accessibility_smooth = get_smooth_score(
    alphafold_accessibility, 
    np.array(['nAA_24_180_pae']), 
    [10])
alphafold_accessibility_smooth

100%|██████████| 207/207 [00:00<00:00, 1282.80it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10
0,G3X982,1,M,1,31.22,-50.603,-50.225,-49.844,-51.379,11.652,...,0,0,0,0,1,5,0,1,0,50.909091
1,G3X982,1,S,2,28.66,-49.214,-50.170,-49.735,-50.171,13.940,...,0,0,0,0,1,5,0,1,0,58.916667
2,G3X982,1,P,3,32.45,-47.175,-48.539,-49.120,-49.460,15.988,...,0,0,0,0,1,6,0,1,0,66.230769
3,G3X982,1,S,4,43.06,-44.370,-44.826,-43.908,-46.183,16.650,...,0,0,0,0,1,9,0,1,0,73.142857
4,G3X982,1,K,5,58.81,-42.153,-43.494,-43.279,-44.188,17.835,...,0,1,0,0,0,14,0,1,0,78.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Q9Z2I8,207,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,0,1,0,0,0,91,12,0,1,85.200000
429,Q9Z2I8,207,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,1,0,0,0,0,78,4,1,0,83.642857
430,Q9Z2I8,207,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,0,0,0,0,1,36,0,1,0,83.307692
431,Q9Z2I8,207,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,0,0,0,0,1,19,0,1,0,81.916667


In [29]:
alphafold_accessibility_smooth['IDR'] = np.where(
    alphafold_accessibility_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10,IDR
0,G3X982,1,M,1,31.22,-50.603,-50.225,-49.844,-51.379,11.652,...,0,0,0,1,5,0,1,0,50.909091,0
1,G3X982,1,S,2,28.66,-49.214,-50.170,-49.735,-50.171,13.940,...,0,0,0,1,5,0,1,0,58.916667,0
2,G3X982,1,P,3,32.45,-47.175,-48.539,-49.120,-49.460,15.988,...,0,0,0,1,6,0,1,0,66.230769,0
3,G3X982,1,S,4,43.06,-44.370,-44.826,-43.908,-46.183,16.650,...,0,0,0,1,9,0,1,0,73.142857,0
4,G3X982,1,K,5,58.81,-42.153,-43.494,-43.279,-44.188,17.835,...,1,0,0,0,14,0,1,0,78.466667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Q9Z2I8,207,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,1,0,0,0,91,12,0,1,85.200000,0
429,Q9Z2I8,207,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,0,0,0,0,78,4,1,0,83.642857,0
430,Q9Z2I8,207,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,0,0,0,1,36,0,1,0,83.307692,0
431,Q9Z2I8,207,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,0,0,0,1,19,0,1,0,81.916667,0


## Filter

In [30]:
# filter out rows with double methionine modifications (for now) - makes code easier
methionine_peptides_completed_sequence = methionine_peptides_completed_sequence[methionine_peptides_completed_sequence["Modified Sequence"].str.count(modifications_pattern) == 1]
methionine_peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,unmatched,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...


In [31]:
# create regex pattern to identify desired modifications
left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

(.*)(M\[655\.3735\]|M\[649\.3660\])


In [32]:
# extract left prefix of modified methionine (for indexing purposes)
methionine_peptides_completed_sequence["Left Prefix"] = methionine_peptides_completed_sequence["Modified Sequence"].str.extract(left_prefix_pattern)[0]
methionine_peptides_completed_sequence

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  methionine_peptides_completed_sequence["Left Prefix"] = methionine_peptides_completed_sequence["Modified Sequence"].str.extract(left_prefix_pattern)[0]


Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence,Left Prefix
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,AADTIGYPV
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,AAE
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,AAE
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,AAESSA
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,AAESSA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,n[42.0106]M
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,n[42.0106]SDSRDPASDQ
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,n[42.0106]SDSRDPASDQ
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,n[42.0106]SS


In [33]:
# something
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char.isupper()])

methionine_peptides_completed_sequence["Left Prefix"] = methionine_peptides_completed_sequence["Left Prefix"].map(filtering)
methionine_peptides_completed_sequence["Left Prefix Length"] = methionine_peptides_completed_sequence["Left Prefix"].str.len()
methionine_peptides_completed_sequence

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  methionine_peptides_completed_sequence["Left Prefix"] = methionine_peptides_completed_sequence["Left Prefix"].map(filtering)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  methionine_peptides_completed_sequence["Left Prefix Length"] = methionine_peptides_completed_sequence["Left Prefix"].str.len()


Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence,Left Prefix,Left Prefix Length
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,AADTIGYPV,9
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,AAE,3
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,AAE,3
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,AAESSA,6
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,AAESSA,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,M,1
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,SDSRDPASDQ,10
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,SDSRDPASDQ,10
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,SS,2


In [34]:
# something
methionine_peptides_completed_sequence["Methionine Loc"] = methionine_peptides_completed_sequence["Start"] + methionine_peptides_completed_sequence["Left Prefix Length"]
methionine_peptides_completed_sequence

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  methionine_peptides_completed_sequence["Methionine Loc"] = methionine_peptides_completed_sequence["Start"] + methionine_peptides_completed_sequence["Left Prefix Length"]


Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence,Left Prefix,Left Prefix Length,Methionine Loc
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,AADTIGYPV,9,585
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,AAE,3,296
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,AAE,3,296
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,AAESSA,6,87
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,AAESSA,6,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,M,1,2
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,SDSRDPASDQ,10,12
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...,SDSRDPASDQ,10,12
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...,SS,2,4


In [35]:
# remove duplicate methionine locations
methionine_peptides_completed_sequence_unique = methionine_peptides_completed_sequence[~methionine_peptides_completed_sequence[["Protein ID", "Methionine Loc"]].duplicated()]
methionine_peptides_completed_sequence_unique

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence,Left Prefix,Left Prefix Length,Methionine Loc
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,AADTIGYPV,9,585
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,AAE,3,296
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...,AAESSA,6,87
17,AAHLMDVAGNK,AAHLM[655.3735]DVAGNK,K,T,977,987,11,3,5M[655.3735],sp|Q8K370|ACD10_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MLVRRLFQPSTLHWAWRTTALNHPLGRHQGGLRWTHSGGRSYRAVI...,AAHL,4,981
21,AALEMKEEELVSER,AALEM[655.3735]KEEELVSER,K,T,368,381,14,4,5M[655.3735],sp|P48410|ABCD1_MOUSE,...,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,MPVLSTPRPSRVTTLKRTAVVLALTAYGVHKIYPLVRQCLTPARGP...,AALE,4,372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5306,YPIEHGIITNWDDMEK,YPIEHGIITNWDDM[649.3660]EK,K,I,71,86,16,4,14M[649.3660],sp|P68033|ACTC_MOUSE,...,unmatched,MS/MS,unmatched,unmatched,unmatched,MS/MS,MCDDEETTALVCDNGSGLVKAGFAGDDAPRAVFPSIVGRPRHQGVM...,YPIEHGIITNWDD,13,84
5378,AATASPGAGRMDGKPR,n[42.0106]AATASPGAGRM[649.3660]DGKPR,M,T,2,17,16,4,"N-term[42.0106],11M[649.3660]",sp|Q9CR62|M2OM_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MAATASPGAGRMDGKPRTSPKSVKFLFGGLAGMGATVFVQPLDLVK...,AATASPGAGR,10,12
5410,ASVLSYESLVHAVAGAVGSVTAMTVFFPLDTAR,n[42.0106]ASVLSYESLVHAVAGAVGSVTAM[649.3660]TVF...,M,L,2,34,33,4,"N-term[42.0106],23M[649.3660]",sp|O70579|PM34_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASVLSYESLVHAVAGAVGSVTAMTVFFPLDTARLRLQVDEKRKSK...,ASVLSYESLVHAVAGAVGSVTA,22,24
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...,M,1,2


In [36]:
methionine_peptides_completed_sequence_unique_with_alphafold = methionine_peptides_completed_sequence_unique.merge(
    alphafold_accessibility_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Loc"], 
    right_on=["protein_id", "position"]
)
methionine_peptides_completed_sequence_unique_with_alphafold

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,0,1,0,0,144,5,1,0,120.857143,0
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,1,0,0,0,123,11,0,1,102.000000,0
2,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,0,0,0,1,10,0,1,0,10.809524,1
3,AAHLMDVAGNK,AAHLM[655.3735]DVAGNK,K,T,977,987,11,3,5M[655.3735],sp|Q8K370|ACD10_MOUSE,...,1,0,0,0,138,10,0,1,123.904762,0
4,AALEMKEEELVSER,AALEM[655.3735]KEEELVSER,K,T,368,381,14,4,5M[655.3735],sp|P48410|ABCD1_MOUSE,...,0,0,0,0,17,1,1,0,29.904762,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,YPIEHGIITNWDDMEK,YPIEHGIITNWDDM[649.3660]EK,K,I,71,86,16,4,14M[649.3660],sp|P68033|ACTC_MOUSE,...,1,0,0,0,149,10,0,1,129.238095,0
345,AATASPGAGRMDGKPR,n[42.0106]AATASPGAGRM[649.3660]DGKPR,M,T,2,17,16,4,"N-term[42.0106],11M[649.3660]",sp|Q9CR62|M2OM_MOUSE,...,0,0,0,1,9,0,1,0,16.809524,1
346,ASVLSYESLVHAVAGAVGSVTAMTVFFPLDTAR,n[42.0106]ASVLSYESLVHAVAGAVGSVTAM[649.3660]TVF...,M,L,2,34,33,4,"N-term[42.0106],23M[649.3660]",sp|O70579|PM34_MOUSE,...,1,0,0,0,110,2,1,0,97.857143,0
347,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,1,0,0,0,33,0,1,0,26.833333,1


In [37]:
inaccessible_methionines = methionine_peptides_completed_sequence_unique_with_alphafold[methionine_peptides_completed_sequence_unique_with_alphafold["IDR"] == 0]
inaccessible_methionines

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,0,1,0,0,144,5,1,0,120.857143,0
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,1,0,0,0,123,11,0,1,102.000000,0
3,AAHLMDVAGNK,AAHLM[655.3735]DVAGNK,K,T,977,987,11,3,5M[655.3735],sp|Q8K370|ACD10_MOUSE,...,1,0,0,0,138,10,0,1,123.904762,0
5,AALTMFSTIIR,AALTM[649.3660]FSTIIR,K,Q,238,248,11,3,5M[649.3660],sp|P51658|DHB2_MOUSE,...,1,0,0,0,125,2,1,0,129.761905,0
6,AAMEHAGK,AAM[649.3660]EHAGK,R,K,118,125,8,3,3M[649.3660],sp|Q8QZY2|GLCTK_MOUSE,...,1,0,0,0,115,8,0,1,114.523810,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,YHPMDYYWWLR,YHPM[649.3660]DYYWWLR,R,M,315,325,11,4,4M[649.3660],sp|Q80XN0|BDH_MOUSE,...,0,0,0,0,100,7,0,1,97.428571,0
343,YLMEEDEDAYKK,YLM[655.3735]EEDEDAYKK,R,Q,210,221,12,4,3M[655.3735],sp|P47962|RL5_MOUSE,...,1,0,0,0,46,3,1,0,64.666667,0
344,YPIEHGIITNWDDMEK,YPIEHGIITNWDDM[649.3660]EK,K,I,71,86,16,4,14M[649.3660],sp|P68033|ACTC_MOUSE,...,1,0,0,0,149,10,0,1,129.238095,0
346,ASVLSYESLVHAVAGAVGSVTAMTVFFPLDTAR,n[42.0106]ASVLSYESLVHAVAGAVGSVTAM[649.3660]TVF...,M,L,2,34,33,4,"N-term[42.0106],23M[649.3660]",sp|O70579|PM34_MOUSE,...,1,0,0,0,110,2,1,0,97.857143,0


In [38]:
# Compute left/right analysis sequences based on threshold
inaccessible_methionines[f"Left {analysis_threshold}"] = [A[B:C] for A, B, C in zip(inaccessible_methionines["Complete Sequence"], inaccessible_methionines["Methionine Loc"] - analysis_threshold - 1, inaccessible_methionines["Methionine Loc"] - 1)]
inaccessible_methionines[f"Right {analysis_threshold}"] = [A[B:C] for A, B, C in zip(inaccessible_methionines["Complete Sequence"], inaccessible_methionines["Methionine Loc"], inaccessible_methionines["Methionine Loc"] + analysis_threshold)]
inaccessible_methionines

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inaccessible_methionines[f"Left {analysis_threshold}"] = [A[B:C] for A, B, C in zip(inaccessible_methionines["Complete Sequence"], inaccessible_methionines["Methionine Loc"] - analysis_threshold - 1, inaccessible_methionines["Methionine Loc"] - 1)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inaccessible_methionines[f"Right {analysis_threshold}"] = [A[B:C] for A, B, C in zip(inaccessible_methionines["Complete Sequence"], inaccessible_methionines["Methionine Loc"], inaccessible_methionines["Methionine Loc"] + 

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10,IDR,Left 10,Right 10
0,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,0,0,144,5,1,0,120.857143,0,KAADTIGYPV,IRSAYALGGL
1,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,0,0,123,11,0,1,102.000000,0,KMMGSAKAAE,LLFGKKLTAR
3,AAHLMDVAGNK,AAHLM[655.3735]DVAGNK,K,T,977,987,11,3,5M[655.3735],sp|Q8K370|ACD10_MOUSE,...,0,0,138,10,0,1,123.904762,0,RLLVLKAAHL,DVAGNKTAAL
5,AALTMFSTIIR,AALTM[649.3660]FSTIIR,K,Q,238,248,11,3,5M[649.3660],sp|P51658|DHB2_MOUSE,...,0,0,125,2,1,0,129.761905,0,AYAATKAALT,FSTIIRQELD
6,AAMEHAGK,AAM[649.3660]EHAGK,R,K,118,125,8,3,3M[649.3660],sp|Q8QZY2|GLCTK_MOUSE,...,0,0,115,8,0,1,114.523810,0,ISVPKGIRAA,EHAGKKEMLL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,YHPMDYYWWLR,YHPM[649.3660]DYYWWLR,R,M,315,325,11,4,4M[649.3660],sp|Q80XN0|BDH_MOUSE,...,0,0,100,7,0,1,97.428571,0,AATPYTRYHP,DYYWWLRMQI
343,YLMEEDEDAYKK,YLM[655.3735]EEDEDAYKK,R,Q,210,221,12,4,3M[655.3735],sp|P47962|RL5_MOUSE,...,0,0,46,3,1,0,64.666667,0,QNVADYMRYL,EEDEDAYKKQ
344,YPIEHGIITNWDDMEK,YPIEHGIITNWDDM[649.3660]EK,K,I,71,86,16,4,14M[649.3660],sp|P68033|ACTC_MOUSE,...,0,0,149,10,0,1,129.238095,0,EHGIITNWDD,EKIWHHTFYN
346,ASVLSYESLVHAVAGAVGSVTAMTVFFPLDTAR,n[42.0106]ASVLSYESLVHAVAGAVGSVTAM[649.3660]TVF...,M,L,2,34,33,4,"N-term[42.0106],23M[649.3660]",sp|O70579|PM34_MOUSE,...,0,0,110,2,1,0,97.857143,0,VAGAVGSVTA,TVFFPLDTAR


In [39]:
#inaccessible_methionines.to_csv(os.path.join(curr_dir_path, "inaccessible_methionines_dataset.csv"))