## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
analysis_threshold = 10 # number of amino acids either side to analyze
modifications = ["655.3735", "649.3660"] # which modifications we are looking for, as regex strings

## Load Chemoproteomics Data

In [3]:
# path for csv output data
datasets_path_str = "../datasets/"
datasets_path = os.path.abspath(datasets_path_str)
print(datasets_path)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/datasets


In [4]:
data_loc = os.path.join(datasets_path, "combined_modified_peptide.tsv")
peptides = pd.read_csv(data_loc, delimiter="\t")
peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,11 Match Type,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type
0,AAALVLQTIWGYK,AAALVLQTIWGYK,R,E,821,833,13,2,,sp|P30999|CTND1_MOUSE,...,unmatched,MS/MS,unmatched,unmatched,MS/MS,MS/MS,MS/MS,unmatched,unmatched,unmatched
1,AAATFNPELITHILDGSPENTRR,AAATFNPELITHILDGSPENTRR,R,R,10,32,23,4,,sp|Q9R0H0|ACOX1_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
2,AAAVGIAQVVISR,AAAVGIAQVVISR,R,I,220,232,13,2,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
3,AAAVGIAQVVISRITMAAPGMILLPVIMER,AAAVGIAQVVISRITMAAPGMILLPVIMER,R,L,220,249,30,34,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
4,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,R,L,220,252,33,4,,sp|Q925N2|SFXN2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5447,STFRLALIQLQVSSIKSDNLTR,n[42.0106]STFRLALIQLQVSSIKSDNLTR,M,A,2,23,22,3,N-term(42.0106),sp|Q9JHW2|NIT2_MOUSE,...,MS/MS,unmatched,unmatched,MS/MS,unmatched,unmatched,MS/MS,unmatched,MS/MS,MS/MS
5448,TDAAVSFAK,n[42.0106]TDAAVSFAK,M,D,2,10,9,2,N-term(42.0106),sp|P51881|ADT2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched
5449,TDAAVSFAKDFLAGGVAAAISK,n[42.0106]TDAAVSFAKDFLAGGVAAAISK,M,T,2,23,22,23,N-term(42.0106),sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
5450,TDAAVSFAKDFLAGGVAAAISKTAVAPIER,n[42.0106]TDAAVSFAKDFLAGGVAAAISKTAVAPIER,M,V,2,31,30,234,N-term(42.0106),sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS


In [5]:
# Canonicalize data
peptides["Assigned Modifications"] = peptides["Assigned Modifications"].str.replace("(", "[", regex=False)
peptides["Assigned Modifications"] = peptides["Assigned Modifications"].str.replace(")", "]", regex=False)
peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,11 Match Type,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type
0,AAALVLQTIWGYK,AAALVLQTIWGYK,R,E,821,833,13,2,,sp|P30999|CTND1_MOUSE,...,unmatched,MS/MS,unmatched,unmatched,MS/MS,MS/MS,MS/MS,unmatched,unmatched,unmatched
1,AAATFNPELITHILDGSPENTRR,AAATFNPELITHILDGSPENTRR,R,R,10,32,23,4,,sp|Q9R0H0|ACOX1_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
2,AAAVGIAQVVISR,AAAVGIAQVVISR,R,I,220,232,13,2,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
3,AAAVGIAQVVISRITMAAPGMILLPVIMER,AAAVGIAQVVISRITMAAPGMILLPVIMER,R,L,220,249,30,34,,sp|Q925N2|SFXN2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
4,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,AAAVGIAQVVISRITMAAPGMILLPVIMERLER,R,L,220,252,33,4,,sp|Q925N2|SFXN2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5447,STFRLALIQLQVSSIKSDNLTR,n[42.0106]STFRLALIQLQVSSIKSDNLTR,M,A,2,23,22,3,N-term[42.0106],sp|Q9JHW2|NIT2_MOUSE,...,MS/MS,unmatched,unmatched,MS/MS,unmatched,unmatched,MS/MS,unmatched,MS/MS,MS/MS
5448,TDAAVSFAK,n[42.0106]TDAAVSFAK,M,D,2,10,9,2,N-term[42.0106],sp|P51881|ADT2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched
5449,TDAAVSFAKDFLAGGVAAAISK,n[42.0106]TDAAVSFAKDFLAGGVAAAISK,M,T,2,23,22,23,N-term[42.0106],sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
5450,TDAAVSFAKDFLAGGVAAAISKTAVAPIER,n[42.0106]TDAAVSFAKDFLAGGVAAAISKTAVAPIER,M,V,2,31,30,234,N-term[42.0106],sp|P51881|ADT2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS


In [6]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    split_mod = modifications[0].split(".")
    whole = split_mod[0]
    mantissa = split_mod[1]
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        split_mod = modifications[i].split(".")
        whole = split_mod[0]
        mantissa = split_mod[1]
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    #prefix = 
    #suffix = 
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[655\.3735\]|M\[649\.3660\]


In [7]:
# select rows corresponding to desired sequences
methionine_peptides = peptides[peptides["Assigned Modifications"].str.contains(modifications_pattern, na=False)]
methionine_peptides

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,11 Match Type,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,unmatched,unmatched,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,MS/MS,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,MS/MS,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS


In [8]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [9]:
# test - get a single amino acid sequence
#first_protein_ID = methionine_peptides["Protein ID"].iloc[0]
#test_sequence = get_full_protein_seq(first_protein_ID)
#print(test_sequence[575:587])
#print(methionine_peptides["Peptide Sequence"].iloc[0])

In [10]:
# get whole amino acid sequences for methionine peptides
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE
#tqdm.pandas()
#methionine_peptides_completed_sequence = methionine_peptides.copy()
#methionine_peptides_completed_sequence["Complete Sequence"] = methionine_peptides_completed_sequence["Protein ID"].progress_apply(get_full_protein_seq)
#methionine_peptides_completed_sequence

# NOTE: WE CAN SPEED THIS UP BY ONLY GETTING UNIQUE PROTEINS (lots of repeats in the dataset)

In [11]:
#methionine_peptides_completed_sequence.to_csv(os.path.join(datasets_path, "methionine_peptides_completed_sequence.csv"))

In [12]:
path = os.path.join(datasets_path, "methionine_peptides_completed_sequence.csv")
methionine_peptides_completed_sequence = pd.read_csv(path)
methionine_peptides_completed_sequence.set_index("Unnamed: 0", inplace=True)
methionine_peptides_completed_sequence.index.name = None
methionine_peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Assigned Modifications,Protein,...,12 Match Type,2 Match Type,3 Match Type,4 Match Type,5 Match Type,6 Match Type,7 Match Type,8 Match Type,9 Match Type,Complete Sequence
6,AADTIGYPVMIR,AADTIGYPVM[655.3735]IR,K,S,576,587,12,3,10M[655.3735],sp|Q8C196|CPSM_MOUSE,...,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
9,AAEMLLFGK,AAEM[649.3660]LLFGK,K,K,293,301,9,3,4M[649.3660],sp|Q9WUR2|ECI2_MOUSE,...,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
10,AAEMLLFGK,AAEM[655.3735]LLFGK,K,K,293,301,9,3,4M[655.3735],sp|Q9WUR2|ECI2_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
11,AAESSAMAATEK,AAESSAM[649.3660]AATEK,K,K,81,92,12,3,7M[649.3660],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
12,AAESSAMAATEK,AAESSAM[655.3735]AATEK,K,K,81,92,12,3,7M[655.3735],sp|Q9Z1P6|NDUA7_MOUSE,...,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,MMNGRPGHEPLK,n[42.0106]MM[655.3735]NGRPGHEPLK,-,F,1,12,12,4,"N-term[42.0106],2M[655.3735]",sp|Q9CQ54|NDUC2_MOUSE,...,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,MMNGRPGHEPLKFLPDEARSLPPPKLNDPRLVYMGLLGYCTGLMDN...
5441,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[649.3660]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[649.3660]",sp|P24270|CATA_MOUSE,...,unmatched,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5442,SDSRDPASDQMKQWK,n[42.0106]SDSRDPASDQM[655.3735]KQWK,M,E,2,16,15,4,"N-term[42.0106],11M[655.3735]",sp|P24270|CATA_MOUSE,...,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,MS/MS,MSDSRDPASDQMKQWKEQRASQRPDVLTTGGGNPIGDKLNIMTAGS...
5445,SSMTQNLR,n[42.0106]SSM[649.3660]TQNLR,M,E,2,9,8,3,"N-term[42.0106],3M[649.3660]",sp|Q9CQR4|ACO13_MOUSE,...,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MSSMTQNLREVMKVMFKVPGFDRVLEKVTLVSAAPEKLICEMKVEE...


## Download Alphafold Data

In [14]:
# path for alphafold protein data
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [17]:
# set uniprot IDs to use
uniprotIDs = methionine_peptides_completed_sequence["Protein ID"].values
uniprotIDs

array(['Q8C196', 'Q9WUR2', 'Q9WUR2', 'Q9Z1P6', 'Q9Z1P6', 'Q8K370',
       'P48410', 'P51658', 'P51658', 'Q8QZY2', 'Q8QZY2', 'Q8VC30',
       'Q8VC30', 'Q99LC5', 'Q99LC5', 'P51881', 'P51881', 'Q91VS7',
       'Q61335', 'Q8BH95', 'Q8BGT5', 'Q8BGT5', 'Q920A5', 'Q920A5',
       'Q920A5', 'Q8BWT1', 'Q9DB77', 'Q91VS7', 'Q91VS7', 'Q91VS7',
       'Q9QXF8', 'Q9QXF8', 'Q91WU0', 'P97807', 'P97807', 'P30115',
       'Q99JY0', 'Q99JY0', 'P63038', 'P63038', 'P63038', 'P63038',
       'P63038', 'P63038', 'P63038', 'P63038', 'Q8BM55', 'P43024',
       'P43024', 'P54116', 'Q9DBJ1', 'Q8C196', 'Q9CYV5', 'Q9DBJ1',
       'Q8C196', 'Q61102', 'Q9WTP6', 'Q8BGT5', 'Q9JJW0', 'Q9JJW0',
       'Q8C196', 'Q8C196', 'Q9CPQ8', 'Q9CPQ8', 'Q8CGK3', 'P97872',
       'P51660', 'Q60936', 'P54869', 'Q61733', 'Q61733', 'Q9WTP6',
       'Q9WTP6', 'Q925I1', 'Q925I1', 'Q8C196', 'Q8CAQ8', 'P63038',
       'P38647', 'Q8C196', 'Q8C196', 'Q9DBG1', 'P63030', 'Q01853',
       'Q9Z2I8', 'P63038', 'P63038', 'P63038', 'P26443', 'P264

In [18]:
# download cif data for proteins
# SLOW - ONLY DO THIS ONCE TO POPULATE THE FOLDER
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 734/734 [09:27<00:00,  1.29it/s]

2024-03-08 21:17:51> Valid proteins: 207
2024-03-08 21:17:51> Invalid proteins: 0
2024-03-08 21:17:51> Existing proteins: 527





In [23]:
# download pae data for proteins
#  SLOW - ONLY DO THIS ONCE TO POPULATE THE FOLDER
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 734/734 [09:30<00:00,  1.29it/s]

2024-03-08 21:30:51> Valid proteins: 207
2024-03-08 21:30:51> Invalid proteins: 0
2024-03-08 21:30:51> Existing proteins: 527





In [22]:
np.unique(uniprotIDs).size

207