## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
# Number of amino acids on either side of a site that are included in the analysis
analysis_threshold = 20

# Bracketed modification values as they appear in the peptide strings, e.g. M[649.3660]
light_modification = "649.3660"
heavy_modification = "655.3735"

# Modifications to look for, light first, then heavy.
# NOTE(review): these are described as regex strings, but the "." is not escaped,
# so as a pattern it matches any character in that position - confirm whether
# the consumers of this list expect escaped patterns.
modifications = [light_modification, heavy_modification]

## Load Dataset - MsrAKD

In [None]:
# Datasets directory, resolved to an absolute path from the current working
# directory. The input workbook is read from here in the next cell
# (originally noted as the path for csv output data).
datasets_path_str = "../datasets/"
datasets_path = os.path.abspath(datasets_path_str)
print(datasets_path)

In [None]:
# Load the "293T_MsrAKD_quant" sheet of the MsrKD workbook into `peptides`.
data_loc = os.path.join(datasets_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrAKD_quant")
# Bare last expression so the notebook renders the loaded frame.
peptides

In [None]:
# Canonicalize data - nothing to do for this sheet.
# Placeholder only: the trailing semicolon suppresses the cell output.
peptides;

In [None]:
# Row colors, given as consecutive runs in the order the rows appear in the sheet.
# NOTE(review): the run lengths are hand-entered and depend on the sheet's row
# order - confirm them whenever the workbook changes.
color_runs = [("blue", 157), ("green", 381), ("white", 9), ("red", 12), ("gray", 104)]
label_col_data = [color for color, run_length in color_runs for _ in range(run_length)]

# Assigning a Series as a column aligns on the index: surplus labels are dropped
# and rows without a label get NaN, with no error raised. Report a length
# mismatch explicitly; print() is used because the imports cell silences all
# warnings via warnings.filterwarnings('ignore').
if len(label_col_data) != len(peptides):
    print(
        f"WARNING: {len(label_col_data)} color labels for {len(peptides)} peptide rows; "
        "surplus labels are ignored and unlabeled rows get NaN."
    )

label_col = pd.Series(label_col_data)
peptides["color"] = label_col

# To inspect every row, wrap the call below in
# `with pd.option_context("display.max_rows", None):`.
display(peptides)

In [None]:
# Independent copy of the peptide table; the following cells read from this copy.
peptides_completed_sequence = peptides.copy()

## Download AlphaFold Data - MsrAKD

In [None]:
# Resolve the AlphaFold data directory and its "cif" / "pae" subfolders
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)
cif_dir, pae_dir = (os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae"))

# Echo the three resolved locations, one per line
print(alphafold_path, cif_dir, pae_dir, sep="\n")

In [None]:
# Distinct values of the "Protein ID" column: the UniProt IDs passed to the
# AlphaFold downloads below.
uniprotIDs = peptides_completed_sequence["Protein ID"].unique()
# Show the IDs together with how many there are.
uniprotIDs, len(uniprotIDs)

In [None]:
# Download AlphaFold cif data for the selected proteins into cif_dir.
# SLOW THE FIRST TIME
# The result is unpacked into three values named valid / invalid / existing.
# NOTE(review): presumably "existing" are files already present in out_folder and
# "invalid" are IDs for which nothing could be fetched - confirm in the
# structuremap documentation.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

In [None]:
# Download AlphaFold pae data for the selected proteins into pae_dir.
# SLOW THE FIRST TIME
# The result is unpacked into three values named valid / invalid / existing.
# NOTE(review): presumably "existing" are files already present in out_folder and
# "invalid" are IDs for which nothing could be fetched - confirm in the
# structuremap documentation.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

In [None]:
# Show the protein IDs that the pae download reported as invalid.
invalid_proteins_pae

In [None]:
# Show the peptide rows whose "Protein ID" is among the invalid pae downloads.
peptides_completed_sequence[peptides_completed_sequence["Protein ID"].isin(invalid_proteins_pae)]

## Load Dataset - MsrB2KD

In [3]:
# Datasets directory, resolved to an absolute path from the current working
# directory. The input workbook is read from here in the next cell
# (originally noted as the path for csv output data).
datasets_path_str = "../datasets/"
datasets_path = os.path.abspath(datasets_path_str)
print(datasets_path)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/datasets


In [4]:
# Load the "293T_MsrB2KD_quant" sheet of the MsrKD workbook into `peptides`.
data_loc = os.path.join(datasets_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrB2KD_quant")
# Bare last expression so the notebook renders the loaded frame.
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,Q16836,HCDH_HUMAN,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,P23193,TCEA1_HUMAN,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,Q16181,SEPT7_HUMAN,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,P62258,1433E_HUMAN,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,Q00341,VIGLN_HUMAN,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,P00374,DYR_HUMAN,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,P14868,SYDC_HUMAN,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478


In [5]:
# Canonicalize data - nothing to do for this sheet.
# Placeholder only: the trailing semicolon suppresses the cell output.
peptides;

In [8]:
# Row colors, given as consecutive runs in the order the rows appear in the sheet.
# NOTE(review): the run lengths are hand-entered and sum to 754, while the saved
# output of this notebook ends at row index 752 (which suggests 753 rows) -
# confirm the run lengths against the sheet.
color_runs = [("blue", 10), ("white", 30), ("green", 381), ("red", 213), ("gray", 120)]
label_col_data = [color for color, run_length in color_runs for _ in range(run_length)]

# Assigning a Series as a column aligns on the index: surplus labels are dropped
# and rows without a label get NaN, with no error raised. Report a length
# mismatch explicitly; print() is used because the imports cell silences all
# warnings via warnings.filterwarnings('ignore').
if len(label_col_data) != len(peptides):
    print(
        f"WARNING: {len(label_col_data)} color labels for {len(peptides)} peptide rows; "
        "surplus labels are ignored and unlabeled rows get NaN."
    )

label_col = pd.Series(label_col_data)
peptides["color"] = label_col

# To inspect every row, wrap the call below in
# `with pd.option_context("display.max_rows", None):`.
display(peptides)

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,HCDH_HUMAN,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178,blue
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,TCEA1_HUMAN,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48,blue
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,SEPT7_HUMAN,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355,blue
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,MYH9_HUMAN,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,1433E_HUMAN,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160,blue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,VIGLN_HUMAN,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128,gray
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,DYR_HUMAN,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126,gray
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,MYH9_HUMAN,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489,gray
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,SYDC_HUMAN,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478,gray


In [9]:
# Independent copy of the peptide table; the following cells read from this copy.
peptides_completed_sequence = peptides.copy()

## Download AlphaFold Data - MsrB2KD

In [10]:
# Resolve the AlphaFold data directory and its "cif" / "pae" subfolders
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)
cif_dir, pae_dir = (os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae"))

# Echo the three resolved locations, one per line
print(alphafold_path, cif_dir, pae_dir, sep="\n")

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [11]:
# Distinct values of the "Protein ID" column: the UniProt IDs passed to the
# AlphaFold downloads below.
uniprotIDs = peptides_completed_sequence["Protein ID"].unique()
# Show the IDs together with how many there are.
uniprotIDs, len(uniprotIDs)

(array(['Q16836', 'P23193', 'Q16181', 'P35579', 'P62258', 'P46109',
        'P55072', 'Q9Y265', 'P25205', 'P61024', 'P41227', 'P18583',
        'Q9UN37', 'O14744', 'Q86UP2', 'O14874', 'P36543', 'Q9Y2W2',
        'P14174', 'Q9Y617', 'Q8WVK2', 'P31948', 'Q9Y3U8', 'Q99729',
        'Q9UKD2', 'Q9Y3I0', 'P27144', 'Q9UHX1', 'P22307', 'Q01518',
        'Q9BWF3', 'Q9Y580', 'O43707', 'P22061', 'P52272', 'Q9HD42',
        'P50454', 'O95831', 'P18859', 'P05067', 'P60709', 'P68032',
        'P35611', 'P55196', 'Q4VCS5', 'P08243', 'P05023', 'P24539',
        'Q9NVI7', 'Q8WWM7', 'Q07812', 'Q9NYF8', 'Q9UHR4', 'P11021',
        'Q9BRK5', 'O43852', 'Q14444', 'Q96CT7', 'Q16543', 'P06493',
        'P61604', 'P10809', 'Q9UQN3', 'Q9H444', 'Q9Y3Y2', 'Q14011',
        'E9PRG8', 'Q07065', 'Q15003', 'P09669', 'P33240', 'Q9H0L4',
        'Q92841', 'Q9NR30', 'Q9BUQ8', 'P00367', 'Q08211', 'Q99615',
        'O75937', 'P55265', 'P33316', 'Q14204', 'P55084', 'P42126',
        'Q6P2E9', 'P29692', 'Q14152', 'O75821', 

In [12]:
# Download AlphaFold cif data for the selected proteins into cif_dir.
# SLOW THE FIRST TIME
# The result is unpacked into three values named valid / invalid / existing.
# NOTE(review): presumably "existing" are files already present in out_folder and
# "invalid" are IDs for which nothing could be fetched - confirm in the
# structuremap documentation.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 464/464 [01:18<00:00,  5.94it/s]

2024-05-19 19:43:54> Valid proteins: 27
2024-05-19 19:43:54> Invalid proteins: 6
2024-05-19 19:43:54> Existing proteins: 431





In [13]:
# Download AlphaFold pae data for the selected proteins into pae_dir.
# SLOW THE FIRST TIME
# The result is unpacked into three values named valid / invalid / existing.
# NOTE(review): presumably "existing" are files already present in out_folder and
# "invalid" are IDs for which nothing could be fetched - confirm in the
# structuremap documentation.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 464/464 [01:23<00:00,  5.57it/s]

2024-05-19 19:45:18> Valid proteins: 27
2024-05-19 19:45:18> Invalid proteins: 6
2024-05-19 19:45:18> Existing proteins: 431





In [14]:
# Show the protein IDs that the pae download reported as invalid.
invalid_proteins_pae

['Q14204', 'Q09666', 'Q14789', 'Q9Y520', 'P46013', 'Q9NU22']

In [15]:
# Show the peptide rows whose "Protein ID" is among the invalid pae downloads.
peptides_completed_sequence[peptides_completed_sequence["Protein ID"].isin(invalid_proteins_pae)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color
97,RSELEEQQM[649.3660]HLNVGLR,RSELEEQQM[655.3735]HLNVGLR,1.979056,1.859028,1.810054,1.87224,1.952729,2.033539,1.941938,1.988914,...,DYHC1_HUMAN,DYNC1H1,Cytoplasmic dynein 1 heavy chain 1,1.020051e-10,9.991378,2.025697,3199,M3199,DYHC1_M3199,green
108,ISM[649.3660]PDIDLNLKGPK,ISM[655.3735]PDIDLNLKGPK,,,2.317727,1.315954,2.486824,2.297093,,2.372561,...,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,5.110537e-06,5.291533,2.072387,0,M0,AHNK_M0,green
130,AFQIM[649.3660]QEELR,AFQIM[655.3735]QEELR,1.059465,1.217048,1.22789,1.415169,1.178414,1.12477,1.382358,1.389273,...,GOGB1_HUMAN,GOLGB1,Golgin subfamily B member 1,1.497988e-12,11.824492,1.246823,2934,M2934,GOGB1_M2934,green
248,M[649.3660]NSIVYQK,M[655.3735]NSIVYQK,1.4284,,2.302641,,,,,1.752522,...,PRC2C_HUMAN,PRRC2C,Protein PRRC2C,0.001036683,2.984354,1.91504,2702,M2702,PRC2C_M2702,green
253,ISM[649.3660]PDVDLHLK,ISM[655.3735]PDVDLHLK,2.023708,,1.388773,,,,2.04048,1.42799,...,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,0.0002795943,3.553472,1.695889,0,M0,AHNK_M0,green
462,INM[649.3660]LVIELK,INM[655.3735]LVIELK,1.651044,1.464602,,,,1.581897,,,...,DYHC1_HUMAN,DYNC1H1,Cytoplasmic dynein 1 heavy chain 1,6.641485e-05,4.177735,1.438639,1398,M1398,DYHC1_M1398,red
463,M[649.3660]VVLSLPR,M[655.3735]VVLSLPR,2.074851,2.450636,1.930566,2.136952,,2.123057,2.228832,1.589716,...,DYHC1_HUMAN,DYNC1H1,Cytoplasmic dynein 1 heavy chain 1,3.335842e-08,7.476795,1.993457,991,M991,DYHC1_M991,red
504,AM[649.3660]HTPKPAVGEEK,AM[655.3735]HTPKPAVGEEK,,0.840213,,,,,1.688776,2.119348,...,KI67_HUMAN,MKI67,Proliferation marker protein Ki-67,0.002047267,2.688826,1.691246,1782,M1782,KI67_M1782,red
552,GLIPAGTQHSM[649.3660]IATTGK,GLIPAGTQHSM[655.3735]IATTGK,,,,,1.775794,1.826232,1.304407,,...,PRC2C_HUMAN,PRRC2C,Protein PRRC2C,5.141039e-05,4.288949,1.643852,2648,M2648,PRC2C_M2648,red
719,SM[649.3660]GDHNER,SM[655.3735]GDHNER,,,1.642751,,,,,,...,MDN1_HUMAN,MDN1,Midasin,0.08398708,1.075788,1.4503,5110,M5110,MDN1_M5110,gray
