In [1]:
from dataset_utils import *

import os
import warnings

import numpy as np
import pandas as pd

from structuremap.processing import download_alphafold_cif, download_alphafold_pae
import structuremap.utils

# Install a blanket "ignore" filter so no Python warnings are shown for the rest
# of the session, then call structuremap's logger setup helper.
warnings.filterwarnings(action="ignore")
structuremap.utils.set_logger()

### Set Parameters of Analysis

In [2]:
# Residue under study: one-letter code, plus the full name that later cells use
# when building column labels (e.g. f"{amino_acid_str} Location").
amino_acid = "M"
amino_acid_str = "Methionine"

# Number of amino acids on either side of the site to analyze.
analysis_threshold = 20

# Modification labels, kept as strings exactly as they appear inside the
# square brackets of the modified-peptide columns.
light_modification = "649.3660"
heavy_modification = "655.3735"

# Every modification we look for, built from the two labels above so the
# values are spelled out in one place only.
modifications = [light_modification, heavy_modification]

In [3]:
# Relative locations of the working directories (relative to the notebook's cwd).
curr_dir_path_str = "./"
datasets_path_str = "../datasets"
global_data_path_str = "../../global_data"

# Absolute versions of the same locations, used by the rest of the notebook.
curr_dir_path = os.path.abspath(curr_dir_path_str)
datasets_path = os.path.abspath(datasets_path_str)
global_data_path = os.path.abspath(global_data_path_str)

# Echo the resolved paths so a wrong working directory is spotted immediately.
print(f"Current Directory: {curr_dir_path}")
print(f"Datasets Directory: {datasets_path}")
print(f"Global Data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy
Datasets Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/datasets
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Root folder of the AlphaFold data, resolved to an absolute path.
alphafold_path_str = "../../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

# Sub-folders handed to the cif / pae download and accessibility steps below.
cif_dir, pae_dir = (
    os.path.join(alphafold_path, "cif"),
    os.path.join(alphafold_path, "pae"),
)

print(f"AlphaFold Directory: {alphafold_path}")
print(f"CIF Directory: {cif_dir}")
print(f"PAE Directory: {pae_dir}")

AlphaFold Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
CIF Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
PAE Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


## A549

### Load and Process Dataset - A549

In [5]:
# Load the initial isoTOP-ABPP dataset (A549) from the current directory.
# DataFrame previews are capped at 25 rows; this is a session-wide pandas
# display option, so it also affects every later cell.
pd.set_option("display.max_rows", 25)

data_loc = os.path.join(curr_dir_path, "A549_hyperreactivity.csv")
peptides = pd.read_csv(filepath_or_buffer=data_loc)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,Q8IVF2,AHNK2_HUMAN,AHNAK2,Protein AHNAK2,-4.311869,3.689843e-04,
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,-2.793286,5.686721e-04,AHNAK_M3417
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,O00487,PSDE_HUMAN,PSMD14,26S proteasome non-ATPase regulatory subunit 14,-2.697886,2.888136e-04,PSMD14_M167
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-2.425145,1.903466e-07,HNRNPM_M437
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,Q15435,PP1R7_HUMAN,PPP1R7,Protein phosphatase 1 regulatory subunit 7,0.391862,2.353017e-02,PPP1R7_M344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.428814,7.487145e-03,PABPC1_M573
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,P63261,ACTG_HUMAN,ACTG1,"Actin, cytoplasmic 2",8.471928,2.227639e-03,ACTG1_M47
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.059513,4.406509e-02,RPL24_M127
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,P05783,K1C18_HUMAN,KRT18,"Keratin, type I cytoskeletal 18",9.610308,9.293054e-04,KRT18_M84


In [6]:
# Per-column count of missing (NaN) entries in the loaded dataset.
peptides.isna().sum(axis=0)

Peptide Sequence           0
Modified Peptide           0
Light Modified Peptide     0
Heavy Modified Peptide     0
exp_1 Log2 Ratio HL       24
exp_2 Log2 Ratio HL       21
exp_3 Log2 Ratio HL       16
exp_4 Log2 Ratio HL       27
exp_5 Log2 Ratio HL       22
Protein                    0
Protein ID                 0
Entry Name                 0
Gene                       0
Protein Description        0
Average H/L                0
P-value                    0
Site                       3
dtype: int64

In [7]:
# Sanity check: every light / heavy modified peptide should contain exactly one
# of the desired modifications, i.e. each count table below should list only "1".
modifications_pattern = create_modifications_pattern(amino_acid, modifications)
print(modifications_pattern)

for _peptide_col in ("Light Modified Peptide", "Heavy Modified Peptide"):
    display(peptides[_peptide_col].str.count(modifications_pattern).value_counts())

M\[649\.3660\]|M\[655\.3735\]


Light Modified Peptide
1    49
Name: count, dtype: int64

Heavy Modified Peptide
1    49
Name: count, dtype: int64

In [8]:
# Annotate Met site hyperreactivity labels.
# A site is labelled 1 (hyperreactive) when its "Average H/L" value is <= 2 and
# 0 otherwise; rows where the comparison is False (including NaN) get 0.
# NOTE(review): the displayed values include negatives, so this column looks
# log2-scaled -- confirm the threshold of 2 is meant on that scale.
peptides["Hyperreactive"] = np.where(
    ~(peptides["Average H/L"] <= 2),
    0,
    1,
)

# To inspect every labelled row, temporarily lift the display cap:
#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")

In [9]:
# Bring the sequence cache (UniProt ID -> full protein sequence) up to date for
# the proteins in this dataset, then load it into a DataFrame.

unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs!s}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size!s}")

# The cache CSV lives in the global data directory; update_sequence_cache is
# given the file path and the IDs needed here.
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
update_sequence_cache(path, unique_uniprotIDs)

# The first CSV column ("Unnamed: 0") is the saved index: restore it and drop
# its name so it does not show up as a header.
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated = sequence_cache_df_updated.set_index("Unnamed: 0")
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;  # trailing ';' suppresses the rich display

Unique UniProt IDs: 
['Q8IVF2' 'Q09666' 'O00487' 'P52272' 'Q15435' 'Q15233' 'Q8WUH6' 'Q9BQ04'
 'P35579' 'Q14683' 'Q15149' 'P60228' 'Q9H694' 'O60610' 'Q9H4G0' 'P10809'
 'P46926' 'P41227' 'P61158' 'Q9H444' 'Q86UP2' 'P49915' 'P18621' 'P67870'
 'Q13813' 'P62258' 'Q01518' 'Q86V81' 'Q14152' 'P53999' 'Q13283' 'P26038'
 'P61247' 'P55072' 'Q16181' 'P15311' 'P18669' 'P62805' 'P22626' 'P11940'
 'P63261' 'P83731' 'P05783']
Number of Unique UniProt IDs: 43
   Protein ID                                  Complete Sequence
0      Q8C196  MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1      Q07417  MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2      Q91YI0  MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3      P50247  MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4      P33267  MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
..        ...                                                ...
4      Q9NR28  MAALKSWLSRSVTSFFRYRQCLCVPVVANFKKRCFSELIRPWHKTV...
5      O60879  MEQPGAAASGAGGGSEE

In [10]:
# Attach the cached full protein sequence to each peptide by UniProt ID.
# A left join is used so every peptide row is kept even without a cache hit.
peptides = peptides.merge(
    right=sequence_cache_df_updated,
    on="Protein ID",
    how="left",
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site,Hyperreactive,Complete Sequence
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,Q8IVF2,AHNK2_HUMAN,AHNAK2,Protein AHNAK2,-4.311869,3.689843e-04,,1,MCDCFHMVLPTWPGTPGSVSGRQLQPGEPGAETEDDHSVTEGPADE...
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,-2.793286,5.686721e-04,AHNAK_M3417,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,O00487,PSDE_HUMAN,PSMD14,26S proteasome non-ATPase regulatory subunit 14,-2.697886,2.888136e-04,PSMD14_M167,1,MDRLLRLGGGMPGLGQGPPTDAPAVDTAEQVYISSLALLKMLKHGR...
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-2.425145,1.903466e-07,HNRNPM_M437,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,Q15435,PP1R7_HUMAN,PPP1R7,Protein phosphatase 1 regulatory subunit 7,0.391862,2.353017e-02,PPP1R7_M344,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.428814,7.487145e-03,PABPC1_M573,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,P63261,ACTG_HUMAN,ACTG1,"Actin, cytoplasmic 2",8.471928,2.227639e-03,ACTG1_M47,0,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.059513,4.406509e-02,RPL24_M127,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,P05783,K1C18_HUMAN,KRT18,"Keratin, type I cytoskeletal 18",9.610308,9.293054e-04,KRT18_M84,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...


In [11]:
# Locate each peptide and its modified Met site within the full sequence.
# Per the displayed result, process_dataset adds the location / length / prefix
# columns and the "Left N" / "Right N" flanking-window columns.
peptides = process_dataset(
    peptides,
    amino_acid,
    amino_acid_str,
    analysis_threshold,
    modifications,
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,...,Q8IVF2_M693,1,MCDCFHMVLPTWPGTPGSVSGRQLQPGEPGAETEDDHSVTEGPADE...,692,12,S,1,693,KFKMPKFKMPLFGASAPGKS,EASVDVSAPKVEADVSLLSM
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,...,Q09666_M3416,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,3414,14,VS,2,3416,KVKGSKFKMPFLSISSPKVS,PDVELNLKSPKVKGDLDIAG
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,O00487_M166,1,MDRLLRLGGGMPGLGQGPPTDAPAVDTAEQVYISSLALLKMLKHGR...,161,14,LINAN,5,166,PIQSVKGKVVIDAFRLINAN,MVLGHEPRQTTSNLGHLNKP
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,Q15435_M343,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...,342,9,V,1,343,ETVYLERNPLQKDPQYRRKV,LALPSVRQIDATFVRF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,P11940_M572,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,565,15,LFPLIQA,7,572,APPQEQKQMLGERLFPLIQA,HPTLAGKITGMLLEIDNSEL
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,P63261_M46,0,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,39,11,HQGVMVG,7,46,PRAVFPSIVGRPRHQGVMVG,GQKDSYVGDEAQSKRGILTL
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,P83731_M126,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,124,7,TA,2,126,QAIRAAKEAKKAKQASKKTA,AAAKAPTKAAPKQKIVKPVK
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,P05783_M83,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...,81,9,ET,2,83,ATGIAGGLAGMGGIQNEKET,QSLNDRLASYLDRVRSLETE


In [12]:
# Sanity check: slicing each complete sequence at the recorded peptide location
# and length must reproduce the peptide sequence (expect only True below).
temp = [
    full_sequence[start:start + length]
    for full_sequence, start, length in zip(
        peptides["Complete Sequence"],
        peptides["Peptide Location"],
        peptides["Peptide Length"],
    )
]
(peptides["Peptide Sequence"] == temp).value_counts()

Peptide Sequence
True    49
Name: count, dtype: int64

In [13]:
# Sanity check: the residue at every recorded site location should be the
# amino acid under study (expect a single "M" row in the counts below).
temp = [
    full_sequence[site_index]
    for full_sequence, site_index in zip(
        peptides["Complete Sequence"],
        peptides[f"{amino_acid_str} Location"],
    )
]
pd.Series(temp).value_counts()

M    49
Name: count, dtype: int64

In [14]:
# NOTE: some Met sites do not have a full `analysis_threshold` amino acids to
# either side (the site sits close to the start or end of the protein).
# The expected window length is taken from `analysis_threshold` rather than a
# literal 20, so this filter stays correct if the parameter is changed.
peptides[
    (peptides[f"Left {analysis_threshold}"].str.len() != analysis_threshold)
    | (peptides[f"Right {analysis_threshold}"].str.len() != analysis_threshold)
]

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,Q15435_M343,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...,342,9,V,1,343,ETVYLERNPLQKDPQYRRKV,LALPSVRQIDATFVRF
7,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,2.380718,2.33005,,,,sp|Q9BQ04|RBM4B_HUMAN,...,Q9BQ04_M341,0,MVKLFIGNLPREATEQEIRSLFEQYGKVLECDIIKNYGFVHIEDKT...,336,8,NSLYD,5,341,YGPESELSQASAATRNSLYD,ARYEREQYVDRARYSAF
19,HNPVFGVMS,HNPVFGVMS,HNPVFGVM[649.3660]S,HNPVFGVM[655.3735]S,4.203245,,4.046394,,,sp|P61158|ARP3_HUMAN,...,P61158_M416,0,MAGRLPACVVDCGTGYTKLGYAGNTEPQFIIPSCIAIKESAKVGDQ...,409,9,HNPVFGV,7,416,KKDYEEIGPSICRHNPVFGV,S
29,ADMQNLVER,n[42.0106]ADMQNLVER,n[42.0106]ADM[649.3660]QNLVER,n[42.0106]ADM[655.3735]QNLVER,,,6.328781,4.68717,4.748218,sp|Q01518|CAP1_HUMAN,...,Q01518_M3,0,MADMQNLVERLERAVGRLEAVSHTSDMHRGYADSPSKAGAAPYVQA...,1,9,AD,2,3,MA,QNLVERLERAVGRLEAVSHT
30,MDMSLDDIIK,MDMSLDDIIK,M[649.3660]DMSLDDIIK,M[655.3735]DMSLDDIIK,5.591513,6.189852,,,,sp|Q86V81|THOC4_HUMAN,...,Q86V81_M4,0,MADKMDMSLDDIIKLNRSQRGGRGGGRGRGRAGSQGGRGGGAQAAA...,4,10,,0,4,MAD,DMSLDDIIKLNRSQRGGRGG
40,AMEAVAAQGK,AMEAVAAQGK,AM[649.3660]EAVAAQGK,AM[655.3735]EAVAAQGK,8.617776,7.403858,,7.891326,8.504623,sp|P18669|PGAM1_HUMAN,...,P18669_M242,0,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,241,10,A,1,242,NLKPIKPMQFLGDEETVRKA,EAVAAQGKAKK
41,TVTAMDVVYALK,TVTAMDVVYALK,TVTAM[649.3660]DVVYALK,TVTAM[655.3735]DVVYALK,,,,8.683691,7.823752,sp|P62805|H4_HUMAN,...,P62805_M84,0,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,80,12,TVTA,4,84,NVIRDAVTYTEHAKRKTVTA,DVVYALKRQGRTLYGFGG


### Download AlphaFold Data - A549

In [15]:
# Drop peptides that belong to proteins which are invalid according to AlphaFold.
# The IDs are hard-coded; with this dataset that removes 5 peptides
# (2 hyperreactive, 3 not), which are displayed first for reference.

invalid_IDs = ['Q8IVF2', 'Q09666', 'Q15149']

display(peptides.loc[peptides["Protein ID"].isin(invalid_IDs)])

peptides = peptides.loc[~peptides["Protein ID"].isin(invalid_IDs)]
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,...,Q8IVF2_M693,1,MCDCFHMVLPTWPGTPGSVSGRQLQPGEPGAETEDDHSVTEGPADE...,692,12,S,1,693,KFKMPKFKMPLFGASAPGKS,EASVDVSAPKVEADVSLLSM
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,...,Q09666_M3416,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,3414,14,VS,2,3416,KVKGSKFKMPFLSISSPKVS,PDVELNLKSPKVKGDLDIAG
10,LVASMEEAR,LVASMEEAR,LVASM[649.3660]EEAR,LVASM[655.3735]EEAR,,3.0758,2.351735,4.269428,,sp|Q15149|PLEC_HUMAN,...,Q15149_M2688,0,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...,2684,9,LVAS,4,2688,EEQQRQQQQMEQERQRLVAS,EEARRRQHEAEEGVRRKQEE
21,FKMPEMNIK,FKMPEMNIK,FKMPEM[649.3660]NIK,FKMPEM[655.3735]NIK,,4.471593,3.982242,,,sp|Q09666|AHNK_HUMAN,...,Q09666_M810,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,805,9,FKMPE,5,810,DVSIEEPEGKLKGPKFKMPE,NIKVPKISMPDVDLHLKGPN
33,IHMSGPK,IHMSGPK,IHM[649.3660]SGPK,IHM[655.3735]SGPK,,,6.704194,,6.833744,sp|Q09666|AHNK_HUMAN,...,Q09666_M5029,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,5027,7,IH,2,5029,PEISVGGKGKKSKFKMPKIH,SGPKIKAKKQGFDLNVPGGE


Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,O00487_M166,1,MDRLLRLGGGMPGLGQGPPTDAPAVDTAEQVYISSLALLKMLKHGR...,161,14,LINAN,5,166,PIQSVKGKVVIDAFRLINAN,MVLGHEPRQTTSNLGHLNKP
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,Q15435_M343,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...,342,9,V,1,343,ETVYLERNPLQKDPQYRRKV,LALPSVRQIDATFVRF
5,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,Q15233_M362,1,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,356,9,RQQEEM,6,362,RQEEERRRREEEMRRQQEEM,RRQQEGFKGTFPDAREQEIR
6,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,Q8WUH6_M33,1,MNQTDKNQQEIPSYLNDEPPEGSMKDHPQQQPGMLSRVTGGIFSVT...,25,12,DHPQQQPG,8,33,YLNDEPPEGSMKDHPQQQPG,LSRVTGGIFSVTKGAVGATI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,P11940_M572,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,565,15,LFPLIQA,7,572,APPQEQKQMLGERLFPLIQA,HPTLAGKITGMLLEIDNSEL
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,P63261_M46,0,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,39,11,HQGVMVG,7,46,PRAVFPSIVGRPRHQGVMVG,GQKDSYVGDEAQSKRGILTL
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,P83731_M126,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,124,7,TA,2,126,QAIRAAKEAKKAKQASKKTA,AAAKAPTKAAPKQKIVKPVK
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,P05783_M83,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...,81,9,ET,2,83,ATGIAGGLAGMGGIQNEKET,QSLNDRLASYLDRVRSLETE


In [16]:
# Recompute the UniProt IDs to use, now that the invalid proteins are gone.
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs!s}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size!s}")

Unique UniProt IDs: 
['O00487' 'P52272' 'Q15435' 'Q15233' 'Q8WUH6' 'Q9BQ04' 'P35579' 'Q14683'
 'P60228' 'Q9H694' 'O60610' 'Q9H4G0' 'P10809' 'P46926' 'P41227' 'P61158'
 'Q9H444' 'Q86UP2' 'P49915' 'P18621' 'P67870' 'Q13813' 'P62258' 'Q01518'
 'Q86V81' 'Q14152' 'P53999' 'Q13283' 'P26038' 'P61247' 'P55072' 'Q16181'
 'P15311' 'P18669' 'P62805' 'P22626' 'P11940' 'P63261' 'P83731' 'P05783']
Number of Unique UniProt IDs: 40


In [17]:
# Fetch the AlphaFold cif files for the selected proteins into cif_dir.
# SLOW THE FIRST TIME - the cif data is cached on disk, so later runs are fast.
(
    valid_proteins_cif,
    invalid_proteins_cif,
    existing_proteins_cif,
) = download_alphafold_cif(proteins=unique_uniprotIDs, out_folder=cif_dir)

100%|██████████| 40/40 [00:00<00:00, 66894.80it/s]

2025-03-24 19:25:30> Valid proteins: 0
2025-03-24 19:25:30> Invalid proteins: 0
2025-03-24 19:25:30> Existing proteins: 40





In [18]:
# Fetch the AlphaFold pae files for the selected proteins into pae_dir.
# SLOW THE FIRST TIME - the pae data is cached on disk, so later runs are fast.
(
    valid_proteins_pae,
    invalid_proteins_pae,
    existing_proteins_pae,
) = download_alphafold_pae(proteins=unique_uniprotIDs, out_folder=pae_dir)

100%|██████████| 40/40 [00:00<00:00, 57633.86it/s]

2025-03-24 19:25:30> Valid proteins: 0
2025-03-24 19:25:30> Invalid proteins: 0
2025-03-24 19:25:30> Existing proteins: 40





### Calculate Accessibilities and Merge into Full Dataset - A549

In [19]:
# Radii passed to calculate_accessibilities; each one shows up in the names of
# the resulting nAA_<radius>_... columns.
radii = [2, 3, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 12, 18, 24]

smooth_accessibilities = calculate_accessibilities(
    cif_dir,
    pae_dir,
    unique_uniprotIDs,
    radii,
)

# Zero-index the positions to match the site locations in the initial dataframe.
smooth_accessibilities["position"] -= 1
smooth_accessibilities

100%|██████████| 1986/1986 [00:03<00:00, 559.32it/s] 
100%|██████████| 40/40 [00:01<00:00, 23.70it/s]
100%|██████████| 40/40 [00:00<00:00, 75.36it/s]
100%|██████████| 40/40 [00:00<00:00, 89.89it/s]
100%|██████████| 40/40 [00:00<00:00, 62.94it/s]
100%|██████████| 40/40 [00:00<00:00, 84.11it/s]
100%|██████████| 40/40 [00:00<00:00, 91.80it/s]
100%|██████████| 40/40 [00:00<00:00, 91.66it/s]
100%|██████████| 40/40 [00:00<00:00, 94.09it/s]
100%|██████████| 40/40 [00:00<00:00, 94.13it/s]
100%|██████████| 40/40 [00:00<00:00, 92.71it/s]
100%|██████████| 40/40 [00:00<00:00, 92.80it/s]
100%|██████████| 40/40 [00:00<00:00, 85.50it/s]
100%|██████████| 40/40 [00:00<00:00, 71.36it/s] 
100%|██████████| 40/40 [00:00<00:00, 58.53it/s]
100%|██████████| 40/40 [00:00<00:00, 248.14it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,O00487,1,M,0,42.66,74.318,75.707,76.187,75.708,-15.972,...,1.181818,1.909091,1.909091,1.909091,1.909091,1.909091,3.636364,6.181818,8.181818,1
1,O00487,1,D,1,35.66,73.558,74.180,75.681,73.817,-14.838,...,1.250000,1.916667,1.916667,1.916667,1.916667,1.916667,3.666667,6.166667,8.250000,1
2,O00487,1,R,2,40.58,70.267,71.774,72.090,72.286,-13.908,...,1.230769,1.923077,1.923077,1.923077,1.923077,1.923077,3.692308,6.230769,8.307692,1
3,O00487,1,L,3,45.02,67.830,68.489,67.564,69.862,-11.412,...,1.214286,1.928571,1.928571,1.928571,1.928571,1.928571,3.642857,6.214286,8.285714,1
4,O00487,1,L,4,44.63,68.422,68.193,69.059,68.614,-8.472,...,1.200000,1.933333,1.933333,1.933333,1.933333,1.933333,3.666667,6.200000,8.333333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,Q9H694,40,V,969,37.87,8.571,9.527,10.159,8.796,0.576,...,1.400000,1.933333,1.933333,1.933333,1.933333,1.933333,3.800000,6.466667,9.066667,1
970,Q9H694,40,S,970,36.09,7.259,8.703,9.652,9.070,3.555,...,1.357143,1.928571,1.928571,1.928571,1.928571,1.928571,3.785714,6.428571,9.000000,1
971,Q9H694,40,G,971,39.87,6.657,7.155,,6.684,6.355,...,1.384615,1.923077,1.923077,1.923077,1.923077,1.923077,3.769231,6.384615,8.923077,1
972,Q9H694,40,R,972,36.39,5.515,6.927,7.919,6.991,8.631,...,1.416667,1.916667,1.916667,1.916667,1.916667,1.916667,3.750000,6.333333,8.833333,1


In [20]:
# Join the per-residue accessibility features onto each peptide's modified site,
# matching on (protein, zero-indexed position). A left join keeps peptides even
# when no matching residue exists in the accessibility table.
site_keys = ["Protein ID", f"{amino_acid_str} Location"]
residue_keys = ["protein_id", "position"]

peptides = peptides.merge(
    right=smooth_accessibilities,
    left_on=site_keys,
    right_on=residue_keys,
    how="left",
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,1.714286,2.095238,2.238095,2.619048,3.047619,3.761905,10.142857,30.904762,58.000000,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,2.142857,2.285714,3.571429,5.619048,6.952381,7.333333,19.285714,47.238095,78.476190,0
3,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,0.857143,1.380952,1.380952,2.333333,2.952381,3.952381,6.952381,13.238095,18.142857,1
4,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.809524,5.047619,6.000000,6.809524,15.285714,40.000000,61.428571,0
40,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,1.476190,2.238095,2.523810,2.952381,3.571429,4.095238,9.761905,25.285714,46.380952,0
41,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
42,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1


In [21]:
# Sanity check: ensure UniProt and AlphaFold sequences agree at every site.
# After the left merge, "AA" holds the AlphaFold residue at the site position;
# it should always be the amino acid under study. dropna=False keeps NaN in the
# tally, so peptides that found no AlphaFold residue at all (unmatched in the
# merge) are reported instead of being silently dropped from the counts.

peptides["AA"].value_counts(dropna=False)

# If anything other than `amino_acid` shows up, inspect and drop those rows:
#pd.set_option("display.max_columns", None)
#display(peptides[~(peptides["AA"] == amino_acid)])
#pd.reset_option("display.max_columns")
#peptides = peptides[(peptides["AA"] == amino_acid)]

AA
M    44
Name: count, dtype: int64

In [22]:
# Uncomment to write the processed A549 dataset to "A549_processed.csv" in the
# current directory; the next cell reads that same file back in.
#peptides.to_csv(os.path.join(curr_dir_path, "A549_processed.csv"))

In [23]:
# Reload the processed A549 dataset from disk. The saved row index comes back as
# an "Unnamed: 0" column; restore it as the index and clear the index name.
path = os.path.join(curr_dir_path, "A549_processed.csv")
peptides = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,1.714286,2.095238,2.238095,2.619048,3.047619,3.761905,10.142857,30.904762,58.000000,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,2.142857,2.285714,3.571429,5.619048,6.952381,7.333333,19.285714,47.238095,78.476190,0
3,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,0.857143,1.380952,1.380952,2.333333,2.952381,3.952381,6.952381,13.238095,18.142857,1
4,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.809524,5.047619,6.000000,6.809524,15.285714,40.000000,61.428571,0
40,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,1.476190,2.238095,2.523810,2.952381,3.571429,4.095238,9.761905,25.285714,46.380952,0
41,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
42,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1


## HCT116

### Load and Process Dataset - HCT116

In [24]:
# Load initial isoTOP-ABPP dataset (HCT116).
# This rebinds `peptides` (previously the A549 frame), so every cell below
# operates on the HCT116 data.
data_loc = os.path.join(curr_dir_path, "HCT116_hyperreactivity.csv")
peptides = pd.read_csv(data_loc)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,O94979,SC31A_HUMAN,SEC31A,Protein transport protein Sec31A,-2.345388,0.000065,SEC31A_M823
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,P35579,MYH9_HUMAN,MYH9,Myosin-9,-1.969020,0.020760,MYH9_M1565
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-1.034103,0.005103,HNRNPM_M437
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,O43395,PRPF3_HUMAN,PRPF3,U4/U6 small nuclear ribonucleoprotein Prp3,-0.733743,0.019388,PRPF3_M145
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,Q92572,AP3S1_HUMAN,AP3S1,AP-3 complex subunit sigma-1,-0.553279,0.005246,AP3S1_M168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,P35637,FUS_HUMAN,FUS,RNA-binding protein FUS,8.111503,0.005367,FUS_M511
103,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,8.552063,0.000308,
104,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,P43243,MATR3_HUMAN,MATR3,Matrin-3,8.632669,0.027900,MATR3_M45
105,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.685058,0.020454,PABPC1_M584


In [25]:
# Check dataset for missing (NaN) entries: count the NaN values in each column
peptides.isna().sum()

Peptide Sequence           0
Modified Peptide           0
Light Modified Peptide     0
Heavy Modified Peptide     0
exp_1 Log2 Ratio HL       41
exp_2 Log2 Ratio HL       30
exp_3 Log2 Ratio HL       43
exp_4 Log2 Ratio HL       33
exp_5 Log2 Ratio HL       35
Protein                    0
Protein ID                 0
Entry Name                 0
Gene                       0
Protein Description        0
Average H/L                0
P-value                    0
Site                       6
dtype: int64

In [26]:
# Sanity Check: every light and heavy modified-peptide string should contain
# exactly one match of the target-modification pattern (a count of 1 per row).
modifications_pattern = create_modifications_pattern(amino_acid, modifications)
print(modifications_pattern)
for modified_column in ("Light Modified Peptide", "Heavy Modified Peptide"):
    # Number of pattern matches per peptide, tallied over the whole column.
    match_counts = peptides[modified_column].str.count(modifications_pattern)
    display(match_counts.value_counts())

M\[649\.3660\]|M\[655\.3735\]


Light Modified Peptide
1    107
Name: count, dtype: int64

Heavy Modified Peptide
1    107
Name: count, dtype: int64

In [27]:
# Annotate Met site hyperreactivity labels (Hyperreactive: <= 2):
# 1 where "Average H/L" is at most 2, otherwise 0.
# NOTE(review): "Average H/L" looks like it is on the log2 scale (the displayed
# values match the mean of the exp_* Log2 Ratio columns) - confirm the threshold
# of 2 is meant on that scale.
peptides["Hyperreactive"] = peptides["Average H/L"].le(2).astype(int)

# Uncomment to inspect the full labelled table.
#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")

In [28]:
# Load and, if necessary, update sequence cache df (mapping from UniProt IDs to full protein sequences)

# UniProt accessions present in the current dataset.
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n", unique_uniprotIDs, sep="")
print("Number of Unique UniProt IDs: ", unique_uniprotIDs.size, sep="")

# update_sequence_cache comes from dataset_utils; presumably it adds any of
# these IDs that are missing from the cache CSV - confirm there.
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
update_sequence_cache(path, unique_uniprotIDs)

# Read the cache back, restoring the saved row index and clearing its name.
sequence_cache_df_updated = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
sequence_cache_df_updated;  # trailing ';' suppresses the table display

Unique UniProt IDs: 
['O94979' 'P35579' 'P52272' 'O43395' 'Q92572' 'P11940' 'Q16204' 'Q9UKD2'
 'P14314' 'Q15149' 'Q13617' 'P40222' 'Q86UP2' 'Q04323' 'P54886' 'P33240'
 'Q15233' 'Q9UG63' 'Q99961' 'O60610' 'Q13310' 'P25786' 'Q9C0J8' 'Q8N6H7'
 'P08727' 'Q01518' 'P67870' 'Q9Y2W1' 'P62258' 'Q99623' 'P22830' 'P08238'
 'Q9H9T3' 'P62995' 'Q96I24' 'P10809' 'Q15773' 'P15311' 'P09496' 'P05787'
 'Q16543' 'P14174' 'P41227' 'Q9Y244' 'Q13283' 'Q16181' 'Q9Y3U8' 'P18621'
 'P18583' 'P18669' 'Q04637' 'P50454' 'P14866' 'P26038' 'Q15424' 'P62805'
 'P22626' 'Q9UQE7' 'P46777' 'O60664' 'Q96PK6' 'P63261' 'P62841' 'P84098'
 'P31948' 'P38646' 'P83731' 'P16949' 'P05783' 'Q13123' 'Q07666' 'P11142'
 'Q14152' 'P35637' 'P43243' 'P68104']
Number of Unique UniProt IDs: 76
   Protein ID                                  Complete Sequence
0      Q8C196  MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1      Q07417  MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2      Q91YI0  MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQG

In [29]:
# Attach the cached sequence columns to each peptide row. Left join on
# "Protein ID", so every peptide row is kept.
peptides = pd.merge(
    peptides,
    sequence_cache_df_updated,
    how="left",
    on="Protein ID",
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site,Hyperreactive,Complete Sequence
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,O94979,SC31A_HUMAN,SEC31A,Protein transport protein Sec31A,-2.345388,0.000065,SEC31A_M823,1,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,P35579,MYH9_HUMAN,MYH9,Myosin-9,-1.969020,0.020760,MYH9_M1565,1,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-1.034103,0.005103,HNRNPM_M437,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,O43395,PRPF3_HUMAN,PRPF3,U4/U6 small nuclear ribonucleoprotein Prp3,-0.733743,0.019388,PRPF3_M145,1,MALSKRELDELKPWIEKTVKRVLGFSEPTVVTAALNCVGKGMDKKK...
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,Q92572,AP3S1_HUMAN,AP3S1,AP-3 complex subunit sigma-1,-0.553279,0.005246,AP3S1_M168,1,MIKAILIFNNHGKPRLSKFYQPYSEDTQQQIIRETFHLVSKRDENV...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,P35637,FUS_HUMAN,FUS,RNA-binding protein FUS,8.111503,0.005367,FUS_M511,0,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...
103,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,8.552063,0.000308,,0,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...
104,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,P43243,MATR3_HUMAN,MATR3,Matrin-3,8.632669,0.027900,MATR3_M45,0,MSKSFQQSSLSRDSQGHGRDLSAAGIGLLAAATQSLSMPASLGRMN...
105,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.685058,0.020454,PABPC1_M584,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...


In [30]:
# Process dataset to extract peptide and Met site locations
# NOTE(review): judging by the displayed result, this adds "Peptide Location",
# "Peptide Length", "Left Prefix", "Left Prefix Length", "Methionine Location",
# "Left 20" and "Right 20", and rewrites "Site" as <Protein ID>_M<location>.
# process_dataset is defined in dataset_utils - confirm the details there.
peptides = process_dataset(peptides, amino_acid, amino_acid_str, analysis_threshold, modifications)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,...,O94979_M822,1,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,811,14,GRPGPVAGHHQ,11,822,PYEKQQLPKGRPGPVAGHHQ,PRVQTQQYYPHGENPPPPGF
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,...,P35579_M1564,1,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,...,O43395_M144,1,MALSKRELDELKPWIEKTVKRVLGFSEPTVVTAALNCVGKGMDKKK...,143,8,Q,1,144,VIPGPPSESPGMLTKLQIKQ,MEAATRQIEERKKQLSFISP
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,...,Q92572_M167,1,MIKAILIFNNHGKPRLSKFYQPYSEDTQQQIIRETFHLVSKRDENV...,160,15,AVSAVKN,7,167,EKSEAGLAGAPARAVSAVKN,NLPEIPRNINIGDISIKVPN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,...,P35637_M510,0,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,510,11,,0,510,RGGFRGGRGGGDRGGFGPGK,DSRGEHRQDRRERPY
103,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,...,P52272_M570,0,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,570,8,,0,570,MATGLERMGANNLERMGLER,GANSLERMGLERMGANSLER
104,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,...,P43243_M44,0,MSKSFQQSSLSRDSQGHGRDLSAAGIGLLAAATQSLSMPASLGRMN...,44,7,,0,44,GIGLLAAATQSLSMPASLGR,NQGTARLASLMNLGMSSSLN
105,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,...,P11940_M583,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,580,24,ITG,3,583,ERLFPLIQAMHPTLAGKITG,LLEIDNSELLHMLESPESLR


In [31]:
# Sanity Check: ensure sequence indexing is correct. Slicing each complete
# sequence at [location, location + length) must reproduce the peptide sequence.
temp = [
    full_sequence[start:start + length]
    for full_sequence, start, length in zip(
        peptides["Complete Sequence"],
        peptides["Peptide Location"],
        peptides["Peptide Length"],
    )
]
(peptides["Peptide Sequence"] == temp).value_counts()

Peptide Sequence
True    107
Name: count, dtype: int64

In [32]:
# Sanity Check: ensure Met sites are correct. The residue at each recorded site
# index of the complete sequence is tallied; only the target residue should appear.
temp = [
    full_sequence[site_index]
    for full_sequence, site_index in zip(
        peptides["Complete Sequence"],
        peptides[f"{amino_acid_str} Location"],
    )
]
pd.Series(temp).value_counts()

M    107
Name: count, dtype: int64

In [33]:
# NOTE: some Met sites do not have a full `analysis_threshold` amino acids to
# either side. Flank lengths are compared against `analysis_threshold` itself
# (not a hard-coded 20) so the filter stays consistent with the
# "Left N" / "Right N" column names if the window size is changed.
# NOTE(review): in the displayed rows 29 and 48 the left flank is one residue
# shorter than the Met index implies (e.g. Met at index 3 of "MADM..." with a
# left flank of "MA") - worth checking how process_dataset clamps the left
# window near the start of the sequence.
left_flank_len = peptides[f"Left {analysis_threshold}"].str.len()
right_flank_len = peptides[f"Right {analysis_threshold}"].str.len()
peptides[(left_flank_len != analysis_threshold) | (right_flank_len != analysis_threshold)]

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
29,ADMQNLVER,n[42.0106]ADMQNLVER,n[42.0106]ADM[649.3660]QNLVER,n[42.0106]ADM[655.3735]QNLVER,4.641467,4.085469,3.666118,4.176906,3.701471,sp|Q01518|CAP1_HUMAN,...,Q01518_M3,0,MADMQNLVERLERAVGRLEAVSHTSDMHRGYADSPSKAGAAPYVQA...,1,9,AD,2,3,MA,QNLVERLERAVGRLEAVSHT
48,PMFIVNTNVPR,n[42.0106]PMFIVNTNVPR,n[42.0106]PM[649.3660]FIVNTNVPR,n[42.0106]PM[655.3735]FIVNTNVPR,5.445224,5.402043,,5.275543,5.171523,sp|P14174|MIF_HUMAN,...,P14174_M2,0,MPMFIVNTNVPRASVPDGFLSELTQQLAQATGKPPQYIAVHVVPDQ...,1,11,P,1,2,M,FIVNTNVPRASVPDGFLSEL
56,REELSNVLAAMRK,REELSNVLAAMRK,REELSNVLAAM[649.3660]RK,REELSNVLAAM[655.3735]RK,5.606605,5.635006,5.730737,5.838071,5.713062,sp|Q9Y3U8|RL36_HUMAN,...,Q9Y3U8_M96,0,MALRYPMAVGLNKGHKVTKNVSKPRHSRRRGRLTKHTKFVRDMIRE...,86,13,REELSNVLAA,10,96,VGTHIRAKRKREELSNVLAA,RKAAAKKD
63,KAMEAVAAQGK,KAMEAVAAQGK,KAM[649.3660]EAVAAQGK,KAM[655.3735]EAVAAQGK,5.749203,,5.662263,6.701941,,sp|P18669|PGAM1_HUMAN,...,P18669_M242,0,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,240,11,KA,2,242,NLKPIKPMQFLGDEETVRKA,EAVAAQGKAKK
66,AMEAVAAQGK,AMEAVAAQGK,AM[649.3660]EAVAAQGK,AM[655.3735]EAVAAQGK,6.360304,6.365764,6.329671,6.317992,6.323987,sp|P18669|PGAM1_HUMAN,...,P18669_M242,0,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,241,10,A,1,242,NLKPIKPMQFLGDEETVRKA,EAVAAQGKAKK
71,TVTAMDVVYALK,TVTAMDVVYALK,TVTAM[649.3660]DVVYALK,TVTAM[655.3735]DVVYALK,6.843325,6.847794,6.696281,6.941575,6.688475,sp|P62805|H4_HUMAN,...,P62805_M84,0,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,80,12,TVTA,4,84,NVIRDAVTYTEHAKRKTVTA,DVVYALKRQGRTLYGFGG
83,AAQMHSGYQR,AAQMHSGYQR,AAQM[649.3660]HSGYQR,AAQM[655.3735]HSGYQR,7.261837,7.279966,7.329382,,7.087245,sp|Q96PK6|RBM14_HUMAN,...,Q96PK6_M660,0,MKIFVGNVDGADTTPEELAALFAPYGTVMSCAVMKQFAFVHMRENA...,657,10,AAQ,3,660,AHSDYARYSGSYNDYLRAAQ,HSGYQRRM
90,LMDVGLIAIR,LMDVGLIAIR,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,7.317032,7.700163,7.022413,6.982117,8.229968,sp|P31948|STIP1_HUMAN,...,P31948_M534,0,MEQVNELKEKGNKALSVGNIDDALQCYSEAIKLDPHNHVLYSNRSA...,533,10,L,1,534,PQALSEHLKNPVIAQKIQKL,DVGLIAIR
102,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,...,P35637_M510,0,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,510,11,,0,510,RGGFRGGRGGGDRGGFGPGK,DSRGEHRQDRRERPY


### Download Alphafold Data - HCT116

In [34]:
# Remove invalid proteins (according to alphafold), showing the dropped rows first.
# 2 invalid peptides as a result -> 2 not hyperreactive

invalid_IDs = ['Q15149']
# Build the membership mask once and reuse it for both the display and the filter.
is_invalid_protein = peptides["Protein ID"].isin(invalid_IDs)
display(peptides[is_invalid_protein])
peptides = peptides[~is_invalid_protein]
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
9,LVASMEEAR,LVASMEEAR,LVASM[649.3660]EEAR,LVASM[655.3735]EEAR,2.193126,2.455621,2.143137,2.340442,1.864644,sp|Q15149|PLEC_HUMAN,...,Q15149_M2688,0,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...,2684,9,LVAS,4,2688,EEQQRQQQQMEQERQRLVAS,EEARRRQHEAEEGVRRKQEE
20,QLEMSAEAER,QLEMSAEAER,QLEM[649.3660]SAEAER,QLEM[655.3735]SAEAER,,3.448625,2.917994,3.188039,3.044096,sp|Q15149|PLEC_HUMAN,...,Q15149_M2514,0,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...,2511,10,QLE,3,2514,AEETQGFQRTLEAERQRQLE,SAEAERLKLRVAEMSRAQAR


Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,...,O94979_M822,1,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,811,14,GRPGPVAGHHQ,11,822,PYEKQQLPKGRPGPVAGHHQ,PRVQTQQYYPHGENPPPPGF
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,...,P35579_M1564,1,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,...,O43395_M144,1,MALSKRELDELKPWIEKTVKRVLGFSEPTVVTAALNCVGKGMDKKK...,143,8,Q,1,144,VIPGPPSESPGMLTKLQIKQ,MEAATRQIEERKKQLSFISP
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,...,Q92572_M167,1,MIKAILIFNNHGKPRLSKFYQPYSEDTQQQIIRETFHLVSKRDENV...,160,15,AVSAVKN,7,167,EKSEAGLAGAPARAVSAVKN,NLPEIPRNINIGDISIKVPN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,...,P35637_M510,0,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,510,11,,0,510,RGGFRGGRGGGDRGGFGPGK,DSRGEHRQDRRERPY
103,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,...,P52272_M570,0,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,570,8,,0,570,MATGLERMGANNLERMGLER,GANSLERMGLERMGANSLER
104,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,...,P43243_M44,0,MSKSFQQSSLSRDSQGHGRDLSAAGIGLLAAATQSLSMPASLGRMN...,44,7,,0,44,GIGLLAAATQSLSMPASLGR,NQGTARLASLMNLGMSSSLN
105,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,...,P11940_M583,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,580,24,ITG,3,583,ERLFPLIQAMHPTLAGKITG,LLEIDNSELLHMLESPESLR


In [35]:
# Set UniProt IDs to use: recomputed from the filtered frame, so IDs removed
# above are no longer included.
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n", unique_uniprotIDs, sep="")
print("Number of Unique UniProt IDs: ", unique_uniprotIDs.size, sep="")

Unique UniProt IDs: 
['O94979' 'P35579' 'P52272' 'O43395' 'Q92572' 'P11940' 'Q16204' 'Q9UKD2'
 'P14314' 'Q13617' 'P40222' 'Q86UP2' 'Q04323' 'P54886' 'P33240' 'Q15233'
 'Q9UG63' 'Q99961' 'O60610' 'Q13310' 'P25786' 'Q9C0J8' 'Q8N6H7' 'P08727'
 'Q01518' 'P67870' 'Q9Y2W1' 'P62258' 'Q99623' 'P22830' 'P08238' 'Q9H9T3'
 'P62995' 'Q96I24' 'P10809' 'Q15773' 'P15311' 'P09496' 'P05787' 'Q16543'
 'P14174' 'P41227' 'Q9Y244' 'Q13283' 'Q16181' 'Q9Y3U8' 'P18621' 'P18583'
 'P18669' 'Q04637' 'P50454' 'P14866' 'P26038' 'Q15424' 'P62805' 'P22626'
 'Q9UQE7' 'P46777' 'O60664' 'Q96PK6' 'P63261' 'P62841' 'P84098' 'P31948'
 'P38646' 'P83731' 'P16949' 'P05783' 'Q13123' 'Q07666' 'P11142' 'Q14152'
 'P35637' 'P43243' 'P68104']
Number of Unique UniProt IDs: 75


In [36]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
# Files go to `cif_dir`. The log printed by the call reports "Valid", "Invalid"
# and "Existing" protein counts; the three returned values are kept under the
# correspondingly named variables.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 75/75 [00:00<00:00, 89468.94it/s]

2025-03-24 19:25:42> Valid proteins: 0
2025-03-24 19:25:42> Invalid proteins: 0
2025-03-24 19:25:42> Existing proteins: 75





In [37]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
# Files go to `pae_dir`. The log printed by the call reports "Valid", "Invalid"
# and "Existing" protein counts; the three returned values are kept under the
# correspondingly named variables.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir,
)

100%|██████████| 75/75 [00:00<00:00, 96258.51it/s]

2025-03-24 19:25:42> Valid proteins: 0
2025-03-24 19:25:42> Invalid proteins: 0
2025-03-24 19:25:42> Existing proteins: 75





### Calculate Accessibilities and Merge into Full Dataset - HCT116

In [38]:
# Radii handed to calculate_accessibilities; the result carries one
# nAA_<radius>_180_pae_smooth10 column per radius.
radii = [2, 3, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 12, 18, 24]
smooth_accessibilities = calculate_accessibilities(cif_dir, pae_dir, unique_uniprotIDs, radii)

# zero-index the positions to match initial dataframe
smooth_accessibilities["position"] = smooth_accessibilities["position"].sub(1)
smooth_accessibilities

100%|██████████| 1986/1986 [00:06<00:00, 312.07it/s]
100%|██████████| 75/75 [00:00<00:00, 110.39it/s]
100%|██████████| 75/75 [00:00<00:00, 110.63it/s]
100%|██████████| 75/75 [00:00<00:00, 99.71it/s] 
100%|██████████| 75/75 [00:00<00:00, 90.37it/s] 
100%|██████████| 75/75 [00:00<00:00, 111.36it/s]
100%|██████████| 75/75 [00:00<00:00, 111.72it/s]
100%|██████████| 75/75 [00:00<00:00, 100.23it/s]
100%|██████████| 75/75 [00:00<00:00, 109.33it/s]
100%|██████████| 75/75 [00:00<00:00, 109.21it/s]
100%|██████████| 75/75 [00:00<00:00, 105.52it/s]
100%|██████████| 75/75 [00:00<00:00, 108.19it/s]
100%|██████████| 75/75 [00:00<00:00, 98.84it/s] 
100%|██████████| 75/75 [00:00<00:00, 83.35it/s]
100%|██████████| 75/75 [00:01<00:00, 69.80it/s]
100%|██████████| 75/75 [00:00<00:00, 733.08it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,O43395,1,M,0,56.60,14.046,14.056,12.609,14.984,-2.357,...,1.727273,2.181818,3.454545,3.909091,4.454545,4.636364,9.636364,24.727273,46.090909,0
1,O43395,1,A,1,69.89,12.537,13.954,14.228,14.140,-4.926,...,1.750000,2.166667,3.500000,3.916667,4.583333,4.916667,10.166667,26.500000,48.166667,0
2,O43395,1,L,2,76.19,10.608,11.067,11.081,12.379,-7.441,...,1.769231,2.153846,3.538462,4.000000,4.692308,5.076923,10.307692,27.461538,49.307692,0
3,O43395,1,S,3,83.40,8.765,8.689,7.240,9.296,-9.899,...,1.785714,2.142857,3.571429,4.071429,4.857143,5.214286,10.571429,28.785714,50.714286,0
4,O43395,1,K,4,82.43,7.691,8.715,8.573,8.673,-12.098,...,1.800000,2.133333,3.733333,4.266667,5.066667,5.400000,11.066667,30.533333,52.400000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,Q9Y3U8,75,A,100,93.59,-18.970,-18.508,-19.433,-17.140,-9.412,...,1.666667,2.200000,4.533333,4.733333,6.200000,6.266667,10.333333,24.200000,42.200000,0
101,Q9Y3U8,75,A,101,93.58,-18.406,-19.089,-18.809,-18.691,-12.505,...,1.642857,2.142857,4.500000,4.642857,6.071429,6.142857,9.857143,22.642857,40.571429,0
102,Q9Y3U8,75,K,102,90.40,-17.314,-16.716,-15.196,-17.376,-12.909,...,1.615385,2.153846,4.384615,4.538462,5.923077,6.000000,9.692308,21.230769,38.153846,0
103,Q9Y3U8,75,K,103,75.77,-19.990,-18.736,-18.994,-18.151,-12.488,...,1.583333,2.083333,4.250000,4.416667,5.750000,5.833333,9.416667,19.166667,35.500000,0


In [39]:
# Attach the per-residue accessibility features to each peptide row by matching
# (protein, site index) on both sides. Left join: every peptide row is kept.
peptides = pd.merge(
    peptides,
    smooth_accessibilities,
    how="left",
    left_on=["Protein ID", f"{amino_acid_str} Location"],
    right_on=["protein_id", "position"],
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,...,1.761905,2.047619,3.952381,4.761905,6.095238,6.285714,9.523810,15.857143,20.190476,1
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,...,1.047619,2.000000,2.000000,2.000000,2.047619,2.047619,4.761905,8.238095,12.809524,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,...,1.285714,2.000000,2.000000,2.000000,2.047619,2.047619,5.142857,8.095238,11.142857,1
101,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,...,1.476190,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,7.000000,10.000000,1
102,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,...,1.095238,2.000000,2.000000,2.000000,2.000000,2.000000,5.761905,9.619048,12.285714,1
103,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0


In [40]:
# Sanity Check: ensure UniProt and AlphaFold sequences are the same, i.e. the
# residue merged into "AA" for every site is the target amino acid.
# dropna=False keeps NaN in the tally: value_counts() drops NaN by default, which
# would silently hide any row the preceding left merge could not match.
peptides["AA"].value_counts(dropna=False)

# Optional follow-up (uncomment to use): display the rows whose residue is not
# `amino_acid`, then keep only the matching rows.
#pd.set_option("display.max_columns", None)
#display(peptides[~(peptides["AA"] == amino_acid)])
#pd.reset_option("display.max_columns")
#peptides = peptides[(peptides["AA"] == amino_acid)]

AA
M    105
Name: count, dtype: int64

In [41]:
# Disabled write of the in-memory `peptides` table to HCT116_processed.csv.
# The next cell reads this same path, so uncomment and run once to (re)generate the file.
#peptides.to_csv(os.path.join(curr_dir_path, "HCT116_processed.csv"))

In [42]:
# Reload the processed HCT116 table from disk. The saved row labels come back as the
# "Unnamed: 0" column; restore them as the index and clear the leftover index name.
path = os.path.join(curr_dir_path, "HCT116_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,...,1.761905,2.047619,3.952381,4.761905,6.095238,6.285714,9.523810,15.857143,20.190476,1
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,...,1.047619,2.000000,2.000000,2.000000,2.047619,2.047619,4.761905,8.238095,12.809524,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,...,1.285714,2.000000,2.000000,2.000000,2.047619,2.047619,5.142857,8.095238,11.142857,1
101,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,...,1.476190,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,7.000000,10.000000,1
102,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,...,1.095238,2.000000,2.000000,2.000000,2.000000,2.000000,5.761905,9.619048,12.285714,1
103,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0


## Hela

### Load and Process Dataset - Hela

In [43]:
# Read the raw Hela isoTOP-ABPP hyperreactivity table that sits next to this notebook
data_loc = os.path.join(
    curr_dir_path,
    "Hela_hyperreactivity.csv",
)
peptides = pd.read_csv(data_loc)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,sp|P51532|SMCA4_HUMAN,P51532,SMCA4_HUMAN,SMARCA4,Transcription activator BRG1,-5.234370,1.304830e-04,SMARCA4_M1233
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-4.905390,1.449043e-08,HNRNPM_M437
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,sp|O75165|DJC13_HUMAN,O75165,DJC13_HUMAN,DNAJC13,DnaJ homolog subfamily C member 13,-3.212865,2.823575e-02,DNAJC13_M724
3,FKMPEMHFR,FKM[15.9949]PEMHFR,FKM[15.9949]PEM[649.3660]HFR,FKM[15.9949]PEM[655.3735]HFR,,,,,-2.202729,-2.219461,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,-2.211095,2.408660e-03,AHNAK_M1069
4,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,sp|Q16204|CCDC6_HUMAN,Q16204,CCDC6_HUMAN,CCDC6,Coiled-coil domain-containing protein 6,1.076875,3.861215e-03,CCDC6_M150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,sp|Q07666|KHDR1_HUMAN,Q07666,KHDR1_HUMAN,KHDRBS1,"KH domain-containing, RNA-binding, signal tran...",8.821834,8.021238e-05,KHDRBS1_M21
121,ISMPDFDLHLKGPK,ISMPDFDLHLKGPK,ISM[649.3660]PDFDLHLKGPK,ISM[655.3735]PDFDLHLKGPK,,,,,8.750263,9.232076,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,8.991169,1.705332e-02,
122,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.240172,1.408752e-02,RPL24_M127
123,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,sp|P68104|EF1A1_HUMAN,P68104,EF1A1_HUMAN,EEF1A1,Elongation factor 1-alpha 1,9.831382,3.801740e-04,EEF1A1_M276


In [44]:
# Per-column tally of missing (NaN) entries in the freshly loaded table
peptides.isna().sum(axis=0)

Peptide Sequence           0
Modified Peptide           0
Light Modified Peptide     0
Heavy Modified Peptide     0
exp_1 Light Intensity     45
exp_1 Log2 Ratio HL       45
exp_2 Log2 Ratio HL       48
exp_3 Log2 Ratio HL       49
exp_4 Log2 Ratio HL       51
exp_5 Log2 Ratio HL       41
Protein                    0
Protein ID                 0
Entry Name                 0
Gene                       0
Protein Description        0
Average H/L                0
P-value                    0
Site                       9
dtype: int64

In [45]:
# Sanity Check: each peptide should contain exactly one of the desired modifications.
# For both the light and heavy peptide strings, count pattern matches per peptide and
# show how often each count occurs (ideally only the count 1 appears).
modifications_pattern = create_modifications_pattern(amino_acid, modifications)
print(modifications_pattern)
for peptide_column in ("Light Modified Peptide", "Heavy Modified Peptide"):
    display(peptides[peptide_column].str.count(modifications_pattern).value_counts())

M\[649\.3660\]|M\[655\.3735\]


Light Modified Peptide
1    125
Name: count, dtype: int64

Heavy Modified Peptide
1    125
Name: count, dtype: int64

In [46]:
# Label each Met site: 1 (hyperreactive) when "Average H/L" is <= 2, otherwise 0.
# A missing "Average H/L" compares False and is therefore labeled 0.
peptides["Hyperreactive"] = (peptides["Average H/L"] <= 2).astype(int)

# Uncomment to view the whole labeled table:
#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")

In [47]:
# Sequence cache: a CSV mapping UniProt IDs to full protein sequences.
# List the IDs used by this dataset, let update_sequence_cache refresh the cache file
# if necessary, then read the cache back in.

unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

path = os.path.join(global_data_path, "complete_sequence_cache.csv")
update_sequence_cache(path, unique_uniprotIDs)

# Restore the saved row labels ("Unnamed: 0") as the index and clear the index name;
# the trailing semicolon suppresses the cell's output.
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
sequence_cache_df_updated;

Unique UniProt IDs: 
['P51532' 'P52272' 'O75165' 'Q09666' 'Q16204' 'P35579' 'O94992' 'Q15149'
 'Q9NWH9' 'Q7LBR1' 'O60610' 'P49257' 'Q15366' 'Q9UKD2' 'P0CG12' 'Q9UNZ5'
 'Q01518' 'Q86UP2' 'Q99459' 'P14314' 'Q16181' 'P25786' 'P43243' 'P54886'
 'Q13813' 'Q15233' 'P67870' 'P11940' 'Q07065' 'P26583' 'P10809' 'P62195'
 'P08238' 'P33240' 'P07108' 'Q04323' 'O60749' 'Q16543' 'P49915' 'P14174'
 'Q13283' 'O75533' 'Q9NYF8' 'Q8N6H7' 'P50990' 'P34932' 'Q96I24' 'Q8NE71'
 'Q9C0J8' 'P14625' 'P15170' 'Q14683' 'P41227' 'Q9Y3U8' 'P07900' 'P06703'
 'P62847' 'O95817' 'P00966' 'Q9Y3Y2' 'Q12931' 'Q9UHX1' 'P38646' 'Q04837'
 'P14866' 'P18669' 'P33991' 'O60664' 'P31948' 'O14737' 'P62805' 'P22626'
 'P62841' 'P07910' 'P61247' 'P68032' 'Q15424' 'P83731' 'Q96PK6' 'P46777'
 'P18583' 'P02545' 'Q08211' 'P16949' 'O75396' 'P26373' 'P37802' 'Q15056'
 'P84098' 'P15311' 'Q14152' 'P18124' 'P11142' 'Q07666' 'P68104']
Number of Unique UniProt IDs: 95
   Protein ID                                  Complete Sequence
0      Q8C196

In [48]:
# Attach the cached full protein sequence to every peptide row.
# Left join on "Protein ID" keeps all peptides even if an ID is absent from the cache.
peptides = peptides.merge(
    sequence_cache_df_updated,
    on="Protein ID",
    how="left",
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site,Hyperreactive,Complete Sequence
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,sp|P51532|SMCA4_HUMAN,P51532,SMCA4_HUMAN,SMARCA4,Transcription activator BRG1,-5.234370,1.304830e-04,SMARCA4_M1233,1,MSTPDPPLGGTPRPGPSPGPGPSPGAMLGPSPGPSPGSAHSMMGPS...
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-4.905390,1.449043e-08,HNRNPM_M437,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,sp|O75165|DJC13_HUMAN,O75165,DJC13_HUMAN,DNAJC13,DnaJ homolog subfamily C member 13,-3.212865,2.823575e-02,DNAJC13_M724,1,MNIIRENKDLACFYTTKHSWRGKYKRVFSVGTHAITTYNPNTLEVT...
3,FKMPEMHFR,FKM[15.9949]PEMHFR,FKM[15.9949]PEM[649.3660]HFR,FKM[15.9949]PEM[655.3735]HFR,,,,,-2.202729,-2.219461,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,-2.211095,2.408660e-03,AHNAK_M1069,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...
4,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,sp|Q16204|CCDC6_HUMAN,Q16204,CCDC6_HUMAN,CCDC6,Coiled-coil domain-containing protein 6,1.076875,3.861215e-03,CCDC6_M150,1,MADSASESDTDGAGGNSSSSAAMQSSCSSTSGGGGGGGGGGGGGKS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,sp|Q07666|KHDR1_HUMAN,Q07666,KHDR1_HUMAN,KHDRBS1,"KH domain-containing, RNA-binding, signal tran...",8.821834,8.021238e-05,KHDRBS1_M21,0,MQRRDDPAARMSRSSGRSGSMDPSGAHPSVRQTPSRQPPLPHRSRG...
121,ISMPDFDLHLKGPK,ISMPDFDLHLKGPK,ISM[649.3660]PDFDLHLKGPK,ISM[655.3735]PDFDLHLKGPK,,,,,8.750263,9.232076,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,8.991169,1.705332e-02,,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...
122,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.240172,1.408752e-02,RPL24_M127,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...
123,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,sp|P68104|EF1A1_HUMAN,P68104,EF1A1_HUMAN,EEF1A1,Elongation factor 1-alpha 1,9.831382,3.801740e-04,EEF1A1_M276,0,MGKEKTHINIVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEA...


In [49]:
# Locate each peptide and its modified Met within the full protein sequence.
# Per the displayed result, this adds the peptide location/length, the prefix before the
# modified residue, the Methionine location, and the left/right flanking-residue columns.
peptides = process_dataset(
    peptides,
    amino_acid,
    amino_acid_str,
    analysis_threshold,
    modifications,
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,...,P51532_M1232,1,MSTPDPPLGGTPRPGPSPGPGPSPGAMLGPSPGPSPGSAHSMMGPS...,1227,10,VIQAG,5,1232,KILAAAKYKLNVDQKVIQAG,FDQKSSSHERRAFLQAILEH
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,...,O75165_M723,1,MNIIRENKDLACFYTTKHSWRGKYKRVFSVGTHAITTYNPNTLEVT...,718,9,VDLVL,5,723,AGKAAKEVEKFAKEKVDLVL,HWRDRMGIAQKENINQKPVV
3,FKMPEMHFR,FKM[15.9949]PEMHFR,FKM[15.9949]PEM[649.3660]HFR,FKM[15.9949]PEM[655.3735]HFR,,,,,-2.202729,-2.219461,...,Q09666_M1059,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,1054,9,FKMPE,5,1059,DLSLEGPEGKLKGPKFKMPE,HFRAPKMSLPDVDLDLKGPK
4,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,...,Q16204_M149,1,MADSASESDTDGAGGNSSSSAAMQSSCSSTSGGGGGGGGGGGGGKS...,147,9,KL,2,149,LAVNYEKEEEFLTNELSRKL,QLQHEKAELEQHLEQEQEFQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,...,Q07666_M20,0,MQRRDDPAARMSRSSGRSGSMDPSGAHPSVRQTPSRQPPLPHRSRG...,17,14,SGS,3,20,MQRRDDPAARMSRSSGRSGS,DPSGAHPSVRQTPSRQPPLP
121,ISMPDFDLHLKGPK,ISMPDFDLHLKGPK,ISM[649.3660]PDFDLHLKGPK,ISM[655.3735]PDFDLHLKGPK,,,,,8.750263,9.232076,...,Q09666_M2580,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,2578,14,IS,2,2580,KLKGPKLKMPEMNIKAPKIS,PDFDLHLKGPKVKGDVDVSL
122,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,...,P83731_M126,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,124,7,TA,2,126,QAIRAAKEAKKAKQASKKTA,AAAKAPTKAAPKQKIVKPVK
123,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,...,P68104_M275,0,MGKEKTHINIVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEA...,266,24,VETGVLKPG,9,275,IGGIGTVPVGRVETGVLKPG,VVTFAPVNVTTEVKSVEMHH


In [50]:
# Sanity Check: slicing each complete sequence at
# [Peptide Location, Peptide Location + Peptide Length) must reproduce the peptide itself
temp = [
    sequence[start:start + length]
    for sequence, start, length in zip(
        peptides["Complete Sequence"],
        peptides["Peptide Location"],
        peptides["Peptide Length"],
    )
]
(temp == peptides["Peptide Sequence"]).value_counts()

Peptide Sequence
True    125
Name: count, dtype: int64

In [51]:
# Sanity Check: the residue at every recorded site location should be the target amino acid
temp = [
    sequence[site]
    for sequence, site in zip(
        peptides["Complete Sequence"],
        peptides[f"{amino_acid_str} Location"],
    )
]
pd.Series(temp).value_counts()

M    125
Name: count, dtype: int64

In [52]:
# NOTE: some Met sites do not have a full `analysis_threshold` amino acids to either side.
# The flank lengths are compared against `analysis_threshold` rather than a literal 20, so
# the check stays in step with the f"Left {analysis_threshold}" / f"Right {analysis_threshold}"
# column names if the analysis window is changed.
# NOTE(review): in the rows listed by this cell, truncated Left flanks look one residue
# shorter than "Methionine Location" (e.g. location 3 with Left "MA") — confirm against
# process_dataset.
left_flank_len = peptides[f"Left {analysis_threshold}"].str.len()
right_flank_len = peptides[f"Right {analysis_threshold}"].str.len()
peptides[(left_flank_len != analysis_threshold) | (right_flank_len != analysis_threshold)]

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
8,IVQISGNSMPR,IVQISGNSMPR,IVQISGNSM[649.3660]PR,IVQISGNSM[655.3735]PR,8873681.0,1.854634,,,2.10589,1.905603,...,Q9NWH9_M1014,1,MAAATGAVAASAASGQAEGKKITDLRVIDLKSELKRRNLDITGVKT...,1006,11,IVQISGNS,8,1014,TQHSSNASPINRIVQISGNS,PRGSGSGFKPFKGGPPRRF
16,ADMQNLVER,n[42.0106]ADMQNLVER,n[42.0106]ADM[649.3660]QNLVER,n[42.0106]ADM[655.3735]QNLVER,3235985.5,2.248892,3.686507,2.220728,,2.578891,...,Q01518_M3,0,MADMQNLVERLERAVGRLEAVSHTSDMHRGYADSPSKAGAAPYVQA...,1,9,AD,2,3,MA,QNLVERLERAVGRLEAVSHT
44,PMFIVNTNVPR,n[42.0106]PMFIVNTNVPR,n[42.0106]PM[649.3660]FIVNTNVPR,n[42.0106]PM[655.3735]FIVNTNVPR,2435922.0,4.270483,4.466373,4.464182,4.565046,5.195246,...,P14174_M2,0,MPMFIVNTNVPRASVPDGFLSELTQQLAQATGKPPQYIAVHVVPDQ...,1,11,P,1,2,M,FIVNTNVPRASVPDGFLSEL
62,REELSNVLAAMRK,REELSNVLAAMRK,REELSNVLAAM[649.3660]RK,REELSNVLAAM[655.3735]RK,5732029.5,5.428009,5.202194,5.751889,5.142188,5.237488,...,Q9Y3U8_M96,0,MALRYPMAVGLNKGHKVTKNVSKPRHSRRRGRLTKHTKFVRDMIRE...,86,13,REELSNVLAA,10,96,VGTHIRAKRKREELSNVLAA,RKAAAKKD
66,KFMTNR,KFMTNR,KFM[649.3660]TNR,KFM[655.3735]TNR,10684478.0,5.411728,,5.597605,5.732497,,...,P62847_M12,0,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,10,6,KF,2,12,MNDTVTIRTRK,TNRLLQRKQMVIDVLHPGKA
77,AMEAVAAQGK,AMEAVAAQGK,AM[649.3660]EAVAAQGK,AM[655.3735]EAVAAQGK,4608274.0,6.565413,5.900183,6.117652,5.809629,6.190679,...,P18669_M242,0,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,241,10,A,1,242,NLKPIKPMQFLGDEETVRKA,EAVAAQGKAKK
82,TVTAMDVVYALK,TVTAMDVVYALK,TVTAM[649.3660]DVVYALK,TVTAM[655.3735]DVVYALK,9528581.0,6.309925,6.365998,7.13447,6.360899,6.318951,...,P62805_M84,0,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,80,12,TVTA,4,84,NVIRDAVTYTEHAKRKTVTA,DVVYALKRQGRTLYGFGG
98,AAQMHSGYQR,AAQMHSGYQR,AAQM[649.3660]HSGYQR,AAQM[655.3735]HSGYQR,2314374.0,7.164218,,7.539122,7.326026,6.758724,...,Q96PK6_M660,0,MKIFVGNVDGADTTPEELAALFAPYGTVMSCAVMKQFAFVHMRENA...,657,10,AAQ,3,660,AHSDYARYSGSYNDYLRAAQ,HSGYQRRM
107,VLLTMIAR,VLLTMIAR,VLLTM[649.3660]IAR,VLLTM[655.3735]IAR,168637.97,7.88474,,,7.046284,,...,O75396_M5,0,MVLLTMIARVADGLPLAASMQEDEQSGRDLQQYQSQAKQLFRKLNE...,1,8,VLLT,4,5,MVLL,IARVADGLPLAASMQEDEQS
109,NGMVLKPHFHK,NGMVLKPHFHK,NGM[649.3660]VLKPHFHK,NGM[655.3735]VLKPHFHK,509474.5,8.110635,7.099424,,,,...,P26373_M7,0,MAPSRNGMVLKPHFHKDWQRRVATWFNQPARKIRRRKARQAKARRI...,5,11,NG,2,7,MAPSRN,VLKPHFHKDWQRRVATWFNQ


### Download Alphafold Data - Hela

In [53]:
# Drop peptides from proteins that are invalid according to AlphaFold.
# This removes 10 peptides: 2 hyperreactive and 8 non-hyperreactive.

invalid_IDs = ["Q09666", "Q15149"]
is_invalid = peptides["Protein ID"].isin(invalid_IDs)

# Show what is being discarded, then keep only the remaining rows
display(peptides[is_invalid])
peptides = peptides[~is_invalid]
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
3,FKMPEMHFR,FKM[15.9949]PEMHFR,FKM[15.9949]PEM[649.3660]HFR,FKM[15.9949]PEM[655.3735]HFR,,,,,-2.202729,-2.219461,...,Q09666_M1059,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,1054,9,FKMPE,5,1059,DLSLEGPEGKLKGPKFKMPE,HFRAPKMSLPDVDLDLKGPK
7,MGIVGPEFK,MGIVGPEFK,M[649.3660]GIVGPEFK,M[655.3735]GIVGPEFK,2765434.0,1.282817,1.447835,2.76351,,1.001939,...,Q15149_M4129,1,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...,4129,9,,0,4129,TGYVIDPIKGLKLTVEEAVR,GIVGPEFKDKLLSAERAVTG
21,AEMEVLLASK,AEMEVLLASK,AEM[649.3660]EVLLASK,AEM[655.3735]EVLLASK,593212.9,4.49196,2.663923,2.562397,2.526014,2.155676,...,Q15149_M1900,0,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...,1898,10,AE,2,1900,AAATQKRQELEAELAKVRAE,EVLLASKARAEEESRSTSEK
55,IPMPDFDLHLKGPK,IPMPDFDLHLKGPK,IPM[649.3660]PDFDLHLKGPK,IPM[655.3735]PDFDLHLKGPK,,,4.915636,5.208143,,,...,Q09666_M2964,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,2962,14,IP,2,2964,KLKGPKFKMPEMNIKAPKIP,PDFDLHLKGPKVKGDVDISL
84,FKMPDVHFK,FKMPDVHFK,FKM[649.3660]PDVHFK,FKM[655.3735]PDVHFK,134584.19,6.969387,,,,6.162996,...,Q09666_M4508,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,4506,9,FK,2,4508,KGPEVDIEGPEGKLKGPKFK,PDVHFKSPQISMSDIDLNLK
89,GDMDISLPK,GDMDISLPK,GDM[649.3660]DISLPK,GDM[655.3735]DISLPK,240535.12,6.42828,7.388692,,,,...,Q09666_M4610,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,4608,9,GD,2,4610,PKISMPEVDLNLKGPKVKGD,DISLPKVEGDLKGPEVDIRD
101,FKMPEMNIKAPK,FKMPEM[15.9949]NIKAPK,FKM[649.3660]PEM[15.9949]NIKAPK,FKM[655.3735]PEM[15.9949]NIKAPK,732931.56,7.710224,6.803463,,7.157932,7.28223,...,Q09666_M935,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,933,12,FK,2,935,EVPDVNIEGPEGKLKGPKFK,PEMNIKAPKISMPDVDLHMK
114,FSMPGFK,FSMPGFK,FSM[649.3660]PGFK,FSM[655.3735]PGFK,3485137.8,8.044017,,8.594781,,,...,Q09666_M886,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,884,7,FS,2,886,EVQGPDWHLKMPKMKMPKFS,PGFKAEGPEVDVNLPKADVD
116,VDIDAPDVDVHGPDWHLKMPK,VDIDAPDVDVHGPDWHLKMPK,VDIDAPDVDVHGPDWHLKM[649.3660]PK,VDIDAPDVDVHGPDWHLKM[655.3735]PK,,,8.236843,8.6281,,,...,Q09666_M748,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,730,21,VDIDAPDVDVHGPDWHLK,18,748,PKVDIDAPDVDVHGPDWHLK,PKMKMPKFSVPGFKAEGPEV
121,ISMPDFDLHLKGPK,ISMPDFDLHLKGPK,ISM[649.3660]PDFDLHLKGPK,ISM[655.3735]PDFDLHLKGPK,,,,,8.750263,9.232076,...,Q09666_M2580,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,2578,14,IS,2,2580,KLKGPKLKMPEMNIKAPKIS,PDFDLHLKGPKVKGDVDVSL


Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,...,P51532_M1232,1,MSTPDPPLGGTPRPGPSPGPGPSPGAMLGPSPGPSPGSAHSMMGPS...,1227,10,VIQAG,5,1232,KILAAAKYKLNVDQKVIQAG,FDQKSSSHERRAFLQAILEH
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,...,O75165_M723,1,MNIIRENKDLACFYTTKHSWRGKYKRVFSVGTHAITTYNPNTLEVT...,718,9,VDLVL,5,723,AGKAAKEVEKFAKEKVDLVL,HWRDRMGIAQKENINQKPVV
4,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,...,Q16204_M149,1,MADSASESDTDGAGGNSSSSAAMQSSCSSTSGGGGGGGGGGGGGKS...,147,9,KL,2,149,LAVNYEKEEEFLTNELSRKL,QLQHEKAELEQHLEQEQEFQ
5,NKHEAMITDLEER,NKHEAMITDLEER,NKHEAM[649.3660]ITDLEER,NKHEAM[655.3735]ITDLEER,11979429.0,1.277815,1.576447,1.391734,,0.748684,...,P35579_M1027,1,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,1022,13,NKHEA,5,1027,LTEEEEKSKSLAKLKNKHEA,ITDLEERLRREEKQRQELEK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,877354.7,7.774908,8.821908,9.482570,,8.929387,...,P11142_M92,0,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,88,14,HWPF,4,92,LIGRRFDDAVVQSDMKHWPF,VVNDAGRPKVQVEYKGETKS
120,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,...,Q07666_M20,0,MQRRDDPAARMSRSSGRSGSMDPSGAHPSVRQTPSRQPPLPHRSRG...,17,14,SGS,3,20,MQRRDDPAARMSRSSGRSGS,DPSGAHPSVRQTPSRQPPLP
122,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,...,P83731_M126,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,124,7,TA,2,126,QAIRAAKEAKKAKQASKKTA,AAAKAPTKAAPKQKIVKPVK
123,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,...,P68104_M275,0,MGKEKTHINIVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEA...,266,24,VETGVLKPG,9,275,IGGIGTVPVGRVETGVLKPG,VVTFAPVNVTTEVKSVEMHH


In [54]:
# Recompute the UniProt IDs from the current `peptides` table (invalid proteins were
# dropped in the previous cell) for use in the AlphaFold steps below
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['P51532' 'P52272' 'O75165' 'Q16204' 'P35579' 'O94992' 'Q9NWH9' 'Q7LBR1'
 'O60610' 'P49257' 'Q15366' 'Q9UKD2' 'P0CG12' 'Q9UNZ5' 'Q01518' 'Q86UP2'
 'Q99459' 'P14314' 'Q16181' 'P25786' 'P43243' 'P54886' 'Q13813' 'Q15233'
 'P67870' 'P11940' 'Q07065' 'P26583' 'P10809' 'P62195' 'P08238' 'P33240'
 'P07108' 'Q04323' 'O60749' 'Q16543' 'P49915' 'P14174' 'Q13283' 'O75533'
 'Q9NYF8' 'Q8N6H7' 'P50990' 'P34932' 'Q96I24' 'Q8NE71' 'Q9C0J8' 'P14625'
 'P15170' 'Q14683' 'P41227' 'Q9Y3U8' 'P07900' 'P06703' 'P62847' 'O95817'
 'P00966' 'Q9Y3Y2' 'Q12931' 'Q9UHX1' 'P38646' 'Q04837' 'P14866' 'P18669'
 'P33991' 'O60664' 'P31948' 'O14737' 'P62805' 'P22626' 'P62841' 'P07910'
 'P61247' 'P68032' 'Q15424' 'P83731' 'Q96PK6' 'P46777' 'P18583' 'P02545'
 'Q08211' 'P16949' 'O75396' 'P26373' 'P37802' 'Q15056' 'P84098' 'P15311'
 'Q14152' 'P18124' 'P11142' 'Q07666' 'P68104']
Number of Unique UniProt IDs: 93


In [55]:
# Fetch AlphaFold cif files for the selected proteins into `cif_dir`.
# SLOW THE FIRST TIME - the relevant cif data is cached, so later runs are quick.
# The call returns three collections, bound here as valid / invalid / existing proteins.
(
    valid_proteins_cif,
    invalid_proteins_cif,
    existing_proteins_cif,
) = download_alphafold_cif(proteins=unique_uniprotIDs, out_folder=cif_dir)

100%|██████████| 93/93 [00:00<00:00, 98007.61it/s]

2025-03-24 19:26:00> Valid proteins: 0
2025-03-24 19:26:00> Invalid proteins: 0
2025-03-24 19:26:00> Existing proteins: 93





In [56]:
# Fetch AlphaFold pae files for the selected proteins into `pae_dir`.
# SLOW THE FIRST TIME - the relevant pae data is cached, so later runs are quick.
# The call returns three collections, bound here as valid / invalid / existing proteins.
(
    valid_proteins_pae,
    invalid_proteins_pae,
    existing_proteins_pae,
) = download_alphafold_pae(proteins=unique_uniprotIDs, out_folder=pae_dir)

100%|██████████| 93/93 [00:00<00:00, 103111.36it/s]

2025-03-24 19:26:00> Valid proteins: 0
2025-03-24 19:26:00> Invalid proteins: 0
2025-03-24 19:26:00> Existing proteins: 93





### Calculate Accessibilities and Merge into Full Dataset - Hela

In [57]:
# Radii passed to calculate_accessibilities; they show up in the nAA_<radius>_... column names
radii = [
    2, 3, 4, 4.5, 5, 5.5, 6,
    6.5, 7, 7.5, 8, 12, 18, 24,
]
smooth_accessibilities = calculate_accessibilities(
    cif_dir,
    pae_dir,
    unique_uniprotIDs,
    radii,
)
# Shift positions down by one (zero-index) so they match the locations in `peptides`
smooth_accessibilities["position"] -= 1
smooth_accessibilities

100%|██████████| 1986/1986 [00:08<00:00, 238.47it/s]
100%|██████████| 93/93 [00:01<00:00, 92.54it/s] 
100%|██████████| 93/93 [00:00<00:00, 95.16it/s] 
100%|██████████| 93/93 [00:00<00:00, 94.76it/s] 
100%|██████████| 93/93 [00:00<00:00, 95.80it/s] 
100%|██████████| 93/93 [00:00<00:00, 94.79it/s] 
100%|██████████| 93/93 [00:00<00:00, 94.64it/s] 
100%|██████████| 93/93 [00:00<00:00, 93.70it/s] 
100%|██████████| 93/93 [00:01<00:00, 89.57it/s]
100%|██████████| 93/93 [00:01<00:00, 91.67it/s] 
100%|██████████| 93/93 [00:01<00:00, 91.97it/s]
100%|██████████| 93/93 [00:01<00:00, 91.84it/s]
100%|██████████| 93/93 [00:01<00:00, 85.07it/s] 
100%|██████████| 93/93 [00:01<00:00, 71.81it/s]
100%|██████████| 93/93 [00:01<00:00, 59.82it/s]
100%|██████████| 93/93 [00:00<00:00, 748.03it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,O14737,1,M,0,60.98,28.087,27.636,28.803,26.605,-14.869,...,1.272727,1.818182,2.090909,2.818182,4.272727,4.363636,8.181818,12.636364,16.272727,1
1,O14737,1,A,1,65.86,28.148,29.294,30.330,28.837,-11.969,...,1.333333,1.833333,2.166667,2.833333,4.416667,4.500000,8.333333,12.916667,16.666667,1
2,O14737,1,D,2,67.03,25.029,25.896,25.064,27.097,-11.432,...,1.384615,1.846154,2.153846,2.769231,4.384615,4.538462,8.461538,13.153846,17.000000,1
3,O14737,1,E,3,74.70,24.793,24.184,24.089,24.964,-11.799,...,1.428571,1.857143,2.214286,2.857143,4.500000,4.642857,8.642857,13.428571,17.285714,1
4,O14737,1,E,4,73.32,26.617,26.792,28.283,26.125,-9.380,...,1.466667,1.866667,2.200000,2.866667,4.600000,4.800000,8.800000,13.600000,17.533333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,Q9Y3Y2,93,P,243,57.92,36.105,34.827,34.245,35.160,16.663,...,1.066667,1.800000,2.000000,2.666667,3.200000,3.733333,6.466667,10.466667,13.733333,1
244,Q9Y3Y2,93,E,244,55.52,36.428,37.074,37.102,36.299,16.150,...,1.000000,1.785714,2.000000,2.714286,3.214286,3.571429,6.214286,10.000000,13.285714,1
245,Q9Y3Y2,93,T,245,51.57,36.300,37.224,38.704,36.908,17.910,...,0.923077,1.769231,1.846154,2.461538,3.000000,3.230769,5.923077,9.615385,12.846154,1
246,Q9Y3Y2,93,N,246,47.46,35.487,34.657,33.162,35.162,20.364,...,0.833333,1.583333,1.666667,2.250000,2.666667,2.833333,5.583333,9.083333,12.250000,1


In [58]:
# Attach the per-residue AlphaFold accessibility features to each Met site.
# A site is matched on (protein, zero-indexed residue position); the left join keeps
# every peptide row even when no matching residue exists.
site_keys = ["Protein ID", f"{amino_acid_str} Location"]
accessibility_keys = ["protein_id", "position"]
peptides = peptides.merge(
    smooth_accessibilities,
    left_on=site_keys,
    right_on=accessibility_keys,
    how="left",
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,...,1.523810,2.000000,2.000000,2.047619,2.476190,3.619048,7.714286,12.857143,17.428571,1
3,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,...,2.000000,2.142857,5.142857,6.285714,8.000000,8.000000,13.952381,20.190476,28.000000,1
4,NKHEAMITDLEER,NKHEAMITDLEER,NKHEAM[649.3660]ITDLEER,NKHEAM[655.3735]ITDLEER,11979429.0,1.277815,1.576447,1.391734,,0.748684,...,1.523810,2.285714,2.904762,3.904762,5.476190,6.571429,12.380952,20.333333,27.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,877354.7,7.774908,8.821908,9.482570,,8.929387,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
111,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.333333,9.952381,1
112,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
113,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,...,2.285714,2.809524,3.904762,5.571429,7.333333,9.047619,24.619048,72.095238,124.761905,0


In [59]:
# Sanity Check: ensure UniProt and AlphaFold sequences are the same.
# "AA" comes from the accessibility table joined on (protein, position), so every row
# should equal `amino_acid`.
# dropna=False keeps rows whose "AA" is missing (NaN, i.e. the left merge found no match)
# in the tally; the default value_counts() drops them, which would hide unmatched sites.
peptides["AA"].value_counts(dropna=False)

# To inspect mismatching rows and then drop them, uncomment:
#pd.set_option("display.max_columns", None)
#display(peptides[~(peptides["AA"] == amino_acid)])
#pd.reset_option("display.max_columns")
#peptides = peptides[(peptides["AA"] == amino_acid)]

AA
M    115
Name: count, dtype: int64

In [60]:
# Disabled export of `peptides` to "Hela_processed.csv" inside curr_dir_path.
# NOTE(review): presumably re-enabled only when the processed file must be regenerated — confirm.
#peptides.to_csv(os.path.join(curr_dir_path, "Hela_processed.csv"))

In [61]:
# Read the processed Hela table back from disk, using its "Unnamed: 0" column
# as an unnamed row index.
path = os.path.join(curr_dir_path, "Hela_processed.csv")
peptides = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,...,1.523810,2.000000,2.000000,2.047619,2.476190,3.619048,7.714286,12.857143,17.428571,1
3,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,...,2.000000,2.142857,5.142857,6.285714,8.000000,8.000000,13.952381,20.190476,28.000000,1
4,NKHEAMITDLEER,NKHEAMITDLEER,NKHEAM[649.3660]ITDLEER,NKHEAM[655.3735]ITDLEER,11979429.0,1.277815,1.576447,1.391734,,0.748684,...,1.523810,2.285714,2.904762,3.904762,5.476190,6.571429,12.380952,20.333333,27.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,877354.7,7.774908,8.821908,9.482570,,8.929387,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
111,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.333333,9.952381,1
112,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
113,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,...,2.285714,2.809524,3.904762,5.571429,7.333333,9.047619,24.619048,72.095238,124.761905,0


## Jurkat

### Load and Process Dataset - Jurkat

In [62]:
# Read the initial Jurkat isoTOP-ABPP table from the current directory
data_loc = os.path.join(curr_dir_path, "Jurkat_hyperreactivity.csv")
peptides = pd.read_csv(filepath_or_buffer=data_loc)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,-4.218085,2.500159e-02,PABPC1_M170
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,Q04323,UBXN1_HUMAN,UBXN1,UBX domain-containing protein 1,-2.229845,2.089821e-02,UBXN1_M143
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,O94979,SC31A_HUMAN,SEC31A,Protein transport protein Sec31A,-0.836398,5.100497e-03,SEC31A_M823
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,Q8WUH6,TM263_HUMAN,TMEM263,Transmembrane protein 263,-0.650123,2.459445e-02,TMEM263_M34
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,Q9BWF3,RBM4_HUMAN,RBM4,RNA-binding protein 4,0.797878,2.120192e-04,RBM4_M347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.292534,2.231360e-02,PABPC1_M584
112,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,EIF3A,Eukaryotic translation initiation factor 3 sub...,8.364207,3.007041e-07,
113,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,P11142,HSP7C_HUMAN,HSPA8,Heat shock cognate 71 kDa protein,8.428318,1.615362e-05,HSPA8_M93
114,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,P26038,MOES_HUMAN,MSN,Moesin,9.338602,2.122074e-02,MSN_M305


In [63]:
# Count the missing (NaN) entries in every column of the dataset
missing_per_column = peptides.isna().sum()
missing_per_column

Peptide Sequence           0
Modified Peptide           0
Light Modified Peptide     0
Heavy Modified Peptide     0
exp_1 Log2 Ratio HL       41
exp_2 Log2 Ratio HL       41
exp_3 Log2 Ratio HL       40
exp_4 Log2 Ratio HL       47
exp_5 Log2 Ratio HL       33
Protein                    0
Protein ID                 0
Entry Name                 0
Gene                       0
Protein Description        0
Average H/L                0
P-value                    0
Site                       2
dtype: int64

In [64]:
# Sanity Check: each light/heavy modified peptide should contain the desired
# modification exactly once (a single count bucket of 1 per column)
modifications_pattern = create_modifications_pattern(amino_acid, modifications)
print(modifications_pattern)
for modified_column in ("Light Modified Peptide", "Heavy Modified Peptide"):
    display(peptides[modified_column].str.count(modifications_pattern).value_counts())

M\[649\.3660\]|M\[655\.3735\]


Light Modified Peptide
1    116
Name: count, dtype: int64

Heavy Modified Peptide
1    116
Name: count, dtype: int64

In [65]:
# Label every Met site: 1 (hyperreactive) when "Average H/L" <= 2, otherwise 0

is_hyperreactive = peptides["Average H/L"] <= 2
peptides["Hyperreactive"] = np.where(is_hyperreactive, 1, 0)

# Optional: display the full labelled table (left disabled)
#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")

In [66]:
# Update (if necessary) and then load the sequence cache, which maps UniProt IDs
# to full protein sequences

unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

path = os.path.join(global_data_path, "complete_sequence_cache.csv")
update_sequence_cache(path, unique_uniprotIDs)

# Re-read the cache from disk after the update call, with an unnamed row index
sequence_cache_df_updated = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
sequence_cache_df_updated;

Unique UniProt IDs: 
['P11940' 'Q04323' 'O94979' 'Q8WUH6' 'Q9BWF3' 'Q99459' 'P35579' 'Q7LBR1'
 'P07954' 'P50991' 'Q03252' 'P61011' 'P62829' 'P16150' 'Q86UP2' 'P52272'
 'Q9H2K8' 'Q15287' 'Q13310' 'P54886' 'Q13813' 'O60610' 'Q15233' 'P08670'
 'P67870' 'Q9Y244' 'P43243' 'Q9H7E9' 'Q9Y580' 'P49750' 'Q16181' 'P08238'
 'P25786' 'Q9NYF8' 'Q96I24' 'Q15435' 'Q9H444' 'Q01518' 'Q15773' 'P61978'
 'O43776' 'P41227' 'P26038' 'Q9Y3U8' 'Q13283' 'P23246' 'P61758' 'Q9NZM3'
 'P52566' 'O43837' 'P69905' 'P10809' 'P07108' 'P26583' 'P62847' 'Q6ZR08'
 'O60664' 'P07910' 'P31948' 'Q92665' 'P18669' 'P55072' 'P14866' 'O14737'
 'P22626' 'P35637' 'Q9BQ61' 'P53999' 'Q96PK6' 'P14317' 'P38646' 'P62805'
 'P62736' 'P83731' 'P16949' 'P46777' 'P62841' 'Q15056' 'P61247' 'P68363'
 'P37802' 'Q9Y266' 'P84098' 'Q15424' 'Q9Y4L1' 'P07437' 'Q07666' 'Q14152'
 'P11142']
Number of Unique UniProt IDs: 89
   Protein ID                                  Complete Sequence
0      Q8C196  MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1 

In [67]:
# Attach each peptide's full protein sequence ("Complete Sequence") by UniProt ID.
# A left merge keeps every peptide row, but duplicate "Protein ID" entries in the
# cache would silently duplicate peptides, so check the row count before
# replacing `peptides`.
peptides_with_sequences = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
assert len(peptides_with_sequences) == len(peptides), (
    "Sequence cache has duplicate 'Protein ID' entries; merge changed the number of peptide rows"
)
peptides = peptides_with_sequences
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site,Hyperreactive,Complete Sequence
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,-4.218085,2.500159e-02,PABPC1_M170,1,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,Q04323,UBXN1_HUMAN,UBXN1,UBX domain-containing protein 1,-2.229845,2.089821e-02,UBXN1_M143,1,MAELTALESLIEMGFPRGRAEKALALTGNQGIEAAMDWLMEHEDDP...
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,O94979,SC31A_HUMAN,SEC31A,Protein transport protein Sec31A,-0.836398,5.100497e-03,SEC31A_M823,1,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,Q8WUH6,TM263_HUMAN,TMEM263,Transmembrane protein 263,-0.650123,2.459445e-02,TMEM263_M34,1,MNQTDKNQQEIPSYLNDEPPEGSMKDHPQQQPGMLSRVTGGIFSVT...
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,Q9BWF3,RBM4_HUMAN,RBM4,RNA-binding protein 4,0.797878,2.120192e-04,RBM4_M347,1,MVKLFIGNLPREATEQEIRSLFEQYGKVLECDIIKNYGFVHIEDKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.292534,2.231360e-02,PABPC1_M584,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...
112,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,EIF3A,Eukaryotic translation initiation factor 3 sub...,8.364207,3.007041e-07,,0,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...
113,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,P11142,HSP7C_HUMAN,HSPA8,Heat shock cognate 71 kDa protein,8.428318,1.615362e-05,HSPA8_M93,0,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...
114,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,P26038,MOES_HUMAN,MSN,Moesin,9.338602,2.122074e-02,MSN_M305,0,MPKTISVRVTTMDAELEFAIQPNTTGKQLFDQVVKTIGLREVWFFG...


In [68]:
# Run the dataset processing step that extracts peptide and Met site locations
peptides = process_dataset(
    peptides,
    amino_acid,
    amino_acid_str,
    analysis_threshold,
    modifications,
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,...,P11940_M160,1,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,157,10,MNG,3,160,GFVHFETQEAAERAIEKMNG,LLNDRKVFVGRFKSRKEREA
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,...,Q04323_M142,1,MAELTALESLIEMGFPRGRAEKALALTGNQGIEAAMDWLMEHEDDP...,135,9,QRLQEDE,7,142,QRRRQGQELSAARQRLQEDE,RRAAEERRREKAEELAARQR
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,...,O94979_M822,1,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,811,14,GRPGPVAGHHQ,11,822,PYEKQQLPKGRPGPVAGHHQ,PRVQTQQYYPHGENPPPPGF
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,...,Q8WUH6_M33,1,MNQTDKNQQEIPSYLNDEPPEGSMKDHPQQQPGMLSRVTGGIFSVT...,25,12,DHPQQQPG,8,33,YLNDEPPEGSMKDHPQQQPG,LSRVTGGIFSVTKGAVGATI
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,...,Q9BWF3_M346,1,MVKLFIGNLPREATEQEIRSLFEQYGKVLECDIIKNYGFVHIEDKT...,341,8,NSLYD,5,346,YGHESELSQASAAARNSLYD,ARYEREQYADRARYSAF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,...,P11940_M583,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,580,24,ITG,3,583,ERLFPLIQAMHPTLAGKITG,LLEIDNSELLHMLESPESLR
112,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,...,Q14152_M961,0,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
113,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,...,P11142_M92,0,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,88,14,HWPF,4,92,LIGRRFDDAVVQSDMKHWPF,VVNDAGRPKVQVEYKGETKS
114,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,...,P26038_M304,0,MPKTISVRVTTMDAELEFAIQPNTTGKQLFDQVVKTIGLREVWFFG...,294,12,RKPDTIEVQQ,10,304,MGNHELYMRRRKPDTIEVQQ,KAQAREEKHQKQMERAMLEN


In [69]:
# Sanity Check: slicing each complete sequence at the recorded peptide location
# and length must reproduce the peptide sequence
temp = [
    sequence[start:start + length]
    for sequence, start, length in zip(
        peptides["Complete Sequence"],
        peptides["Peptide Location"],
        peptides["Peptide Length"],
    )
]
(temp == peptides["Peptide Sequence"]).value_counts()

Peptide Sequence
True    116
Name: count, dtype: int64

In [70]:
# Sanity Check: the residue found at every recorded site location should be
# the target amino acid
site_locations = peptides[f"{amino_acid_str} Location"]
temp = [sequence[site] for sequence, site in zip(peptides["Complete Sequence"], site_locations)]
pd.Series(temp).value_counts()

M    116
Name: count, dtype: int64

In [71]:
# NOTE: some Met sites do not have a full `analysis_threshold` amino acids to either side.
# The expected window length comes from analysis_threshold (not a literal 20) so this
# filter stays consistent with the "Left N"/"Right N" column names if the threshold changes.
# NOTE(review): in this cell's saved output, truncated left windows look one residue
# short (e.g. "MA" for a site at index 3 of "MADM..."); confirm the slicing in process_dataset.
left_window_len = peptides[f"Left {analysis_threshold}"].str.len()
right_window_len = peptides[f"Right {analysis_threshold}"].str.len()
peptides[(left_window_len != analysis_threshold) | (right_window_len != analysis_threshold)]

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,...,Q9BWF3_M346,1,MVKLFIGNLPREATEQEIRSLFEQYGKVLECDIIKNYGFVHIEDKT...,341,8,NSLYD,5,346,YGHESELSQASAAARNSLYD,ARYEREQYADRARYSAF
37,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,3.457075,4.645536,4.129323,,,sp|Q15435|PP1R7_HUMAN,...,Q15435_M343,0,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...,342,9,V,1,343,ETVYLERNPLQKDPQYRRKV,LALPSVRQIDATFVRF
39,ADMQNLVER,n[42.0106]ADMQNLVER,n[42.0106]ADM[649.3660]QNLVER,n[42.0106]ADM[655.3735]QNLVER,3.950077,,4.651855,,3.781344,sp|Q01518|CAP1_HUMAN,...,Q01518_M3,0,MADMQNLVERLERAVGRLEAVSHTSDMHRGYADSPSKAGAAPYVQA...,1,9,AD,2,3,MA,QNLVERLERAVGRLEAVSHT
47,REELSNVLAAMRK,REELSNVLAAMRK,REELSNVLAAM[649.3660]RK,REELSNVLAAM[655.3735]RK,4.681375,4.706925,4.613771,4.559984,4.687803,sp|Q9Y3U8|RL36_HUMAN,...,Q9Y3U8_M96,0,MALRYPMAVGLNKGHKVTKNVSKPRHSRRRGRLTKHTKFVRDMIRE...,86,13,REELSNVLAA,10,96,VGTHIRAKRKREELSNVLAA,RKAAAKKD
58,REELSNVLAAMR,REELSNVLAAMR,REELSNVLAAM[649.3660]R,REELSNVLAAM[655.3735]R,,5.870613,,5.774773,4.395039,sp|Q9Y3U8|RL36_HUMAN,...,Q9Y3U8_M96,0,MALRYPMAVGLNKGHKVTKNVSKPRHSRRRGRLTKHTKFVRDMIRE...,86,12,REELSNVLAA,10,96,VGTHIRAKRKREELSNVLAA,RKAAAKKD
62,KFMTNR,KFMTNR,KFM[649.3660]TNR,KFM[655.3735]TNR,6.116281,,5.161891,,5.68533,sp|P62847|RS24_HUMAN,...,P62847_M12,0,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,10,6,KF,2,12,MNDTVTIRTRK,TNRLLQRKQMVIDVLHPGKA
73,AMEAVAAQGK,AMEAVAAQGK,AM[649.3660]EAVAAQGK,AM[655.3735]EAVAAQGK,6.407383,5.796616,6.112338,6.081613,6.14116,sp|P18669|PGAM1_HUMAN,...,P18669_M242,0,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,241,10,A,1,242,NLKPIKPMQFLGDEETVRKA,EAVAAQGKAKK
80,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,6.694228,6.314024,6.335872,6.454918,,sp|P35637|FUS_HUMAN,...,P35637_M510,0,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,510,11,,0,510,RGGFRGGRGGGDRGGFGPGK,DSRGEHRQDRRERPY
86,AAQMHSGYQR,AAQMHSGYQR,AAQM[649.3660]HSGYQR,AAQM[655.3735]HSGYQR,7.082546,,7.163135,6.921033,6.510018,sp|Q96PK6|RBM14_HUMAN,...,Q96PK6_M660,0,MKIFVGNVDGADTTPEELAALFAPYGTVMSCAVMKQFAFVHMRENA...,657,10,AAQ,3,660,AHSDYARYSGSYNDYLRAAQ,HSGYQRRM
89,TVTAMDVVYALK,TVTAMDVVYALK,TVTAM[649.3660]DVVYALK,TVTAM[655.3735]DVVYALK,7.038894,7.010861,6.96988,7.064174,7.080184,sp|P62805|H4_HUMAN,...,P62805_M84,0,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,80,12,TVTA,4,84,NVIRDAVTYTEHAKRKTVTA,DVVYALKRQGRTLYGFGG


### Download Alphafold Data - Jurkat

In [72]:
# Drop proteins that are invalid according to AlphaFold
# (this removes 1 peptide, which is not hyperreactive)

invalid_IDs = ['Q6ZR08']
has_invalid_protein = peptides["Protein ID"].isin(invalid_IDs)
display(peptides[has_invalid_protein])  # show what is about to be removed
peptides = peptides[~has_invalid_protein]
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
67,ENELMAK,ENELMAK,ENELM[649.3660]AK,ENELM[655.3735]AK,,,,5.854725,5.908989,sp|Q6ZR08|DYH12_HUMAN,...,Q6ZR08_M605,0,MSDANKAAIAAEKEALNLKLPPIVHLPENIGVDTPTQSKLLKYRRS...,601,7,ENEL,4,605,IFDENDELIENAKHKKENEL,AKREKLILEIEKESRRMEEF


Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,...,P11940_M160,1,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,157,10,MNG,3,160,GFVHFETQEAAERAIEKMNG,LLNDRKVFVGRFKSRKEREA
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,...,Q04323_M142,1,MAELTALESLIEMGFPRGRAEKALALTGNQGIEAAMDWLMEHEDDP...,135,9,QRLQEDE,7,142,QRRRQGQELSAARQRLQEDE,RRAAEERRREKAEELAARQR
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,...,O94979_M822,1,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,811,14,GRPGPVAGHHQ,11,822,PYEKQQLPKGRPGPVAGHHQ,PRVQTQQYYPHGENPPPPGF
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,...,Q8WUH6_M33,1,MNQTDKNQQEIPSYLNDEPPEGSMKDHPQQQPGMLSRVTGGIFSVT...,25,12,DHPQQQPG,8,33,YLNDEPPEGSMKDHPQQQPG,LSRVTGGIFSVTKGAVGATI
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,...,Q9BWF3_M346,1,MVKLFIGNLPREATEQEIRSLFEQYGKVLECDIIKNYGFVHIEDKT...,341,8,NSLYD,5,346,YGHESELSQASAAARNSLYD,ARYEREQYADRARYSAF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,...,P11940_M583,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,580,24,ITG,3,583,ERLFPLIQAMHPTLAGKITG,LLEIDNSELLHMLESPESLR
112,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,...,Q14152_M961,0,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
113,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,...,P11142_M92,0,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,88,14,HWPF,4,92,LIGRRFDDAVVQSDMKHWPF,VVNDAGRPKVQVEYKGETKS
114,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,...,P26038_M304,0,MPKTISVRVTTMDAELEFAIQPNTTGKQLFDQVVKTIGLREVWFFG...,294,12,RKPDTIEVQQ,10,304,MGNHELYMRRRKPDTIEVQQ,KAQAREEKHQKQMERAMLEN


In [73]:
# Collect the distinct UniProt IDs currently in `peptides` for the AlphaFold steps
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['P11940' 'Q04323' 'O94979' 'Q8WUH6' 'Q9BWF3' 'Q99459' 'P35579' 'Q7LBR1'
 'P07954' 'P50991' 'Q03252' 'P61011' 'P62829' 'P16150' 'Q86UP2' 'P52272'
 'Q9H2K8' 'Q15287' 'Q13310' 'P54886' 'Q13813' 'O60610' 'Q15233' 'P08670'
 'P67870' 'Q9Y244' 'P43243' 'Q9H7E9' 'Q9Y580' 'P49750' 'Q16181' 'P08238'
 'P25786' 'Q9NYF8' 'Q96I24' 'Q15435' 'Q9H444' 'Q01518' 'Q15773' 'P61978'
 'O43776' 'P41227' 'P26038' 'Q9Y3U8' 'Q13283' 'P23246' 'P61758' 'Q9NZM3'
 'P52566' 'O43837' 'P69905' 'P10809' 'P07108' 'P26583' 'P62847' 'O60664'
 'P07910' 'P31948' 'Q92665' 'P18669' 'P55072' 'P14866' 'O14737' 'P22626'
 'P35637' 'Q9BQ61' 'P53999' 'Q96PK6' 'P14317' 'P38646' 'P62805' 'P62736'
 'P83731' 'P16949' 'P46777' 'P62841' 'Q15056' 'P61247' 'P68363' 'P37802'
 'Q9Y266' 'P84098' 'Q15424' 'Q9Y4L1' 'P07437' 'Q07666' 'Q14152' 'P11142']
Number of Unique UniProt IDs: 88


In [74]:
# Fetch AlphaFold cif files for the selected proteins into cif_dir
# (SLOW THE FIRST TIME - the downloaded cif data is cached there)
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    out_folder=cif_dir,
    proteins=unique_uniprotIDs,
)

100%|██████████| 88/88 [00:00<00:00, 100929.38it/s]

2025-03-24 19:26:24> Valid proteins: 0
2025-03-24 19:26:24> Invalid proteins: 0
2025-03-24 19:26:24> Existing proteins: 88





In [75]:
# Fetch AlphaFold pae files for the selected proteins into pae_dir
# (SLOW THE FIRST TIME - the downloaded pae data is cached there)
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    out_folder=pae_dir,
    proteins=unique_uniprotIDs,
)

100%|██████████| 88/88 [00:00<00:00, 88534.12it/s]

2025-03-24 19:26:24> Valid proteins: 0
2025-03-24 19:26:24> Invalid proteins: 0
2025-03-24 19:26:24> Existing proteins: 88





### Calculate Accessibilities and Merge into Full Dataset - Jurkat

In [76]:
# Radii passed to calculate_accessibilities for the selected proteins
radii = [2, 3, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 12, 18, 24]
smooth_accessibilities = calculate_accessibilities(cif_dir, pae_dir, unique_uniprotIDs, radii)
# Shift positions to zero-based indexing so they match the initial dataframe
smooth_accessibilities["position"] -= 1
smooth_accessibilities

100%|██████████| 1986/1986 [00:06<00:00, 285.35it/s]
100%|██████████| 88/88 [00:00<00:00, 116.65it/s]
100%|██████████| 88/88 [00:00<00:00, 121.65it/s]
100%|██████████| 88/88 [00:00<00:00, 121.13it/s]
100%|██████████| 88/88 [00:00<00:00, 120.79it/s]
100%|██████████| 88/88 [00:00<00:00, 118.57it/s]
100%|██████████| 88/88 [00:00<00:00, 118.67it/s]
100%|██████████| 88/88 [00:00<00:00, 106.47it/s]
100%|██████████| 88/88 [00:00<00:00, 116.52it/s]
100%|██████████| 88/88 [00:00<00:00, 106.09it/s]
100%|██████████| 88/88 [00:00<00:00, 113.02it/s]
100%|██████████| 88/88 [00:00<00:00, 111.21it/s]
100%|██████████| 88/88 [00:00<00:00, 102.39it/s]
100%|██████████| 88/88 [00:01<00:00, 87.71it/s] 
100%|██████████| 88/88 [00:01<00:00, 72.67it/s]
100%|██████████| 88/88 [00:00<00:00, 783.85it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,O14737,1,M,0,60.98,28.087,27.636,28.803,26.605,-14.869,...,1.272727,1.818182,2.090909,2.818182,4.272727,4.363636,8.181818,12.636364,16.272727,1
1,O14737,1,A,1,65.86,28.148,29.294,30.330,28.837,-11.969,...,1.333333,1.833333,2.166667,2.833333,4.416667,4.500000,8.333333,12.916667,16.666667,1
2,O14737,1,D,2,67.03,25.029,25.896,25.064,27.097,-11.432,...,1.384615,1.846154,2.153846,2.769231,4.384615,4.538462,8.461538,13.153846,17.000000,1
3,O14737,1,E,3,74.70,24.793,24.184,24.089,24.964,-11.799,...,1.428571,1.857143,2.214286,2.857143,4.500000,4.642857,8.642857,13.428571,17.285714,1
4,O14737,1,E,4,73.32,26.617,26.792,28.283,26.125,-9.380,...,1.466667,1.866667,2.200000,2.866667,4.600000,4.800000,8.800000,13.600000,17.533333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Q9Y580,88,R,261,46.06,6.211,6.670,7.796,5.627,-12.540,...,0.933333,1.866667,1.866667,1.933333,1.933333,1.933333,3.200000,5.933333,8.866667,1
262,Q9Y580,88,S,262,51.68,8.008,6.961,5.681,6.695,-12.519,...,0.928571,1.857143,1.857143,1.928571,1.928571,1.928571,3.142857,5.785714,8.785714,1
263,Q9Y580,88,S,263,58.91,10.671,10.073,11.166,9.192,-14.554,...,0.923077,1.846154,1.846154,1.923077,1.923077,1.923077,3.076923,5.769231,8.692308,1
264,Q9Y580,88,R,264,56.10,11.019,10.694,9.753,10.045,-15.974,...,0.916667,1.833333,1.833333,1.916667,1.916667,1.916667,3.083333,5.750000,8.583333,1


In [77]:
# Attach the accessibility features for each modified site by matching
# (UniProt ID, zero-indexed site location) against (protein_id, position).
# how="left" keeps sites that have no matching residue (their feature columns are NaN).
# The row-count check guards against duplicate (protein_id, position) rows silently
# multiplying peptides.
peptides_with_accessibilities = peptides.merge(
    smooth_accessibilities,
    how="left",
    left_on=["Protein ID", f"{amino_acid_str} Location"],
    right_on=["protein_id", "position"],
)
assert len(peptides_with_accessibilities) == len(peptides), (
    "smooth_accessibilities has duplicate (protein_id, position) rows; "
    "merge changed the number of peptide rows"
)
peptides = peptides_with_accessibilities
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,...,2.095238,2.190476,3.333333,4.190476,5.809524,6.952381,16.095238,39.714286,65.952381,0
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,...,2.000000,2.333333,4.952381,6.333333,8.000000,8.000000,13.000000,20.238095,27.619048,1
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,...,1.095238,1.857143,1.857143,2.000000,2.000000,2.000000,3.857143,6.523810,9.666667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0
111,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1
112,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
113,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,...,2.000000,2.476190,4.619048,5.476190,6.809524,7.238095,12.619048,32.523810,64.285714,0


In [78]:
# Sanity Check: ensure UniProt and AlphaFold sequences are the same
# (every value in the "AA" column should be the target amino acid).
# dropna=False also tallies NaN, so sites left unmatched by the left merge with
# smooth_accessibilities appear in the summary instead of being dropped from it.

peptides["AA"].value_counts(dropna=False)
# Optional follow-up, left disabled: show the mismatching rows and drop them.
#pd.set_option("display.max_columns", None)
#display(peptides[~(peptides["AA"] == amino_acid)])
#pd.reset_option("display.max_columns")
#peptides = peptides[(peptides["AA"] == amino_acid)]

AA
M    115
Name: count, dtype: int64

In [79]:
# Persist the processed Jurkat dataset that the following cell reloads from disk.
# The export only happens when the file is missing: an existing export is never
# overwritten, while a fresh checkout can still run the notebook top to bottom.
jurkat_processed_path = os.path.join(curr_dir_path, "Jurkat_processed.csv")
if not os.path.exists(jurkat_processed_path):
    peptides.to_csv(jurkat_processed_path)

In [80]:
# Reload the processed Jurkat dataset; the saved index comes back as the
# "Unnamed: 0" column, so turn it back into an unnamed index.
path = os.path.join(curr_dir_path, "Jurkat_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,...,2.095238,2.190476,3.333333,4.190476,5.809524,6.952381,16.095238,39.714286,65.952381,0
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,...,2.000000,2.333333,4.952381,6.333333,8.000000,8.000000,13.000000,20.238095,27.619048,1
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,...,1.095238,1.857143,1.857143,2.000000,2.000000,2.000000,3.857143,6.523810,9.666667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0
111,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1
112,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
113,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,...,2.000000,2.476190,4.619048,5.476190,6.809524,7.238095,12.619048,32.523810,64.285714,0


## K562

### Load and Process Dataset - K562

In [81]:
# Read the raw K562 isoTOP-ABPP hyperreactivity table from the notebook directory
data_loc = os.path.join(curr_dir_path, "K562_hyperreactivity.csv")
peptides = pd.read_csv(filepath_or_buffer=data_loc)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,P51532,SMCA4_HUMAN,SMARCA4,Transcription activator BRG1,-4.850108,0.000100,SMARCA4_M1233
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,Q9ULU4,ZMYD8_HUMAN,ZMYND8,MYND-type zinc finger-containing chromatin rea...,-3.301584,0.024540,ZMYND8_M403
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,Q99856,ARI3A_HUMAN,ARID3A,AT-rich interactive domain-containing protein 3A,-2.323330,0.022125,ARID3A_M463
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,Q9H4L5,OSBL3_HUMAN,OSBPL3,Oxysterol-binding protein-related protein 3,-2.034630,0.009375,OSBPL3_M354
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,P35579,MYH9_HUMAN,MYH9,Myosin-9,-1.748038,0.011367,MYH9_M1565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.192060,0.000082,RPL24_M91
95,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,P05783,K1C18_HUMAN,KRT18,"Keratin, type I cytoskeletal 18",9.200564,0.009729,KRT18_M84
96,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,P62841,RS15_HUMAN,RPS15,Small ribosomal subunit protein uS19,9.508738,0.018870,RPS15_M70
97,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,EIF3A,Eukaryotic translation initiation factor 3 sub...,9.629465,0.000021,


In [82]:
# Count the missing (NaN) entries in every column of the dataset
peptides.isnull().sum(axis=0)

Peptide Sequence           0
Modified Peptide           0
Light Modified Peptide     0
Heavy Modified Peptide     0
exp_1 Log2 Ratio HL       43
exp_2 Log2 Ratio HL       40
exp_3 Log2 Ratio HL       42
exp_4 Log2 Ratio HL       38
exp_5 Log2 Ratio HL       38
Protein                    0
Protein ID                 0
Entry Name                 0
Gene                       0
Protein Description        0
Average H/L                0
P-value                    0
Site                       3
dtype: int64

In [83]:
# Sanity Check: every light/heavy peptide string should contain exactly one desired modification
modifications_pattern = create_modifications_pattern(amino_acid, modifications)
print(modifications_pattern)
# Tally the number of pattern matches per peptide, light column first, then heavy
for peptide_column in ("Light Modified Peptide", "Heavy Modified Peptide"):
    display(peptides[peptide_column].str.count(modifications_pattern).value_counts())

M\[649\.3660\]|M\[655\.3735\]


Light Modified Peptide
1    99
Name: count, dtype: int64

Heavy Modified Peptide
1    99
Name: count, dtype: int64

In [84]:
# Annotate Met site hyperreactivity labels (Hyperreactive: <= 2)
# Rows whose "Average H/L" is at most 2 are labelled 1; all other rows (NaN included) are labelled 0.

peptides["Hyperreactive"] = (peptides["Average H/L"] <= 2).astype(int)

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")

In [85]:
# Refresh the on-disk sequence cache (UniProt ID -> full protein sequence) for the
# proteins present in this dataset, then load the refreshed cache.

unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

path = os.path.join(global_data_path, "complete_sequence_cache.csv")
update_sequence_cache(path, unique_uniprotIDs)

# The cache file stores its index as the "Unnamed: 0" column; restore it as an unnamed index
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
sequence_cache_df_updated;

Unique UniProt IDs: 
['P51532' 'Q9ULU4' 'Q99856' 'Q9H4L5' 'P35579' 'Q9BQ04' 'Q13617' 'P50454'
 'Q13263' 'P61011' 'P46109' 'P43243' 'P49257' 'Q9Y4L1' 'P07954' 'P35612'
 'P16150' 'Q9UKD2' 'Q03252' 'P18754' 'Q13310' 'P52272' 'P08727' 'Q09666'
 'Q14687' 'P61158' 'Q9BR76' 'Q16181' 'Q14683' 'Q13813' 'P39023' 'Q9Y2W1'
 'Q9H9T3' 'Q9H3S7' 'P25786' 'P55196' 'P07900' 'Q86UP2' 'Q15287' 'P67870'
 'Q01518' 'P08670' 'O60610' 'Q9Y520' 'P07814' 'O43776' 'Q9Y244' 'P07437'
 'P34932' 'Q96I24' 'Q9H444' 'O15372' 'P41227' 'Q8NE71' 'P26038' 'P14866'
 'P09496' 'P10809' 'P07108' 'Q16543' 'Q15424' 'P49750' 'P62847' 'P08243'
 'Q15233' 'Q9Y383' 'Q13283' 'P67809' 'P11940' 'P22061' 'P62805' 'P15259'
 'Q96PK6' 'P22626' 'P61978' 'O60664' 'P60709' 'P35637' 'P07910' 'O14737'
 'P16949' 'P83731' 'P05783' 'P62841' 'Q14152' 'P37802']
Number of Unique UniProt IDs: 86
   Protein ID                                  Complete Sequence
0      Q8C196  MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1      Q07417  MAAALLARARGPLRR

In [86]:
# Attach each peptide's cached full protein sequence via its UniProt ID (left join keeps every peptide row)
peptides = pd.merge(peptides, sequence_cache_df_updated, how="left", on="Protein ID")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site,Hyperreactive,Complete Sequence
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,P51532,SMCA4_HUMAN,SMARCA4,Transcription activator BRG1,-4.850108,0.000100,SMARCA4_M1233,1,MSTPDPPLGGTPRPGPSPGPGPSPGAMLGPSPGPSPGSAHSMMGPS...
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,Q9ULU4,ZMYD8_HUMAN,ZMYND8,MYND-type zinc finger-containing chromatin rea...,-3.301584,0.024540,ZMYND8_M403,1,MDISTRSKDPGSAERTAQKRKFPSPPHSSNGHSPQDTSTSPIKKKK...
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,Q99856,ARI3A_HUMAN,ARID3A,AT-rich interactive domain-containing protein 3A,-2.323330,0.022125,ARID3A_M463,1,MKLQAVMETLLQRQQRARQELEARQQLPPDPPAAPPGRARAAPDED...
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,Q9H4L5,OSBL3_HUMAN,OSBPL3,Oxysterol-binding protein-related protein 3,-2.034630,0.009375,OSBPL3_M354,1,MMSDEKNLGVSQKLVSPSRSTSSCSSKQGSRQDSWEVVEGLRGEMN...
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,P35579,MYH9_HUMAN,MYH9,Myosin-9,-1.748038,0.011367,MYH9_M1565,1,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.192060,0.000082,RPL24_M91,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...
95,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,P05783,K1C18_HUMAN,KRT18,"Keratin, type I cytoskeletal 18",9.200564,0.009729,KRT18_M84,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...
96,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,P62841,RS15_HUMAN,RPS15,Small ribosomal subunit protein uS19,9.508738,0.018870,RPS15_M70,0,MAEVEQKKKRTFRKFTYRGVDLDQLLDMSYEQLMQLYSARQRRRLN...
97,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,EIF3A,Eukaryotic translation initiation factor 3 sub...,9.629465,0.000021,,0,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...


In [87]:
# Process dataset to extract peptide and Met site locations
# process_dataset is not defined in this notebook (presumably star-imported from dataset_utils — verify).
# Judging by the output displayed below this cell, the returned frame gains the columns
# "Peptide Location", "Peptide Length", "Left Prefix", "Left Prefix Length",
# f"{amino_acid_str} Location", f"Left {analysis_threshold}" and f"Right {analysis_threshold}",
# and "Site" is rewritten to the form <Protein ID>_M<zero-based location>.
# NOTE(review): the cell rebinds `peptides`, so re-running it alone feeds already-processed
# data back into process_dataset — confirm that is safe, or re-run from the load cell.
peptides = process_dataset(peptides, amino_acid, amino_acid_str, analysis_threshold, modifications)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,...,P51532_M1232,1,MSTPDPPLGGTPRPGPSPGPGPSPGAMLGPSPGPSPGSAHSMMGPS...,1227,10,VIQAG,5,1232,KILAAAKYKLNVDQKVIQAG,FDQKSSSHERRAFLQAILEH
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,...,Q9ULU4_M402,1,MDISTRSKDPGSAERTAQKRKFPSPPHSSNGHSPQDTSTSPIKKKK...,398,10,LNFD,4,402,NPSAGTAKIDKQEKVKLNFD,TASPKILMSKPVLSGGTGRR
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,...,Q99856_M462,1,MKLQAVMETLLQRQQRARQELEARQQLPPDPPAAPPGRARAAPDED...,462,10,,0,462,AAALEQLREKLESAEPPEKK,ALVADEQQRLMQRALQQNFL
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,...,Q9H4L5_M353,1,MMSDEKNLGVSQKLVSPSRSTSSCSSKQGSRQDSWEVVEGLRGEMN...,348,10,SAFNI,5,353,EDLCHIAHKVYFTLRSAFNI,SAEREKLKQLMEQDASSSPS
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,...,P35579_M1564,1,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,...,P83731_M90,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,80,13,AITGASLADI,10,90,RTRRAVKFQRAITGASLADI,AKRNQKPEVRKAQREQAIRA
95,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,...,P05783_M83,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...,81,9,ET,2,83,ATGIAGGLAGMGGIQNEKET,QSLNDRLASYLDRVRSLETE
96,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,...,P62841_M69,0,MAEVEQKKKRTFRKFTYRGVDLDQLLDMSYEQLMQLYSARQRRRLN...,64,13,KEAPP,5,69,RRKQHSLLKRLRKAKKEAPP,EKPEVVKTHLRDMIILPEMV
97,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,...,Q14152_M961,0,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA


In [88]:
# Sanity Check: slicing each complete sequence at the recorded location and length
# must reproduce the peptide sequence
temp = [
    sequence[start:start + length]
    for sequence, start, length in zip(
        peptides["Complete Sequence"],
        peptides["Peptide Location"],
        peptides["Peptide Length"],
    )
]
(peptides["Peptide Sequence"] == temp).value_counts()

Peptide Sequence
True    99
Name: count, dtype: int64

In [89]:
# Sanity Check: the residue found at every recorded site location should be the target amino acid
temp = [
    sequence[site]
    for sequence, site in zip(
        peptides["Complete Sequence"],
        peptides[f"{amino_acid_str} Location"],
    )
]
pd.Series(temp).value_counts()

M    99
Name: count, dtype: int64

In [90]:
# NOTE: some Met sites do not have a full `analysis_threshold` amino acids to either side
# The window length is compared against `analysis_threshold` rather than a literal 20 so the
# filter stays in step with the f"Left {analysis_threshold}" / f"Right {analysis_threshold}" columns.
# NOTE(review): in the displayed output the shortened left windows look one residue shorter than
# the site location implies (e.g. a site at location 3 with a 2-residue left window) — verify
# the window slicing inside process_dataset.
peptides[
    (peptides[f"Left {analysis_threshold}"].str.len() != analysis_threshold)
    | (peptides[f"Right {analysis_threshold}"].str.len() != analysis_threshold)
]

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
5,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.784757,,0.637029,0.613101,,sp|Q9BQ04|RBM4B_HUMAN,...,Q9BQ04_M341,1,MVKLFIGNLPREATEQEIRSLFEQYGKVLECDIIKNYGFVHIEDKT...,336,8,NSLYD,5,341,YGPESELSQASAATRNSLYD,ARYEREQYVDRARYSAF
10,SAWYMGPVSR,SAWYMGPVSR,SAWYM[649.3660]GPVSR,SAWYM[655.3735]GPVSR,,1.804169,,,1.897866,sp|P46109|CRKL_HUMAN,...,P46109_M15,1,MSSARFDSSDRSAWYMGPVSRQEAQTRLQGQRHGMFLVRDSSTCPG...,11,10,SAWY,4,15,MSSARFDSSDRSAW,GPVSRQEAQTRLQGQRHGMF
25,HNPVFGVMS,HNPVFGVMS,HNPVFGVM[649.3660]S,HNPVFGVM[655.3735]S,,,3.536351,3.443475,3.197735,sp|P61158|ARP3_HUMAN,...,P61158_M416,0,MAGRLPACVVDCGTGYTKLGYAGNTEPQFIIPSCIAIKESAKVGDQ...,409,9,HNPVFGV,7,416,KKDYEEIGPSICRHNPVFGV,S
40,ADMQNLVER,n[42.0106]ADMQNLVER,n[42.0106]ADM[649.3660]QNLVER,n[42.0106]ADM[655.3735]QNLVER,4.958334,4.800158,4.422003,3.682361,4.615951,sp|Q01518|CAP1_HUMAN,...,Q01518_M3,0,MADMQNLVERLERAVGRLEAVSHTSDMHRGYADSPSKAGAAPYVQA...,1,9,AD,2,3,MA,QNLVERLERAVGRLEAVSHT
68,KFMTNR,KFMTNR,KFM[649.3660]TNR,KFM[655.3735]TNR,,6.328857,,6.68827,6.449744,sp|P62847|RS24_HUMAN,...,P62847_M12,0,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,10,6,KF,2,12,MNDTVTIRTRK,TNRLLQRKQMVIDVLHPGKA
72,AMLDQLMGTSR,AMLDQLMGTSR,AM[649.3660]LDQLMGTSR,AM[655.3735]LDQLMGTSR,6.903599,6.535189,6.828151,,,sp|Q9Y383|LC7L2_HUMAN,...,Q9Y383_M9,0,MSAQAQMRAMLDQLMGTSRDGDTTRQRIKFSDDRVCKSHLLNCCPH...,8,11,A,1,9,MSAQAQMR,LDQLMGTSRDGDTTRQRIKF
80,MKPLMGVIYVPLTDKEK,MKPLMGVIYVPLTDKEK,MKPLM[649.3660]GVIYVPLTDKEK,MKPLM[655.3735]GVIYVPLTDKEK,,7.801893,,7.731557,,sp|P22061|PIMT_HUMAN,...,P22061_M208,0,MAWKSGGASHSELIHNLRKNGIIKTDKVFEVMLATDRSHYAKCNPY...,204,17,MKPL,4,208,NQMLEQYDKLQDGSIKMKPL,GVIYVPLTDKEKQWSRWK
81,TVTAMDVVYALK,TVTAMDVVYALK,TVTAM[649.3660]DVVYALK,TVTAM[655.3735]DVVYALK,8.062908,7.949803,7.61544,7.827899,7.868236,sp|P62805|H4_HUMAN,...,P62805_M84,0,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,80,12,TVTA,4,84,NVIRDAVTYTEHAKRKTVTA,DVVYALKRQGRTLYGFGG
82,AMEAVAAQGK,AMEAVAAQGK,AM[649.3660]EAVAAQGK,AM[655.3735]EAVAAQGK,8.848605,8.042631,7.447687,7.326116,,sp|P15259|PGAM2_HUMAN,...,P15259_M242,0,MATHRLVMVRHGESTWNQENRFCGWFDAELSEKGTEEAKRGAKAIK...,241,10,A,1,242,ELKPTKPMQFLGDEETVRKA,EAVAAQGKAK
83,AAQMHSGYQR,AAQMHSGYQR,AAQM[649.3660]HSGYQR,AAQM[655.3735]HSGYQR,9.012547,,7.823853,7.594282,7.69533,sp|Q96PK6|RBM14_HUMAN,...,Q96PK6_M660,0,MKIFVGNVDGADTTPEELAALFAPYGTVMSCAVMKQFAFVHMRENA...,657,10,AAQ,3,660,AHSDYARYSGSYNDYLRAAQ,HSGYQRRM


### Download Alphafold Data - K562

In [91]:
# Drop peptides from proteins that are invalid according to AlphaFold.
# This removes 4 peptides, all of them non hyperreactive.

invalid_IDs = ["Q09666", "Q9Y520"]
is_invalid = peptides["Protein ID"].isin(invalid_IDs)
display(peptides.loc[is_invalid])  # show what is about to be removed
peptides = peptides.loc[~is_invalid]
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
23,GDLDASVPSMK,GDLDASVPSMK,GDLDASVPSM[649.3660]K,GDLDASVPSM[655.3735]K,2.598458,3.98502,,,2.928357,sp|Q09666|AHNK_HUMAN,...,Q09666_M5321,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,5312,11,GDLDASVPS,9,5321,LDLNLKGPSLKGDLDASVPS,KVHAPGLNLSGVGGKMQVGG
44,MNSIVYQK,MNSIVYQK,M[649.3660]NSIVYQK,M[655.3735]NSIVYQK,4.927943,,,4.856778,,sp|Q9Y520|PRC2C_HUMAN,...,Q9Y520_M2701,0,MSEKSGQSTKAKDGKKYATLSLFNTYKGKSLETQKTTARHGLQSLG...,2701,8,,0,2701,TPTSSPFRATSTSPNSQSSK,NSIVYQKQFQSAPATVRMTQ
51,ISMPDVDLHVK,ISMPDVDLHVK,ISM[649.3660]PDVDLHVK,ISM[655.3735]PDVDLHVK,5.827808,,4.820624,,5.337697,sp|Q09666|AHNK_HUMAN,...,Q09666_M698,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,696,11,IS,2,698,KLKGPDVKLPDMSVKTPKIS,PDVDLHVKGTKVKGEYDVTV
61,FKMPEMNIKAPK,FKMPEM[15.9949]NIKAPK,FKM[649.3660]PEM[15.9949]NIKAPK,FKM[655.3735]PEM[15.9949]NIKAPK,,6.574625,5.61737,,,sp|Q09666|AHNK_HUMAN,...,Q09666_M935,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,933,12,FK,2,935,EVPDVNIEGPEGKLKGPKFK,PEMNIKAPKISMPDVDLHMK


Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,...,P51532_M1232,1,MSTPDPPLGGTPRPGPSPGPGPSPGAMLGPSPGPSPGSAHSMMGPS...,1227,10,VIQAG,5,1232,KILAAAKYKLNVDQKVIQAG,FDQKSSSHERRAFLQAILEH
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,...,Q9ULU4_M402,1,MDISTRSKDPGSAERTAQKRKFPSPPHSSNGHSPQDTSTSPIKKKK...,398,10,LNFD,4,402,NPSAGTAKIDKQEKVKLNFD,TASPKILMSKPVLSGGTGRR
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,...,Q99856_M462,1,MKLQAVMETLLQRQQRARQELEARQQLPPDPPAAPPGRARAAPDED...,462,10,,0,462,AAALEQLREKLESAEPPEKK,ALVADEQQRLMQRALQQNFL
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,...,Q9H4L5_M353,1,MMSDEKNLGVSQKLVSPSRSTSSCSSKQGSRQDSWEVVEGLRGEMN...,348,10,SAFNI,5,353,EDLCHIAHKVYFTLRSAFNI,SAEREKLKQLMEQDASSSPS
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,...,P35579_M1564,1,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,...,P83731_M90,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,80,13,AITGASLADI,10,90,RTRRAVKFQRAITGASLADI,AKRNQKPEVRKAQREQAIRA
95,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,...,P05783_M83,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...,81,9,ET,2,83,ATGIAGGLAGMGGIQNEKET,QSLNDRLASYLDRVRSLETE
96,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,...,P62841_M69,0,MAEVEQKKKRTFRKFTYRGVDLDQLLDMSYEQLMQLYSARQRRRLN...,64,13,KEAPP,5,69,RRKQHSLLKRLRKAKKEAPP,EKPEVVKTHLRDMIILPEMV
97,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,...,Q14152_M961,0,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA


In [92]:
# Collect the UniProt IDs still present after filtering; the AlphaFold steps below use them
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['P51532' 'Q9ULU4' 'Q99856' 'Q9H4L5' 'P35579' 'Q9BQ04' 'Q13617' 'P50454'
 'Q13263' 'P61011' 'P46109' 'P43243' 'P49257' 'Q9Y4L1' 'P07954' 'P35612'
 'P16150' 'Q9UKD2' 'Q03252' 'P18754' 'Q13310' 'P52272' 'P08727' 'Q14687'
 'P61158' 'Q9BR76' 'Q16181' 'Q14683' 'Q13813' 'P39023' 'Q9Y2W1' 'Q9H9T3'
 'Q9H3S7' 'P25786' 'P55196' 'P07900' 'Q86UP2' 'Q15287' 'P67870' 'Q01518'
 'P08670' 'O60610' 'P07814' 'O43776' 'Q9Y244' 'P07437' 'P34932' 'Q96I24'
 'Q9H444' 'O15372' 'P41227' 'Q8NE71' 'P26038' 'P14866' 'P09496' 'P10809'
 'P07108' 'Q16543' 'Q15424' 'P49750' 'P62847' 'P08243' 'Q15233' 'Q9Y383'
 'Q13283' 'P67809' 'P11940' 'P22061' 'P62805' 'P15259' 'Q96PK6' 'P22626'
 'P61978' 'O60664' 'P60709' 'P35637' 'P07910' 'O14737' 'P16949' 'P83731'
 'P05783' 'P62841' 'Q14152' 'P37802']
Number of Unique UniProt IDs: 84


In [93]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
# The call returns three values; the logged summary labels them valid / invalid / existing
# proteins (presumably newly downloaded, unavailable, and already present in `cif_dir` —
# confirm against the structuremap documentation).
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 84/84 [00:00<00:00, 67897.77it/s]

2025-03-24 19:26:43> Valid proteins: 0
2025-03-24 19:26:43> Invalid proteins: 0
2025-03-24 19:26:43> Existing proteins: 84





In [94]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
# The call returns three values; the logged summary labels them valid / invalid / existing
# proteins (presumably newly downloaded, unavailable, and already present in `pae_dir` —
# confirm against the structuremap documentation).
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir,
)

100%|██████████| 84/84 [00:00<00:00, 45631.59it/s]

2025-03-24 19:26:43> Valid proteins: 0
2025-03-24 19:26:43> Invalid proteins: 0
2025-03-24 19:26:43> Existing proteins: 84





### Calculate Accessibilities and Merge into Full Dataset - K562

In [95]:
# Radii handed to calculate_accessibilities (they show up in the nAA_<radius>_... output column names)
radii = [2, 3, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 12, 18, 24]
smooth_accessibilities = calculate_accessibilities(cif_dir, pae_dir, unique_uniprotIDs, radii)
# Shift positions down by one so they are zero-indexed like the initial dataframe
smooth_accessibilities["position"] -= 1
smooth_accessibilities

100%|██████████| 1986/1986 [00:07<00:00, 252.30it/s]
100%|██████████| 84/84 [00:00<00:00, 88.96it/s] 
100%|██████████| 84/84 [00:00<00:00, 90.25it/s] 
100%|██████████| 84/84 [00:00<00:00, 84.86it/s] 
100%|██████████| 84/84 [00:00<00:00, 90.96it/s] 
100%|██████████| 84/84 [00:00<00:00, 89.41it/s] 
100%|██████████| 84/84 [00:00<00:00, 90.19it/s] 
100%|██████████| 84/84 [00:00<00:00, 89.71it/s] 
100%|██████████| 84/84 [00:00<00:00, 88.79it/s] 
100%|██████████| 84/84 [00:00<00:00, 88.56it/s] 
100%|██████████| 84/84 [00:00<00:00, 87.22it/s] 
100%|██████████| 84/84 [00:00<00:00, 86.59it/s] 
100%|██████████| 84/84 [00:01<00:00, 80.12it/s] 
100%|██████████| 84/84 [00:01<00:00, 67.58it/s] 
100%|██████████| 84/84 [00:01<00:00, 56.14it/s]
100%|██████████| 84/84 [00:00<00:00, 749.73it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,O14737,1,M,0,60.98,28.087,27.636,28.803,26.605,-14.869,...,1.272727,1.818182,2.090909,2.818182,4.272727,4.363636,8.181818,12.636364,16.272727,1
1,O14737,1,A,1,65.86,28.148,29.294,30.330,28.837,-11.969,...,1.333333,1.833333,2.166667,2.833333,4.416667,4.500000,8.333333,12.916667,16.666667,1
2,O14737,1,D,2,67.03,25.029,25.896,25.064,27.097,-11.432,...,1.384615,1.846154,2.153846,2.769231,4.384615,4.538462,8.461538,13.153846,17.000000,1
3,O14737,1,E,3,74.70,24.793,24.184,24.089,24.964,-11.799,...,1.428571,1.857143,2.214286,2.857143,4.500000,4.642857,8.642857,13.428571,17.285714,1
4,O14737,1,E,4,73.32,26.617,26.792,28.283,26.125,-9.380,...,1.466667,1.866667,2.200000,2.866667,4.600000,4.800000,8.800000,13.600000,17.533333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,Q9Y4L1,84,K,994,44.06,-11.015,-11.329,-10.901,-12.759,-47.848,...,1.666667,1.933333,1.933333,1.933333,1.933333,1.933333,3.800000,6.933333,9.066667,1
995,Q9Y4L1,84,N,995,39.42,-8.409,-9.263,-8.384,-9.873,-47.140,...,1.642857,1.928571,1.928571,1.928571,1.928571,1.928571,3.785714,6.857143,9.000000,1
996,Q9Y4L1,84,D,996,46.24,-6.819,-6.712,-5.648,-8.052,-44.927,...,1.615385,1.923077,1.923077,1.923077,1.923077,1.923077,3.769231,6.846154,8.923077,1
997,Q9Y4L1,84,E,997,39.71,-4.832,-6.041,-6.027,-5.998,-42.645,...,1.666667,1.916667,1.916667,1.916667,1.916667,1.916667,3.750000,6.833333,8.833333,1


In [96]:
# Attach the accessibility features of the modified residue to each peptide:
# match on protein ID plus the zero-indexed site location (left join keeps every peptide row)
peptides = pd.merge(
    peptides,
    smooth_accessibilities,
    how="left",
    left_on=["Protein ID", f"{amino_acid_str} Location"],
    right_on=["protein_id", "position"],
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,...,1.428571,2.000000,2.000000,2.000000,2.000000,2.000000,4.142857,7.904762,15.380952,1
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,...,1.285714,2.047619,2.476190,2.761905,3.000000,3.428571,7.809524,12.761905,17.571429,1
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,...,2.000000,2.190476,4.857143,6.238095,8.000000,8.095238,15.095238,30.380952,44.619048,0
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,...,1.904762,2.000000,2.809524,3.380952,4.000000,4.428571,8.904762,17.000000,23.714286,1
91,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1
92,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,...,1.952381,2.047619,3.000000,3.428571,4.285714,4.619048,13.285714,33.333333,55.523810,0
93,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1


In [97]:
# Sanity Check: ensure UniProt and AlphaFold sequences are the same
# "AA" comes from the left merge with smooth_accessibilities, so every row should carry `amino_acid`.
# dropna=False keeps rows that found no match in the merge (NaN) visible in the tally; with the
# default (dropna=True) such rows would silently disappear and the check would still look clean.

peptides["AA"].value_counts(dropna=False)
# Optional follow-up: inspect, then drop, rows whose residue is not `amino_acid`
#pd.set_option("display.max_columns", None)
#display(peptides[~(peptides["AA"] == amino_acid)])
#pd.reset_option("display.max_columns")
#peptides = peptides[(peptides["AA"] == amino_acid)]

AA
M    95
Name: count, dtype: int64

In [98]:
# Persist the processed K562 dataset that the following cell reloads from disk.
# The export only happens when the file is missing: an existing export is never
# overwritten, while a fresh checkout can still run the notebook top to bottom.
k562_processed_path = os.path.join(curr_dir_path, "K562_processed.csv")
if not os.path.exists(k562_processed_path):
    peptides.to_csv(k562_processed_path)

In [99]:
# Reload the processed K562 dataset; the saved index comes back as the
# "Unnamed: 0" column, so turn it back into an unnamed index.
path = os.path.join(curr_dir_path, "K562_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,...,1.428571,2.000000,2.000000,2.000000,2.000000,2.000000,4.142857,7.904762,15.380952,1
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,...,1.285714,2.047619,2.476190,2.761905,3.000000,3.428571,7.809524,12.761905,17.571429,1
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,...,2.000000,2.190476,4.857143,6.238095,8.000000,8.095238,15.095238,30.380952,44.619048,0
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,...,1.904762,2.000000,2.809524,3.380952,4.000000,4.428571,8.904762,17.000000,23.714286,1
91,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1
92,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,...,1.952381,2.047619,3.000000,3.428571,4.285714,4.619048,13.285714,33.333333,55.523810,0
93,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1
