In [1]:
from utils.preprocessing import *

import os
import warnings

import numpy as np
import pandas as pd

from structuremap.processing import download_alphafold_cif, download_alphafold_pae
import structuremap.utils

# Silence every Python warning for the rest of the session (blanket "ignore" filter).
warnings.filterwarnings("ignore")
# NOTE(review): presumably configures structuremap's logging (the timestamped lines in the
# download cells' output) — confirm against structuremap.utils.
structuremap.utils.set_logger()

## Cysteine - Aldehyde Reactivity

### Set Parameters of Analysis

In [2]:
# Target residue in one-letter, three-letter and full-name form
amino_acid = "C"
amino_acid_str = "Cys"
amino_acid_str_long = "Cysteine"

# Number of amino acids either side of target sites to analyze
analysis_threshold = 20

# Modifications we're looking for, as strings: light label first, heavy label second
modifications = ["561.3387", "567.3462"]
light_modification, heavy_modification = modifications

In [3]:
# Relative locations, resolved against the current working directory
curr_dir_path_str = "."
raw_datasets_path_str = "../data/raw"
processed_datasets_path_str = "../data/processed"
cached_data_path_str = "../cache"

# Absolute equivalents used by the rest of the notebook
curr_dir_path = os.path.abspath(curr_dir_path_str)
raw_datasets_path = os.path.abspath(raw_datasets_path_str)
processed_datasets_path = os.path.abspath(processed_datasets_path_str)
cached_data_path = os.path.abspath(cached_data_path_str)

# Echo the resolved directories so a wrong working directory is obvious
for label, location in (
    ("Current Directory: ", curr_dir_path),
    ("Raw Datasets Directory: ", raw_datasets_path),
    ("Processed Datasets Directory: ", processed_datasets_path),
    ("Cached Data Directory: ", cached_data_path),
):
    print(label + location)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/abpp_ald_ml/notebooks
Raw Datasets Directory: /Users/ritwiksrinivas/Desktop/Projects/abpp_ald_ml/data/raw
Processed Datasets Directory: /Users/ritwiksrinivas/Desktop/Projects/abpp_ald_ml/data/processed
Cached Data Directory: /Users/ritwiksrinivas/Desktop/Projects/abpp_ald_ml/cache


In [4]:
# Root of the local AlphaFold data, with one sub-folder per file type (cif, pae)
alphafold_path_str = "../alphafold"
alphafold_path = os.path.abspath(alphafold_path_str)
cif_dir, pae_dir = (os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae"))

for label, location in (
    ("AlphaFold Directory: ", alphafold_path),
    ("CIF Directory: ", cif_dir),
    ("PAE Directory: ", pae_dir),
):
    print(label + location)

AlphaFold Directory: /Users/ritwiksrinivas/Desktop/Projects/abpp_ald_ml/alphafold
CIF Directory: /Users/ritwiksrinivas/Desktop/Projects/abpp_ald_ml/alphafold/cif
PAE Directory: /Users/ritwiksrinivas/Desktop/Projects/abpp_ald_ml/alphafold/pae


In [5]:
# Create protein sequence cache (if it doesn't already exist)
# The cache file is <cached_data_path>/prot_seqs.csv; the same file is updated and
# re-read later in the notebook when sequences are merged into the dataset.
path = os.path.join(cached_data_path, "prot_seqs.csv")
create_sequence_cache(path)

Protein sequence cache already exists!


### Load and Process Combined Dataset (all aldehydes)

In [6]:
# Load initial isoTOP-ABPP dataset (sheet "no sites" of the raw workbook)
pd.set_option("display.max_rows", 25)
raw_workbook_name = "cys ald abpp data sites not added 11062025.xlsx"
data_loc = os.path.join(raw_datasets_path, raw_workbook_name)
peptides = pd.read_excel(data_loc, sheet_name="no sites")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,Entry Name,Gene,Protein Description,p-value,DTB Log2LH,Ald,-log10P
0,SC[561.3387]ATFSSSHR,SC[567.3462]ATFSSSHR,,,,-5.311599,-5.493770,,Q3UPL0,SC31A_MOUSE,Sec31a,Protein transport protein Sec31A,1.073198e-02,5.402684,MGO,1.969320
1,VFANPEDC[561.3387]AGFGK,VFANPEDC[567.3462]AGFGK,-3.163860,-3.369927,-2.964787,-3.072636,-3.284364,-3.326394,Q91VS7,MGST1_MOUSE,Mgst1,Microsomal glutathione S-transferase 1,6.311667e-08,3.196995,MGO,7.199856
2,VSSNGC[561.3387]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGC[567.3462]LANSAC[57.0215]QLAHDHTDQVIK,,-1.188386,-3.195898,-3.452135,,-3.437022,A2A974,CP4CB_MOUSE,Cyp4a12b,Cytochrome P450 4A12B,1.413678e-02,2.818360,MGO,1.849650
3,RSIQFVDWC[561.3387]PTGFK,RSIQFVDWC[567.3462]PTGFK,-1.300077,,-2.814100,-4.818681,-2.075210,,P68368,TBA4A_MOUSE,Tuba4a,Tubulin alpha-4A chain,3.562101e-02,2.752017,MGO,1.448294
4,QVC[57.0215]QLPGLFC[561.3387]YAQHIASIDGR,QVC[57.0215]QLPGLFC[567.3462]YAQHIASIDGR,-0.960994,-2.932991,-3.424777,-2.642612,-3.824914,-1.646469,Q791V5,MTCH2_MOUSE,Mtch2,Mitochondrial carrier homolog 2,2.134749e-03,2.572126,MGO,2.670653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6037,C[561.3387]PTSGR,C[567.3462]PTSGR,,-0.470235,,0.449227,,,Q9Z2W0,DNPEP_MOUSE,Dnpep,Aspartyl aminopeptidase,9.854569e-01,0.010504,FA,0.006362
6038,GVFC[561.3387]AGADLK,GVFC[567.3462]AGADLK,0.305741,,,0.716710,-0.352555,-0.689582,Q3TLP5,ECHD2_MOUSE,Echdc2,Enoyl-CoA hydratase domain-containing protein ...,9.885932e-01,0.004921,FA,0.004982
6039,LLAC[561.3387]IASRPGQC[57.0215]GR,LLAC[567.3462]IASRPGQC[57.0215]GR,,0.337643,-0.344721,,,,P62242,RS8_MOUSE,Rps8,Small ribosomal subunit protein eS8,9.933967e-01,0.003539,FA,0.002877
6040,YVEPIEDVPC[561.3387]GNIVGLVGVDQFLVK,YVEPIEDVPC[567.3462]GNIVGLVGVDQFLVK,,,,,0.183278,-0.184626,P58252,EF2_MOUSE,Eef2,Elongation factor 2,9.976672e-01,0.000674,FA,0.001014


In [7]:
# Sanity Check: count missing values per column
peptides.isnull().sum()

Light Modified Peptide       0
Heavy Modified Peptide       0
1_1 Log2 Ratio HL         1063
1_2 Log2 Ratio HL         2843
2_1 Log2 Ratio HL         1697
2_2 Log2 Ratio HL         2435
3_1 Log2 Ratio HL         2314
3_2 Log2 Ratio HL         2056
Protein ID                   0
Entry Name                   0
Gene                        16
Protein Description          0
p-value                      0
DTB Log2LH                   0
Ald                          0
-log10P                      0
dtype: int64

In [8]:
# Sanity Check: ensure there's only one desired modification in each peptide
# (counts pattern matches per peptide, for the light and then the heavy column)
modifications_pattern = create_modifications_pattern(amino_acid, modifications)
print(modifications_pattern)
for peptide_column in ("Light Modified Peptide", "Heavy Modified Peptide"):
    display(peptides[peptide_column].str.count(modifications_pattern).value_counts())

C\[561\.3387\]|C\[567\.3462\]


Light Modified Peptide
1    6042
Name: count, dtype: int64

Heavy Modified Peptide
1    6042
Name: count, dtype: int64

In [9]:
# Sanity Check: ensure light/heavy modified peptide columns represent the same underlying peptide
light_backbone = peptides["Light Modified Peptide"].apply(filter_amino_acid_sequence)
heavy_backbone = peptides["Heavy Modified Peptide"].apply(filter_amino_acid_sequence)
(light_backbone == heavy_backbone).value_counts()

True    6042
Name: count, dtype: int64

In [10]:
# Insert clean peptide sequence column (third column, right after the two modified-peptide columns)
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when the
# column already exists (allow_duplicates defaults to False).
if "Peptide Sequence" not in peptides.columns:
    peptides.insert(
        loc=2,
        column="Peptide Sequence",
        value=peptides["Light Modified Peptide"].apply(filter_amino_acid_sequence),
    )

In [11]:
# Create and inspect reactivity labels (Log2LH ratio >= 1.5 -> 1, otherwise 0)
meets_reactivity_cutoff = peptides["DTB Log2LH"] >= 1.5
peptides["Reactive"] = np.where(meets_reactivity_cutoff, 1, 0)
display(peptides["Reactive"].value_counts())

Reactive
0    5835
1     207
Name: count, dtype: int64

In [12]:
# Load and, if necessary, update sequence cache df (mapping from UniProt IDs to full protein sequences)

unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

path = os.path.join(cached_data_path, "prot_seqs.csv")
update_sequence_cache(path, unique_uniprotIDs)

# The CSV's unnamed first column is the saved row index; restore it and clear its name
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)

Unique UniProt IDs: 
['Q3UPL0' 'Q91VS7' 'A2A974' ... 'Q60854' 'Q9JLV1' 'Q9Z1K6']
Number of Unique UniProt IDs: 1276
     Protein ID                                  Complete Sequence
0        A0JNU3  MARAMGPERRLLAIYTGGTIGMRSEGGVLVPGRGLAAVLKTLHMFH...
1        A2A432  MSRSTRSKERRENDTDSEDNSSETSNQERRRCRQGPPRPPYPPLLP...
2        A2A8Z1  MASIVEGPLSKWTNVMKGWQYRWFVLDYNAGLLSYYTSKDKMMRGS...
3        A2A974  MSASALSSIRFPGSISEYLQVASVLSLLLLLFKTAQLYLHRQWLLS...
4        A2ADY9  MLLTVYCVRRDLSEVTFSLQVDADFELHNFRALCELESGIPAAESQ...
...         ...                                                ...
1271     Q9Z2U0  MSYDRAITVFSPDGHLFQVEYAQEAVKKGSTAVGVRGKDIVVLGVE...
1272     Q9Z2V4  MPPQLHNGLDFSAKVIQGSLDSLPQAVRKFVEGNAQLCQPEYIHIC...
1273     Q9Z2W0  MAMNGRARKEAIQATARELLKFVNRSPSPFHVVAECRSRLLQAGFR...
1274     Q9Z2X1  MMLGPEGGEGYVVKLRGLPWSCSIEDVQNFLSDCTIHDGVAGVHFI...
1275     Q9Z2Z6  MADEPKPISPFKNLLAGGFGGMCLVFVGHPLDTVKVRLQTQPPSLS...

[1276 rows x 2 columns]
All proteins have a known sequence!


In [13]:
# Attach each peptide's full protein sequence ("Complete Sequence") from the sequence cache.
# validate="many_to_one" makes pandas check that every Protein ID occurs at most once in
# the cache; without it a duplicated cache entry would silently duplicate peptide rows.
peptides = peptides.merge(
    sequence_cache_df_updated,
    how="left",
    on="Protein ID",
    validate="many_to_one",
)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,Entry Name,Gene,Protein Description,p-value,DTB Log2LH,Ald,-log10P,Reactive,Complete Sequence
0,SC[561.3387]ATFSSSHR,SC[567.3462]ATFSSSHR,SCATFSSSHR,,,,-5.311599,-5.493770,,Q3UPL0,SC31A_MOUSE,Sec31a,Protein transport protein Sec31A,1.073198e-02,5.402684,MGO,1.969320,1,MKLKEIDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...
1,VFANPEDC[561.3387]AGFGK,VFANPEDC[567.3462]AGFGK,VFANPEDCAGFGK,-3.163860,-3.369927,-2.964787,-3.072636,-3.284364,-3.326394,Q91VS7,MGST1_MOUSE,Mgst1,Microsomal glutathione S-transferase 1,6.311667e-08,3.196995,MGO,7.199856,1,MADLRQLMDNEVLMAFTSYATIILTKMMFMSSATAFQRITNKVFAN...
2,VSSNGC[561.3387]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGC[567.3462]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGCLANSACQLAHDHTDQVIK,,-1.188386,-3.195898,-3.452135,,-3.437022,A2A974,CP4CB_MOUSE,Cyp4a12b,Cytochrome P450 4A12B,1.413678e-02,2.818360,MGO,1.849650,1,MSASALSSIRFPGSISEYLQVASVLSLLLLLFKTAQLYLHRQWLLS...
3,RSIQFVDWC[561.3387]PTGFK,RSIQFVDWC[567.3462]PTGFK,RSIQFVDWCPTGFK,-1.300077,,-2.814100,-4.818681,-2.075210,,P68368,TBA4A_MOUSE,Tuba4a,Tubulin alpha-4A chain,3.562101e-02,2.752017,MGO,1.448294,1,MRECISVHVGQAGVQMGNACWELYCLEHGIQPDGQMPSDKTIGGGD...
4,QVC[57.0215]QLPGLFC[561.3387]YAQHIASIDGR,QVC[57.0215]QLPGLFC[567.3462]YAQHIASIDGR,QVCQLPGLFCYAQHIASIDGR,-0.960994,-2.932991,-3.424777,-2.642612,-3.824914,-1.646469,Q791V5,MTCH2_MOUSE,Mtch2,Mitochondrial carrier homolog 2,2.134749e-03,2.572126,MGO,2.670653,1,MADAASQVLLGSGLTILSQPLMYVKVLIQVGYEPLPPTIGRNIFGR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6037,C[561.3387]PTSGR,C[567.3462]PTSGR,CPTSGR,,-0.470235,,0.449227,,,Q9Z2W0,DNPEP_MOUSE,Dnpep,Aspartyl aminopeptidase,9.854569e-01,0.010504,FA,0.006362,0,MAMNGRARKEAIQATARELLKFVNRSPSPFHVVAECRSRLLQAGFR...
6038,GVFC[561.3387]AGADLK,GVFC[567.3462]AGADLK,GVFCAGADLK,0.305741,,,0.716710,-0.352555,-0.689582,Q3TLP5,ECHD2_MOUSE,Echdc2,Enoyl-CoA hydratase domain-containing protein ...,9.885932e-01,0.004921,FA,0.004982,0,MLRVLPRALRLPCSWRFSGARDCASHATTRTPEIQVQALTGPNQGI...
6039,LLAC[561.3387]IASRPGQC[57.0215]GR,LLAC[567.3462]IASRPGQC[57.0215]GR,LLACIASRPGQCGR,,0.337643,-0.344721,,,,P62242,RS8_MOUSE,Rps8,Small ribosomal subunit protein eS8,9.933967e-01,0.003539,FA,0.002877,0,MGISRDNWHKRRKTGGKRKPYHKKRKYELGRPAANTKIGPRRIHTV...
6040,YVEPIEDVPC[561.3387]GNIVGLVGVDQFLVK,YVEPIEDVPC[567.3462]GNIVGLVGVDQFLVK,YVEPIEDVPCGNIVGLVGVDQFLVK,,,,,0.183278,-0.184626,P58252,EF2_MOUSE,Eef2,Elongation factor 2,9.976672e-01,0.000674,FA,0.001014,0,MVNFTVDQIRAIMDKKANIRNMSVIAHVDHGKSTLTDSLVCKAGII...


In [14]:
# Extract peptide, target site locations and surrounding amino acid sequences
# Judging by the displayed result, this adds the columns Peptide Location, Peptide Length,
# Left Prefix, Left Prefix Length, "<amino_acid_str> Location", Site,
# "Left <analysis_threshold>" and "Right <analysis_threshold>".
# NOTE(review): locations look like 0-based offsets into Complete Sequence (the sanity
# checks below index the sequence with them directly) — confirm against extract_sites.
peptides = extract_sites(peptides, amino_acid, amino_acid_str, analysis_threshold, modifications)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,...,Reactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Cys Location,Site,Left 20,Right 20
0,SC[561.3387]ATFSSSHR,SC[567.3462]ATFSSSHR,SCATFSSSHR,,,,-5.311599,-5.493770,,Q3UPL0,...,1,MKLKEIDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,58,10,S,1,59,Q3UPL0_M60,NASLEIFELDLSDPSLDMKS,ATFSSSHRYHKLIWGPHKMD
1,VFANPEDC[561.3387]AGFGK,VFANPEDC[567.3462]AGFGK,VFANPEDCAGFGK,-3.163860,-3.369927,-2.964787,-3.072636,-3.284364,-3.326394,Q91VS7,...,1,MADLRQLMDNEVLMAFTSYATIILTKMMFMSSATAFQRITNKVFAN...,42,13,VFANPED,7,49,Q91VS7_M50,MSSATAFQRITNKVFANPED,AGFGKGENAKKFVRTDEKVE
2,VSSNGC[561.3387]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGC[567.3462]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGCLANSACQLAHDHTDQVIK,,-1.188386,-3.195898,-3.452135,,-3.437022,A2A974,...,1,MSASALSSIRFPGSISEYLQVASVLSLLLLLFKTAQLYLHRQWLLS...,242,24,VSSNG,5,247,A2A974_M248,LRVRNIFHQNDIIYRVSSNG,LANSACQLAHDHTDQVIKSR
3,RSIQFVDWC[561.3387]PTGFK,RSIQFVDWC[567.3462]PTGFK,RSIQFVDWCPTGFK,-1.300077,,-2.814100,-4.818681,-2.075210,,P68368,...,1,MRECISVHVGQAGVQMGNACWELYCLEHGIQPDGQMPSDKTIGGGD...,338,14,RSIQFVDW,8,346,P68368_M347,DVNAAIAAIKTKRSIQFVDW,PTGFKVGINYQPPTVVPGGD
4,QVC[57.0215]QLPGLFC[561.3387]YAQHIASIDGR,QVC[57.0215]QLPGLFC[567.3462]YAQHIASIDGR,QVCQLPGLFCYAQHIASIDGR,-0.960994,-2.932991,-3.424777,-2.642612,-3.824914,-1.646469,Q791V5,...,1,MADAASQVLLGSGLTILSQPLMYVKVLIQVGYEPLPPTIGRNIFGR...,46,21,QVCQLPGLF,9,55,Q791V5_M56,PPTIGRNIFGRQVCQLPGLF,YAQHIASIDGRRGLFTGLTP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6037,C[561.3387]PTSGR,C[567.3462]PTSGR,CPTSGR,,-0.470235,,0.449227,,,Q9Z2W0,...,0,MAMNGRARKEAIQATARELLKFVNRSPSPFHVVAECRSRLLQAGFR...,141,6,,0,141,Q9Z2W0_M142,GIWSTWFDRDLTLAGRVIIK,PTSGRLEQRLVHIERPILRI
6038,GVFC[561.3387]AGADLK,GVFC[567.3462]AGADLK,GVFCAGADLK,0.305741,,,0.716710,-0.352555,-0.689582,Q3TLP5,...,0,MLRVLPRALRLPCSWRFSGARDCASHATTRTPEIQVQALTGPNQGI...,91,10,GVF,3,94,Q3TLP5_M95,LREDQQVRVLLFRSAVKGVF,AGADLKEREQMSDVEVGTFV
6039,LLAC[561.3387]IASRPGQC[57.0215]GR,LLAC[567.3462]IASRPGQC[57.0215]GR,LLACIASRPGQCGR,,0.337643,-0.344721,,,,P62242,...,0,MGISRDNWHKRRKTGGKRKPYHKKRKYELGRPAANTKIGPRRIHTV...,170,14,LLA,3,173,P62242_M174,KNAKISSLLEEQFQQGKLLA,IASRPGQCGRADGYVLEGKE
6040,YVEPIEDVPC[561.3387]GNIVGLVGVDQFLVK,YVEPIEDVPC[567.3462]GNIVGLVGVDQFLVK,YVEPIEDVPCGNIVGLVGVDQFLVK,,,,,0.183278,-0.184626,P58252,...,0,MVNFTVDQIRAIMDKKANIRNMSVIAHVDHGKSTLTDSLVCKAGII...,456,25,YVEPIEDVP,9,465,P58252_M466,PIQRTILMMGRYVEPIEDVP,GNIVGLVGVDQFLVKTGTIT


In [15]:
# Sanity Check: ensure sequence indexing is correct
# Slice each full sequence at the recorded peptide location/length and compare with the peptide itself
temp = [
    full_sequence[start:start + length]
    for full_sequence, start, length in zip(
        peptides["Complete Sequence"],
        peptides["Peptide Location"],
        peptides["Peptide Length"],
    )
]
peptides["Peptide Sequence"].eq(temp).value_counts()

Peptide Sequence
True     6024
False      18
Name: count, dtype: int64

In [16]:
# These peptides aren't found in the UniProt reference sequence for their proteins -> drop 'em
# 18 dropped peptides as a result
# potential TODO: alignment to find peptide sequence
#
# The match mask is rebuilt here instead of being read from the `temp` list of the previous
# cell: `temp` is overwritten by the next sanity check, so re-running this cell against a
# stale `temp` would either fail on a length mismatch or silently drop nearly every row.
found_in_reference = pd.Series(
    [
        # NOTE(review): Peptide Location is -1 for the mismatches displayed by this cell,
        # presumably meaning "not found" — such rows are treated as unmatched.
        start >= 0 and full_sequence[start:start + length] == peptide
        for full_sequence, start, length, peptide in zip(
            peptides["Complete Sequence"],
            peptides["Peptide Location"],
            peptides["Peptide Length"],
            peptides["Peptide Sequence"],
        )
    ],
    index=peptides.index,
    dtype=bool,
)

display(peptides[~found_in_reference])

peptides = peptides[found_in_reference]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,...,Reactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Cys Location,Site,Left 20,Right 20
244,EIVHLQAGQC[561.3387]GNQIGAK,EIVHLQAGQC[567.3462]GNQIGAK,EIVHLQAGQCGNQIGAK,-1.753626,-0.867699,-0.850444,-0.901603,-0.901639,-0.70354,Q7TMM9,...,0,MREIVHIQAGQCGNQIGAKFWEVISDEHGIDPTGSYHGDSDLQLER...,-1,17,EIVHLQAGQ,9,8,Q7TMM9_M9,MREIVHI,GQCGNQIGAKFWEVISDEHG
648,SIHDALC[561.3387]VIR,SIHDALC[567.3462]VIR,SIHDALCVIR,-0.547237,-0.704635,-0.506405,-0.37158,-0.808413,-0.743547,P80316,...,0,MASVGTLAFDEYGRPFLIIKDQDRKSRLMGLEALKSHIMAAKAVAN...,-1,10,SIHDAL,6,5,P80316_M6,MASV,LAFDEYGRPFLIIKDQDRKS
1362,NC[561.3387]IGM[15.9949]R,NC[567.3462]IGM[15.9949]R,NCIGMR,-0.24303,,0.209377,0.33149,0.20753,0.208625,Q64459,...,0,MDLVSALSLETWVLLAISLVLLYRYGTRKHELFKKQGIPGPKPLPF...,-1,6,N,1,0,Q64459_M1,MDLVSALSLETWVLLAISLVLLYRYGTRKHELFKKQGIPGPKPLPF...,DLVSALSLETWVLLAISLVL
1653,AIIPC[561.3387]IK,AIIPC[567.3462]IK,AIIPCIK,-0.063253,-0.217632,0.168579,-0.105441,0.346744,,P60843,...,0,MSASQDSRSRDNGPDGMEPEGVIESNWNEIVDSFDDMNLSESLLRG...,-1,7,AIIP,4,3,P60843_M4,MS,QDSRSRDNGPDGMEPEGVIE
1877,EIVHLQAGQC[561.3387]GNQIGAK,EIVHLQAGQC[567.3462]GNQIGAK,EIVHLQAGQCGNQIGAK,-0.227553,0.038037,-0.054995,-0.289106,-0.34535,-0.516564,Q7TMM9,...,0,MREIVHIQAGQCGNQIGAKFWEVISDEHGIDPTGSYHGDSDLQLER...,-1,17,EIVHLQAGQ,9,8,Q7TMM9_M9,MREIVHI,GQCGNQIGAKFWEVISDEHG
1998,NC[561.3387]IGM[15.9949]R,NC[567.3462]IGM[15.9949]R,NCIGMR,0.327569,0.383667,0.023402,,0.299112,,Q64459,...,0,MDLVSALSLETWVLLAISLVLLYRYGTRKHELFKKQGIPGPKPLPF...,-1,6,N,1,0,Q64459_M1,MDLVSALSLETWVLLAISLVLLYRYGTRKHELFKKQGIPGPKPLPF...,DLVSALSLETWVLLAISLVL
2465,C[561.3387]IEDIK,C[567.3462]IEDIK,CIEDIK,0.257384,0.032816,0.092468,,0.057171,,Q9CQQ7,...,0,MLSRVVLSAAATAAPCLKNAAALGPGVLQATRAFHTGQPRLAPLPP...,-1,6,,0,-1,Q9CQQ7_M0,MLSRVVLSAAATAAPCLKNAAALGPGVLQATRAFHTGQPRLAPLPP...,MLSRVVLSAAATAAPCLKNA
2774,NC[561.3387]IGMR,NC[567.3462]IGMR,NCIGMR,0.642677,0.298321,-0.010151,0.450504,-0.19856,,Q64459,...,0,MDLVSALSLETWVLLAISLVLLYRYGTRKHELFKKQGIPGPKPLPF...,-1,6,N,1,0,Q64459_M1,MDLVSALSLETWVLLAISLVLLYRYGTRKHELFKKQGIPGPKPLPF...,DLVSALSLETWVLLAISLVL
2819,LC[561.3387]EEHGIIR,LC[567.3462]EEHGIIR,LCEEHGIIR,-0.278228,,,-0.136271,,,P11589,...,0,MKMLLLLCLGLTLVCVHAEEASSTGRNFNVEKINGEWHTIILASDK...,-1,9,L,1,0,P11589_M1,MKMLLLLCLGLTLVCVHAEEASSTGRNFNVEKINGEWHTIILASDK...,KMLLLLCLGLTLVCVHAEEA
2890,AIIPC[561.3387]IK,AIIPC[567.3462]IK,AIIPCIK,0.366422,,0.681469,,,-0.020389,P60843,...,0,MSASQDSRSRDNGPDGMEPEGVIESNWNEIVDSFDDMNLSESLLRG...,-1,7,AIIP,4,3,P60843_M4,MS,QDSRSRDNGPDGMEPEGVIE


In [17]:
# Sanity Check: ensure target sites are correct
# Tally the residue found at each recorded site location; only the target amino acid should appear
site_locations = peptides[f"{amino_acid_str} Location"]
temp = [
    full_sequence[site]
    for full_sequence, site in zip(peptides["Complete Sequence"], site_locations)
]
pd.Series(temp).value_counts()

C    6024
Name: count, dtype: int64

In [18]:
# NOTE: some target sites do not have a full `analysis_threshold` amino acids to either side
# The expected flank length comes from analysis_threshold (not a literal 20) so this
# filter stays correct when the window size parameter is changed.
left_flank_length = peptides[f"Left {analysis_threshold}"].str.len()
right_flank_length = peptides[f"Right {analysis_threshold}"].str.len()
peptides[(left_flank_length != analysis_threshold) | (right_flank_length != analysis_threshold)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,...,Reactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Cys Location,Site,Left 20,Right 20
7,LAGIC[561.3387]TIQLK,LAGIC[567.3462]TIQLK,LAGICTIQLK,-2.286976,-2.362502,-2.557802,-2.672497,-2.318515,-2.479661,Q9JIZ0,...,1,MVPYHIRQYQDSDHKRVVDVFTKGMEEYIPSTFRHMLMLPRTLLLL...,206,10,LAGI,4,210,Q9JIZ0_M211,MGFKKAGQYFMSIFWRLAGI,TIQLKYSFPSA
8,AFPANAAC[561.3387]FLGFEIAMK,AFPANAAC[567.3462]FLGFEIAMK,AFPANAACFLGFEIAMK,-1.861605,-3.151087,-2.273753,-2.208168,-2.082882,-2.883389,Q9Z2Z6,...,1,MADEPKPISPFKNLLAGGFGGMCLVFVGHPLDTVKVRLQTQPPSLS...,275,17,AFPANAA,7,282,Q9Z2Z6_M283,TSLYKGFNAVMIRAFPANAA,FLGFEIAMKFLNWIAPNL
43,TYC[561.3387]YDLR,TYC[567.3462]YDLR,TYCYDLR,,,,-1.541895,,-1.640851,Q791V5,...,1,MADAASQVLLGSGLTILSQPLMYVKVLIQVGYEPLPPTIGRNIFGR...,293,7,TY,2,295,Q791V5_M296,GNMSRGNSLFFRKVPCGKTY,YDLRMLI
44,C[561.3387]LPTPK,C[567.3462]LPTPK,CLPTPK,-1.153381,-1.056601,-1.374569,-1.252662,-1.004046,-3.548544,P62717,...,1,MKASGTLREYKVVGRCLPTPKCHTPPLYRMRIFAPNHVVAKSRFWY...,15,6,,0,15,P62717_M16,MKASGTLREYKVVG,LPTPKCHTPPLYRMRIFAPN
50,NC[561.3387]LLTVM[15.9949]DR,NC[567.3462]LLTVM[15.9949]DR,NCLLTVMDR,,,-1.472386,-1.170983,-1.391642,-1.817400,Q62264,...,0,MQVLTKRYPKNCLLTVMDRYSAVVRNMEQVVMIPSLLRDVQLSGPG...,10,9,N,1,11,Q62264_M12,MQVLTKRYPK,LLTVMDRYSAVVRNMEQVVM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5998,IVVHMAHALKPGEFGLASIC[561.3387]NGGGGASALLIEK,IVVHMAHALKPGEFGLASIC[567.3462]NGGGGASALLIEK,IVVHMAHALKPGEFGLASICNGGGGASALLIEK,0.336608,0.189877,-0.135796,-0.058946,-0.110562,-0.123623,Q8QZT1,...,0,MAALVALHGVVRRPLLRGLLQEVRCLERSYASKPTLNEVVIVSAIR...,390,33,IVVHMAHALKPGEFGLASI,19,409,Q8QZT1_M410,RIVVHMAHALKPGEFGLASI,NGGGGASALLIEKL
6011,IDFDLHDLIPSC[561.3387]ER,IDFDLHDLIPSC[567.3462]ER,IDFDLHDLIPSCER,0.202646,-0.096907,,-0.069074,,,Q99P30,...,0,MSRPCGLPEPVRNNLIDDAKARLRKSDVGTRYSHLSSNKFSVLVPL...,211,14,IDFDLHDLIPS,11,222,Q99P30_M223,ILEQSPAFKIDFDLHDLIPS,ERTFLWRYSLSKL
6027,SLC[57.0215]IPFNPLC[57.0215]ELQPGAM[15.9949]C[...,SLC[57.0215]IPFNPLC[57.0215]ELQPGAM[15.9949]C[...,SLCIPFNPLCELQPGAMCVCGK,-0.037515,,,,-0.098462,0.149182,Q8CGC7,...,0,MAALCLTVNAGNPPLEALLAVEHVKGDVSISVEEGKENLLRVSETV...,1477,22,SLCIPFNPLCELQPGAM,17,1494,Q8CGC7_M1495,GAKSLCIPFNPLCELQPGAM,VCGKNPAKFYTLFGRSY
6030,LVC[561.3387]ISDYEQHVR,LVC[567.3462]ISDYEQHVR,LVCISDYEQHVR,0.159474,0.164138,-0.208701,-0.097486,,,Q9WU19,...,0,MLPRLVCISDYEQHVRSVLQKSVYDYYRSGANDQETLADNIQAFSR...,4,12,LV,2,6,Q9WU19_M7,MLPRL,ISDYEQHVRSVLQKSVYDYY


### Download AlphaFold Data

In [19]:
# Remove invalid proteins (structures not in AlphaFold)
# 13 invalid proteins -> 53 invalid peptides -> 3 hyperreactive, 50 not

invalid_IDs = [
    'A2AN08', 'Q9JHU4', 'Q5SSH7', 'Q8BX70', 'E9Q414', 'Q7TMY8', 'O70325',
    'P11352', 'P97412', 'Q9DBC0', 'Q8R0W0', 'Q9QXZ0', 'Q9ESE1',
]
# Build the membership mask once; show the affected rows and their label split before dropping them
in_invalid_protein = peptides["Protein ID"].isin(invalid_IDs)
display(peptides[in_invalid_protein])
display(peptides.loc[in_invalid_protein, "Reactive"].value_counts())
peptides = peptides[~in_invalid_protein]
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,...,Reactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Cys Location,Site,Left 20,Right 20
186,AVQC[561.3387]LNTSSK,AVQC[567.3462]LNTSSK,AVQCLNTSSK,-1.304264,,,-1.106132,-0.954427,-1.029554,A2AN08,...,0,MATSGGEEAAAAAPAPGAPATGQDTTPGWEVAVRPLLSASYSAFEM...,2548,10,AVQ,3,2551,A2AN08_M2552,SSRSAYHSHKDQALLSKAVQ,LNTSSKEGKDLDPEVFQRLV
381,WQAHC[561.3387]LTLHIYR,WQAHC[567.3462]LTLHIYR,WQAHCLTLHIYR,,,-1.007960,-0.919363,-0.852110,-0.614876,A2AN08,...,0,MATSGGEEAAAAAPAPGAPATGQDTTPGWEVAVRPLLSASYSAFEM...,3422,12,WQAH,4,3426,A2AN08_M3427,QFLRCFLLESNSSSVRWQAH,LTLHIYRNSNKAQQELLLDL
387,LHSLQPHAC[561.3387]FR,LHSLQPHAC[567.3462]FR,LHSLQPHACFR,,,-0.822956,-1.303309,-0.630032,-0.616297,Q9JHU4,...,0,MSEPGGGEDGSAGLEVSAVQNVADVAVLQKHLRKLVPLLLEDGGDA...,4110,11,LHSLQPHA,8,4118,Q9JHU4_M4119,LAPGWLMQLEKKLHSLQPHA,FRLFLTMEINPKVPVNLLRA
435,REEC[57.0215]IC[561.3387]AQTLLLK,REEC[57.0215]IC[567.3462]AQTLLLK,REECICAQTLLLK,,,-0.683450,-0.883881,,-0.821112,Q5SSH7,...,0,MGNAPSNSSEDEAAAAGGEGWSPHQDWAADSGTTPGPGPAAAVLPS...,781,13,REECI,5,786,Q5SSH7_M787,LQIFWKFYSKLKQNRREECI,AQTLLLKLLQSCFSVLQGDP
498,C[561.3387]FHFPDSK,C[567.3462]FHFPDSK,CFHFPDSK,,,-0.735295,,-0.832932,-0.673263,Q8BX70,...,0,MVLESVVADLLNRFLGDYVENLNKSQLKLGIWGGNVALDNLQIKEN...,1478,8,,0,1478,Q8BX70_M1479,AKVKAHDMTAAAYLRNISMR,FHFPDSKGEPLRIVNTSDVS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4906,LHSLQPHAC[561.3387]FR,LHSLQPHAC[567.3462]FR,LHSLQPHACFR,-1.090330,,,-0.548194,-0.788785,,Q9JHU4,...,0,MSEPGGGEDGSAGLEVSAVQNVADVAVLQKHLRKLVPLLLEDGGDA...,4110,11,LHSLQPHA,8,4118,Q9JHU4_M4119,LAPGWLMQLEKKLHSLQPHA,FRLFLTMEINPKVPVNLLRA
4908,ALSVLGC[561.3387]GHTSSTK,ALSVLGC[567.3462]GHTSSTK,ALSVLGCGHTSSTK,-0.981457,,,-0.728749,-0.713671,,A2AN08,...,0,MATSGGEEAAAAAPAPGAPATGQDTTPGWEVAVRPLLSASYSAFEM...,3854,14,ALSVLG,6,3860,A2AN08_M3861,RTSVQPTFTASQYRALSVLG,GHTSSTKCYGCASAVTEHCI
5011,YIIWSPVC[561.3387]R,YIIWSPVC[567.3462]R,YIIWSPVCR,-0.569033,-0.556640,-0.176238,-0.385590,-0.843086,-1.206960,P11352,...,0,MCAARLSAAAQSTVYAFSARPLTGGEPVSLGSLRGKVLLIENVASL...,146,9,YIIWSPV,7,153,P11352_M154,PSDDPTALMTDPKYIIWSPV,RNDIAWNFEKFLVGPDGVPV
5130,ILAFPC[561.3387]NQFGR,ILAFPC[567.3462]NQFGR,ILAFPCNQFGR,-0.510034,,-0.098633,,-0.486131,-0.598755,O70325,...,0,MSWGRLSRLLKPALLCGALAAPGLAGTMCASRDDWRCARSMHEFSA...,96,11,ILAFP,5,101,O70325_M102,QLVDLHARYAECGLRILAFP,NQFGRQEPGSNQEIKEFAAG


Reactive
0    50
1     3
Name: count, dtype: int64

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,...,Reactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Cys Location,Site,Left 20,Right 20
0,SC[561.3387]ATFSSSHR,SC[567.3462]ATFSSSHR,SCATFSSSHR,,,,-5.311599,-5.493770,,Q3UPL0,...,1,MKLKEIDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,58,10,S,1,59,Q3UPL0_M60,NASLEIFELDLSDPSLDMKS,ATFSSSHRYHKLIWGPHKMD
1,VFANPEDC[561.3387]AGFGK,VFANPEDC[567.3462]AGFGK,VFANPEDCAGFGK,-3.163860,-3.369927,-2.964787,-3.072636,-3.284364,-3.326394,Q91VS7,...,1,MADLRQLMDNEVLMAFTSYATIILTKMMFMSSATAFQRITNKVFAN...,42,13,VFANPED,7,49,Q91VS7_M50,MSSATAFQRITNKVFANPED,AGFGKGENAKKFVRTDEKVE
2,VSSNGC[561.3387]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGC[567.3462]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGCLANSACQLAHDHTDQVIK,,-1.188386,-3.195898,-3.452135,,-3.437022,A2A974,...,1,MSASALSSIRFPGSISEYLQVASVLSLLLLLFKTAQLYLHRQWLLS...,242,24,VSSNG,5,247,A2A974_M248,LRVRNIFHQNDIIYRVSSNG,LANSACQLAHDHTDQVIKSR
3,RSIQFVDWC[561.3387]PTGFK,RSIQFVDWC[567.3462]PTGFK,RSIQFVDWCPTGFK,-1.300077,,-2.814100,-4.818681,-2.075210,,P68368,...,1,MRECISVHVGQAGVQMGNACWELYCLEHGIQPDGQMPSDKTIGGGD...,338,14,RSIQFVDW,8,346,P68368_M347,DVNAAIAAIKTKRSIQFVDW,PTGFKVGINYQPPTVVPGGD
4,QVC[57.0215]QLPGLFC[561.3387]YAQHIASIDGR,QVC[57.0215]QLPGLFC[567.3462]YAQHIASIDGR,QVCQLPGLFCYAQHIASIDGR,-0.960994,-2.932991,-3.424777,-2.642612,-3.824914,-1.646469,Q791V5,...,1,MADAASQVLLGSGLTILSQPLMYVKVLIQVGYEPLPPTIGRNIFGR...,46,21,QVCQLPGLF,9,55,Q791V5_M56,PPTIGRNIFGRQVCQLPGLF,YAQHIASIDGRRGLFTGLTP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6037,C[561.3387]PTSGR,C[567.3462]PTSGR,CPTSGR,,-0.470235,,0.449227,,,Q9Z2W0,...,0,MAMNGRARKEAIQATARELLKFVNRSPSPFHVVAECRSRLLQAGFR...,141,6,,0,141,Q9Z2W0_M142,GIWSTWFDRDLTLAGRVIIK,PTSGRLEQRLVHIERPILRI
6038,GVFC[561.3387]AGADLK,GVFC[567.3462]AGADLK,GVFCAGADLK,0.305741,,,0.716710,-0.352555,-0.689582,Q3TLP5,...,0,MLRVLPRALRLPCSWRFSGARDCASHATTRTPEIQVQALTGPNQGI...,91,10,GVF,3,94,Q3TLP5_M95,LREDQQVRVLLFRSAVKGVF,AGADLKEREQMSDVEVGTFV
6039,LLAC[561.3387]IASRPGQC[57.0215]GR,LLAC[567.3462]IASRPGQC[57.0215]GR,LLACIASRPGQCGR,,0.337643,-0.344721,,,,P62242,...,0,MGISRDNWHKRRKTGGKRKPYHKKRKYELGRPAANTKIGPRRIHTV...,170,14,LLA,3,173,P62242_M174,KNAKISSLLEEQFQQGKLLA,IASRPGQCGRADGYVLEGKE
6040,YVEPIEDVPC[561.3387]GNIVGLVGVDQFLVK,YVEPIEDVPC[567.3462]GNIVGLVGVDQFLVK,YVEPIEDVPCGNIVGLVGVDQFLVK,,,,,0.183278,-0.184626,P58252,...,0,MVNFTVDQIRAIMDKKANIRNMSVIAHVDHGKSTLTDSLVCKAGII...,456,25,YVEPIEDVP,9,465,P58252_M466,PIQRTILMMGRYVEPIEDVP,GNIVGLVGVDQFLVKTGTIT


In [20]:
# Set UniProt IDs to use (recomputed after the filtering steps above)
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['Q3UPL0' 'Q91VS7' 'A2A974' ... 'Q60854' 'Q9JLV1' 'Q9Z1K6']
Number of Unique UniProt IDs: 1260


In [21]:
# Download cif data for AlphaFold protein structures
# SLOW THE FIRST TIME - caches the relevant cif data
# Files go to cif_dir; the call's three results are kept as valid / invalid / existing.
# NOTE(review): presumably "existing" = already present in out_folder and "invalid" = not
# available from AlphaFold — confirm against the structuremap documentation.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 1260/1260 [00:00<00:00, 131509.06it/s]

2025-11-12 18:31:30> Valid proteins: 0
2025-11-12 18:31:30> Invalid proteins: 0
2025-11-12 18:31:30> Existing proteins: 1260





In [22]:
# Download pae data for AlphaFold protein structures
# SLOW THE FIRST TIME - caches the relevant pae data
# Files go to pae_dir; the call's three results are kept as valid / invalid / existing.
# NOTE(review): presumably "existing" = already present in out_folder and "invalid" = not
# available from AlphaFold — confirm against the structuremap documentation.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir,
)

100%|██████████| 1260/1260 [00:00<00:00, 165658.05it/s]

2025-11-12 18:31:30> Valid proteins: 0
2025-11-12 18:31:30> Invalid proteins: 0
2025-11-12 18:31:30> Existing proteins: 1260





### Calculate Accessibilities and Merge into Full Dataset

In [23]:
# Radii passed to calculate_accessibilities (one nAA_<radius>_... column per radius in the result)
radii = [2, 3, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 12, 18, 24]
smooth_accessibilities = calculate_accessibilities(cif_dir, pae_dir, unique_uniprotIDs, radii)

# zero-index the positions to match initial dataframe
smooth_accessibilities["position"] = smooth_accessibilities["position"].sub(1)
smooth_accessibilities

100%|██████████| 1264/1264 [02:04<00:00, 10.14it/s]
100%|██████████| 1260/1260 [00:16<00:00, 77.35it/s] 
100%|██████████| 1260/1260 [00:14<00:00, 86.88it/s] 
100%|██████████| 1260/1260 [00:14<00:00, 88.45it/s] 
100%|██████████| 1260/1260 [00:14<00:00, 85.46it/s] 
100%|██████████| 1260/1260 [00:14<00:00, 87.34it/s] 
100%|██████████| 1260/1260 [00:14<00:00, 84.07it/s] 
100%|██████████| 1260/1260 [00:15<00:00, 83.17it/s] 
100%|██████████| 1260/1260 [00:15<00:00, 81.60it/s] 
100%|██████████| 1260/1260 [00:15<00:00, 79.70it/s] 
100%|██████████| 1260/1260 [00:15<00:00, 80.02it/s] 
100%|██████████| 1260/1260 [00:15<00:00, 80.50it/s] 
100%|██████████| 1260/1260 [00:19<00:00, 65.89it/s] 
100%|██████████| 1260/1260 [00:24<00:00, 50.48it/s]
100%|██████████| 1260/1260 [00:33<00:00, 37.29it/s]
100%|██████████| 1260/1260 [00:02<00:00, 460.46it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,A0JNU3,1,M,0,51.97,17.551,16.184,15.131,16.088,2.290,...,1.818182,2.181818,2.636364,3.090909,3.545455,4.181818,11.363636,31.727273,63.181818,0
1,A0JNU3,1,A,1,58.44,19.196,19.493,20.290,18.237,1.976,...,1.833333,2.250000,2.750000,3.250000,4.000000,4.750000,13.166667,36.333333,72.500000,0
2,A0JNU3,1,R,2,70.88,20.471,20.196,21.370,20.014,1.619,...,1.846154,2.384615,2.846154,3.615385,4.538462,5.307692,14.692308,41.461538,81.076923,0
3,A0JNU3,1,A,3,70.19,21.652,20.145,19.339,19.911,-0.151,...,1.857143,2.500000,2.928571,3.714286,4.928571,5.642857,15.928571,46.071429,88.142857,0
4,A0JNU3,1,M,4,70.94,23.797,23.599,24.092,22.193,-2.290,...,1.866667,2.600000,3.000000,3.933333,5.200000,6.000000,16.933333,49.666667,94.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Q9Z2Z6,1260,I,296,91.38,9.338,7.822,7.062,7.447,28.701,...,1.866667,2.000000,3.466667,4.600000,5.533333,5.800000,11.533333,24.600000,40.466667,0
297,Q9Z2Z6,1260,A,297,86.56,12.229,11.499,11.664,10.073,26.782,...,1.857143,1.928571,3.285714,4.285714,5.285714,5.571429,10.571429,23.357143,38.142857,0
298,Q9Z2Z6,1260,P,298,80.50,14.308,12.836,12.648,12.273,26.270,...,1.846154,1.923077,3.153846,4.153846,5.076923,5.384615,10.153846,22.230769,36.153846,0
299,Q9Z2Z6,1260,N,299,79.62,16.913,16.530,17.224,15.086,25.999,...,1.833333,1.916667,3.083333,4.000000,4.833333,5.166667,9.916667,21.416667,34.500000,0


In [24]:
# Attach the per-residue accessibility features to each target site:
# (Protein ID, site location) on the peptide side matches (protein_id, position) on the feature side
site_keys = ["Protein ID", f"{amino_acid_str} Location"]
accessibility_keys = ["protein_id", "position"]
peptides = peptides.merge(
    smooth_accessibilities,
    how="left",
    left_on=site_keys,
    right_on=accessibility_keys,
)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,SC[561.3387]ATFSSSHR,SC[567.3462]ATFSSSHR,SCATFSSSHR,,,,-5.311599,-5.493770,,Q3UPL0,...,2.000000,2.047619,2.238095,2.666667,3.333333,4.000000,12.428571,38.285714,82.761905,0
1,VFANPEDC[561.3387]AGFGK,VFANPEDC[567.3462]AGFGK,VFANPEDCAGFGK,-3.163860,-3.369927,-2.964787,-3.072636,-3.284364,-3.326394,Q91VS7,...,0.714286,1.809524,1.809524,1.952381,2.000000,2.238095,4.619048,9.285714,19.238095,1
2,VSSNGC[561.3387]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGC[567.3462]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGCLANSACQLAHDHTDQVIK,,-1.188386,-3.195898,-3.452135,,-3.437022,A2A974,...,2.000000,2.476190,4.857143,6.190476,7.857143,8.476190,19.666667,46.666667,82.809524,0
3,RSIQFVDWC[561.3387]PTGFK,RSIQFVDWC[567.3462]PTGFK,RSIQFVDWCPTGFK,-1.300077,,-2.814100,-4.818681,-2.075210,,P68368,...,1.952381,2.380952,2.523810,2.809524,3.714286,4.523810,12.428571,39.619048,81.761905,0
4,QVC[57.0215]QLPGLFC[561.3387]YAQHIASIDGR,QVC[57.0215]QLPGLFC[567.3462]YAQHIASIDGR,QVCQLPGLFCYAQHIASIDGR,-0.960994,-2.932991,-3.424777,-2.642612,-3.824914,-1.646469,Q791V5,...,2.047619,2.238095,3.666667,4.523810,5.666667,6.380952,14.238095,37.285714,67.761905,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5966,C[561.3387]PTSGR,C[567.3462]PTSGR,CPTSGR,,-0.470235,,0.449227,,,Q9Z2W0,...,2.238095,2.619048,3.619048,5.571429,7.000000,8.428571,20.000000,52.714286,100.523810,0
5967,GVFC[561.3387]AGADLK,GVFC[567.3462]AGADLK,GVFCAGADLK,0.305741,,,0.716710,-0.352555,-0.689582,Q3TLP5,...,2.238095,2.809524,4.142857,5.571429,7.047619,8.333333,24.857143,70.428571,123.857143,0
5968,LLAC[561.3387]IASRPGQC[57.0215]GR,LLAC[567.3462]IASRPGQC[57.0215]GR,LLACIASRPGQCGR,,0.337643,-0.344721,,,,P62242,...,2.142857,2.666667,4.666667,6.285714,7.761905,9.190476,22.857143,60.809524,102.238095,0
5969,YVEPIEDVPC[561.3387]GNIVGLVGVDQFLVK,YVEPIEDVPC[567.3462]GNIVGLVGVDQFLVK,YVEPIEDVPCGNIVGLVGVDQFLVK,,,,,0.183278,-0.184626,P58252,...,2.190476,2.571429,2.904762,4.142857,5.285714,7.095238,21.000000,69.095238,135.285714,0


In [25]:
# Sanity Check: ensure UniProt and AlphaFold sequences are the same
# dropna=False so that target sites which found no AlphaFold row in the left merge
# (AA is NaN) are counted too; the default value_counts() would silently hide them.
peptides["AA"].value_counts(dropna=False)
# Optional manual follow-up if anything other than the target amino acid shows up:
#pd.set_option("display.max_columns", None)
#display(peptides[~(peptides["AA"] == amino_acid)])
#pd.reset_option("display.max_columns")
#peptides = peptides[(peptides["AA"] == amino_acid)]

AA
C    5971
Name: count, dtype: int64

In [26]:
# Persist the fully annotated dataset (row index included) to the processed-data directory
complete_dataset_path = os.path.join(processed_datasets_path, "cys_ald_complete.csv")
peptides.to_csv(complete_dataset_path)

In [27]:
# Reload the processed dataset; the CSV's unnamed first column holds the saved row index,
# which is restored as the index and then stripped of its "Unnamed: 0" name
path = os.path.join(processed_datasets_path, "cys_ald_complete.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,Peptide Sequence,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,Protein ID,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,SC[561.3387]ATFSSSHR,SC[567.3462]ATFSSSHR,SCATFSSSHR,,,,-5.311599,-5.493770,,Q3UPL0,...,2.000000,2.047619,2.238095,2.666667,3.333333,4.000000,12.428571,38.285714,82.761905,0
1,VFANPEDC[561.3387]AGFGK,VFANPEDC[567.3462]AGFGK,VFANPEDCAGFGK,-3.163860,-3.369927,-2.964787,-3.072636,-3.284364,-3.326394,Q91VS7,...,0.714286,1.809524,1.809524,1.952381,2.000000,2.238095,4.619048,9.285714,19.238095,1
2,VSSNGC[561.3387]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGC[567.3462]LANSAC[57.0215]QLAHDHTDQVIK,VSSNGCLANSACQLAHDHTDQVIK,,-1.188386,-3.195898,-3.452135,,-3.437022,A2A974,...,2.000000,2.476190,4.857143,6.190476,7.857143,8.476190,19.666667,46.666667,82.809524,0
3,RSIQFVDWC[561.3387]PTGFK,RSIQFVDWC[567.3462]PTGFK,RSIQFVDWCPTGFK,-1.300077,,-2.814100,-4.818681,-2.075210,,P68368,...,1.952381,2.380952,2.523810,2.809524,3.714286,4.523810,12.428571,39.619048,81.761905,0
4,QVC[57.0215]QLPGLFC[561.3387]YAQHIASIDGR,QVC[57.0215]QLPGLFC[567.3462]YAQHIASIDGR,QVCQLPGLFCYAQHIASIDGR,-0.960994,-2.932991,-3.424777,-2.642612,-3.824914,-1.646469,Q791V5,...,2.047619,2.238095,3.666667,4.523810,5.666667,6.380952,14.238095,37.285714,67.761905,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5966,C[561.3387]PTSGR,C[567.3462]PTSGR,CPTSGR,,-0.470235,,0.449227,,,Q9Z2W0,...,2.238095,2.619048,3.619048,5.571429,7.000000,8.428571,20.000000,52.714286,100.523810,0
5967,GVFC[561.3387]AGADLK,GVFC[567.3462]AGADLK,GVFCAGADLK,0.305741,,,0.716710,-0.352555,-0.689582,Q3TLP5,...,2.238095,2.809524,4.142857,5.571429,7.047619,8.333333,24.857143,70.428571,123.857143,0
5968,LLAC[561.3387]IASRPGQC[57.0215]GR,LLAC[567.3462]IASRPGQC[57.0215]GR,LLACIASRPGQCGR,,0.337643,-0.344721,,,,P62242,...,2.142857,2.666667,4.666667,6.285714,7.761905,9.190476,22.857143,60.809524,102.238095,0
5969,YVEPIEDVPC[561.3387]GNIVGLVGVDQFLVK,YVEPIEDVPC[567.3462]GNIVGLVGVDQFLVK,YVEPIEDVPCGNIVGLVGVDQFLVK,,,,,0.183278,-0.184626,P58252,...,2.190476,2.571429,2.904762,4.142857,5.285714,7.095238,21.000000,69.095238,135.285714,0


TODO: split dataset into 3 aldehyde components

for combined dataset, average 