In [1]:
from dataset_utils import *

import os
import warnings

import numpy as np
import pandas as pd

import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae

# Silence every Python warning for the rest of the session, including ones
# issued by pandas and numpy.
warnings.filterwarnings("ignore")
# Presumably configures structuremap's log output (the timestamped messages
# printed under the download cells) -- TODO confirm against structuremap docs.
structuremap.utils.set_logger()

### Set Parameters of Analysis

In [2]:
# Residue under study: the one-letter code is passed to the modification
# pattern builder and compared against sequences; the full name is used to
# build column names such as "<name> Location".
amino_acid = "M"
amino_acid_str = "Methionine"

analysis_threshold = 20 # number of amino acids either side to analyze

# Modification tags exactly as they appear in brackets in the
# "Light Modified Peptide" / "Heavy Modified Peptide" columns (kept as strings).
light_modification = "649.3660"
heavy_modification = "655.3735"

# Which modifications we are looking for. Derived from the two tags above so
# the list and the named values cannot drift apart.
modifications = [light_modification, heavy_modification]

In [3]:
# Locations relative to the notebook's working directory.
curr_dir_path_str = "./"
datasets_path_str = "../datasets"
global_data_path_str = "../../global_data"

# Absolute forms of the three locations above.
curr_dir_path = os.path.abspath(curr_dir_path_str)
datasets_path = os.path.abspath(datasets_path_str)
global_data_path = os.path.abspath(global_data_path_str)

# Echo the resolved paths so a wrong working directory is easy to spot.
print(f"Current Directory: {curr_dir_path}")
print(f"Datasets Directory: {datasets_path}")
print(f"Global Data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy
Datasets Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/datasets
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Root folder for AlphaFold files, relative to the working directory.
alphafold_path_str = "../../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

# Sub-folders for the structure (cif) and predicted aligned error (pae) files.
cif_dir, pae_dir = (os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae"))

print(f"AlphaFold Directory: {alphafold_path}")
print(f"CIF Directory: {cif_dir}")
print(f"PAE Directory: {pae_dir}")

AlphaFold Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
CIF Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
PAE Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


## A549

### Load and Process Dataset - A549

In [5]:
# Read the initial isoTOP-ABPP peptide table from the current directory
data_loc = os.path.join(curr_dir_path, "A549_hyperreactivity.csv")
peptides = pd.read_csv(filepath_or_buffer=data_loc)

# Cap rendered tables at 25 rows so previews stay compact
pd.set_option("display.max_rows", 25)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,Q8IVF2,AHNK2_HUMAN,AHNAK2,Protein AHNAK2,-4.311869,3.689843e-04,
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,-2.793286,5.686721e-04,AHNAK_M3417
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,O00487,PSDE_HUMAN,PSMD14,26S proteasome non-ATPase regulatory subunit 14,-2.697886,2.888136e-04,PSMD14_M167
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-2.425145,1.903466e-07,HNRNPM_M437
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,Q15435,PP1R7_HUMAN,PPP1R7,Protein phosphatase 1 regulatory subunit 7,0.391862,2.353017e-02,PPP1R7_M344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.428814,7.487145e-03,PABPC1_M573
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,P63261,ACTG_HUMAN,ACTG1,"Actin, cytoplasmic 2",8.471928,2.227639e-03,ACTG1_M47
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.059513,4.406509e-02,RPL24_M127
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,P05783,K1C18_HUMAN,KRT18,"Keratin, type I cytoskeletal 18",9.610308,9.293054e-04,KRT18_M84


In [6]:
# Count the missing (NaN) entries in each column of the dataset
peptides.isnull().sum()

Peptide Sequence           0
Modified Peptide           0
Light Modified Peptide     0
Heavy Modified Peptide     0
exp_1 Log2 Ratio HL       24
exp_2 Log2 Ratio HL       21
exp_3 Log2 Ratio HL       16
exp_4 Log2 Ratio HL       27
exp_5 Log2 Ratio HL       22
Protein                    0
Protein ID                 0
Entry Name                 0
Gene                       0
Protein Description        0
Average H/L                0
P-value                    0
Site                       3
dtype: int64

In [7]:
# Sanity Check: every light and heavy peptide should carry exactly one of the desired modifications
modifications_pattern = create_modifications_pattern(amino_acid, modifications)
print(modifications_pattern)
# Show, for each labelled column, how many peptides contain 0, 1, 2, ... pattern matches
for labelled_column in ("Light Modified Peptide", "Heavy Modified Peptide"):
    display(peptides[labelled_column].str.count(modifications_pattern).value_counts())

M\[649\.3660\]|M\[655\.3735\]


Light Modified Peptide
1    49
Name: count, dtype: int64

Heavy Modified Peptide
1    49
Name: count, dtype: int64

In [8]:
# Label each Met site: 1 (hyperreactive) when "Average H/L" is <= 2, otherwise 0

peptides["Hyperreactive"] = (peptides["Average H/L"] <= 2).astype(int)

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")

In [9]:
# Refresh (if needed) and load the sequence cache: UniProt ID -> full protein sequence

unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

path = os.path.join(global_data_path, "complete_sequence_cache.csv")
update_sequence_cache(path, unique_uniprotIDs)

# The CSV carries its saved index as an "Unnamed: 0" column; turn it back into an unnamed index
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
sequence_cache_df_updated;

Unique UniProt IDs: 
['Q8IVF2' 'Q09666' 'O00487' 'P52272' 'Q15435' 'Q15233' 'Q8WUH6' 'Q9BQ04'
 'P35579' 'Q14683' 'Q15149' 'P60228' 'Q9H694' 'O60610' 'Q9H4G0' 'P10809'
 'P46926' 'P41227' 'P61158' 'Q9H444' 'Q86UP2' 'P49915' 'P18621' 'P67870'
 'Q13813' 'P62258' 'Q01518' 'Q86V81' 'Q14152' 'P53999' 'Q13283' 'P26038'
 'P61247' 'P55072' 'Q16181' 'P15311' 'P18669' 'P62805' 'P22626' 'P11940'
 'P63261' 'P83731' 'P05783']
Number of Unique UniProt IDs: 43
   Protein ID                                  Complete Sequence
0      Q8C196  MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1      Q07417  MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2      Q91YI0  MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3      P50247  MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4      P33267  MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
..        ...                                                ...
1      P22392  MANLERTFIAIKPDGVQRGLVGEIIKRFEQKGFRLVAMKFLRASEE...
2      O00116  MAEAAAAAGGTGLGAGA

In [10]:
# Attach the cached complete sequence to each peptide, matched on UniProt ID.
# The left join keeps every peptide row even if the cache has no entry for it.
peptides = pd.merge(peptides, sequence_cache_df_updated, how="left", on="Protein ID")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,Average H/L,P-value,Site,Hyperreactive,Complete Sequence
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,Q8IVF2,AHNK2_HUMAN,AHNAK2,Protein AHNAK2,-4.311869,3.689843e-04,,1,MCDCFHMVLPTWPGTPGSVSGRQLQPGEPGAETEDDHSVTEGPADE...
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,-2.793286,5.686721e-04,AHNAK_M3417,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,O00487,PSDE_HUMAN,PSMD14,26S proteasome non-ATPase regulatory subunit 14,-2.697886,2.888136e-04,PSMD14_M167,1,MDRLLRLGGGMPGLGQGPPTDAPAVDTAEQVYISSLALLKMLKHGR...
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,-2.425145,1.903466e-07,HNRNPM_M437,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,Q15435,PP1R7_HUMAN,PPP1R7,Protein phosphatase 1 regulatory subunit 7,0.391862,2.353017e-02,PPP1R7_M344,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,P11940,PABP1_HUMAN,PABPC1,Polyadenylate-binding protein 1,8.428814,7.487145e-03,PABPC1_M573,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,P63261,ACTG_HUMAN,ACTG1,"Actin, cytoplasmic 2",8.471928,2.227639e-03,ACTG1_M47,0,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,P83731,RL24_HUMAN,RPL24,Large ribosomal subunit protein eL24,9.059513,4.406509e-02,RPL24_M127,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,P05783,K1C18_HUMAN,KRT18,"Keratin, type I cytoskeletal 18",9.610308,9.293054e-04,KRT18_M84,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...


In [11]:
# Locate each peptide and its modified Met site within the complete protein sequence
peptides = process_dataset(
    peptides,
    amino_acid,
    amino_acid_str,
    analysis_threshold,
    modifications,
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,...,Q8IVF2_M693,1,MCDCFHMVLPTWPGTPGSVSGRQLQPGEPGAETEDDHSVTEGPADE...,692,12,S,1,693,KFKMPKFKMPLFGASAPGKS,EASVDVSAPKVEADVSLLSM
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,...,Q09666_M3416,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,3414,14,VS,2,3416,KVKGSKFKMPFLSISSPKVS,PDVELNLKSPKVKGDLDIAG
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,O00487_M166,1,MDRLLRLGGGMPGLGQGPPTDAPAVDTAEQVYISSLALLKMLKHGR...,161,14,LINAN,5,166,PIQSVKGKVVIDAFRLINAN,MVLGHEPRQTTSNLGHLNKP
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,Q15435_M343,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...,342,9,V,1,343,ETVYLERNPLQKDPQYRRKV,LALPSVRQIDATFVRF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,P11940_M572,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,565,15,LFPLIQA,7,572,APPQEQKQMLGERLFPLIQA,HPTLAGKITGMLLEIDNSEL
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,P63261_M46,0,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,39,11,HQGVMVG,7,46,PRAVFPSIVGRPRHQGVMVG,GQKDSYVGDEAQSKRGILTL
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,P83731_M126,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,124,7,TA,2,126,QAIRAAKEAKKAKQASKKTA,AAAKAPTKAAPKQKIVKPVK
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,P05783_M83,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...,81,9,ET,2,83,ATGIAGGLAGMGGIQNEKET,QSLNDRLASYLDRVRSLETE


In [12]:
# Sanity Check: ensure sequence indexing is correct
# Re-slice every peptide out of its complete sequence using the stored start
# offset and length, then compare against the recorded peptide sequence.
temp = [
    sequence[start:start + length]
    for sequence, start, length in zip(
        peptides["Complete Sequence"], peptides["Peptide Location"], peptides["Peptide Length"]
    )
]
peptide_slice_matches = (temp == peptides["Peptide Sequence"])

# Fail loudly instead of only displaying counts: later cells look up structural
# features by these offsets, so a mismatch must not pass unnoticed on Run All.
assert peptide_slice_matches.all(), (
    f"{(~peptide_slice_matches).sum()} peptide(s) do not match their slice of the complete sequence"
)
peptide_slice_matches.value_counts()

Peptide Sequence
True    49
Name: count, dtype: int64

In [13]:
# Sanity Check: ensure Met sites are correct
# Read the residue found at each recorded site index of the complete sequence.
temp = [
    sequence[site]
    for sequence, site in zip(peptides["Complete Sequence"], peptides[f"{amino_acid_str} Location"])
]
site_residues = pd.Series(temp)

# Fail loudly instead of only displaying counts: every site must be the
# configured amino acid, otherwise the site indices are wrong.
assert (site_residues == amino_acid).all(), (
    f"{(site_residues != amino_acid).sum()} site(s) do not point at residue {amino_acid!r}"
)
site_residues.value_counts()

M    49
Name: count, dtype: int64

In [14]:
# NOTE: some Met sites do not have a full `analysis_threshold` amino acids to either side
# The expected window length comes from `analysis_threshold` (not a literal 20),
# so this filter stays consistent with the "Left N" / "Right N" column names.
# NOTE(review): in the displayed output, Left windows near the start of a protein
# look one residue short (e.g. site index 3 shows "MA", index 4 shows "MAD") --
# verify the left-window slicing inside process_dataset.
peptides[
    (peptides[f"Left {analysis_threshold}"].str.len() != analysis_threshold)
    | (peptides[f"Right {analysis_threshold}"].str.len() != analysis_threshold)
]

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,Q15435_M343,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...,342,9,V,1,343,ETVYLERNPLQKDPQYRRKV,LALPSVRQIDATFVRF
7,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,2.380718,2.33005,,,,sp|Q9BQ04|RBM4B_HUMAN,...,Q9BQ04_M341,0,MVKLFIGNLPREATEQEIRSLFEQYGKVLECDIIKNYGFVHIEDKT...,336,8,NSLYD,5,341,YGPESELSQASAATRNSLYD,ARYEREQYVDRARYSAF
19,HNPVFGVMS,HNPVFGVMS,HNPVFGVM[649.3660]S,HNPVFGVM[655.3735]S,4.203245,,4.046394,,,sp|P61158|ARP3_HUMAN,...,P61158_M416,0,MAGRLPACVVDCGTGYTKLGYAGNTEPQFIIPSCIAIKESAKVGDQ...,409,9,HNPVFGV,7,416,KKDYEEIGPSICRHNPVFGV,S
29,ADMQNLVER,n[42.0106]ADMQNLVER,n[42.0106]ADM[649.3660]QNLVER,n[42.0106]ADM[655.3735]QNLVER,,,6.328781,4.68717,4.748218,sp|Q01518|CAP1_HUMAN,...,Q01518_M3,0,MADMQNLVERLERAVGRLEAVSHTSDMHRGYADSPSKAGAAPYVQA...,1,9,AD,2,3,MA,QNLVERLERAVGRLEAVSHT
30,MDMSLDDIIK,MDMSLDDIIK,M[649.3660]DMSLDDIIK,M[655.3735]DMSLDDIIK,5.591513,6.189852,,,,sp|Q86V81|THOC4_HUMAN,...,Q86V81_M4,0,MADKMDMSLDDIIKLNRSQRGGRGGGRGRGRAGSQGGRGGGAQAAA...,4,10,,0,4,MAD,DMSLDDIIKLNRSQRGGRGG
40,AMEAVAAQGK,AMEAVAAQGK,AM[649.3660]EAVAAQGK,AM[655.3735]EAVAAQGK,8.617776,7.403858,,7.891326,8.504623,sp|P18669|PGAM1_HUMAN,...,P18669_M242,0,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,241,10,A,1,242,NLKPIKPMQFLGDEETVRKA,EAVAAQGKAKK
41,TVTAMDVVYALK,TVTAMDVVYALK,TVTAM[649.3660]DVVYALK,TVTAM[655.3735]DVVYALK,,,,8.683691,7.823752,sp|P62805|H4_HUMAN,...,P62805_M84,0,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,80,12,TVTA,4,84,NVIRDAVTYTEHAKRKTVTA,DVVYALKRQGRTLYGFGG


### Download AlphaFold Data - A549

In [15]:
# Drop peptides whose protein is invalid according to AlphaFold
# 5 peptides are removed as a result -> 2 hyperreactive, 3 not

invalid_IDs = ['Q8IVF2', 'Q09666', 'Q15149']
# Row mask computed once: True where the peptide belongs to an invalid protein
from_invalid_protein = peptides["Protein ID"].isin(invalid_IDs)
display(peptides[from_invalid_protein])
peptides = peptides[~from_invalid_protein]
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,SMEASVDVSAPK,SMEASVDVSAPK,SM[649.3660]EASVDVSAPK,SM[655.3735]EASVDVSAPK,,-4.314368,-4.309369,,,sp|Q8IVF2|AHNK2_HUMAN,...,Q8IVF2_M693,1,MCDCFHMVLPTWPGTPGSVSGRQLQPGEPGAETEDDHSVTEGPADE...,692,12,S,1,693,KFKMPKFKMPLFGASAPGKS,EASVDVSAPKVEADVSLLSM
1,VSMPDVELNLKSPK,VSMPDVELNLKSPK,VSM[649.3660]PDVELNLKSPK,VSM[655.3735]PDVELNLKSPK,,-3.310446,-2.678198,-2.695957,-2.488542,sp|Q09666|AHNK_HUMAN,...,Q09666_M3416,1,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,3414,14,VS,2,3416,KVKGSKFKMPFLSISSPKVS,PDVELNLKSPKVKGDLDIAG
10,LVASMEEAR,LVASMEEAR,LVASM[649.3660]EEAR,LVASM[655.3735]EEAR,,3.0758,2.351735,4.269428,,sp|Q15149|PLEC_HUMAN,...,Q15149_M2688,0,MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL...,2684,9,LVAS,4,2688,EEQQRQQQQMEQERQRLVAS,EEARRRQHEAEEGVRRKQEE
21,FKMPEMNIK,FKMPEMNIK,FKMPEM[649.3660]NIK,FKMPEM[655.3735]NIK,,4.471593,3.982242,,,sp|Q09666|AHNK_HUMAN,...,Q09666_M810,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,805,9,FKMPE,5,810,DVSIEEPEGKLKGPKFKMPE,NIKVPKISMPDVDLHLKGPN
33,IHMSGPK,IHMSGPK,IHM[649.3660]SGPK,IHM[655.3735]SGPK,,,6.704194,,6.833744,sp|Q09666|AHNK_HUMAN,...,Q09666_M5029,0,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,5027,7,IH,2,5029,PEISVGGKGKKSKFKMPKIH,SGPKIKAKKQGFDLNVPGGE


Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,Site,Hyperreactive,Complete Sequence,Peptide Location,Peptide Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
2,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,O00487_M166,1,MDRLLRLGGGMPGLGQGPPTDAPAVDTAEQVYISSLALLKMLKHGR...,161,14,LINAN,5,166,PIQSVKGKVVIDAFRLINAN,MVLGHEPRQTTSNLGHLNKP
3,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,P52272_M436,1,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,436,7,,0,436,ERMGAGLGHGMDRVGSEIER,GLVMDRMGSVERMGSGIERM
4,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,Q15435_M343,1,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...,342,9,V,1,343,ETVYLERNPLQKDPQYRRKV,LALPSVRQIDATFVRF
5,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,Q15233_M362,1,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,356,9,RQQEEM,6,362,RQEEERRRREEEMRRQQEEM,RRQQEGFKGTFPDAREQEIR
6,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,Q8WUH6_M33,1,MNQTDKNQQEIPSYLNDEPPEGSMKDHPQQQPGMLSRVTGGIFSVT...,25,12,DHPQQQPG,8,33,YLNDEPPEGSMKDHPQQQPG,LSRVTGGIFSVTKGAVGATI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,P11940_M572,0,MNPSAPSYPMASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDM...,565,15,LFPLIQA,7,572,APPQEQKQMLGERLFPLIQA,HPTLAGKITGMLLEIDNSEL
45,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,P63261_M46,0,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,39,11,HQGVMVG,7,46,PRAVFPSIVGRPRHQGVMVG,GQKDSYVGDEAQSKRGILTL
46,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,P83731_M126,0,MKVELCSFSGYKIYPGHGRRYARTDGKVFQFLNAKCESAFLSKRNP...,124,7,TA,2,126,QAIRAAKEAKKAKQASKKTA,AAAKAPTKAAPKQKIVKPVK
47,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,P05783_M83,0,MSFTTRSTFSTNYRSLGSVQAPSYGARPVSSAASVYAGAGGSGSRI...,81,9,ET,2,83,ATGIAGGLAGMGGIQNEKET,QSLNDRLASYLDRVRSLETE


In [16]:
# Recompute the UniProt IDs to use from the current peptide table
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['O00487' 'P52272' 'Q15435' 'Q15233' 'Q8WUH6' 'Q9BQ04' 'P35579' 'Q14683'
 'P60228' 'Q9H694' 'O60610' 'Q9H4G0' 'P10809' 'P46926' 'P41227' 'P61158'
 'Q9H444' 'Q86UP2' 'P49915' 'P18621' 'P67870' 'Q13813' 'P62258' 'Q01518'
 'Q86V81' 'Q14152' 'P53999' 'Q13283' 'P26038' 'P61247' 'P55072' 'Q16181'
 'P15311' 'P18669' 'P62805' 'P22626' 'P11940' 'P63261' 'P83731' 'P05783']
Number of Unique UniProt IDs: 40


In [17]:
# Fetch AlphaFold cif data for the selected proteins into cif_dir
# SLOW THE FIRST TIME - the relevant cif data is cached after that
cif_download_summary = download_alphafold_cif(proteins=unique_uniprotIDs, out_folder=cif_dir)
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = cif_download_summary

100%|██████████| 40/40 [00:00<00:00, 28306.42it/s]

2025-03-24 14:40:35> Valid proteins: 0
2025-03-24 14:40:35> Invalid proteins: 0
2025-03-24 14:40:35> Existing proteins: 40





In [18]:
# Fetch AlphaFold pae data for the selected proteins into pae_dir
# SLOW THE FIRST TIME - the relevant pae data is cached after that
pae_download_summary = download_alphafold_pae(proteins=unique_uniprotIDs, out_folder=pae_dir)
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = pae_download_summary

100%|██████████| 40/40 [00:00<00:00, 53261.00it/s]

2025-03-24 14:40:35> Valid proteins: 0
2025-03-24 14:40:35> Invalid proteins: 0
2025-03-24 14:40:35> Existing proteins: 40





### Calculate Accessibilities and Merge into Full Dataset - A549

In [19]:
# Radii handed to calculate_accessibilities
radii = [
    2, 3, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8,
    12, 18, 24,
]
smooth_accessibilities = calculate_accessibilities(cif_dir, pae_dir, unique_uniprotIDs, radii)
# zero-index the positions to match initial dataframe
smooth_accessibilities["position"] = smooth_accessibilities["position"].sub(1)
smooth_accessibilities

100%|██████████| 1984/1984 [00:03<00:00, 556.33it/s] 
100%|██████████| 40/40 [00:01<00:00, 24.17it/s]
100%|██████████| 40/40 [00:00<00:00, 97.81it/s] 
100%|██████████| 40/40 [00:00<00:00, 100.00it/s]
100%|██████████| 40/40 [00:00<00:00, 68.30it/s]
100%|██████████| 40/40 [00:00<00:00, 96.36it/s] 
100%|██████████| 40/40 [00:00<00:00, 92.88it/s]
100%|██████████| 40/40 [00:00<00:00, 93.86it/s]
100%|██████████| 40/40 [00:00<00:00, 96.11it/s] 
100%|██████████| 40/40 [00:00<00:00, 90.02it/s]
100%|██████████| 40/40 [00:00<00:00, 88.26it/s]
100%|██████████| 40/40 [00:00<00:00, 92.67it/s]
100%|██████████| 40/40 [00:00<00:00, 82.40it/s]
100%|██████████| 40/40 [00:00<00:00, 70.49it/s]
100%|██████████| 40/40 [00:00<00:00, 58.70it/s]
100%|██████████| 40/40 [00:00<00:00, 252.89it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,O00487,1,M,0,42.66,74.318,75.707,76.187,75.708,-15.972,...,1.181818,1.909091,1.909091,1.909091,1.909091,1.909091,3.636364,6.181818,8.181818,1
1,O00487,1,D,1,35.66,73.558,74.180,75.681,73.817,-14.838,...,1.250000,1.916667,1.916667,1.916667,1.916667,1.916667,3.666667,6.166667,8.250000,1
2,O00487,1,R,2,40.58,70.267,71.774,72.090,72.286,-13.908,...,1.230769,1.923077,1.923077,1.923077,1.923077,1.923077,3.692308,6.230769,8.307692,1
3,O00487,1,L,3,45.02,67.830,68.489,67.564,69.862,-11.412,...,1.214286,1.928571,1.928571,1.928571,1.928571,1.928571,3.642857,6.214286,8.285714,1
4,O00487,1,L,4,44.63,68.422,68.193,69.059,68.614,-8.472,...,1.200000,1.933333,1.933333,1.933333,1.933333,1.933333,3.666667,6.200000,8.333333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,Q9H694,40,V,969,37.87,8.571,9.527,10.159,8.796,0.576,...,1.400000,1.933333,1.933333,1.933333,1.933333,1.933333,3.800000,6.466667,9.066667,1
970,Q9H694,40,S,970,36.09,7.259,8.703,9.652,9.070,3.555,...,1.357143,1.928571,1.928571,1.928571,1.928571,1.928571,3.785714,6.428571,9.000000,1
971,Q9H694,40,G,971,39.87,6.657,7.155,,6.684,6.355,...,1.384615,1.923077,1.923077,1.923077,1.923077,1.923077,3.769231,6.384615,8.923077,1
972,Q9H694,40,R,972,36.39,5.515,6.927,7.919,6.991,8.631,...,1.416667,1.916667,1.916667,1.916667,1.916667,1.916667,3.750000,6.333333,8.833333,1


In [20]:
# Attach the per-residue AlphaFold features to each peptide by matching
# (UniProt ID, site index) against (protein_id, position). The left join keeps
# peptides that have no matching residue row; their feature columns are NaN.
peptides = pd.merge(
    left=peptides,
    right=smooth_accessibilities,
    how="left",
    left_on=["Protein ID", f"{amino_acid_str} Location"],
    right_on=["protein_id", "position"],
)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,1.714286,2.095238,2.238095,2.619048,3.047619,3.761905,10.142857,30.904762,58.000000,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,2.142857,2.285714,3.571429,5.619048,6.952381,7.333333,19.285714,47.238095,78.476190,0
3,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,0.857143,1.380952,1.380952,2.333333,2.952381,3.952381,6.952381,13.238095,18.142857,1
4,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.809524,5.047619,6.000000,6.809524,15.285714,40.000000,61.428571,0
40,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,1.476190,2.238095,2.523810,2.952381,3.571429,4.095238,9.761905,25.285714,46.380952,0
41,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
42,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1


In [21]:
# Sanity Check: ensure UniProt and AlphaFold sequences are the same
# dropna=False keeps a NaN bucket in the counts: the preceding left merge leaves
# "AA" empty for any peptide with no matching AlphaFold residue, and the default
# value_counts() would silently hide those rows.

peptides["AA"].value_counts(dropna=False)
#pd.set_option("display.max_columns", None)
#display(peptides[~(peptides["AA"] == amino_acid)])
#pd.reset_option("display.max_columns")
#peptides = peptides[(peptides["AA"] == amino_acid)]

AA
M    44
Name: count, dtype: int64

In [22]:
# TODO: select some necessary subset of these columns to store

In [23]:
#peptides.to_csv(os.path.join(curr_dir_path, "A549_processed.csv"))

In [24]:
# Reload the processed A549 table from disk.
# NOTE(review): the `to_csv` call in the preceding cell is commented out, so this
# reads whatever "A549_processed.csv" already exists rather than the frame built
# above -- confirm the file is up to date.
path = os.path.join(curr_dir_path, "A549_processed.csv")
# The saved index comes back as an "Unnamed: 0" column; restore it as an unnamed index
peptides = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,1.714286,2.095238,2.238095,2.619048,3.047619,3.761905,10.142857,30.904762,58.000000,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,2.142857,2.285714,3.571429,5.619048,6.952381,7.333333,19.285714,47.238095,78.476190,0
3,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,0.857143,1.380952,1.380952,2.333333,2.952381,3.952381,6.952381,13.238095,18.142857,1
4,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.809524,5.047619,6.000000,6.809524,15.285714,40.000000,61.428571,0
40,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,1.476190,2.238095,2.523810,2.952381,3.571429,4.095238,9.761905,25.285714,46.380952,0
41,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
42,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1
