In [1]:
from training_utils import *

import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

### Set Parameters of Analysis

In [2]:
amino_acid = "M"
amino_acid_str = "Methionine"
analysis_threshold = 20 # number of amino acids either side to analyze
modifications = ["649.3660", "655.3735"] # which modifications we are looking for, as strings
heavy_modification = "655.3735"
light_modification = "649.3660"

In [3]:
curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

datasets_path_str = "../datasets"
datasets_path = os.path.abspath(datasets_path_str)

global_data_path_str = "../../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("Datasets Directory: " + datasets_path)
print("Global Data Directory: " + global_data_path)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy
Datasets Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/datasets
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
alphafold_path_str = "../../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print("AlphaFold Directory: " + alphafold_path)
print("CIF Directory: " + cif_dir)
print("PAE Directory: " + pae_dir)

AlphaFold Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
CIF Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
PAE Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


## A549

In [5]:
# Load initial processed dataset (includes adjacent sequences, AlphaFold accessibilities, etc.)
pd.set_option("display.max_rows", 25)
path = os.path.join(curr_dir_path, "A549_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0")
peptides.index.name = None
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,1.714286,2.095238,2.238095,2.619048,3.047619,3.761905,10.142857,30.904762,58.000000,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,2.142857,2.285714,3.571429,5.619048,6.952381,7.333333,19.285714,47.238095,78.476190,0
3,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,0.857143,1.380952,1.380952,2.333333,2.952381,3.952381,6.952381,13.238095,18.142857,1
4,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.809524,5.047619,6.000000,6.809524,15.285714,40.000000,61.428571,0
40,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,1.476190,2.238095,2.523810,2.952381,3.571429,4.095238,9.761905,25.285714,46.380952,0
41,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
42,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1


In [6]:
# Filter out duplicated Met sites
peptides = peptides.drop_duplicates(subset=["Site", "Hyperreactive"], keep="first")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,1.714286,2.095238,2.238095,2.619048,3.047619,3.761905,10.142857,30.904762,58.000000,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,2.142857,2.285714,3.571429,5.619048,6.952381,7.333333,19.285714,47.238095,78.476190,0
3,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,0.857143,1.380952,1.380952,2.333333,2.952381,3.952381,6.952381,13.238095,18.142857,1
4,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,QEMQEVQSSR,QEMQEVQSSR,QEM[649.3660]QEVQSSR,QEM[655.3735]QEVQSSR,8.652682,8.353068,8.195150,8.408691,8.504804,sp|P22626|ROA2_HUMAN,...,1.476190,2.047619,2.333333,2.952381,3.904762,4.476190,10.761905,25.285714,45.571429,0
39,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.809524,5.047619,6.000000,6.809524,15.285714,40.000000,61.428571,0
40,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,1.476190,2.238095,2.523810,2.952381,3.571429,4.095238,9.761905,25.285714,46.380952,0
41,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1


In [7]:
# Sanity Check: ensure all Met sites are unique
peptides["Site"].is_unique

True

In [8]:
# TODO: BLAST search to retrieve homologous peptides

In [9]:
# Convert processed dataset to FASTA format
fasta_path = os.path.join(curr_dir_path, "A549_training.fasta")
dataset_to_fasta(peptides, fasta_path)

Wrote 26 new peptides to /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy/A549_training.fasta.


In [10]:
# Run CKSAAP on FASTA file
fasta_path = os.path.join(curr_dir_path, "A549_training.fasta")
A549_CKSAAP = CKSAAP(fasta_path, k=5)
A549_CKSAAP

File imported successfully.


Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,O00487_M166,1,0.0,0.0,0.0,0.000,0.025,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
1,P52272_M436,1,0.0,0.0,0.0,0.000,0.000,0.025,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,Q15435_M343,1,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3,Q15233_M362,1,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
4,Q8WUH6_M33,1,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.000,...,0.0,0.0,0.028571,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,P22626_M192,0,0.0,0.0,0.0,0.025,0.000,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
22,P11940_M572,0,0.0,0.0,0.0,0.000,0.000,0.025,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
23,P63261_M46,0,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.028571,0.0,0.000000,0.0,0.0,0.0,0.0
24,P83731_M126,0,0.1,0.0,0.0,0.000,0.000,0.000,0.0,0.025,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


## HCT116

In [11]:
# Load initial processed dataset (includes adjacent sequences, AlphaFold accessibilities, etc.)
path = os.path.join(curr_dir_path, "HCT116_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0")
peptides.index.name = None
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,...,1.761905,2.047619,3.952381,4.761905,6.095238,6.285714,9.523810,15.857143,20.190476,1
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,...,1.047619,2.000000,2.000000,2.000000,2.047619,2.047619,4.761905,8.238095,12.809524,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,...,1.285714,2.000000,2.000000,2.000000,2.047619,2.047619,5.142857,8.095238,11.142857,1
101,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,...,1.476190,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,7.000000,10.000000,1
102,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,...,1.095238,2.000000,2.000000,2.000000,2.000000,2.000000,5.761905,9.619048,12.285714,1
103,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0


In [12]:
# Filter out duplicated Met sites
peptides = peptides.drop_duplicates(subset=["Site", "Hyperreactive"], keep="first")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,-2.112572,-2.301838,-2.049472,-2.460635,-2.802424,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
1,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,-2.362646,,,-2.136884,-1.407531,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
2,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-1.039815,-0.902895,,,-1.159599,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
3,QMMEAATR,QMM[15.9949]EAATR,QM[649.3660]M[15.9949]EAATR,QM[655.3735]M[15.9949]EAATR,,,-0.908366,-0.743273,-0.549589,sp|O43395|PRPF3_HUMAN,...,1.761905,2.047619,3.952381,4.761905,6.095238,6.285714,9.523810,15.857143,20.190476,1
4,AVSAVKNMNLPEIPR,AVSAVKNMNLPEIPR,AVSAVKNM[649.3660]NLPEIPR,AVSAVKNM[655.3735]NLPEIPR,,-0.584995,-0.473381,,-0.601462,sp|Q92572|AP3S1_HUMAN,...,1.047619,2.000000,2.000000,2.000000,2.047619,2.047619,4.761905,8.238095,12.809524,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,MDSRGEHRQDR,MDSRGEHRQDR,M[649.3660]DSRGEHRQDR,M[655.3735]DSRGEHRQDR,8.043113,,,8.179892,,sp|P35637|FUS_HUMAN,...,1.285714,2.000000,2.000000,2.000000,2.047619,2.047619,5.142857,8.095238,11.142857,1
101,MGANSLER,MGANSLER,M[649.3660]GANSLER,M[655.3735]GANSLER,8.274117,8.592321,,8.789749,,sp|P52272|HNRPM_HUMAN,...,1.476190,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,7.000000,10.000000,1
102,MNQGTAR,MNQGTAR,M[649.3660]NQGTAR,M[655.3735]NQGTAR,,9.011247,,,8.254092,sp|P43243|MATR3_HUMAN,...,1.095238,2.000000,2.000000,2.000000,2.000000,2.000000,5.761905,9.619048,12.285714,1
103,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,,8.964193,8.405923,,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0


In [13]:
# Sanity Check: ensure all Met sites are unique
peptides["Site"].is_unique

True

In [14]:
# TODO: BLAST search to retrieve homologous peptides

In [15]:
# Convert processed dataset to FASTA format
fasta_path = os.path.join(curr_dir_path, "HCT116_training.fasta")
dataset_to_fasta(peptides, fasta_path)

Wrote 64 new peptides to /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy/HCT116_training.fasta.


In [16]:
# Run CKSAAP on FASTA file
fasta_path = os.path.join(curr_dir_path, "HCT116_training.fasta")
HCT116_CKSAAP = CKSAAP(fasta_path, k=5)
HCT116_CKSAAP

File imported successfully.


Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,O94979_M822,1,0.000,0.0,0.0,0.0,0.0,0.025,0.0,0.0,...,0.0,0.028571,0.057143,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,P35579_M1564,1,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P52272_M436,1,0.000,0.0,0.0,0.0,0.0,0.025,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,O43395_M144,1,0.025,0.0,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Q92572_M167,1,0.000,0.0,0.0,0.0,0.0,0.050,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,P35637_M510,0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,P52272_M570,0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,P43243_M44,0,0.050,0.0,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,P11940_M583,0,0.000,0.0,0.0,0.0,0.0,0.025,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Hela

In [17]:
# Load initial processed dataset (includes adjacent sequences, AlphaFold accessibilities, etc.)
path = os.path.join(curr_dir_path, "Hela_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0")
peptides.index.name = None
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,...,1.523810,2.000000,2.000000,2.047619,2.476190,3.619048,7.714286,12.857143,17.428571,1
3,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,...,2.000000,2.142857,5.142857,6.285714,8.000000,8.000000,13.952381,20.190476,28.000000,1
4,NKHEAMITDLEER,NKHEAMITDLEER,NKHEAM[649.3660]ITDLEER,NKHEAM[655.3735]ITDLEER,11979429.0,1.277815,1.576447,1.391734,,0.748684,...,1.523810,2.285714,2.904762,3.904762,5.476190,6.571429,12.380952,20.333333,27.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,877354.7,7.774908,8.821908,9.482570,,8.929387,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
111,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.333333,9.952381,1
112,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
113,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,...,2.285714,2.809524,3.904762,5.571429,7.333333,9.047619,24.619048,72.095238,124.761905,0


In [18]:
# Filter out duplicated Met sites
peptides = peptides.drop_duplicates(subset=["Site", "Hyperreactive"], keep="first")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Light Intensity,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,,,-5.353853,-5.179115,-5.170143,,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,221884784.0,-4.886254,-4.943996,-5.015291,-4.814711,-4.866699,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VDLVLMHWR,VDLVLMHWR,VDLVLM[649.3660]HWR,VDLVLM[655.3735]HWR,33654592.0,-3.070272,-3.355457,,,,...,1.523810,2.000000,2.000000,2.047619,2.476190,3.619048,7.714286,12.857143,17.428571,1
3,KLMQLQHEK,KLMQLQHEK,KLM[649.3660]QLQHEK,KLM[655.3735]QLQHEK,2891831.2,1.070343,1.083406,,,,...,2.000000,2.142857,5.142857,6.285714,8.000000,8.000000,13.952381,20.190476,28.000000,1
4,NKHEAMITDLEER,NKHEAMITDLEER,NKHEAM[649.3660]ITDLEER,NKHEAM[655.3735]ITDLEER,11979429.0,1.277815,1.576447,1.391734,,0.748684,...,1.523810,2.285714,2.904762,3.904762,5.476190,6.571429,12.380952,20.333333,27.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,877354.7,7.774908,8.821908,9.482570,,8.929387,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
111,SGSMDPSGAHPSVR,SGSMDPSGAHPSVR,SGSM[649.3660]DPSGAHPSVR,SGSM[655.3735]DPSGAHPSVR,831407.1,9.290080,8.773245,8.012736,9.211272,,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.333333,9.952381,1
112,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,675726.2,9.035667,,,9.444678,,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
113,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGMVVTFAPVNVTTEVK,VETGVLKPGM[649.3660]VVTFAPVNVTTEVK,VETGVLKPGM[655.3735]VVTFAPVNVTTEVK,578260.0,10.074934,,9.966146,,9.453065,...,2.285714,2.809524,3.904762,5.571429,7.333333,9.047619,24.619048,72.095238,124.761905,0


In [19]:
# Sanity Check: ensure all Met sites are unique
peptides["Site"].is_unique

True

In [20]:
# TODO: BLAST search to retrieve homologous peptides

In [21]:
# Convert processed dataset to FASTA format
fasta_path = os.path.join(curr_dir_path, "Hela_training.fasta")
dataset_to_fasta(peptides, fasta_path)

Wrote 66 new peptides to /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy/Hela_training.fasta.


In [22]:
# Run CKSAAP on FASTA file
fasta_path = os.path.join(curr_dir_path, "Hela_training.fasta")
Hela_CKSAAP = CKSAAP(fasta_path, k=5)
Hela_CKSAAP

File imported successfully.


Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,P51532_M1232,1,0.050,0.0,0.0,0.000,0.025,0.025,0.000,0.025,...,0.0,0.0,0.0,0.028571,0.0,0.000000,0.0,0.0,0.0,0.0
1,P52272_M436,1,0.000,0.0,0.0,0.000,0.000,0.025,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,O75165_M723,1,0.025,0.0,0.0,0.000,0.000,0.025,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3,Q16204_M149,1,0.000,0.0,0.0,0.025,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
4,P35579_M1027,1,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,P11142_M92,0,0.000,0.0,0.0,0.000,0.000,0.025,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.028571,0.0,0.0,0.0,0.0
62,Q07666_M20,0,0.025,0.0,0.0,0.000,0.000,0.000,0.025,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
63,P83731_M126,0,0.100,0.0,0.0,0.000,0.000,0.000,0.000,0.025,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
64,P68104_M275,0,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


## Jurkat

In [23]:
# Load initial processed dataset (includes adjacent sequences, AlphaFold accessibilities, etc.)
path = os.path.join(curr_dir_path, "Jurkat_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0")
peptides.index.name = None
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,...,2.095238,2.190476,3.333333,4.190476,5.809524,6.952381,16.095238,39.714286,65.952381,0
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,...,2.000000,2.333333,4.952381,6.333333,8.000000,8.000000,13.000000,20.238095,27.619048,1
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,...,1.095238,1.857143,1.857143,2.000000,2.000000,2.000000,3.857143,6.523810,9.666667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0
111,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1
112,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
113,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,...,2.000000,2.476190,4.619048,5.476190,6.809524,7.238095,12.619048,32.523810,64.285714,0


In [24]:
# Filter out duplicated Met sites
peptides = peptides.drop_duplicates(subset=["Site", "Hyperreactive"], keep="first")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,MNGMLLNDRK,M[15.9949]NGMLLNDRK,M[15.9949]NGM[649.3660]LLNDRK,M[15.9949]NGM[655.3735]LLNDRK,-4.052345,,,,-4.383824,sp|P11940|PABP1_HUMAN,...,2.095238,2.190476,3.333333,4.190476,5.809524,6.952381,16.095238,39.714286,65.952381,0
1,QRLQEDEMR,QRLQEDEMR,QRLQEDEM[649.3660]R,QRLQEDEM[655.3735]R,,-2.303070,,,-2.156620,sp|Q04323|UBXN1_HUMAN,...,2.000000,2.333333,4.952381,6.333333,8.000000,8.000000,13.000000,20.238095,27.619048,1
2,GRPGPVAGHHQMPR,GRPGPVAGHHQMPR,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,-0.847024,,-0.934536,-0.727634,sp|O94979|SC31A_HUMAN,...,1.333333,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,1
3,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,,,-0.666811,-0.821128,-0.462431,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
4,NSLYDMAR,NSLYDMAR,NSLYDM[649.3660]AR,NSLYDM[655.3735]AR,0.728052,0.744754,0.880054,,0.838652,sp|Q9BWF3|RBM4_HUMAN,...,1.095238,1.857143,1.857143,2.000000,2.000000,2.000000,3.857143,6.523810,9.666667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,ITGMLLEIDNSELLHMLESPESLR,ITGMLLEIDNSELLHMLESPESLR,ITGM[649.3660]LLEIDNSELLHMLESPESLR,ITGM[655.3735]LLEIDNSELLHMLESPESLR,,8.583307,,,8.001761,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.714286,4.285714,5.285714,6.238095,14.285714,38.523810,60.809524,0
111,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,8.742243,8.575472,8.085437,8.208840,8.209045,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1
112,HWPFMVVNDAGRPK,HWPFMVVNDAGRPK,HWPFM[649.3660]VVNDAGRPK,HWPFM[655.3735]VVNDAGRPK,9.709745,7.906998,7.789936,8.454630,8.280280,sp|P11142|HSP7C_HUMAN,...,2.047619,2.333333,3.904762,5.095238,6.238095,7.571429,18.619048,56.476190,115.714286,0
113,RKPDTIEVQQMK,RKPDTIEVQQMK,RKPDTIEVQQM[649.3660]K,RKPDTIEVQQM[655.3735]K,9.650006,,,,9.027199,sp|P26038|MOES_HUMAN,...,2.000000,2.476190,4.619048,5.476190,6.809524,7.238095,12.619048,32.523810,64.285714,0


In [25]:
# Sanity Check: ensure all Met sites are unique
peptides["Site"].is_unique

True

In [26]:
# TODO: BLAST search to retrieve homologous peptides

In [27]:
# Convert processed dataset to FASTA format
fasta_path = os.path.join(curr_dir_path, "Jurkat_training.fasta")
dataset_to_fasta(peptides, fasta_path)

Wrote 65 new peptides to /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy/Jurkat_training.fasta.


In [28]:
# Run CKSAAP on FASTA file
fasta_path = os.path.join(curr_dir_path, "Jurkat_training.fasta")
Jurkat_CKSAAP = CKSAAP(fasta_path, k=5)
Jurkat_CKSAAP

File imported successfully.


Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,P11940_M160,1,0.025000,0.0,0.000000,0.025,0.000000,0.000,0.0,0.025,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000
1,Q04323_M142,1,0.075000,0.0,0.000000,0.050,0.000000,0.000,0.0,0.000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000
2,O94979_M822,1,0.000000,0.0,0.000000,0.000,0.000000,0.025,0.0,0.000,...,0.0,0.028571,0.057143,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000
3,Q8WUH6_M33,1,0.000000,0.0,0.000000,0.000,0.000000,0.000,0.0,0.000,...,0.0,0.000000,0.028571,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000
4,Q9BWF3_M346,1,0.054054,0.0,0.027027,0.000,0.027027,0.000,0.0,0.000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.03125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,P11940_M583,0,0.000000,0.0,0.000000,0.000,0.000000,0.025,0.0,0.000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000
61,Q14152_M961,0,0.000000,0.0,0.000000,0.000,0.000000,0.000,0.0,0.000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000
62,P11142_M92,0,0.000000,0.0,0.000000,0.000,0.000000,0.025,0.0,0.000,...,0.0,0.000000,0.000000,0.000000,0.0,0.028571,0.0,0.0,0.0,0.00000
63,P26038_M304,0,0.000000,0.0,0.000000,0.000,0.000000,0.000,0.0,0.000,...,0.0,0.000000,0.028571,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000


## K562

In [29]:
# Load initial processed dataset (includes adjacent sequences, AlphaFold accessibilities, etc.)
path = os.path.join(curr_dir_path, "K562_processed.csv")
peptides = pd.read_csv(path).set_index("Unnamed: 0")
peptides.index.name = None
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,...,1.428571,2.000000,2.000000,2.000000,2.000000,2.000000,4.142857,7.904762,15.380952,1
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,...,1.285714,2.047619,2.476190,2.761905,3.000000,3.428571,7.809524,12.761905,17.571429,1
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,...,2.000000,2.190476,4.857143,6.238095,8.000000,8.095238,15.095238,30.380952,44.619048,0
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,...,1.904762,2.000000,2.809524,3.380952,4.000000,4.428571,8.904762,17.000000,23.714286,1
91,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1
92,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,...,1.952381,2.047619,3.000000,3.428571,4.285714,4.619048,13.285714,33.333333,55.523810,0
93,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1


In [30]:
# Filter out duplicated Met sites
peptides = peptides.drop_duplicates(subset=["Site", "Hyperreactive"], keep="first")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,VIQAGMFDQK,VIQAGMFDQK,VIQAGM[649.3660]FDQK,VIQAGM[655.3735]FDQK,-4.945349,-4.818352,,-4.786624,,sp|P51532|SMCA4_HUMAN,...,1.714286,2.047619,2.666667,3.142857,3.857143,4.238095,8.095238,25.190476,62.666667,0
1,LNFDMTASPK,LNFDMTASPK,LNFDM[649.3660]TASPK,LNFDM[655.3735]TASPK,-3.428915,,,,-3.174253,sp|Q9ULU4|ZMYD8_HUMAN,...,1.428571,2.000000,2.000000,2.000000,2.000000,2.000000,4.142857,7.904762,15.380952,1
2,MALVADEQQR,MALVADEQQR,M[649.3660]ALVADEQQR,M[655.3735]ALVADEQQR,-2.404106,-2.242554,,,,sp|Q99856|ARI3A_HUMAN,...,1.285714,2.047619,2.476190,2.761905,3.000000,3.428571,7.809524,12.761905,17.571429,1
3,SAFNIMSAER,SAFNIMSAER,SAFNIM[649.3660]SAER,SAFNIM[655.3735]SAER,,-1.907930,,-1.772328,-2.423630,sp|Q9H4L5|OSBL3_HUMAN,...,2.000000,2.190476,4.857143,6.238095,8.000000,8.095238,15.095238,30.380952,44.619048,0
4,LRLEVNLQAMK,LRLEVNLQAMK,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,-1.377457,-1.878490,,-1.988167,sp|P35579|MYH9_HUMAN,...,2.000000,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,AITGASLADIMAK,AITGASLADIMAK,AITGASLADIM[649.3660]AK,AITGASLADIM[655.3735]AK,10.017999,,9.173403,9.037141,8.539698,sp|P83731|RL24_HUMAN,...,1.904762,2.000000,2.809524,3.380952,4.000000,4.428571,8.904762,17.000000,23.714286,1
91,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,,9.341180,9.059947,,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1
92,KEAPPMEKPEVVK,KEAPPMEKPEVVK,KEAPPM[649.3660]EKPEVVK,KEAPPM[655.3735]EKPEVVK,9.790663,,9.226812,,,sp|P62841|RS15_HUMAN,...,1.952381,2.047619,3.000000,3.428571,4.285714,4.619048,13.285714,33.333333,55.523810,0
93,RGMDDDRGPR,RGMDDDRGPR,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,9.708863,,9.621413,9.558118,,sp|Q14152|EIF3A_HUMAN,...,1.619048,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.857143,10.000000,1


In [31]:
# Sanity Check: ensure all Met sites are unique
peptides["Site"].is_unique

True

In [32]:
# TODO: BLAST search to retrieve homologous peptides

In [33]:
# Convert processed dataset to FASTA format
fasta_path = os.path.join(curr_dir_path, "K562_training.fasta")
dataset_to_fasta(peptides, fasta_path)

Wrote 58 new peptides to /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy/K562_training.fasta.


In [34]:
# Run CKSAAP on FASTA file
fasta_path = os.path.join(curr_dir_path, "K562_training.fasta")
K562_CKSAAP = CKSAAP(fasta_path, k=5)
K562_CKSAAP

File imported successfully.


Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,P51532_M1232,1,0.05,0.0,0.000,0.000,0.025,0.025,0.000,0.025,...,0.0,0.0,0.0,0.028571,0.0,0.000000,0.0,0.0,0.0,0.0
1,Q9ULU4_M402,1,0.00,0.0,0.000,0.000,0.000,0.025,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,Q99856_M462,1,0.05,0.0,0.025,0.025,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3,Q9H4L5_M353,1,0.00,0.0,0.000,0.025,0.025,0.000,0.025,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
4,P35579_M1564,1,0.00,0.0,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,P83731_M90,0,0.00,0.0,0.025,0.000,0.000,0.000,0.000,0.050,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
54,P05783_M83,0,0.00,0.0,0.000,0.000,0.000,0.050,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.028571,0.0,0.0,0.0,0.0
55,P62841_M69,0,0.00,0.0,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
56,Q14152_M961,0,0.00,0.0,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


## Combine Datasets

In [35]:
all_feature_sets = [A549_CKSAAP, HCT116_CKSAAP, Hela_CKSAAP, Jurkat_CKSAAP, K562_CKSAAP]
training_df = pd.concat(all_feature_sets)
training_df

Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,O00487_M166,1,0.0,0.0,0.000,0.0,0.025,0.000,0.0,0.00,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,P52272_M436,1,0.0,0.0,0.000,0.0,0.000,0.025,0.0,0.00,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,Q15435_M343,1,0.0,0.0,0.000,0.0,0.000,0.000,0.0,0.00,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,Q15233_M362,1,0.0,0.0,0.000,0.0,0.000,0.000,0.0,0.00,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,Q8WUH6_M33,1,0.0,0.0,0.000,0.0,0.000,0.000,0.0,0.00,...,0.0,0.0,0.028571,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,P83731_M90,0,0.0,0.0,0.025,0.0,0.000,0.000,0.0,0.05,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
54,P05783_M83,0,0.0,0.0,0.000,0.0,0.000,0.050,0.0,0.00,...,0.0,0.0,0.000000,0.0,0.0,0.028571,0.0,0.0,0.0,0.0
55,P62841_M69,0,0.0,0.0,0.000,0.0,0.000,0.000,0.0,0.00,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
56,Q14152_M961,0,0.0,0.0,0.000,0.0,0.000,0.000,0.0,0.00,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [36]:
# Filter out duplicated Met sites (across datasets)
training_df = training_df.drop_duplicates(subset=["Site", "Hyperreactivity Label"], keep="first")
training_df

Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,O00487_M166,1,0.000000,0.0,0.0,0.0,0.025,0.000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
1,P52272_M436,1,0.000000,0.0,0.0,0.0,0.000,0.025,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
2,Q15435_M343,1,0.000000,0.0,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
3,Q15233_M362,1,0.000000,0.0,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
4,Q8WUH6_M33,1,0.000000,0.0,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.028571,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,P67809_M197,0,0.000000,0.0,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.028571,0.028571,0.000000,0.0,0.0,0.0,0.0,0.057143
39,P46109_M171,0,0.000000,0.0,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.028571,0.0,0.0,0.0,0.0,0.000000
40,P22061_M208,0,0.000000,0.0,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
42,P15259_M242,0,0.033333,0.0,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000


In [37]:
# Sanity Check: ensure all Met sites are unique
training_df["Site"].is_unique

True

In [38]:
training_df["Hyperreactivity Label"].value_counts()

Hyperreactivity Label
0    131
1     34
Name: count, dtype: int64