In [1]:
# NOTE(review): star import hides where names come from. `dataset_to_fasta` and
# `CKSAAP`, called later in this notebook, are not defined here and presumably
# come from `training_utils` — consider importing them explicitly once the full
# list of required names is confirmed.
from training_utils import *

import os
import warnings

import numpy as np
import pandas as pd

# Silences every warning category for the whole kernel session, including
# deprecation and dtype warnings from pandas/numpy.
# NOTE(review): consider narrowing this to the specific category or module that
# is known to be noisy, so genuine problems are not hidden.
warnings.filterwarnings("ignore")

### Set Parameters of Analysis

In [2]:
# Residue under study: one-letter code and human-readable name.
amino_acid = "M"
amino_acid_str = "Methionine"

# Number of amino acids on either side of the site to analyze.
analysis_threshold = 20

# Modification tags, kept as strings (not floats) so trailing zeros survive;
# they appear in bracketed form such as `M[649.3660]` in the modified-peptide
# columns of the processed dataset shown later in this notebook.
light_modification = "649.3660"
heavy_modification = "655.3735"

# Which modifications we are looking for, as strings. Built from the two named
# tags above (light first, then heavy) so the list cannot drift out of sync
# with them when a value is edited.
modifications = [light_modification, heavy_modification]

In [3]:
# Locations used by this notebook, written relative to the current working
# directory.
curr_dir_path_str = "./"
datasets_path_str = "../datasets"
global_data_path_str = "../../global_data"

# Resolve each relative location to an absolute path in a single pass.
curr_dir_path, datasets_path, global_data_path = (
    os.path.abspath(relative)
    for relative in (curr_dir_path_str, datasets_path_str, global_data_path_str)
)

# Echo the resolved paths so a misconfigured working directory is obvious.
print(f"Current Directory: {curr_dir_path}")
print(f"Datasets Directory: {datasets_path}")
print(f"Global Data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy
Datasets Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/datasets
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Root of the AlphaFold data, written relative to the current working directory
# and resolved to an absolute path.
alphafold_path_str = "../../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

# The "cif" and "pae" subdirectories sit directly under the AlphaFold root.
cif_dir, pae_dir = (
    os.path.join(alphafold_path, subdir) for subdir in ("cif", "pae")
)

# Echo the resolved paths so a misconfigured working directory is obvious.
print(f"AlphaFold Directory: {alphafold_path}")
print(f"CIF Directory: {cif_dir}")
print(f"PAE Directory: {pae_dir}")

AlphaFold Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
CIF Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
PAE Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


## A549

In [5]:
# Load the initial processed A549 dataset (includes adjacent sequences,
# AlphaFold accessibilities, etc.) from the current directory.
pd.set_option("display.max_rows", 25)  # cap the rows rendered when a frame is displayed
path = os.path.join(curr_dir_path, "A549_processed.csv")
A549_peptides = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")  # the CSV stores the saved row index under this header
    .rename_axis(None)        # drop the placeholder index name
)
A549_peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,exp_1 Log2 Ratio HL,exp_2 Log2 Ratio HL,exp_3 Log2 Ratio HL,exp_4 Log2 Ratio HL,exp_5 Log2 Ratio HL,Protein,...,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,LINANMMVLGHEPR,LINANMMVLGHEPR,LINANM[649.3660]MVLGHEPR,LINANM[655.3735]MVLGHEPR,-2.660741,-3.097204,-2.506075,-2.527523,,sp|O00487|PSDE_HUMAN,...,1.714286,2.095238,2.238095,2.619048,3.047619,3.761905,10.142857,30.904762,58.000000,0
1,MGLVMDR,MGLVMDR,M[649.3660]GLVMDR,M[655.3735]GLVMDR,-2.415405,-2.484789,-2.460022,-2.303696,-2.461816,sp|P52272|HNRPM_HUMAN,...,1.238095,2.000000,2.000000,2.000000,2.000000,2.000000,4.190476,7.333333,10.380952,1
2,VMLALPSVR,VMLALPSVR,VM[649.3660]LALPSVR,VM[655.3735]LALPSVR,0.508617,,0.545374,0.373343,0.140114,sp|Q15435|PP1R7_HUMAN,...,2.142857,2.285714,3.571429,5.619048,6.952381,7.333333,19.285714,47.238095,78.476190,0
3,RQQEEMMRR,RQQEEMMRR,RQQEEMM[649.3660]RR,RQQEEMM[655.3735]RR,,,1.089559,,1.211582,sp|Q15233|NONO_HUMAN,...,0.857143,1.380952,1.380952,2.333333,2.952381,3.952381,6.952381,13.238095,18.142857,1
4,DHPQQQPGMLSR,DHPQQQPGMLSR,DHPQQQPGM[649.3660]LSR,DHPQQQPGM[655.3735]LSR,2.059981,,1.444680,1.992700,2.121844,sp|Q8WUH6|TM263_HUMAN,...,0.238095,1.619048,1.619048,2.000000,2.000000,2.000000,3.476190,6.047619,9.190476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,LFPLIQAMHPTLAGK,LFPLIQAMHPTLAGK,LFPLIQAM[649.3660]HPTLAGK,LFPLIQAM[655.3735]HPTLAGK,,,,8.329680,8.527948,sp|P11940|PABP1_HUMAN,...,2.000000,2.142857,3.809524,5.047619,6.000000,6.809524,15.285714,40.000000,61.428571,0
40,HQGVMVGMGQK,HQGVMVGMGQK,HQGVMVGM[649.3660]GQK,HQGVMVGM[655.3735]GQK,9.078376,7.715460,8.621949,,,sp|P63261|ACTG_HUMAN,...,1.476190,2.238095,2.523810,2.952381,3.571429,4.095238,9.761905,25.285714,46.380952,0
41,TAMAAAK,TAMAAAK,TAM[649.3660]AAAK,TAM[655.3735]AAAK,,8.431435,,,9.687591,sp|P83731|RL24_HUMAN,...,0.714286,1.428571,1.714286,2.333333,3.000000,3.190476,6.619048,11.904762,16.142857,1
42,ETMQSLNDR,ETMQSLNDR,ETM[649.3660]QSLNDR,ETM[655.3735]QSLNDR,,9.407117,9.235583,,10.188223,sp|P05783|K1C18_HUMAN,...,0.857143,1.714286,2.047619,2.571429,3.333333,4.142857,7.571429,13.380952,18.809524,1


In [6]:
# TODO: BLAST search to retrieve homologous peptides

In [7]:
# Convert processed dataset to FASTA format
# `dataset_to_fasta` is not defined in this notebook; presumably it is provided
# by the `training_utils` star import. It is called with the peptide DataFrame
# and the destination path, and its return value is discarded.
# NOTE(review): the recorded run printed "Wrote 27 new peptides to ..."; the
# word "new" suggests the helper may skip or append to existing entries —
# confirm whether re-running this cell is idempotent.
fasta_path = os.path.join(curr_dir_path, "A549_training.fasta")
dataset_to_fasta(A549_peptides, fasta_path)

Wrote 27 new peptides to /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel/toy/A549_training.fasta.


In [8]:
# Run CKSAAP on FASTA file
# `CKSAAP` is not defined in this notebook; presumably it is provided by the
# `training_utils` star import. `fasta_path` is recomputed here (same value as
# in the FASTA-writing cell) so this cell does not rely on that cell's variable.
# NOTE(review): with k=5 the recorded output shows feature columns suffixed
# `.gap0` through `.gap5`, so k looks like the inclusive maximum gap — confirm
# against the helper's definition.
fasta_path = os.path.join(curr_dir_path, "A549_training.fasta")
A549_CKSAAP = CKSAAP(fasta_path, k=5)
# Bare last expression so the notebook renders the resulting feature table.
A549_CKSAAP

File imported successfully.


Unnamed: 0,Site,Hyperreactivity Label,CKSAAP_AA.gap0,CKSAAP_AC.gap0,CKSAAP_AD.gap0,CKSAAP_AE.gap0,CKSAAP_AF.gap0,CKSAAP_AG.gap0,CKSAAP_AH.gap0,CKSAAP_AI.gap0,...,CKSAAP_YM.gap5,CKSAAP_YN.gap5,CKSAAP_YP.gap5,CKSAAP_YQ.gap5,CKSAAP_YR.gap5,CKSAAP_YS.gap5,CKSAAP_YT.gap5,CKSAAP_YV.gap5,CKSAAP_YW.gap5,CKSAAP_YY.gap5
0,O00487_M166,1,0.0,0.0,0.0,0.0,0.025,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
1,P52272_M436,1,0.0,0.0,0.0,0.0,0.000,0.025,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,Q15435_M343,1,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3,Q15233_M362,1,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
4,Q8WUH6_M33,1,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.000,...,0.0,0.0,0.028571,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,P11940_M572,0,0.0,0.0,0.0,0.0,0.000,0.025,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
23,P63261_M46,0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.000,...,0.0,0.0,0.000000,0.028571,0.0,0.000000,0.0,0.0,0.0,0.0
24,P83731_M126,0,0.1,0.0,0.0,0.0,0.000,0.000,0.0,0.025,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
25,P05783_M83,0,0.0,0.0,0.0,0.0,0.000,0.050,0.0,0.000,...,0.0,0.0,0.000000,0.000000,0.0,0.028571,0.0,0.0,0.0,0.0
