## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

structuremap.utils.set_logger()

In [2]:
# Set parameters of analysis
analysis_threshold = 20 # number of amino acids either side to analyze

hyperreactivity_threshold = 3 # how many instances of a log H/L ratio < 1 across replicates to be considered hyperreactive

modifications = ["649.3660", "655.3735"] # which modifications we are looking for, as regex strings
heavy_modification = "655.3735" 
light_modification = "649.3660"

## Load Dataset - HyperreactivityModel Dataset #1

In [3]:
# Set correct pathing
curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("Global Data Directory: " + global_data_path)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/HyperreactivityModel
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "combined_modified_peptide_label_quant.tsv")
peptides = pd.read_csv(data_loc, delimiter="\t")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,K562_3 Heavy Match Type,K562_4 Heavy Match Type,K562_5 Heavy Match Type,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,MS/MS,MS/MS,MS/MS,sp|Q14498|RBM39_HUMAN,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,MS/MS,unmatched,unmatched,sp|Q03252|LMNB2_HUMAN,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,unmatched,unmatched,unmatched,sp|P06753|TPM3_HUMAN,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp..."
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,unmatched,unmatched,unmatched,sp|Q9UHV9|PFD2_HUMAN,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,unmatched,unmatched,unmatched,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,MS/MS,MS/MS,unmatched,sp|P62328|TYB4_HUMAN,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,MS/MS,MS/MS,MS/MS,sp|P63218|GBG5_HUMAN,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,unmatched,unmatched,unmatched,sp|Q8IYB7|DI3L2_HUMAN,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,unmatched,unmatched,unmatched,sp|P30622|CLIP1_HUMAN,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,


In [5]:
# Label hyperreactivity
ratio_df = peptides.filter(like='Log2 Ratio HL', axis=1)
ratio_df = ratio_df.fillna(999.999)
ratio_df

Unnamed: 0,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,A549_4 Log2 Ratio HL,A549_5 Log2 Ratio HL,HCT116_1 Log2 Ratio HL,HCT116_2 Log2 Ratio HL,HCT116_3 Log2 Ratio HL,HCT116_4 Log2 Ratio HL,HCT116_5 Log2 Ratio HL,...,Jurkat_1 Log2 Ratio HL,Jurkat_2 Log2 Ratio HL,Jurkat_3 Log2 Ratio HL,Jurkat_4 Log2 Ratio HL,Jurkat_5 Log2 Ratio HL,K562_1 Log2 Ratio HL,K562_2 Log2 Ratio HL,K562_3 Log2 Ratio HL,K562_4 Log2 Ratio HL,K562_5 Log2 Ratio HL
0,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
1,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
3,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
4,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2341,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2342,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2343,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999


In [6]:
num_hr_instances = ratio_df[ ratio_df < 1.0 ].count(axis=1) # hr means "hyperreactive")
print(num_hr_instances.value_counts())

peptides["hr_label"] = num_hr_instances >= 3
peptides["hr_label"] = peptides["hr_label"].astype(int)
print(peptides["hr_label"].value_counts())

peptides;

0     2256
1       43
2       15
4       10
5        7
3        7
6        2
12       1
8        1
14       1
13       1
7        1
Name: count, dtype: int64
hr_label
0    2314
1      31
Name: count, dtype: int64


In [7]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q14498' 'Q03252' 'P06753' 'Q9UHV9' 'P33176' 'O14497' 'Q9Y230' 'Q9H444'
 'Q9BRD0' 'Q5PRF9' 'Q16891' 'P46063' 'Q96PK6' 'P46459' 'Q96I24' 'P62258'
 'P09525' 'P07910' 'Q9Y2W2' 'P35222' 'P61970' 'P62333' 'P08243' 'P05787'
 'Q86XZ4' 'Q9P0K7' 'Q9Y2X7' 'P24928' 'Q13523' 'Q7L7X3' 'Q15149' 'Q9Y5Z4'
 'Q9UPN3' 'P51572' 'Q9NR30' 'P55265' 'Q9Y613' 'Q13769' 'Q15366' 'P26373'
 'Q5VZK9' 'P39023' 'P46013' 'Q14789' 'Q9NWH9' 'P14618' 'Q6UB99' 'Q8WUM0'
 'Q9UHI6' 'Q15050' 'Q13310' 'Q99848' 'Q8TB05' 'Q02543' 'P00558' 'Q96SI9'
 'Q13895' 'P07954' 'P15121' 'Q96T51' 'P54252' 'Q9Y618' 'Q8N6H7' 'Q9NP61'
 'P83731' 'P53396' 'Q92945' 'O60218' 'O43768' 'P55084' 'P49368' 'Q8WXF1'
 'P11940' 'P26447' 'P35579' 'Q15233' 'Q9H3P2' 'Q92614' 'P40222' 'Q9Y5B6'
 'Q15785' 'Q99615' 'P10809' 'O43776' 'O43633' 'Q9BYN8' 'O43242' 'Q8TDX7'
 'Q9Y3U8' 'P22626' 'Q9HD42' 'P00491' 'Q13509' 'Q567U6' 'P27105' 'Q9BY77'
 'Q9BV36' 'Q5VTR2' 'P18669' 'Q9H0L4' 'Q9Y3C1' 'Q14671' 'Q14320' 'Q9Y383'
 'P26038' 'Q8WUF5' 'O95347' 'Q

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [9]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated

In [10]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [11]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,K562_5 Heavy Match Type,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,hr_label,Complete Sequence
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,MS/MS,sp|Q14498|RBM39_HUMAN,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,,0,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,unmatched,sp|Q03252|LMNB2_HUMAN,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,,0,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,unmatched,sp|P06753|TPM3_HUMAN,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",0,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,unmatched,sp|Q9UHV9|PFD2_HUMAN,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,,0,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,unmatched,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,,0,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,unmatched,sp|P62328|TYB4_HUMAN,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,,0,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,MS/MS,sp|P63218|GBG5_HUMAN,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,0,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,unmatched,sp|Q8IYB7|DI3L2_HUMAN,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,,0,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,unmatched,sp|P30622|CLIP1_HUMAN,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,,0,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...


In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    whole, mantissa = modifications[0].split(".")
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        whole, mantissa = modifications[i].split(".")
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [14]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [15]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True     2342
False       3
Name: count, dtype: int64

In [16]:
# Rows corresponding to sequences that weren't found
peptides_cs[(temp != peptides_cs["Peptide Sequence"])];

In [17]:
# Other sequences within the same proteins are found -> issue isn't with the complete sequence -> drop rows 313, 648, and 847 (peptides that weren't found in their protein)
peptides_cs[peptides_cs["Protein ID"].isin(["P60660", "Q9Y2K9", "P08727"])];

In [18]:
# Remove rows corresponding to sequences that weren't found
peptides_cs = peptides_cs.drop([313, 648, 847])
peptides_cs

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,hr_label,Complete Sequence,Sequence Location,Sequence Length
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,,0,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234,10
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,,0,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16,12
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",0,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119,10
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,,0,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43,19
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,,0,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,,0,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1,11
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,0,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1,11
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,,0,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1,16
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,,0,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1,13


In [19]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    2342
Name: count, dtype: int64

In [20]:
# Inspect missing "Heavy Modified Peptide" sequences
peptides_cs[peptides_cs["Heavy Modified Peptide"].isna()]

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,hr_label,Complete Sequence,Sequence Location,Sequence Length
956,KMREYGVER,KMREYGVER,9,4,1,KM[649.3660]REYGVER,,,,,...,Q9BQ95,ECSIT_HUMAN,ECSIT,Evolutionarily conserved signaling intermediat...,,,0,MSWVQATLLARGLCRAWGGTCGAALTGTSISQVPRRLPRGLHCSAA...,115,9
1339,MIFDVESMKK,MIFDVESM[15.9949]KK,10,4,1,M[649.3660]IFDVESM[15.9949]KK,,,,,...,P09874,PARP1_HUMAN,PARP1,Poly [ADP-ribose] polymerase 1,,,0,MAESSDKLYRVEYAKSGRASCKKCSESIPKDSLRMAIMVQSPMFDG...,674,10


In [21]:
# Manually impute "Heavy Modified Peptide" sequences
peptides_cs.loc[956, "Heavy Modified Peptide"] = "KM[" + heavy_modification + "]REYGVER"
peptides_cs.loc[1339, "Heavy Modified Peptide"] = "M[" + heavy_modification + "]IFDVESM[15.9949]KK"

In [22]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Heavy Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [23]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [24]:
# Sanity check - ensure methionine locations are correct
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
temp.count("M") == len(temp)

True

In [25]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Mapped Proteins,hr_label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,,0,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234,10,AAA,3,237,VLGVPIIVQASQAEKNRAAA,ANNLQKGSAGPMRLYVGSLH
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,,0,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16,12,AAAT,4,20,MSPPSPGRRREQRRPRAAAT,ATPLPGRAGGPATPLSPTRL
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,"sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",0,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119,10,AADESERG,8,127,ATALQKLEEAEKAADESERG,KVIENRALKDEEKMELQEIQ
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,,0,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43,19,AAELE,5,48,GFNRLRQEQRGLASKAAELE,ELNEHSLVIDTLKEVDETRK
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,,0,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550,10,AAE,3,553,DAELQKLKEMTNHQKKRAAE,MASLLKDLAEIGIAVGNNDV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,,0,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1,11,SDKPD,5,6,MSDKP,AEIEKFDKSKLKKTETQEKN
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,,0,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1,11,SGSSSVAA,8,9,MSGSSSVA,KKVVQQLRLEAGLNRVKVSQ
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,,0,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1,16,SHPDYR,6,7,MSHPDY,NLRPLGTPRGVSAVAGPHDI
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,,0,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1,13,S,1,2,M,LKPSGLKAPTKILKPGSTAL


In [26]:
##TODO: remove
#peptides_cs.to_csv(os.path.join(curr_dir_path, "HyperreactivityModel_complete_sequence.csv"))

In [27]:
#TODO: remove
path = os.path.join(curr_dir_path, "HyperreactivityModel_complete_sequence.csv")
peptides_cs = pd.read_csv(path)
peptides_cs.set_index("Unnamed: 0", inplace=True)
peptides_cs.index.name = None
peptides_cs.fillna({f"Left {analysis_threshold}": '', f"Right {analysis_threshold}": ''}, inplace=True)
peptides_cs

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Mapped Proteins,hr_label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,,0,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234,10,AAA,3,237,VLGVPIIVQASQAEKNRAAA,ANNLQKGSAGPMRLYVGSLH
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,,0,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16,12,AAAT,4,20,MSPPSPGRRREQRRPRAAAT,ATPLPGRAGGPATPLSPTRL
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,"sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",0,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119,10,AADESERG,8,127,ATALQKLEEAEKAADESERG,KVIENRALKDEEKMELQEIQ
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,,0,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43,19,AAELE,5,48,GFNRLRQEQRGLASKAAELE,ELNEHSLVIDTLKEVDETRK
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,,0,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550,10,AAE,3,553,DAELQKLKEMTNHQKKRAAE,MASLLKDLAEIGIAVGNNDV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,,0,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1,11,SDKPD,5,6,MSDKP,AEIEKFDKSKLKKTETQEKN
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,,0,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1,11,SGSSSVAA,8,9,MSGSSSVA,KKVVQQLRLEAGLNRVKVSQ
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,,0,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1,16,SHPDYR,6,7,MSHPDY,NLRPLGTPRGVSAVAGPHDI
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,,0,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1,13,S,1,2,M,LKPSGLKAPTKILKPGSTAL


In [28]:
# TODO: decide on whether/not to use these peptides (they come from large proteins)
# Remove invalid proteins (according to alphafold)
# 13 invalid proteins -> 199 invalid peptides

#invalid_IDs = ['Q15149', 'Q9UPN3', 'P46013', 'Q14789', 'Q09666', 'P15924', 'Q8IVF2', 'Q14315', 'Q9Y520', 'Q14204', 'Q7Z6Z7', 'Q8NFC6', 'Q9NU22']
#display(peptides_cs[peptides_cs["Protein ID"].isin(invalid_IDs)])
#peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
#peptides_cs

# Download Alphafold Data - HyperreactivityModel Dataset #1

In [29]:
# Path for alphafold protein data

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


In [30]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q14498' 'Q03252' 'P06753' 'Q9UHV9' 'P33176' 'O14497' 'Q9Y230' 'Q9H444'
 'Q9BRD0' 'Q5PRF9' 'Q16891' 'P46063' 'Q96PK6' 'P46459' 'Q96I24' 'P62258'
 'P09525' 'P07910' 'Q9Y2W2' 'P35222' 'P61970' 'P62333' 'P08243' 'P05787'
 'Q86XZ4' 'Q9P0K7' 'Q9Y2X7' 'P24928' 'Q13523' 'Q7L7X3' 'Q15149' 'Q9Y5Z4'
 'Q9UPN3' 'P51572' 'Q9NR30' 'P55265' 'Q9Y613' 'Q13769' 'Q15366' 'P26373'
 'Q5VZK9' 'P39023' 'P46013' 'Q14789' 'Q9NWH9' 'P14618' 'Q6UB99' 'Q8WUM0'
 'Q9UHI6' 'Q15050' 'Q13310' 'Q99848' 'Q8TB05' 'Q02543' 'P00558' 'Q96SI9'
 'Q13895' 'P07954' 'P15121' 'Q96T51' 'P54252' 'Q9Y618' 'Q8N6H7' 'Q9NP61'
 'P83731' 'P53396' 'Q92945' 'O60218' 'O43768' 'P55084' 'P49368' 'Q8WXF1'
 'P11940' 'P26447' 'P35579' 'Q15233' 'Q9H3P2' 'Q92614' 'P40222' 'Q9Y5B6'
 'Q15785' 'Q99615' 'P10809' 'O43776' 'O43633' 'Q9BYN8' 'O43242' 'Q8TDX7'
 'Q9Y3U8' 'P22626' 'Q9HD42' 'P00491' 'Q13509' 'Q567U6' 'P27105' 'Q9BY77'
 'Q9BV36' 'Q5VTR2' 'P18669' 'Q9H0L4' 'Q9Y3C1' 'Q14671' 'Q14320' 'Q9Y383'
 'P26038' 'Q8WUF5' 'O95347' 'Q

In [31]:
## Download cif data for proteins
## SLOW THE FIRST TIME - caches the relevant cif data
#valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
#    proteins=unique_uniprotIDs,
#    out_folder=cif_dir
#)

In [32]:
## Download pae data for proteins
## SLOW THE FIRST TIME - caches the relevant pae data
#valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
#    proteins=unique_uniprotIDs,
#    out_folder=pae_dir, 
#)

In [33]:
#invalid_proteins_cif

In [34]:
#TODO: Add AlphaFold structure info to table

# Convert Left/Right 20 Sequences to FASTA

In [35]:
path = os.path.join(curr_dir_path, "hr_training_1.fasta")

with open(path, 'wt') as f:
    for index, row in peptides_cs.iterrows():

        # TODO: question - should we zero-index or one-index the methionine location in the fasta file?
        k = row["Protein ID"] + "-" + str(row["Methionine Location"]) + "-" + str(row["hr_label"])
        v = str(row["Left 20"]) + "M" + str(row["Right 20"])

        f.write(f'>{k}\n{v}\n')