## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import scipy


# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
# Window size: number of amino acids on either side of the methionine to analyze
analysis_threshold = 20

# Mass tags of the light and heavy labels, as written inside "M[...]"
light_modification = "649.3660"
heavy_modification = "655.3735"

# Every modification we search for; these are turned into a regex further down
modifications = [light_modification, heavy_modification]

## Load Dataset - All Peptides

In [3]:
# Directory (relative to the working directory) that holds the input table
# and the cached CSV outputs; resolved to an absolute path for later joins.
datasets_path_str = "../datasets/"
datasets_path = os.path.abspath(datasets_path_str)

print(datasets_path)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/datasets


In [4]:
# Read the tab-separated combined label-quant table from the datasets directory
data_loc = os.path.join(datasets_path, "combined_modified_peptide_label_quant.tsv")
peptides = pd.read_csv(data_loc, sep="\t")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,K562_3 Heavy Match Type,K562_4 Heavy Match Type,K562_5 Heavy Match Type,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,MS/MS,MS/MS,MS/MS,sp|Q14498|RBM39_HUMAN,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,MS/MS,unmatched,unmatched,sp|Q03252|LMNB2_HUMAN,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,unmatched,unmatched,unmatched,sp|P06753|TPM3_HUMAN,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp..."
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,unmatched,unmatched,unmatched,sp|Q9UHV9|PFD2_HUMAN,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,unmatched,unmatched,unmatched,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,MS/MS,MS/MS,unmatched,sp|P62328|TYB4_HUMAN,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,MS/MS,MS/MS,MS/MS,sp|P63218|GBG5_HUMAN,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,unmatched,unmatched,unmatched,sp|Q8IYB7|DI3L2_HUMAN,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,unmatched,unmatched,unmatched,sp|P30622|CLIP1_HUMAN,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,


In [5]:
# Keep only the per-sample "Log2 Ratio HL" columns and replace missing
# ratios with the placeholder value 999.999.
ratio_df = peptides.filter(like='Log2 Ratio HL', axis=1).fillna(999.999)
ratio_df

Unnamed: 0,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,A549_4 Log2 Ratio HL,A549_5 Log2 Ratio HL,HCT116_1 Log2 Ratio HL,HCT116_2 Log2 Ratio HL,HCT116_3 Log2 Ratio HL,HCT116_4 Log2 Ratio HL,HCT116_5 Log2 Ratio HL,...,Jurkat_1 Log2 Ratio HL,Jurkat_2 Log2 Ratio HL,Jurkat_3 Log2 Ratio HL,Jurkat_4 Log2 Ratio HL,Jurkat_5 Log2 Ratio HL,K562_1 Log2 Ratio HL,K562_2 Log2 Ratio HL,K562_3 Log2 Ratio HL,K562_4 Log2 Ratio HL,K562_5 Log2 Ratio HL
0,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
1,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
3,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
4,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2341,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2342,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999
2343,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,...,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999,999.999


In [6]:
# For each peptide (row), count the samples whose log2 H/L ratio is below 1.0,
# then show how many peptides fall into each count.
num_hyperreactive_instances = (ratio_df < 1.0).sum(axis=1)
num_hyperreactive_instances.value_counts()

0     2256
1       43
2       15
4       10
5        7
3        7
6        2
12       1
8        1
14       1
13       1
7        1
Name: count, dtype: int64

In [7]:
# Flag a peptide as preliminarily hyperreactive when at least 3 samples
# had a ratio below the cutoff counted above.
peptides["prelim_hyperreactive"] = num_hyperreactive_instances.ge(3)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,K562_4 Heavy Match Type,K562_5 Heavy Match Type,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,MS/MS,MS/MS,sp|Q14498|RBM39_HUMAN,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,,False
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,unmatched,unmatched,sp|Q03252|LMNB2_HUMAN,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,,False
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,unmatched,unmatched,sp|P06753|TPM3_HUMAN,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,unmatched,unmatched,sp|Q9UHV9|PFD2_HUMAN,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,,False
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,unmatched,unmatched,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,MS/MS,unmatched,sp|P62328|TYB4_HUMAN,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,,False
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,MS/MS,MS/MS,sp|P63218|GBG5_HUMAN,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,False
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,unmatched,unmatched,sp|Q8IYB7|DI3L2_HUMAN,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,,False
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,unmatched,unmatched,sp|P30622|CLIP1_HUMAN,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,,False


In [8]:
# Class balance of the preliminary hyperreactivity flag
peptides.loc[:, "prelim_hyperreactive"].value_counts()

prelim_hyperreactive
False    2314
True       31
Name: count, dtype: int64

In [9]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID, timeout=30):
    """Download the FASTA record of a UniProt entry and return its sequence.

    Parameters
    ----------
    cID : str
        UniProt accession (e.g. "Q14498").
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so that one
        stalled request cannot hang a long batch run.

    Returns
    -------
    str
        Amino-acid sequence of the first FASTA record in the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. unknown accession).
    ValueError
        If the response body does not contain any FASTA record.
    """
    # Retrieval is a read-only operation, so use GET (not POST) over HTTPS
    # against the UniProt REST endpoint.
    currentUrl = f"https://rest.uniprot.org/uniprotkb/{cID}.fasta"
    response = r.get(currentUrl, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    record = next(SeqIO.parse(StringIO(response.text), "fasta"), None)
    if record is None:
        raise ValueError(f"No FASTA record returned for UniProt accession {cID!r}")

    return str(record.seq)

In [10]:
# test - get a single amino acid sequence - TODO: FIX THIS FOR THIS PARTICULAR NOTEBOOK
#first_protein_ID = peptides["Protein ID"].iloc[0]
#test_sequence = get_full_protein_seq(first_protein_ID)
#print(test_sequence[575:587])
#print(peptides["Peptide Sequence"].iloc[0])

In [11]:
# Distinct UniProt accessions in the table, shown alongside the number of
# distinct accessions and the total number of peptide rows.
unique_uniprot_IDs = pd.unique(peptides["Protein ID"])
(unique_uniprot_IDs, len(unique_uniprot_IDs), len(peptides))

(array(['Q14498', 'Q03252', 'P06753', 'Q9UHV9', 'P33176', 'O14497',
        'Q9Y230', 'Q9H444', 'Q9BRD0', 'Q5PRF9', 'Q16891', 'P46063',
        'Q96PK6', 'P46459', 'Q96I24', 'P62258', 'P09525', 'P07910',
        'Q9Y2W2', 'P35222', 'P61970', 'P62333', 'P08243', 'P05787',
        'Q86XZ4', 'Q9P0K7', 'Q9Y2X7', 'P24928', 'Q13523', 'Q7L7X3',
        'Q15149', 'Q9Y5Z4', 'Q9UPN3', 'P51572', 'Q9NR30', 'P55265',
        'Q9Y613', 'Q13769', 'Q15366', 'P26373', 'Q5VZK9', 'P39023',
        'P46013', 'Q14789', 'Q9NWH9', 'P14618', 'Q6UB99', 'Q8WUM0',
        'Q9UHI6', 'Q15050', 'Q13310', 'Q99848', 'Q8TB05', 'Q02543',
        'P00558', 'Q96SI9', 'Q13895', 'P07954', 'P15121', 'Q96T51',
        'P54252', 'Q9Y618', 'Q8N6H7', 'Q9NP61', 'P83731', 'P53396',
        'Q92945', 'O60218', 'O43768', 'P55084', 'P49368', 'Q8WXF1',
        'P11940', 'P26447', 'P35579', 'Q15233', 'Q9H3P2', 'Q92614',
        'P40222', 'Q9Y5B6', 'Q15785', 'Q99615', 'P10809', 'O43776',
        'O43633', 'Q9BYN8', 'O43242', 'Q8TDX7', 

In [12]:
#unique_IDs_to_sequence_df = pd.DataFrame({"Protein ID": unique_uniprot_IDs})
#unique_IDs_to_sequence_df

In [13]:
# get whole amino acid sequences for methionine peptides
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#tqdm.pandas()
#unique_IDs_to_sequence_df["Complete Sequence"] = unique_IDs_to_sequence_df["Protein ID"].progress_apply(get_full_protein_seq)
#unique_IDs_to_sequence_df

In [14]:
#peptides_completed_sequence = peptides.merge(unique_IDs_to_sequence_df, how="left", on="Protein ID")
#peptides_completed_sequence

In [15]:
#peptides_completed_sequence.to_csv(os.path.join(datasets_path, "hyperreactivity_model_completed_sequence.csv"))

In [16]:
# Reload the cached table that already carries the "Complete Sequence" column.
# The first CSV column ("Unnamed: 0") holds the saved row labels; restore it
# as an unnamed index.
path = os.path.join(datasets_path, "hyperreactivity_model_completed_sequence.csv")
peptides_completed_sequence = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,K562_5 Heavy Match Type,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,MS/MS,sp|Q14498|RBM39_HUMAN,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,,False,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,unmatched,sp|Q03252|LMNB2_HUMAN,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,,False,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,unmatched,sp|P06753|TPM3_HUMAN,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,unmatched,sp|Q9UHV9|PFD2_HUMAN,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,,False,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,unmatched,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,,False,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,unmatched,sp|P62328|TYB4_HUMAN,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,,False,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,MS/MS,sp|P63218|GBG5_HUMAN,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,False,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,unmatched,sp|Q8IYB7|DI3L2_HUMAN,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,,False,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,unmatched,sp|P30622|CLIP1_HUMAN,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,,False,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...


In [17]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    r"""Build a regex alternation matching a methionine tagged with any given mass.

    Each entry is treated as literal text and escaped, so the "." in a mass
    such as "649.3660" only matches a real dot. Entries without a decimal
    point (e.g. "16") are supported as well.

    Parameters
    ----------
    modifications : sequence of str
        Mass strings exactly as they appear inside ``M[...]`` in the
        modified-peptide strings.

    Returns
    -------
    str
        Pattern of the form ``M\[<mass1>\]|M\[<mass2>\]|...`` in input order.

    Raises
    ------
    ValueError
        If ``modifications`` is empty; an empty alternation would match
        everywhere and silently corrupt downstream extraction.
    """
    if not modifications:
        raise ValueError("modifications must contain at least one mass string")

    return "|".join(r"M\[{}\]".format(re.escape(mass)) for mass in modifications)

# Regex alternation covering every configured modification; printed for inspection
modifications_pattern = create_modifications_pattern(modifications=modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [18]:
# 0-based offset of the first occurrence of each peptide inside its protein's
# complete sequence (str.find returns -1 when the peptide is not found).
# A plain list is assigned so values are placed positionally; wrapping them in
# a pd.Series would align on index labels and silently misplace values if the
# frame's index were ever anything other than 0..n-1.
peptides_completed_sequence["Sequence Location"] = [
    full_sequence.find(peptide)
    for full_sequence, peptide in zip(
        peptides_completed_sequence["Complete Sequence"],
        peptides_completed_sequence["Peptide Sequence"],
    )
]
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,sp|Q14498|RBM39_HUMAN,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,,False,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,sp|Q03252|LMNB2_HUMAN,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,,False,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,sp|P06753|TPM3_HUMAN,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,sp|Q9UHV9|PFD2_HUMAN,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,,False,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,,False,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,sp|P62328|TYB4_HUMAN,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,,False,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,sp|P63218|GBG5_HUMAN,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,False,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,sp|Q8IYB7|DI3L2_HUMAN,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,,False,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,sp|P30622|CLIP1_HUMAN,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,,False,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1


In [19]:
# Peptides that could not be located in their protein's complete sequence
peptides_completed_sequence.loc[peptides_completed_sequence["Sequence Location"].eq(-1)]

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location
313,EGNGTVMGAELR,EGNGTVMGAELR,12,3,1,,EGNGTVM[655.3735]GAELR,,,,...,sp|P60660|MYL6_HUMAN,P60660,MYL6_HUMAN,MYL6,Myosin light polypeptide 6,"MYL1, MYL3","sp|P05976|MYL1_HUMAN, sp|P08590|MYL3_HUMAN",False,MCDFTEDQTAEFKEAFQLFDRTGDGKILYSQCGDVMRALGQNPTNA...,-1
648,HAHEIMLK,HAHEIMLK,8,4,1,,HAHEIM[655.3735]LK,,,,...,sp|Q9Y2K9|STB5L_HUMAN,Q9Y2K9,STB5L_HUMAN,STXBP5L,Syntaxin-binding protein 5-like,STXBP5,sp|Q5T5C0|STXB5_HUMAN,False,MKKFNFRKVLDGLTASSPGSGSSSGSNSGGGAGSGSVHPAGTAGVL...,-1
847,ITMQNLNDR,ITMQNLNDR,9,3,1,,ITM[655.3735]QNLNDR,,,,...,sp|P08727|K1C19_HUMAN,P08727,K1C19_HUMAN,KRT19,"Keratin, type I cytoskeletal 19","KRT13, KRT15","contam_sp|O77727|K1C15_SHEEP, sp|P13646|K1C13_...",False,MTSYSYRQSSATSSFGGLGGGSVRFGPGVAFRAPSIHGGSGGRGVS...,-1


In [20]:
# All peptides from the three proteins that had an unmatched peptide. Other
# peptides of the same proteins are found, so only rows 313, 648 and 847
# (the peptides missing from their protein) need to be dropped.

peptides_completed_sequence.loc[
    peptides_completed_sequence["Protein ID"].isin(["P60660", "Q9Y2K9", "P08727"])
]

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location
313,EGNGTVMGAELR,EGNGTVMGAELR,12,3,1,,EGNGTVM[655.3735]GAELR,,,,...,sp|P60660|MYL6_HUMAN,P60660,MYL6_HUMAN,MYL6,Myosin light polypeptide 6,"MYL1, MYL3","sp|P05976|MYL1_HUMAN, sp|P08590|MYL3_HUMAN",False,MCDFTEDQTAEFKEAFQLFDRTGDGKILYSQCGDVMRALGQNPTNA...,-1
648,HAHEIMLK,HAHEIMLK,8,4,1,,HAHEIM[655.3735]LK,,,,...,sp|Q9Y2K9|STB5L_HUMAN,Q9Y2K9,STB5L_HUMAN,STXBP5L,Syntaxin-binding protein 5-like,STXBP5,sp|Q5T5C0|STXB5_HUMAN,False,MKKFNFRKVLDGLTASSPGSGSSSGSNSGGGAGSGSVHPAGTAGVL...,-1
847,ITMQNLNDR,ITMQNLNDR,9,3,1,,ITM[655.3735]QNLNDR,,,,...,sp|P08727|K1C19_HUMAN,P08727,K1C19_HUMAN,KRT19,"Keratin, type I cytoskeletal 19","KRT13, KRT15","contam_sp|O77727|K1C15_SHEEP, sp|P13646|K1C13_...",False,MTSYSYRQSSATSSFGGLGGGSVRFGPGVAFRAPSIHGGSGGRGVS...,-1
1966,SQYEVMAEQNRK,SQYEVMAEQNRK,12,4,1,SQYEVM[649.3660]AEQNRK,SQYEVM[655.3735]AEQNRK,,,,...,sp|P08727|K1C19_HUMAN,P08727,K1C19_HUMAN,KRT19,"Keratin, type I cytoskeletal 19",,,False,MTSYSYRQSSATSSFGGLGGGSVRFGPGVAFRAPSIHGGSGGRGVS...,253
2023,TDLEMQIEGLK,TDLEMQIEGLK,11,3,1,,TDLEM[655.3735]QIEGLK,,,,...,sp|P08727|K1C19_HUMAN,P08727,K1C19_HUMAN,KRT19,"Keratin, type I cytoskeletal 19",KRT16,sp|P08779|K1C16_HUMAN,False,MTSYSYRQSSATSSFGGLGGGSVRFGPGVAFRAPSIHGGSGGRGVS...,197
2202,VLDFEHFLPMLQTVAK,VLDFEHFLPMLQTVAK,16,4,1,,VLDFEHFLPM[655.3735]LQTVAK,,,,...,sp|P60660|MYL6_HUMAN,P60660,MYL6_HUMAN,MYL6,Myosin light polypeptide 6,,,False,MCDFTEDQTAEFKEAFQLFDRTGDGKILYSQCGDVMRALGQNPTNA...,63


In [21]:
# Drop every peptide that was not found in its protein (Sequence Location == -1).
# The labels are derived from the data rather than hard-coded, so the cell stays
# correct if the input changes and can be re-run safely (dropping an empty set
# of labels is a no-op, whereas dropping fixed labels twice raises KeyError).
unmatched_labels = peptides_completed_sequence.index[
    peptides_completed_sequence["Sequence Location"].eq(-1).to_numpy()
]
peptides_completed_sequence = peptides_completed_sequence.drop(index=unmatched_labels)
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,sp|Q14498|RBM39_HUMAN,Q14498,RBM39_HUMAN,RBM39,RNA-binding protein 39,,,False,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,sp|Q03252|LMNB2_HUMAN,Q03252,LMNB2_HUMAN,LMNB2,Lamin-B2,,,False,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,sp|P06753|TPM3_HUMAN,P06753,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,sp|Q9UHV9|PFD2_HUMAN,Q9UHV9,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,,False,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,,False,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,sp|P62328|TYB4_HUMAN,P62328,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,,False,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,sp|P63218|GBG5_HUMAN,P63218,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,False,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,sp|Q8IYB7|DI3L2_HUMAN,Q8IYB7,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,,False,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,sp|P30622|CLIP1_HUMAN,P30622,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,,False,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1


In [22]:
# Sanity check: slicing each complete sequence at the located offset for the
# peptide's length should reproduce the peptide sequence.
temp = [
    full_sequence[start:start + length]
    for full_sequence, start, length in zip(
        peptides_completed_sequence["Complete Sequence"],
        peptides_completed_sequence["Sequence Location"],
        peptides_completed_sequence["Peptide Length"],
    )
]
peptides_completed_sequence["Peptide Sequence"].eq(temp).value_counts()

Peptide Sequence
True    2342
Name: count, dtype: int64

In [23]:
# Two-group regex: group 1 is everything before a modified methionine,
# group 2 is the modified methionine itself.
left_prefix_pattern = f"(.*)({modifications_pattern})"
print(left_prefix_pattern)

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [24]:
# Rows that have no heavy-labelled peptide string
peptides_completed_sequence.loc[peptides_completed_sequence["Heavy Modified Peptide"].isna()]

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location
956,KMREYGVER,KMREYGVER,9,4,1,KM[649.3660]REYGVER,,,,,...,sp|Q9BQ95|ECSIT_HUMAN,Q9BQ95,ECSIT_HUMAN,ECSIT,Evolutionarily conserved signaling intermediat...,,,False,MSWVQATLLARGLCRAWGGTCGAALTGTSISQVPRRLPRGLHCSAA...,115
1339,MIFDVESMKK,MIFDVESM[15.9949]KK,10,4,1,M[649.3660]IFDVESM[15.9949]KK,,,,,...,sp|P09874|PARP1_HUMAN,P09874,PARP1_HUMAN,PARP1,Poly [ADP-ribose] polymerase 1,,,False,MAESSDKLYRVEYAKSGRASCKKCSESIPKDSLRMAIMVQSPMFDG...,674


In [25]:
# Fill in missing heavy-labelled peptide strings from the light-labelled ones
# by swapping the light mass tag for the heavy one. Deriving the value from the
# data (instead of hard-coding row labels and peptide strings) keeps the cell
# correct if the input changes and makes it safe to re-run.
missing_heavy = peptides_completed_sequence["Heavy Modified Peptide"].isna()
peptides_completed_sequence.loc[missing_heavy, "Heavy Modified Peptide"] = (
    peptides_completed_sequence.loc[missing_heavy, "Light Modified Peptide"]
    # regex=False: the tags contain "[", "]" and "." which must match literally
    .str.replace(f"[{light_modification}]", f"[{heavy_modification}]", regex=False)
)

In [26]:
# extract left prefix of modified methionine (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return ``string`` with every character not in ``IUPACCodes`` removed."""
    return ''.join(char for char in string if char in IUPACCodes)


# Group 0 of the pattern is the text before the modified methionine (the greedy
# ".*" runs up to the last tagged M). Rows without a match get an empty prefix,
# and modification annotations such as "n[42.0106]" are stripped by `filtering`.
peptides_completed_sequence["Left Prefix"] = (
    peptides_completed_sequence["Heavy Modified Peptide"]
    .str.extract(left_prefix_pattern)[0]
    .fillna("")
    .map(filtering)
)
peptides_completed_sequence["Left Prefix Length"] = (
    peptides_completed_sequence["Left Prefix"].str.len()
)

peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location,Left Prefix,Left Prefix Length
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,RBM39_HUMAN,RBM39,RNA-binding protein 39,,,False,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234,AAA,3
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,LMNB2_HUMAN,LMNB2,Lamin-B2,,,False,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16,AAAT,4
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,TPM3_HUMAN,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119,AADESERG,8
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,PFD2_HUMAN,PFDN2,Prefoldin subunit 2,,,False,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43,AAELE,5
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,KINH_HUMAN,KIF5B,Kinesin-1 heavy chain,,,False,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550,AAE,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,TYB4_HUMAN,TMSB4X,Thymosin beta-4,,,False,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1,SDKPD,5
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,GBG5_HUMAN,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,False,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1,SGSSSVAA,8
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,DI3L2_HUMAN,DIS3L2,DIS3-like exonuclease 2,,,False,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1,SHPDYR,6
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,CLIP1_HUMAN,CLIP1,CAP-Gly domain-containing linker protein 1,,,False,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1,S,1


In [27]:
# 0-based index of the modified methionine within the complete sequence:
# peptide start offset plus the number of residues preceding the methionine.
peptides_completed_sequence["Methionine Location"] = (
    peptides_completed_sequence["Sequence Location"]
    .add(peptides_completed_sequence["Left Prefix Length"])
)
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Gene,Protein Description,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location,Left Prefix,Left Prefix Length,Methionine Location
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,RBM39,RNA-binding protein 39,,,False,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234,AAA,3,237
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,LMNB2,Lamin-B2,,,False,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16,AAAT,4,20
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,TPM3,Tropomyosin alpha-3 chain,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119,AADESERG,8,127
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,PFDN2,Prefoldin subunit 2,,,False,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43,AAELE,5,48
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,KIF5B,Kinesin-1 heavy chain,,,False,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550,AAE,3,553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,TMSB4X,Thymosin beta-4,,,False,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1,SDKPD,5,6
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,GNG5,Guanine nucleotide-binding protein G(I)/G(S)/G...,,,False,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1,SGSSSVAA,8,9
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,DIS3L2,DIS3-like exonuclease 2,,,False,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1,SHPDYR,6,7
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,CLIP1,CAP-Gly domain-containing linker protein 1,,,False,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1,S,1,2


In [28]:
# Compute left/right analysis sequences based on threshold
# Left window: up to `analysis_threshold` residues immediately before the
# methionine; the start is clamped at 0 so windows near the N-terminus are
# simply shorter.
peptides_completed_sequence[f"Left {analysis_threshold}"] = [
    full_sequence[max(met_index - analysis_threshold, 0):met_index]
    for full_sequence, met_index in zip(
        peptides_completed_sequence["Complete Sequence"],
        peptides_completed_sequence["Methionine Location"],
    )
]
# Right window: up to `analysis_threshold` residues immediately after the
# methionine (slicing past the end of the string just yields a shorter window).
peptides_completed_sequence[f"Right {analysis_threshold}"] = [
    full_sequence[met_index + 1:met_index + 1 + analysis_threshold]
    for full_sequence, met_index in zip(
        peptides_completed_sequence["Complete Sequence"],
        peptides_completed_sequence["Methionine Location"],
    )
]
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,,,False,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234,AAA,3,237,VLGVPIIVQASQAEKNRAAA,ANNLQKGSAGPMRLYVGSLH
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,,,False,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16,AAAT,4,20,MSPPSPGRRREQRRPRAAAT,ATPLPGRAGGPATPLSPTRL
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119,AADESERG,8,127,ATALQKLEEAEKAADESERG,KVIENRALKDEEKMELQEIQ
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,,,False,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43,AAELE,5,48,GFNRLRQEQRGLASKAAELE,ELNEHSLVIDTLKEVDETRK
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,,,False,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550,AAE,3,553,DAELQKLKEMTNHQKKRAAE,MASLLKDLAEIGIAVGNNDV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,,,False,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1,SDKPD,5,6,MSDKPD,AEIEKFDKSKLKKTETQEKN
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,,,False,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1,SGSSSVAA,8,9,MSGSSSVAA,KKVVQQLRLEAGLNRVKVSQ
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,,,False,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1,SHPDYR,6,7,MSHPDYR,NLRPLGTPRGVSAVAGPHDI
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,,,False,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1,S,1,2,MS,LKPSGLKAPTKILKPGSTAL


In [35]:
# Replace missing left/right windows with empty strings (other columns untouched)
peptides_completed_sequence = peptides_completed_sequence.fillna(
    {
        f"Left {analysis_threshold}": "",
        f"Right {analysis_threshold}": "",
    }
)

In [36]:
# Sanity check: tally the residue found at each computed methionine index
temp = pd.Series(
    [
        full_sequence[met_index]
        for full_sequence, met_index in zip(
            peptides_completed_sequence["Complete Sequence"],
            peptides_completed_sequence["Methionine Location"],
        )
    ]
)
temp.value_counts()

M    2342
Name: count, dtype: int64

In [37]:
# Export of the completed-sequence table is currently commented out. The next cell reads
# this same CSV back from datasets_path, so the file must already exist on disk.
# Uncomment the line below to regenerate it from the in-memory frame.
#peptides_completed_sequence.to_csv(os.path.join(datasets_path, "hyperreactivity_model_completed_sequence_with_thresholds.csv"))

In [38]:
# Reload the completed-sequence table from its CSV copy on disk.
path = os.path.join(datasets_path, "hyperreactivity_model_completed_sequence_with_thresholds.csv")

# The saved row index comes back as the unnamed first column ("Unnamed: 0");
# promote it to the index again and drop the placeholder name.
peptides_completed_sequence = pd.read_csv(path).set_index("Unnamed: 0")
peptides_completed_sequence.index.name = None

# read_csv parses empty fields as NaN, so flanking windows that were stored as ""
# (see the fillna("") step before the export) come back as NaN after the round-trip.
# Restore them to "" so later string concatenation does not pick up "nan".
for _flank_col in (f"Left {analysis_threshold}", f"Right {analysis_threshold}"):
    peptides_completed_sequence[_flank_col] = peptides_completed_sequence[_flank_col].fillna("")

peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Peptide Length,Charges,Label Count,Light Modified Peptide,Heavy Modified Peptide,A549_1 Log2 Ratio HL,A549_2 Log2 Ratio HL,A549_3 Log2 Ratio HL,...,Mapped Genes,Mapped Proteins,prelim_hyperreactive,Complete Sequence,Sequence Location,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AAAMANNLQK,AAAMANNLQK,10,3,1,,AAAM[655.3735]ANNLQK,,,,...,,,False,MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRSHE...,234,AAA,3,237,VLGVPIIVQASQAEKNRAAA,ANNLQKGSAGPMRLYVGSLH
1,AAATMATPLPGR,AAATMATPLPGR,12,3,1,,AAATM[655.3735]ATPLPGR,,,,...,,,False,MSPPSPGRRREQRRPRAAATMATPLPGRAGGPATPLSPTRLSRLQE...,16,AAAT,4,20,MSPPSPGRRREQRRPRAAAT,ATPLPGRAGGPATPLSPTRL
2,AADESERGMK,AADESERGMK,10,3,1,,AADESERGM[655.3735]K,,,,...,"TPM1, TPM2, TPM4","sp|P07951|TPM2_HUMAN, sp|P09493|TPM1_HUMAN, sp...",False,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,119,AADESERG,8,127,ATALQKLEEAEKAADESERG,KVIENRALKDEEKMELQEIQ
3,AAELEMELNEHSLVIDTLK,AAELEMELNEHSLVIDTLK,19,4,1,,AAELEM[655.3735]ELNEHSLVIDTLK,,,,...,,,False,MAENSGRAGKSSGSGAGKGAVSAEQVIAGFNRLRQEQRGLASKAAE...,43,AAELE,5,48,GFNRLRQEQRGLASKAAELE,ELNEHSLVIDTLKEVDETRK
4,AAEMMASLLK,AAEMMASLLK,10,3,1,,AAEM[655.3735]MASLLK,,,,...,,,False,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,550,AAE,3,553,DAELQKLKEMTNHQKKRAAE,MASLLKDLAEIGIAVGNNDV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,SDKPDMAEIEK,n[42.0106]SDKPDMAEIEK,11,3,1,,n[42.0106]SDKPDM[655.3735]AEIEK,,,,...,,,False,MSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES,1,SDKPD,5,6,MSDKPD,AEIEKFDKSKLKKTETQEKN
2341,SGSSSVAAMKK,n[42.0106]SGSSSVAAMKK,11,3,1,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,...,,,False,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,1,SGSSSVAA,8,9,MSGSSSVAA,KKVVQQLRLEAGLNRVKVSQ
2342,SHPDYRMNLRPLGTPR,n[42.0106]SHPDYRMNLRPLGTPR,16,4,1,,n[42.0106]SHPDYRM[655.3735]NLRPLGTPR,,,,...,,,False,MSHPDYRMNLRPLGTPRGVSAVAGPHDIGASPGDKKSKNRSTRGKK...,1,SHPDYR,6,7,MSHPDYR,NLRPLGTPRGVSAVAGPHDI
2343,SMLKPSGLKAPTK,n[42.0106]SMLKPSGLKAPTK,13,4,1,,n[42.0106]SM[655.3735]LKPSGLKAPTK,,,,...,,,False,MSMLKPSGLKAPTKILKPGSTALKTPTAVVAPVEKTISSEKASSTP...,1,S,1,2,MS,LKPSGLKAPTKILKPGSTAL


# Convert Left/Right 20 Sequences to FASTA

In [40]:
# Write one FASTA record per row:
#   header   = "<Protein ID>-<Methionine Location>"
#   sequence = left flank + "M" + right flank
path = os.path.join(datasets_path, "hyperreactivity_training_1.fasta")

# Use the same threshold-derived column names as the step that builds the flanks.
left_col = f"Left {analysis_threshold}"
right_col = f"Right {analysis_threshold}"


def _flank_to_str(value):
    """Return the flank as a string, mapping missing values (NaN/None) to ""."""
    # After a CSV round-trip an empty flank is read back as NaN; str(NaN) would
    # otherwise inject the literal text "nan" into the written sequence.
    return "" if pd.isna(value) else str(value)


with open(path, 'wt') as f:
    for index, row in peptides_completed_sequence.iterrows():

        k = row["Protein ID"] + "-" + str(row["Methionine Location"])
        v = _flank_to_str(row[left_col]) + "M" + _flank_to_str(row[right_col])

        f.write(f'>{k}\n{v}\n')

# Download AlphaFold Data - All Peptides

In [None]:
# Root folder for AlphaFold protein data, resolved to an absolute path,
# with one sub-folder each for the CIF and PAE files.
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)
cif_dir, pae_dir = (os.path.join(alphafold_path, sub_dir) for sub_dir in ("cif", "pae"))

# Echo the three resolved locations, one per line.
print(alphafold_path, cif_dir, pae_dir, sep="\n")

In [None]:
# Distinct "Protein ID" values from the peptide table; these are the IDs passed
# to the AlphaFold download calls below.
uniprotIDs = peptides_completed_sequence.loc[:, "Protein ID"].unique()

# Show the IDs together with how many there are.
(uniprotIDs, len(uniprotIDs))

In [None]:
# Fetch the CIF files for every protein ID into cif_dir.
# NOTE: slow on the first run.
# The three returned values are kept under the names valid / invalid / existing
# (exact semantics are defined by structuremap - confirm against its documentation).
(
    valid_proteins_cif,
    invalid_proteins_cif,
    existing_proteins_cif,
) = download_alphafold_cif(proteins=uniprotIDs, out_folder=cif_dir)

In [None]:
# Fetch the PAE files for every protein ID into pae_dir.
# NOTE: slow on the first run.
# The three returned values are kept under the names valid / invalid / existing
# (exact semantics are defined by structuremap - confirm against its documentation).
(
    valid_proteins_pae,
    invalid_proteins_pae,
    existing_proteins_pae,
) = download_alphafold_pae(proteins=uniprotIDs, out_folder=pae_dir)

In [None]:
# Display the second value returned by download_alphafold_pae above.
# NOTE(review): presumably the protein IDs for which no PAE file could be fetched - confirm.
invalid_proteins_pae

In [None]:
# Count rows per "Protein ID", keeping only rows whose protein is NOT listed in
# invalid_proteins_pae.
peptides_completed_sequence.loc[
    ~peptides_completed_sequence["Protein ID"].isin(invalid_proteins_pae),
    "Protein ID",
].value_counts()