## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# including pandas/numpy deprecation and dtype warnings raised by later cells.
warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

# Configure structuremap's logger (the timestamped "Valid proteins: ..." lines in the
# download cells' output come from it).
structuremap.utils.set_logger()

## Set Parameters of Analysis - ChURRO_1

In [2]:
# Set parameters of analysis

analysis_threshold = 20 # number of amino acids either side to analyze

# Mass tags exactly as they appear inside the M[...] annotation of the peptide strings.
# In the dataset the "Light Modified Peptide" column carries M[577.3085] and the
# "Heavy Modified Peptide" column carries M[583.3160], so the smaller mass is the light label.
light_modification = "577.3085"
heavy_modification = "583.3160"

# Which modifications we are looking for. These are plain mass strings, not regexes:
# the "." is escaped when the search pattern is built from them.
modifications = [light_modification, heavy_modification]

## Load Dataset - ChURRO_1

In [3]:
# Resolve the notebook directory and the shared data directory to absolute paths

curr_dir_path_str = "./"
global_data_path_str = "../global_data"
curr_dir_path, global_data_path = map(
    os.path.abspath, (curr_dir_path_str, global_data_path_str)
)

print(f"Current Directory: {curr_dir_path}")
print(f"Global Data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/ChURRO_revisions
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Load initial dataset
# The CSV is read from the notebook's own directory (curr_dir_path); the bare
# `peptides` on the last line renders the frame as the cell output.
data_loc = os.path.join(curr_dir_path, "ChURRO_1_isoDTB.csv")
peptides = pd.read_csv(data_loc)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,sp|P11142|HSP7C_HUMAN,P11142,HSP7C_HUMAN,HSPA8,Heat shock cognate 71 kDa protein,5.529095e-04,4.131862,3.257346,M549,HSP7C_M549
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,CKAP4,Cytoskeleton-associated protein 4,1.337810e-07,3.424534,6.873606,M423,CKAP4_M423
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,sp|Q9Y490|TLN1_HUMAN,Q9Y490,TLN1_HUMAN,TLN1,Talin-1,1.377828e-03,2.946435,2.860805,M1759,TLN1_M1759
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,sp|P99999|CYC_HUMAN,P99999,CYC_HUMAN,CYCS,Cytochrome c,2.391139e-03,2.876139,2.621395,M66,CYC_M66
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,sp|P00374|DYR_HUMAN,P00374,DYR_HUMAN,DHFR,Dihydrofolate reductase,1.270312e-02,2.831927,1.896090,M112,DYR_M112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,sp|Q9H444|CHM4B_HUMAN,Q9H444,CHM4B_HUMAN,CHMP4B,Charged multivesicular body protein 4b,9.760989e-01,-0.002733,0.010506,M35,CHM4B_M35
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,sp|Q15233|NONO_HUMAN,Q15233,NONO_HUMAN,NONO,Non-POU domain-containing octamer-binding protein,9.809736e-01,0.002278,0.008343,M441,NONO_M441
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,KTN1,Kinectin,9.825290e-01,-0.008675,0.007655,M406,KTN1_M406
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,sp|P51148|RAB5C_HUMAN,P51148,RAB5C_HUMAN,RAB5C,Ras-related protein Rab-5C,9.893231e-01,-0.012026,0.004662,M89,RAB5C_M89


In [5]:
# Canonicalize data - none to do here
# (placeholder step: the expression has no effect and the trailing ';' hides the output)
peptides;

In [6]:
# Manual labeling of peptides
# Labels are positional: the first 513 rows are "red", the next 81 "blue"
# and the last 457 "grey" (in the table's current row order).
label_col_data = ["red"] * 513 + ["blue"] * 81 + ["grey"] * 457

# Guard the hard-coded counts: if the table ever has a different number of rows,
# the assignment below would otherwise leave NaN colors or silently drop labels.
if len(label_col_data) != len(peptides):
    raise ValueError(
        f"Manual color labels cover {len(label_col_data)} rows, "
        f"but the peptide table has {len(peptides)} rows"
    )

# Build the Series on the table's own index so the assignment is positional
# rather than dependent on index-label alignment.
label_col = pd.Series(label_col_data, index=peptides.index)
peptides["Color"] = label_col
peptides;

In [7]:
# Distinct UniProt accessions present in the peptide table
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs!s}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size!s}")

Unique UniProt IDs: 
['P11142' 'Q07065' 'Q9Y490' 'P99999' 'P00374' 'O43837' 'P04406' 'P63261'
 'P07437' 'Q15233' 'P13693' 'P62820' 'P39023' 'P46777' 'Q13263' 'P27694'
 'P22626' 'O75208' 'P07954' 'Q92841' 'Q9UMX0' 'P06576' 'P12004' 'Q8WWM7'
 'Q14152' 'P06733' 'O43242' 'Q00577' 'P13639' 'P10809' 'P22307' 'P24752'
 'P27816' 'O15042' 'P62258' 'Q9BR76' 'P34932' 'Q969T9' 'Q13526' 'Q96A49'
 'Q9NX55' 'P0DMV8' 'P26038' 'P08238' 'Q9Y237' 'Q9BQE3' 'Q9BYN8' 'Q9GZM5'
 'Q9Y4L1' 'P26583' 'P53999' 'P27635' 'Q13428' 'O75396' 'P12270' 'Q96CT7'
 'P28066' 'Q8N8S7' 'O14974' 'P68133' 'P27797' 'P09874' 'O14654' 'Q14204'
 'P60842' 'Q86Y82' 'Q9Y2L1' 'Q9NP61' 'P06493' 'Q96MW1' 'P31943' 'P31948'
 'Q12907' 'P08708' 'Q5JSH3' 'P33316' 'P68371' 'Q7Z5L9' 'Q8WYA6' 'Q14684'
 'P06748' 'P52272' 'P15121' 'Q9BZI7' 'Q13435' 'P78344' 'O43719' 'Q02790'
 'Q68EM7' 'P35580' 'Q14247' 'P61978' 'P49750' 'P35241' 'Q9Y266' 'P07814'
 'Q08211' 'P49792' 'P51858' 'Q15366' 'O60814' 'P13804' 'P11940' 'P35579'
 'P61088' 'Q9NQC3' 'Q14980' 'O

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    """Fetch the complete amino-acid sequence of a UniProt entry.

    Parameters
    ----------
    cID : str
        UniProt accession (e.g. "P11142").

    Returns
    -------
    str
        Sequence of the first record in the FASTA document returned by UniProt.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an error status (e.g. unknown accession).
    ValueError
        If the response contains no FASTA record.
    """
    baseUrl = "https://rest.uniprot.org/uniprotkb/"
    currentUrl = baseUrl + cID + ".fasta"

    # Fetching a record is a read, so use GET (not POST); the timeout keeps a
    # stalled connection from hanging a long batch of lookups.
    response = r.get(currentUrl, timeout=30)
    # Surface HTTP errors here instead of trying to parse an error page as FASTA.
    response.raise_for_status()

    pSeq = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not pSeq:
        raise ValueError(f"No FASTA record returned for UniProt ID {cID!r}")

    return str(pSeq[0].seq)

In [9]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [10]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
# The saved index column ("Unnamed: 0") becomes the index again, without a name.
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0")
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [11]:
# Attach the complete protein sequence to every peptide row.
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")

# A left merge silently duplicates peptide rows when the cache lists a protein more
# than once, and leaves NaN when a protein is missing from the cache. Either case
# would break the string indexing done on "Complete Sequence" afterwards, so fail here.
if len(peptides_cs) != len(peptides):
    raise ValueError(
        f"Merging sequences changed the row count from {len(peptides)} to {len(peptides_cs)}; "
        "the sequence cache probably contains duplicate Protein IDs"
    )
missing_sequence_ids = peptides_cs.loc[peptides_cs["Complete Sequence"].isna(), "Protein ID"].unique()
if len(missing_sequence_ids) > 0:
    raise ValueError(f"No cached sequence for Protein IDs: {list(missing_sequence_ids)}")

peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label,Color,Complete Sequence
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,HSP7C_HUMAN,HSPA8,Heat shock cognate 71 kDa protein,5.529095e-04,4.131862,3.257346,M549,HSP7C_M549,red,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,CKAP4_HUMAN,CKAP4,Cytoskeleton-associated protein 4,1.337810e-07,3.424534,6.873606,M423,CKAP4_M423,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,TLN1_HUMAN,TLN1,Talin-1,1.377828e-03,2.946435,2.860805,M1759,TLN1_M1759,red,MVALSLKISIGNVVKTMQFEPSTMVYDACRIIRERIPEAPAGPPSD...
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,CYC_HUMAN,CYCS,Cytochrome c,2.391139e-03,2.876139,2.621395,M66,CYC_M66,red,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,DYR_HUMAN,DHFR,Dihydrofolate reductase,1.270312e-02,2.831927,1.896090,M112,DYR_M112,red,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,CHM4B_HUMAN,CHMP4B,Charged multivesicular body protein 4b,9.760989e-01,-0.002733,0.010506,M35,CHM4B_M35,grey,MSVFGKLFGAGGGKAGKGGPTPQEAIQRLRDTEEMLSKKQEFLEKK...
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,NONO_HUMAN,NONO,Non-POU domain-containing octamer-binding protein,9.809736e-01,0.002278,0.008343,M441,NONO_M441,grey,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,KTN1_HUMAN,KTN1,Kinectin,9.825290e-01,-0.008675,0.007655,M406,KTN1_M406,grey,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,RAB5C_HUMAN,RAB5C,Ras-related protein Rab-5C,9.893231e-01,-0.012026,0.004662,M89,RAB5C_M89,grey,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...


In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    """Build a regex that matches a methionine carrying any of the given modifications.

    Parameters
    ----------
    modifications : sequence of str
        Modification masses exactly as written inside the brackets of the
        peptide strings, e.g. ``["577.3085", "583.3160"]``.

    Returns
    -------
    str
        Alternation of the literals ``M[<mass>]``, one per modification, in input
        order, with every regex metacharacter escaped
        (e.g. ``M\\[577\\.3085\\]|M\\[583\\.3160\\]``).

    Raises
    ------
    ValueError
        If ``modifications`` is empty.
    """
    import re  # local import so the helper does not depend on the notebook's import cell

    if len(modifications) == 0:
        raise ValueError("modifications must contain at least one mass string")

    # re.escape handles the decimal point (and masses written without one), so the
    # mass no longer has to split into exactly two parts around a ".".
    return "|".join(r"M\[" + re.escape(mass) + r"\]" for mass in modifications)

# Build the pattern once for the configured modifications and print it for inspection
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[577\.3085\]|M\[583\.3160\]


In [13]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` keeping only the characters that occur in IUPACCodes.

    Everything else (brackets, digits, dots of the mass annotations, ...) is dropped.
    """
    kept_chars = []
    for char in string:
        if char in IUPACCodes:
            kept_chars.append(char)
    return ''.join(kept_chars)

# Strip the bracketed mass annotations so the bare peptide can be searched for
# inside the complete protein sequence.
peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [14]:
# 0-based start of each peptide inside its protein (str.find: first occurrence, -1 if absent)
sequence_starts = []
for protein_sequence, peptide_sequence in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"]):
    sequence_starts.append(protein_sequence.find(peptide_sequence))
peptides_cs["Sequence Location"] = pd.Series(sequence_starts)
peptides_cs;

In [15]:
# Length of the bare peptide (mass annotations already removed), in residues
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [16]:
# Sanity check - ensure sequence indexing is correct
# Slice every protein at the computed location/length; the slice must equal the peptide.
# (A peptide that was not found has location -1 and produces a mismatching slice.)
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
indexing_matches = (temp == peptides_cs["Peptide Sequence"])

# Fail loudly instead of relying on someone reading the counts below.
assert indexing_matches.all(), f"{int((~indexing_matches).sum())} peptides do not match their protein slice"
indexing_matches.value_counts()

Peptide Sequence
True    1051
Name: count, dtype: int64

In [17]:
# Extract left prefix of modified methionine (for indexing purposes)
# `filtering` is the helper already defined alongside IUPACCodes in the cell that builds
# "Peptide Sequence"; it is reused here rather than declared a second time.

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

# Capture group 0 is everything to the left of the modified methionine. Because `.*` is
# greedy, a peptide with several matching modifications is split at the LAST one.
raw_left_prefix = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]

# str.extract yields NaN when a peptide carries none of the target modifications;
# report those rows explicitly instead of failing later inside `filtering`.
unmatched_peptides = peptides_cs.loc[raw_left_prefix.isna(), "Light Modified Peptide"]
if len(unmatched_peptides) > 0:
    raise ValueError(
        f"{len(unmatched_peptides)} peptides contain none of the target modifications, "
        f"e.g. {unmatched_peptides.iloc[0]!r}"
    )

peptides_cs["Left Prefix"] = raw_left_prefix.map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[577\.3085\]|M\[583\.3160\])


In [18]:
# 0-based index of the modified methionine in the complete protein sequence:
# start of the peptide plus the number of residues preceding the methionine in it.
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [19]:
# Sanity check - ensure methionine locations are correct
# The residue at every computed location must be a methionine ("M").
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]

# Fail loudly instead of relying on someone reading the True/False below.
assert temp.count("M") == len(temp), f"{len(temp) - temp.count('M')} computed locations are not methionines"
temp.count("M") == len(temp)

True

In [20]:
# Compute left/right analysis sequences based on threshold
# Left window: up to `analysis_threshold` residues immediately before the methionine.
# max(0, ...) clamps the window at the start of the protein, so a methionine closer than
# the threshold to the N-terminus gets all residues before it, including the one directly
# adjacent, and a methionine at index 0 gets an empty window.
peptides_cs[f"Left {analysis_threshold}"] = [
    A[max(0, B - analysis_threshold):B]
    for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
# Right window: up to `analysis_threshold` residues immediately after the methionine
# (slicing already truncates at the end of the protein).
peptides_cs[f"Right {analysis_threshold}"] = [
    A[B+1:B+1+analysis_threshold]
    for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,red,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,NSLESYAFNMK,539,11,NSLESYAFN,9,548,DEKQRDKVSSKNSLESYAFN,KATVEDEKLQGKINDEDKQK
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,red,MVALSLKISIGNVVKTMQFEPSTMVYDACRIIRERIPEAPAGPPSD...,TLSHPQQMALLDQTK,1751,15,TLSHPQQ,7,1758,EPLTLAAVGAASKTLSHPQQ,ALLDQTKTLAESALQLLYTA
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,red,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,GIIWGEDTLMEYLENPK,56,17,GIIWGEDTL,9,65,GYSYTAANKNKGIIWGEDTL,EYLENPKKYIPGTKMIFVGI
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,red,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,VDMVWIVGGSSVYK,109,14,VD,2,111,RSLDDALKLTEQPELANKVD,VWIVGGSSVYKEAMNHPGHL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,grey,MSVFGKLFGAGGGKAGKGGPTPQEAIQRLRDTEEMLSKKQEFLEKK...,LRDTEEMLSK,28,10,LRDTEE,6,34,AGKGGPTPQEAIQRLRDTEE,LSKKQEFLEKKIEQELTAAK
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,grey,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,FGQAATMEGIGAIGGTPPAFNR,434,22,FGQAAT,6,440,PDGTLGLTPPTTERFGQAAT,EGIGAIGGTPPAFNRAAPGA
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,grey,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,IHVSYQETQQMQMK,393,14,IHVSYQETQQMQ,12,405,EHNVFQNKIHVSYQETQQMQ,KFQQVREQMEAEIAHLKQEN
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,grey,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...,YHSLAPMYYR,82,10,YHSLAP,6,88,TVKFEIWDTAGQERYHSLAP,YYRGAQAAIVVYDITNTDTF


In [21]:
# Remove invalid proteins (according to alphafold)
# 18 invalid peptides as a result -> 9 red, 0 blue, 9 grey

invalid_IDs = ['Q14204', 'P49792', 'P46013', 'Q9NU22', 'Q9Y520', 'Q9UQ35', 'P78527', 'Q7Z6Z7']

# Compute the membership mask once: show the rows being dropped, then keep the rest.
belongs_to_invalid_protein = peptides_cs["Protein ID"].isin(invalid_IDs)
display(peptides_cs[belongs_to_invalid_protein])
peptides_cs = peptides_cs[~belongs_to_invalid_protein]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,red,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,NSLESYAFNMK,539,11,NSLESYAFN,9,548,DEKQRDKVSSKNSLESYAFN,KATVEDEKLQGKINDEDKQK
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,red,MVALSLKISIGNVVKTMQFEPSTMVYDACRIIRERIPEAPAGPPSD...,TLSHPQQMALLDQTK,1751,15,TLSHPQQ,7,1758,EPLTLAAVGAASKTLSHPQQ,ALLDQTKTLAESALQLLYTA
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,red,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,GIIWGEDTLMEYLENPK,56,17,GIIWGEDTL,9,65,GYSYTAANKNKGIIWGEDTL,EYLENPKKYIPGTKMIFVGI
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,red,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,VDMVWIVGGSSVYK,109,14,VD,2,111,RSLDDALKLTEQPELANKVD,VWIVGGSSVYKEAMNHPGHL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,grey,MSVFGKLFGAGGGKAGKGGPTPQEAIQRLRDTEEMLSKKQEFLEKK...,LRDTEEMLSK,28,10,LRDTEE,6,34,AGKGGPTPQEAIQRLRDTEE,LSKKQEFLEKKIEQELTAAK
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,grey,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,FGQAATMEGIGAIGGTPPAFNR,434,22,FGQAAT,6,440,PDGTLGLTPPTTERFGQAAT,EGIGAIGGTPPAFNRAAPGA
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,grey,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,IHVSYQETQQMQMK,393,14,IHVSYQETQQMQ,12,405,EHNVFQNKIHVSYQETQQMQ,KFQQVREQMEAEIAHLKQEN
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,grey,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...,YHSLAPMYYR,82,10,YHSLAP,6,88,TVKFEIWDTAGQERYHSLAP,YYRGAQAAIVVYDITNTDTF


## Download Alphafold Data - ChURRO_1

In [22]:
# Locations of the cached AlphaFold files: structures under cif/, error files under pae/

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir, pae_dir = (os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae"))

for resolved_path in (alphafold_path, cif_dir, pae_dir):
    print(resolved_path)

/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


In [23]:
# Set uniprot IDs to use
# Recomputed from peptides_cs so the list reflects the table's current rows.
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs!s}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size!s}")

Unique UniProt IDs: 
['P11142' 'Q07065' 'Q9Y490' 'P99999' 'P00374' 'O43837' 'P04406' 'P63261'
 'P07437' 'Q15233' 'P13693' 'P62820' 'P39023' 'P46777' 'Q13263' 'P27694'
 'P22626' 'O75208' 'P07954' 'Q92841' 'Q9UMX0' 'P06576' 'P12004' 'Q8WWM7'
 'Q14152' 'P06733' 'O43242' 'Q00577' 'P13639' 'P10809' 'P22307' 'P24752'
 'P27816' 'O15042' 'P62258' 'Q9BR76' 'P34932' 'Q969T9' 'Q13526' 'Q96A49'
 'Q9NX55' 'P0DMV8' 'P26038' 'P08238' 'Q9Y237' 'Q9BQE3' 'Q9BYN8' 'Q9GZM5'
 'Q9Y4L1' 'P26583' 'P53999' 'P27635' 'Q13428' 'O75396' 'P12270' 'Q96CT7'
 'P28066' 'Q8N8S7' 'O14974' 'P68133' 'P27797' 'P09874' 'O14654' 'P60842'
 'Q86Y82' 'Q9Y2L1' 'Q9NP61' 'P06493' 'Q96MW1' 'P31943' 'P31948' 'Q12907'
 'P08708' 'Q5JSH3' 'P33316' 'P68371' 'Q7Z5L9' 'Q8WYA6' 'Q14684' 'P06748'
 'P52272' 'P15121' 'Q9BZI7' 'Q13435' 'P78344' 'O43719' 'Q02790' 'Q68EM7'
 'P35580' 'Q14247' 'P61978' 'P49750' 'P35241' 'Q9Y266' 'P07814' 'Q08211'
 'P51858' 'Q15366' 'O60814' 'P13804' 'P11940' 'P35579' 'P61088' 'Q9NQC3'
 'Q14980' 'O43660' 'P14625' 'O

In [24]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
# The three return values correspond to the "Valid" / "Invalid" / "Existing" counts in the
# log output; presumably newly downloaded, unavailable and already-on-disk proteins - TODO confirm.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 557/557 [00:00<00:00, 177889.84it/s]

2025-02-24 11:39:16> Valid proteins: 0
2025-02-24 11:39:16> Invalid proteins: 0
2025-02-24 11:39:16> Existing proteins: 557





In [25]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
# The three return values correspond to the "Valid" / "Invalid" / "Existing" counts in the
# log output; presumably newly downloaded, unavailable and already-on-disk proteins - TODO confirm.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 557/557 [00:00<00:00, 170652.11it/s]

2025-02-24 11:39:17> Valid proteins: 0
2025-02-24 11:39:17> Invalid proteins: 0
2025-02-24 11:39:17> Existing proteins: 557





## Construct Alphafold Dataframe (Calculate Accessibilities) - ChURRO_1

In [26]:
# Format alphafold data into dataframe
# Reads the cif files in cif_dir for the selected proteins. As the displayed output shows,
# the result has one row per residue (protein_id, AA, position, quality, backbone
# coordinates, secondary-structure columns).
alphafold_annotation_ChURRO_1 = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_ChURRO_1

100%|██████████| 1849/1849 [00:53<00:00, 34.51it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A0A2R8Y4L2,1,M,1,84.87,16.672,16.441,15.737,15.649,-4.743,...,-14.713,-13.538,-15.178,unstructured,unstructured,0,0,0,0,1
1,A0A2R8Y4L2,1,R,2,89.95,18.394,18.171,19.389,17.831,-7.753,...,-17.035,-17.849,-15.961,STRN,STRN,0,0,1,0,0
2,A0A2R8Y4L2,1,D,3,89.61,20.042,18.508,17.905,18.174,-10.302,...,-16.919,-17.992,-17.255,unstructured,unstructured,0,0,0,0,1
3,A0A2R8Y4L2,1,P,4,93.86,22.804,22.046,22.252,20.600,-11.701,...,-15.526,-14.118,-15.741,TURN_TY1_P,TURN,0,0,0,1,0
4,A0A2R8Y4L2,1,N,5,94.20,22.703,22.805,22.164,22.171,-13.138,...,-18.112,-17.913,-17.180,TURN_TY1_P,TURN,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352757,Q9Y6X4,557,K,666,53.36,-44.109,-44.548,-43.702,-44.447,-47.884,...,-10.531,-10.393,-11.884,unstructured,unstructured,0,0,0,0,1
352758,Q9Y6X4,557,A,667,54.16,-44.765,-45.645,-46.279,-44.950,-48.051,...,-7.492,-7.817,-8.622,unstructured,unstructured,0,0,0,0,1
352759,Q9Y6X4,557,K,668,46.58,-45.037,-44.530,-44.726,-45.243,-47.928,...,-3.931,-3.624,-5.178,unstructured,unstructured,0,0,0,0,1
352760,Q9Y6X4,557,L,669,56.53,-42.662,-44.147,-44.931,-44.103,-49.445,...,-0.766,-0.790,-2.103,unstructured,unstructured,0,0,0,0,1


In [27]:
# Calculate full sphere exposure -> radius = 2
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_2_180_pae' column used later.
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 557/557 [00:08<00:00, 67.53it/s] 


In [28]:
# Start the accessibility table: left-join the radius-2 counts onto the raw annotation,
# matching residues on protein_id + AA + position.
alphafold_accessibility_ChURRO_1 = alphafold_annotation_ChURRO_1.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [29]:
# Calculate full sphere exposure -> radius = 3
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_3_180_pae' column used later.
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 557/557 [00:06<00:00, 81.98it/s] 


In [30]:
# Left-join the radius-3 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [31]:
# Calculate full sphere exposure -> radius = 4
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_4_180_pae' column used later.
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 557/557 [00:06<00:00, 90.10it/s] 


In [32]:
# Left-join the radius-4 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [33]:
# Calculate full sphere exposure -> radius = 4.5
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_4.5_180_pae' column used later.
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 557/557 [00:06<00:00, 86.92it/s] 


In [34]:
# Left-join the radius-4.5 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [35]:
# Calculate full sphere exposure -> radius = 5
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_5_180_pae' column used later.
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 557/557 [00:06<00:00, 87.19it/s] 


In [36]:
# Left-join the radius-5 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [37]:
# Calculate full sphere exposure -> radius = 5.5
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_5.5_180_pae' column used later.
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 557/557 [00:06<00:00, 84.43it/s] 


In [38]:
# Left-join the radius-5.5 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [39]:
# Calculate full sphere exposure -> radius = 6
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_6_180_pae' column used later.
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 557/557 [00:06<00:00, 83.09it/s] 


In [40]:
# Left-join the radius-6 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [41]:
# Calculate full sphere exposure -> radius = 6.5
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_6.5_180_pae' column used later.
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 557/557 [00:07<00:00, 78.22it/s] 


In [42]:
# Left-join the radius-6.5 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [43]:
# Calculate full sphere exposure -> radius = 7
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_7_180_pae' column used later.
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 557/557 [00:06<00:00, 80.69it/s] 


In [44]:
# Left-join the radius-7 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [45]:
# Calculate full sphere exposure -> radius = 7.5
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_7.5_180_pae' column used later.
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 557/557 [00:07<00:00, 79.49it/s] 


In [46]:
# Left-join the radius-7.5 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [47]:
# Calculate full sphere exposure -> radius = 8
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_8_180_pae' column used later.
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 557/557 [00:06<00:00, 84.35it/s] 


In [48]:
# Left-join the radius-8 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [49]:
# Calculate full sphere exposure -> radius = 12
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_12_180_pae' column used later.
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 557/557 [00:07<00:00, 77.30it/s] 


In [50]:
# Left-join the radius-12 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [51]:
# Calculate full sphere exposure -> radius = 18
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_18_180_pae' column used later.
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 557/557 [00:08<00:00, 64.32it/s] 


In [52]:
# Left-join the radius-18 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [53]:
# Calculate full sphere exposure -> radius = 24
# max_angle=180 leaves the direction unrestricted (full sphere); error_dir is the folder the
# PAE files were downloaded into. Contributes the 'nAA_24_180_pae' column, whose smoothed
# version is later thresholded into the IDR flag.
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 557/557 [00:10<00:00, 53.92it/s]


In [54]:
# Left-join the radius-24 counts onto the running accessibility table (same residue keys).
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [55]:
# Calculate part sphere exposure -> angle = 70, radius = 12
# Unlike the full-sphere cells, max_angle=70 restricts the counted neighbours by direction;
# error_dir is the folder the PAE files were downloaded into.
# Contributes the 'nAA_12_70_pae' column used later.
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 557/557 [00:07<00:00, 71.48it/s] 


In [56]:
# Left-join the angle-70 / radius-12 counts onto the running accessibility table and display it.
# NOTE(review): the cell reassigns its own input; re-running it alone merges the column
# again and pandas then renames the duplicates with _x/_y suffixes.
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A0A2R8Y4L2,1,M,1,84.87,16.672,16.441,15.737,15.649,-4.743,...,1,1,1,1,1,1,6,12,19,0
1,A0A2R8Y4L2,1,R,2,89.95,18.394,18.171,19.389,17.831,-7.753,...,2,2,3,3,6,6,12,13,23,1
2,A0A2R8Y4L2,1,D,3,89.61,20.042,18.508,17.905,18.174,-10.302,...,2,3,3,4,4,5,10,13,19,2
3,A0A2R8Y4L2,1,P,4,93.86,22.804,22.046,22.252,20.600,-11.701,...,2,2,2,3,4,4,9,12,13,0
4,A0A2R8Y4L2,1,N,5,94.20,22.703,22.805,22.164,22.171,-13.138,...,2,2,2,4,4,4,6,9,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352757,Q9Y6X4,557,K,666,53.36,-44.109,-44.548,-43.702,-44.447,-47.884,...,1,2,2,2,2,2,4,7,9,0
352758,Q9Y6X4,557,A,667,54.16,-44.765,-45.645,-46.279,-44.950,-48.051,...,2,2,2,2,2,2,4,7,8,0
352759,Q9Y6X4,557,K,668,46.58,-45.037,-44.530,-44.726,-45.243,-47.928,...,1,2,2,2,2,2,4,5,7,0
352760,Q9Y6X4,557,L,669,56.53,-42.662,-44.147,-44.931,-44.103,-49.445,...,1,2,2,2,2,2,3,4,6,0


In [57]:
# Every exposure column in the accessibility table gets a smoothed counterpart.
exposure_columns = np.array([
    'nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae',
    'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae',
    'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae',
    'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae',
])
# Smoothing parameter list; the resulting columns carry the suffix "_smooth10".
smoothing_windows = [10]

alphafold_accessibility_ChURRO_1_smooth = get_smooth_score(
    alphafold_accessibility_ChURRO_1,
    exposure_columns,
    smoothing_windows,
)
alphafold_accessibility_ChURRO_1_smooth;

100%|██████████| 557/557 [00:01<00:00, 426.29it/s]


In [58]:
# Binary 'IDR' flag: 1 where the smoothed radius-24 full-sphere count is at most 34.27, else 0.
# NOTE(review): 34.27 is an unexplained constant here - presumably a disorder cutoff taken
# from the structuremap documentation; confirm its source and consider naming it.
alphafold_accessibility_ChURRO_1_smooth['IDR'] = np.where(
    alphafold_accessibility_ChURRO_1_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_ChURRO_1_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A0A2R8Y4L2,1,M,1,84.87,16.672,16.441,15.737,15.649,-4.743,...,2.090909,2.545455,3.272727,4.181818,4.545455,9.000000,13.909091,21.636364,0.818182,1
1,A0A2R8Y4L2,1,R,2,89.95,18.394,18.171,19.389,17.831,-7.753,...,2.083333,2.500000,3.250000,4.083333,4.416667,9.083333,15.166667,24.583333,0.750000,1
2,A0A2R8Y4L2,1,D,3,89.61,20.042,18.508,17.905,18.174,-10.302,...,2.076923,2.461538,3.153846,3.923077,4.307692,9.153846,16.384615,27.230769,0.692308,1
3,A0A2R8Y4L2,1,P,4,93.86,22.804,22.046,22.252,20.600,-11.701,...,2.071429,2.428571,3.071429,3.785714,4.214286,9.285714,17.928571,30.000000,0.714286,1
4,A0A2R8Y4L2,1,N,5,94.20,22.703,22.805,22.164,22.171,-13.138,...,2.066667,2.400000,3.000000,3.733333,4.200000,9.400000,19.266667,32.066667,0.800000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,Q9Y6X4,557,K,666,53.36,-44.109,-44.548,-43.702,-44.447,-47.884,...,1.933333,1.933333,1.933333,1.933333,1.933333,3.733333,5.733333,8.533333,0.000000,1
666,Q9Y6X4,557,A,667,54.16,-44.765,-45.645,-46.279,-44.950,-48.051,...,1.928571,1.928571,1.928571,1.928571,1.928571,3.714286,5.714286,8.500000,0.000000,1
667,Q9Y6X4,557,K,668,46.58,-45.037,-44.530,-44.726,-45.243,-47.928,...,1.923077,1.923077,1.923077,1.923077,1.923077,3.692308,5.692308,8.384615,0.000000,1
668,Q9Y6X4,557,L,669,56.53,-42.662,-44.147,-44.931,-44.103,-49.445,...,1.916667,1.916667,1.916667,1.916667,1.916667,3.666667,5.666667,8.416667,0.000000,1


## Merge Dataframes into Full Dataset (Includes Alphafold) - ChURRO_1

In [59]:
# Zero-index the AlphaFold residue positions so they line up with the zero-based
# "Methionine Location" column of the peptide dataframe.
# NOTE(review): "position" is overwritten in place, so running this cell a second
# time shifts it again - re-run the smoothing cells above before re-running this one.
alphafold_accessibility_ChURRO_1_smooth["position"] = (
    alphafold_accessibility_ChURRO_1_smooth["position"].sub(1)
)

# Left join: every peptide row is kept; the structural columns stay empty (NaN)
# for peptides whose (protein, position) pair has no AlphaFold residue.
peptides_wa = peptides_cs.merge(
    alphafold_accessibility_ChURRO_1_smooth,
    left_on=["Protein ID", "Methionine Location"],
    right_on=["protein_id", "position"],
    how="left",
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,2.095238,4.380952,5.285714,6.714286,6.952381,15.666667,40.666667,61.238095,4.380952,0.0
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,2.380952,3.714286,5.238095,6.619048,7.047619,18.047619,57.952381,95.047619,5.380952,0.0
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,2.380952,3.857143,6.000000,7.285714,8.190476,20.333333,51.047619,87.857143,4.619048,0.0
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,2.476190,3.333333,4.857143,6.666667,8.095238,21.428571,59.619048,108.380952,5.523810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,2.428571,5.095238,6.047619,8.380952,8.476190,18.142857,36.428571,52.190476,4.619048,0.0
1029,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,6.285714,10.000000,0.095238,1.0
1030,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,2.571429,5.285714,6.095238,7.952381,8.000000,12.047619,19.904762,27.476190,2.142857,1.0
1031,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,2.523810,3.428571,4.619048,5.952381,6.904762,18.904762,55.714286,94.666667,4.857143,0.0


In [60]:
# NOTE: five peptide sequences differ between UniProt & AlphaFold
#
# Keep only the rows whose matched AlphaFold residue ("AA") is a methionine.
# Rows with a different residue, or with no AlphaFold match at all (NaN), are
# displayed for inspection and then dropped.
is_methionine = peptides_wa["AA"] == "M"

# option_context restores the previous "display.max_columns" value on exit, even
# if display() raises, instead of forcing the option back to the pandas default.
with pd.option_context("display.max_columns", None):
    display(peptides_wa[~is_methionine])

peptides_wa = peptides_wa[is_methionine]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
50,EIQSSNLETAM[577.3085]SVIGDR,EIQSSNLETAM[583.3160]SVIGDR,1.089386,,1.279832,,1.261323,1.268879,,,sp|Q9NX55|HYPK_HUMAN,Q9NX55,HYPK_HUMAN,HYPK,Huntingtin-interacting protein K,0.000111,1.224855,3.954171,M56,HYPK_M56,red,MATEGDVELELETETSGPERPPEKPRKHDSGAADLERVTDYAEEKE...,EIQSSNLETAMSVIGDR,45,17,EIQSSNLETA,10,55,ERVTDYAEEKEIQSSNLETA,SVIGDRRSREQKAKQEREKE,Q9NX55,502.0,Q,55.0,69.9,-14.11,-14.821,-14.402,-16.255,-11.201,-12.063,-13.538,-11.93,34.016,35.062,34.949,34.86,unstructured,unstructured,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,12.0,0.0,0.0,0.0,0.0,0.0,0.47619,0.47619,1.571429,1.571429,2.0,2.047619,2.095238,4.761905,8.428571,12.190476,0.571429,1.0
275,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAM[577.3085]ETG...,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAM[583.3160]ETG...,0.375276,0.628574,0.366605,0.496898,0.594104,0.535261,0.196503,0.497402,sp|Q8NFD5|ARI1B_HUMAN,Q8NFD5,ARI1B_HUMAN,ARID1B,AT-rich interactive domain-containing protein 1B,3.6e-05,0.461328,4.438909,M142,ARI1B_M142,red,MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG...,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAMETGLLPNHK,109,42,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSA,32,141,SSSAAAAAASSSSSSGPGSA,ETGLLPNHKLKTVGEAPAAP,Q8NFD5,418.0,G,141.0,30.16,-26.329,-27.241,,-28.344,76.067,77.156,,76.553,0.29,0.884,,1.665,unstructured,unstructured,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,7.0,10.0,0.0,0.0,0.0,0.0,0.619048,1.380952,1.47619,2.0,2.0,2.0,2.0,2.0,4.0,6.428571,9.904762,0.285714,1.0
661,AQEAAAAVM[577.3085]QAAANSAQSR,AQEAAAAVM[583.3160]QAAANSAQSR,,0.442455,0.061465,,0.231275,-0.056427,0.580257,,sp|Q8NFD5|ARI1B_HUMAN,Q8NFD5,ARI1B_HUMAN,ARID1B,AT-rich interactive domain-containing protein 1B,0.098649,0.251805,1.005906,M1020,ARI1B_M1020,grey,MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG...,AQEAAAAVMQAAANSAQSR,1011,19,AQEAAAAV,8,1019,GMGPPMPTVNRKAQEAAAAV,QAAANSAQSRQGSFPGMNQS,Q8NFD5,418.0,K,1019.0,31.07,13.037,11.897,11.484,12.313,-20.213,-19.269,-19.647,-17.839,-82.437,-82.871,-84.311,-82.847,unstructured,unstructured,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,7.0,10.0,0.0,0.0,0.0,0.0,0.428571,1.333333,1.47619,2.0,2.0,2.0,2.0,2.0,4.0,6.428571,9.809524,0.142857,1.0
773,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPM[5...,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPM[5...,,0.607134,,,,,,0.335,sp|A0A2R8Y4L2|RA1L3_HUMAN,A0A2R8Y4L2,RA1L3_HUMAN,HNRNPA1L3,Heterogeneous nuclear ribonucleoprotein A1-like 3,0.179014,0.471067,0.747112,M276,RA1L3_M276,grey,MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM...,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMK,232,45,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGP,43,275,GGGSYNDFGNYNNQSSNFGP,KGGNFEGRSSGPHGGGGQYF,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
828,EHM[577.3085]GNVVEALIALTN,EHM[583.3160]GNVVEALIALTN,0.253647,-0.532562,0.354471,0.270041,,0.254807,0.216633,0.227243,sp|Q9NX55|HYPK_HUMAN,Q9NX55,HYPK_HUMAN,HYPK,Huntingtin-interacting protein K,0.241736,0.149183,0.616659,M109,HYPK_M109,grey,MATEGDVELELETETSGPERPPEKPRKHDSGAADLERVTDYAEEKE...,EHMGNVVEALIALTN,106,15,EH,2,108,LIMTEMEISRAAAERSLREH,GNVVEALIALTN,Q9NX55,502.0,A,108.0,92.77,3.291,2.081,2.371,0.883,1.961,2.414,3.753,2.57,-7.576,-8.404,-9.096,-7.572,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,5.0,6.0,9.0,12.0,24.0,39.0,44.0,12.0,0.0,0.0,0.0,0.0,2.0,2.0,2.095238,3.52381,4.714286,5.904762,6.619048,15.0,33.238095,43.809524,3.47619,0.0


In [61]:
# NOTE: one peptide has two modifications
#
# "Site" (e.g. "M43") is compared against the zero-based "Methionine Location" + 1.
# Rows where the two disagree are displayed for inspection and then dropped.
site_position = peptides_wa["Site"].str.strip("M").astype(int)
site_matches_location = site_position == peptides_wa["Methionine Location"] + 1

# option_context restores the previous "display.max_columns" value on exit, even
# if display() raises, instead of forcing the option back to the pandas default.
with pd.option_context("display.max_columns", None):
    display(peptides_wa[~site_matches_location])

peptides_wa = peptides_wa[site_matches_location]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
22,AAFNSGKVDIVAINDPFIDLNYM[577.3085]VYM[577.3085]...,AAFNSGKVDIVAINDPFIDLNYM[583.3160]VYM[583.3160]...,1.544582,,,1.967496,,,1.4354,,sp|P04406|G3P_HUMAN,P04406,G3P_HUMAN,GAPDH,Glyceraldehyde-3-phosphate dehydrogenase,0.009542,1.649159,2.020358,M43,G3P_M43,red,MGKVKVGVNGFGRIGRLVTRAAFNSGKVDIVAINDPFIDLNYMVYM...,AAFNSGKVDIVAINDPFIDLNYMVYMFQYDSTHGK,20,35,AAFNSGKVDIVAINDPFIDLNYMVY,25,45,GKVDIVAINDPFIDLNYMVY,FQYDSTHGKFHGTVKAENGK,P04406,67.0,M,45.0,98.74,0.353,0.691,0.121,0.183,6.75,5.768,4.396,6.194,18.406,19.531,19.149,20.842,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0,6.0,7.0,8.0,21.0,69.0,115.0,9.0,0.0,0.0,0.0,0.0,2.047619,2.047619,2.428571,3.52381,5.285714,6.190476,7.428571,19.571429,53.809524,104.952381,5.190476,0.0


In [64]:
# One-time export of the merged table to the CSV file that the next cell reloads.
# Currently disabled; uncomment to regenerate the file after changing the cells above.
#peptides_wa.to_csv(os.path.join(curr_dir_path, "ChURRO_1_with_alphafold.csv"))

In [65]:
# Reload the merged peptide/AlphaFold table from its CSV export. The CSV's first
# column ("Unnamed: 0") is turned back into the (unnamed) row index.
path = os.path.join(curr_dir_path, "ChURRO_1_with_alphafold.csv")
peptides_wa = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,2.095238,4.380952,5.285714,6.714286,6.952381,15.666667,40.666667,61.238095,4.380952,0.0
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,2.380952,3.714286,5.238095,6.619048,7.047619,18.047619,57.952381,95.047619,5.380952,0.0
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,2.380952,3.857143,6.000000,7.285714,8.190476,20.333333,51.047619,87.857143,4.619048,0.0
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,2.476190,3.333333,4.857143,6.666667,8.095238,21.428571,59.619048,108.380952,5.523810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,2.428571,5.095238,6.047619,8.380952,8.476190,18.142857,36.428571,52.190476,4.619048,0.0
1029,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,6.285714,10.000000,0.095238,1.0
1030,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,2.571429,5.285714,6.095238,7.952381,8.000000,12.047619,19.904762,27.476190,2.142857,1.0
1031,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,2.523810,3.428571,4.619048,5.952381,6.904762,18.904762,55.714286,94.666667,4.857143,0.0


## Set Parameters of Analysis - ChURRO_2

In [66]:
# Analysis parameters for the ChURRO_2 dataset

# Number of amino acids on either side of the modified methionine to analyze
analysis_threshold = 20

# Mass tags of the modifications we are looking for, as they appear inside "M[...]"
modifications = ["649.3660", "655.3735"]

# NOTE(review): the smaller mass is bound to `heavy_modification`, while the
# "Light Modified Peptide" column of the dataset carries M[649.3660] - confirm
# that this naming is intended before relying on these two variables.
heavy_modification, light_modification = modifications

## Load Dataset - ChURRO_2

In [67]:
# Load initial dataset
# Reads "ChURRO_2_isoDTB.csv" from curr_dir_path into `peptides`; this rebinds the
# `peptides` name that the ChURRO_1 section used for its own table.
data_loc = os.path.join(curr_dir_path, "ChURRO_2_isoDTB.csv")
peptides = pd.read_csv(data_loc)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Protein Description,pvalue,avg ratio,neglogpvalue,Site,Label
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,5.847717,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,Cytoskeleton-associated protein 4,1.634911e-05,6.486397,4.786506,M423,CKAP4_M423
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,5.759984,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,Proteasome subunit alpha type-1,6.741449e-05,6.336632,4.171247,M26,PSA1_M26
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,3.758318,sp|P08238|HS90B_HUMAN,P08238,HS90B_HUMAN,Heat shock protein HSP 90-beta,6.241545e-10,3.806415,9.204708,M466,HS90B_M466
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,2.647510,sp|P15121|ALDR_HUMAN,P15121,ALDR_HUMAN,Aldo-keto reductase family 1 member B1,5.401038e-06,3.038070,5.267523,M169,ALDR_M169
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,2.647917,sp|P34932|HSP74_HUMAN,P34932,HSP74_HUMAN,Heat shock 70 kDa protein 4,3.372263e-03,2.373026,2.472079,M604,HSP74_M604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,0.007357,sp|Q9Y266|NUDC_HUMAN,Q9Y266,NUDC_HUMAN,Nuclear migration protein nudC,9.505145e-01,-0.003187,0.022041,M50,NUDC_M50
620,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,0.822103,sp|P11142|HSP7C_HUMAN,P11142,HSP7C_HUMAN,Heat shock cognate 71 kDa protein,9.678040e-01,-0.043831,0.014213,M518,HSP7C_M518
621,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,-0.083871,sp|Q16543|CDC37_HUMAN,Q16543,CDC37_HUMAN,Hsp90 co-chaperone Cdc37,9.696497e-01,-0.003698,0.013385,M112,CDC37_M112
622,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,-0.239224,sp|Q9Y6D9|MD1L1_HUMAN,Q9Y6D9,MD1L1_HUMAN,Mitotic spindle assembly checkpoint protein MAD1,9.773242e-01,-0.008231,0.009961,M245,MD1L1_M245


In [68]:
# Canonicalize data - none to do here
# (no-op cell; the trailing semicolon suppresses the dataframe output)
peptides;

In [71]:
# Manual labeling of peptides by row position: the first 268 rows are "red",
# the next 73 "blue" and the remaining 283 "grey".
# NOTE(review): the counts are hard-coded and presumably rely on the row order of
# the input CSV - verify them whenever the input file changes.
label_col_data = [
    color
    for color, n_rows in (("red", 268), ("blue", 73), ("grey", 283))
    for _ in range(n_rows)
]
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

# Uncomment to inspect every row of the labelled table:
#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [72]:
# Distinct UniProt accessions present in the peptide table, and how many there are
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['Q07065' 'P25786' 'P08238' 'P15121' 'P34932' 'P04406' 'Q96CT7' 'Q15717'
 'Q14204' 'P47897' 'P12694' 'Q15366' 'P14625' 'P15374' 'Q9H078' 'P31948'
 'P11940' 'Q16836' 'Q12907' 'P26038' 'P13639' 'P07814' 'P67870' 'P15170'
 'P10809' 'Q16181' 'O95347' 'P07437' 'P68371' 'Q86UP2' 'P68363' 'Q13283'
 'P0DP23' 'P55196' 'Q07866' 'Q86Y82' 'P11142' 'P68104' 'P78371' 'Q15233'
 'P22061' 'P80303' 'P52272' 'P50402' 'P35579' 'P83731' 'Q9UHD8' 'Q9NR31'
 'P63261' 'P62829' 'Q9P013' 'P61978' 'O14950' 'P46777' 'O43776' 'Q9P2E9'
 'Q08211' 'P15311' 'P54819' 'P00367' 'Q6PKG0' 'P09012' 'Q9Y266' 'Q9Y520'
 'Q02818' 'P27816' 'Q9UQE7' 'O43143' 'Q4VCS5' 'P18669' 'Q9UMX0' 'Q12904'
 'Q9Y383' 'P09874' 'P35241' 'Q13404' 'P23588' 'Q9H910' 'Q9H1E3' 'P46778'
 'P38646' 'O60763' 'O14562' 'P13693' 'Q9H3P7' 'Q9UHX1' 'Q16891' 'Q9Y3U8'
 'Q13263' 'P78344' 'P31942' 'O00193' 'Q13586' 'O95983' 'P62847' 'P26368'
 'P23246' 'P07910' 'P40222' 'Q02790' 'Q86U42' 'Q14126' 'Q14247' 'P29692'
 'P49750' 'Q00341' 'O43852' 'Q

In [74]:
# Load the sequence cache (mapping from UniProt ID to complete AA sequence) and add any IDs it is missing
# SLOW - run this only once: the updated cache is written to a CSV file, which the next cell re-loads

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [75]:
# Read the cached UniProt ID -> complete AA sequence table; the CSV's "Unnamed: 0"
# column is turned back into the (unnamed) row index.
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
sequence_cache_df_updated;

In [76]:
# Attach the complete protein sequence to every peptide row.
# validate="many_to_one" makes pandas raise a MergeError if the cache contains a
# repeated "Protein ID"; without it such a duplicate would silently duplicate
# peptide rows in the result.
peptides_cs = peptides.merge(
    sequence_cache_df_updated,
    how="left",
    on="Protein ID",
    validate="many_to_one",
)
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Protein ID,Entry Name,Protein Description,pvalue,avg ratio,neglogpvalue,Site,Label,Color,Complete Sequence
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,Q07065,CKAP4_HUMAN,Cytoskeleton-associated protein 4,1.634911e-05,6.486397,4.786506,M423,CKAP4_M423,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,P25786,PSA1_HUMAN,Proteasome subunit alpha type-1,6.741449e-05,6.336632,4.171247,M26,PSA1_M26,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,P08238,HS90B_HUMAN,Heat shock protein HSP 90-beta,6.241545e-10,3.806415,9.204708,M466,HS90B_M466,red,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,P15121,ALDR_HUMAN,Aldo-keto reductase family 1 member B1,5.401038e-06,3.038070,5.267523,M169,ALDR_M169,red,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,P34932,HSP74_HUMAN,Heat shock 70 kDa protein 4,3.372263e-03,2.373026,2.472079,M604,HSP74_M604,red,MSVVGIDLGFQSCYVAVARAGGIETIANEYSDRCTPACISFGPKNR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,Q9Y266,NUDC_HUMAN,Nuclear migration protein nudC,9.505145e-01,-0.003187,0.022041,M50,NUDC_M50,grey,MGGEQEEERFDGMLLAMAQQHEGGVQELVNTFFSFLRRKTDFFIGG...
620,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,P11142,HSP7C_HUMAN,Heat shock cognate 71 kDa protein,9.678040e-01,-0.043831,0.014213,M518,HSP7C_M518,grey,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...
621,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,Q16543,CDC37_HUMAN,Hsp90 co-chaperone Cdc37,9.696497e-01,-0.003698,0.013385,M112,CDC37_M112,grey,MVDYSVWDHIEVSDDEDETHPNIDTASLFRWRHQARVERMEQFQKE...
622,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,Q9Y6D9,MD1L1_HUMAN,Mitotic spindle assembly checkpoint protein MAD1,9.773242e-01,-0.008231,0.009961,M245,MD1L1_M245,grey,MEDLGENTMVLSTLRSLNNFISQRVEGGSGLDISTSAPGSLQMQYQ...


In [80]:
# Create regex pattern to identify desired modifications
# (create_modifications_pattern is defined outside this section; the printed
# result is an alternation of escaped "M[<mass>]" tokens, one per modification)
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [81]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` with every character that is not in IUPACCodes removed."""
    return ''.join(filter(IUPACCodes.__contains__, string))


# Bracketed mass tags such as "[649.3660]" contain no IUPAC letters, so they vanish here
peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [82]:
# Zero-based offset of each peptide inside its complete protein sequence
# (str.find returns -1 when the peptide does not occur in the protein)
peptides_cs["Sequence Location"] = pd.Series([
    protein.find(peptide)
    for protein, peptide in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])
])
peptides_cs;

In [83]:
# Number of residues in each cleaned peptide sequence
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [84]:
# Sanity check - slicing each protein at the stored offset and length must give back the peptide
temp = [
    protein[start:start + length]
    for protein, start, length in zip(
        peptides_cs["Complete Sequence"],
        peptides_cs["Sequence Location"],
        peptides_cs["Sequence Length"],
    )
]
peptides_cs["Peptide Sequence"].eq(temp).value_counts()

Peptide Sequence
True    624
Name: count, dtype: int64

In [85]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` with every character that is not in IUPACCodes removed."""
    return ''.join(filter(IUPACCodes.__contains__, string))


# The leading "(.*)" is greedy, so when a peptide carries several tagged
# methionines the captured prefix runs up to the last one.
left_prefix_pattern = f"(.*)({modifications_pattern})"
print(left_prefix_pattern)

# Capture group 0 is the text before the tagged methionine; clean it down to plain residues
peptides_cs["Left Prefix"] = (
    peptides_cs["Light Modified Peptide"]
    .str.extract(left_prefix_pattern)[0]
    .map(filtering)
)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [86]:
# Zero-based index of the modified methionine in the complete protein:
# peptide offset plus the number of residues that precede the methionine in the peptide
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"].add(
    peptides_cs["Left Prefix Length"]
)
peptides_cs;

In [87]:
# Sanity check - the residue at every computed methionine location must be "M"
temp = [
    protein[met_idx]
    for protein, met_idx in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
all(residue == "M" for residue in temp)

True

In [88]:
# Compute left/right analysis sequences based on threshold
#
# Left window: up to `analysis_threshold` residues immediately before the methionine.
# The slice start is clamped at 0 so that a methionine close to the N-terminus gets
# its true (shorter) prefix. Do not use A[0:B-1] for that case: it drops the residue
# adjacent to the methionine and, for B == 0, becomes A[0:-1] (almost the whole protein).
peptides_cs[f"Left {analysis_threshold}"] = [
    protein[max(met_idx - analysis_threshold, 0):met_idx]
    for protein, met_idx in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]

# Right window: up to `analysis_threshold` residues immediately after the methionine.
# Slicing past the end of the string simply yields a shorter window.
peptides_cs[f"Right {analysis_threshold}"] = [
    protein[met_idx + 1:met_idx + 1 + analysis_threshold]
    for protein, met_idx in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,red,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...,YHTSQSGDEMTSLSEYVSR,456,19,YHTSQSGDE,9,465,TNRRRLSELLRYHTSQSGDE,TSLSEYVSRMKETQKSIYYI
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,red,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...,AIGISNFNHLQVEMILNKPGLK,155,22,AIGISNFNHLQVE,13,168,VDEGLVKAIGISNFNHLQVE,ILNKPGLKYKPAVNQIECHP
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,red,MSVVGIDLGFQSCYVAVARAGGIETIANEYSDRCTPACISFGPKNR...,TSTVDLPIENQLLWQIDREMLNLYIENEGKMIMQDK,573,36,TSTVDLPIENQLLWQIDREMLNLYIENEGK,30,603,QLLWQIDREMLNLYIENEGK,IMQDKLEKERNDAKNAVEEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,grey,MGGEQEEERFDGMLLAMAQQHEGGVQELVNTFFSFLRRKTDFFIGG...,TDFFIGGEEGMAEK,39,14,TDFFIGGEEG,10,49,NTFFSFLRRKTDFFIGGEEG,AEKLITQTFSHHNQLAQKTR
620,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,grey,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,EDIERMVQEAEK,512,12,EDIER,5,517,ENKITITNDKGRLSKEDIER,VQEAEKYKAEDEKQRDKVSS
621,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,grey,MVDYSVWDHIEVSDDEDETHPNIDTASLFRWRHQARVERMEQFQKE...,SMPWNVDTLSK,110,11,S,1,111,RKEERSWEQKLEEMRKKEKS,PWNVDTLSKDGFSKSMVNTK
622,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,grey,MEDLGENTMVLSTLRSLNNFISQRVEGGSGLDISTSAPGSLQMQYQ...,LSLQEQDAAIVKNMK,231,15,LSLQEQDAAIVKN,13,244,IKDLEQKLSLQEQDAAIVKN,KSELVRLPRLERELKQLREE


In [94]:
# Drop the peptides that belong to proteins listed as invalid (according to alphafold).
# 9 peptides are removed as a result -> 4 red, 1 blue, 4 grey
invalid_IDs = ['Q14204', 'Q9Y520', 'P46013']

# Show the rows that are about to be removed, then keep everything else
display(peptides_cs.loc[peptides_cs["Protein ID"].isin(invalid_IDs)])
peptides_cs = peptides_cs.loc[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
8,KVM[649.3660]SQEIQEQLHK,KVM[655.3735]SQEIQEQLHK,,1.764489,,1.940866,2.128917,,,2.139554,...,red,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,KVMSQEIQEQLHK,3253,13,KV,2,3255,ANDKLKKMVKDQQEAEKKKV,SQEIQEQLHKQQEVIADKQM
80,SQPAFM[649.3660]QSSLSQPSVVLSGTAIHNFPTVQHQELAK,SQPAFM[655.3735]QSSLSQPSVVLSGTAIHNFPTVQHQELAK,,,,,0.426007,0.340913,1.256525,0.844556,...,red,MSEKSGQSTKAKDGKKYATLSLFNTYKGKSLETQKTTARHGLQSLG...,SQPAFMQSSLSQPSVVLSGTAIHNFPTVQHQELAK,2468,35,SQPAF,5,2473,SQAQELFSSSLQPYRSQPAF,QSSLSQPSVVLSGTAIHNFP
119,DQM[649.3660]EGSPNSSESFEHIAR,DQM[655.3735]EGSPNSSESFEHIAR,,,,0.941958,0.286451,0.200017,,0.768032,...,red,MSEKSGQSTKAKDGKKYATLSLFNTYKGKSLETQKTTARHGLQSLG...,DQMEGSPNSSESFEHIAR,773,18,DQ,2,775,DIPPIHPGMIPPKPLMRRDQ,EGSPNSSESFEHIARSARDH
167,DSAIQQQVANLQM[649.3660]K,DSAIQQQVANLQM[655.3735]K,,,,,,0.503574,0.623558,0.265243,...,red,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,DSAIQQQVANLQMK,1228,14,DSAIQQQVANLQ,12,1240,FNDIMRRKDSAIQQQVANLQ,KIVQEDRAVESRTTDLLTDW
294,IAELEM[649.3660]GLLHLQQNIEIPEISLPIHPMITNVAK,IAELEM[655.3735]GLLHLQQNIEIPEISLPIHPMITNVAK,-0.579336,,-0.491562,-0.370731,-0.55294,-0.461852,-0.522553,-0.186892,...,blue,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,IAELEMGLLHLQQNIEIPEISLPIHPMITNVAK,185,33,IAELE,5,190,ADRDGDKMAPSVEKKIAELE,GLLHLQQNIEIPEISLPIHP
379,M[649.3660]LWGSDPYPHAEPQQATTPK,M[655.3735]LWGSDPYPHAEPQQATTPK,,,,0.757064,1.757798,,0.190581,,...,grey,MSEKSGQSTKAKDGKKYATLSLFNTYKGKSLETQKTTARHGLQSLG...,MLWGSDPYPHAEPQQATTPK,804,20,,0,804,SFEHIARSARDHAISLSEPR,LWGSDPYPHAEPQQATTPKA
431,VLRPQVTAVAQQNQGEVPEPQDM[649.3660]K,VLRPQVTAVAQQNQGEVPEPQDM[655.3735]K,,,,,0.57524,1.034721,,,...,grey,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,VLRPQVTAVAQQNQGEVPEPQDMK,482,24,VLRPQVTAVAQQNQGEVPEPQD,22,504,RPQVTAVAQQNQGEVPEPQD,KVAEVLFDAADANAIEEVNL
484,AM[649.3660]HTPKPAVGEEK,AM[655.3735]HTPKPAVGEEK,,,,1.179078,,2.399439,,,...,grey,MWPTRRLVTIKRSGVDGPHFPLSLSTCLFGRGIECDIRIQLPVVSK...,AMHTPKPAVGEEK,1780,13,A,1,1781,ADTEEEFLAFRKQTPSAGKA,HTPKPAVGEEKDINTFLGTP
502,RSELEEQQM[649.3660]HLNVGLR,RSELEEQQM[655.3735]HLNVGLR,,,0.178435,-0.023271,0.302477,,,,...,grey,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,RSELEEQQMHLNVGLR,3190,16,RSELEEQQ,8,3198,FINHYANLFHEKRSELEEQQ,HLNVGLRKIKETVDQVEELR


Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,red,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...,YHTSQSGDEMTSLSEYVSR,456,19,YHTSQSGDE,9,465,TNRRRLSELLRYHTSQSGDE,TSLSEYVSRMKETQKSIYYI
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,red,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...,AIGISNFNHLQVEMILNKPGLK,155,22,AIGISNFNHLQVE,13,168,VDEGLVKAIGISNFNHLQVE,ILNKPGLKYKPAVNQIECHP
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,red,MSVVGIDLGFQSCYVAVARAGGIETIANEYSDRCTPACISFGPKNR...,TSTVDLPIENQLLWQIDREMLNLYIENEGKMIMQDK,573,36,TSTVDLPIENQLLWQIDREMLNLYIENEGK,30,603,QLLWQIDREMLNLYIENEGK,IMQDKLEKERNDAKNAVEEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,grey,MGGEQEEERFDGMLLAMAQQHEGGVQELVNTFFSFLRRKTDFFIGG...,TDFFIGGEEGMAEK,39,14,TDFFIGGEEG,10,49,NTFFSFLRRKTDFFIGGEEG,AEKLITQTFSHHNQLAQKTR
620,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,grey,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,EDIERMVQEAEK,512,12,EDIER,5,517,ENKITITNDKGRLSKEDIER,VQEAEKYKAEDEKQRDKVSS
621,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,grey,MVDYSVWDHIEVSDDEDETHPNIDTASLFRWRHQARVERMEQFQKE...,SMPWNVDTLSK,110,11,S,1,111,RKEERSWEQKLEEMRKKEKS,PWNVDTLSKDGFSKSMVNTK
622,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,grey,MEDLGENTMVLSTLRSLNNFISQRVEGGSGLDISTSAPGSLQMQYQ...,LSLQEQDAAIVKNMK,231,15,LSLQEQDAAIVKN,13,244,IKDLEQKLSLQEQDAAIVKN,KSELVRLPRLERELKQLREE


## Download Alphafold Data - ChURRO_2

In [95]:
# Collect the distinct UniProt accessions that occur in the peptide table
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['Q07065' 'P25786' 'P08238' 'P15121' 'P34932' 'P04406' 'Q96CT7' 'Q15717'
 'P47897' 'P12694' 'Q15366' 'P14625' 'P15374' 'Q9H078' 'P31948' 'P11940'
 'Q16836' 'Q12907' 'P26038' 'P13639' 'P07814' 'P67870' 'P15170' 'P10809'
 'Q16181' 'O95347' 'P07437' 'P68371' 'Q86UP2' 'P68363' 'Q13283' 'P0DP23'
 'P55196' 'Q07866' 'Q86Y82' 'P11142' 'P68104' 'P78371' 'Q15233' 'P22061'
 'P80303' 'P52272' 'P50402' 'P35579' 'P83731' 'Q9UHD8' 'Q9NR31' 'P63261'
 'P62829' 'Q9P013' 'P61978' 'O14950' 'P46777' 'O43776' 'Q9P2E9' 'Q08211'
 'P15311' 'P54819' 'P00367' 'Q6PKG0' 'P09012' 'Q9Y266' 'Q02818' 'P27816'
 'Q9UQE7' 'O43143' 'Q4VCS5' 'P18669' 'Q9UMX0' 'Q12904' 'Q9Y383' 'P09874'
 'P35241' 'Q13404' 'P23588' 'Q9H910' 'Q9H1E3' 'P46778' 'P38646' 'O60763'
 'O14562' 'P13693' 'Q9H3P7' 'Q9UHX1' 'Q16891' 'Q9Y3U8' 'Q13263' 'P78344'
 'P31942' 'O00193' 'Q13586' 'O95983' 'P62847' 'P26368' 'P23246' 'P07910'
 'P40222' 'Q02790' 'Q86U42' 'Q14126' 'Q14247' 'P29692' 'P49750' 'Q00341'
 'O43852' 'Q9BW85' 'P26639' 'O

In [96]:
# Fetch the AlphaFold cif file of every protein into cif_dir.
# Only slow on the first run: files already in cif_dir are reported as "existing".
cif_download_summary = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir,
)
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = cif_download_summary

100%|██████████| 364/364 [00:00<00:00, 37042.09it/s]

2025-02-24 11:52:05> Valid proteins: 0
2025-02-24 11:52:05> Invalid proteins: 0
2025-02-24 11:52:05> Existing proteins: 364





In [97]:
# Fetch the AlphaFold pae file of every protein into pae_dir.
# Only slow on the first run: files already in pae_dir are reported as "existing".
pae_download_summary = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir,
)
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = pae_download_summary

100%|██████████| 364/364 [00:00<00:00, 60445.27it/s]

2025-02-24 11:52:06> Valid proteins: 0
2025-02-24 11:52:06> Invalid proteins: 0
2025-02-24 11:52:06> Existing proteins: 364





## Construct Alphafold Dataframe (Calculate Accessibilities) - ChURRO_2

In [98]:
# Build the per-residue AlphaFold annotation table from the cif files in cif_dir,
# restricted to the proteins of this dataset
alphafold_annotation_ChURRO_2 = format_alphafold_data(
    protein_ids=unique_uniprotIDs,
    directory=cif_dir,
)
alphafold_annotation_ChURRO_2

100%|██████████| 1849/1849 [00:35<00:00, 52.63it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,13.872,14.366,13.031,unstructured,unstructured,0,0,0,0,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,12.453,11.875,13.105,unstructured,unstructured,0,0,0,0,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,14.369,14.189,13.491,unstructured,unstructured,0,0,0,0,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,15.056,16.402,15.048,unstructured,unstructured,0,0,0,0,1
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,13.647,12.336,13.764,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215774,Q9Y6D9,364,R,714,81.93,-73.042,-72.792,-72.235,-71.898,-32.965,...,-3.568,-3.341,-2.958,HELX_RH_AL_P,HELX,0,1,0,0,0
215775,Q9Y6D9,364,Q,715,74.30,-72.910,-72.199,-70.839,-72.029,-35.063,...,-7.239,-7.954,-5.817,HELX_RH_AL_P,HELX,0,1,0,0,0
215776,Q9Y6D9,364,T,716,69.42,-74.822,-73.426,-72.488,-72.836,-37.516,...,-6.687,-6.171,-6.540,HELX_RH_AL_P,HELX,0,1,0,0,0
215777,Q9Y6D9,364,V,717,56.75,-77.727,-76.694,-76.633,-75.341,-35.784,...,-4.741,-3.243,-5.349,unstructured,unstructured,0,0,0,0,1


In [99]:
# Full sphere exposure (max_angle = 180) with radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=2,
)
exposure_sphere_rad2;

100%|██████████| 364/364 [00:03<00:00, 96.58it/s] 


In [100]:
# Start the accessibility table: annotation table left-joined with the radius-2 exposure,
# matched per residue on protein_id / AA / position
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_annotation_ChURRO_2,
    exposure_sphere_rad2,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [101]:
# Full sphere exposure (max_angle = 180) with radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=3,
)
exposure_sphere_rad3;

100%|██████████| 364/364 [00:03<00:00, 103.36it/s]


In [102]:
# Left-join the radius-3 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad3,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [103]:
# Full sphere exposure (max_angle = 180) with radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=4,
)
exposure_sphere_rad4;

100%|██████████| 364/364 [00:03<00:00, 105.33it/s]


In [104]:
# Left-join the radius-4 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad4,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [105]:
# Full sphere exposure (max_angle = 180) with radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=4.5,
)
exposure_sphere_rad4_5;

100%|██████████| 364/364 [00:03<00:00, 106.62it/s]


In [106]:
# Left-join the radius-4.5 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad4_5,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [107]:
# Full sphere exposure (max_angle = 180) with radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=5,
)
exposure_sphere_rad5;

100%|██████████| 364/364 [00:03<00:00, 98.25it/s] 


In [108]:
# Left-join the radius-5 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad5,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [109]:
# Full sphere exposure (max_angle = 180) with radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=5.5,
)
exposure_sphere_rad5_5;

100%|██████████| 364/364 [00:03<00:00, 104.54it/s]


In [110]:
# Left-join the radius-5.5 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad5_5,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [111]:
# Full sphere exposure (max_angle = 180) with radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=6,
)
exposure_sphere_rad6;

100%|██████████| 364/364 [00:03<00:00, 102.03it/s]


In [112]:
# Left-join the radius-6 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad6,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [113]:
# Full sphere exposure (max_angle = 180) with radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=6.5,
)
exposure_sphere_rad6_5;

100%|██████████| 364/364 [00:03<00:00, 102.20it/s]


In [114]:
# Left-join the radius-6.5 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad6_5,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [115]:
# Full sphere exposure (max_angle = 180) with radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=7,
)
exposure_sphere_rad7;

100%|██████████| 364/364 [00:03<00:00, 97.77it/s] 


In [116]:
# Left-join the radius-7 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad7,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [117]:
# Full sphere exposure (max_angle = 180) with radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=7.5,
)
exposure_sphere_rad7_5;

100%|██████████| 364/364 [00:03<00:00, 99.66it/s] 


In [118]:
# Left-join the radius-7.5 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad7_5,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [119]:
# Full sphere exposure (max_angle = 180) with radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=8,
)
exposure_sphere_rad8;

100%|██████████| 364/364 [00:03<00:00, 97.71it/s] 


In [120]:
# Left-join the radius-8 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad8,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [121]:
# Full sphere exposure (max_angle = 180) with radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=12,
)
exposure_sphere_rad12;

100%|██████████| 364/364 [00:04<00:00, 90.84it/s] 


In [122]:
# Left-join the radius-12 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad12,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [123]:
# Full sphere exposure (max_angle = 180) with radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=18,
)
exposure_sphere_rad18;

100%|██████████| 364/364 [00:04<00:00, 75.06it/s] 


In [124]:
# Left-join the radius-18 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad18,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [125]:
# Full sphere exposure (max_angle = 180) with radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=180,
    max_dist=24,
)
exposure_sphere_rad24;

100%|██████████| 364/364 [00:06<00:00, 58.03it/s]


In [126]:
# Left-join the radius-24 exposure onto the running accessibility table
# (matched per residue on protein_id / AA / position)
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_sphere_rad24,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2;

In [127]:
# Part sphere exposure: angle restricted to 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_2,
    error_dir=pae_dir,
    max_angle=70,
    max_dist=12,
)
exposure_ang70_rad12;

100%|██████████| 364/364 [00:04<00:00, 81.79it/s] 


In [128]:
# Left-join the part-sphere (angle 70, radius 12) exposure onto the running accessibility
# table (matched per residue on protein_id / AA / position) and display the complete table
alphafold_accessibility_ChURRO_2 = pd.merge(
    alphafold_accessibility_ChURRO_2,
    exposure_ang70_rad12,
    how="left",
    on=["protein_id", "AA", "position"],
)
alphafold_accessibility_ChURRO_2

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1,1,1,1,1,1,2,3,5,0
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,1,2,2,2,2,2,3,4,6,0
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,1,2,2,2,2,2,4,5,7,0
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,1,2,2,2,2,2,4,6,7,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,1,2,2,2,2,2,4,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215774,Q9Y6D9,364,R,714,81.93,-73.042,-72.792,-72.235,-71.898,-32.965,...,2,2,3,4,5,5,11,23,51,2
215775,Q9Y6D9,364,Q,715,74.30,-72.910,-72.199,-70.839,-72.029,-35.063,...,2,2,2,2,3,4,9,18,43,2
215776,Q9Y6D9,364,T,716,69.42,-74.822,-73.426,-72.488,-72.836,-37.516,...,2,2,2,2,2,3,7,13,22,2
215777,Q9Y6D9,364,V,717,56.75,-77.727,-76.694,-76.633,-75.341,-35.784,...,1,2,2,2,2,2,5,8,15,2


In [129]:
# Smooth every exposure column with a window setting of 10; the smoothed columns are
# added with a "_smooth10" suffix (e.g. nAA_24_180_pae_smooth10)
exposure_columns_to_smooth = np.array([
    'nAA_2_180_pae',
    'nAA_3_180_pae',
    'nAA_4_180_pae',
    'nAA_4.5_180_pae',
    'nAA_5_180_pae',
    'nAA_5.5_180_pae',
    'nAA_6_180_pae',
    'nAA_6.5_180_pae',
    'nAA_7_180_pae',
    'nAA_7.5_180_pae',
    'nAA_8_180_pae',
    'nAA_12_180_pae',
    'nAA_18_180_pae',
    'nAA_24_180_pae',
    'nAA_12_70_pae',
])
alphafold_accessibility_ChURRO_2_smooth = get_smooth_score(
    alphafold_accessibility_ChURRO_2,
    exposure_columns_to_smooth,
    [10],
)
alphafold_accessibility_ChURRO_2_smooth;

100%|██████████| 364/364 [00:00<00:00, 552.91it/s]


In [130]:
# Flag a residue as IDR (1) when its smoothed radius-24 full-sphere exposure is <= 34.27, else 0.
# NOTE(review): 34.27 is a magic number — presumably a published structuremap cutoff;
# confirm its source and consider moving it to the parameter cell.
smoothed_exposure_rad24 = alphafold_accessibility_ChURRO_2_smooth['nAA_24_180_pae_smooth10']
alphafold_accessibility_ChURRO_2_smooth['IDR'] = np.where(smoothed_exposure_rad24 <= 34.27, 1, 0)
alphafold_accessibility_ChURRO_2_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1.909091,1.909091,1.909091,1.909091,1.909091,3.818182,6.636364,12.181818,0.000000,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,2.083333,2.166667,2.166667,2.250000,2.250000,4.750000,9.916667,20.500000,0.416667,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,2.153846,2.230769,2.307692,2.615385,2.692308,5.538462,13.076923,28.923077,0.615385,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,2.214286,2.285714,2.571429,3.000000,3.071429,6.642857,16.857143,38.000000,0.714286,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,2.400000,2.533333,2.866667,3.466667,3.600000,8.133333,20.533333,47.666667,1.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713,Q9Y6D9,364,R,714,81.93,-73.042,-72.792,-72.235,-71.898,-32.965,...,2.066667,4.000000,4.733333,5.933333,6.200000,12.733333,34.733333,56.066667,3.066667,0
714,Q9Y6D9,364,Q,715,74.30,-72.910,-72.199,-70.839,-72.029,-35.063,...,2.071429,4.000000,4.642857,5.785714,6.071429,12.142857,32.714286,54.142857,2.928571,0
715,Q9Y6D9,364,T,716,69.42,-74.822,-73.426,-72.488,-72.836,-37.516,...,2.076923,3.923077,4.538462,5.615385,5.923077,12.000000,31.461538,52.230769,3.000000,0
716,Q9Y6D9,364,V,717,56.75,-77.727,-76.694,-76.633,-75.341,-35.784,...,2.083333,3.833333,4.416667,5.416667,5.750000,11.666667,30.166667,50.500000,2.833333,0


## Merge Dataframes into Full Dataset (Includes Alphafold) - ChURRO_2

In [131]:
# Join the AlphaFold per-residue features onto each peptide's modified methionine.
# "Methionine Location" in peptides_cs is zero-indexed while the AlphaFold "position" is
# one-indexed, so the AlphaFold positions are shifted down by one to match.
# The shift is done on a derived frame instead of in place: mutating
# alphafold_accessibility_ChURRO_2_smooth here made the cell non-idempotent (re-running it
# shifted the positions a second time and silently misaligned the merge).
# NOTE: alphafold_accessibility_ChURRO_2_smooth itself therefore keeps one-indexed positions.
alphafold_zero_indexed = alphafold_accessibility_ChURRO_2_smooth.assign(
    position=lambda frame: frame["position"] - 1  # zero-index the positions to match initial dataframe
)

peptides_wa = peptides_cs.merge(
    alphafold_zero_indexed,
    how="left",
    left_on=["Protein ID", "Methionine Location"],
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,2.428571,3.952381,4.476190,5.523810,6.761905,18.714286,54.571429,106.047619,5.333333,0
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,2.523810,3.761905,4.666667,6.380952,7.238095,22.333333,70.333333,127.857143,6.619048,0
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,2.571429,5.380952,6.142857,8.238095,8.523810,19.190476,50.190476,111.476190,5.380952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,2.000000,3.285714,4.571429,5.571429,6.000000,11.476190,23.333333,37.857143,2.476190,0
611,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,2.666667,5.000000,5.285714,6.761905,6.809524,12.857143,36.238095,81.238095,2.857143,0
612,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,2.333333,3.476190,3.523810,4.380952,4.857143,9.238095,22.095238,40.380952,1.619048,0
613,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,1.809524,1.809524,2.142857,2.666667,3.000000,8.047619,14.857143,20.857143,1.142857,1


In [134]:
# NOTE: one peptide's sequence differs between UniProt & AlphaFold, so the residue
# AlphaFold reports at the expected methionine position is not "M".
# Rows whose AlphaFold residue is not "M" (including rows with no AlphaFold match, where
# "AA" is missing) are displayed with all columns and then dropped.
aa_is_methionine = peptides_wa["AA"] == "M"

# option_context restores the previous display setting even if display() raises, and does
# not reset a user-chosen value back to the pandas default
with pd.option_context("display.max_columns", None):
    display(peptides_wa[~aa_is_methionine])
peptides_wa = peptides_wa[aa_is_methionine]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Protein Description,pvalue,avg ratio,neglogpvalue,Site,Label,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
381,EHM[649.3660]GNVVEALIALTN,EHM[655.3735]GNVVEALIALTN,0.269065,0.034308,0.023707,0.212029,0.74948,0.535874,-0.350295,0.805685,,sp|Q9NX55|HYPK_HUMAN,Q9NX55,HYPK_HUMAN,Huntingtin-interacting protein K,0.080406,0.284982,1.094711,M109,HYPK_M109,grey,MATEGDVELELETETSGPERPPEKPRKHDSGAADLERVTDYAEEKE...,EHMGNVVEALIALTN,106,15,EH,2,108,LIMTEMEISRAAAERSLREH,GNVVEALIALTN,Q9NX55,329,A,108,92.77,3.291,2.081,2.371,0.883,1.961,2.414,3.753,2.57,-7.576,-8.404,-9.096,-7.572,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,2,2,2,5,6,9,12,24,39,44,12,0.0,0.0,0.0,0.0,2.0,2.0,2.095238,3.52381,4.714286,5.904762,6.619048,15.0,33.238095,43.809524,3.47619,0


In [136]:
# Sanity check - the residue number in "Site" (e.g. "M109") must equal the zero-indexed
# "Methionine Location" + 1; any row displayed below is a mismatch

#pd.set_option("display.max_columns", None)
site_residue_number = peptides_wa["Site"].str.strip("M").astype(int)
expected_residue_number = peptides_wa["Methionine Location"] + 1
display(peptides_wa[site_residue_number != expected_residue_number])
#pd.reset_option("display.max_columns")

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR


In [137]:
# Persist the merged table (row index included) next to the notebook
alphafold_csv_path = os.path.join(curr_dir_path, "ChURRO_2_with_alphafold.csv")
peptides_wa.to_csv(alphafold_csv_path)

In [138]:
# Reload the saved table; the first CSV column ("Unnamed: 0") holds the saved row index,
# so it is restored as the (unnamed) index
path = os.path.join(curr_dir_path, "ChURRO_2_with_alphafold.csv")
peptides_wa = pd.read_csv(path).set_index("Unnamed: 0")
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,2.428571,3.952381,4.476190,5.523810,6.761905,18.714286,54.571429,106.047619,5.333333,0
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,2.523810,3.761905,4.666667,6.380952,7.238095,22.333333,70.333333,127.857143,6.619048,0
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,2.571429,5.380952,6.142857,8.238095,8.523810,19.190476,50.190476,111.476190,5.380952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,2.000000,3.285714,4.571429,5.571429,6.000000,11.476190,23.333333,37.857143,2.476190,0
611,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,2.666667,5.000000,5.285714,6.761905,6.809524,12.857143,36.238095,81.238095,2.857143,0
612,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,2.333333,3.476190,3.523810,4.380952,4.857143,9.238095,22.095238,40.380952,1.619048,0
613,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,1.809524,1.809524,2.142857,2.666667,3.000000,8.047619,14.857143,20.857143,1.142857,1


## Set Parameters of Analysis - ChURRO_3

In [201]:
# Analysis parameters for the ChURRO_3 dataset

analysis_threshold = 20  # number of amino acids either side to analyze

# Mass tags as they appear in square brackets in the modified-peptide strings.
# NOTE(review): in the ChURRO_3 table the "Light Modified Peptide" column carries
# [661.3660] and the "Heavy Modified Peptide" column carries [667.3735], i.e. the opposite
# of the heavy_/light_ names below — confirm which is intended before relying on the names.
heavy_modification = "661.3660"
light_modification = "667.3735"

# which modifications we are looking for, as regex strings
# NOTE(review): the "." is unescaped, so as a regex it matches any character — confirm this is acceptable.
modifications = [heavy_modification, light_modification]

## Load Dataset - ChURRO_3

In [202]:
# Read the raw ChURRO_3 isoDTB peptide table stored next to this notebook
data_loc = os.path.join(curr_dir_path, "ChURRO_3_isoDTB.csv")
peptides = pd.read_csv(filepath_or_buffer=data_loc)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pval,average,Site,Label
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,,sp|P36871|PGM1_HUMAN,P36871,PGM1_HUMAN,PGM1,Phosphoglucomutase-1,0.043804,4.422685,M295,PGM1_M295
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,2.022412,sp|Q86X55|CARM1_HUMAN,Q86X55,CARM1_HUMAN,CARM1,Histone-arginine methyltransferase CARM1,0.029081,1.934006,M268,CARM1_M268
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,,sp|Q02543|RL18A_HUMAN,Q02543,RL18A_HUMAN,RPL18A,Large ribosomal subunit protein eL20,0.035102,1.606651,M127,RL18A_M127
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,,sp|P06576|ATPB_HUMAN,P06576,ATPB_HUMAN,ATP5F1B,"ATP synthase subunit beta, mitochondrial",0.002784,1.571315,M509,ATPB_M509
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,1.359018,sp|P18669|PGAM1_HUMAN,P18669,PGAM1_HUMAN,PGAM1,Phosphoglycerate mutase 1,0.000004,1.512874,M230,PGAM1_M230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,-0.281744,sp|Q12931|TRAP1_HUMAN,Q12931,TRAP1_HUMAN,TRAP1,"Heat shock protein 75 kDa, mitochondrial",0.995261,-0.001567,M141,TRAP1_M141
1200,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,,sp|P20020|AT2B1_HUMAN,P20020,AT2B1_HUMAN,ATP2B1,Plasma membrane calcium-transporting ATPase 1,0.995294,0.002071,M1145,AT2B1_M1145
1201,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,,sp|Q9UKD2|MRT4_HUMAN,Q9UKD2,MRT4_HUMAN,MRTO4,mRNA turnover protein 4 homolog,0.996516,-0.002246,M149,MRT4_M149
1202,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,,sp|P17858|PFKAL_HUMAN,P17858,PFKAL_HUMAN,PFKL,"ATP-dependent 6-phosphofructokinase, liver type",0.997051,0.000018,M30,PFKAL_M30


In [203]:
# Canonicalize data - none to do here
# (placeholder cell: the table is used as loaded; the trailing ';' suppresses the cell output)
peptides;

In [204]:
# Manual labeling of peptides.
# Colours are assigned purely by row order: the first 57 rows are "red", the next 125
# "blue" and the following 1022 "grey".
# NOTE(review): presumably the input table is pre-sorted so these runs line up with the
# intended groups — confirm against how ChURRO_3_isoDTB.csv was produced.
label_run_lengths = [("red", 57), ("blue", 125), ("grey", 1022)]
label_col_data = [color for color, run_length in label_run_lengths for _ in range(run_length)]
label_col = pd.Series(label_col_data)

# Assigning a Series aligns on the index, so a count mismatch is otherwise silent (surplus
# labels are dropped, unlabeled rows become NaN). The run lengths sum to 1204 while the
# table displayed at load time ends at index 1202, so report any mismatch explicitly.
if len(label_col) != len(peptides):
    print(
        f"WARNING: {len(label_col)} manual colour labels for {len(peptides)} peptide rows; "
        "check the hard-coded group sizes."
    )
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [205]:
# Collect the distinct UniProt accessions that occur in the ChURRO_3 peptide table
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['P36871' 'Q86X55' 'Q02543' 'P06576' 'P18669' 'P14625' 'Q86UP2' 'Q07065'
 'P11142' 'Q969G3' 'Q9NPH2' 'P35241' 'P10809' 'P07814' 'O75390' 'P67870'
 'Q5TDH0' 'Q9UBF2' 'P47897' 'Q9HAV7' 'Q7Z4V5' 'P08238' 'Q16836' 'Q9Y4L1'
 'P46777' 'Q16204' 'P35580' 'P82650' 'P46940' 'P0DP23' 'Q10567' 'Q15691'
 'Q9H3P7' 'Q7Z417' 'Q14444' 'Q9H444' 'Q9UNZ2' 'P26368' 'Q04637' 'O60610'
 'P22061' 'Q9Y266' 'P09874' 'O15371' 'P09234' 'P35520' 'P13639' 'P52272'
 'P35579' 'P11940' 'P62857' 'Q12906' 'O60885' 'Q15019' 'Q9UK76' 'O43242'
 'P25205' 'Q13263' 'P46937' 'P06748' 'Q9NYP7' 'P07437' 'Q13435' 'P17987'
 'Q9H8Y8' 'P31948' 'P63261' 'P61254' 'P04406' 'P61978' 'P68363' 'P34932'
 'P23193' 'Q6P2E9' 'O14980' 'Q00341' 'O43660' 'Q14008' 'P50990' 'P23246'
 'O75533' 'P00367' 'Q86U42' 'P16949' 'Q9HB71' 'Q9UQE7' 'Q15365' 'P68371'
 'P62258' 'Q8WWY3' 'Q8WUH6' 'P78527' 'Q15366' 'P60842' 'P19022' 'Q13526'
 'Q6PKG0' 'Q9Y5S9' 'Q14974' 'P14678' 'P40222' 'Q13126' 'P29401' 'O60762'
 'Q15181' 'P62316' 'P48739' 'Q

In [206]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE
# This cell is intentionally commented out. Uncomment and run it once only when
# unique_uniprotIDs contains proteins missing from complete_sequence_cache.csv: it looks up
# the missing sequences and rewrites the cache file. Then comment it out again.

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [207]:
# Reload the cached table that maps each UniProt ID to its complete amino-acid sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0")  # first CSV column becomes the index
sequence_cache_df_updated.index.name = None  # drop the "Unnamed: 0" label from the index
sequence_cache_df_updated;

In [208]:
# Left join: every peptide row is kept and gains the columns of the sequence cache
# (matched on "Protein ID"); peptides without a cached sequence get NaN there.
peptides_cs = pd.merge(
    peptides,
    sequence_cache_df_updated,
    how="left",
    on="Protein ID",
)
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,pval,average,Site,Label,Color,Complete Sequence
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,P36871,PGM1_HUMAN,PGM1,Phosphoglucomutase-1,0.043804,4.422685,M295,PGM1_M295,red,MVKIVTVKTQAYQDQKPGTSGLRKRVKVFQSSANYAENFIQSIIST...
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,Q86X55,CARM1_HUMAN,CARM1,Histone-arginine methyltransferase CARM1,0.029081,1.934006,M268,CARM1_M268,red,MAAAAAAVGPGAGGAGSAVPGGAGPCATVSVFPGARLLTIGDANGE...
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,Q02543,RL18A_HUMAN,RPL18A,Large ribosomal subunit protein eL20,0.035102,1.606651,M127,RL18A_M127,red,MKASGTLREYKVVGRCLPTPKCHTPPLYRMRIFAPNHVVAKSRFWY...
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,P06576,ATPB_HUMAN,ATP5F1B,"ATP synthase subunit beta, mitochondrial",0.002784,1.571315,M509,ATPB_M509,red,MLGFVGRVAAAPASGALRRLTPSASLPPAQLLLRAAPTAVHPVRDY...
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,P18669,PGAM1_HUMAN,PGAM1,Phosphoglycerate mutase 1,0.000004,1.512874,M230,PGAM1_M230,red,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,Q12931,TRAP1_HUMAN,TRAP1,"Heat shock protein 75 kDa, mitochondrial",0.995261,-0.001567,M141,TRAP1_M141,grey,MARELRALLLWGRRLRPLLRAPALAAVPGGKPILCPRRTTAQLGPR...
1200,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,P20020,AT2B1_HUMAN,ATP2B1,Plasma membrane calcium-transporting ATPase 1,0.995294,0.002071,M1145,AT2B1_M1145,grey,MGDMANNSVAYSGVKNSLKEANHDGDFGITLAELRALMELRSTDAL...
1201,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,Q9UKD2,MRT4_HUMAN,MRTO4,mRNA turnover protein 4 homolog,0.996516,-0.002246,M149,MRT4_M149,grey,MPKSKRDKKVSLTKTAKKGLELKQNLIEELRKCVDTYKYLFIFSVA...
1202,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,P17858,PFKAL_HUMAN,PFKL,"ATP-dependent 6-phosphofructokinase, liver type",0.997051,0.000018,M30,PFKAL_M30,grey,MAAVDLEKLRASGAGKAIGVLTSGGDAQGMNAAVRAVTRMGIYVGA...


In [209]:
# Create regex pattern to identify desired modifications
# `create_modifications_pattern` (defined in another cell) turns the configured `modifications`
# list into a single pattern string; it is printed so the generated regex can be inspected.
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[661\.3660\]|M\[667\.3735\]


In [210]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` with every character that is not in `IUPACCodes` removed."""
    return ''.join(residue for residue in string if residue in IUPACCodes)


# Strips the bracketed mass annotations (digits, '.', '[' and ']') from the light peptide
peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [211]:
# 0-based start offset of each cleaned peptide inside its protein's complete sequence.
# str.find returns -1 when the peptide does not occur; the sanity-check cell that follows
# compares the re-sliced sequence against the peptide and would expose such rows.
# The values are assigned as a plain list (positional) instead of a freshly built pd.Series:
# a Series is aligned on index labels, so rows whose label is not in 0..n-1 would receive NaN
# if peptides_cs ever carries a non-default index (e.g. after row filtering).
peptides_cs["Sequence Location"] = [
    complete_seq.find(peptide_seq)
    for complete_seq, peptide_seq in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])
]
peptides_cs;

In [212]:
# Number of residues in each cleaned peptide sequence (character count of "Peptide Sequence")
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [213]:
# Sanity check - slicing each complete sequence at [location, location + length)
# must reproduce the cleaned peptide; the value counts should show only True
temp = [
    complete_seq[start:start + length]
    for complete_seq, start, length in zip(
        peptides_cs["Complete Sequence"],
        peptides_cs["Sequence Location"],
        peptides_cs["Sequence Length"],
    )
]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    1204
Name: count, dtype: int64

In [214]:
# Extract the residues that precede the modified methionine in each peptide (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` with every character that is not in `IUPACCodes` removed."""
    return ''.join(residue for residue in string if residue in IUPACCodes)


# Group 0 is everything before the modification match; `.*` is greedy, so for a peptide
# containing several matches the prefix extends up to the LAST one.
left_prefix_pattern = f"(.*)({modifications_pattern})"
print(left_prefix_pattern)

# Take capture group 0, then strip any annotation characters left inside the prefix
peptides_cs["Left Prefix"] = (
    peptides_cs["Light Modified Peptide"]
    .str.extract(left_prefix_pattern)[0]
    .map(filtering)
)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[661\.3660\]|M\[667\.3735\])


In [215]:
# 0-based index of the modified methionine within the complete protein sequence:
# start offset of the peptide in the protein + number of residues before the modified M in the peptide
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [216]:
# Sanity check - the residue found at every computed methionine location must be "M"
temp = [
    complete_seq[met_index]
    for complete_seq, met_index in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
all(residue == "M" for residue in temp)

True

In [217]:
# Compute left/right analysis sequences based on threshold
# Left flank: up to `analysis_threshold` residues immediately before the methionine.
# The start is clamped at 0 so a methionine close to the N-terminus yields the (shorter) real
# prefix A[0:B]. The previous fallback A[0:B-1] dropped the residue directly preceding the
# methionine and, for B == 0, evaluated A[0:-1], i.e. almost the entire protein.
peptides_cs[f"Left {analysis_threshold}"] = [
    A[max(B - analysis_threshold, 0):B]
    for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
# Right flank: up to `analysis_threshold` residues immediately after the methionine;
# a slice end past the C-terminus is truncated by Python automatically.
peptides_cs[f"Right {analysis_threshold}"] = [
    A[B + 1:B + 1 + analysis_threshold]
    for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,red,MVKIVTVKTQAYQDQKPGTSGLRKRVKVFQSSANYAENFIQSIIST...,SGEHDFGAAFDGDGDRNMILGK,277,22,SGEHDFGAAFDGDGDRN,17,294,TMKSGEHDFGAAFDGDGDRN,ILGKHGFFVNPSDSVAVIAA
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,red,MAAAAAAVGPGAGGAGSAVPGGAGPCATVSVFPGARLLTIGDANGE...,MLESYLHAK,267,9,,0,267,PEQVDIIISEPMGYMLFNER,LESYLHAKKYLKPSGNMFPT
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,red,MKASGTLREYKVVGRCLPTPKCHTPPLYRMRIFAPNHVVAKSRFWY...,AHSIQIMK,120,8,AHSIQI,6,126,TQCYRDMGARHRARAHSIQI,KVEEIAASKCRRPAVKQFHD
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,red,MLGFVGRVAAAPASGALRRLTPSASLPPAQLLLRAAPTAVHPVRDY...,GFQQILAGEYDHLPEQAFYMVGPIEEAVAK,489,30,GFQQILAGEYDHLPEQAFY,19,508,KGFQQILAGEYDHLPEQAFY,VGPIEEAVAKADKLAEEHSS
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,red,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,NLKPIKPMQFLGDEETVRK,222,19,NLKPIKP,7,229,LPTGIPIVYELDKNLKPIKP,QFLGDEETVRKAMEAVAAQG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,grey,MARELRALLLWGRRLRPLLRAPALAAVPGGKPILCPRRTTAQLGPR...,LVSDGQALPEMEIHLQTNAEK,130,21,LVSDGQALPE,10,140,SDALEKLRHKLVSDGQALPE,EIHLQTNAEKGTITIQDTGI
1200,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,grey,MGDMANNSVAYSGVKNSLKEANHDGDFGITLAELRALMELRSTDAL...,SSIHNFMTHPEFR,1138,13,SSIHNF,6,1144,RSSLYEGLEKPESRSSIHNF,THPEFRIEDSEPHIPLIDDT
1201,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,grey,MPKSKRDKKVSLTKTAKKGLELKQNLIEELRKCVDTYKYLFIFSVA...,AAFTVSLDPGPLEQFPHSMEPQLR,130,24,AAFTVSLDPGPLEQFPHS,18,148,NKAAFTVSLDPGPLEQFPHS,EPQLRQLGLPTALKRGVVTL
1202,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,grey,MAAVDLEKLRASGAGKAIGVLTSGGDAQGMNAAVRAVTRMGIYVGA...,AIGVLTSGGDAQGMNAAVR,16,19,AIGVLTSGGDAQG,13,29,RASGAGKAIGVLTSGGDAQG,NAAVRAVTRMGIYVGAKVFL


In [218]:
# Remove peptides whose protein is invalid according to AlphaFold (hard-coded ID list)
# 24 peptides are removed as a result -> 0 red, 1 blue, 23 grey

invalid_IDs = ['P78527', 'Q14204', 'P46013', 'Q9Y520', 'P49792']
invalid_protein_mask = peptides_cs["Protein ID"].isin(invalid_IDs)
display(peptides_cs[invalid_protein_mask])  # show what is about to be dropped
peptides_cs = peptides_cs[~invalid_protein_mask]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
100,LSDFNDITNM[661.3660]LLLK,LSDFNDITNM[667.3735]LLLK,,-0.269239,-0.607489,,,,,-0.642971,...,blue,MAGSGAGVRCSLLRLQETLSAADRCGAALAGHQLIRGLGQECVLSS...,LSDFNDITNMLLLK,3655,14,LSDFNDITN,9,3664,GKGGSKLLRMKLSDFNDITN,LLLKMNKDSKPPGNLKECSP
239,STVLTPM[661.3660]FVETQASQGTLQTR,STVLTPM[667.3735]FVETQASQGTLQTR,,-0.392601,-0.427067,,,,,-0.164332,...,grey,MAGSGAGVRCSLLRLQETLSAADRCGAALAGHQLIRGLGQECVLSS...,STVLTPMFVETQASQGTLQTR,2598,21,STVLTP,6,2604,EFQEYTIDSDWRFRSTVLTP,FVETQASQGTLQTRTQEGSL
254,VLRPQVTAVAQQNQGEVPEPQDM[661.3660]K,VLRPQVTAVAQQNQGEVPEPQDM[667.3735]K,,-0.264152,-0.540461,,,,,-0.304486,...,grey,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,VLRPQVTAVAQQNQGEVPEPQDMK,482,24,VLRPQVTAVAQQNQGEVPEPQD,22,504,RPQVTAVAQQNQGEVPEPQD,KVAEVLFDAADANAIEEVNL
300,AM[661.3660]HTPKPAVGEEK,AM[667.3735]HTPKPAVGEEK,,-0.368253,-0.410126,,,,,-0.051744,...,grey,MWPTRRLVTIKRSGVDGPHFPLSLSTCLFGRGIECDIRIQLPVVSK...,AMHTPKPAVGEEK,1780,13,A,1,1781,ADTEEEFLAFRKQTPSAGKA,HTPKPAVGEEKDINTFLGTP
309,VWEQIDQM[661.3660]KEQPWVSVQPR,VWEQIDQM[667.3735]KEQPWVSVQPR,,,0.014569,,,,,0.02137,...,grey,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,VWEQIDQMKEQPWVSVQPR,1338,19,VWEQIDQ,7,1345,LQDLKGVWSELSKVWEQIDQ,KEQPWVSVQPRKLRQNLDAL
450,FGQMLGSNM[661.3660]TEFHSQISK,FGQMLGSNM[667.3735]TEFHSQISK,,-0.114483,-0.5137,,,,,-0.094437,...,grey,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,FGQMLGSNMTEFHSQISK,1130,18,FGQMLGSN,8,1138,KYDSWHKEVLSKFGQMLGSN,TEFHSQISKSRQELEQHSVD
451,M[661.3660]STSPEAFLALR,M[667.3735]STSPEAFLALR,,,-0.333719,,,,,-0.060206,...,grey,MAGSGAGVRCSLLRLQETLSAADRCGAALAGHQLIRGLGQECVLSS...,MSTSPEAFLALR,3889,12,,0,3889,SFRKRESKVPADLLKRAFVR,STSPEAFLALRSHFASSHAL
466,RM[661.3660]PPPANLPSLK,RM[667.3735]PPPANLPSLK,,,0.282823,,,,,0.125852,...,grey,MSEKSGQSTKAKDGKKYATLSLFNTYKGKSLETQKTTARHGLQSLG...,RMPPPANLPSLK,52,12,R,1,53,QKTTARHGLQSLGKVGISRR,PPPANLPSLKAENKGNDPNV
507,YYIQNGIQSFM[661.3660]QNYSSIDVLLHQSR,YYIQNGIQSFM[667.3735]QNYSSIDVLLHQSR,,-0.172996,,,,,,-0.235103,...,grey,MAGSGAGVRCSLLRLQETLSAADRCGAALAGHQLIRGLGQECVLSS...,YYIQNGIQSFMQNYSSIDVLLHQSR,3100,25,YYIQNGIQSF,10,3110,LLQDDVDRAKYYIQNGIQSF,QNYSSIDVLLHQSRLTKLQS
536,QTDVLQQLSIQM[661.3660]ANAK,QTDVLQQLSIQM[667.3735]ANAK,,-0.732577,,,,,,-2.17776,...,grey,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,QTDVLQQLSIQMANAK,1849,16,QTDVLQQLSIQ,11,1860,QMRFYFDPKQTDVLQQLSIQ,ANAKFNYGFEYLGVQDKLVQ


Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,red,MVKIVTVKTQAYQDQKPGTSGLRKRVKVFQSSANYAENFIQSIIST...,SGEHDFGAAFDGDGDRNMILGK,277,22,SGEHDFGAAFDGDGDRN,17,294,TMKSGEHDFGAAFDGDGDRN,ILGKHGFFVNPSDSVAVIAA
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,red,MAAAAAAVGPGAGGAGSAVPGGAGPCATVSVFPGARLLTIGDANGE...,MLESYLHAK,267,9,,0,267,PEQVDIIISEPMGYMLFNER,LESYLHAKKYLKPSGNMFPT
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,red,MKASGTLREYKVVGRCLPTPKCHTPPLYRMRIFAPNHVVAKSRFWY...,AHSIQIMK,120,8,AHSIQI,6,126,TQCYRDMGARHRARAHSIQI,KVEEIAASKCRRPAVKQFHD
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,red,MLGFVGRVAAAPASGALRRLTPSASLPPAQLLLRAAPTAVHPVRDY...,GFQQILAGEYDHLPEQAFYMVGPIEEAVAK,489,30,GFQQILAGEYDHLPEQAFY,19,508,KGFQQILAGEYDHLPEQAFY,VGPIEEAVAKADKLAEEHSS
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,red,MAAYKLVLIRHGESAWNLENRFSGWYDADLSPAGHEEAKRGGQALR...,NLKPIKPMQFLGDEETVRK,222,19,NLKPIKP,7,229,LPTGIPIVYELDKNLKPIKP,QFLGDEETVRKAMEAVAAQG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,grey,MARELRALLLWGRRLRPLLRAPALAAVPGGKPILCPRRTTAQLGPR...,LVSDGQALPEMEIHLQTNAEK,130,21,LVSDGQALPE,10,140,SDALEKLRHKLVSDGQALPE,EIHLQTNAEKGTITIQDTGI
1200,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,grey,MGDMANNSVAYSGVKNSLKEANHDGDFGITLAELRALMELRSTDAL...,SSIHNFMTHPEFR,1138,13,SSIHNF,6,1144,RSSLYEGLEKPESRSSIHNF,THPEFRIEDSEPHIPLIDDT
1201,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,grey,MPKSKRDKKVSLTKTAKKGLELKQNLIEELRKCVDTYKYLFIFSVA...,AAFTVSLDPGPLEQFPHSMEPQLR,130,24,AAFTVSLDPGPLEQFPHS,18,148,NKAAFTVSLDPGPLEQFPHS,EPQLRQLGLPTALKRGVVTL
1202,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,grey,MAAVDLEKLRASGAGKAIGVLTSGGDAQGMNAAVRAVTRMGIYVGA...,AIGVLTSGGDAQGMNAAVR,16,19,AIGVLTSGGDAQG,13,29,RASGAGKAIGVLTSGGDAQG,NAAVRAVTRMGIYVGAKVFL


## Download Alphafold Data - ChURRO_3

In [219]:
# Collect the distinct UniProt IDs still present in peptides_cs; these drive the AlphaFold steps below
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['P36871' 'Q86X55' 'Q02543' 'P06576' 'P18669' 'P14625' 'Q86UP2' 'Q07065'
 'P11142' 'Q969G3' 'Q9NPH2' 'P35241' 'P10809' 'P07814' 'O75390' 'P67870'
 'Q5TDH0' 'Q9UBF2' 'P47897' 'Q9HAV7' 'Q7Z4V5' 'P08238' 'Q16836' 'Q9Y4L1'
 'P46777' 'Q16204' 'P35580' 'P82650' 'P46940' 'P0DP23' 'Q10567' 'Q15691'
 'Q9H3P7' 'Q7Z417' 'Q14444' 'Q9H444' 'Q9UNZ2' 'P26368' 'Q04637' 'O60610'
 'P22061' 'Q9Y266' 'P09874' 'O15371' 'P09234' 'P35520' 'P13639' 'P52272'
 'P35579' 'P11940' 'P62857' 'Q12906' 'O60885' 'Q15019' 'Q9UK76' 'O43242'
 'P25205' 'Q13263' 'P46937' 'P06748' 'Q9NYP7' 'P07437' 'Q13435' 'P17987'
 'Q9H8Y8' 'P31948' 'P63261' 'P61254' 'P04406' 'P61978' 'P68363' 'P34932'
 'P23193' 'Q6P2E9' 'O14980' 'Q00341' 'O43660' 'Q14008' 'P50990' 'P23246'
 'O75533' 'P00367' 'Q86U42' 'P16949' 'Q9HB71' 'Q9UQE7' 'Q15365' 'P68371'
 'P62258' 'Q8WWY3' 'Q8WUH6' 'Q15366' 'P60842' 'P19022' 'Q13526' 'Q6PKG0'
 'Q9Y5S9' 'Q14974' 'P14678' 'P40222' 'Q13126' 'P29401' 'O60762' 'Q15181'
 'P62316' 'P48739' 'Q99536' 'Q

In [220]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
# Files are written to `cif_dir`. The call returns three collections, named here after the
# "Valid" / "Invalid" / "Existing" counts that structuremap logs for the run.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 658/658 [00:00<00:00, 175418.04it/s]

2025-02-24 12:31:03> Valid proteins: 0
2025-02-24 12:31:03> Invalid proteins: 0
2025-02-24 12:31:03> Existing proteins: 658





In [221]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
# Files are written to `pae_dir`, which the accessibility cells later pass as `error_dir`.
# The call returns three collections, named here after the "Valid" / "Invalid" / "Existing"
# counts that structuremap logs for the run.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 658/658 [00:00<00:00, 194561.30it/s]

2025-02-24 12:31:04> Valid proteins: 0
2025-02-24 12:31:04> Invalid proteins: 0
2025-02-24 12:31:04> Existing proteins: 658





## Construct Alphafold Dataframe (Calculate Accessibilities) - ChURRO_3

In [222]:
# Format alphafold data into dataframe
# Reads the cif files in `cif_dir`, restricted to `unique_uniprotIDs`, into one table with a row
# per residue (protein_id, AA, position, quality, backbone coordinates, secondary structure, ...).
alphafold_annotation_ChURRO_3 = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_ChURRO_3

100%|██████████| 1970/1970 [01:00<00:00, 32.31it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A1X283,1,M,1,41.06,-40.738,-40.530,-39.478,-41.824,5.011,...,-2.245,-1.467,-2.315,unstructured,unstructured,0,0,0,0,1
1,A1X283,1,P,2,48.82,-38.958,-39.990,-39.471,-40.218,3.031,...,-1.263,-2.235,-2.085,HELX_LH_PP_P,HELX,0,1,0,0,0
2,A1X283,1,P,3,62.68,-36.496,-37.841,-38.162,-38.923,2.149,...,1.930,3.067,0.962,HELX_LH_PP_P,HELX,0,1,0,0,0
3,A1X283,1,R,4,69.86,-33.663,-34.112,-33.161,-35.493,1.428,...,1.042,1.402,1.462,HELX_LH_PP_P,HELX,0,1,0,0,0
4,A1X283,1,R,5,83.19,-31.288,-32.426,-31.916,-33.030,-0.256,...,1.515,0.429,0.947,HELX_LH_PP_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399795,Q9Y6A5,658,K,834,91.67,-99.597,-98.201,-98.207,-97.236,-2.080,...,109.319,108.734,108.592,HELX_RH_AL_P,HELX,0,1,0,0,0
399796,Q9Y6A5,658,M,835,85.91,-101.353,-101.284,-101.535,-100.002,-4.680,...,108.112,106.641,108.230,HELX_RH_AL_P,HELX,0,1,0,0,0
399797,Q9Y6A5,658,E,836,86.42,-100.155,-100.156,-98.883,-100.256,-6.166,...,110.108,109.801,109.175,TURN_TY1_P,TURN,0,0,0,1,0
399798,Q9Y6A5,658,K,837,80.58,-101.582,-100.229,-99.040,-100.206,-3.677,...,113.276,113.481,111.908,TURN_TY1_P,TURN,0,0,0,1,0


In [223]:
# Calculate full sphere exposure -> radius = 2
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column is referenced later as 'nAA_2_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

  0%|          | 0/658 [00:00<?, ?it/s]

100%|██████████| 658/658 [00:07<00:00, 91.16it/s] 


In [224]:
# Start the accessibility table from the raw per-residue annotation and attach the
# radius-2 exposure values, matching rows on protein, amino acid and position.
alphafold_accessibility_ChURRO_3 = pd.merge(
    alphafold_annotation_ChURRO_3,
    exposure_sphere_rad2,
    how='left',
    on=['protein_id', 'AA', 'position'],
)
alphafold_accessibility_ChURRO_3;

In [225]:
# Calculate full sphere exposure -> radius = 3
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column is referenced later as 'nAA_3_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 658/658 [00:06<00:00, 96.12it/s] 


In [226]:
# Attach the radius-3 exposure values (exposure_sphere_rad3) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad3.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad3, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [227]:
# Calculate full sphere exposure -> radius = 4
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column is referenced later as 'nAA_4_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 658/658 [00:06<00:00, 98.63it/s] 


In [228]:
# Attach the radius-4 exposure values (exposure_sphere_rad4) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad4.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad4, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [229]:
# Calculate full sphere exposure -> radius = 4.5
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column is referenced later as 'nAA_4.5_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 658/658 [00:07<00:00, 89.02it/s] 


In [230]:
# Attach the radius-4.5 exposure values (exposure_sphere_rad4_5) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad4_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad4_5, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [231]:
# Calculate full sphere exposure -> radius = 5
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column is referenced later as 'nAA_5_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 658/658 [00:06<00:00, 94.99it/s] 


In [232]:
# Attach the radius-5 exposure values (exposure_sphere_rad5) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad5, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [233]:
# Calculate full sphere exposure -> radius = 5.5
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_5.5_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 658/658 [00:06<00:00, 94.58it/s] 


In [234]:
# Attach the radius-5.5 exposure values (exposure_sphere_rad5_5) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad5_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad5_5, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [235]:
# Calculate full sphere exposure -> radius = 6
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_6_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 658/658 [00:06<00:00, 95.26it/s] 


In [236]:
# Attach the radius-6 exposure values (exposure_sphere_rad6) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad6.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad6, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [237]:
# Calculate full sphere exposure -> radius = 6.5
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_6.5_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 658/658 [00:07<00:00, 89.26it/s] 


In [238]:
# Attach the radius-6.5 exposure values (exposure_sphere_rad6_5) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad6_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad6_5, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [239]:
# Calculate full sphere exposure -> radius = 7
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_7_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 658/658 [00:07<00:00, 91.66it/s] 


In [240]:
# Attach the radius-7 exposure values (exposure_sphere_rad7) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad7.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad7, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [241]:
# Calculate full sphere exposure -> radius = 7.5
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_7.5_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 658/658 [00:07<00:00, 92.52it/s] 


In [242]:
# Attach the radius-7.5 exposure values (exposure_sphere_rad7_5) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad7_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad7_5, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [243]:
# Calculate full sphere exposure -> radius = 8
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_8_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 658/658 [00:07<00:00, 91.98it/s] 


In [244]:
# Attach the radius-8 exposure values (exposure_sphere_rad8) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad8.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad8, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [245]:
# Calculate full sphere exposure -> radius = 12
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_12_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

  0%|          | 0/658 [00:00<?, ?it/s]

100%|██████████| 658/658 [00:07<00:00, 83.79it/s] 


In [246]:
# Attach the radius-12 exposure values (exposure_sphere_rad12) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad12.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad12, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [247]:
# Calculate full sphere exposure -> radius = 18
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_18_180_pae'. The trailing ';' suppresses cell output.
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 658/658 [00:10<00:00, 65.01it/s] 


In [248]:
# Attach the radius-18 exposure values (exposure_sphere_rad18) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad18.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad18, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [249]:
# Calculate full sphere exposure -> radius = 24
# max_angle=180 is the full-sphere setting; error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_24_180_pae' (its smoothed form feeds the IDR flag).
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 658/658 [00:12<00:00, 54.21it/s] 


In [250]:
# Attach the radius-24 exposure values (exposure_sphere_rad24) to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_sphere_rad24.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad24, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3;

In [251]:
# Calculate part sphere exposure -> angle = 70, radius = 12
# Same call as the full-sphere cells but with max_angle=70 instead of 180;
# error_dir is the folder holding the downloaded PAE files.
# The value column appears in the merged table as 'nAA_12_70_pae'. The trailing ';' suppresses cell output.
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_3, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 658/658 [00:08<00:00, 75.36it/s] 


In [252]:
# Attach the part-sphere (angle 70, radius 12) exposure values to the accessibility table.
# Value columns left over from an earlier run of this cell are dropped first; otherwise a
# re-run would merge them a second time and pandas would rename the pair with _x/_y suffixes.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_ChURRO_3 = (
    alphafold_accessibility_ChURRO_3
    .drop(columns=exposure_ang70_rad12.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_ang70_rad12, how='left', on=merge_keys)
)
alphafold_accessibility_ChURRO_3

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A1X283,1,M,1,41.06,-40.738,-40.530,-39.478,-41.824,5.011,...,1,1,1,1,1,1,2,3,4,0
1,A1X283,1,P,2,48.82,-38.958,-39.990,-39.471,-40.218,3.031,...,2,2,2,2,2,2,3,4,7,0
2,A1X283,1,P,3,62.68,-36.496,-37.841,-38.162,-38.923,2.149,...,2,2,2,2,2,2,4,6,11,0
3,A1X283,1,R,4,69.86,-33.663,-34.112,-33.161,-35.493,1.428,...,1,2,2,2,2,2,4,11,26,0
4,A1X283,1,R,5,83.19,-31.288,-32.426,-31.916,-33.030,-0.256,...,2,2,2,2,2,2,7,25,41,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399795,Q9Y6A5,658,K,834,91.67,-99.597,-98.201,-98.207,-97.236,-2.080,...,2,2,2,3,4,4,10,13,17,1
399796,Q9Y6A5,658,M,835,85.91,-101.353,-101.284,-101.535,-100.002,-4.680,...,0,2,2,2,2,4,8,12,15,1
399797,Q9Y6A5,658,E,836,86.42,-100.155,-100.156,-98.883,-100.256,-6.166,...,0,2,2,2,2,2,7,10,14,2
399798,Q9Y6A5,658,K,837,80.58,-101.582,-100.229,-99.040,-100.206,-3.677,...,0,2,2,2,2,2,5,9,12,2


In [253]:
# Smooth every exposure column along the protein sequence with structuremap's get_smooth_score.
# The second argument lists the columns to smooth; [10] is the window setting, and each result
# is added as a new column with a `_smooth10` suffix (e.g. 'nAA_24_180_pae_smooth10').
# NOTE(review): the output values look like averages over position +/- 10 residues
# (21 residues in the protein interior, fewer at the termini) - confirm against structuremap docs.
alphafold_accessibility_ChURRO_3_smooth = get_smooth_score(
    alphafold_accessibility_ChURRO_3, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_ChURRO_3_smooth;

100%|██████████| 658/658 [00:01<00:00, 563.68it/s]


In [254]:
# Binary IDR flag: 1 where the smoothed radius-24 full-sphere exposure is <= 34.27, else 0.
# NOTE(review): 34.27 is a hard-coded cutoff; presumably taken from the structuremap
# tutorial's IDR definition - confirm the source and consider naming the constant.
alphafold_accessibility_ChURRO_3_smooth['IDR'] = (
    alphafold_accessibility_ChURRO_3_smooth['nAA_24_180_pae_smooth10']
    .le(34.27)     # missing values compare False and therefore map to 0
    .astype(int)
)
alphafold_accessibility_ChURRO_3_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A1X283,1,M,1,41.06,-40.738,-40.530,-39.478,-41.824,5.011,...,2.090909,2.181818,2.818182,3.000000,3.818182,11.181818,30.636364,49.545455,2.000000,0
1,A1X283,1,P,2,48.82,-38.958,-39.990,-39.471,-40.218,3.031,...,2.166667,2.333333,3.000000,3.333333,4.250000,12.083333,32.666667,53.333333,2.666667,0
2,A1X283,1,P,3,62.68,-36.496,-37.841,-38.162,-38.923,2.149,...,2.153846,2.307692,3.000000,3.307692,4.230769,12.230769,33.153846,55.538462,2.692308,0
3,A1X283,1,R,4,69.86,-33.663,-34.112,-33.161,-35.493,1.428,...,2.142857,2.285714,3.000000,3.357143,4.214286,12.428571,33.714286,57.214286,2.500000,0
4,A1X283,1,R,5,83.19,-31.288,-32.426,-31.916,-33.030,-0.256,...,2.200000,2.333333,3.133333,3.466667,4.333333,12.933333,34.266667,58.733333,2.733333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,Q9Y6A5,658,K,834,91.67,-99.597,-98.201,-98.207,-97.236,-2.080,...,2.266667,3.533333,4.533333,5.466667,5.733333,9.733333,15.266667,19.866667,2.133333,1
834,Q9Y6A5,658,M,835,85.91,-101.353,-101.284,-101.535,-100.002,-4.680,...,2.214286,3.428571,4.428571,5.285714,5.571429,9.571429,15.000000,19.428571,2.071429,1
835,Q9Y6A5,658,E,836,86.42,-100.155,-100.156,-98.883,-100.256,-6.166,...,2.153846,3.230769,4.230769,5.153846,5.461538,9.384615,14.615385,18.923077,2.076923,1
836,Q9Y6A5,658,K,837,80.58,-101.582,-100.229,-99.040,-100.206,-3.677,...,2.166667,3.000000,4.083333,5.000000,5.250000,9.166667,14.250000,18.416667,2.083333,1


## Merge Dataframes into Full Dataset (Includes Alphafold) - ChURRO_3

In [255]:
# AlphaFold residue positions start at 1, whereas "Methionine Location" is a 0-based string
# offset, so the AlphaFold positions are shifted down by one to match the initial dataframe.
# The shift is applied in place and is therefore guarded: it only runs while the column is
# still 1-indexed (minimum >= 1), so re-running this cell cannot shift the positions a second
# time and silently attach the wrong residue to every peptide.
if alphafold_accessibility_ChURRO_3_smooth["position"].min() >= 1:
    alphafold_accessibility_ChURRO_3_smooth["position"] = alphafold_accessibility_ChURRO_3_smooth["position"] - 1

# Left join: keep every peptide and attach the AlphaFold row of its modified methionine.
# Peptides without a matching (protein, position) row get NaN in the AlphaFold columns.
peptides_wa = peptides_cs.merge(
    alphafold_accessibility_ChURRO_3_smooth,
    how="left",
    left_on=["Protein ID", "Methionine Location"],
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,3.238095,4.666667,6.809524,8.047619,10.047619,35.095238,102.000000,202.285714,9.095238,0
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,2.666667,4.571429,5.523810,6.809524,7.904762,25.904762,86.714286,172.285714,8.285714,0
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,2.285714,3.095238,4.285714,5.333333,6.666667,17.952381,49.857143,82.714286,4.000000,0
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,2.238095,4.095238,5.190476,6.000000,6.904762,19.380952,55.857143,99.571429,5.047619,0
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,2.285714,3.476190,4.428571,5.952381,6.619048,15.095238,42.142857,81.523810,2.904762,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,2.619048,3.190476,4.428571,5.714286,6.714286,15.904762,41.190476,79.857143,2.809524,0
1176,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,2.000000,2.000000,2.000000,2.000000,2.047619,4.095238,7.285714,10.857143,0.000000,1
1177,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,2.285714,3.523810,4.761905,5.809524,6.476190,14.714286,38.761905,57.476190,3.047619,0
1178,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,2.571429,4.476190,6.428571,7.904762,9.666667,35.047619,96.285714,192.095238,10.952381,0


In [257]:
# NOTE: five peptide sequences differ between UniProt & AlphaFold
# Show (with all columns visible) the rows whose AlphaFold residue at the methionine location is
# not "M", then keep only the rows where it is. Rows with no AlphaFold match (AA is NaN) are
# shown and dropped as well.
# option_context restores the previous display setting even if display() raises, and restores
# the user's prior value rather than the pandas default.
with pd.option_context("display.max_columns", None):
    display(peptides_wa[peptides_wa["AA"] != "M"])
peptides_wa = peptides_wa[peptides_wa["AA"] == "M"]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pval,average,Site,Label,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
236,AQM[661.3660]QTVPNAGHWIHADRPQDFIAAIR,AQM[667.3735]QTVPNAGHWIHADRPQDFIAAIR,,,-0.665441,,,,,-0.52563,,sp|Q8NFV4|ABHDB_HUMAN,Q8NFV4,ABHDB_HUMAN,ABHD11,sn-1-specific diacylglycerol lipase ABHD11,0.074388,-0.595535,M279,ABHDB_M279,grey,MLRWTRAWRLPREGLGPHGPSFARVPVAPSSSSGGRGGAEPRPLPL...,AQMQTVPNAGHWIHADRPQDFIAAIR,276,26,AQ,2,278,SQFVHPSHHPEIMRLFPRAQ,QTVPNAGHWIHADRPQDFIA,Q8NFV4,490,I,278,98.29,-9.711,-8.459,-7.565,-7.718,10.842,10.878,9.638,12.126,13.765,12.886,13.127,13.109,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,2,2,3,4,7,9,10,28,68,115,14,0.0,0.0,0.0,0.0,2.0,2.095238,2.571429,3.571429,5.095238,6.333333,7.52381,19.190476,54.619048,100.714286,4.857143,0
384,TVVTGIEM[661.3660]FHK,TVVTGIEM[667.3735]FHK,,0.88235,0.497077,,,,,,,sp|P49411|EFTU_HUMAN,P49411,EFTU_HUMAN,TUFM,"Elongation factor Tu, mitochondrial",0.173389,0.689714,M311,EFTU_M311,grey,MTTMAAATLLRATPHFSGLAAGRTFLLQGLLRLLKAPALPLLCRGL...,TVVTGIEMFHK,303,11,TVVTGIE,7,310,DECELLGHSKNIRTVVTGIE,FHKSLERAEAGDNLGALVRG,P49411,259,K,310,89.03,20.034,20.724,20.766,20.01,-2.936,-2.891,-1.461,-3.759,10.367,9.007,8.453,8.07,STRN,STRN,0,0,1,0,0,0,0,0,0,2,2,2,2,3,5,6,11,34,86,2,0.0,0.0,0.0,0.0,2.0,2.142857,2.380952,2.857143,3.952381,5.142857,6.333333,16.761905,52.952381,101.285714,3.380952,0
416,EHM[661.3660]GNVVEALIALTN,EHM[667.3735]GNVVEALIALTN,,0.089979,-0.17292,,,,0.215108,0.260641,0.204769,sp|Q9NX55|HYPK_HUMAN,Q9NX55,HYPK_HUMAN,HYPK,Huntingtin-interacting protein K,0.201761,0.119515,M109,HYPK_M109,grey,MATEGDVELELETETSGPERPPEKPRKHDSGAADLERVTDYAEEKE...,EHMGNVVEALIALTN,106,15,EH,2,108,LIMTEMEISRAAAERSLREH,GNVVEALIALTN,Q9NX55,596,A,108,92.77,3.291,2.081,2.371,0.883,1.961,2.414,3.753,2.57,-7.576,-8.404,-9.096,-7.572,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,2,2,2,5,6,9,12,24,39,44,12,0.0,0.0,0.0,0.0,2.0,2.0,2.095238,3.52381,4.714286,5.904762,6.619048,15.0,33.238095,43.809524,3.47619,0
1024,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAM[661.3660]ETG...,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAM[667.3735]ETG...,,0.180237,0.182949,,,,,-0.549589,,sp|Q8NFD5|ARI1B_HUMAN,Q8NFD5,ARI1B_HUMAN,ARID1B,AT-rich interactive domain-containing protein 1B,0.822594,-0.062134,M142,ARI1B_M142,grey,MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG...,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAMETGLLPNHK,109,42,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSA,32,141,SSSAAAAAASSSSSSGPGSA,ETGLLPNHKLKTVGEAPAAP,Q8NFD5,489,G,141,30.16,-26.329,-27.241,,-28.344,76.067,77.156,,76.553,0.29,0.884,,1.665,unstructured,unstructured,0,0,0,0,1,0,0,0,1,2,2,2,2,2,2,2,4,7,10,0,0.0,0.0,0.0,0.619048,1.380952,1.47619,2.0,2.0,2.0,2.0,2.0,4.0,6.428571,9.904762,0.285714,1
1152,AQEAAAAVM[661.3660]QAAANSAQSR,AQEAAAAVM[667.3735]QAAANSAQSR,,,-0.257459,,,,,0.284424,,sp|Q8NFD5|ARI1B_HUMAN,Q8NFD5,ARI1B_HUMAN,ARID1B,AT-rich interactive domain-containing protein 1B,0.968347,0.013483,M1020,ARI1B_M1020,grey,MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG...,AQEAAAAVMQAAANSAQSR,1011,19,AQEAAAAV,8,1019,GMGPPMPTVNRKAQEAAAAV,QAAANSAQSRQGSFPGMNQS,Q8NFD5,489,K,1019,31.07,13.037,11.897,11.484,12.313,-20.213,-19.269,-19.647,-17.839,-82.437,-82.871,-84.311,-82.847,unstructured,unstructured,0,0,0,0,1,0,0,0,1,2,2,2,2,2,2,2,4,7,10,0,0.0,0.0,0.0,0.428571,1.333333,1.47619,2.0,2.0,2.0,2.0,2.0,4.0,6.428571,9.809524,0.142857,1


In [258]:
# Sanity check - the residue number in "Site" (e.g. "M549" -> 549, 1-based) must equal the
# computed 0-based methionine location + 1; an empty table below means every row agrees

#pd.set_option("display.max_columns", None)
display(
    peptides_wa.loc[
        peptides_wa["Site"].str.strip("M").astype(int).ne(peptides_wa["Methionine Location"] + 1)
    ]
)
#pd.reset_option("display.max_columns")

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR


In [261]:
# One-off export of the annotated table to "ChURRO_3_with_alphafold.csv", the same
# file the next cell reads back. Left commented out -- presumably so that re-running
# the notebook does not overwrite the saved copy (TODO confirm).
#peptides_wa.to_csv(os.path.join(curr_dir_path, "ChURRO_3_with_alphafold.csv"))

In [262]:
# Reload the AlphaFold-annotated peptide table from the CSV saved next to this notebook.
path = os.path.join(curr_dir_path, "ChURRO_3_with_alphafold.csv")

# The row labels come back from the CSV as a regular column called "Unnamed: 0";
# move that column into the index and drop its name so the frame has a plain,
# unnamed index.
peptides_wa = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,3.238095,4.666667,6.809524,8.047619,10.047619,35.095238,102.000000,202.285714,9.095238,0
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,2.666667,4.571429,5.523810,6.809524,7.904762,25.904762,86.714286,172.285714,8.285714,0
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,2.285714,3.095238,4.285714,5.333333,6.666667,17.952381,49.857143,82.714286,4.000000,0
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,2.238095,4.095238,5.190476,6.000000,6.904762,19.380952,55.857143,99.571429,5.047619,0
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,2.285714,3.476190,4.428571,5.952381,6.619048,15.095238,42.142857,81.523810,2.904762,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,2.619048,3.190476,4.428571,5.714286,6.714286,15.904762,41.190476,79.857143,2.809524,0
1176,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,2.000000,2.000000,2.000000,2.000000,2.047619,4.095238,7.285714,10.857143,0.000000,1
1177,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,2.285714,3.523810,4.761905,5.809524,6.476190,14.714286,38.761905,57.476190,3.047619,0
1178,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,2.571429,4.476190,6.428571,7.904762,9.666667,35.047619,96.285714,192.095238,10.952381,0


## The End (For Now)