## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

structuremap.utils.set_logger()

## Set Parameters of Analysis - ChURRO_1

In [2]:
# Set parameters of analysis

# Number of amino acids on either side of the modified methionine to analyze.
analysis_threshold = 20

# Mass tags exactly as they appear inside the M[...] brackets of the peptide strings.
# In the loaded dataset the "Light Modified Peptide" column carries M[577.3085] and the
# "Heavy Modified Peptide" column carries M[583.3160], so the light tag is the smaller mass.
# (These two assignments were previously swapped.)
light_modification = "577.3085"
heavy_modification = "583.3160"

# Modifications we are looking for. These are plain mass strings; they are escaped and
# turned into a regex by create_modifications_pattern.
modifications = [light_modification, heavy_modification]

## Load Dataset - ChURRO_1

In [4]:
# Set correct pathing
# Both locations are given relative to the notebook's working directory and then
# resolved to absolute paths so later os.path.join calls do not depend on the cwd string.

curr_dir_path_str, global_data_path_str = "./", "../global_data"
curr_dir_path, global_data_path = map(
    os.path.abspath, (curr_dir_path_str, global_data_path_str)
)

print("Current Directory: ", curr_dir_path, sep="")
print("Global Data Directory: ", global_data_path, sep="")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/ChURRO_revisions
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [5]:
# Load initial dataset
# Reads ChURRO_1_isoDTB.csv from the notebook directory (curr_dir_path, resolved in the
# pathing cell) into `peptides`; the bare last line displays the frame.
data_loc = os.path.join(curr_dir_path, "ChURRO_1_isoDTB.csv")
peptides = pd.read_csv(data_loc)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,sp|P11142|HSP7C_HUMAN,P11142,HSP7C_HUMAN,HSPA8,Heat shock cognate 71 kDa protein,5.529095e-04,4.131862,3.257346,M549,HSP7C_M549
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,CKAP4,Cytoskeleton-associated protein 4,1.337810e-07,3.424534,6.873606,M423,CKAP4_M423
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,sp|Q9Y490|TLN1_HUMAN,Q9Y490,TLN1_HUMAN,TLN1,Talin-1,1.377828e-03,2.946435,2.860805,M1759,TLN1_M1759
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,sp|P99999|CYC_HUMAN,P99999,CYC_HUMAN,CYCS,Cytochrome c,2.391139e-03,2.876139,2.621395,M66,CYC_M66
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,sp|P00374|DYR_HUMAN,P00374,DYR_HUMAN,DHFR,Dihydrofolate reductase,1.270312e-02,2.831927,1.896090,M112,DYR_M112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,sp|Q9H444|CHM4B_HUMAN,Q9H444,CHM4B_HUMAN,CHMP4B,Charged multivesicular body protein 4b,9.760989e-01,-0.002733,0.010506,M35,CHM4B_M35
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,sp|Q15233|NONO_HUMAN,Q15233,NONO_HUMAN,NONO,Non-POU domain-containing octamer-binding protein,9.809736e-01,0.002278,0.008343,M441,NONO_M441
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,KTN1,Kinectin,9.825290e-01,-0.008675,0.007655,M406,KTN1_M406
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,sp|P51148|RAB5C_HUMAN,P51148,RAB5C_HUMAN,RAB5C,Ras-related protein Rab-5C,9.893231e-01,-0.012026,0.004662,M89,RAB5C_M89


In [6]:
# Canonicalize data - none to do here
# Placeholder step: the frame is left unchanged; the trailing `;` suppresses the cell output.
peptides;

In [8]:
# Manual labeling of peptides
# The labels are positional: the first 513 rows are "red", the next 81 "blue" and the
# remaining 457 "grey". This is only meaningful while the CSV keeps that exact row order.
label_col_data = ["red"] * 513 + ["blue"] * 81 + ["grey"] * 457

# Guard against a silently mis-sized label vector: assigning a Series of the wrong length
# would otherwise leave NaN labels (or drop labels) without any error.
if len(label_col_data) != len(peptides):
    raise ValueError(
        "Manual colour labels cover {} rows but the dataset has {} rows".format(
            len(label_col_data), len(peptides)
        )
    )

# Bind the labels to the frame's own index so assignment is positional, not label-aligned.
label_col = pd.Series(label_col_data, index=peptides.index)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [9]:
# Distinct UniProt accessions present in the dataset, in order of first appearance.
unique_uniprotIDs = pd.unique(peptides["Protein ID"])
print("Unique UniProt IDs: \n", unique_uniprotIDs, sep="")
print("Number of Unique UniProt IDs: ", unique_uniprotIDs.size, sep="")

Unique UniProt IDs: 
['P11142' 'Q07065' 'Q9Y490' 'P99999' 'P00374' 'O43837' 'P04406' 'P63261'
 'P07437' 'Q15233' 'P13693' 'P62820' 'P39023' 'P46777' 'Q13263' 'P27694'
 'P22626' 'O75208' 'P07954' 'Q92841' 'Q9UMX0' 'P06576' 'P12004' 'Q8WWM7'
 'Q14152' 'P06733' 'O43242' 'Q00577' 'P13639' 'P10809' 'P22307' 'P24752'
 'P27816' 'O15042' 'P62258' 'Q9BR76' 'P34932' 'Q969T9' 'Q13526' 'Q96A49'
 'Q9NX55' 'P0DMV8' 'P26038' 'P08238' 'Q9Y237' 'Q9BQE3' 'Q9BYN8' 'Q9GZM5'
 'Q9Y4L1' 'P26583' 'P53999' 'P27635' 'Q13428' 'O75396' 'P12270' 'Q96CT7'
 'P28066' 'Q8N8S7' 'O14974' 'P68133' 'P27797' 'P09874' 'O14654' 'Q14204'
 'P60842' 'Q86Y82' 'Q9Y2L1' 'Q9NP61' 'P06493' 'Q96MW1' 'P31943' 'P31948'
 'Q12907' 'P08708' 'Q5JSH3' 'P33316' 'P68371' 'Q7Z5L9' 'Q8WYA6' 'Q14684'
 'P06748' 'P52272' 'P15121' 'Q9BZI7' 'Q13435' 'P78344' 'O43719' 'Q02790'
 'Q68EM7' 'P35580' 'Q14247' 'P61978' 'P49750' 'P35241' 'Q9Y266' 'P07814'
 'Q08211' 'P49792' 'P51858' 'Q15366' 'O60814' 'P13804' 'P11940' 'P35579'
 'P61088' 'Q9NQC3' 'Q14980' 'O

In [10]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    """Fetch the complete amino-acid sequence of a UniProt entry.

    Parameters
    ----------
    cID : str
        UniProt accession (e.g. "P11142").

    Returns
    -------
    str
        Sequence of the first FASTA record returned for the accession.

    Raises
    ------
    requests.exceptions.HTTPError
        If UniProt answers with a 4xx/5xx status (e.g. unknown accession).
    ValueError
        If the response body contains no FASTA record.
    """
    # UniProtKB REST endpoint for a single entry; https instead of the legacy plain-http URL.
    baseUrl = "https://rest.uniprot.org/uniprotkb/"
    currentUrl = baseUrl + cID + ".fasta"

    # Retrieving an entry is a read, so use GET (not POST). The timeout keeps a stalled
    # connection from hanging a long progress_apply run indefinitely.
    response = r.get(currentUrl, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page as FASTA.
    response.raise_for_status()

    pSeq = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not pSeq:
        raise ValueError("No FASTA record returned for UniProt ID {!r}".format(cID))

    return str(pSeq[0].seq)

In [12]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [13]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
# The cache CSV is written with DataFrame.to_csv including the index (see the commented-out
# update cell), so the saved index comes back as a column named "Unnamed: 0"; it is restored
# as the (unnamed) index here.
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [14]:
# Attach the full protein sequence to every peptide row.
# Left join: every peptide row is kept; a Protein ID missing from the cache would yield
# NaN in "Complete Sequence" (and break the str.find step further down).
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label,Color,Complete Sequence
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,HSP7C_HUMAN,HSPA8,Heat shock cognate 71 kDa protein,5.529095e-04,4.131862,3.257346,M549,HSP7C_M549,red,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,CKAP4_HUMAN,CKAP4,Cytoskeleton-associated protein 4,1.337810e-07,3.424534,6.873606,M423,CKAP4_M423,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,TLN1_HUMAN,TLN1,Talin-1,1.377828e-03,2.946435,2.860805,M1759,TLN1_M1759,red,MVALSLKISIGNVVKTMQFEPSTMVYDACRIIRERIPEAPAGPPSD...
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,CYC_HUMAN,CYCS,Cytochrome c,2.391139e-03,2.876139,2.621395,M66,CYC_M66,red,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,DYR_HUMAN,DHFR,Dihydrofolate reductase,1.270312e-02,2.831927,1.896090,M112,DYR_M112,red,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,CHM4B_HUMAN,CHMP4B,Charged multivesicular body protein 4b,9.760989e-01,-0.002733,0.010506,M35,CHM4B_M35,grey,MSVFGKLFGAGGGKAGKGGPTPQEAIQRLRDTEEMLSKKQEFLEKK...
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,NONO_HUMAN,NONO,Non-POU domain-containing octamer-binding protein,9.809736e-01,0.002278,0.008343,M441,NONO_M441,grey,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,KTN1_HUMAN,KTN1,Kinectin,9.825290e-01,-0.008675,0.007655,M406,KTN1_M406,grey,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,RAB5C_HUMAN,RAB5C,Ras-related protein Rab-5C,9.893231e-01,-0.012026,0.004662,M89,RAB5C_M89,grey,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...


In [16]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    """Build a regex alternation matching a methionine carrying any of the given mass tags.

    Each mass string ``m`` becomes the literal pattern ``M\\[m\\]`` (brackets and any regex
    metacharacters in ``m`` escaped); the per-mass patterns are joined with ``|`` in the
    order given.

    Parameters
    ----------
    modifications : sequence of str
        Mass tags as they appear between the brackets, e.g. ``["577.3085", "583.3160"]``.
        Tags without a decimal point (e.g. ``"16"``) are accepted as well.

    Returns
    -------
    str
        Regex pattern, e.g. ``M\\[577\\.3085\\]|M\\[583\\.3160\\]``.

    Raises
    ------
    ValueError
        If ``modifications`` is empty (an empty pattern would match everything).
    """
    # Local import: `re` is not among the notebook's top-level imports.
    import re

    if len(modifications) == 0:
        raise ValueError("modifications must contain at least one mass string")

    # re.escape handles the decimal point (and anything else special) uniformly, so the
    # pattern no longer depends on each tag containing exactly one ".".
    return "|".join(r"M\[{}\]".format(re.escape(mass)) for mass in modifications)

# Build the regex once from the configured `modifications` list and print it for inspection.
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[577\.3085\]|M\[583\.3160\]


In [17]:
# Extract clean AA sequence from peptides (for indexing purposes)
# Everything that is not one of the 20 one-letter amino-acid codes (brackets, digits,
# dots of the mass annotations) is stripped from the modified-peptide string.
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` reduced to the characters that occur in IUPACCodes."""
    kept = (residue for residue in string if residue in IUPACCodes)
    return ''.join(kept)


light_peptides = peptides_cs["Light Modified Peptide"]
peptides_cs["Peptide Sequence"] = light_peptides.map(filtering)
peptides_cs;

In [18]:
# 0-based start index of each peptide inside its protein's complete sequence.
# str.find returns the first occurrence, or -1 when the peptide is not found; the sanity
# check two cells below would expose such rows.
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [19]:
# Number of residues in the cleaned peptide (annotations already stripped).
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [20]:
# Sanity check - ensure sequence indexing is correct
# Re-slice each complete sequence at [location, location + length) and compare with the
# cleaned peptide; every row should count as True.
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    1051
Name: count, dtype: int64

In [21]:
# Extract left prefix of modified methionine (for indexing purposes)
# The prefix is everything in the annotated peptide that precedes the target-modified
# methionine; once reduced to plain residues, its length is the methionine's offset
# within the peptide.

# NOTE: IUPACCodes and filtering repeat the definitions from the "clean AA sequence" cell.
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

# Group 0 = greedy prefix, group 1 = the modified methionine itself. Because (.*) is
# greedy, a peptide containing several target modifications resolves to the LAST one.
left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
# Strip other annotations (e.g. M[15.9949]) from the prefix so only residues are counted.
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[577\.3085\]|M\[583\.3160\])


In [22]:
# 0-based index of the modified methionine in the complete sequence:
# peptide start + number of residues preceding the methionine within the peptide.
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [23]:
# Sanity check - ensure methionine locations are correct
# Every computed location must index an "M" in the complete sequence; evaluates to True
# only if that holds for all rows.
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
temp.count("M") == len(temp)

True

In [24]:
# Compute left/right analysis sequences based on threshold
# Left window: the (up to) `analysis_threshold` residues immediately before the methionine.
# The start is clamped at 0 so a methionine close to the N-terminus gets every residue in
# front of it. The previous fallback `A[0:B-1]` dropped the residue adjacent to the
# methionine and, for a methionine at index 0, returned almost the whole protein (A[0:-1]).
peptides_cs[f"Left {analysis_threshold}"] = [A[max(0, B - analysis_threshold):B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
# Right window: the (up to) `analysis_threshold` residues immediately after the methionine;
# slicing past the end of the string simply truncates at the C-terminus.
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,red,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,NSLESYAFNMK,539,11,NSLESYAFN,9,548,DEKQRDKVSSKNSLESYAFN,KATVEDEKLQGKINDEDKQK
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,red,MVALSLKISIGNVVKTMQFEPSTMVYDACRIIRERIPEAPAGPPSD...,TLSHPQQMALLDQTK,1751,15,TLSHPQQ,7,1758,EPLTLAAVGAASKTLSHPQQ,ALLDQTKTLAESALQLLYTA
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,red,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,GIIWGEDTLMEYLENPK,56,17,GIIWGEDTL,9,65,GYSYTAANKNKGIIWGEDTL,EYLENPKKYIPGTKMIFVGI
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,red,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,VDMVWIVGGSSVYK,109,14,VD,2,111,RSLDDALKLTEQPELANKVD,VWIVGGSSVYKEAMNHPGHL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,grey,MSVFGKLFGAGGGKAGKGGPTPQEAIQRLRDTEEMLSKKQEFLEKK...,LRDTEEMLSK,28,10,LRDTEE,6,34,AGKGGPTPQEAIQRLRDTEE,LSKKQEFLEKKIEQELTAAK
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,grey,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,FGQAATMEGIGAIGGTPPAFNR,434,22,FGQAAT,6,440,PDGTLGLTPPTTERFGQAAT,EGIGAIGGTPPAFNRAAPGA
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,grey,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,IHVSYQETQQMQMK,393,14,IHVSYQETQQMQ,12,405,EHNVFQNKIHVSYQETQQMQ,KFQQVREQMEAEIAHLKQEN
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,grey,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...,YHSLAPMYYR,82,10,YHSLAP,6,88,TVKFEIWDTAGQERYHSLAP,YYRGAQAAIVVYDITNTDTF


In [33]:
# Remove invalid proteins (according to alphafold)
# 18 invalid peptides as a result -> 9 red, 0 blue, 9 grey
# NOTE(review): presumably these are accessions without a usable AlphaFold model - confirm.
# The boolean filter keeps the original row labels (no reset_index), so the index of
# peptides_cs has gaps from here on.

invalid_IDs = ['Q14204', 'P49792', 'P46013', 'Q9NU22', 'Q9Y520', 'Q9UQ35', 'P78527', 'Q7Z6Z7']
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,red,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,NSLESYAFNMK,539,11,NSLESYAFN,9,548,DEKQRDKVSSKNSLESYAFN,KATVEDEKLQGKINDEDKQK
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,red,MVALSLKISIGNVVKTMQFEPSTMVYDACRIIRERIPEAPAGPPSD...,TLSHPQQMALLDQTK,1751,15,TLSHPQQ,7,1758,EPLTLAAVGAASKTLSHPQQ,ALLDQTKTLAESALQLLYTA
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,red,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,GIIWGEDTLMEYLENPK,56,17,GIIWGEDTL,9,65,GYSYTAANKNKGIIWGEDTL,EYLENPKKYIPGTKMIFVGI
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,red,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,VDMVWIVGGSSVYK,109,14,VD,2,111,RSLDDALKLTEQPELANKVD,VWIVGGSSVYKEAMNHPGHL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,grey,MSVFGKLFGAGGGKAGKGGPTPQEAIQRLRDTEEMLSKKQEFLEKK...,LRDTEEMLSK,28,10,LRDTEE,6,34,AGKGGPTPQEAIQRLRDTEE,LSKKQEFLEKKIEQELTAAK
1047,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,grey,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,FGQAATMEGIGAIGGTPPAFNR,434,22,FGQAAT,6,440,PDGTLGLTPPTTERFGQAAT,EGIGAIGGTPPAFNRAAPGA
1048,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,grey,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,IHVSYQETQQMQMK,393,14,IHVSYQETQQMQ,12,405,EHNVFQNKIHVSYQETQQMQ,KFQQVREQMEAEIAHLKQEN
1049,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,grey,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...,YHSLAPMYYR,82,10,YHSLAP,6,88,TVKFEIWDTAGQERYHSLAP,YYRGAQAAIVVYDITNTDTF


## Download Alphafold Data - ChURRO_1

In [34]:
# Path for alphafold protein data
# cif/ and pae/ are sibling sub-folders of the alphafold data directory; they are used as
# the download targets and as input folders for the structuremap calls further down.

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)
cif_dir, pae_dir = (
    os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae")
)

print(alphafold_path, cif_dir, pae_dir, sep="\n")

/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


In [35]:
# Set uniprot IDs to use
# Recomputed from peptides_cs so the accessions removed as invalid are no longer included.
unique_uniprotIDs = pd.unique(peptides_cs["Protein ID"])
print("Unique UniProt IDs: \n", unique_uniprotIDs, sep="")
print("Number of Unique UniProt IDs: ", unique_uniprotIDs.size, sep="")

Unique UniProt IDs: 
['P11142' 'Q07065' 'Q9Y490' 'P99999' 'P00374' 'O43837' 'P04406' 'P63261'
 'P07437' 'Q15233' 'P13693' 'P62820' 'P39023' 'P46777' 'Q13263' 'P27694'
 'P22626' 'O75208' 'P07954' 'Q92841' 'Q9UMX0' 'P06576' 'P12004' 'Q8WWM7'
 'Q14152' 'P06733' 'O43242' 'Q00577' 'P13639' 'P10809' 'P22307' 'P24752'
 'P27816' 'O15042' 'P62258' 'Q9BR76' 'P34932' 'Q969T9' 'Q13526' 'Q96A49'
 'Q9NX55' 'P0DMV8' 'P26038' 'P08238' 'Q9Y237' 'Q9BQE3' 'Q9BYN8' 'Q9GZM5'
 'Q9Y4L1' 'P26583' 'P53999' 'P27635' 'Q13428' 'O75396' 'P12270' 'Q96CT7'
 'P28066' 'Q8N8S7' 'O14974' 'P68133' 'P27797' 'P09874' 'O14654' 'P60842'
 'Q86Y82' 'Q9Y2L1' 'Q9NP61' 'P06493' 'Q96MW1' 'P31943' 'P31948' 'Q12907'
 'P08708' 'Q5JSH3' 'P33316' 'P68371' 'Q7Z5L9' 'Q8WYA6' 'Q14684' 'P06748'
 'P52272' 'P15121' 'Q9BZI7' 'Q13435' 'P78344' 'O43719' 'Q02790' 'Q68EM7'
 'P35580' 'Q14247' 'P61978' 'P49750' 'P35241' 'Q9Y266' 'P07814' 'Q08211'
 'P51858' 'Q15366' 'O60814' 'P13804' 'P11940' 'P35579' 'P61088' 'Q9NQC3'
 'Q14980' 'O43660' 'P14625' 'O

In [36]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
# Files are written to cif_dir. The three return values are bound to valid_/invalid_/
# existing_ names; the log of the saved run reports 0 valid, 0 invalid, 557 existing.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 557/557 [00:00<00:00, 138895.80it/s]

2025-02-24 11:08:55> Valid proteins: 0
2025-02-24 11:08:55> Invalid proteins: 0
2025-02-24 11:08:55> Existing proteins: 557





In [37]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
# Files are written to pae_dir, which is later passed as error_dir to annotate_accessibility.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 557/557 [00:00<00:00, 149327.41it/s]

2025-02-24 11:08:56> Valid proteins: 0
2025-02-24 11:08:56> Invalid proteins: 0
2025-02-24 11:08:56> Existing proteins: 557





## Construct Alphafold Dataframe (Calculate Accessibilities) - ChURRO_1

In [38]:
# Format alphafold data into dataframe
# Reads the cif files in cif_dir for the selected accessions; the displayed result has one
# row per residue (protein_id, AA, position, quality, coordinates, secondary structure).
alphafold_annotation_ChURRO_1 = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_ChURRO_1

100%|██████████| 1849/1849 [00:51<00:00, 35.56it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A0A2R8Y4L2,1,M,1,84.87,16.672,16.441,15.737,15.649,-4.743,...,-14.713,-13.538,-15.178,unstructured,unstructured,0,0,0,0,1
1,A0A2R8Y4L2,1,R,2,89.95,18.394,18.171,19.389,17.831,-7.753,...,-17.035,-17.849,-15.961,STRN,STRN,0,0,1,0,0
2,A0A2R8Y4L2,1,D,3,89.61,20.042,18.508,17.905,18.174,-10.302,...,-16.919,-17.992,-17.255,unstructured,unstructured,0,0,0,0,1
3,A0A2R8Y4L2,1,P,4,93.86,22.804,22.046,22.252,20.600,-11.701,...,-15.526,-14.118,-15.741,TURN_TY1_P,TURN,0,0,0,1,0
4,A0A2R8Y4L2,1,N,5,94.20,22.703,22.805,22.164,22.171,-13.138,...,-18.112,-17.913,-17.180,TURN_TY1_P,TURN,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352757,Q9Y6X4,557,K,666,53.36,-44.109,-44.548,-43.702,-44.447,-47.884,...,-10.531,-10.393,-11.884,unstructured,unstructured,0,0,0,0,1
352758,Q9Y6X4,557,A,667,54.16,-44.765,-45.645,-46.279,-44.950,-48.051,...,-7.492,-7.817,-8.622,unstructured,unstructured,0,0,0,0,1
352759,Q9Y6X4,557,K,668,46.58,-45.037,-44.530,-44.726,-45.243,-47.928,...,-3.931,-3.624,-5.178,unstructured,unstructured,0,0,0,0,1
352760,Q9Y6X4,557,L,669,56.53,-42.662,-44.147,-44.931,-44.103,-49.445,...,-0.766,-0.790,-2.103,unstructured,unstructured,0,0,0,0,1


In [39]:
# Calculate full sphere exposure -> radius = 2
# max_angle=180 means no angular restriction (full sphere); error_dir points at the
# downloaded PAE files. The resulting column is referenced later as 'nAA_2_180_pae'.
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 557/557 [00:08<00:00, 63.28it/s] 


In [40]:
# Start the accessibility frame: left-join the radius-2 result onto the per-residue
# annotation, keyed on (protein_id, AA, position).
alphafold_accessibility_ChURRO_1 = alphafold_annotation_ChURRO_1.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [41]:
# Calculate full sphere exposure -> radius = 3
# Same call as for radius 2 with max_dist=3; referenced later as 'nAA_3_180_pae'.
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 557/557 [00:07<00:00, 74.57it/s] 


In [42]:
# Left-join the radius-3 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (pandas then
# suffixes the duplicates with _x/_y).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [43]:
# Calculate full sphere exposure -> radius = 4
# Same call with max_dist=4; referenced later as 'nAA_4_180_pae'.
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 557/557 [00:06<00:00, 82.22it/s] 


In [44]:
# Left-join the radius-4 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [45]:
# Calculate full sphere exposure -> radius = 4.5
# Same call with max_dist=4.5; referenced later as 'nAA_4.5_180_pae'.
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 557/557 [00:06<00:00, 84.97it/s] 


In [46]:
# Left-join the radius-4.5 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [47]:
# Calculate full sphere exposure -> radius = 5
# Same call with max_dist=5; referenced later as 'nAA_5_180_pae'.
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 557/557 [00:06<00:00, 88.11it/s] 


In [48]:
# Left-join the radius-5 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [49]:
# Calculate full sphere exposure -> radius = 5.5
# Same call with max_dist=5.5; yields column 'nAA_5.5_180_pae'.
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 557/557 [00:06<00:00, 87.17it/s] 


In [50]:
# Left-join the radius-5.5 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [51]:
# Calculate full sphere exposure -> radius = 6
# Same call with max_dist=6; yields column 'nAA_6_180_pae'.
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 557/557 [00:07<00:00, 78.46it/s] 


In [52]:
# Left-join the radius-6 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [53]:
# Calculate full sphere exposure -> radius = 6.5
# Same call with max_dist=6.5; yields column 'nAA_6.5_180_pae'.
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 557/557 [00:06<00:00, 82.95it/s] 


In [54]:
# Left-join the radius-6.5 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [55]:
# Calculate full sphere exposure -> radius = 7
# Same call with max_dist=7; yields column 'nAA_7_180_pae'.
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 557/557 [00:06<00:00, 84.71it/s] 


In [56]:
# Left-join the radius-7 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [57]:
# Calculate full sphere exposure -> radius = 7.5
# Same call with max_dist=7.5; yields column 'nAA_7.5_180_pae'.
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 557/557 [00:06<00:00, 87.42it/s] 


In [58]:
# Left-join the radius-7.5 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [59]:
# Calculate full sphere exposure -> radius = 8
# Same call with max_dist=8; yields column 'nAA_8_180_pae'.
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

  0%|          | 0/557 [00:00<?, ?it/s]

100%|██████████| 557/557 [00:06<00:00, 87.31it/s] 


In [60]:
# Left-join the radius-8 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [61]:
# Calculate full sphere exposure -> radius = 12
# Same call with max_dist=12; yields column 'nAA_12_180_pae'.
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 557/557 [00:06<00:00, 80.36it/s] 


In [62]:
# Left-join the radius-12 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [63]:
# Calculate full sphere exposure -> radius = 18
# Same call with max_dist=18; yields column 'nAA_18_180_pae'.
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

  0%|          | 0/557 [00:00<?, ?it/s]

100%|██████████| 557/557 [00:08<00:00, 69.14it/s] 


In [64]:
# Left-join the radius-18 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [65]:
# Calculate full sphere exposure -> radius = 24
# Same call with max_dist=24; yields column 'nAA_24_180_pae', whose smoothed version
# drives the IDR flag further down.
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 557/557 [00:09<00:00, 59.66it/s] 


In [66]:
# Left-join the radius-24 result onto the running accessibility frame.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1;

In [67]:
# Calculate part sphere exposure -> angle = 70, radius = 12
# Unlike the full-sphere cells, max_angle=70 restricts the neighbourhood angularly;
# yields column 'nAA_12_70_pae'.
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_ChURRO_1, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 557/557 [00:07<00:00, 74.75it/s] 


In [68]:
# Left-join the part-sphere (angle 70, radius 12) result; this completes the accessibility
# frame, which is displayed below.
# Not idempotent: re-running this cell alone merges the column again (_x/_y suffixes).
alphafold_accessibility_ChURRO_1 = alphafold_accessibility_ChURRO_1.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_ChURRO_1

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A0A2R8Y4L2,1,M,1,84.87,16.672,16.441,15.737,15.649,-4.743,...,1,1,1,1,1,1,6,12,19,0
1,A0A2R8Y4L2,1,R,2,89.95,18.394,18.171,19.389,17.831,-7.753,...,2,2,3,3,6,6,12,13,23,1
2,A0A2R8Y4L2,1,D,3,89.61,20.042,18.508,17.905,18.174,-10.302,...,2,3,3,4,4,5,10,13,19,2
3,A0A2R8Y4L2,1,P,4,93.86,22.804,22.046,22.252,20.600,-11.701,...,2,2,2,3,4,4,9,12,13,0
4,A0A2R8Y4L2,1,N,5,94.20,22.703,22.805,22.164,22.171,-13.138,...,2,2,2,4,4,4,6,9,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352757,Q9Y6X4,557,K,666,53.36,-44.109,-44.548,-43.702,-44.447,-47.884,...,1,2,2,2,2,2,4,7,9,0
352758,Q9Y6X4,557,A,667,54.16,-44.765,-45.645,-46.279,-44.950,-48.051,...,2,2,2,2,2,2,4,7,8,0
352759,Q9Y6X4,557,K,668,46.58,-45.037,-44.530,-44.726,-45.243,-47.928,...,1,2,2,2,2,2,4,5,7,0
352760,Q9Y6X4,557,L,669,56.53,-42.662,-44.147,-44.931,-44.103,-49.445,...,1,2,2,2,2,2,3,4,6,0


In [69]:
# Smooth every exposure column computed above.
# NOTE(review): [10] is presumably the list of smoothing window sizes - confirm against the
# structuremap docs; the resulting columns carry the suffix "_smooth10" (see the IDR cell).
alphafold_accessibility_ChURRO_1_smooth = get_smooth_score(
    alphafold_accessibility_ChURRO_1, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_ChURRO_1_smooth;

100%|██████████| 557/557 [00:01<00:00, 450.07it/s]


In [70]:
# Flag residues as IDR: 1 where the smoothed radius-24 full-sphere exposure is <= 34.27,
# otherwise 0.
# NOTE(review): 34.27 is an unexplained magic number here - presumably the cutoff from the
# structuremap tutorial/publication; confirm and cite the source.
alphafold_accessibility_ChURRO_1_smooth['IDR'] = np.where(
    alphafold_accessibility_ChURRO_1_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_ChURRO_1_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A0A2R8Y4L2,1,M,1,84.87,16.672,16.441,15.737,15.649,-4.743,...,2.090909,2.545455,3.272727,4.181818,4.545455,9.000000,13.909091,21.636364,0.818182,1
1,A0A2R8Y4L2,1,R,2,89.95,18.394,18.171,19.389,17.831,-7.753,...,2.083333,2.500000,3.250000,4.083333,4.416667,9.083333,15.166667,24.583333,0.750000,1
2,A0A2R8Y4L2,1,D,3,89.61,20.042,18.508,17.905,18.174,-10.302,...,2.076923,2.461538,3.153846,3.923077,4.307692,9.153846,16.384615,27.230769,0.692308,1
3,A0A2R8Y4L2,1,P,4,93.86,22.804,22.046,22.252,20.600,-11.701,...,2.071429,2.428571,3.071429,3.785714,4.214286,9.285714,17.928571,30.000000,0.714286,1
4,A0A2R8Y4L2,1,N,5,94.20,22.703,22.805,22.164,22.171,-13.138,...,2.066667,2.400000,3.000000,3.733333,4.200000,9.400000,19.266667,32.066667,0.800000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,Q9Y6X4,557,K,666,53.36,-44.109,-44.548,-43.702,-44.447,-47.884,...,1.933333,1.933333,1.933333,1.933333,1.933333,3.733333,5.733333,8.533333,0.000000,1
666,Q9Y6X4,557,A,667,54.16,-44.765,-45.645,-46.279,-44.950,-48.051,...,1.928571,1.928571,1.928571,1.928571,1.928571,3.714286,5.714286,8.500000,0.000000,1
667,Q9Y6X4,557,K,668,46.58,-45.037,-44.530,-44.726,-45.243,-47.928,...,1.923077,1.923077,1.923077,1.923077,1.923077,3.692308,5.692308,8.384615,0.000000,1
668,Q9Y6X4,557,L,669,56.53,-42.662,-44.147,-44.931,-44.103,-49.445,...,1.916667,1.916667,1.916667,1.916667,1.916667,3.666667,5.666667,8.416667,0.000000,1


# Merge DataFrames into Full Dataset (Includes AlphaFold) - ChURRO_1

In [71]:
# Zero-index the AlphaFold residue positions so they match the zero-based
# "Methionine Location" column of the peptide table.
# The shift is applied to a derived frame instead of being written back into
# alphafold_accessibility_ChURRO_1_smooth: the in-place version subtracted 1 again
# on every re-run of this cell, silently joining peptides to the wrong residue.
alphafold_ChURRO_1_zero_indexed = alphafold_accessibility_ChURRO_1_smooth.assign(
    position=lambda af: af["position"] - 1
)

# Left join keeps every peptide; peptides without a matching AlphaFold residue
# get NaN in all AlphaFold-derived columns.
peptides_wa = peptides_cs.merge(
    alphafold_ChURRO_1_zero_indexed,
    how="left",
    left_on=["Protein ID", "Methionine Location"],
    right_on=["protein_id", "position"],
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,2.095238,4.380952,5.285714,6.714286,6.952381,15.666667,40.666667,61.238095,4.380952,0.0
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,2.380952,3.714286,5.238095,6.619048,7.047619,18.047619,57.952381,95.047619,5.380952,0.0
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,2.380952,3.857143,6.000000,7.285714,8.190476,20.333333,51.047619,87.857143,4.619048,0.0
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,2.476190,3.333333,4.857143,6.666667,8.095238,21.428571,59.619048,108.380952,5.523810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,2.428571,5.095238,6.047619,8.380952,8.476190,18.142857,36.428571,52.190476,4.619048,0.0
1029,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,6.285714,10.000000,0.095238,1.0
1030,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,2.571429,5.285714,6.095238,7.952381,8.000000,12.047619,19.904762,27.476190,2.142857,1.0
1031,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,2.523810,3.428571,4.619048,5.952381,6.904762,18.904762,55.714286,94.666667,4.857143,0.0


In [None]:
# NOTE: the residue AlphaFold reports at the mapped position is not always a methionine.
# In the recorded output below, five rows fail the check: four map to a different
# residue (the UniProt and AlphaFold sequences differ) and one has no AlphaFold match
# at all (every AlphaFold column is NaN). These rows are shown, then dropped.

# option_context widens the display only for this block and restores the previous
# setting afterwards, even if display() raises. set_option/reset_option would instead
# fall back to the pandas default and discard any user-configured value.
with pd.option_context("display.max_columns", None):
    display(peptides_wa[peptides_wa["AA"] != "M"])

# Keep only rows whose mapped AlphaFold residue is a methionine (NaN rows are excluded).
peptides_wa = peptides_wa[peptides_wa["AA"] == "M"]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
50,EIQSSNLETAM[577.3085]SVIGDR,EIQSSNLETAM[583.3160]SVIGDR,1.089386,,1.279832,,1.261323,1.268879,,,sp|Q9NX55|HYPK_HUMAN,Q9NX55,HYPK_HUMAN,HYPK,Huntingtin-interacting protein K,0.000111,1.224855,3.954171,M56,HYPK_M56,red,MATEGDVELELETETSGPERPPEKPRKHDSGAADLERVTDYAEEKE...,EIQSSNLETAMSVIGDR,45,17,EIQSSNLETA,10,55,ERVTDYAEEKEIQSSNLETA,SVIGDRRSREQKAKQEREKE,Q9NX55,502.0,Q,55.0,69.9,-14.11,-14.821,-14.402,-16.255,-11.201,-12.063,-13.538,-11.93,34.016,35.062,34.949,34.86,unstructured,unstructured,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,12.0,0.0,0.0,0.0,0.0,0.0,0.47619,0.47619,1.571429,1.571429,2.0,2.047619,2.095238,4.761905,8.428571,12.190476,0.571429,1.0
275,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAM[577.3085]ETG...,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAM[583.3160]ETG...,0.375276,0.628574,0.366605,0.496898,0.594104,0.535261,0.196503,0.497402,sp|Q8NFD5|ARI1B_HUMAN,Q8NFD5,ARI1B_HUMAN,ARID1B,AT-rich interactive domain-containing protein 1B,3.6e-05,0.461328,4.438909,M142,ARI1B_M142,red,MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG...,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSAMETGLLPNHK,109,42,EGGSAAALSSSSSSSAAAAAASSSSSSGPGSA,32,141,SSSAAAAAASSSSSSGPGSA,ETGLLPNHKLKTVGEAPAAP,Q8NFD5,418.0,G,141.0,30.16,-26.329,-27.241,,-28.344,76.067,77.156,,76.553,0.29,0.884,,1.665,unstructured,unstructured,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,7.0,10.0,0.0,0.0,0.0,0.0,0.619048,1.380952,1.47619,2.0,2.0,2.0,2.0,2.0,4.0,6.428571,9.904762,0.285714,1.0
661,AQEAAAAVM[577.3085]QAAANSAQSR,AQEAAAAVM[583.3160]QAAANSAQSR,,0.442455,0.061465,,0.231275,-0.056427,0.580257,,sp|Q8NFD5|ARI1B_HUMAN,Q8NFD5,ARI1B_HUMAN,ARID1B,AT-rich interactive domain-containing protein 1B,0.098649,0.251805,1.005906,M1020,ARI1B_M1020,grey,MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG...,AQEAAAAVMQAAANSAQSR,1011,19,AQEAAAAV,8,1019,GMGPPMPTVNRKAQEAAAAV,QAAANSAQSRQGSFPGMNQS,Q8NFD5,418.0,K,1019.0,31.07,13.037,11.897,11.484,12.313,-20.213,-19.269,-19.647,-17.839,-82.437,-82.871,-84.311,-82.847,unstructured,unstructured,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,7.0,10.0,0.0,0.0,0.0,0.0,0.428571,1.333333,1.47619,2.0,2.0,2.0,2.0,2.0,4.0,6.428571,9.809524,0.142857,1.0
773,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPM[5...,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPM[5...,,0.607134,,,,,,0.335,sp|A0A2R8Y4L2|RA1L3_HUMAN,A0A2R8Y4L2,RA1L3_HUMAN,HNRNPA1L3,Heterogeneous nuclear ribonucleoprotein A1-like 3,0.179014,0.471067,0.747112,M276,RA1L3_M276,grey,MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM...,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMK,232,45,GGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGP,43,275,GGGSYNDFGNYNNQSSNFGP,KGGNFEGRSSGPHGGGGQYF,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
828,EHM[577.3085]GNVVEALIALTN,EHM[583.3160]GNVVEALIALTN,0.253647,-0.532562,0.354471,0.270041,,0.254807,0.216633,0.227243,sp|Q9NX55|HYPK_HUMAN,Q9NX55,HYPK_HUMAN,HYPK,Huntingtin-interacting protein K,0.241736,0.149183,0.616659,M109,HYPK_M109,grey,MATEGDVELELETETSGPERPPEKPRKHDSGAADLERVTDYAEEKE...,EHMGNVVEALIALTN,106,15,EH,2,108,LIMTEMEISRAAAERSLREH,GNVVEALIALTN,Q9NX55,502.0,A,108.0,92.77,3.291,2.081,2.371,0.883,1.961,2.414,3.753,2.57,-7.576,-8.404,-9.096,-7.572,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,5.0,6.0,9.0,12.0,24.0,39.0,44.0,12.0,0.0,0.0,0.0,0.0,2.0,2.0,2.095238,3.52381,4.714286,5.904762,6.619048,15.0,33.238095,43.809524,3.47619,0.0


In [73]:
# Persist the merged, methionine-only table next to the notebook.
# The DataFrame index is written as the first (unnamed) CSV column.
peptides_wa.to_csv(
    path_or_buf=os.path.join(curr_dir_path, "ChURRO_1_with_alphafold.csv"),
)

In [74]:
# Reload the saved table so the analysis can resume from this checkpoint.
# index_col=0 restores the original row labels from the first (unnamed) CSV column
# and leaves the index without a name.
path = os.path.join(curr_dir_path, "ChURRO_1_with_alphafold.csv")
peptides_wa = pd.read_csv(path, index_col=0)
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,2.095238,4.380952,5.285714,6.714286,6.952381,15.666667,40.666667,61.238095,4.380952,0.0
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,2.380952,3.714286,5.238095,6.619048,7.047619,18.047619,57.952381,95.047619,5.380952,0.0
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,2.380952,3.857143,6.000000,7.285714,8.190476,20.333333,51.047619,87.857143,4.619048,0.0
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,2.476190,3.333333,4.857143,6.666667,8.095238,21.428571,59.619048,108.380952,5.523810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,2.428571,5.095238,6.047619,8.380952,8.476190,18.142857,36.428571,52.190476,4.619048,0.0
1029,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,6.285714,10.000000,0.095238,1.0
1030,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,2.571429,5.285714,6.095238,7.952381,8.000000,12.047619,19.904762,27.476190,2.142857,1.0
1031,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,2.523810,3.428571,4.619048,5.952381,6.904762,18.904762,55.714286,94.666667,4.857143,0.0


## The End (For Now)