## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

structuremap.utils.set_logger()

## Set Parameters of Analysis

In [2]:
# Analysis parameters - the rest of the notebook reads these names

# Number of residues taken on each side of the modified methionine
analysis_threshold = 20

# Mass tags, written exactly as they appear inside the square brackets of the peptide strings
light_modification = "649.3660"
heavy_modification = "655.3735"

# Every tag to look for (plain strings; they are turned into a regex further down)
modifications = [light_modification, heavy_modification]

## Load Dataset - Ox32yne

In [3]:
# Resolve the notebook directory and the shared data directory to absolute paths

curr_dir_path_str = "./"
global_data_path_str = "../global_data"

curr_dir_path, global_data_path = (
    os.path.abspath(relative_path)
    for relative_path in (curr_dir_path_str, global_data_path_str)
)

print(f"Current Directory: {curr_dir_path}")
print(f"Global Data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/ChURRO_revisions
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Read the peptide table stored next to this notebook
dataset_filename = "ChURRO_Ox32yne_isoDTB_293T.csv"
data_loc = os.path.join(curr_dir_path, dataset_filename)
peptides = pd.read_csv(data_loc)
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Protein Description,pvalue,avg ratio,neglogpvalue
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,5.847717,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,Cytoskeleton-associated protein 4,1.634911e-05,6.486397,4.786506
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,5.759984,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,Proteasome subunit alpha type-1,6.741449e-05,6.336632,4.171247
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,3.758318,sp|P08238|HS90B_HUMAN,P08238,HS90B_HUMAN,Heat shock protein HSP 90-beta,6.241545e-10,3.806415,9.204708
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,2.647510,sp|P15121|ALDR_HUMAN,P15121,ALDR_HUMAN,Aldo-keto reductase family 1 member B1,5.401038e-06,3.038070,5.267523
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,2.647917,sp|P34932|HSP74_HUMAN,P34932,HSP74_HUMAN,Heat shock 70 kDa protein 4,3.372263e-03,2.373026,2.472079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,SQM[649.3660]LAMNIEK,SQM[655.3735]LAMNIEK,-0.976846,0.490382,-0.027769,-0.043911,0.465599,,-0.377333,0.288108,0.129854,sp|P60228|EIF3E_HUMAN,P60228,EIF3E_HUMAN,Eukaryotic translation initiation factor 3 sub...,9.709520e-01,-0.006489,0.012802
812,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,-0.239224,sp|Q9Y6D9|MD1L1_HUMAN,Q9Y6D9,MD1L1_HUMAN,Mitotic spindle assembly checkpoint protein MAD1,9.773242e-01,-0.008231,0.009961
813,M[649.3660]QAQMQM[15.9949]QMQGGDGDGGALGHHV,M[655.3735]QAQMQM[15.9949]QMQGGDGDGGALGHHV,,,,0.149688,0.574267,0.076001,-0.699690,-0.327852,0.249183,sp|Q15019|SEPT2_HUMAN,Q15019,SEPT2_HUMAN,Septin-2,9.851589e-01,0.003600,0.006494
814,RGVM[649.3660]LAVDAVIAELKK,RGVM[655.3735]LAVDAVIAELKK,-1.596362,2.972889,-1.951015,-2.180257,0.516077,0.425237,0.695950,0.483076,0.554548,sp|P10809|CH60_HUMAN,P10809,CH60_HUMAN,"60 kDa heat shock protein, mitochondrial",9.874185e-01,-0.008873,0.005499


In [5]:
# Canonicalize data - none to do here
peptides;

In [6]:
# Manual labeling of peptides.
# Colours are assigned purely by row order: the first 308 rows are "red", the next 87 "blue"
# and the remaining 421 "grey", so the CSV must keep the row order it was labelled in.
label_counts = {"red": 308, "blue": 87, "grey": 421}
label_col_data = [color for color, count in label_counts.items() for _ in range(count)]

# Fail loudly if the dataset no longer has the number of rows the labels were written for;
# otherwise pandas would silently leave surplus rows unlabelled (NaN) or drop surplus labels.
assert len(label_col_data) == len(peptides), (
    f"{len(label_col_data)} manual labels for {len(peptides)} peptides"
)

# Attach the labels by position, whatever index the frame carries
label_col = pd.Series(label_col_data, index=peptides.index)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [7]:
# Distinct UniProt accessions present in the peptide table
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['Q07065' 'P25786' 'P08238' 'P15121' 'P34932' 'P04406' 'Q96CT7' 'Q15717'
 'Q14204' 'P47897' 'P12694' 'Q15366' 'P14625' 'P15374' 'Q9H078' 'P31948'
 'P11940' 'Q16836' 'Q12907' 'P26038' 'P13639' 'P07814' 'P67870' 'P10809'
 'P15170' 'Q16181' 'O95347' 'P07437' 'P68371' 'Q86UP2' 'P68363' 'Q13283'
 'P0DP23' 'P55196' 'Q07866' 'Q86Y82' 'P11142' 'P68104' 'P78371' 'Q15233'
 'P35579' 'P22061' 'P80303' 'P52272' 'P50402' 'P63261' 'P83731' 'Q9UHD8'
 'Q9NR31' 'P62829' 'Q9P013' 'P61978' 'O14950' 'P46777' 'O43776' 'Q9P2E9'
 'Q08211' 'P15311' 'P54819' 'P00367' 'Q6PKG0' 'P09012' 'Q9Y266' 'Q9Y520'
 'Q02818' 'P27816' 'Q9UQE7' 'O43143' 'Q4VCS5' 'P18669' 'Q9UMX0' 'Q12904'
 'Q9Y383' 'P09874' 'P35241' 'Q13404' 'P23588' 'Q9H910' 'Q9H1E3' 'P46778'
 'P49368' 'P38646' 'O60763' 'O14562' 'P13693' 'Q9H3P7' 'Q9UHX1' 'Q16891'
 'Q9Y3U8' 'Q13263' 'P78344' 'P31942' 'O00193' 'Q13586' 'O95983' 'P62847'
 'P26368' 'P23246' 'P07910' 'P40222' 'Q02790' 'Q86U42' 'Q14126' 'Q14247'
 'P29692' 'Q14157' 'P49750' 'Q

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    """Fetch the amino-acid sequence of one UniProt entry.

    Parameters
    ----------
    cID : str
        UniProt accession, e.g. "Q07065".

    Returns
    -------
    str
        Sequence of the first record in the FASTA text returned by UniProt.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an error status (unknown accession, outage, ...).
    ValueError
        If the response contains no FASTA record.
    """
    # FASTA is served by UniProt's REST endpoint; www.uniprot.org/uniprot/ is a legacy URL.
    # This is a plain read, so use GET rather than POST.
    currentUrl = "https://rest.uniprot.org/uniprotkb/{}.fasta".format(cID)
    # A timeout keeps one stalled request from hanging the whole cache build.
    response = r.get(currentUrl, timeout=30)
    # Surface HTTP errors here instead of trying to parse an error page as FASTA.
    response.raise_for_status()

    pSeq = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not pSeq:
        raise ValueError("No FASTA record returned by UniProt for {!r}".format(cID))

    return str(pSeq[0].seq)

In [9]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [10]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")  # use the "Unnamed: 0" column of the CSV as the index
    .rename_axis(None)        # and drop that label from the index again
)
sequence_cache_df_updated;

In [11]:
# Attach the full protein sequence to every peptide ("cs" means "complete sequence")
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")

# A left merge repeats a peptide row once per cache row with the same Protein ID,
# which would double-count those peptides in everything downstream.
assert len(peptides_cs) == len(peptides), (
    "Sequence cache holds duplicate 'Protein ID' entries for proteins in this dataset"
)

# Proteins absent from the cache come back as NaN and would only fail later, with an
# unrelated-looking error, when the sequence is searched - report them here instead.
missing_sequence_ids = peptides_cs.loc[
    peptides_cs["Complete Sequence"].isna(), "Protein ID"
].unique()
assert len(missing_sequence_ids) == 0, (
    f"No cached sequence for: {list(missing_sequence_ids)}"
)

peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Protein Description,pvalue,avg ratio,neglogpvalue,Color,Complete Sequence
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,5.847717,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,Cytoskeleton-associated protein 4,1.634911e-05,6.486397,4.786506,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,5.759984,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,Proteasome subunit alpha type-1,6.741449e-05,6.336632,4.171247,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,3.758318,sp|P08238|HS90B_HUMAN,P08238,HS90B_HUMAN,Heat shock protein HSP 90-beta,6.241545e-10,3.806415,9.204708,red,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,2.647510,sp|P15121|ALDR_HUMAN,P15121,ALDR_HUMAN,Aldo-keto reductase family 1 member B1,5.401038e-06,3.038070,5.267523,red,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,2.647917,sp|P34932|HSP74_HUMAN,P34932,HSP74_HUMAN,Heat shock 70 kDa protein 4,3.372263e-03,2.373026,2.472079,red,MSVVGIDLGFQSCYVAVARAGGIETIANEYSDRCTPACISFGPKNR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,SQM[649.3660]LAMNIEK,SQM[655.3735]LAMNIEK,-0.976846,0.490382,-0.027769,-0.043911,0.465599,,-0.377333,0.288108,0.129854,sp|P60228|EIF3E_HUMAN,P60228,EIF3E_HUMAN,Eukaryotic translation initiation factor 3 sub...,9.709520e-01,-0.006489,0.012802,grey,MAEYDLTTRIAHFLDRHLVFPLLEFLSVKEIYNEKELLQGKLDLLS...
812,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,-0.239224,sp|Q9Y6D9|MD1L1_HUMAN,Q9Y6D9,MD1L1_HUMAN,Mitotic spindle assembly checkpoint protein MAD1,9.773242e-01,-0.008231,0.009961,grey,MEDLGENTMVLSTLRSLNNFISQRVEGGSGLDISTSAPGSLQMQYQ...
813,M[649.3660]QAQMQM[15.9949]QMQGGDGDGGALGHHV,M[655.3735]QAQMQM[15.9949]QMQGGDGDGGALGHHV,,,,0.149688,0.574267,0.076001,-0.699690,-0.327852,0.249183,sp|Q15019|SEPT2_HUMAN,Q15019,SEPT2_HUMAN,Septin-2,9.851589e-01,0.003600,0.006494,grey,MSKQQPTQFINPETPGYVGFANLPNQVHRKSVKKGFEFTLMVVGES...
814,RGVM[649.3660]LAVDAVIAELKK,RGVM[655.3735]LAVDAVIAELKK,-1.596362,2.972889,-1.951015,-2.180257,0.516077,0.425237,0.695950,0.483076,0.554548,sp|P10809|CH60_HUMAN,P10809,CH60_HUMAN,"60 kDa heat shock protein, mitochondrial",9.874185e-01,-0.008873,0.005499,grey,MLRLPTVFRQMRPVSRVLAPHLTRAYAKDVKFGADARALMLQGVDL...


In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    """Build a regex matching a methionine that carries any of the given mass tags.

    Parameters
    ----------
    modifications : sequence of str
        Mass tags exactly as they appear between the square brackets of a
        peptide string, e.g. ``["649.3660", "655.3735"]``.

    Returns
    -------
    str
        One ``M[<tag>]`` term per tag (brackets and tag escaped for literal
        matching), joined with ``|`` in the order given.

    Raises
    ------
    ValueError
        If ``modifications`` is empty; an empty pattern would match every string.
    """
    import re  # local import so the helper does not depend on the notebook's import cell

    if len(modifications) == 0:
        raise ValueError("modifications must contain at least one mass tag")

    # re.escape neutralises the decimal point (and any other regex metacharacter), so tags
    # with no decimal point or with several are handled as well.
    return "|".join(r"M\[{}\]".format(re.escape(tag)) for tag in modifications)

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
# Reduce each modified peptide to its bare residues (for indexing purposes):
# every character that is not one of the 20 one-letter codes below is removed.
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"

def filtering(string):
    """Return `string` without the characters that are not in `IUPACCodes`."""
    return ''.join(filter(IUPACCodes.__contains__, string))

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [14]:
# 0-based offset of each peptide inside its protein sequence (str.find gives -1 if absent)
sequence_starts = [
    full_sequence.find(peptide)
    for full_sequence, peptide in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])
]
peptides_cs["Sequence Location"] = pd.Series(sequence_starts)
peptides_cs;

In [15]:
# Number of residues in the cleaned peptide (modification tags already stripped)
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [16]:
# Sanity check - slicing the full sequence at the computed location must give the peptide back.
# A peptide that str.find could not locate (-1) produces a mismatch here.
recovered_peptides = [
    A[B:B+C]
    for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])
]
location_is_correct = peptides_cs["Peptide Sequence"] == recovered_peptides

# Enforce the check so that Run All stops here instead of carrying wrong offsets forward
assert location_is_correct.all(), (
    f"{(~location_is_correct).sum()} peptides were not found in their protein sequence"
)
location_is_correct.value_counts()

Peptide Sequence
True    816
Name: count, dtype: int64

In [17]:
# Extract left prefix of modified methionine (for indexing purposes).
# Reuses `filtering`, the residue-only filter defined earlier for "Peptide Sequence",
# instead of defining it a second time.

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

# NOTE: `.*` is greedy, so for a peptide carrying several tagged methionines the prefix
# runs up to the LAST tagged one; only that site is located by this column.
left_prefix_raw = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]

# A peptide with no tagged methionine yields NaN, which `filtering` cannot iterate over
assert left_prefix_raw.notna().all(), "Some peptides carry none of the expected modifications"

peptides_cs["Left Prefix"] = left_prefix_raw.map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [18]:
# 0-based index of the modified methionine in the full protein sequence:
# start of the peptide plus the number of residues preceding the methionine in the peptide
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [19]:
# Sanity check - every computed location must point at an "M" in the full sequence
residues_at_location = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
all_methionine = residues_at_location.count("M") == len(residues_at_location)

# Enforce the check so that Run All stops here instead of analysing the wrong residues
assert all_methionine, "Some 'Methionine Location' values do not point at a methionine"
all_methionine

True

In [20]:
# Compute left/right analysis sequences based on threshold.
# Both windows are clipped at the protein termini, so they can be shorter than
# `analysis_threshold` (and empty for a methionine at the very first or last residue).
def _flanking_windows(sequence, position, width):
    """Return (left, right): up to `width` residues before and after `sequence[position]`."""
    # max(0, ...) keeps the start from going negative, which Python would read as an
    # offset from the END of the sequence; the window always ends right before `position`.
    left = sequence[max(0, position - width):position]
    right = sequence[position + 1:position + 1 + width]
    return left, right

flanks = [
    _flanking_windows(A, B, analysis_threshold)
    for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
peptides_cs[f"Left {analysis_threshold}"] = [left for left, _ in flanks]
peptides_cs[f"Right {analysis_threshold}"] = [right for _, right in flanks]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,red,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...,YHTSQSGDEMTSLSEYVSR,456,19,YHTSQSGDE,9,465,TNRRRLSELLRYHTSQSGDE,TSLSEYVSRMKETQKSIYYI
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,red,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...,AIGISNFNHLQVEMILNKPGLK,155,22,AIGISNFNHLQVE,13,168,VDEGLVKAIGISNFNHLQVE,ILNKPGLKYKPAVNQIECHP
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,red,MSVVGIDLGFQSCYVAVARAGGIETIANEYSDRCTPACISFGPKNR...,TSTVDLPIENQLLWQIDREMLNLYIENEGKMIMQDK,573,36,TSTVDLPIENQLLWQIDREMLNLYIENEGK,30,603,QLLWQIDREMLNLYIENEGK,IMQDKLEKERNDAKNAVEEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,SQM[649.3660]LAMNIEK,SQM[655.3735]LAMNIEK,-0.976846,0.490382,-0.027769,-0.043911,0.465599,,-0.377333,0.288108,...,grey,MAEYDLTTRIAHFLDRHLVFPLLEFLSVKEIYNEKELLQGKLDLLS...,SQMLAMNIEK,414,10,SQ,2,416,AVSPYQQVIEKTKSLSFRSQ,LAMNIEKKLNQNSRSEAPNW
812,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,grey,MEDLGENTMVLSTLRSLNNFISQRVEGGSGLDISTSAPGSLQMQYQ...,LSLQEQDAAIVKNMK,231,15,LSLQEQDAAIVKN,13,244,IKDLEQKLSLQEQDAAIVKN,KSELVRLPRLERELKQLREE
813,M[649.3660]QAQMQM[15.9949]QMQGGDGDGGALGHHV,M[655.3735]QAQMQM[15.9949]QMQGGDGDGGALGHHV,,,,0.149688,0.574267,0.076001,-0.699690,-0.327852,...,grey,MSKQQPTQFINPETPGYVGFANLPNQVHRKSVKKGFEFTLMVVGES...,MQAQMQMQMQGGDGDGGALGHHV,338,23,,0,338,DQILLEKEAELRRMQEMIAR,QAQMQMQMQGGDGDGGALGH
814,RGVM[649.3660]LAVDAVIAELKK,RGVM[655.3735]LAVDAVIAELKK,-1.596362,2.972889,-1.951015,-2.180257,0.516077,0.425237,0.695950,0.483076,...,grey,MLRLPTVFRQMRPVSRVLAPHLTRAYAKDVKFGADARALMLQGVDL...,RGVMLAVDAVIAELKK,141,16,RGV,3,144,KEGFEKISKGANPVEIRRGV,LAVDAVIAELKKQSKPVTTP


In [21]:
# Drop the peptides of proteins that are invalid according to alphafold (listed by hand)
# 10 peptides are removed as a result -> 5 red, 1 blue, 4 grey

invalid_IDs = ['Q14204', 'Q9Y520', 'P46013']
belongs_to_invalid_protein = peptides_cs["Protein ID"].isin(invalid_IDs)
peptides_cs = peptides_cs.loc[~belongs_to_invalid_protein]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,red,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...,YHTSQSGDEMTSLSEYVSR,456,19,YHTSQSGDE,9,465,TNRRRLSELLRYHTSQSGDE,TSLSEYVSRMKETQKSIYYI
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,red,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...,AIGISNFNHLQVEMILNKPGLK,155,22,AIGISNFNHLQVE,13,168,VDEGLVKAIGISNFNHLQVE,ILNKPGLKYKPAVNQIECHP
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,red,MSVVGIDLGFQSCYVAVARAGGIETIANEYSDRCTPACISFGPKNR...,TSTVDLPIENQLLWQIDREMLNLYIENEGKMIMQDK,573,36,TSTVDLPIENQLLWQIDREMLNLYIENEGK,30,603,QLLWQIDREMLNLYIENEGK,IMQDKLEKERNDAKNAVEEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,SQM[649.3660]LAMNIEK,SQM[655.3735]LAMNIEK,-0.976846,0.490382,-0.027769,-0.043911,0.465599,,-0.377333,0.288108,...,grey,MAEYDLTTRIAHFLDRHLVFPLLEFLSVKEIYNEKELLQGKLDLLS...,SQMLAMNIEK,414,10,SQ,2,416,AVSPYQQVIEKTKSLSFRSQ,LAMNIEKKLNQNSRSEAPNW
812,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,grey,MEDLGENTMVLSTLRSLNNFISQRVEGGSGLDISTSAPGSLQMQYQ...,LSLQEQDAAIVKNMK,231,15,LSLQEQDAAIVKN,13,244,IKDLEQKLSLQEQDAAIVKN,KSELVRLPRLERELKQLREE
813,M[649.3660]QAQMQM[15.9949]QMQGGDGDGGALGHHV,M[655.3735]QAQMQM[15.9949]QMQGGDGDGGALGHHV,,,,0.149688,0.574267,0.076001,-0.699690,-0.327852,...,grey,MSKQQPTQFINPETPGYVGFANLPNQVHRKSVKKGFEFTLMVVGES...,MQAQMQMQMQGGDGDGGALGHHV,338,23,,0,338,DQILLEKEAELRRMQEMIAR,QAQMQMQMQGGDGDGGALGH
814,RGVM[649.3660]LAVDAVIAELKK,RGVM[655.3735]LAVDAVIAELKK,-1.596362,2.972889,-1.951015,-2.180257,0.516077,0.425237,0.695950,0.483076,...,grey,MLRLPTVFRQMRPVSRVLAPHLTRAYAKDVKFGADARALMLQGVDL...,RGVMLAVDAVIAELKK,141,16,RGV,3,144,KEGFEKISKGANPVEIRRGV,LAVDAVIAELKKQSKPVTTP


## Download AlphaFold Data - Ox32yne

In [22]:
# Folders holding the alphafold protein data: one for cif files, one for pae files

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir, pae_dir = (os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae"))

for location in (alphafold_path, cif_dir, pae_dir):
    print(location)

/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


In [23]:
# UniProt accessions that remain after the invalid proteins were removed
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['Q07065' 'P25786' 'P08238' 'P15121' 'P34932' 'P04406' 'Q96CT7' 'Q15717'
 'P47897' 'P12694' 'Q15366' 'P14625' 'P15374' 'Q9H078' 'P31948' 'P11940'
 'Q16836' 'Q12907' 'P26038' 'P13639' 'P07814' 'P67870' 'P10809' 'P15170'
 'Q16181' 'O95347' 'P07437' 'P68371' 'Q86UP2' 'P68363' 'Q13283' 'P0DP23'
 'P55196' 'Q07866' 'Q86Y82' 'P11142' 'P68104' 'P78371' 'Q15233' 'P35579'
 'P22061' 'P80303' 'P52272' 'P50402' 'P63261' 'P83731' 'Q9UHD8' 'Q9NR31'
 'P62829' 'Q9P013' 'P61978' 'O14950' 'P46777' 'O43776' 'Q9P2E9' 'Q08211'
 'P15311' 'P54819' 'P00367' 'Q6PKG0' 'P09012' 'Q9Y266' 'Q02818' 'P27816'
 'Q9UQE7' 'O43143' 'Q4VCS5' 'P18669' 'Q9UMX0' 'Q12904' 'Q9Y383' 'P09874'
 'P35241' 'Q13404' 'P23588' 'Q9H910' 'Q9H1E3' 'P46778' 'P49368' 'P38646'
 'O60763' 'O14562' 'P13693' 'Q9H3P7' 'Q9UHX1' 'Q16891' 'Q9Y3U8' 'Q13263'
 'P78344' 'P31942' 'O00193' 'Q13586' 'O95983' 'P62847' 'P26368' 'P23246'
 'P07910' 'P40222' 'Q02790' 'Q86U42' 'Q14126' 'Q14247' 'P29692' 'Q14157'
 'P49750' 'Q00341' 'O43852' 'Q

In [24]:
# Download cif data for proteins into cif_dir
# SLOW THE FIRST TIME - caches the relevant cif data
cif_download_summary = download_alphafold_cif(proteins=unique_uniprotIDs, out_folder=cif_dir)
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = cif_download_summary

100%|██████████| 379/379 [00:00<00:00, 190171.22it/s]

2025-02-23 14:16:32> Valid proteins: 0
2025-02-23 14:16:32> Invalid proteins: 0
2025-02-23 14:16:32> Existing proteins: 379





In [25]:
# Download pae data for proteins into pae_dir
# SLOW THE FIRST TIME - caches the relevant pae data
pae_download_summary = download_alphafold_pae(proteins=unique_uniprotIDs, out_folder=pae_dir)
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = pae_download_summary

100%|██████████| 379/379 [00:00<00:00, 159522.45it/s]

2025-02-23 14:16:32> Valid proteins: 0
2025-02-23 14:16:32> Invalid proteins: 0
2025-02-23 14:16:32> Existing proteins: 379





## Construct AlphaFold DataFrame (Calculate Accessibilities) - Ox32yne

In [26]:
# Read the cif files of the selected proteins from cif_dir into a single dataframe
alphafold_annotation_Ox32yne = format_alphafold_data(directory=cif_dir, protein_ids=unique_uniprotIDs)
alphafold_annotation_Ox32yne

100%|██████████| 1748/1748 [00:33<00:00, 52.18it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,13.872,14.366,13.031,unstructured,unstructured,0,0,0,0,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,12.453,11.875,13.105,unstructured,unstructured,0,0,0,0,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,14.369,14.189,13.491,unstructured,unstructured,0,0,0,0,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,15.056,16.402,15.048,unstructured,unstructured,0,0,0,0,1
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,13.647,12.336,13.764,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224477,Q9Y6D9,379,R,714,81.93,-73.042,-72.792,-72.235,-71.898,-32.965,...,-3.568,-3.341,-2.958,HELX_RH_AL_P,HELX,0,1,0,0,0
224478,Q9Y6D9,379,Q,715,74.30,-72.910,-72.199,-70.839,-72.029,-35.063,...,-7.239,-7.954,-5.817,HELX_RH_AL_P,HELX,0,1,0,0,0
224479,Q9Y6D9,379,T,716,69.42,-74.822,-73.426,-72.488,-72.836,-37.516,...,-6.687,-6.171,-6.540,HELX_RH_AL_P,HELX,0,1,0,0,0
224480,Q9Y6D9,379,V,717,56.75,-77.727,-76.694,-76.633,-75.341,-35.784,...,-4.741,-3.243,-5.349,unstructured,unstructured,0,0,0,0,1


In [27]:
# Calculate exposure for every search geometry and collect the results in one dataframe.
# Each entry is (max_dist, max_angle): max_angle=180 is the full-sphere exposure with the
# given radius, max_angle=70 the part-sphere exposure. This replaces one copy-pasted
# annotate/merge cell pair per setting with a single list that is easy to extend.
EXPOSURE_SETTINGS = [
    # full sphere, radius 2 ... 24
    (2, 180), (3, 180), (4, 180), (4.5, 180), (5, 180), (5.5, 180),
    (6, 180), (6.5, 180), (7, 180), (7.5, 180), (8, 180),
    (12, 180), (18, 180), (24, 180),
    # part sphere, angle 70, radius 12
    (12, 70),
]

# Start from the annotation frame; every merge below returns a new frame, so the
# annotation itself is never modified and this cell can be re-run safely.
alphafold_accessibility_Ox32yne = alphafold_annotation_Ox32yne

# Per-setting results, keyed by (max_dist, max_angle), kept for inspection
exposure_by_setting = {}

for max_dist, max_angle in EXPOSURE_SETTINGS:
    # Always computed from the plain annotation frame, not from the accumulated one
    exposure = annotate_accessibility(
        df=alphafold_annotation_Ox32yne,
        max_dist=max_dist,
        max_angle=max_angle,
        error_dir=pae_dir)
    exposure_by_setting[(max_dist, max_angle)] = exposure

    alphafold_accessibility_Ox32yne = alphafold_accessibility_Ox32yne.merge(
        exposure, how='left', on=['protein_id','AA','position'])

alphafold_accessibility_Ox32yne

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1,1,1,1,1,1,2,3,5,0
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,1,2,2,2,2,2,3,4,6,0
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,1,2,2,2,2,2,4,5,7,0
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,1,2,2,2,2,2,4,6,7,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,1,2,2,2,2,2,4,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224477,Q9Y6D9,379,R,714,81.93,-73.042,-72.792,-72.235,-71.898,-32.965,...,2,2,3,4,5,5,11,23,51,2
224478,Q9Y6D9,379,Q,715,74.30,-72.910,-72.199,-70.839,-72.029,-35.063,...,2,2,2,2,3,4,9,18,43,2
224479,Q9Y6D9,379,T,716,69.42,-74.822,-73.426,-72.488,-72.836,-37.516,...,2,2,2,2,2,3,7,13,22,2
224480,Q9Y6D9,379,V,717,56.75,-77.727,-76.694,-76.633,-75.341,-35.784,...,1,2,2,2,2,2,5,8,15,2


In [57]:
# Smooth every exposure column with window setting 10
# (the result carries one "<column>_smooth10" column per input column)
columns_to_smooth = np.array([
    'nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae',
    'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae',
    'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae',
    'nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae',
    'nAA_12_70_pae',
])
smoothing_windows = [10]

alphafold_accessibility_Ox32yne_smooth = get_smooth_score(
    alphafold_accessibility_Ox32yne, columns_to_smooth, smoothing_windows)
alphafold_accessibility_Ox32yne_smooth;

100%|██████████| 379/379 [00:00<00:00, 460.05it/s]


In [58]:
# Flag a residue as IDR (1) when its smoothed radius-24 full-sphere count is at most 34.27, else 0
# NOTE(review): 34.27 looks like the cutoff used in the structuremap tutorial - confirm
smoothed_count_rad24 = alphafold_accessibility_Ox32yne_smooth['nAA_24_180_pae_smooth10']
alphafold_accessibility_Ox32yne_smooth['IDR'] = (smoothed_count_rad24 <= 34.27).astype(int)
alphafold_accessibility_Ox32yne_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1.909091,1.909091,1.909091,1.909091,1.909091,3.818182,6.636364,12.181818,0.000000,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,2.083333,2.166667,2.166667,2.250000,2.250000,4.750000,9.916667,20.500000,0.416667,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,2.153846,2.230769,2.307692,2.615385,2.692308,5.538462,13.076923,28.923077,0.615385,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,2.214286,2.285714,2.571429,3.000000,3.071429,6.642857,16.857143,38.000000,0.714286,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,2.400000,2.533333,2.866667,3.466667,3.600000,8.133333,20.533333,47.666667,1.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713,Q9Y6D9,379,R,714,81.93,-73.042,-72.792,-72.235,-71.898,-32.965,...,2.066667,4.000000,4.733333,5.933333,6.200000,12.733333,34.733333,56.066667,3.066667,0
714,Q9Y6D9,379,Q,715,74.30,-72.910,-72.199,-70.839,-72.029,-35.063,...,2.071429,4.000000,4.642857,5.785714,6.071429,12.142857,32.714286,54.142857,2.928571,0
715,Q9Y6D9,379,T,716,69.42,-74.822,-73.426,-72.488,-72.836,-37.516,...,2.076923,3.923077,4.538462,5.615385,5.923077,12.000000,31.461538,52.230769,3.000000,0
716,Q9Y6D9,379,V,717,56.75,-77.727,-76.694,-76.633,-75.341,-35.784,...,2.083333,3.833333,4.416667,5.416667,5.750000,11.666667,30.166667,50.500000,2.833333,0


# Merge DataFrames into Full Dataset (Includes AlphaFold) - Ox32yne

In [59]:
# Attach the AlphaFold-derived columns to every peptide row, matching on
# protein ID and methionine position ("wa" means "with alphafold").
#
# The AlphaFold `position` column is shifted by -1 to zero-index it so that it
# matches "Methionine Location" in the initial dataframe. The shift is done on
# a derived frame via .assign() instead of in place: mutating
# alphafold_accessibility_Ox32yne_smooth here would subtract 1 again on every
# re-run of this cell and silently misalign the merge keys.
alphafold_zero_indexed = alphafold_accessibility_Ox32yne_smooth.assign(
    position=lambda frame: frame["position"] - 1
)

peptides_wa = peptides_cs.merge(
    alphafold_zero_indexed,
    how="left",  # keep every peptide row, even without an AlphaFold match
    left_on=["Protein ID", "Methionine Location"],
    right_on=["protein_id", "position"],
)
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,2.428571,3.952381,4.476190,5.523810,6.761905,18.714286,54.571429,106.047619,5.333333,0
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,2.523810,3.761905,4.666667,6.380952,7.238095,22.333333,70.333333,127.857143,6.619048,0
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,2.571429,5.380952,6.142857,8.238095,8.523810,19.190476,50.190476,111.476190,5.380952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
801,SQM[649.3660]LAMNIEK,SQM[655.3735]LAMNIEK,-0.976846,0.490382,-0.027769,-0.043911,0.465599,,-0.377333,0.288108,...,2.380952,3.666667,4.761905,5.761905,6.285714,10.904762,17.380952,22.952381,2.047619,1
802,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,1.809524,1.809524,2.142857,2.666667,3.000000,8.047619,14.857143,20.857143,1.142857,1
803,M[649.3660]QAQMQM[15.9949]QMQGGDGDGGALGHHV,M[655.3735]QAQMQM[15.9949]QMQGGDGDGGALGHHV,,,,0.149688,0.574267,0.076001,-0.699690,-0.327852,...,1.952381,3.000000,3.809524,5.000000,5.380952,10.142857,15.666667,21.380952,1.857143,1
804,RGVM[649.3660]LAVDAVIAELKK,RGVM[655.3735]LAVDAVIAELKK,-1.596362,2.972889,-1.951015,-2.180257,0.516077,0.425237,0.695950,0.483076,...,2.904762,5.333333,6.238095,8.476190,9.000000,26.428571,63.047619,115.666667,7.809524,0


In [60]:
# NOTE: one peptide sequence differs between UniProt & AlphaFold (UniProt's Q9NX55 M109)

# True where the merged AlphaFold residue is a methionine. Every other row is
# shown for inspection and then dropped; a row whose "AA" is missing would also
# compare unequal to "M" and be dropped.
is_methionine = peptides_wa["AA"] == "M"

# option_context shows all columns only inside this block and restores the
# previous display setting afterwards, even if display() raises.
with pd.option_context("display.max_columns", None):
    display(peptides_wa[~is_methionine])

peptides_wa = peptides_wa[is_methionine]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Protein Description,pvalue,avg ratio,neglogpvalue,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
457,EHM[649.3660]GNVVEALIALTN,EHM[655.3735]GNVVEALIALTN,0.269065,0.034308,0.023707,0.212029,0.74948,0.535874,-0.350295,0.805685,,sp|Q9NX55|HYPK_HUMAN,Q9NX55,HYPK_HUMAN,Huntingtin-interacting protein K,0.080406,0.284982,1.094711,grey,MATEGDVELELETETSGPERPPEKPRKHDSGAADLERVTDYAEEKE...,EHMGNVVEALIALTN,106,15,EH,2,108,LIMTEMEISRAAAERSLREH,GNVVEALIALTN,Q9NX55,342,A,108,92.77,3.291,2.081,2.371,0.883,1.961,2.414,3.753,2.57,-7.576,-8.404,-9.096,-7.572,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,2,2,2,5,6,9,12,24,39,44,12,0.0,0.0,0.0,0.0,2.0,2.0,2.095238,3.52381,4.714286,5.904762,6.619048,15.0,33.238095,43.809524,3.47619,0


In [61]:
# Checkpoint write, intentionally left disabled: uncomment to (re)write the
# "Ox32yne_with_alphafold.csv" file that the next cell reads back.
#peptides_wa.to_csv(os.path.join(curr_dir_path, "Ox32yne_with_alphafold.csv"))

In [62]:
# Reload the merged peptide/AlphaFold table from its CSV checkpoint.
path = os.path.join(curr_dir_path, "Ox32yne_with_alphafold.csv")

# Write the checkpoint only when it does not exist yet, so a fresh
# Restart & Run All does not stop here with FileNotFoundError. An existing
# checkpoint is never overwritten by this cell.
if not os.path.exists(path):
    peptides_wa.to_csv(path)

# The row index is stored in the CSV's "Unnamed: 0" column; restore it as the
# index and clear its name so the frame displays like the in-memory original.
peptides_wa = pd.read_csv(path).set_index("Unnamed: 0")
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,2.428571,3.952381,4.476190,5.523810,6.761905,18.714286,54.571429,106.047619,5.333333,0
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,2.523810,3.761905,4.666667,6.380952,7.238095,22.333333,70.333333,127.857143,6.619048,0
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,2.571429,5.380952,6.142857,8.238095,8.523810,19.190476,50.190476,111.476190,5.380952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
801,SQM[649.3660]LAMNIEK,SQM[655.3735]LAMNIEK,-0.976846,0.490382,-0.027769,-0.043911,0.465599,,-0.377333,0.288108,...,2.380952,3.666667,4.761905,5.761905,6.285714,10.904762,17.380952,22.952381,2.047619,1
802,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,1.809524,1.809524,2.142857,2.666667,3.000000,8.047619,14.857143,20.857143,1.142857,1
803,M[649.3660]QAQMQM[15.9949]QMQGGDGDGGALGHHV,M[655.3735]QAQMQM[15.9949]QMQGGDGDGGALGHHV,,,,0.149688,0.574267,0.076001,-0.699690,-0.327852,...,1.952381,3.000000,3.809524,5.000000,5.380952,10.142857,15.666667,21.380952,1.857143,1
804,RGVM[649.3660]LAVDAVIAELKK,RGVM[655.3735]LAVDAVIAELKK,-1.596362,2.972889,-1.951015,-2.180257,0.516077,0.425237,0.695950,0.483076,...,2.904762,5.333333,6.238095,8.476190,9.000000,26.428571,63.047619,115.666667,7.809524,0


# The End (For Now)