## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

structuremap.utils.set_logger()

## Set Parameters of Analysis

In [2]:
# Set parameters of analysis

analysis_threshold = 20 # number of amino acids either side to analyze

modifications = ["649.3660", "655.3735"] # which modifications we are looking for, as regex strings
heavy_modification = "655.3735"
light_modification = "649.3660"

## Load Dataset - MsrAKD

In [3]:
# Set correct pathing

curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("Global Data Directory: " + global_data_path)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/MsrKD
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/global_data


In [4]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrAKD_quant")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,P55072,TERA_HUMAN,VCP,Transitional endoplasmic reticulum ATPase,0.000016,4.801791,-5.943475,740,M740,TERA_M740
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,Q9NTJ3,SMC4_HUMAN,SMC4,Structural maintenance of chromosomes protein 4,0.001236,2.907956,-5.864487,814,M814,SMC4_M814
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,P39023,RL3_HUMAN,RPL3,Large ribosomal subunit protein uL3,0.031518,1.501438,-5.511180,168,M168,RL3_M168
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,P62847,RS24_HUMAN,RPS24,Small ribosomal subunit protein eS24,0.027458,1.561338,-5.494187,74,M74,RS24_M74
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,P50991,TCPD_HUMAN,CCT4,T-complex protein 1 subunit delta,0.016772,1.775418,-5.343441,81,M81,TCPD_M81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,O14744,ANM5_HUMAN,PRMT5,Protein arginine N-methyltransferase 5,0.694002,0.158639,-0.111634,187,M187,ANM5_M187
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,Q15233,NONO_HUMAN,NONO,Non-POU domain-containing octamer-binding protein,0.770188,0.113403,0.297063,326,M326,NONO_M326
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,Q99661,KIF2C_HUMAN,KIF2C,Kinesin-like protein KIF2C,0.884856,0.053127,0.044696,708,M708,KIF2C_M708
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,Q92922,SMRC1_HUMAN,SMARCC1,SWI/SNF complex subunit SMARCC1,0.909191,0.041345,-0.082454,944,M944,SMRC1_M944


In [5]:
# Canonicalize data - none to do here
peptides;

In [6]:
# Manual labeling of peptides
label_col_data = ["blue"] * 157 + ["green"] * 381 + ["white"] * 9 + ["red"] * 12 + ["gray"] * 104
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [7]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P55072' 'Q9NTJ3' 'P39023' 'P62847' 'P50991' 'P78371' 'P62304' 'P51991'
 'P60709' 'Q13435' 'Q9UHV9' 'Q8IWC1' 'P54727' 'P11940' 'P27816' 'P07437'
 'O94913' 'P10809' 'P31948' 'P08670' 'P07814' 'P35579' 'P23246' 'O94776'
 'P36578' 'Q9Y4L1' 'Q04637' 'P55081' 'P06576' 'P07910' 'P46109' 'Q15424'
 'Q9HCG8' 'Q9UMX0' 'P20700' 'P40227' 'P29401' 'Q8WYA6' 'O75534' 'P25205'
 'Q6PKG0' 'Q9Y2W1' 'P11171' 'P30519' 'O60826' 'P08708' 'Q9P0L0' 'P48643'
 'Q16891' 'Q04837' 'P08238' 'Q8WWK9' 'Q14011' 'Q9H814' 'Q9UNZ5' 'Q03252'
 'Q9NTK5' 'P06493' 'Q9NUU7' 'Q4G0J3' 'Q09666' 'O75934' 'Q14320' 'Q15029'
 'Q16630' 'Q9UKD2' 'O95835' 'P16949' 'Q15459' 'O14745' 'P18669' 'Q96EP5'
 'P50402' 'P50213' 'Q5BKZ1' 'P49755' 'P60228' 'P41227' 'Q07065' 'O43252'
 'Q9UQN3' 'P14678' 'Q7L1Q6' 'P18583' 'Q9Y266' 'Q7Z739' 'Q16576' 'P22626'
 'Q9BYN8' 'Q9UKV3' 'Q14152' 'O60814' 'P05141' 'Q99832' 'P41252' 'Q9BXP5'
 'Q13263' 'Q7L4I2' 'Q8WUM0' 'O60508' 'Q01518' 'Q92922' 'O75157' 'P14174'
 'P33176' 'P26038' 'Q14204' 'Q

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [9]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [10]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [11]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,Color,Complete Sequence
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,VCP,Transitional endoplasmic reticulum ATPase,0.000016,4.801791,-5.943475,740,M740,TERA_M740,blue,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,SMC4,Structural maintenance of chromosomes protein 4,0.001236,2.907956,-5.864487,814,M814,SMC4_M814,blue,MPRKGTQPSTARRREEGPPPPSPDGASSDAEPEPPSGRTESPATAA...
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,RPL3,Large ribosomal subunit protein uL3,0.031518,1.501438,-5.511180,168,M168,RL3_M168,blue,MSHRKFSAPRHGSLGFLPRKRSSRHRGKVKSFPKDDPSKPVHLTAF...
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,RPS24,Small ribosomal subunit protein eS24,0.027458,1.561338,-5.494187,74,M74,RS24_M74,blue,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,CCT4,T-complex protein 1 subunit delta,0.016772,1.775418,-5.343441,81,M81,TCPD_M81,blue,MPENVAPRSGATAGAAGGRGKGAYQDRDKPAQIRFSNISAAKAVAD...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,PRMT5,Protein arginine N-methyltransferase 5,0.694002,0.158639,-0.111634,187,M187,ANM5_M187,gray,MAAMAVGGAGGSRVSSGRDLNCVPEIADTLGAVAKQGFDFLCMPVF...
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,NONO,Non-POU domain-containing octamer-binding protein,0.770188,0.113403,0.297063,326,M326,NONO_M326,gray,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,KIF2C,Kinesin-like protein KIF2C,0.884856,0.053127,0.044696,708,M708,KIF2C_M708,gray,MAMDSSLQARLFPGLAIKIQRSNGLIHSANVRTVNLEKSCVSVEWA...
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,SMARCC1,SWI/SNF complex subunit SMARCC1,0.909191,0.041345,-0.082454,944,M944,SMRC1_M944,gray,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...


In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    whole, mantissa = modifications[0].split(".")
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        whole, mantissa = modifications[i].split(".")
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [14]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [15]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [16]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    663
Name: count, dtype: int64

In [17]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [18]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [19]:
# Sanity check - ensure methionine locations are correct
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
temp.count("M") == len(temp)

True

In [20]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,blue,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...,DHFEEAMR,733,8,DHFEEA,6,739,MEVEEDDPVPEIRRDHFEEA,RFARRSVSDNDIRKYEMFAQ
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,blue,MPRKGTQPSTARRREEGPPPPSPDGASSDAEPEPPSGRTESPATAA...,LRHSEREMR,806,9,LRHSERE,7,813,QEQKVQLEERVVKLRHSERE,RNTLEKFTASIQRLIEQEEY
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,blue,MSHRKFSAPRHGSLGFLPRKRSSRHRGKVKSFPKDDPSKPVHLTAF...,VIAHTQMR,161,8,VIAHTQ,6,167,KDFSSMKKYCQVIRVIAHTQ,RLLPLRQKKAHLMEIQVNGG
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,blue,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,TTGFGMIYDSLDYAK,68,15,TTGFG,5,73,VIFVFGFRTHFGGGKTTGFG,IYDSLDYAKKNEPKHRLARH
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,blue,MPENVAPRSGATAGAAGGRGKGAYQDRDKPAQIRFSNISAAKAVAD...,QMQVLHPAAR,79,10,Q,1,80,IQDGKGDVTITNDGATILKQ,QVLHPAARMLVELSKAQDIE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,gray,MAAMAVGGAGGSRVSSGRDLNCVPEIADTLGAVAKQGFDFLCMPVF...,TWMWWHNFR,184,9,TW,2,186,IIENAPTTHTEEYSGEEKTW,WWHNFRTLCDYSKRIAVALE
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,gray,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,MEELHNQEVQK,325,11,,0,325,EHQVMLMRQDLMRRQEELRR,EELHNQEVQKRKQLELRQEE
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,gray,MAMDSSLQARLFPGLAIKIQRSNGLIHSANVRTVNLEKSCVSVEWA...,LAMQLEEQASR,705,11,LA,2,707,AQQAKHFSALRDVIKALRLA,QLEEQASRQISSKKRPQ
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,gray,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...,QNFHMEQLK,939,9,QNFH,4,943,EKEALEQQRQQLLTERQNFH,EQLKYAELRARQQMEQQQHG


In [21]:
# NOTE: Some methionine site numbers (from the initial dataset) are incorrect
display(peptides_cs[~(peptides_cs["Site Number"] == peptides_cs["Methionine Location"] + 1)])

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
162,M[649.3660]GANSLER,M[655.3735]GANSLER,,,-2.92864,-2.736653,-2.494809,-2.734591,-2.455773,,...,green,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MGANSLER,570,8,,0,570,MATGLERMGANNLERMGLER,GANSLERMGLERMGANSLER
251,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,-3.16286,-2.941378,-3.050561,-3.034934,-2.966366,-3.191401,-2.881145,-3.167577,...,green,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDRGPR,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
313,ISM[649.3660]PDIDLNLKGPK,ISM[655.3735]PDIDLNLKGPK,,,-4.089726,0.719132,-3.780943,-4.33041,-4.456059,-3.573424,...,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDIDLNLKGPK,2706,14,IS,2,2708,KLKGPKFKMPEMNIKAPKIS,PDIDLNLKGPKVKGDVDVSL
432,RGM[649.3660]DDDR,RGM[655.3735]DDDR,-4.085992,,-3.754871,,,,,,...,green,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDR,959,7,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
601,VDINAPDVDVQGPDWHLKM[649.3660]PK,VDINAPDVDVQGPDWHLKM[655.3735]PK,,,,,,-2.987953,,,...,gray,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,VDINAPDVDVQGPDWHLKMPK,3131,21,VDINAPDVDVQGPDWHLK,18,3149,PKVDINAPDVDVQGPDWHLK,PKIKMPKISMPGFKGEGPEV
626,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,,,,-2.614655,,...,gray,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,SGSSSVAAMKK,1,11,SGSSSVAA,8,9,MSGSSSVA,KKVVQQLRLEAGLNRVKVSQ
635,n[42.0106]ADKM[649.3660]DMSLDDIIK,n[42.0106]ADKM[655.3735]DMSLDDIIK,,,-0.581554,-1.593138,,,,,...,gray,MADKMDMSLDDIIKLNRSQRGGRGGGRGRGRAGSQGGRGGGAQAAA...,ADKMDMSLDDIIK,1,13,ADK,3,4,MAD,DMSLDDIIKLNRSQRGGRGG


In [22]:
# Remove invalid proteins (according to alphafold)
# 7 invalid peptides as a result -> 2 blue, 4 green, 1 gray

invalid_IDs = ['Q09666', 'Q14204', 'Q9Y520', 'Q14789']
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,blue,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...,DHFEEAMR,733,8,DHFEEA,6,739,MEVEEDDPVPEIRRDHFEEA,RFARRSVSDNDIRKYEMFAQ
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,blue,MPRKGTQPSTARRREEGPPPPSPDGASSDAEPEPPSGRTESPATAA...,LRHSEREMR,806,9,LRHSERE,7,813,QEQKVQLEERVVKLRHSERE,RNTLEKFTASIQRLIEQEEY
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,blue,MSHRKFSAPRHGSLGFLPRKRSSRHRGKVKSFPKDDPSKPVHLTAF...,VIAHTQMR,161,8,VIAHTQ,6,167,KDFSSMKKYCQVIRVIAHTQ,RLLPLRQKKAHLMEIQVNGG
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,blue,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,TTGFGMIYDSLDYAK,68,15,TTGFG,5,73,VIFVFGFRTHFGGGKTTGFG,IYDSLDYAKKNEPKHRLARH
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,blue,MPENVAPRSGATAGAAGGRGKGAYQDRDKPAQIRFSNISAAKAVAD...,QMQVLHPAAR,79,10,Q,1,80,IQDGKGDVTITNDGATILKQ,QVLHPAARMLVELSKAQDIE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,gray,MAAMAVGGAGGSRVSSGRDLNCVPEIADTLGAVAKQGFDFLCMPVF...,TWMWWHNFR,184,9,TW,2,186,IIENAPTTHTEEYSGEEKTW,WWHNFRTLCDYSKRIAVALE
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,gray,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,MEELHNQEVQK,325,11,,0,325,EHQVMLMRQDLMRRQEELRR,EELHNQEVQKRKQLELRQEE
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,gray,MAMDSSLQARLFPGLAIKIQRSNGLIHSANVRTVNLEKSCVSVEWA...,LAMQLEEQASR,705,11,LA,2,707,AQQAKHFSALRDVIKALRLA,QLEEQASRQISSKKRPQ
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,gray,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...,QNFHMEQLK,939,9,QNFH,4,943,EKEALEQQRQQLLTERQNFH,EQLKYAELRARQQMEQQQHG


# Download Alphafold Data - MsrAKD

In [23]:
# Path for alphafold protein data

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/pae


In [24]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P55072' 'Q9NTJ3' 'P39023' 'P62847' 'P50991' 'P78371' 'P62304' 'P51991'
 'P60709' 'Q13435' 'Q9UHV9' 'Q8IWC1' 'P54727' 'P11940' 'P27816' 'P07437'
 'O94913' 'P10809' 'P31948' 'P08670' 'P07814' 'P35579' 'P23246' 'O94776'
 'P36578' 'Q9Y4L1' 'Q04637' 'P55081' 'P06576' 'P07910' 'P46109' 'Q15424'
 'Q9HCG8' 'Q9UMX0' 'P20700' 'P40227' 'P29401' 'Q8WYA6' 'O75534' 'P25205'
 'Q6PKG0' 'Q9Y2W1' 'P11171' 'P30519' 'O60826' 'P08708' 'Q9P0L0' 'P48643'
 'Q16891' 'Q04837' 'P08238' 'Q8WWK9' 'Q14011' 'Q9H814' 'Q9UNZ5' 'Q03252'
 'Q9NTK5' 'P06493' 'Q9NUU7' 'Q4G0J3' 'O75934' 'Q14320' 'Q15029' 'Q16630'
 'Q9UKD2' 'O95835' 'P16949' 'Q15459' 'O14745' 'P18669' 'Q96EP5' 'P50402'
 'P50213' 'Q5BKZ1' 'P49755' 'P60228' 'P41227' 'Q07065' 'O43252' 'Q9UQN3'
 'P14678' 'Q7L1Q6' 'P18583' 'Q9Y266' 'Q7Z739' 'Q16576' 'P22626' 'Q9BYN8'
 'Q9UKV3' 'Q14152' 'O60814' 'P05141' 'Q99832' 'P41252' 'Q9BXP5' 'Q13263'
 'Q7L4I2' 'Q8WUM0' 'O60508' 'Q01518' 'Q92922' 'O75157' 'P14174' 'P33176'
 'P26038' 'Q9UHX1' 'P62258' 'Q

In [25]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 401/401 [00:00<00:00, 136209.58it/s]

2024-06-28 19:25:54> Valid proteins: 0
2024-06-28 19:25:54> Invalid proteins: 0
2024-06-28 19:25:54> Existing proteins: 401





In [26]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 401/401 [00:00<00:00, 129978.05it/s]

2024-06-28 19:25:54> Valid proteins: 0
2024-06-28 19:25:54> Invalid proteins: 0
2024-06-28 19:25:54> Existing proteins: 401





## Construct Alphafold Dataframe (Calculate Accessibilities) - MsrAKD

In [27]:
# Format alphafold data into dataframe
alphafold_annotation_MsrAKD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_MsrAKD

100%|██████████| 1110/1110 [00:48<00:00, 22.99it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,-6.924,-8.038,-7.255,unstructured,unstructured,0,0,0,0,1
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,-5.364,-3.908,-5.523,unstructured,unstructured,0,0,0,0,1
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,-7.835,-8.662,-7.119,unstructured,unstructured,0,0,0,0,1
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,-5.865,-6.199,-6.761,unstructured,unstructured,0,0,0,0,1
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,-5.055,-3.843,-4.996,HELX_LH_PP_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236903,Q9Y617,401,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,15.087,16.015,14.291,HELX_RH_AL_P,HELX,0,1,0,0,0
236904,Q9Y617,401,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,17.344,18.096,16.529,HELX_RH_AL_P,HELX,0,1,0,0,0
236905,Q9Y617,401,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,14.388,13.842,15.221,HELX_RH_AL_P,HELX,0,1,0,0,0
236906,Q9Y617,401,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,12.177,11.359,13.111,HELX_RH_AL_P,HELX,0,1,0,0,0


In [28]:
# Calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 401/401 [00:07<00:00, 51.36it/s] 


In [29]:
alphafold_accessibility_MsrAKD = alphafold_annotation_MsrAKD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [30]:
# Calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 401/401 [00:05<00:00, 68.90it/s] 


In [31]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [32]:
# Calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 401/401 [00:05<00:00, 70.06it/s] 


In [33]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [34]:
# Calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 401/401 [00:05<00:00, 67.40it/s] 


In [35]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [36]:
# Calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 401/401 [00:05<00:00, 69.22it/s] 


In [37]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [38]:
# Calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 401/401 [00:05<00:00, 67.53it/s] 


In [39]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [40]:
# Calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 401/401 [00:05<00:00, 68.36it/s] 


In [41]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [42]:
# Calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 401/401 [00:06<00:00, 66.68it/s] 


In [43]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [44]:
# Calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 401/401 [00:05<00:00, 67.04it/s] 


In [45]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [46]:
# Calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 401/401 [00:06<00:00, 66.70it/s]


In [47]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [48]:
# Calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 401/401 [00:06<00:00, 65.21it/s] 


In [49]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [50]:
# Calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 401/401 [00:06<00:00, 61.54it/s] 


In [51]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [52]:
# Calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 401/401 [00:07<00:00, 52.60it/s]


In [53]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [54]:
# Calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 401/401 [00:09<00:00, 43.60it/s]


In [55]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [56]:
# Calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 401/401 [00:06<00:00, 60.25it/s] 


In [57]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,1,1,1,1,1,1,2,3,4,0
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,0,2,2,2,2,2,3,4,5,0
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,1,2,2,2,2,2,4,5,7,0
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,1,2,2,2,2,2,4,8,12,0
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,1,2,2,2,2,2,4,9,16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236903,Q9Y617,401,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2,2,3,6,7,7,12,35,70,4
236904,Q9Y617,401,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2,2,3,4,5,5,9,33,62,2
236905,Q9Y617,401,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2,2,3,4,4,5,10,29,54,4
236906,Q9Y617,401,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1,2,2,2,2,2,5,13,30,2


In [58]:
alphafold_accessibility_MsrAKD_smooth = get_smooth_score(
    alphafold_accessibility_MsrAKD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_MsrAKD_smooth;

100%|██████████| 401/401 [00:01<00:00, 356.46it/s]


In [59]:
alphafold_accessibility_MsrAKD_smooth['IDR'] = np.where(
    alphafold_accessibility_MsrAKD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_MsrAKD_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,1.909091,2.090909,2.181818,2.272727,2.727273,5.909091,15.090909,26.090909,0.636364,1
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,1.916667,2.250000,2.333333,2.500000,2.916667,6.833333,16.750000,28.583333,0.916667,1
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,1.923077,2.307692,2.461538,2.769231,3.230769,7.384615,18.461538,30.846154,0.923077,1
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,1.928571,2.357143,2.571429,2.928571,3.500000,7.928571,19.785714,32.785714,0.857143,1
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,1.933333,2.533333,2.800000,3.200000,3.733333,8.666667,21.200000,34.666667,1.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,Q9Y617,401,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2.000000,4.266667,5.333333,6.466667,6.733333,17.133333,52.133333,84.600000,6.000000,0
366,Q9Y617,401,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2.000000,4.142857,5.285714,6.357143,6.642857,16.857143,51.285714,82.857143,5.928571,0
367,Q9Y617,401,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2.000000,4.076923,5.230769,6.307692,6.538462,16.000000,48.769231,79.307692,5.615385,0
368,Q9Y617,401,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1.916667,3.916667,5.166667,6.083333,6.250000,15.333333,46.166667,75.333333,5.333333,0


# Merge Dataframes into Full Dataset (Includes Alphafold) - MsrAKD

In [60]:
alphafold_accessibility_MsrAKD_smooth["position"] = alphafold_accessibility_MsrAKD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_wa = peptides_cs.merge(
    alphafold_accessibility_MsrAKD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,2.238095,3.380952,4.190476,5.047619,5.761905,13.714286,38.714286,67.380952,3.190476,0.0
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,2.047619,4.238095,6.142857,7.904762,8.000000,14.666667,35.619048,52.333333,3.952381,0.0
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,2.380952,3.428571,4.571429,5.952381,7.666667,22.476190,65.476190,125.333333,5.857143,0.0
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,2.476190,3.476190,4.714286,6.000000,7.333333,16.857143,42.238095,63.095238,2.619048,0.0
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,2.095238,3.571429,4.333333,5.428571,5.904762,16.952381,55.571429,105.619048,5.380952,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,2.238095,3.761905,4.571429,5.714286,6.190476,16.047619,50.666667,105.571429,4.619048,0.0
652,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,2.428571,5.047619,5.857143,7.619048,7.619048,11.904762,18.952381,26.047619,2.238095,1.0
653,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,2.095238,4.761905,5.333333,7.095238,7.333333,13.714286,31.571429,51.142857,3.047619,0.0
654,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,2.619048,4.238095,5.666667,6.666667,7.285714,11.333333,18.714286,24.904762,2.428571,1.0


In [61]:
#peptides_wa.to_csv(os.path.join(curr_dir_path, "MsrAKD_with_alphafold.csv"))

In [62]:
path = os.path.join(curr_dir_path, "MsrAKD_with_alphafold.csv")
peptides_wa = pd.read_csv(path)
peptides_wa.set_index("Unnamed: 0", inplace=True)
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,2.238095,3.380952,4.190476,5.047619,5.761905,13.714286,38.714286,67.380952,3.190476,0.0
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,2.047619,4.238095,6.142857,7.904762,8.000000,14.666667,35.619048,52.333333,3.952381,0.0
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,2.380952,3.428571,4.571429,5.952381,7.666667,22.476190,65.476190,125.333333,5.857143,0.0
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,2.476190,3.476190,4.714286,6.000000,7.333333,16.857143,42.238095,63.095238,2.619048,0.0
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,2.095238,3.571429,4.333333,5.428571,5.904762,16.952381,55.571429,105.619048,5.380952,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,2.238095,3.761905,4.571429,5.714286,6.190476,16.047619,50.666667,105.571429,4.619048,0.0
652,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,2.428571,5.047619,5.857143,7.619048,7.619048,11.904762,18.952381,26.047619,2.238095,1.0
653,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,2.095238,4.761905,5.333333,7.095238,7.333333,13.714286,31.571429,51.142857,3.047619,0.0
654,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,2.619048,4.238095,5.666667,6.666667,7.285714,11.333333,18.714286,24.904762,2.428571,1.0


## Load Dataset - MsrB2KD

In [63]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrB2KD_quant")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,Q16836,HCDH_HUMAN,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,P23193,TCEA1_HUMAN,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,Q16181,SEPT7_HUMAN,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,P62258,1433E_HUMAN,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,Q00341,VIGLN_HUMAN,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,P00374,DYR_HUMAN,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,P14868,SYDC_HUMAN,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478


In [64]:
# Canonicalize data - none to do here
peptides;

In [65]:
# Manual labeling of peptides
label_col_data = ["blue"] * 10 + ["white"] * 30 + ["green"] * 381 + ["red"] * 213 + ["gray"] * 120
label_col = pd.Series(label_col_data)
peptides["color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [66]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q16836' 'P23193' 'Q16181' 'P35579' 'P62258' 'P46109' 'P55072' 'Q9Y265'
 'P25205' 'P61024' 'P41227' 'P18583' 'Q9UN37' 'O14744' 'Q86UP2' 'O14874'
 'P36543' 'Q9Y2W2' 'P14174' 'Q9Y617' 'Q8WVK2' 'P31948' 'Q9Y3U8' 'Q99729'
 'Q9UKD2' 'Q9Y3I0' 'P27144' 'Q9UHX1' 'P22307' 'Q01518' 'Q9BWF3' 'Q9Y580'
 'O43707' 'P22061' 'P52272' 'Q9HD42' 'P50454' 'O95831' 'P18859' 'P05067'
 'P60709' 'P68032' 'P35611' 'P55196' 'Q4VCS5' 'P08243' 'P05023' 'P24539'
 'Q9NVI7' 'Q8WWM7' 'Q07812' 'Q9NYF8' 'Q9UHR4' 'P11021' 'Q9BRK5' 'O43852'
 'Q14444' 'Q96CT7' 'Q16543' 'P06493' 'P61604' 'P10809' 'Q9UQN3' 'Q9H444'
 'Q9Y3Y2' 'Q14011' 'E9PRG8' 'Q07065' 'Q15003' 'P09669' 'P33240' 'Q9H0L4'
 'Q92841' 'Q9NR30' 'Q9BUQ8' 'P00367' 'Q08211' 'Q99615' 'O75937' 'P55265'
 'P33316' 'Q14204' 'P55084' 'P42126' 'Q6P2E9' 'P29692' 'Q14152' 'O75821'
 'O15372' 'Q09666' 'Q15717' 'Q8N8S7' 'P06733' 'P14625' 'O43768' 'P15170'
 'Q01844' 'P15311' 'Q02790' 'Q96AE4' 'P35637' 'Q13283' 'Q9UN86' 'P14314'
 'P46926' 'Q14789' 'P38646' 'Q

In [67]:
# Helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [68]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [69]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [70]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color,Complete Sequence
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...


In [71]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    whole, mantissa = modifications[0].split(".")
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        whole, mantissa = modifications[i].split(".")
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [72]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [73]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [74]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [75]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    754
Name: count, dtype: int64

In [76]:
# create regex pattern to identify desired modifications
left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [77]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [78]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [79]:
# Sanity check - ensure methionine locations are correct
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
temp.count("M") == len(temp)

True

In [80]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177,NATTRQDRFAGLHFFNPVPV,KLVEVIKTPMTSQKTFESLV
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47,LKELKNIPMTLELLQSTRIG,SVNAIRKQSTDEEVTSLAKS
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354,PLAQMEEERREHVAKMKKME,EMEQVFEMKVKEKVQKLKDS
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159,DRKEAAENSLVAYKAASDIA,TELPPTHPIRLGLALNFSVF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125,LANKVDMVWIVGGSSVYKEA,NHPGHLKLFVTRIMQDFESD
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488,AEAREKETKALSLARALEEA,EQKAELERLNKQFRTEMEDL
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477,SFRFGAPPHAGGGIGLERVT,LFLGLHNVRQTSMFPRDPKR


In [81]:
# NOTE: Some methionine site numbers (from the initial dataset) are incorrect
display(peptides_cs[~(peptides_cs["Site Number"] == peptides_cs["Methionine Location"] + 1)])

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
108,ISM[649.3660]PDIDLNLKGPK,ISM[655.3735]PDIDLNLKGPK,,,2.317727,1.315954,2.486824,2.297093,,2.372561,...,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDIDLNLKGPK,2706,14,IS,2,2708,KLKGPKFKMPEMNIKAPKIS,PDIDLNLKGPKVKGDVDVSL
253,ISM[649.3660]PDVDLHLK,ISM[655.3735]PDVDLHLK,2.023708,,1.388773,,,,2.04048,1.42799,...,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDVDLHLK,817,11,IS,2,819,KLKGPKFKMPEMNIKVPKIS,PDVDLHLKGPNVKGEYDVTM
274,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,1.689388,1.636527,1.738867,1.672636,1.615026,1.706146,1.594494,1.616323,...,green,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDRGPR,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
387,M[649.3660]GANSLER,M[655.3735]GANSLER,1.356775,1.283564,1.510369,1.369582,1.35557,1.498688,1.530735,1.459537,...,green,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MGANSLER,570,8,,0,570,MATGLERMGANNLERMGLER,GANSLERMGLERMGANSLER
422,RGM[649.3660]DDDR,RGM[655.3735]DDDR,,1.757216,,,1.155705,,,1.76007,...,red,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDR,959,7,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
547,SM[15.9949]M[649.3660]SAYER,SM[15.9949]M[655.3735]SAYER,,1.01538,1.08481,1.091951,1.049318,1.202029,1.020585,1.133366,...,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSAYER,1031,8,SM,2,1033,ERSMMSYERSMMSPMAERSM,SAYERSMMSAYERSMMSPMA
609,SM[15.9949]M[649.3660]SSYSAADR,SM[15.9949]M[655.3735]SSYSAADR,0.795118,,0.866351,0.688615,0.901931,0.961633,,0.853249,...,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSSYSAADR,1090,11,SM,2,1092,SMMSPMADRSMMSMGADRSM,SSYSAADRSMMSSYSAADRS
628,GM[649.3660]QGPPGPR,GM[655.3735]QGPPGPR,1.281578,1.361743,,,,1.625771,,,...,red,MATEIGSPPRFFHMPRFQHQAPRQLFYKRPDFAQQQAMQQLTFDGK...,GMQGPPGPR,738,9,G,1,739,QGPPGPQGHLGPQGPPGTQG,QGPPGPRGMQGPPHPHGIQG
652,SM[649.3660]M[15.9949]SPMAER,SM[655.3735]M[15.9949]SPMAER,,,,-0.068689,0.573475,,,-0.19599,...,gray,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSPMAER,1022,9,S,1,1023,AAERSMMSSYERSMMSYERS,MSPMAERSMMSAYERSMMSA
684,n[42.0106]MDRM[649.3660]TEDALR,n[42.0106]MDRM[655.3735]TEDALR,0.398543,,,,0.678404,,,,...,gray,MDRMTEDALRLNLLKRSLDPADERDDVLAKRLKMEGHEAMERLKML...,MDRMTEDALR,0,10,MDR,3,3,MD,TEDALRLNLLKRSLDPADER


In [82]:
# remove invalid proteins (according to alphafold)
# 12 invalid peptides as a result -> 5 green, 4 red, 3 gray

invalid_IDs = ['Q14204', 'Q09666', 'Q14789', 'Q9Y520', 'P46013', 'Q9NU22']
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177,NATTRQDRFAGLHFFNPVPV,KLVEVIKTPMTSQKTFESLV
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47,LKELKNIPMTLELLQSTRIG,SVNAIRKQSTDEEVTSLAKS
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354,PLAQMEEERREHVAKMKKME,EMEQVFEMKVKEKVQKLKDS
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159,DRKEAAENSLVAYKAASDIA,TELPPTHPIRLGLALNFSVF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125,LANKVDMVWIVGGSSVYKEA,NHPGHLKLFVTRIMQDFESD
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488,AEAREKETKALSLARALEEA,EQKAELERLNKQFRTEMEDL
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477,SFRFGAPPHAGGGIGLERVT,LFLGLHNVRQTSMFPRDPKR


# Download Alphafold Data - MsrB2KD

In [83]:
# Path for alphafold protein data

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/pae


In [84]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q16836' 'P23193' 'Q16181' 'P35579' 'P62258' 'P46109' 'P55072' 'Q9Y265'
 'P25205' 'P61024' 'P41227' 'P18583' 'Q9UN37' 'O14744' 'Q86UP2' 'O14874'
 'P36543' 'Q9Y2W2' 'P14174' 'Q9Y617' 'Q8WVK2' 'P31948' 'Q9Y3U8' 'Q99729'
 'Q9UKD2' 'Q9Y3I0' 'P27144' 'Q9UHX1' 'P22307' 'Q01518' 'Q9BWF3' 'Q9Y580'
 'O43707' 'P22061' 'P52272' 'Q9HD42' 'P50454' 'O95831' 'P18859' 'P05067'
 'P60709' 'P68032' 'P35611' 'P55196' 'Q4VCS5' 'P08243' 'P05023' 'P24539'
 'Q9NVI7' 'Q8WWM7' 'Q07812' 'Q9NYF8' 'Q9UHR4' 'P11021' 'Q9BRK5' 'O43852'
 'Q14444' 'Q96CT7' 'Q16543' 'P06493' 'P61604' 'P10809' 'Q9UQN3' 'Q9H444'
 'Q9Y3Y2' 'Q14011' 'E9PRG8' 'Q07065' 'Q15003' 'P09669' 'P33240' 'Q9H0L4'
 'Q92841' 'Q9NR30' 'Q9BUQ8' 'P00367' 'Q08211' 'Q99615' 'O75937' 'P55265'
 'P33316' 'P55084' 'P42126' 'Q6P2E9' 'P29692' 'Q14152' 'O75821' 'O15372'
 'Q15717' 'Q8N8S7' 'P06733' 'P14625' 'O43768' 'P15170' 'Q01844' 'P15311'
 'Q02790' 'Q96AE4' 'P35637' 'Q13283' 'Q9UN86' 'P14314' 'P46926' 'P38646'
 'Q9BZE4' 'P49915' 'P62805' 'O

In [85]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 458/458 [00:00<00:00, 123314.37it/s]

2024-06-28 19:28:24> Valid proteins: 0
2024-06-28 19:28:24> Invalid proteins: 0
2024-06-28 19:28:24> Existing proteins: 458





In [86]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 458/458 [00:00<00:00, 150465.36it/s]

2024-06-28 19:28:24> Valid proteins: 0
2024-06-28 19:28:24> Invalid proteins: 0
2024-06-28 19:28:24> Existing proteins: 458





## Construct Alphafold Dataframe (Calculate Accessibilities) - MsrB2KD

In [87]:
# Format alphafold data into dataframe
alphafold_annotation_MsrB2KD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_MsrB2KD

100%|██████████| 1110/1110 [00:56<00:00, 19.60it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,-6.924,-8.038,-7.255,unstructured,unstructured,0,0,0,0,1
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,-5.364,-3.908,-5.523,unstructured,unstructured,0,0,0,0,1
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,-7.835,-8.662,-7.119,unstructured,unstructured,0,0,0,0,1
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,-5.865,-6.199,-6.761,unstructured,unstructured,0,0,0,0,1
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,-5.055,-3.843,-4.996,HELX_LH_PP_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275134,Q9Y617,458,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,15.087,16.015,14.291,HELX_RH_AL_P,HELX,0,1,0,0,0
275135,Q9Y617,458,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,17.344,18.096,16.529,HELX_RH_AL_P,HELX,0,1,0,0,0
275136,Q9Y617,458,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,14.388,13.842,15.221,HELX_RH_AL_P,HELX,0,1,0,0,0
275137,Q9Y617,458,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,12.177,11.359,13.111,HELX_RH_AL_P,HELX,0,1,0,0,0


In [88]:
# Calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 458/458 [00:07<00:00, 65.29it/s] 


In [89]:
alphafold_accessibility_MsrB2KD = alphafold_annotation_MsrB2KD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [90]:
# Calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 458/458 [00:06<00:00, 67.10it/s] 


In [91]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [92]:
# Calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 458/458 [00:06<00:00, 67.15it/s] 


In [93]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [94]:
# Calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 458/458 [00:06<00:00, 67.25it/s] 


In [95]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [96]:
# Calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 458/458 [00:07<00:00, 64.12it/s] 


In [97]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [98]:
# Calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 458/458 [00:06<00:00, 66.30it/s] 


In [99]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [100]:
# Calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 458/458 [00:06<00:00, 66.10it/s] 


In [101]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [102]:
# Calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 458/458 [00:07<00:00, 64.18it/s] 


In [103]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [104]:
# Calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 458/458 [00:07<00:00, 65.42it/s] 


In [105]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [106]:
# Calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 458/458 [00:07<00:00, 64.51it/s] 


In [107]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [108]:
# Calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 458/458 [00:07<00:00, 63.27it/s] 


In [109]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [110]:
# Calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 458/458 [00:07<00:00, 59.75it/s] 


In [111]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [112]:
# Calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 458/458 [00:09<00:00, 50.31it/s]


In [113]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [114]:
# Calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 458/458 [00:10<00:00, 43.43it/s]


In [115]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [116]:
# Calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 458/458 [00:07<00:00, 59.17it/s] 


In [117]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,1,1,1,1,1,1,2,3,4,0
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,0,2,2,2,2,2,3,4,5,0
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,1,2,2,2,2,2,4,5,7,0
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,1,2,2,2,2,2,4,8,12,0
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,1,2,2,2,2,2,4,9,16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275134,Q9Y617,458,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2,2,3,6,7,7,12,35,70,4
275135,Q9Y617,458,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2,2,3,4,5,5,9,33,62,2
275136,Q9Y617,458,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2,2,3,4,4,5,10,29,54,4
275137,Q9Y617,458,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1,2,2,2,2,2,5,13,30,2


In [118]:
alphafold_accessibility_MsrB2KD_smooth = get_smooth_score(
    alphafold_accessibility_MsrB2KD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_MsrB2KD_smooth;

100%|██████████| 458/458 [00:01<00:00, 433.27it/s]


In [119]:
alphafold_accessibility_MsrB2KD_smooth['IDR'] = np.where(
    alphafold_accessibility_MsrB2KD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_MsrB2KD_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,1.909091,2.090909,2.181818,2.272727,2.727273,5.909091,15.090909,26.090909,0.636364,1
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,1.916667,2.250000,2.333333,2.500000,2.916667,6.833333,16.750000,28.583333,0.916667,1
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,1.923077,2.307692,2.461538,2.769231,3.230769,7.384615,18.461538,30.846154,0.923077,1
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,1.928571,2.357143,2.571429,2.928571,3.500000,7.928571,19.785714,32.785714,0.857143,1
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,1.933333,2.533333,2.800000,3.200000,3.733333,8.666667,21.200000,34.666667,1.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,Q9Y617,458,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2.000000,4.266667,5.333333,6.466667,6.733333,17.133333,52.133333,84.600000,6.000000,0
366,Q9Y617,458,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2.000000,4.142857,5.285714,6.357143,6.642857,16.857143,51.285714,82.857143,5.928571,0
367,Q9Y617,458,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2.000000,4.076923,5.230769,6.307692,6.538462,16.000000,48.769231,79.307692,5.615385,0
368,Q9Y617,458,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1.916667,3.916667,5.166667,6.083333,6.250000,15.333333,46.166667,75.333333,5.333333,0


# Merge Dataframes into Full Dataset (Includes Alphafold) - MsrB2KD

In [120]:
alphafold_accessibility_MsrB2KD_smooth["position"] = alphafold_accessibility_MsrB2KD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_wa = peptides_cs.merge(
    alphafold_accessibility_MsrB2KD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,2.571429,3.571429,5.333333,6.571429,8.333333,26.047619,71.952381,142.619048,6.809524,0.0
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,2.047619,3.238095,3.809524,4.857143,5.714286,12.904762,38.190476,60.285714,3.523810,0.0
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,2.285714,4.809524,5.571429,7.190476,7.380952,11.857143,18.476190,25.095238,2.047619,1.0
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,2.523810,1.0
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,2.619048,4.523810,5.857143,7.428571,8.142857,23.523810,60.000000,109.238095,7.190476,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
738,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,2.333333,3.714286,5.095238,6.809524,7.857143,22.428571,67.000000,123.000000,5.809524,0.0
739,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,2.714286,5.000000,6.142857,7.523810,7.523810,11.904762,19.571429,26.333333,2.333333,1.0
740,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,2.619048,4.142857,6.047619,7.666667,8.428571,28.761905,87.285714,172.428571,8.761905,0.0


In [121]:
#peptides_wa.to_csv(os.path.join(curr_dir_path, "MsrB2KD_with_alphafold.csv"))

In [122]:
path = os.path.join(curr_dir_path, "MsrB2KD_with_alphafold.csv")
peptides_wa = pd.read_csv(path)
peptides_wa.set_index("Unnamed: 0", inplace=True)
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,2.571429,3.571429,5.333333,6.571429,8.333333,26.047619,71.952381,142.619048,6.809524,0.0
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,2.047619,3.238095,3.809524,4.857143,5.714286,12.904762,38.190476,60.285714,3.523810,0.0
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,2.285714,4.809524,5.571429,7.190476,7.380952,11.857143,18.476190,25.095238,2.047619,1.0
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,2.523810,1.0
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,2.619048,4.523810,5.857143,7.428571,8.142857,23.523810,60.000000,109.238095,7.190476,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
738,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,2.333333,3.714286,5.095238,6.809524,7.857143,22.428571,67.000000,123.000000,5.809524,0.0
739,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,2.714286,5.000000,6.142857,7.523810,7.523810,11.904762,19.571429,26.333333,2.333333,1.0
740,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,2.619048,4.142857,6.047619,7.666667,8.428571,28.761905,87.285714,172.428571,8.761905,0.0


# The End (For Now)