## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

structuremap.utils.set_logger()

## Set Parameters of Analysis

In [2]:
# Set parameters of analysis

analysis_threshold = 20 # number of amino acids either side to analyze

modifications = ["649.3660", "655.3735"] # which modifications we are looking for, as regex strings
heavy_modification = "655.3735"
light_modification = "649.3660"

## Load Dataset - MsrAKD

In [3]:
# Set correct pathing

curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("Global Data Directory: " + global_data_path)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/MsrKD
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/global_data


In [4]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrAKD_quant")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,P55072,TERA_HUMAN,VCP,Transitional endoplasmic reticulum ATPase,0.000016,4.801791,-5.943475,740,M740,TERA_M740
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,Q9NTJ3,SMC4_HUMAN,SMC4,Structural maintenance of chromosomes protein 4,0.001236,2.907956,-5.864487,814,M814,SMC4_M814
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,P39023,RL3_HUMAN,RPL3,Large ribosomal subunit protein uL3,0.031518,1.501438,-5.511180,168,M168,RL3_M168
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,P62847,RS24_HUMAN,RPS24,Small ribosomal subunit protein eS24,0.027458,1.561338,-5.494187,74,M74,RS24_M74
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,P50991,TCPD_HUMAN,CCT4,T-complex protein 1 subunit delta,0.016772,1.775418,-5.343441,81,M81,TCPD_M81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,O14744,ANM5_HUMAN,PRMT5,Protein arginine N-methyltransferase 5,0.694002,0.158639,-0.111634,187,M187,ANM5_M187
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,Q15233,NONO_HUMAN,NONO,Non-POU domain-containing octamer-binding protein,0.770188,0.113403,0.297063,326,M326,NONO_M326
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,Q99661,KIF2C_HUMAN,KIF2C,Kinesin-like protein KIF2C,0.884856,0.053127,0.044696,708,M708,KIF2C_M708
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,Q92922,SMRC1_HUMAN,SMARCC1,SWI/SNF complex subunit SMARCC1,0.909191,0.041345,-0.082454,944,M944,SMRC1_M944


In [5]:
# Canonicalize data - none to do here
peptides;

In [6]:
# Manual labeling of peptides
label_col_data = ["blue"] * 157 + ["green"] * 381 + ["white"] * 9 + ["red"] * 12 + ["gray"] * 104
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [7]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P55072' 'Q9NTJ3' 'P39023' 'P62847' 'P50991' 'P78371' 'P62304' 'P51991'
 'P60709' 'Q13435' 'Q9UHV9' 'Q8IWC1' 'P54727' 'P11940' 'P27816' 'P07437'
 'O94913' 'P10809' 'P31948' 'P08670' 'P07814' 'P35579' 'P23246' 'O94776'
 'P36578' 'Q9Y4L1' 'Q04637' 'P55081' 'P06576' 'P07910' 'P46109' 'Q15424'
 'Q9HCG8' 'Q9UMX0' 'P20700' 'P40227' 'P29401' 'Q8WYA6' 'O75534' 'P25205'
 'Q6PKG0' 'Q9Y2W1' 'P11171' 'P30519' 'O60826' 'P08708' 'Q9P0L0' 'P48643'
 'Q16891' 'Q04837' 'P08238' 'Q8WWK9' 'Q14011' 'Q9H814' 'Q9UNZ5' 'Q03252'
 'Q9NTK5' 'P06493' 'Q9NUU7' 'Q4G0J3' 'Q09666' 'O75934' 'Q14320' 'Q15029'
 'Q16630' 'Q9UKD2' 'O95835' 'P16949' 'Q15459' 'O14745' 'P18669' 'Q96EP5'
 'P50402' 'P50213' 'Q5BKZ1' 'P49755' 'P60228' 'P41227' 'Q07065' 'O43252'
 'Q9UQN3' 'P14678' 'Q7L1Q6' 'P18583' 'Q9Y266' 'Q7Z739' 'Q16576' 'P22626'
 'Q9BYN8' 'Q9UKV3' 'Q14152' 'O60814' 'P05141' 'Q99832' 'P41252' 'Q9BXP5'
 'Q13263' 'Q7L4I2' 'Q8WUM0' 'O60508' 'Q01518' 'Q92922' 'O75157' 'P14174'
 'P33176' 'P26038' 'Q14204' 'Q

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [9]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [10]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [11]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,Color,Complete Sequence
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,VCP,Transitional endoplasmic reticulum ATPase,0.000016,4.801791,-5.943475,740,M740,TERA_M740,blue,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,SMC4,Structural maintenance of chromosomes protein 4,0.001236,2.907956,-5.864487,814,M814,SMC4_M814,blue,MPRKGTQPSTARRREEGPPPPSPDGASSDAEPEPPSGRTESPATAA...
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,RPL3,Large ribosomal subunit protein uL3,0.031518,1.501438,-5.511180,168,M168,RL3_M168,blue,MSHRKFSAPRHGSLGFLPRKRSSRHRGKVKSFPKDDPSKPVHLTAF...
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,RPS24,Small ribosomal subunit protein eS24,0.027458,1.561338,-5.494187,74,M74,RS24_M74,blue,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,CCT4,T-complex protein 1 subunit delta,0.016772,1.775418,-5.343441,81,M81,TCPD_M81,blue,MPENVAPRSGATAGAAGGRGKGAYQDRDKPAQIRFSNISAAKAVAD...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,PRMT5,Protein arginine N-methyltransferase 5,0.694002,0.158639,-0.111634,187,M187,ANM5_M187,gray,MAAMAVGGAGGSRVSSGRDLNCVPEIADTLGAVAKQGFDFLCMPVF...
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,NONO,Non-POU domain-containing octamer-binding protein,0.770188,0.113403,0.297063,326,M326,NONO_M326,gray,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,KIF2C,Kinesin-like protein KIF2C,0.884856,0.053127,0.044696,708,M708,KIF2C_M708,gray,MAMDSSLQARLFPGLAIKIQRSNGLIHSANVRTVNLEKSCVSVEWA...
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,SMARCC1,SWI/SNF complex subunit SMARCC1,0.909191,0.041345,-0.082454,944,M944,SMRC1_M944,gray,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...


In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    whole, mantissa = modifications[0].split(".")
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        whole, mantissa = modifications[i].split(".")
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [14]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [15]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [16]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    663
Name: count, dtype: int64

In [17]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [18]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [19]:
# Sanity check - ensure methionine locations are correct
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
temp.count("M") == len(temp)

True

In [20]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,blue,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...,DHFEEAMR,733,8,DHFEEA,6,739,MEVEEDDPVPEIRRDHFEEA,RFARRSVSDNDIRKYEMFAQ
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,blue,MPRKGTQPSTARRREEGPPPPSPDGASSDAEPEPPSGRTESPATAA...,LRHSEREMR,806,9,LRHSERE,7,813,QEQKVQLEERVVKLRHSERE,RNTLEKFTASIQRLIEQEEY
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,blue,MSHRKFSAPRHGSLGFLPRKRSSRHRGKVKSFPKDDPSKPVHLTAF...,VIAHTQMR,161,8,VIAHTQ,6,167,KDFSSMKKYCQVIRVIAHTQ,RLLPLRQKKAHLMEIQVNGG
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,blue,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,TTGFGMIYDSLDYAK,68,15,TTGFG,5,73,VIFVFGFRTHFGGGKTTGFG,IYDSLDYAKKNEPKHRLARH
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,blue,MPENVAPRSGATAGAAGGRGKGAYQDRDKPAQIRFSNISAAKAVAD...,QMQVLHPAAR,79,10,Q,1,80,IQDGKGDVTITNDGATILKQ,QVLHPAARMLVELSKAQDIE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,gray,MAAMAVGGAGGSRVSSGRDLNCVPEIADTLGAVAKQGFDFLCMPVF...,TWMWWHNFR,184,9,TW,2,186,IIENAPTTHTEEYSGEEKTW,WWHNFRTLCDYSKRIAVALE
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,gray,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,MEELHNQEVQK,325,11,,0,325,EHQVMLMRQDLMRRQEELRR,EELHNQEVQKRKQLELRQEE
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,gray,MAMDSSLQARLFPGLAIKIQRSNGLIHSANVRTVNLEKSCVSVEWA...,LAMQLEEQASR,705,11,LA,2,707,AQQAKHFSALRDVIKALRLA,QLEEQASRQISSKKRPQ
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,gray,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...,QNFHMEQLK,939,9,QNFH,4,943,EKEALEQQRQQLLTERQNFH,EQLKYAELRARQQMEQQQHG


In [21]:
# NOTE: Some methionine site numbers (from the initial dataset) are incorrect
display(peptides_cs[~(peptides_cs["Site Number"] == peptides_cs["Methionine Location"] + 1)])

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
162,M[649.3660]GANSLER,M[655.3735]GANSLER,,,-2.92864,-2.736653,-2.494809,-2.734591,-2.455773,,...,green,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MGANSLER,570,8,,0,570,MATGLERMGANNLERMGLER,GANSLERMGLERMGANSLER
251,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,-3.16286,-2.941378,-3.050561,-3.034934,-2.966366,-3.191401,-2.881145,-3.167577,...,green,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDRGPR,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
313,ISM[649.3660]PDIDLNLKGPK,ISM[655.3735]PDIDLNLKGPK,,,-4.089726,0.719132,-3.780943,-4.33041,-4.456059,-3.573424,...,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDIDLNLKGPK,2706,14,IS,2,2708,KLKGPKFKMPEMNIKAPKIS,PDIDLNLKGPKVKGDVDVSL
432,RGM[649.3660]DDDR,RGM[655.3735]DDDR,-4.085992,,-3.754871,,,,,,...,green,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDR,959,7,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
601,VDINAPDVDVQGPDWHLKM[649.3660]PK,VDINAPDVDVQGPDWHLKM[655.3735]PK,,,,,,-2.987953,,,...,gray,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,VDINAPDVDVQGPDWHLKMPK,3131,21,VDINAPDVDVQGPDWHLK,18,3149,PKVDINAPDVDVQGPDWHLK,PKIKMPKISMPGFKGEGPEV
626,n[42.0106]SGSSSVAAM[649.3660]KK,n[42.0106]SGSSSVAAM[655.3735]KK,,,,,,,-2.614655,,...,gray,MSGSSSVAAMKKVVQQLRLEAGLNRVKVSQAAADLKQFCLQNAQHD...,SGSSSVAAMKK,1,11,SGSSSVAA,8,9,MSGSSSVA,KKVVQQLRLEAGLNRVKVSQ
635,n[42.0106]ADKM[649.3660]DMSLDDIIK,n[42.0106]ADKM[655.3735]DMSLDDIIK,,,-0.581554,-1.593138,,,,,...,gray,MADKMDMSLDDIIKLNRSQRGGRGGGRGRGRAGSQGGRGGGAQAAA...,ADKMDMSLDDIIK,1,13,ADK,3,4,MAD,DMSLDDIIKLNRSQRGGRGG


In [22]:
# Remove invalid proteins (according to alphafold)
# 7 invalid peptides as a result -> 2 blue, 4 green, 1 gray

invalid_IDs = ['Q09666', 'Q14204', 'Q9Y520', 'Q14789']
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,blue,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...,DHFEEAMR,733,8,DHFEEA,6,739,MEVEEDDPVPEIRRDHFEEA,RFARRSVSDNDIRKYEMFAQ
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,blue,MPRKGTQPSTARRREEGPPPPSPDGASSDAEPEPPSGRTESPATAA...,LRHSEREMR,806,9,LRHSERE,7,813,QEQKVQLEERVVKLRHSERE,RNTLEKFTASIQRLIEQEEY
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,blue,MSHRKFSAPRHGSLGFLPRKRSSRHRGKVKSFPKDDPSKPVHLTAF...,VIAHTQMR,161,8,VIAHTQ,6,167,KDFSSMKKYCQVIRVIAHTQ,RLLPLRQKKAHLMEIQVNGG
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,blue,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,TTGFGMIYDSLDYAK,68,15,TTGFG,5,73,VIFVFGFRTHFGGGKTTGFG,IYDSLDYAKKNEPKHRLARH
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,blue,MPENVAPRSGATAGAAGGRGKGAYQDRDKPAQIRFSNISAAKAVAD...,QMQVLHPAAR,79,10,Q,1,80,IQDGKGDVTITNDGATILKQ,QVLHPAARMLVELSKAQDIE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,gray,MAAMAVGGAGGSRVSSGRDLNCVPEIADTLGAVAKQGFDFLCMPVF...,TWMWWHNFR,184,9,TW,2,186,IIENAPTTHTEEYSGEEKTW,WWHNFRTLCDYSKRIAVALE
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,gray,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,MEELHNQEVQK,325,11,,0,325,EHQVMLMRQDLMRRQEELRR,EELHNQEVQKRKQLELRQEE
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,gray,MAMDSSLQARLFPGLAIKIQRSNGLIHSANVRTVNLEKSCVSVEWA...,LAMQLEEQASR,705,11,LA,2,707,AQQAKHFSALRDVIKALRLA,QLEEQASRQISSKKRPQ
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,gray,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...,QNFHMEQLK,939,9,QNFH,4,943,EKEALEQQRQQLLTERQNFH,EQLKYAELRARQQMEQQQHG


In [23]:
#peptides_cs.to_csv(os.path.join(curr_dir_path, "MsrAKD_clean.csv"))

In [24]:
path = os.path.join(curr_dir_path, "MsrAKD_clean.csv")
peptides_cs = pd.read_csv(path)
peptides_cs.set_index("Unnamed: 0", inplace=True)
peptides_cs.index.name = None
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,blue,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...,DHFEEAMR,733,8,DHFEEA,6,739,MEVEEDDPVPEIRRDHFEEA,RFARRSVSDNDIRKYEMFAQ
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,blue,MPRKGTQPSTARRREEGPPPPSPDGASSDAEPEPPSGRTESPATAA...,LRHSEREMR,806,9,LRHSERE,7,813,QEQKVQLEERVVKLRHSERE,RNTLEKFTASIQRLIEQEEY
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,blue,MSHRKFSAPRHGSLGFLPRKRSSRHRGKVKSFPKDDPSKPVHLTAF...,VIAHTQMR,161,8,VIAHTQ,6,167,KDFSSMKKYCQVIRVIAHTQ,RLLPLRQKKAHLMEIQVNGG
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,blue,MNDTVTIRTRKFMTNRLLQRKQMVIDVLHPGKATVPKTEIREKLAK...,TTGFGMIYDSLDYAK,68,15,TTGFG,5,73,VIFVFGFRTHFGGGKTTGFG,IYDSLDYAKKNEPKHRLARH
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,blue,MPENVAPRSGATAGAAGGRGKGAYQDRDKPAQIRFSNISAAKAVAD...,QMQVLHPAAR,79,10,Q,1,80,IQDGKGDVTITNDGATILKQ,QVLHPAARMLVELSKAQDIE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,gray,MAAMAVGGAGGSRVSSGRDLNCVPEIADTLGAVAKQGFDFLCMPVF...,TWMWWHNFR,184,9,TW,2,186,IIENAPTTHTEEYSGEEKTW,WWHNFRTLCDYSKRIAVALE
659,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,gray,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,MEELHNQEVQK,325,11,,0,325,EHQVMLMRQDLMRRQEELRR,EELHNQEVQKRKQLELRQEE
660,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,gray,MAMDSSLQARLFPGLAIKIQRSNGLIHSANVRTVNLEKSCVSVEWA...,LAMQLEEQASR,705,11,LA,2,707,AQQAKHFSALRDVIKALRLA,QLEEQASRQISSKKRPQ
661,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,gray,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...,QNFHMEQLK,939,9,QNFH,4,943,EKEALEQQRQQLLTERQNFH,EQLKYAELRARQQMEQQQHG


# Download Alphafold Data - MsrAKD

In [25]:
# Path for alphafold protein data

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/pae


In [26]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P55072' 'Q9NTJ3' 'P39023' 'P62847' 'P50991' 'P78371' 'P62304' 'P51991'
 'P60709' 'Q13435' 'Q9UHV9' 'Q8IWC1' 'P54727' 'P11940' 'P27816' 'P07437'
 'O94913' 'P10809' 'P31948' 'P08670' 'P07814' 'P35579' 'P23246' 'O94776'
 'P36578' 'Q9Y4L1' 'Q04637' 'P55081' 'P06576' 'P07910' 'P46109' 'Q15424'
 'Q9HCG8' 'Q9UMX0' 'P20700' 'P40227' 'P29401' 'Q8WYA6' 'O75534' 'P25205'
 'Q6PKG0' 'Q9Y2W1' 'P11171' 'P30519' 'O60826' 'P08708' 'Q9P0L0' 'P48643'
 'Q16891' 'Q04837' 'P08238' 'Q8WWK9' 'Q14011' 'Q9H814' 'Q9UNZ5' 'Q03252'
 'Q9NTK5' 'P06493' 'Q9NUU7' 'Q4G0J3' 'O75934' 'Q14320' 'Q15029' 'Q16630'
 'Q9UKD2' 'O95835' 'P16949' 'Q15459' 'O14745' 'P18669' 'Q96EP5' 'P50402'
 'P50213' 'Q5BKZ1' 'P49755' 'P60228' 'P41227' 'Q07065' 'O43252' 'Q9UQN3'
 'P14678' 'Q7L1Q6' 'P18583' 'Q9Y266' 'Q7Z739' 'Q16576' 'P22626' 'Q9BYN8'
 'Q9UKV3' 'Q14152' 'O60814' 'P05141' 'Q99832' 'P41252' 'Q9BXP5' 'Q13263'
 'Q7L4I2' 'Q8WUM0' 'O60508' 'Q01518' 'Q92922' 'O75157' 'P14174' 'P33176'
 'P26038' 'Q9UHX1' 'P62258' 'Q

In [27]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 401/401 [00:00<00:00, 130847.67it/s]

2024-06-28 20:04:15> Valid proteins: 0
2024-06-28 20:04:15> Invalid proteins: 0
2024-06-28 20:04:15> Existing proteins: 401





In [28]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 401/401 [00:00<00:00, 135649.32it/s]

2024-06-28 20:04:15> Valid proteins: 0
2024-06-28 20:04:15> Invalid proteins: 0
2024-06-28 20:04:15> Existing proteins: 401





## Load Dataset - MsrB2KD

In [29]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrB2KD_quant")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,Q16836,HCDH_HUMAN,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,P23193,TCEA1_HUMAN,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,Q16181,SEPT7_HUMAN,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,P62258,1433E_HUMAN,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,Q00341,VIGLN_HUMAN,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,P00374,DYR_HUMAN,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,P14868,SYDC_HUMAN,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478


In [30]:
# Canonicalize data - none to do here
peptides;

In [31]:
# Manual labeling of peptides
label_col_data = ["blue"] * 10 + ["white"] * 30 + ["green"] * 381 + ["red"] * 213 + ["gray"] * 120
label_col = pd.Series(label_col_data)
peptides["color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [32]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q16836' 'P23193' 'Q16181' 'P35579' 'P62258' 'P46109' 'P55072' 'Q9Y265'
 'P25205' 'P61024' 'P41227' 'P18583' 'Q9UN37' 'O14744' 'Q86UP2' 'O14874'
 'P36543' 'Q9Y2W2' 'P14174' 'Q9Y617' 'Q8WVK2' 'P31948' 'Q9Y3U8' 'Q99729'
 'Q9UKD2' 'Q9Y3I0' 'P27144' 'Q9UHX1' 'P22307' 'Q01518' 'Q9BWF3' 'Q9Y580'
 'O43707' 'P22061' 'P52272' 'Q9HD42' 'P50454' 'O95831' 'P18859' 'P05067'
 'P60709' 'P68032' 'P35611' 'P55196' 'Q4VCS5' 'P08243' 'P05023' 'P24539'
 'Q9NVI7' 'Q8WWM7' 'Q07812' 'Q9NYF8' 'Q9UHR4' 'P11021' 'Q9BRK5' 'O43852'
 'Q14444' 'Q96CT7' 'Q16543' 'P06493' 'P61604' 'P10809' 'Q9UQN3' 'Q9H444'
 'Q9Y3Y2' 'Q14011' 'E9PRG8' 'Q07065' 'Q15003' 'P09669' 'P33240' 'Q9H0L4'
 'Q92841' 'Q9NR30' 'Q9BUQ8' 'P00367' 'Q08211' 'Q99615' 'O75937' 'P55265'
 'P33316' 'Q14204' 'P55084' 'P42126' 'Q6P2E9' 'P29692' 'Q14152' 'O75821'
 'O15372' 'Q09666' 'Q15717' 'Q8N8S7' 'P06733' 'P14625' 'O43768' 'P15170'
 'Q01844' 'P15311' 'Q02790' 'Q96AE4' 'P35637' 'Q13283' 'Q9UN86' 'P14314'
 'P46926' 'Q14789' 'P38646' 'Q

In [33]:
# Helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [34]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [35]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [36]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color,Complete Sequence
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...


In [37]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    whole, mantissa = modifications[0].split(".")
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        whole, mantissa = modifications[i].split(".")
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [38]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [39]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [40]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [41]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    754
Name: count, dtype: int64

In [42]:
# create regex pattern to identify desired modifications
left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [43]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [44]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [45]:
# Sanity check - ensure methionine locations are correct
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
temp.count("M") == len(temp)

True

In [46]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177,NATTRQDRFAGLHFFNPVPV,KLVEVIKTPMTSQKTFESLV
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47,LKELKNIPMTLELLQSTRIG,SVNAIRKQSTDEEVTSLAKS
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354,PLAQMEEERREHVAKMKKME,EMEQVFEMKVKEKVQKLKDS
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159,DRKEAAENSLVAYKAASDIA,TELPPTHPIRLGLALNFSVF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125,LANKVDMVWIVGGSSVYKEA,NHPGHLKLFVTRIMQDFESD
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488,AEAREKETKALSLARALEEA,EQKAELERLNKQFRTEMEDL
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477,SFRFGAPPHAGGGIGLERVT,LFLGLHNVRQTSMFPRDPKR


In [47]:
# NOTE: Some methionine site numbers (from the initial dataset) are incorrect
display(peptides_cs[~(peptides_cs["Site Number"] == peptides_cs["Methionine Location"] + 1)])

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
108,ISM[649.3660]PDIDLNLKGPK,ISM[655.3735]PDIDLNLKGPK,,,2.317727,1.315954,2.486824,2.297093,,2.372561,...,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDIDLNLKGPK,2706,14,IS,2,2708,KLKGPKFKMPEMNIKAPKIS,PDIDLNLKGPKVKGDVDVSL
253,ISM[649.3660]PDVDLHLK,ISM[655.3735]PDVDLHLK,2.023708,,1.388773,,,,2.04048,1.42799,...,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDVDLHLK,817,11,IS,2,819,KLKGPKFKMPEMNIKVPKIS,PDVDLHLKGPNVKGEYDVTM
274,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,1.689388,1.636527,1.738867,1.672636,1.615026,1.706146,1.594494,1.616323,...,green,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDRGPR,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
387,M[649.3660]GANSLER,M[655.3735]GANSLER,1.356775,1.283564,1.510369,1.369582,1.35557,1.498688,1.530735,1.459537,...,green,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MGANSLER,570,8,,0,570,MATGLERMGANNLERMGLER,GANSLERMGLERMGANSLER
422,RGM[649.3660]DDDR,RGM[655.3735]DDDR,,1.757216,,,1.155705,,,1.76007,...,red,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDR,959,7,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
547,SM[15.9949]M[649.3660]SAYER,SM[15.9949]M[655.3735]SAYER,,1.01538,1.08481,1.091951,1.049318,1.202029,1.020585,1.133366,...,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSAYER,1031,8,SM,2,1033,ERSMMSYERSMMSPMAERSM,SAYERSMMSAYERSMMSPMA
609,SM[15.9949]M[649.3660]SSYSAADR,SM[15.9949]M[655.3735]SSYSAADR,0.795118,,0.866351,0.688615,0.901931,0.961633,,0.853249,...,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSSYSAADR,1090,11,SM,2,1092,SMMSPMADRSMMSMGADRSM,SSYSAADRSMMSSYSAADRS
628,GM[649.3660]QGPPGPR,GM[655.3735]QGPPGPR,1.281578,1.361743,,,,1.625771,,,...,red,MATEIGSPPRFFHMPRFQHQAPRQLFYKRPDFAQQQAMQQLTFDGK...,GMQGPPGPR,738,9,G,1,739,QGPPGPQGHLGPQGPPGTQG,QGPPGPRGMQGPPHPHGIQG
652,SM[649.3660]M[15.9949]SPMAER,SM[655.3735]M[15.9949]SPMAER,,,,-0.068689,0.573475,,,-0.19599,...,gray,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSPMAER,1022,9,S,1,1023,AAERSMMSSYERSMMSYERS,MSPMAERSMMSAYERSMMSA
684,n[42.0106]MDRM[649.3660]TEDALR,n[42.0106]MDRM[655.3735]TEDALR,0.398543,,,,0.678404,,,,...,gray,MDRMTEDALRLNLLKRSLDPADERDDVLAKRLKMEGHEAMERLKML...,MDRMTEDALR,0,10,MDR,3,3,MD,TEDALRLNLLKRSLDPADER


In [48]:
# remove invalid proteins (according to alphafold)
# 12 invalid peptides as a result -> 5 green, 4 red, 3 gray

invalid_IDs = ['Q14204', 'Q09666', 'Q14789', 'Q9Y520', 'P46013', 'Q9NU22']
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177,NATTRQDRFAGLHFFNPVPV,KLVEVIKTPMTSQKTFESLV
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47,LKELKNIPMTLELLQSTRIG,SVNAIRKQSTDEEVTSLAKS
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354,PLAQMEEERREHVAKMKKME,EMEQVFEMKVKEKVQKLKDS
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159,DRKEAAENSLVAYKAASDIA,TELPPTHPIRLGLALNFSVF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125,LANKVDMVWIVGGSSVYKEA,NHPGHLKLFVTRIMQDFESD
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488,AEAREKETKALSLARALEEA,EQKAELERLNKQFRTEMEDL
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477,SFRFGAPPHAGGGIGLERVT,LFLGLHNVRQTSMFPRDPKR


In [49]:
#peptides_cs.to_csv(os.path.join(curr_dir_path, "MsrB2KD_clean.csv"))

In [50]:
path = os.path.join(curr_dir_path, "MsrB2KD_clean.csv")
peptides_cs = pd.read_csv(path)
peptides_cs.set_index("Unnamed: 0", inplace=True)
peptides_cs.index.name = None
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177,NATTRQDRFAGLHFFNPVPV,KLVEVIKTPMTSQKTFESLV
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47,LKELKNIPMTLELLQSTRIG,SVNAIRKQSTDEEVTSLAKS
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354,PLAQMEEERREHVAKMKKME,EMEQVFEMKVKEKVQKLKDS
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159,DRKEAAENSLVAYKAASDIA,TELPPTHPIRLGLALNFSVF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125,LANKVDMVWIVGGSSVYKEA,NHPGHLKLFVTRIMQDFESD
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488,AEAREKETKALSLARALEEA,EQKAELERLNKQFRTEMEDL
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477,SFRFGAPPHAGGGIGLERVT,LFLGLHNVRQTSMFPRDPKR


# Download Alphafold Data - MsrB2KD

In [51]:
# Path for alphafold protein data

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/ChURRO_ABPP/alphafold_data/pae


In [52]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q16836' 'P23193' 'Q16181' 'P35579' 'P62258' 'P46109' 'P55072' 'Q9Y265'
 'P25205' 'P61024' 'P41227' 'P18583' 'Q9UN37' 'O14744' 'Q86UP2' 'O14874'
 'P36543' 'Q9Y2W2' 'P14174' 'Q9Y617' 'Q8WVK2' 'P31948' 'Q9Y3U8' 'Q99729'
 'Q9UKD2' 'Q9Y3I0' 'P27144' 'Q9UHX1' 'P22307' 'Q01518' 'Q9BWF3' 'Q9Y580'
 'O43707' 'P22061' 'P52272' 'Q9HD42' 'P50454' 'O95831' 'P18859' 'P05067'
 'P60709' 'P68032' 'P35611' 'P55196' 'Q4VCS5' 'P08243' 'P05023' 'P24539'
 'Q9NVI7' 'Q8WWM7' 'Q07812' 'Q9NYF8' 'Q9UHR4' 'P11021' 'Q9BRK5' 'O43852'
 'Q14444' 'Q96CT7' 'Q16543' 'P06493' 'P61604' 'P10809' 'Q9UQN3' 'Q9H444'
 'Q9Y3Y2' 'Q14011' 'E9PRG8' 'Q07065' 'Q15003' 'P09669' 'P33240' 'Q9H0L4'
 'Q92841' 'Q9NR30' 'Q9BUQ8' 'P00367' 'Q08211' 'Q99615' 'O75937' 'P55265'
 'P33316' 'P55084' 'P42126' 'Q6P2E9' 'P29692' 'Q14152' 'O75821' 'O15372'
 'Q15717' 'Q8N8S7' 'P06733' 'P14625' 'O43768' 'P15170' 'Q01844' 'P15311'
 'Q02790' 'Q96AE4' 'P35637' 'Q13283' 'Q9UN86' 'P14314' 'P46926' 'P38646'
 'Q9BZE4' 'P49915' 'P62805' 'O

In [53]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 458/458 [00:00<00:00, 142962.81it/s]

2024-06-28 20:04:16> Valid proteins: 0
2024-06-28 20:04:16> Invalid proteins: 0
2024-06-28 20:04:16> Existing proteins: 458





In [54]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 458/458 [00:00<00:00, 131269.05it/s]

2024-06-28 20:04:16> Valid proteins: 0
2024-06-28 20:04:16> Invalid proteins: 0
2024-06-28 20:04:16> Existing proteins: 458





# The End