## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

structuremap.utils.set_logger()

## Set Parameters of Analysis

In [2]:
# Set parameters of analysis
analysis_threshold = 20 # number of amino acids either side to analyze

modifications = ["649.3660", "655.3735"] # which modifications we are looking for, as regex strings
heavy_modification = "655.3735"
light_modification = "649.3660"

## Load Dataset - 50uM MsrAKD

In [3]:
# Set correct pathing
curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("Global Data Directory: " + global_data_path)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/MsrKD_low_doses
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "10_01_24_Met_MsrKD_Mastersheet.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="50 uM MsrA KD")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Met site,Label,Gene,Protein Description,pval,average ratio,neglog10pval
0,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,7.087870,,,,,,,...,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,0.000540,7.328966,3.267439
1,AQNLNPM[649.3660]VDVK,AQNLNPM[655.3735]VDVK,,,6.949318,5.759065,,,6.092256,,...,sp|Q9UBE0|SAE1_HUMAN,Q9UBE0,SAE1_HUMAN,105,SAE1_M105,SAE1,SUMO-activating enzyme subunit 1,0.000038,6.285160,4.421034
2,TVMLIPGDKM[649.3660]NEIMDK,TVMLIPGDKM[655.3735]NEIMDK,4.534985,5.059869,,,,,,,...,sp|Q9P287|BCCIP_HUMAN,Q9P287,BCCIP_HUMAN,301,BCCIP_M301,BCCIP,BRCA2 and CDKN1A-interacting protein,0.001490,4.797427,2.826925
3,STM[649.3660]KPVQK,STM[655.3735]KPVQK,,,,,,3.438903,,,...,sp|P11021|BIP_HUMAN,P11021,BIP_HUMAN,339,BIP_M339,HSPA5,Endoplasmic reticulum chaperone BiP,0.000001,3.433842,5.964250
4,EVHTQAENAEFM[649.3660]R,EVHTQAENAEFM[655.3735]R,,,,2.174587,,2.172702,2.647530,2.861742,...,sp|P09601|HMOX1_HUMAN,P09601,HMOX1_HUMAN,34,HMOX1_M34,HMOX1,Heme oxygenase 1,0.000033,2.684130,4.485917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,M[649.3660]AAPIDR,M[655.3735]AAPIDR,0.015240,,,0.288191,,,,,...,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,497,HNRPM_M497,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,0.488814,0.006493,0.310856
452,VDGM[649.3660]NAPK,VDGM[655.3735]NAPK,,-0.123573,-0.012968,0.002598,,0.073736,0.028475,,...,sp|P23588|IF4B_HUMAN,P23588,IF4B_HUMAN,533,IF4B_M533,EIF4B,Eukaryotic translation initiation factor 4B,0.490706,0.001104,0.309179
453,TLAM[649.3660]DTILANAR,TLAM[655.3735]DTILANAR,,,,0.049173,-0.012708,,-0.330627,-0.044101,...,sp|P46926|GNPI1_HUMAN,P46926,GNPI1_HUMAN,164,GNPI1_M164,GNPDA1,Glucosamine-6-phosphate isomerase 1,0.492894,0.003337,0.307247
454,QREMEEQM[649.3660]RR,QREMEEQM[655.3735]RR,-0.109944,0.125932,0.078497,-0.108901,-0.009808,0.147562,0.017649,-0.089849,...,sp|P23246|SFPQ_HUMAN,P23246,SFPQ_HUMAN,589,SFPQ_M589,SFPQ,"Splicing factor, proline- and glutamine-rich",0.494056,-0.001347,0.306224


In [5]:
# Canonicalize data - none to do here
peptides;

In [6]:
# Manual labeling of peptides
label_col_data = ["red"] * 91 + ["green"] * 89 + ["blue"] * 78 + ["gray"] * 198
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [7]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P25786' 'Q9UBE0' 'Q9P287' 'P11021' 'P09601' 'Q13901' 'O95817' 'Q14204'
 'Q13868' 'P60842' 'P47897' 'Q9NTZ6' 'Q7L7X3' 'P06493' 'P18669' 'Q9BQI0'
 'P57076' 'P51532' 'O60784' 'P51659' 'O15347' 'Q86UP2' 'O95721' 'Q13283'
 'Q4VCS5' 'P26038' 'P54577' 'Q14683' 'Q9NVI7' 'P83731' 'P07195' 'Q6GQQ9'
 'P11310' 'Q8N8S7' 'Q9Y3U8' 'Q9BPW8' 'P13639' 'Q9Y3I0' 'Q04323' 'Q15233'
 'P41236' 'P07900' 'P46379' 'P82675' 'Q96LB3' 'Q7Z3B4' 'Q6P2Q9' 'P80303'
 'P52272' 'O14737' 'P82930' 'P61011' 'P07437' 'Q86U42' 'P23246' 'Q9H910'
 'Q5T8P6' 'P30519' 'P10809' 'Q9Y266' 'P49736' 'Q7L1Q6' 'Q92900' 'P49959'
 'P62195' 'Q9BRK5' 'Q9HD42' 'Q9Y2L1' 'Q12906' 'Q14320' 'P11142' 'Q9BW85'
 'Q92922' 'P31948' 'Q8WXF1' 'Q9UEE9' 'P12268' 'P39023' 'Q8NC51' 'Q9UMX0'
 'Q9Y2W2' 'O95373' 'Q07065' 'Q13263' 'Q99598' 'P12694' 'P48643' 'P36542'
 'P14625' 'Q14011' 'Q14839' 'Q15773' 'Q96CT7' 'Q9NZI8' 'P12270' 'O60664'
 'Q15717' 'Q99615' 'O95347' 'P15374' 'Q99623' 'P15311' 'Q9H9T3' 'Q16181'
 'P46777' 'P36543' 'P62258' 'P

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [9]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [10]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [11]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Entry Name,Met site,Label,Gene,Protein Description,pval,average ratio,neglog10pval,Color,Complete Sequence
0,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,7.087870,,,,,,,...,PSA1_HUMAN,26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,0.000540,7.328966,3.267439,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...
1,AQNLNPM[649.3660]VDVK,AQNLNPM[655.3735]VDVK,,,6.949318,5.759065,,,6.092256,,...,SAE1_HUMAN,105,SAE1_M105,SAE1,SUMO-activating enzyme subunit 1,0.000038,6.285160,4.421034,red,MVEKEEAGGGISEEEAAQYDRQIRLWGLEAQKRLRASRVLLVGLKG...
2,TVMLIPGDKM[649.3660]NEIMDK,TVMLIPGDKM[655.3735]NEIMDK,4.534985,5.059869,,,,,,,...,BCCIP_HUMAN,301,BCCIP_M301,BCCIP,BRCA2 and CDKN1A-interacting protein,0.001490,4.797427,2.826925,red,MASRSKRRAVESGVPQPPDPPVQRDEEEEKEVENEDEDDDDSDKEK...
3,STM[649.3660]KPVQK,STM[655.3735]KPVQK,,,,,,3.438903,,,...,BIP_HUMAN,339,BIP_M339,HSPA5,Endoplasmic reticulum chaperone BiP,0.000001,3.433842,5.964250,red,MKLSLVAAMLLLLSAARAEEEDKKEDVGTVVGIDLGTTYSCVGVFK...
4,EVHTQAENAEFM[649.3660]R,EVHTQAENAEFM[655.3735]R,,,,2.174587,,2.172702,2.647530,2.861742,...,HMOX1_HUMAN,34,HMOX1_M34,HMOX1,Heme oxygenase 1,0.000033,2.684130,4.485917,red,MERPQPDSMPQDLSEALKEATKEVHTQAENAEFMRNFQKGQVTRDG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,M[649.3660]AAPIDR,M[655.3735]AAPIDR,0.015240,,,0.288191,,,,,...,HNRPM_HUMAN,497,HNRPM_M497,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,0.488814,0.006493,0.310856,gray,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...
452,VDGM[649.3660]NAPK,VDGM[655.3735]NAPK,,-0.123573,-0.012968,0.002598,,0.073736,0.028475,,...,IF4B_HUMAN,533,IF4B_M533,EIF4B,Eukaryotic translation initiation factor 4B,0.490706,0.001104,0.309179,gray,MAASAKKKNKKGKTISLTDFLAEDGGTGGGSTYVSKPVSWADETDD...
453,TLAM[649.3660]DTILANAR,TLAM[655.3735]DTILANAR,,,,0.049173,-0.012708,,-0.330627,-0.044101,...,GNPI1_HUMAN,164,GNPI1_M164,GNPDA1,Glucosamine-6-phosphate isomerase 1,0.492894,0.003337,0.307247,gray,MKLIILEHYSQASEWAAKYIRNRIIQFNPGPEKYFTLGLPTGSTPL...
454,QREMEEQM[649.3660]RR,QREMEEQM[655.3735]RR,-0.109944,0.125932,0.078497,-0.108901,-0.009808,0.147562,0.017649,-0.089849,...,SFPQ_HUMAN,589,SFPQ_M589,SFPQ,"Splicing factor, proline- and glutamine-rich",0.494056,-0.001347,0.306224,gray,MSRDRFRSRGGGGGGFHRRGGGGGRGGLHDFRSPPPGMGLNQNRGP...


In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    whole, mantissa = modifications[0].split(".")
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        whole, mantissa = modifications[i].split(".")
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [14]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [15]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [16]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    456
Name: count, dtype: int64

In [17]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [18]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [19]:
# Sanity check - ensure methionine locations are correct (and match Met site numbers from initial dataset)
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
((temp.count("M") == len(temp)) & (peptides_cs["Met site"] == peptides_cs["Methionine Location"]+1)).value_counts()

True    456
Name: count, dtype: int64

In [20]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,7.087870,,,,,,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
1,AQNLNPM[649.3660]VDVK,AQNLNPM[655.3735]VDVK,,,6.949318,5.759065,,,6.092256,,...,red,MVEKEEAGGGISEEEAAQYDRQIRLWGLEAQKRLRASRVLLVGLKG...,AQNLNPMVDVK,98,11,AQNLNP,6,104,GSVGRNRAEASLERAQNLNP,VDVKVDTEDIEKKPESFFTQ
2,TVMLIPGDKM[649.3660]NEIMDK,TVMLIPGDKM[655.3735]NEIMDK,4.534985,5.059869,,,,,,,...,red,MASRSKRRAVESGVPQPPDPPVQRDEEEEKEVENEDEDDDDSDKEK...,TVMLIPGDKMNEIMDK,291,16,TVMLIPGDK,9,300,SFDDVPMTPLRTVMLIPGDK,NEIMDKLKEYLSV
3,STM[649.3660]KPVQK,STM[655.3735]KPVQK,,,,,,3.438903,,,...,red,MKLSLVAAMLLLLSAARAEEEDKKEDVGTVVGIDLGTTYSCVGVFK...,STMKPVQK,336,8,ST,2,338,SETLTRAKFEELNMDLFRST,KPVQKVLEDSDLKKSDIDEI
4,EVHTQAENAEFM[649.3660]R,EVHTQAENAEFM[655.3735]R,,,,2.174587,,2.172702,2.647530,2.861742,...,red,MERPQPDSMPQDLSEALKEATKEVHTQAENAEFMRNFQKGQVTRDG...,EVHTQAENAEFMR,22,13,EVHTQAENAEF,11,33,SEALKEATKEVHTQAENAEF,RNFQKGQVTRDGFKLVMASL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,M[649.3660]AAPIDR,M[655.3735]AAPIDR,0.015240,,,0.288191,,,,,...,gray,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MAAPIDR,496,7,,0,496,ERIGSGVERMGAGMGFGLER,AAPIDRVGQTIERMGSGVER
452,VDGM[649.3660]NAPK,VDGM[655.3735]NAPK,,-0.123573,-0.012968,0.002598,,0.073736,0.028475,,...,gray,MAASAKKKNKKGKTISLTDFLAEDGGTGGGSTYVSKPVSWADETDD...,VDGMNAPK,529,8,VDG,3,532,APAQPSEEGPGRKDENKVDG,NAPKGQTGNSSRGPGDGGNR
453,TLAM[649.3660]DTILANAR,TLAM[655.3735]DTILANAR,,,,0.049173,-0.012708,,-0.330627,-0.044101,...,gray,MKLIILEHYSQASEWAAKYIRNRIIQFNPGPEKYFTLGLPTGSTPL...,TLAMDTILANAR,160,12,TLA,3,163,IAFNEPGSSLVSRTRVKTLA,DTILANARFFDGELTKVPTM
454,QREMEEQM[649.3660]RR,QREMEEQM[655.3735]RR,-0.109944,0.125932,0.078497,-0.108901,-0.009808,0.147562,0.017649,-0.089849,...,gray,MSRDRFRSRGGGGGGFHRRGGGGGRGGLHDFRSPPPGMGLNQNRGP...,QREMEEQMRR,581,10,QREMEEQ,7,588,EERRRREEEMMIRQREMEEQ,RRQREESYSRMGYMDPRERD


In [21]:
# Remove invalid proteins (according to alphafold)
# 3 invalid peptides as a result -> 3 red

invalid_IDs = ['Q14204']
display(peptides_cs[peptides_cs["Protein ID"].isin(invalid_IDs)])
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
7,M[649.3660]VVLSLPR,M[655.3735]VVLSLPR,,1.863568,1.484367,2.536939,,,,,...,red,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,MVVLSLPR,990,8,,0,990,LNPPIEECRYKLYQEMFAWK,VVLSLPRIQSQRYQVGVHYE
16,KVM[649.3660]SQEIQEQLHK,KVM[655.3735]SQEIQEQLHK,,,,1.574638,1.203174,1.0808,,1.505646,...,red,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,KVMSQEIQEQLHK,3253,13,KV,2,3255,ANDKLKKMVKDQQEAEKKKV,SQEIQEQLHKQQEVIADKQM
37,RSELEEQQM[649.3660]HLNVGLR,RSELEEQQM[655.3735]HLNVGLR,,,,,,,,0.779359,...,red,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,RSELEEQQMHLNVGLR,3190,16,RSELEEQQ,8,3198,FINHYANLFHEKRSELEEQQ,HLNVGLRKIKETVDQVEELR


Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,7.087870,,,,,,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
1,AQNLNPM[649.3660]VDVK,AQNLNPM[655.3735]VDVK,,,6.949318,5.759065,,,6.092256,,...,red,MVEKEEAGGGISEEEAAQYDRQIRLWGLEAQKRLRASRVLLVGLKG...,AQNLNPMVDVK,98,11,AQNLNP,6,104,GSVGRNRAEASLERAQNLNP,VDVKVDTEDIEKKPESFFTQ
2,TVMLIPGDKM[649.3660]NEIMDK,TVMLIPGDKM[655.3735]NEIMDK,4.534985,5.059869,,,,,,,...,red,MASRSKRRAVESGVPQPPDPPVQRDEEEEKEVENEDEDDDDSDKEK...,TVMLIPGDKMNEIMDK,291,16,TVMLIPGDK,9,300,SFDDVPMTPLRTVMLIPGDK,NEIMDKLKEYLSV
3,STM[649.3660]KPVQK,STM[655.3735]KPVQK,,,,,,3.438903,,,...,red,MKLSLVAAMLLLLSAARAEEEDKKEDVGTVVGIDLGTTYSCVGVFK...,STMKPVQK,336,8,ST,2,338,SETLTRAKFEELNMDLFRST,KPVQKVLEDSDLKKSDIDEI
4,EVHTQAENAEFM[649.3660]R,EVHTQAENAEFM[655.3735]R,,,,2.174587,,2.172702,2.647530,2.861742,...,red,MERPQPDSMPQDLSEALKEATKEVHTQAENAEFMRNFQKGQVTRDG...,EVHTQAENAEFMR,22,13,EVHTQAENAEF,11,33,SEALKEATKEVHTQAENAEF,RNFQKGQVTRDGFKLVMASL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,M[649.3660]AAPIDR,M[655.3735]AAPIDR,0.015240,,,0.288191,,,,,...,gray,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MAAPIDR,496,7,,0,496,ERIGSGVERMGAGMGFGLER,AAPIDRVGQTIERMGSGVER
452,VDGM[649.3660]NAPK,VDGM[655.3735]NAPK,,-0.123573,-0.012968,0.002598,,0.073736,0.028475,,...,gray,MAASAKKKNKKGKTISLTDFLAEDGGTGGGSTYVSKPVSWADETDD...,VDGMNAPK,529,8,VDG,3,532,APAQPSEEGPGRKDENKVDG,NAPKGQTGNSSRGPGDGGNR
453,TLAM[649.3660]DTILANAR,TLAM[655.3735]DTILANAR,,,,0.049173,-0.012708,,-0.330627,-0.044101,...,gray,MKLIILEHYSQASEWAAKYIRNRIIQFNPGPEKYFTLGLPTGSTPL...,TLAMDTILANAR,160,12,TLA,3,163,IAFNEPGSSLVSRTRVKTLA,DTILANARFFDGELTKVPTM
454,QREMEEQM[649.3660]RR,QREMEEQM[655.3735]RR,-0.109944,0.125932,0.078497,-0.108901,-0.009808,0.147562,0.017649,-0.089849,...,gray,MSRDRFRSRGGGGGGFHRRGGGGGRGGLHDFRSPPPGMGLNQNRGP...,QREMEEQMRR,581,10,QREMEEQ,7,588,EERRRREEEMMIRQREMEEQ,RRQREESYSRMGYMDPRERD


# Download Alphafold Data - 50uM MsrAKD

In [22]:
# Path for alphafold protein data

alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


In [23]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P25786' 'Q9UBE0' 'Q9P287' 'P11021' 'P09601' 'Q13901' 'O95817' 'Q13868'
 'P60842' 'P47897' 'Q9NTZ6' 'Q7L7X3' 'P06493' 'P18669' 'Q9BQI0' 'P57076'
 'P51532' 'O60784' 'P51659' 'O15347' 'Q86UP2' 'O95721' 'Q13283' 'Q4VCS5'
 'P26038' 'P54577' 'Q14683' 'Q9NVI7' 'P83731' 'P07195' 'Q6GQQ9' 'P11310'
 'Q8N8S7' 'Q9Y3U8' 'Q9BPW8' 'P13639' 'Q9Y3I0' 'Q04323' 'Q15233' 'P41236'
 'P07900' 'P46379' 'P82675' 'Q96LB3' 'Q7Z3B4' 'Q6P2Q9' 'P80303' 'P52272'
 'O14737' 'P82930' 'P61011' 'P07437' 'Q86U42' 'P23246' 'Q9H910' 'Q5T8P6'
 'P30519' 'P10809' 'Q9Y266' 'P49736' 'Q7L1Q6' 'Q92900' 'P49959' 'P62195'
 'Q9BRK5' 'Q9HD42' 'Q9Y2L1' 'Q12906' 'Q14320' 'P11142' 'Q9BW85' 'Q92922'
 'P31948' 'Q8WXF1' 'Q9UEE9' 'P12268' 'P39023' 'Q8NC51' 'Q9UMX0' 'Q9Y2W2'
 'O95373' 'Q07065' 'Q13263' 'Q99598' 'P12694' 'P48643' 'P36542' 'P14625'
 'Q14011' 'Q14839' 'Q15773' 'Q96CT7' 'Q9NZI8' 'P12270' 'O60664' 'Q15717'
 'Q99615' 'O95347' 'P15374' 'Q99623' 'P15311' 'Q9H9T3' 'Q16181' 'P46777'
 'P36543' 'P62258' 'P51003' 'Q

In [24]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 306/306 [00:00<00:00, 161238.32it/s]

2024-10-17 19:07:24> Valid proteins: 0
2024-10-17 19:07:24> Invalid proteins: 0
2024-10-17 19:07:24> Existing proteins: 306





In [25]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 306/306 [00:00<00:00, 146479.92it/s]

2024-10-17 19:07:24> Valid proteins: 0
2024-10-17 19:07:24> Invalid proteins: 0
2024-10-17 19:07:24> Existing proteins: 306





# Construct Alphafold Dataframe (Calculate Accessibilities) - 50uM MsrAKD

In [26]:
# Format alphafold data into dataframe
alphafold_annotation_50uM_MsrAKD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_50uM_MsrAKD

100%|██████████| 1663/1663 [00:26<00:00, 62.87it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,13.872,14.366,13.031,unstructured,unstructured,0,0,0,0,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,12.453,11.875,13.105,unstructured,unstructured,0,0,0,0,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,14.369,14.189,13.491,unstructured,unstructured,0,0,0,0,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,15.056,16.402,15.048,unstructured,unstructured,0,0,0,0,1
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,13.647,12.336,13.764,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172886,Q9Y6I3,306,N,572,54.72,25.101,25.186,24.532,24.493,-18.764,...,-39.681,-38.286,-40.506,HELX_LH_PP_P,HELX,0,1,0,0,0
172887,Q9Y6I3,306,P,573,65.63,26.835,26.186,26.972,26.212,-21.764,...,-40.727,-42.041,-40.399,unstructured,unstructured,0,0,0,0,1
172888,Q9Y6I3,306,F,574,63.21,25.232,26.486,27.464,26.097,-23.913,...,-37.640,-36.539,-38.553,unstructured,unstructured,0,0,0,0,1
172889,Q9Y6I3,306,L,575,52.36,24.683,23.920,22.511,24.754,-27.247,...,-37.513,-38.155,-37.901,unstructured,unstructured,0,0,0,0,1


In [27]:
# Calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 306/306 [00:03<00:00, 76.97it/s] 


In [28]:
alphafold_accessibility_50uM_MsrAKD = alphafold_annotation_50uM_MsrAKD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [29]:
# Calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 306/306 [00:02<00:00, 115.79it/s]


In [30]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [31]:
# Calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 306/306 [00:02<00:00, 117.41it/s]


In [32]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [33]:
# Calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 306/306 [00:02<00:00, 110.60it/s]


In [34]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [35]:
# Calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 306/306 [00:02<00:00, 113.83it/s]


In [36]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [37]:
# Calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 306/306 [00:02<00:00, 114.46it/s]


In [38]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [39]:
# Calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 306/306 [00:03<00:00, 99.87it/s] 


In [40]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [41]:
# Calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 306/306 [00:02<00:00, 111.84it/s]


In [42]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [43]:
# Calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 306/306 [00:02<00:00, 112.19it/s]


In [44]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [45]:
# Calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 306/306 [00:02<00:00, 104.51it/s]


In [46]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [47]:
# Calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 306/306 [00:02<00:00, 109.29it/s]


In [48]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [49]:
# Calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 306/306 [00:02<00:00, 102.76it/s]


In [50]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [51]:
# Calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 306/306 [00:03<00:00, 85.40it/s] 


In [52]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [53]:
# Calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 306/306 [00:04<00:00, 67.99it/s]


In [54]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD;

In [55]:
# Calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrAKD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 306/306 [00:03<00:00, 98.57it/s] 


In [56]:
alphafold_accessibility_50uM_MsrAKD = alphafold_accessibility_50uM_MsrAKD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrAKD

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1,1,1,1,1,1,2,3,5,0
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,1,2,2,2,2,2,3,4,6,0
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,1,2,2,2,2,2,4,5,7,0
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,1,2,2,2,2,2,4,6,7,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,1,2,2,2,2,2,4,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172886,Q9Y6I3,306,N,572,54.72,25.101,25.186,24.532,24.493,-18.764,...,1,2,2,2,2,2,4,6,9,0
172887,Q9Y6I3,306,P,573,65.63,26.835,26.186,26.972,26.212,-21.764,...,2,2,2,2,2,2,4,7,8,0
172888,Q9Y6I3,306,F,574,63.21,25.232,26.486,27.464,26.097,-23.913,...,2,2,2,2,2,2,4,6,7,0
172889,Q9Y6I3,306,L,575,52.36,24.683,23.920,22.511,24.754,-27.247,...,1,2,2,2,2,2,3,4,6,0


In [57]:
alphafold_accessibility_50uM_MsrAKD_smooth = get_smooth_score(
    alphafold_accessibility_50uM_MsrAKD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_50uM_MsrAKD_smooth;

100%|██████████| 306/306 [00:00<00:00, 481.05it/s]


In [58]:
alphafold_accessibility_50uM_MsrAKD_smooth['IDR'] = np.where(
    alphafold_accessibility_50uM_MsrAKD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_50uM_MsrAKD_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1.909091,1.909091,1.909091,1.909091,1.909091,3.818182,6.636364,12.181818,0.000000,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,2.083333,2.166667,2.166667,2.250000,2.250000,4.750000,9.916667,20.500000,0.416667,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,2.153846,2.230769,2.307692,2.615385,2.692308,5.538462,13.076923,28.923077,0.615385,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,2.214286,2.285714,2.571429,3.000000,3.071429,6.642857,16.857143,38.000000,0.714286,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,2.400000,2.533333,2.866667,3.466667,3.600000,8.133333,20.533333,47.666667,1.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,Q9Y6I3,306,N,572,54.72,25.101,25.186,24.532,24.493,-18.764,...,1.933333,1.933333,1.933333,1.933333,1.933333,3.800000,5.933333,8.866667,0.000000,1
572,Q9Y6I3,306,P,573,65.63,26.835,26.186,26.972,26.212,-21.764,...,1.928571,1.928571,1.928571,1.928571,1.928571,3.785714,5.928571,8.785714,0.000000,1
573,Q9Y6I3,306,F,574,63.21,25.232,26.486,27.464,26.097,-23.913,...,1.923077,1.923077,1.923077,1.923077,1.923077,3.769231,5.923077,8.692308,0.000000,1
574,Q9Y6I3,306,L,575,52.36,24.683,23.920,22.511,24.754,-27.247,...,1.916667,1.916667,1.916667,1.916667,1.916667,3.750000,5.916667,8.666667,0.000000,1


# Merge Dataframes into Full Dataset (Includes Alphafold) - 50uM MsrAKD

In [59]:
alphafold_accessibility_50uM_MsrAKD_smooth["position"] = alphafold_accessibility_50uM_MsrAKD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_wa = peptides_cs.merge(
    alphafold_accessibility_50uM_MsrAKD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,7.087870,,,,,,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0.0
1,AQNLNPM[649.3660]VDVK,AQNLNPM[655.3735]VDVK,,,6.949318,5.759065,,,6.092256,,...,2.428571,3.619048,5.476190,6.523810,7.476190,22.142857,56.619048,113.761905,6.238095,0.0
2,TVMLIPGDKM[649.3660]NEIMDK,TVMLIPGDKM[655.3735]NEIMDK,4.534985,5.059869,,,,,,,...,2.333333,4.142857,5.333333,7.285714,8.047619,17.428571,52.476190,100.619048,4.142857,0.0
3,STM[649.3660]KPVQK,STM[655.3735]KPVQK,,,,,,3.438903,,,...,2.523810,4.285714,6.238095,7.619048,8.095238,22.190476,64.095238,127.857143,6.904762,0.0
4,EVHTQAENAEFM[649.3660]R,EVHTQAENAEFM[655.3735]R,,,,2.174587,,2.172702,2.647530,2.861742,...,2.333333,4.666667,5.809524,7.523810,8.190476,19.190476,49.571429,96.523810,4.857143,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,M[649.3660]AAPIDR,M[655.3735]AAPIDR,0.015240,,,0.288191,,,,,...,1.952381,1.952381,2.000000,2.000000,2.095238,5.095238,8.285714,11.428571,0.333333,1.0
449,VDGM[649.3660]NAPK,VDGM[655.3735]NAPK,,-0.123573,-0.012968,0.002598,,0.073736,0.028475,,...,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.095238,9.952381,0.047619,1.0
450,TLAM[649.3660]DTILANAR,TLAM[655.3735]DTILANAR,,,,0.049173,-0.012708,,-0.330627,-0.044101,...,2.619048,4.095238,5.190476,6.238095,7.666667,18.333333,48.333333,95.190476,4.380952,0.0
451,QREMEEQM[649.3660]RR,QREMEEQM[655.3735]RR,-0.109944,0.125932,0.078497,-0.108901,-0.009808,0.147562,0.017649,-0.089849,...,1.857143,2.000000,2.619048,3.857143,4.476190,8.952381,15.142857,21.047619,1.904762,1.0


In [60]:
# note: one row with missing AA steric analysis: AlphaFold seems to have a sequence that's 59 AA long, but UniProt has a sequence that's 133 AA long
display(peptides_wa[peptides_wa["Protein ID"] == "P62861"])
peptides_wa = peptides_wa[~(peptides_wa["Protein ID"] == "P62861")]
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
195,RRM[649.3660]QYNR,RRM[655.3735]QYNR,,-0.323883,-0.048357,-0.195666,-0.313184,-0.105649,,,...,,,,,,,,,,


Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,7.087870,,,,,,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0.0
1,AQNLNPM[649.3660]VDVK,AQNLNPM[655.3735]VDVK,,,6.949318,5.759065,,,6.092256,,...,2.428571,3.619048,5.476190,6.523810,7.476190,22.142857,56.619048,113.761905,6.238095,0.0
2,TVMLIPGDKM[649.3660]NEIMDK,TVMLIPGDKM[655.3735]NEIMDK,4.534985,5.059869,,,,,,,...,2.333333,4.142857,5.333333,7.285714,8.047619,17.428571,52.476190,100.619048,4.142857,0.0
3,STM[649.3660]KPVQK,STM[655.3735]KPVQK,,,,,,3.438903,,,...,2.523810,4.285714,6.238095,7.619048,8.095238,22.190476,64.095238,127.857143,6.904762,0.0
4,EVHTQAENAEFM[649.3660]R,EVHTQAENAEFM[655.3735]R,,,,2.174587,,2.172702,2.647530,2.861742,...,2.333333,4.666667,5.809524,7.523810,8.190476,19.190476,49.571429,96.523810,4.857143,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,M[649.3660]AAPIDR,M[655.3735]AAPIDR,0.015240,,,0.288191,,,,,...,1.952381,1.952381,2.000000,2.000000,2.095238,5.095238,8.285714,11.428571,0.333333,1.0
449,VDGM[649.3660]NAPK,VDGM[655.3735]NAPK,,-0.123573,-0.012968,0.002598,,0.073736,0.028475,,...,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.095238,9.952381,0.047619,1.0
450,TLAM[649.3660]DTILANAR,TLAM[655.3735]DTILANAR,,,,0.049173,-0.012708,,-0.330627,-0.044101,...,2.619048,4.095238,5.190476,6.238095,7.666667,18.333333,48.333333,95.190476,4.380952,0.0
451,QREMEEQM[649.3660]RR,QREMEEQM[655.3735]RR,-0.109944,0.125932,0.078497,-0.108901,-0.009808,0.147562,0.017649,-0.089849,...,1.857143,2.000000,2.619048,3.857143,4.476190,8.952381,15.142857,21.047619,1.904762,1.0


In [61]:
#peptides_wa.to_csv(os.path.join(curr_dir_path, "50uM_MsrAKD_with_alphafold.csv"))

In [62]:
path = os.path.join(curr_dir_path, "50uM_MsrAKD_with_alphafold.csv")
peptides_wa = pd.read_csv(path)
peptides_wa.set_index("Unnamed: 0", inplace=True)
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,7.087870,,,,,,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0.0
1,AQNLNPM[649.3660]VDVK,AQNLNPM[655.3735]VDVK,,,6.949318,5.759065,,,6.092256,,...,2.428571,3.619048,5.476190,6.523810,7.476190,22.142857,56.619048,113.761905,6.238095,0.0
2,TVMLIPGDKM[649.3660]NEIMDK,TVMLIPGDKM[655.3735]NEIMDK,4.534985,5.059869,,,,,,,...,2.333333,4.142857,5.333333,7.285714,8.047619,17.428571,52.476190,100.619048,4.142857,0.0
3,STM[649.3660]KPVQK,STM[655.3735]KPVQK,,,,,,3.438903,,,...,2.523810,4.285714,6.238095,7.619048,8.095238,22.190476,64.095238,127.857143,6.904762,0.0
4,EVHTQAENAEFM[649.3660]R,EVHTQAENAEFM[655.3735]R,,,,2.174587,,2.172702,2.647530,2.861742,...,2.333333,4.666667,5.809524,7.523810,8.190476,19.190476,49.571429,96.523810,4.857143,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,M[649.3660]AAPIDR,M[655.3735]AAPIDR,0.015240,,,0.288191,,,,,...,1.952381,1.952381,2.000000,2.000000,2.095238,5.095238,8.285714,11.428571,0.333333,1.0
449,VDGM[649.3660]NAPK,VDGM[655.3735]NAPK,,-0.123573,-0.012968,0.002598,,0.073736,0.028475,,...,2.000000,2.000000,2.000000,2.000000,2.000000,3.904762,6.095238,9.952381,0.047619,1.0
450,TLAM[649.3660]DTILANAR,TLAM[655.3735]DTILANAR,,,,0.049173,-0.012708,,-0.330627,-0.044101,...,2.619048,4.095238,5.190476,6.238095,7.666667,18.333333,48.333333,95.190476,4.380952,0.0
451,QREMEEQM[649.3660]RR,QREMEEQM[655.3735]RR,-0.109944,0.125932,0.078497,-0.108901,-0.009808,0.147562,0.017649,-0.089849,...,1.857143,2.000000,2.619048,3.857143,4.476190,8.952381,15.142857,21.047619,1.904762,1.0


# Load Dataset - 50uM MsrB2KD

In [63]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "10_01_24_Met_MsrKD_Mastersheet.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="50 uM MsrB2 KD")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Met site,Label,Gene,Protein Description,pval,average ratio,neglog10p
0,IPASKM[649.3660]QEHMR,IPASKM[655.3735]QEHMR,,2.434304,,,,,,2.444069,...,sp|Q15459|SF3A1_HUMAN,Q15459,SF3A1_HUMAN,425,SF3A1_M425,SF3A1,Splicing factor 3A subunit 1,0.000034,2.500033,4.466069
1,IQQM[649.3660]LPDK,IQQM[655.3735]LPDK,,,1.437991,1.289692,1.662225,,,,...,sp|Q8IZ40|RCOR2_HUMAN,Q8IZ40,RCOR2_HUMAN,160,RCOR2_M160,RCOR2,REST corepressor 2,0.000931,1.463303,3.031193
2,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,,...,sp|P29692|EF1D_HUMAN,P29692,EF1D_HUMAN,29,EF1D_M29,EEF1D,Elongation factor 1-delta,0.000937,1.392077,3.028062
3,LM[649.3660]GQIHQLR,LM[655.3735]GQIHQLR,,1.349017,,,,1.314251,,,...,sp|Q8TBA6|GOGA5_HUMAN,Q8TBA6,GOGA5_HUMAN,486,GOGA5_M486,GOLGA5,Golgin subfamily A member 5,0.004210,1.184229,2.375681
4,KQM[649.3660]EYER,KQM[655.3735]EYER,,0.800333,,1.051198,,1.095443,,,...,sp|Q8NE71|ABCF1_HUMAN,Q8NE71,ABCF1_HUMAN,265,ABCF1_M265,ABCF1,ATP-binding cassette sub-family F member 1,0.001148,0.921695,2.940117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,IQM[649.3660]SNLMNQAR,IQM[655.3735]SNLMNQAR,,0.055682,-0.157995,0.119155,,0.005589,-0.075306,0.115623,...,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,72,VATE1_M72,ATP6V1E1,V-type proton ATPase subunit E 1,0.475241,0.004315,0.323086
225,RM[649.3660]NTNPSR,RM[655.3735]NTNPSR,,0.042313,0.066312,-0.157665,,0.080951,,,...,sp|P40429|RL13A_HUMAN,P40429,RL13A_HUMAN,62,RL13A_M62,RPL13A,Large ribosomal subunit protein uL13,0.477439,-0.004475,0.321082
226,RMQEM[649.3660]IAR,RMQEM[655.3735]IAR,,,,,0.819923,,-0.456874,,...,sp|Q15019|SEPT2_HUMAN,Q15019,SEPT2_HUMAN,335,SEPT2_M335,SEPTIN2,Septin-2,0.477585,-0.033629,0.320949
227,RPAEDM[649.3660]EEEQAFKR,RPAEDM[655.3735]EEEQAFKR,-0.269849,,0.195697,,0.138164,-0.258595,0.024214,-0.099087,...,sp|P61978|HNRPK_HUMAN,P61978,HNRPK_HUMAN,27,HNRPK_M27,HNRNPK,Heterogeneous nuclear ribonucleoprotein K,0.480656,0.007210,0.318166


In [64]:
# Canonicalize data - none to do here
peptides;

In [65]:
# Manual labeling of peptides
label_col_data = ["red"] * 32 + ["green"] * 89 + ["blue"] * 16 + ["gray"] * 92
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [66]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q15459' 'Q8IZ40' 'P29692' 'Q8TBA6' 'Q8NE71' 'P35579' 'Q86UP2' 'P54886'
 'P24539' 'P60228' 'Q15233' 'Q9UMX5' 'Q9Y383' 'P52272' 'P12270' 'Q9Y266'
 'Q04323' 'P50990' 'Q13435' 'P61978' 'P14866' 'Q13404' 'Q13310' 'Q9H444'
 'Q02790' 'P16949' 'Q07065' 'Q13263' 'Q99598' 'P12694' 'P36542' 'O95347'
 'Q14011' 'P14625' 'Q96CT7' 'P48643' 'Q99615' 'Q14839' 'P36543' 'Q9NZI8'
 'Q15717' 'O60664' 'P15374' 'P46777' 'P18669' 'Q99623' 'Q15773' 'P15311'
 'Q9H9T3' 'Q16181' 'P46781' 'P62258' 'P11142' 'P31948' 'P06733' 'Q9UQE7'
 'P68104' 'P51003' 'Q7Z739' 'P23246' 'P38646' 'P62841' 'P84098' 'P07910'
 'Q9UMX0' 'P61011' 'P10809' 'Q16543' 'P33176' 'P61247' 'P50454' 'P09496'
 'P60709' 'Q14152' 'Q04760' 'Q9BQ04' 'P26038' 'P22626' 'Q9Y4L1' 'O14744'
 'Q04837' 'P11940' 'Q9NUU7' 'P23786' 'P07108' 'P41227' 'O14497' 'Q16643'
 'A8MXV4' 'P55072' 'P46109' 'P37802' 'Q9UHV9' 'Q9Y617' 'Q15424' 'Q8IWC1'
 'Q9UHX1' 'P61604' 'P08238' 'Q4VCS5' 'Q96AE4' 'Q9UL46' 'Q9HD42' 'P62805'
 'Q15019' 'P17980' 'O14737' 'Q

In [67]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [68]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [69]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Entry Name,Met site,Label,Gene,Protein Description,pval,average ratio,neglog10p,Color,Complete Sequence
0,IPASKM[649.3660]QEHMR,IPASKM[655.3735]QEHMR,,2.434304,,,,,,2.444069,...,SF3A1_HUMAN,425,SF3A1_M425,SF3A1,Splicing factor 3A subunit 1,0.000034,2.500033,4.466069,red,MPAGPVQAVPPPPPVPTEPKQPTEEEASSKEDSAPSKPVVGIIYPP...
1,IQQM[649.3660]LPDK,IQQM[655.3735]LPDK,,,1.437991,1.289692,1.662225,,,,...,RCOR2_HUMAN,160,RCOR2_M160,RCOR2,REST corepressor 2,0.000931,1.463303,3.031193,red,MPSVMEKPSAGSGILSRSRAKTVPNGGQPHSEDDSSEEEHSHDSMI...
2,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,,...,EF1D_HUMAN,29,EF1D_M29,EEF1D,Elongation factor 1-delta,0.000937,1.392077,3.028062,red,MATNFLAHEKIWFDKFKYDDAERRFYEQMNGPVAGASRQENGASVI...
3,LM[649.3660]GQIHQLR,LM[655.3735]GQIHQLR,,1.349017,,,,1.314251,,,...,GOGA5_HUMAN,486,GOGA5_M486,GOLGA5,Golgin subfamily A member 5,0.004210,1.184229,2.375681,red,MSWFVDLAGKAEDLLNRVDQGAATALSRKDNASNIYSKNTDYTELH...
4,KQM[649.3660]EYER,KQM[655.3735]EYER,,0.800333,,1.051198,,1.095443,,,...,ABCF1_HUMAN,265,ABCF1_M265,ABCF1,ATP-binding cassette sub-family F member 1,0.001148,0.921695,2.940117,red,MPKAPKQQPPEPEWIGDGESTSPSDKVVKKGKKDKKIKKTFFEELA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,IQM[649.3660]SNLMNQAR,IQM[655.3735]SNLMNQAR,,0.055682,-0.157995,0.119155,,0.005589,-0.075306,0.115623,...,VATE1_HUMAN,72,VATE1_M72,ATP6V1E1,V-type proton ATPase subunit E 1,0.475241,0.004315,0.323086,gray,MALSDADVQKQIKHMMAFIEQEANEKAEEIDAKAEEEFNIEKGRLV...
225,RM[649.3660]NTNPSR,RM[655.3735]NTNPSR,,0.042313,0.066312,-0.157665,,0.080951,,,...,RL13A_HUMAN,62,RL13A_M62,RPL13A,Large ribosomal subunit protein uL13,0.477439,-0.004475,0.321082,gray,MAEVQVLVLDGRGHLLGRLAAIVAKQVLLGRKVVVVRCEGINISGN...
226,RMQEM[649.3660]IAR,RMQEM[655.3735]IAR,,,,,0.819923,,-0.456874,,...,SEPT2_HUMAN,335,SEPT2_M335,SEPTIN2,Septin-2,0.477585,-0.033629,0.320949,gray,MSKQQPTQFINPETPGYVGFANLPNQVHRKSVKKGFEFTLMVVGES...
227,RPAEDM[649.3660]EEEQAFKR,RPAEDM[655.3735]EEEQAFKR,-0.269849,,0.195697,,0.138164,-0.258595,0.024214,-0.099087,...,HNRPK_HUMAN,27,HNRPK_M27,HNRNPK,Heterogeneous nuclear ribonucleoprotein K,0.480656,0.007210,0.318166,gray,METEQPEETFPNTETNGEFGKRPAEDMEEEQAFKRSRNTDEMVELR...


In [70]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [71]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [72]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [73]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    229
Name: count, dtype: int64

In [74]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [75]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [76]:
# Sanity check - ensure methionine locations are correct (and match Met site numbers from initial dataset)
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
((temp.count("M") == len(temp)) & (peptides_cs["Met site"] == peptides_cs["Methionine Location"]+1)).value_counts()

True    229
Name: count, dtype: int64

In [77]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,IPASKM[649.3660]QEHMR,IPASKM[655.3735]QEHMR,,2.434304,,,,,,2.444069,...,red,MPAGPVQAVPPPPPVPTEPKQPTEEEASSKEDSAPSKPVVGIIYPP...,IPASKMQEHMR,419,11,IPASK,5,424,PAPDEYLVSPITGEKIPASK,QEHMRIGLLDPRWLEQRDRS
1,IQQM[649.3660]LPDK,IQQM[655.3735]LPDK,,,1.437991,1.289692,1.662225,,,,...,red,MPSVMEKPSAGSGILSRSRAKTVPNGGQPHSEDDSSEEEHSHDSMI...,IQQMLPDK,156,8,IQQ,3,159,KVLFEQAFGFHGKCFQRIQQ,LPDKLIPSLVKYYYSWKKTR
2,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,,...,red,MATNFLAHEKIWFDKFKYDDAERRFYEQMNGPVAGASRQENGASVI...,RFYEQMNGPVAGASR,23,15,RFYEQ,5,28,EKIWFDKFKYDDAERRFYEQ,NGPVAGASRQENGASVILRD
3,LM[649.3660]GQIHQLR,LM[655.3735]GQIHQLR,,1.349017,,,,1.314251,,,...,red,MSWFVDLAGKAEDLLNRVDQGAATALSRKDNASNIYSKNTDYTELH...,LMGQIHQLR,484,9,L,1,485,MELEELRHEKEMQREEIQKL,GQIHQLRSELQDMEAQQVNE
4,KQM[649.3660]EYER,KQM[655.3735]EYER,,0.800333,,1.051198,,1.095443,,,...,red,MPKAPKQQPPEPEWIGDGESTSPSDKVVKKGKKDKKIKKTFFEELA...,KQMEYER,262,7,KQ,2,264,KADDPYAHLSKKEKKKLKKQ,EYERQVASLKAANAAENDFS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,IQM[649.3660]SNLMNQAR,IQM[655.3735]SNLMNQAR,,0.055682,-0.157995,0.119155,,0.005589,-0.075306,0.115623,...,gray,MALSDADVQKQIKHMMAFIEQEANEKAEEIDAKAEEEFNIEKGRLV...,IQMSNLMNQAR,69,11,IQ,2,71,KIMEYYEKKEKQIEQQKKIQ,SNLMNQARLKVLRARDDLIT
225,RM[649.3660]NTNPSR,RM[655.3735]NTNPSR,,0.042313,0.066312,-0.157665,,0.080951,,,...,gray,MAEVQVLVLDGRGHLLGRLAAIVAKQVLLGRKVVVVRCEGINISGN...,RMNTNPSR,60,8,R,1,61,NISGNFYRNKLKYLAFLRKR,NTNPSRGPYHFRAPSRIFWR
226,RMQEM[649.3660]IAR,RMQEM[655.3735]IAR,,,,,0.819923,,-0.456874,,...,gray,MSKQQPTQFINPETPGYVGFANLPNQVHRKSVKKGFEFTLMVVGES...,RMQEMIAR,330,8,RMQE,4,334,DMNKDQILLEKEAELRRMQE,IARMQAQMQMQMQGGDGDGG
227,RPAEDM[649.3660]EEEQAFKR,RPAEDM[655.3735]EEEQAFKR,-0.269849,,0.195697,,0.138164,-0.258595,0.024214,-0.099087,...,gray,METEQPEETFPNTETNGEFGKRPAEDMEEEQAFKRSRNTDEMVELR...,RPAEDMEEEQAFKR,21,14,RPAED,5,26,EETFPNTETNGEFGKRPAED,EEEQAFKRSRNTDEMVELRI


In [78]:
# Remove invalid proteins (according to alphafold)
# Here, none!

peptides_cs;

# Download Alphafold Data - 50uM MsrB2KD

In [79]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['Q15459' 'Q8IZ40' 'P29692' 'Q8TBA6' 'Q8NE71' 'P35579' 'Q86UP2' 'P54886'
 'P24539' 'P60228' 'Q15233' 'Q9UMX5' 'Q9Y383' 'P52272' 'P12270' 'Q9Y266'
 'Q04323' 'P50990' 'Q13435' 'P61978' 'P14866' 'Q13404' 'Q13310' 'Q9H444'
 'Q02790' 'P16949' 'Q07065' 'Q13263' 'Q99598' 'P12694' 'P36542' 'O95347'
 'Q14011' 'P14625' 'Q96CT7' 'P48643' 'Q99615' 'Q14839' 'P36543' 'Q9NZI8'
 'Q15717' 'O60664' 'P15374' 'P46777' 'P18669' 'Q99623' 'Q15773' 'P15311'
 'Q9H9T3' 'Q16181' 'P46781' 'P62258' 'P11142' 'P31948' 'P06733' 'Q9UQE7'
 'P68104' 'P51003' 'Q7Z739' 'P23246' 'P38646' 'P62841' 'P84098' 'P07910'
 'Q9UMX0' 'P61011' 'P10809' 'Q16543' 'P33176' 'P61247' 'P50454' 'P09496'
 'P60709' 'Q14152' 'Q04760' 'Q9BQ04' 'P26038' 'P22626' 'Q9Y4L1' 'O14744'
 'Q04837' 'P11940' 'Q9NUU7' 'P23786' 'P07108' 'P41227' 'O14497' 'Q16643'
 'A8MXV4' 'P55072' 'P46109' 'P37802' 'Q9UHV9' 'Q9Y617' 'Q15424' 'Q8IWC1'
 'Q9UHX1' 'P61604' 'P08238' 'Q4VCS5' 'Q96AE4' 'Q9UL46' 'Q9HD42' 'P62805'
 'Q15019' 'P17980' 'O14737' 'Q

In [80]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 165/165 [00:00<00:00, 133396.33it/s]

2024-10-17 19:08:39> Valid proteins: 0
2024-10-17 19:08:39> Invalid proteins: 0
2024-10-17 19:08:39> Existing proteins: 165





In [81]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 165/165 [00:00<00:00, 141035.29it/s]

2024-10-17 19:08:39> Valid proteins: 0
2024-10-17 19:08:39> Invalid proteins: 0
2024-10-17 19:08:39> Existing proteins: 165





# Construct Alphafold Dataframe (Calculate Accessibilities) - 50uM MsrB2KD

In [82]:
# Format alphafold data into dataframe
alphafold_annotation_50uM_MsrB2KD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_50uM_MsrB2KD

100%|██████████| 1663/1663 [00:13<00:00, 122.10it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,13.872,14.366,13.031,unstructured,unstructured,0,0,0,0,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,12.453,11.875,13.105,unstructured,unstructured,0,0,0,0,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,14.369,14.189,13.491,unstructured,unstructured,0,0,0,0,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,15.056,16.402,15.048,unstructured,unstructured,0,0,0,0,1
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,13.647,12.336,13.764,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88968,Q9Y617,165,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,15.087,16.015,14.291,HELX_RH_AL_P,HELX,0,1,0,0,0
88969,Q9Y617,165,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,17.344,18.096,16.529,HELX_RH_AL_P,HELX,0,1,0,0,0
88970,Q9Y617,165,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,14.388,13.842,15.221,HELX_RH_AL_P,HELX,0,1,0,0,0
88971,Q9Y617,165,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,12.177,11.359,13.111,HELX_RH_AL_P,HELX,0,1,0,0,0


In [83]:
# Calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 165/165 [00:01<00:00, 117.19it/s]


In [84]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_annotation_50uM_MsrB2KD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [85]:
# Calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 165/165 [00:01<00:00, 121.21it/s]


In [86]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [87]:
# Calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 165/165 [00:01<00:00, 124.39it/s]


In [88]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [89]:
# Calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 165/165 [00:01<00:00, 123.85it/s]


In [90]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [91]:
# Calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 165/165 [00:01<00:00, 120.81it/s]


In [92]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [93]:
# Calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 165/165 [00:01<00:00, 122.26it/s]


In [94]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [95]:
# Calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 165/165 [00:01<00:00, 121.54it/s]


In [96]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [97]:
# Calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 165/165 [00:01<00:00, 119.43it/s]


In [98]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [99]:
# Calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 165/165 [00:01<00:00, 113.63it/s]


In [100]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [101]:
# Calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 165/165 [00:01<00:00, 117.85it/s]


In [102]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [103]:
# Calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 165/165 [00:01<00:00, 112.26it/s]


In [104]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [105]:
# Calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 165/165 [00:01<00:00, 95.30it/s] 


In [106]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [107]:
# Calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 165/165 [00:01<00:00, 90.06it/s] 


In [108]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [109]:
# Calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 165/165 [00:02<00:00, 74.67it/s] 


In [110]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD;

In [111]:
# Calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_50uM_MsrB2KD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 165/165 [00:01<00:00, 107.21it/s]


In [112]:
alphafold_accessibility_50uM_MsrB2KD = alphafold_accessibility_50uM_MsrB2KD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_50uM_MsrB2KD

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1,1,1,1,1,1,2,3,5,0
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,1,2,2,2,2,2,3,4,6,0
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,1,2,2,2,2,2,4,5,7,0
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,1,2,2,2,2,2,4,6,7,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,1,2,2,2,2,2,4,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88968,Q9Y617,165,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2,2,3,6,7,7,12,35,70,4
88969,Q9Y617,165,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2,2,3,4,5,5,9,33,62,2
88970,Q9Y617,165,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2,2,3,4,4,5,10,29,54,4
88971,Q9Y617,165,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1,2,2,2,2,2,5,13,30,2


In [113]:
alphafold_accessibility_50uM_MsrB2KD_smooth = get_smooth_score(
    alphafold_accessibility_50uM_MsrB2KD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_50uM_MsrB2KD_smooth;

100%|██████████| 165/165 [00:00<00:00, 606.91it/s]


In [114]:
alphafold_accessibility_50uM_MsrB2KD_smooth['IDR'] = np.where(
    alphafold_accessibility_50uM_MsrB2KD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_50uM_MsrB2KD_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A8MXV4,1,M,1,34.18,7.223,7.574,9.030,7.341,-36.582,...,1.909091,1.909091,1.909091,1.909091,1.909091,3.818182,6.636364,12.181818,0.000000,1
1,A8MXV4,1,S,2,29.59,5.569,5.520,4.112,5.964,-33.823,...,2.083333,2.166667,2.166667,2.250000,2.250000,4.750000,9.916667,20.500000,0.416667,1
2,A8MXV4,1,S,3,29.28,5.596,6.732,8.090,6.621,-30.881,...,2.153846,2.230769,2.307692,2.615385,2.692308,5.538462,13.076923,28.923077,0.615385,1
3,A8MXV4,1,S,4,27.39,4.474,3.768,3.026,4.746,-28.235,...,2.214286,2.285714,2.571429,3.000000,3.071429,6.642857,16.857143,38.000000,0.714286,0
4,A8MXV4,1,L,5,28.73,3.667,4.309,3.682,4.070,-25.450,...,2.400000,2.533333,2.866667,3.466667,3.600000,8.133333,20.533333,47.666667,1.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,Q9Y617,165,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2.000000,4.266667,5.333333,6.466667,6.733333,17.133333,52.133333,84.600000,6.000000,0
366,Q9Y617,165,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2.000000,4.142857,5.285714,6.357143,6.642857,16.857143,51.285714,82.857143,5.928571,0
367,Q9Y617,165,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2.000000,4.076923,5.230769,6.307692,6.538462,16.000000,48.769231,79.307692,5.615385,0
368,Q9Y617,165,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1.916667,3.916667,5.166667,6.083333,6.250000,15.333333,46.166667,75.333333,5.333333,0


# Merge Dataframes into Full Dataset (Includes Alphafold) - 50uM MsrB2KD

In [115]:
alphafold_accessibility_50uM_MsrB2KD_smooth["position"] = alphafold_accessibility_50uM_MsrB2KD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_wa = peptides_cs.merge(
    alphafold_accessibility_50uM_MsrB2KD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,IPASKM[649.3660]QEHMR,IPASKM[655.3735]QEHMR,,2.434304,,,,,,2.444069,...,2.047619,2.523810,3.000000,3.761905,4.571429,9.571429,19.190476,27.476190,1.619048,1
1,IQQM[649.3660]LPDK,IQQM[655.3735]LPDK,,,1.437991,1.289692,1.662225,,,,...,2.238095,3.714286,4.571429,5.523810,6.476190,14.333333,35.190476,45.952381,3.428571,0
2,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,,...,2.238095,3.238095,3.571429,4.142857,4.380952,7.571429,12.000000,16.190476,1.380952,1
3,LM[649.3660]GQIHQLR,LM[655.3735]GQIHQLR,,1.349017,,,,1.314251,,,...,2.380952,5.190476,6.285714,7.952381,8.000000,13.000000,19.904762,27.666667,2.666667,1
4,KQM[649.3660]EYER,KQM[655.3735]EYER,,0.800333,,1.051198,,1.095443,,,...,2.142857,3.857143,4.619048,6.238095,6.476190,10.952381,16.333333,22.476190,1.857143,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,IQM[649.3660]SNLMNQAR,IQM[655.3735]SNLMNQAR,,0.055682,-0.157995,0.119155,,0.005589,-0.075306,0.115623,...,2.047619,4.761905,6.000000,8.000000,8.000000,13.190476,20.095238,29.095238,2.952381,1
225,RM[649.3660]NTNPSR,RM[655.3735]NTNPSR,,0.042313,0.066312,-0.157665,,0.080951,,,...,2.380952,3.619048,4.333333,5.476190,6.190476,12.952381,28.761905,54.190476,2.857143,0
226,RMQEM[649.3660]IAR,RMQEM[655.3735]IAR,,,,,0.819923,,-0.456874,,...,2.047619,3.476190,4.380952,5.952381,6.285714,11.095238,16.857143,22.809524,2.000000,1
227,RPAEDM[649.3660]EEEQAFKR,RPAEDM[655.3735]EEEQAFKR,-0.269849,,0.195697,,0.138164,-0.258595,0.024214,-0.099087,...,1.809524,1.809524,2.000000,2.000000,2.000000,3.666667,6.047619,9.238095,0.000000,1


In [116]:
#peptides_wa.to_csv(os.path.join(curr_dir_path, "50uM_MsrB2KD_with_alphafold.csv"))

In [117]:
path = os.path.join(curr_dir_path, "50uM_MsrB2KD_with_alphafold.csv")
peptides_wa = pd.read_csv(path)
peptides_wa.set_index("Unnamed: 0", inplace=True)
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,IPASKM[649.3660]QEHMR,IPASKM[655.3735]QEHMR,,2.434304,,,,,,2.444069,...,2.047619,2.523810,3.000000,3.761905,4.571429,9.571429,19.190476,27.476190,1.619048,1
1,IQQM[649.3660]LPDK,IQQM[655.3735]LPDK,,,1.437991,1.289692,1.662225,,,,...,2.238095,3.714286,4.571429,5.523810,6.476190,14.333333,35.190476,45.952381,3.428571,0
2,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,,...,2.238095,3.238095,3.571429,4.142857,4.380952,7.571429,12.000000,16.190476,1.380952,1
3,LM[649.3660]GQIHQLR,LM[655.3735]GQIHQLR,,1.349017,,,,1.314251,,,...,2.380952,5.190476,6.285714,7.952381,8.000000,13.000000,19.904762,27.666667,2.666667,1
4,KQM[649.3660]EYER,KQM[655.3735]EYER,,0.800333,,1.051198,,1.095443,,,...,2.142857,3.857143,4.619048,6.238095,6.476190,10.952381,16.333333,22.476190,1.857143,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,IQM[649.3660]SNLMNQAR,IQM[655.3735]SNLMNQAR,,0.055682,-0.157995,0.119155,,0.005589,-0.075306,0.115623,...,2.047619,4.761905,6.000000,8.000000,8.000000,13.190476,20.095238,29.095238,2.952381,1
225,RM[649.3660]NTNPSR,RM[655.3735]NTNPSR,,0.042313,0.066312,-0.157665,,0.080951,,,...,2.380952,3.619048,4.333333,5.476190,6.190476,12.952381,28.761905,54.190476,2.857143,0
226,RMQEM[649.3660]IAR,RMQEM[655.3735]IAR,,,,,0.819923,,-0.456874,,...,2.047619,3.476190,4.380952,5.952381,6.285714,11.095238,16.857143,22.809524,2.000000,1
227,RPAEDM[649.3660]EEEQAFKR,RPAEDM[655.3735]EEEQAFKR,-0.269849,,0.195697,,0.138164,-0.258595,0.024214,-0.099087,...,1.809524,1.809524,2.000000,2.000000,2.000000,3.666667,6.047619,9.238095,0.000000,1


# Load Dataset - 100uM MsrAKD

In [118]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "10_01_24_Met_MsrKD_Mastersheet.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="100 uM MsrA KD")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Protein ID,Entry Name,Met site,Label,Gene,Protein Description,pval,average ratio,actual ratio,neglog10p
0,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,-1.459176,...,P29692,EF1D_HUMAN,29,EF1D_M29,EEF1D,Elongation factor 1-delta,5.725075e-04,-1.411377,1.411377,3.242219
1,DSDSGLEQM[649.3660]SIGHHIR,DSDSGLEQM[655.3735]SIGHHIR,-0.987823,-0.958346,-0.907636,-1.029497,-0.812127,-1.234694,-0.450931,-1.072178,...,Q15773,MLF2_HUMAN,151,MLF2_M151,MLF2,Myeloid leukemia factor 2,4.377829e-03,-1.088260,1.088260,2.358741
2,SMM[649.3660]SYER,SMM[655.3735]SYER,-1.883411,,,,-1.217990,-0.646863,,-0.717885,...,P18583,SON_HUMAN,1018,SON_M1018,SON,Protein SON,9.609837e-03,-1.064925,1.064925,2.017284
3,LMQLM[649.3660]ESEQK,LMQLM[655.3735]ESEQK,-0.923502,,,-1.171454,-1.012287,-0.755677,-1.619834,,...,Q86UP2,KTN1_HUMAN,554,KTN1_M554,KTN1,Kinectin,7.946650e-04,-1.061351,1.061351,3.099816
4,FQHQGFGM[649.3660]LLMEEAER,FQHQGFGM[655.3735]LLMEEAER,-0.974651,-1.024898,,-1.003069,-1.034371,,,,...,Q9H9T3,ELP3_HUMAN,499,ELP3_M499,ELP3,Elongator complex protein 3,2.506647e-08,-1.015595,1.015595,7.600907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,RLLKEEDM[649.3660]TK,RLLKEEDM[655.3735]TK,0.059652,-0.077560,-0.266137,0.298445,0.029505,-0.001940,0.007283,0.027519,...,P38919,IF4A3_HUMAN,23,IF4A3_M23,EIF4A3,Eukaryotic initiation factor 4A-III,4.668313e-01,0.010588,-0.010588,0.330840
291,M[649.3660]ADKDGDLIATK,M[655.3735]ADKDGDLIATK,0.095942,,-0.317916,,-0.667125,0.242408,0.428816,0.402582,...,O43852,CALU_HUMAN,162,CALU_M162,CALU,Calumenin,4.700893e-01,0.024475,-0.024475,0.327820
292,LRQENM[649.3660]ELAER,LRQENM[655.3735]ELAER,,,,0.047247,,,,-0.134433,...,P40222,TXLNA_HUMAN,303,TXLNA_M303,TXLNA,Alpha-taxilin,4.723186e-01,-0.006267,0.006267,0.325765
293,MDM[649.3660]SLDDIIK,MDM[655.3735]SLDDIIK,,,,,-0.120082,,,,...,Q86V81,THOC4_HUMAN,7,THOC4_M7,ALYREF,THO complex subunit 4,4.979716e-01,-0.000479,0.000479,0.302795


In [119]:
# Canonicalize data - none to do here
peptides;

In [120]:
# Manual labeling of peptides
label_col_data = ["red"] * 44 + ["green"] * 110 + ["blue"] * 57 + ["gray"] * 84
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [121]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P29692' 'Q15773' 'P18583' 'Q86UP2' 'Q9H9T3' 'Q9UMX5' 'Q9UMX0' 'Q16181'
 'O75533' 'P61011' 'P05141' 'Q15233' 'P52272' 'Q8WXF1' 'P23246' 'Q04323'
 'P68104' 'Q15366' 'Q14683' 'Q9H444' 'Q13404' 'Q9Y617' 'P51572' 'Q8IWC1'
 'Q9Y3U8' 'Q96PK6' 'Q99848' 'O14737' 'Q92900' 'P50990' 'P07910' 'P37802'
 'Q07065' 'Q13263' 'P11021' 'Q99436' 'P48643' 'P12694' 'P67870' 'Q14011'
 'P36542' 'P36543' 'P22307' 'Q96CT7' 'O60664' 'Q99615' 'P80303' 'P14625'
 'P15374' 'Q9NTZ6' 'Q9NZI8' 'Q99623' 'P09874' 'P15311' 'O95347' 'Q13283'
 'P60228' 'P83731' 'P26038' 'P11940' 'P62258' 'Q14204' 'Q15717' 'P51003'
 'P35579' 'P46777' 'P08238' 'Q4VCS5' 'P40222' 'P18669' 'P07900' 'Q96LB3'
 'P82675' 'P51532' 'Q9HD42' 'Q7Z739' 'P11310' 'P31948' 'P11142' 'P43243'
 'P82970' 'P84098' 'P62841' 'P09496' 'Q02790' 'P40429' 'P53999' 'Q9NUU7'
 'P60709' 'O60841' 'P38646' 'Q15637' 'Q15424' 'P39023' 'P50454' 'Q9UJY5'
 'P61247' 'Q00587' 'P07108' 'P46779' 'Q96AE4' 'P22626' 'Q9Y2S6' 'Q13435'
 'P33176' 'P05023' 'Q9Y4L1' 'Q

In [122]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [123]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [124]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Met site,Label,Gene,Protein Description,pval,average ratio,actual ratio,neglog10p,Color,Complete Sequence
0,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,-1.459176,...,29,EF1D_M29,EEF1D,Elongation factor 1-delta,5.725075e-04,-1.411377,1.411377,3.242219,red,MATNFLAHEKIWFDKFKYDDAERRFYEQMNGPVAGASRQENGASVI...
1,DSDSGLEQM[649.3660]SIGHHIR,DSDSGLEQM[655.3735]SIGHHIR,-0.987823,-0.958346,-0.907636,-1.029497,-0.812127,-1.234694,-0.450931,-1.072178,...,151,MLF2_M151,MLF2,Myeloid leukemia factor 2,4.377829e-03,-1.088260,1.088260,2.358741,red,MFRFMRDVEPEDPMFLMDPFAIHRQHMSRMLSGGFGYSPFLSITDG...
2,SMM[649.3660]SYER,SMM[655.3735]SYER,-1.883411,,,,-1.217990,-0.646863,,-0.717885,...,1018,SON_M1018,SON,Protein SON,9.609837e-03,-1.064925,1.064925,2.017284,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...
3,LMQLM[649.3660]ESEQK,LMQLM[655.3735]ESEQK,-0.923502,,,-1.171454,-1.012287,-0.755677,-1.619834,,...,554,KTN1_M554,KTN1,Kinectin,7.946650e-04,-1.061351,1.061351,3.099816,red,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...
4,FQHQGFGM[649.3660]LLMEEAER,FQHQGFGM[655.3735]LLMEEAER,-0.974651,-1.024898,,-1.003069,-1.034371,,,,...,499,ELP3_M499,ELP3,Elongator complex protein 3,2.506647e-08,-1.015595,1.015595,7.600907,red,MRQKRKGDLSPAELMMLTIGDVIKQLIEAHEQGKDIDLNKVKTKTA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,RLLKEEDM[649.3660]TK,RLLKEEDM[655.3735]TK,0.059652,-0.077560,-0.266137,0.298445,0.029505,-0.001940,0.007283,0.027519,...,23,IF4A3_M23,EIF4A3,Eukaryotic initiation factor 4A-III,4.668313e-01,0.010588,-0.010588,0.330840,gray,MATTATMATSGSARKRLLKEEDMTKVEFETSEEVDVTPTFDTMGLR...
291,M[649.3660]ADKDGDLIATK,M[655.3735]ADKDGDLIATK,0.095942,,-0.317916,,-0.667125,0.242408,0.428816,0.402582,...,162,CALU_M162,CALU,Calumenin,4.700893e-01,0.024475,-0.024475,0.327820,gray,MDLRQFLMCLSLCTAFALSKPTEKKDRVHHEPQLSDKVHNDAQSFD...
292,LRQENM[649.3660]ELAER,LRQENM[655.3735]ELAER,,,,0.047247,,,,-0.134433,...,303,TXLNA_M303,TXLNA,Alpha-taxilin,4.723186e-01,-0.006267,0.006267,0.325765,gray,MKNQDKKNGAAKQSNPKSSPGQPEAGPEGAQERPSQAAPAVEAEGP...
293,MDM[649.3660]SLDDIIK,MDM[655.3735]SLDDIIK,,,,,-0.120082,,,,...,7,THOC4_M7,ALYREF,THO complex subunit 4,4.979716e-01,-0.000479,0.000479,0.302795,gray,MADKMDMSLDDIIKLNRSQRGGRGGGRGRGRAGSQGGRGGGAQAAA...


In [125]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [126]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [127]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [128]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    295
Name: count, dtype: int64

In [129]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [130]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [131]:
# Sanity check - ensure methionine locations are correct (and match Met site numbers from initial dataset)
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
((temp.count("M") == len(temp)) & (peptides_cs["Met site"] == peptides_cs["Methionine Location"]+1)).value_counts()

True    295
Name: count, dtype: int64

In [132]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,-1.459176,...,red,MATNFLAHEKIWFDKFKYDDAERRFYEQMNGPVAGASRQENGASVI...,RFYEQMNGPVAGASR,23,15,RFYEQ,5,28,EKIWFDKFKYDDAERRFYEQ,NGPVAGASRQENGASVILRD
1,DSDSGLEQM[649.3660]SIGHHIR,DSDSGLEQM[655.3735]SIGHHIR,-0.987823,-0.958346,-0.907636,-1.029497,-0.812127,-1.234694,-0.450931,-1.072178,...,red,MFRFMRDVEPEDPMFLMDPFAIHRQHMSRMLSGGFGYSPFLSITDG...,DSDSGLEQMSIGHHIR,142,16,DSDSGLEQ,8,150,PGGIRETRRTVRDSDSGLEQ,SIGHHIRDRAHILQRSRNHR
2,SMM[649.3660]SYER,SMM[655.3735]SYER,-1.883411,,,,-1.217990,-0.646863,,-0.717885,...,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSYER,1015,7,SM,2,1017,SMMMSYAAERSMMSSYERSM,SYERSMMSPMAERSMMSAYE
3,LMQLM[649.3660]ESEQK,LMQLM[655.3735]ESEQK,-0.923502,,,-1.171454,-1.012287,-0.755677,-1.619834,,...,red,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,LMQLMESEQK,549,10,LMQL,4,553,SKLTDTLVSKQQLEQRLMQL,ESEQKRVNKEESLQMQVQDI
4,FQHQGFGM[649.3660]LLMEEAER,FQHQGFGM[655.3735]LLMEEAER,-0.974651,-1.024898,,-1.003069,-1.034371,,,,...,red,MRQKRKGDLSPAELMMLTIGDVIKQLIEAHEQGKDIDLNKVKTKTA...,FQHQGFGMLLMEEAER,491,16,FQHQGFG,7,498,GSVVPVSSRDPTKFQHQGFG,LLMEEAERIAREEHGSGKIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,RLLKEEDM[649.3660]TK,RLLKEEDM[655.3735]TK,0.059652,-0.077560,-0.266137,0.298445,0.029505,-0.001940,0.007283,0.027519,...,gray,MATTATMATSGSARKRLLKEEDMTKVEFETSEEVDVTPTFDTMGLR...,RLLKEEDMTK,15,10,RLLKEED,7,22,TTATMATSGSARKRLLKEED,TKVEFETSEEVDVTPTFDTM
291,M[649.3660]ADKDGDLIATK,M[655.3735]ADKDGDLIATK,0.095942,,-0.317916,,-0.667125,0.242408,0.428816,0.402582,...,gray,MDLRQFLMCLSLCTAFALSKPTEKKDRVHHEPQLSDKVHNDAQSFD...,MADKDGDLIATK,161,12,,0,161,DPDDGFNYKQMMVRDERRFK,ADKDGDLIATKEEFTAFLHP
292,LRQENM[649.3660]ELAER,LRQENM[655.3735]ELAER,,,,0.047247,,,,-0.134433,...,gray,MKNQDKKNGAAKQSNPKSSPGQPEAGPEGAQERPSQAAPAVEAEGP...,LRQENMELAER,297,11,LRQEN,5,302,DIQLQMEQHNERNSKLRQEN,ELAERLKKLIEQYELREEHI
293,MDM[649.3660]SLDDIIK,MDM[655.3735]SLDDIIK,,,,,-0.120082,,,,...,gray,MADKMDMSLDDIIKLNRSQRGGRGGGRGRGRAGSQGGRGGGAQAAA...,MDMSLDDIIK,4,10,MD,2,6,MADKM,SLDDIIKLNRSQRGGRGGGR


In [133]:
# Remove invalid proteins (according to alphafold)
# 1 invalid peptide as a result -> 1 green

invalid_IDs = ['Q14204']
display(peptides_cs[peptides_cs["Protein ID"].isin(invalid_IDs)])
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
76,KVM[649.3660]SQEIQEQLHK,KVM[655.3735]SQEIQEQLHK,,,,,,,-0.726297,-0.656295,...,green,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,KVMSQEIQEQLHK,3253,13,KV,2,3255,ANDKLKKMVKDQQEAEKKKV,SQEIQEQLHKQQEVIADKQM


Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,-1.459176,...,red,MATNFLAHEKIWFDKFKYDDAERRFYEQMNGPVAGASRQENGASVI...,RFYEQMNGPVAGASR,23,15,RFYEQ,5,28,EKIWFDKFKYDDAERRFYEQ,NGPVAGASRQENGASVILRD
1,DSDSGLEQM[649.3660]SIGHHIR,DSDSGLEQM[655.3735]SIGHHIR,-0.987823,-0.958346,-0.907636,-1.029497,-0.812127,-1.234694,-0.450931,-1.072178,...,red,MFRFMRDVEPEDPMFLMDPFAIHRQHMSRMLSGGFGYSPFLSITDG...,DSDSGLEQMSIGHHIR,142,16,DSDSGLEQ,8,150,PGGIRETRRTVRDSDSGLEQ,SIGHHIRDRAHILQRSRNHR
2,SMM[649.3660]SYER,SMM[655.3735]SYER,-1.883411,,,,-1.217990,-0.646863,,-0.717885,...,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSYER,1015,7,SM,2,1017,SMMMSYAAERSMMSSYERSM,SYERSMMSPMAERSMMSAYE
3,LMQLM[649.3660]ESEQK,LMQLM[655.3735]ESEQK,-0.923502,,,-1.171454,-1.012287,-0.755677,-1.619834,,...,red,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,LMQLMESEQK,549,10,LMQL,4,553,SKLTDTLVSKQQLEQRLMQL,ESEQKRVNKEESLQMQVQDI
4,FQHQGFGM[649.3660]LLMEEAER,FQHQGFGM[655.3735]LLMEEAER,-0.974651,-1.024898,,-1.003069,-1.034371,,,,...,red,MRQKRKGDLSPAELMMLTIGDVIKQLIEAHEQGKDIDLNKVKTKTA...,FQHQGFGMLLMEEAER,491,16,FQHQGFG,7,498,GSVVPVSSRDPTKFQHQGFG,LLMEEAERIAREEHGSGKIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,RLLKEEDM[649.3660]TK,RLLKEEDM[655.3735]TK,0.059652,-0.077560,-0.266137,0.298445,0.029505,-0.001940,0.007283,0.027519,...,gray,MATTATMATSGSARKRLLKEEDMTKVEFETSEEVDVTPTFDTMGLR...,RLLKEEDMTK,15,10,RLLKEED,7,22,TTATMATSGSARKRLLKEED,TKVEFETSEEVDVTPTFDTM
291,M[649.3660]ADKDGDLIATK,M[655.3735]ADKDGDLIATK,0.095942,,-0.317916,,-0.667125,0.242408,0.428816,0.402582,...,gray,MDLRQFLMCLSLCTAFALSKPTEKKDRVHHEPQLSDKVHNDAQSFD...,MADKDGDLIATK,161,12,,0,161,DPDDGFNYKQMMVRDERRFK,ADKDGDLIATKEEFTAFLHP
292,LRQENM[649.3660]ELAER,LRQENM[655.3735]ELAER,,,,0.047247,,,,-0.134433,...,gray,MKNQDKKNGAAKQSNPKSSPGQPEAGPEGAQERPSQAAPAVEAEGP...,LRQENMELAER,297,11,LRQEN,5,302,DIQLQMEQHNERNSKLRQEN,ELAERLKKLIEQYELREEHI
293,MDM[649.3660]SLDDIIK,MDM[655.3735]SLDDIIK,,,,,-0.120082,,,,...,gray,MADKMDMSLDDIIKLNRSQRGGRGGGRGRGRAGSQGGRGGGAQAAA...,MDMSLDDIIK,4,10,MD,2,6,MADKM,SLDDIIKLNRSQRGGRGGGR


# Download Alphafold Data - 100uM MsrAKD

In [134]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P29692' 'Q15773' 'P18583' 'Q86UP2' 'Q9H9T3' 'Q9UMX5' 'Q9UMX0' 'Q16181'
 'O75533' 'P61011' 'P05141' 'Q15233' 'P52272' 'Q8WXF1' 'P23246' 'Q04323'
 'P68104' 'Q15366' 'Q14683' 'Q9H444' 'Q13404' 'Q9Y617' 'P51572' 'Q8IWC1'
 'Q9Y3U8' 'Q96PK6' 'Q99848' 'O14737' 'Q92900' 'P50990' 'P07910' 'P37802'
 'Q07065' 'Q13263' 'P11021' 'Q99436' 'P48643' 'P12694' 'P67870' 'Q14011'
 'P36542' 'P36543' 'P22307' 'Q96CT7' 'O60664' 'Q99615' 'P80303' 'P14625'
 'P15374' 'Q9NTZ6' 'Q9NZI8' 'Q99623' 'P09874' 'P15311' 'O95347' 'Q13283'
 'P60228' 'P83731' 'P26038' 'P11940' 'P62258' 'Q15717' 'P51003' 'P35579'
 'P46777' 'P08238' 'Q4VCS5' 'P40222' 'P18669' 'P07900' 'Q96LB3' 'P82675'
 'P51532' 'Q9HD42' 'Q7Z739' 'P11310' 'P31948' 'P11142' 'P43243' 'P82970'
 'P84098' 'P62841' 'P09496' 'Q02790' 'P40429' 'P53999' 'Q9NUU7' 'P60709'
 'O60841' 'P38646' 'Q15637' 'Q15424' 'P39023' 'P50454' 'Q9UJY5' 'P61247'
 'Q00587' 'P07108' 'P46779' 'Q96AE4' 'P22626' 'Q9Y2S6' 'Q13435' 'P33176'
 'P05023' 'Q9Y4L1' 'Q04837' 'Q

In [135]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 197/197 [00:00<00:00, 150041.38it/s]

2024-10-17 19:09:16> Valid proteins: 0
2024-10-17 19:09:16> Invalid proteins: 0
2024-10-17 19:09:16> Existing proteins: 197





In [136]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 197/197 [00:00<00:00, 34598.35it/s]

2024-10-17 19:09:16> Valid proteins: 0
2024-10-17 19:09:16> Invalid proteins: 0
2024-10-17 19:09:16> Existing proteins: 197





# Construct Alphafold Dataframe (Calculate Accessibilities) - 100uM MsrAKD

In [137]:
# Format alphafold data into dataframe
alphafold_annotation_100uM_MsrAKD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_100uM_MsrAKD

100%|██████████| 1663/1663 [00:16<00:00, 103.02it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,-12.734,-11.583,-13.004,unstructured,unstructured,0,0,0,0,1
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,-15.819,,-14.660,unstructured,unstructured,0,0,0,0,1
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,-15.414,-16.107,-15.836,unstructured,unstructured,0,0,0,0,1
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,-14.858,-13.434,-14.749,HELX_RH_3T_P,HELX,0,1,0,0,0
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,-17.473,,-16.633,HELX_RH_3T_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107432,Q9Y617,197,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,15.087,16.015,14.291,HELX_RH_AL_P,HELX,0,1,0,0,0
107433,Q9Y617,197,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,17.344,18.096,16.529,HELX_RH_AL_P,HELX,0,1,0,0,0
107434,Q9Y617,197,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,14.388,13.842,15.221,HELX_RH_AL_P,HELX,0,1,0,0,0
107435,Q9Y617,197,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,12.177,11.359,13.111,HELX_RH_AL_P,HELX,0,1,0,0,0


In [138]:
# Calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 197/197 [00:01<00:00, 107.12it/s]


In [139]:
alphafold_accessibility_100uM_MsrAKD = alphafold_annotation_100uM_MsrAKD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [140]:
# Calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 197/197 [00:01<00:00, 125.11it/s]


In [141]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [142]:
# Calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 197/197 [00:01<00:00, 123.31it/s]


In [143]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [144]:
# Calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 197/197 [00:01<00:00, 115.36it/s]


In [145]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [146]:
# Calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 197/197 [00:02<00:00, 94.98it/s] 


In [147]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [148]:
# Calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 197/197 [00:01<00:00, 118.88it/s]


In [149]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [150]:
# Calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 197/197 [00:01<00:00, 119.43it/s]


In [151]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [152]:
# Calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 197/197 [00:01<00:00, 117.80it/s]


In [153]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [154]:
# Calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 197/197 [00:01<00:00, 106.08it/s]


In [155]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [156]:
# Calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 197/197 [00:01<00:00, 110.54it/s]


In [157]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [158]:
# Calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 197/197 [00:01<00:00, 113.70it/s]


In [159]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [160]:
# Calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 197/197 [00:01<00:00, 104.46it/s]


In [161]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [162]:
# Calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 197/197 [00:02<00:00, 89.68it/s] 


In [163]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [164]:
# Calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 197/197 [00:02<00:00, 70.08it/s]


In [165]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD;

In [166]:
# Calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrAKD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 197/197 [00:01<00:00, 107.96it/s]


In [167]:
alphafold_accessibility_100uM_MsrAKD = alphafold_accessibility_100uM_MsrAKD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrAKD

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,1,1,1,1,1,1,2,3,5,0
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,0,2,2,2,2,2,3,5,6,0
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,1,2,2,2,2,2,4,6,7,0
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,1,2,2,2,2,2,5,7,9,0
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,0,2,2,2,2,2,4,8,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107432,Q9Y617,197,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2,2,3,6,7,7,12,35,70,4
107433,Q9Y617,197,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2,2,3,4,5,5,9,33,62,2
107434,Q9Y617,197,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2,2,3,4,4,5,10,29,54,4
107435,Q9Y617,197,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1,2,2,2,2,2,5,13,30,2


In [168]:
alphafold_accessibility_100uM_MsrAKD_smooth = get_smooth_score(
    alphafold_accessibility_100uM_MsrAKD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_100uM_MsrAKD_smooth;

100%|██████████| 197/197 [00:00<00:00, 615.05it/s]


In [169]:
alphafold_accessibility_100uM_MsrAKD_smooth['IDR'] = np.where(
    alphafold_accessibility_100uM_MsrAKD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_100uM_MsrAKD_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,1.818182,1.818182,1.909091,1.909091,2.000000,4.272727,7.545455,10.818182,0.181818,1
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,1.833333,1.833333,1.916667,1.916667,2.000000,4.583333,8.083333,11.750000,0.333333,1
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,1.846154,1.846154,1.923077,2.000000,2.076923,4.846154,8.769231,12.615385,0.307692,1
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,1.857143,1.928571,2.071429,2.142857,2.214286,5.071429,9.285714,13.285714,0.285714,1
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,1.866667,1.933333,2.133333,2.333333,2.466667,5.533333,9.866667,14.066667,0.400000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,Q9Y617,197,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2.000000,4.266667,5.333333,6.466667,6.733333,17.133333,52.133333,84.600000,6.000000,0
366,Q9Y617,197,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2.000000,4.142857,5.285714,6.357143,6.642857,16.857143,51.285714,82.857143,5.928571,0
367,Q9Y617,197,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2.000000,4.076923,5.230769,6.307692,6.538462,16.000000,48.769231,79.307692,5.615385,0
368,Q9Y617,197,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1.916667,3.916667,5.166667,6.083333,6.250000,15.333333,46.166667,75.333333,5.333333,0


# Merge Dataframes into Full Dataset (Includes Alphafold) - 100uM MsrAKD

In [170]:
alphafold_accessibility_100uM_MsrAKD_smooth["position"] = alphafold_accessibility_100uM_MsrAKD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_wa = peptides_cs.merge(
    alphafold_accessibility_100uM_MsrAKD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,-1.459176,...,2.238095,3.238095,3.571429,4.142857,4.380952,7.571429,12.000000,16.190476,1.380952,1
1,DSDSGLEQM[649.3660]SIGHHIR,DSDSGLEQM[655.3735]SIGHHIR,-0.987823,-0.958346,-0.907636,-1.029497,-0.812127,-1.234694,-0.450931,-1.072178,...,2.476190,3.238095,4.666667,6.047619,7.285714,16.333333,43.095238,68.000000,1.380952,0
2,SMM[649.3660]SYER,SMM[655.3735]SYER,-1.883411,,,,-1.217990,-0.646863,,-0.717885,...,2.238095,3.380952,4.428571,5.952381,6.380952,10.904762,17.523810,24.238095,1.476190,1
3,LMQLM[649.3660]ESEQK,LMQLM[655.3735]ESEQK,-0.923502,,,-1.171454,-1.012287,-0.755677,-1.619834,,...,2.000000,2.000000,2.000000,2.142857,3.095238,7.619048,13.190476,18.142857,1.190476,1
4,FQHQGFGM[649.3660]LLMEEAER,FQHQGFGM[655.3735]LLMEEAER,-0.974651,-1.024898,,-1.003069,-1.034371,,,,...,2.190476,3.809524,4.380952,5.619048,5.904762,16.142857,44.000000,85.190476,4.428571,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,RLLKEEDM[649.3660]TK,RLLKEEDM[655.3735]TK,0.059652,-0.077560,-0.266137,0.298445,0.029505,-0.001940,0.007283,0.027519,...,2.047619,2.047619,2.047619,2.285714,2.523810,5.523810,14.285714,28.238095,0.571429,1
290,M[649.3660]ADKDGDLIATK,M[655.3735]ADKDGDLIATK,0.095942,,-0.317916,,-0.667125,0.242408,0.428816,0.402582,...,2.142857,4.095238,5.285714,6.761905,7.333333,17.000000,44.952381,75.380952,4.142857,0
291,LRQENM[649.3660]ELAER,LRQENM[655.3735]ELAER,,,,0.047247,,,,-0.134433,...,2.666667,5.238095,6.095238,8.000000,8.000000,13.571429,20.000000,28.000000,2.952381,1
292,MDM[649.3660]SLDDIIK,MDM[655.3735]SLDDIIK,,,,,-0.120082,,,,...,1.941176,2.411765,2.882353,3.882353,4.352941,7.529412,13.117647,17.235294,1.058824,1


In [171]:
#peptides_wa.to_csv(os.path.join(curr_dir_path, "100uM_MsrAKD_with_alphafold.csv"))

In [172]:
path = os.path.join(curr_dir_path, "100uM_MsrAKD_with_alphafold.csv")
peptides_wa = pd.read_csv(path)
peptides_wa.set_index("Unnamed: 0", inplace=True)
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,RFYEQM[649.3660]NGPVAGASR,RFYEQM[655.3735]NGPVAGASR,,,,,,,,-1.459176,...,2.238095,3.238095,3.571429,4.142857,4.380952,7.571429,12.000000,16.190476,1.380952,1
1,DSDSGLEQM[649.3660]SIGHHIR,DSDSGLEQM[655.3735]SIGHHIR,-0.987823,-0.958346,-0.907636,-1.029497,-0.812127,-1.234694,-0.450931,-1.072178,...,2.476190,3.238095,4.666667,6.047619,7.285714,16.333333,43.095238,68.000000,1.380952,0
2,SMM[649.3660]SYER,SMM[655.3735]SYER,-1.883411,,,,-1.217990,-0.646863,,-0.717885,...,2.238095,3.380952,4.428571,5.952381,6.380952,10.904762,17.523810,24.238095,1.476190,1
3,LMQLM[649.3660]ESEQK,LMQLM[655.3735]ESEQK,-0.923502,,,-1.171454,-1.012287,-0.755677,-1.619834,,...,2.000000,2.000000,2.000000,2.142857,3.095238,7.619048,13.190476,18.142857,1.190476,1
4,FQHQGFGM[649.3660]LLMEEAER,FQHQGFGM[655.3735]LLMEEAER,-0.974651,-1.024898,,-1.003069,-1.034371,,,,...,2.190476,3.809524,4.380952,5.619048,5.904762,16.142857,44.000000,85.190476,4.428571,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,RLLKEEDM[649.3660]TK,RLLKEEDM[655.3735]TK,0.059652,-0.077560,-0.266137,0.298445,0.029505,-0.001940,0.007283,0.027519,...,2.047619,2.047619,2.047619,2.285714,2.523810,5.523810,14.285714,28.238095,0.571429,1
290,M[649.3660]ADKDGDLIATK,M[655.3735]ADKDGDLIATK,0.095942,,-0.317916,,-0.667125,0.242408,0.428816,0.402582,...,2.142857,4.095238,5.285714,6.761905,7.333333,17.000000,44.952381,75.380952,4.142857,0
291,LRQENM[649.3660]ELAER,LRQENM[655.3735]ELAER,,,,0.047247,,,,-0.134433,...,2.666667,5.238095,6.095238,8.000000,8.000000,13.571429,20.000000,28.000000,2.952381,1
292,MDM[649.3660]SLDDIIK,MDM[655.3735]SLDDIIK,,,,,-0.120082,,,,...,1.941176,2.411765,2.882353,3.882353,4.352941,7.529412,13.117647,17.235294,1.058824,1


# Load Dataset - 100uM MsrB2KD

In [173]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "10_01_24_Met_MsrKD_Mastersheet.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="100 uM MsrB2 KD")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Protein ID,Entry Name,Met site,Label,Gene,Protein Description,pval,average ratio,actual ratio,neglog10p
0,VNDAM[649.3660]NMGHTAK,VNDAM[655.3735]NMGHTAK,,,,,-3.607220,-2.901661,-3.034067,-3.036814,...,O75208,COQ9_HUMAN,281,COQ9_M281,COQ9,"Ubiquinone biosynthesis protein COQ9, mitochon...",0.000092,-3.144940,3.144940,4.038309
1,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,,,-2.625611,-3.598459,-3.187213,,,...,Q00341,VIGLN_HUMAN,128,VIGLN_M128,HDLBP,Vigilin,0.001642,-3.137094,3.137094,2.784499
2,LAEM[649.3660]LDQR,LAEM[655.3735]LDQR,,,,,-2.952328,-2.872499,,,...,Q5VTR2,BRE1A_HUMAN,61,BRE1A_M61,RNF20,E3 ubiquitin-protein ligase BRE1A,0.000094,-2.912413,2.912413,4.027394
3,NEM[649.3660]GHTPLDYAR,NEM[655.3735]GHTPLDYAR,,,-2.547209,-2.423277,,,,,...,Q9H078,CLPB_HUMAN,298,CLPB_M298,CLPB,Mitochondrial disaggregase,0.000535,-2.643397,2.643397,3.271242
4,ADWLSHYWM[649.3660]PK,ADWLSHYWM[655.3735]PK,-2.399117,,,,-2.364680,,,,...,P08243,ASNS_HUMAN,538,ASNS_M538,ASNS,Asparagine synthetase [glutamine-hydrolyzing],0.000026,-2.381899,2.381899,4.582910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,ITTM[649.3660]QLER,ITTM[655.3735]QLER,-0.951248,0.138245,,-0.829181,0.507390,-0.877976,0.609445,-1.225696,...,Q14152,EIF3A_HUMAN,736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,0.490225,-0.014308,0.014308,0.309605
333,VHM[649.3660]WEQTVK,VHM[655.3735]WEQTVK,0.339103,0.416417,-0.180036,-0.215728,0.264689,-0.157789,0.052332,-0.228648,...,Q14683,SMC1A_HUMAN,837,SMC1A_M837,SMC1A,Structural maintenance of chromosomes protein 1A,0.491880,0.003895,-0.003895,0.308141
334,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,,,,-0.095381,0.045956,0.052306,,...,O94979,SC31A_HUMAN,823,SC31A_M823,SEC31A,Protein transport protein Sec31A,0.494331,0.000960,-0.000960,0.305982
335,RM[649.3660]YEQQR,RM[655.3735]YEQQR,,-0.251258,0.618049,,-0.377629,,,,...,Q9NZZ3,CHMP5_HUMAN,74,CHMP5_M74,CHMP5,Charged multivesicular body protein 5,0.496447,0.002691,-0.002691,0.304127


In [174]:
# Canonicalize data - none to do here
peptides;

In [175]:
# Manual labeling of peptides
label_col_data = ["red"] * 48 + ["green"] * 110 + ["blue"] * 36 + ["gray"] * 143
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [176]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['O75208' 'Q00341' 'Q5VTR2' 'Q9H078' 'P08243' 'P62857' 'P26583' 'P11940'
 'P0DP23' 'P34932' 'O14654' 'P68363' 'P46783' 'P35579' 'P09429' 'P06733'
 'Q15233' 'Q9UKD2' 'Q15056' 'O60749' 'P08238' 'P23588' 'P33176' 'P52272'
 'Q9UQN3' 'Q9Y5A9' 'P23246' 'P07910' 'Q01844' 'Q99733' 'P12270' 'Q13573'
 'P16949' 'P25786' 'Q16181' 'Q14683' 'Q86UP2' 'P38646' 'P07437' 'P61011'
 'Q86U42' 'O14737' 'P39023' 'O95373' 'Q14320' 'Q99436' 'P67870' 'P80303'
 'P22307' 'P09874' 'P40222' 'P43243' 'P82970' 'P51532' 'Q07065' 'Q13263'
 'P11021' 'P12694' 'Q14011' 'P48643' 'P36542' 'Q96CT7' 'P14625' 'P36543'
 'Q9NTZ6' 'Q99615' 'Q9NZI8' 'O60664' 'O95347' 'P15374' 'Q14204' 'Q99623'
 'P15311' 'P46777' 'Q13283' 'P26038' 'P60228' 'P83731' 'P51003' 'Q15717'
 'P07900' 'P11142' 'P31948' 'P18669' 'P62258' 'Q7Z739' 'Q96LB3' 'P11310'
 'P82675' 'Q4VCS5' 'Q9HD42' 'P62841' 'Q9UMX0' 'P84098' 'P53999' 'Q02790'
 'Q15637' 'P40429' 'P46779' 'Q9H444' 'P61247' 'O60841' 'Q9Y2S6' 'O75533'
 'P09496' 'Q00587' 'Q15424' 'P

In [177]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [178]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [179]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Met site,Label,Gene,Protein Description,pval,average ratio,actual ratio,neglog10p,Color,Complete Sequence
0,VNDAM[649.3660]NMGHTAK,VNDAM[655.3735]NMGHTAK,,,,,-3.607220,-2.901661,-3.034067,-3.036814,...,281,COQ9_M281,COQ9,"Ubiquinone biosynthesis protein COQ9, mitochon...",0.000092,-3.144940,3.144940,4.038309,red,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...
1,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,,,-2.625611,-3.598459,-3.187213,,,...,128,VIGLN_M128,HDLBP,Vigilin,0.001642,-3.137094,3.137094,2.784499,red,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...
2,LAEM[649.3660]LDQR,LAEM[655.3735]LDQR,,,,,-2.952328,-2.872499,,,...,61,BRE1A_M61,RNF20,E3 ubiquitin-protein ligase BRE1A,0.000094,-2.912413,2.912413,4.027394,red,MSGIGNKRAAGEPGTSMPPEKKAAVEDSGTTVETIKLGGVSSTEEL...
3,NEM[649.3660]GHTPLDYAR,NEM[655.3735]GHTPLDYAR,,,-2.547209,-2.423277,,,,,...,298,CLPB_M298,CLPB,Mitochondrial disaggregase,0.000535,-2.643397,2.643397,3.271242,red,MLGSLVLRRKALAPRLLLRLLRSPTLRGHGGASGRNVTTGSLGEPQ...
4,ADWLSHYWM[649.3660]PK,ADWLSHYWM[655.3735]PK,-2.399117,,,,-2.364680,,,,...,538,ASNS_M538,ASNS,Asparagine synthetase [glutamine-hydrolyzing],0.000026,-2.381899,2.381899,4.582910,red,MCGIWALFGSDDCLSVQCLSAMKIAHRGPDAFRFENVNGYTNCCFG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,ITTM[649.3660]QLER,ITTM[655.3735]QLER,-0.951248,0.138245,,-0.829181,0.507390,-0.877976,0.609445,-1.225696,...,736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,0.490225,-0.014308,0.014308,0.309605,gray,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...
333,VHM[649.3660]WEQTVK,VHM[655.3735]WEQTVK,0.339103,0.416417,-0.180036,-0.215728,0.264689,-0.157789,0.052332,-0.228648,...,837,SMC1A_M837,SMC1A,Structural maintenance of chromosomes protein 1A,0.491880,0.003895,-0.003895,0.308141,gray,MGFLKLIEIENFKSYKGRQIIGPFQRFTAIIGPNGSGKSNLMDAIS...
334,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,,,,-0.095381,0.045956,0.052306,,...,823,SC31A_M823,SEC31A,Protein transport protein Sec31A,0.494331,0.000960,-0.000960,0.305982,gray,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...
335,RM[649.3660]YEQQR,RM[655.3735]YEQQR,,-0.251258,0.618049,,-0.377629,,,,...,74,CHMP5_M74,CHMP5,Charged multivesicular body protein 5,0.496447,0.002691,-0.002691,0.304127,gray,MNRLFGKAKPKAPPPSLTDCIGTVDSRAESIDKKISRLDAELVKYK...


In [180]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [181]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [182]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [183]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    337
Name: count, dtype: int64

In [184]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [185]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [186]:
# Sanity check - ensure methionine locations are correct (and match Met site numbers from initial dataset)
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
((temp.count("M") == len(temp)) & (peptides_cs["Met site"] == peptides_cs["Methionine Location"]+1)).value_counts()

True    337
Name: count, dtype: int64

In [187]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,VNDAM[649.3660]NMGHTAK,VNDAM[655.3735]NMGHTAK,,,,,-3.607220,-2.901661,-3.034067,-3.036814,...,red,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...,VNDAMNMGHTAK,276,12,VNDA,4,280,DSSPDFEDTWRFLENRVNDA,NMGHTAKQVKSTGEALVQGL
1,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,,,-2.625611,-3.598459,-3.187213,,,...,red,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
2,LAEM[649.3660]LDQR,LAEM[655.3735]LDQR,,,,,-2.952328,-2.872499,,,...,red,MSGIGNKRAAGEPGTSMPPEKKAAVEDSGTTVETIKLGGVSSTEEL...,LAEMLDQR,57,8,LAE,3,60,SSTEELDIRTLQTKNRKLAE,LDQRQAIEDELREHIEKLER
3,NEM[649.3660]GHTPLDYAR,NEM[655.3735]GHTPLDYAR,,,-2.547209,-2.423277,,,,,...,red,MLGSLVLRRKALAPRLLLRLLRSPTLRGHGGASGRNVTTGSLGEPQ...,NEMGHTPLDYAR,295,12,NE,2,297,DYRTVKELLDGGANPLQRNE,GHTPLDYAREGEVMKLLRTS
4,ADWLSHYWM[649.3660]PK,ADWLSHYWM[655.3735]PK,-2.399117,,,,-2.364680,,,,...,red,MCGIWALFGSDDCLSVQCLSAMKIAHRGPDAFRFENVNGYTNCCFG...,ADWLSHYWMPK,529,11,ADWLSHYW,8,537,YRQVFERHYPGRADWLSHYW,PKWINATDPSARTLTHYKSA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,ITTM[649.3660]QLER,ITTM[655.3735]QLER,-0.951248,0.138245,,-0.829181,0.507390,-0.877976,0.609445,-1.225696,...,gray,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,ITTMQLER,732,8,ITT,3,735,EQRIKDMDLWEQQEEERITT,QLEREKALEHKNRMSRMLED
333,VHM[649.3660]WEQTVK,VHM[655.3735]WEQTVK,0.339103,0.416417,-0.180036,-0.215728,0.264689,-0.157789,0.052332,-0.228648,...,gray,MGFLKLIEIENFKSYKGRQIIGPFQRFTAIIGPNGSGKSNLMDAIS...,VHMWEQTVK,834,9,VH,2,836,LGIQLDFEKNQLKEDQDKVH,WEQTVKKDENEIEKLKKEEQ
334,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,,,,-0.095381,0.045956,0.052306,,...,gray,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,GRPGPVAGHHQMPR,811,14,GRPGPVAGHHQ,11,822,PYEKQQLPKGRPGPVAGHHQ,PRVQTQQYYPHGENPPPPGF
335,RM[649.3660]YEQQR,RM[655.3735]YEQQR,,-0.251258,0.618049,,-0.377629,,,,...,gray,MNRLFGKAKPKAPPPSLTDCIGTVDSRAESIDKKISRLDAELVKYK...,RMYEQQR,72,7,R,1,73,EGPAKNMVKQKALRVLKQKR,YEQQRDNLAQQSFNMEQANY


In [188]:
# Remove invalid proteins (according to alphafold)
# 3 invalid peptides as a result -> 1 green, 2 gray

invalid_IDs = ['Q14204', 'P78527']
display(peptides_cs[peptides_cs["Protein ID"].isin(invalid_IDs)])
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
78,KVM[649.3660]SQEIQEQLHK,KVM[655.3735]SQEIQEQLHK,,,-1.102531,-0.96019,-1.394902,,,-0.673681,...,green,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,KVMSQEIQEQLHK,3253,13,KV,2,3255,ANDKLKKMVKDQQEAEKKKV,SQEIQEQLHKQQEVIADKQM
284,RSELEEQQM[649.3660]HLNVGLR,RSELEEQQM[655.3735]HLNVGLR,,-0.079413,,0.327373,,,,,...,gray,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,RSELEEQQMHLNVGLR,3190,16,RSELEEQQ,8,3198,FINHYANLFHEKRSELEEQQ,HLNVGLRKIKETVDQVEELR
290,LSLM[649.3660]YAR,LSLM[655.3735]YAR,,,,,,,,,...,gray,MAGSGAGVRCSLLRLQETLSAADRCGAALAGHQLIRGLGQECVLSS...,LSLMYAR,2738,7,LSL,3,2741,RTDLLRLRRRFMRDQEKLSL,YARKGVAEQKREKEIKSELK


Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,VNDAM[649.3660]NMGHTAK,VNDAM[655.3735]NMGHTAK,,,,,-3.607220,-2.901661,-3.034067,-3.036814,...,red,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...,VNDAMNMGHTAK,276,12,VNDA,4,280,DSSPDFEDTWRFLENRVNDA,NMGHTAKQVKSTGEALVQGL
1,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,,,-2.625611,-3.598459,-3.187213,,,...,red,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
2,LAEM[649.3660]LDQR,LAEM[655.3735]LDQR,,,,,-2.952328,-2.872499,,,...,red,MSGIGNKRAAGEPGTSMPPEKKAAVEDSGTTVETIKLGGVSSTEEL...,LAEMLDQR,57,8,LAE,3,60,SSTEELDIRTLQTKNRKLAE,LDQRQAIEDELREHIEKLER
3,NEM[649.3660]GHTPLDYAR,NEM[655.3735]GHTPLDYAR,,,-2.547209,-2.423277,,,,,...,red,MLGSLVLRRKALAPRLLLRLLRSPTLRGHGGASGRNVTTGSLGEPQ...,NEMGHTPLDYAR,295,12,NE,2,297,DYRTVKELLDGGANPLQRNE,GHTPLDYAREGEVMKLLRTS
4,ADWLSHYWM[649.3660]PK,ADWLSHYWM[655.3735]PK,-2.399117,,,,-2.364680,,,,...,red,MCGIWALFGSDDCLSVQCLSAMKIAHRGPDAFRFENVNGYTNCCFG...,ADWLSHYWMPK,529,11,ADWLSHYW,8,537,YRQVFERHYPGRADWLSHYW,PKWINATDPSARTLTHYKSA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,ITTM[649.3660]QLER,ITTM[655.3735]QLER,-0.951248,0.138245,,-0.829181,0.507390,-0.877976,0.609445,-1.225696,...,gray,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,ITTMQLER,732,8,ITT,3,735,EQRIKDMDLWEQQEEERITT,QLEREKALEHKNRMSRMLED
333,VHM[649.3660]WEQTVK,VHM[655.3735]WEQTVK,0.339103,0.416417,-0.180036,-0.215728,0.264689,-0.157789,0.052332,-0.228648,...,gray,MGFLKLIEIENFKSYKGRQIIGPFQRFTAIIGPNGSGKSNLMDAIS...,VHMWEQTVK,834,9,VH,2,836,LGIQLDFEKNQLKEDQDKVH,WEQTVKKDENEIEKLKKEEQ
334,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,,,,-0.095381,0.045956,0.052306,,...,gray,MKLKEVDRTAMQAWSPAQNHPIYLATGTSAQQLDATFSTNASLEIF...,GRPGPVAGHHQMPR,811,14,GRPGPVAGHHQ,11,822,PYEKQQLPKGRPGPVAGHHQ,PRVQTQQYYPHGENPPPPGF
335,RM[649.3660]YEQQR,RM[655.3735]YEQQR,,-0.251258,0.618049,,-0.377629,,,,...,gray,MNRLFGKAKPKAPPPSLTDCIGTVDSRAESIDKKISRLDAELVKYK...,RMYEQQR,72,7,R,1,73,EGPAKNMVKQKALRVLKQKR,YEQQRDNLAQQSFNMEQANY


# Download Alphafold Data - 100uM MsrB2KD

In [189]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['O75208' 'Q00341' 'Q5VTR2' 'Q9H078' 'P08243' 'P62857' 'P26583' 'P11940'
 'P0DP23' 'P34932' 'O14654' 'P68363' 'P46783' 'P35579' 'P09429' 'P06733'
 'Q15233' 'Q9UKD2' 'Q15056' 'O60749' 'P08238' 'P23588' 'P33176' 'P52272'
 'Q9UQN3' 'Q9Y5A9' 'P23246' 'P07910' 'Q01844' 'Q99733' 'P12270' 'Q13573'
 'P16949' 'P25786' 'Q16181' 'Q14683' 'Q86UP2' 'P38646' 'P07437' 'P61011'
 'Q86U42' 'O14737' 'P39023' 'O95373' 'Q14320' 'Q99436' 'P67870' 'P80303'
 'P22307' 'P09874' 'P40222' 'P43243' 'P82970' 'P51532' 'Q07065' 'Q13263'
 'P11021' 'P12694' 'Q14011' 'P48643' 'P36542' 'Q96CT7' 'P14625' 'P36543'
 'Q9NTZ6' 'Q99615' 'Q9NZI8' 'O60664' 'O95347' 'P15374' 'Q99623' 'P15311'
 'P46777' 'Q13283' 'P26038' 'P60228' 'P83731' 'P51003' 'Q15717' 'P07900'
 'P11142' 'P31948' 'P18669' 'P62258' 'Q7Z739' 'Q96LB3' 'P11310' 'P82675'
 'Q4VCS5' 'Q9HD42' 'P62841' 'Q9UMX0' 'P84098' 'P53999' 'Q02790' 'Q15637'
 'P40429' 'P46779' 'Q9H444' 'P61247' 'O60841' 'Q9Y2S6' 'O75533' 'P09496'
 'Q00587' 'Q15424' 'P60709' 'Q

In [190]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 226/226 [00:00<00:00, 154282.67it/s]

2024-10-17 19:10:02> Valid proteins: 0
2024-10-17 19:10:02> Invalid proteins: 0
2024-10-17 19:10:02> Existing proteins: 226





In [191]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 226/226 [00:00<00:00, 141860.63it/s]

2024-10-17 19:10:02> Valid proteins: 0
2024-10-17 19:10:02> Invalid proteins: 0
2024-10-17 19:10:02> Existing proteins: 226





# Construct Alphafold Dataframe (Calculate Accessibilities) - 100uM MsrB2KD

In [192]:
# Format alphafold data into dataframe
alphafold_annotation_100uM_MsrB2KD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_100uM_MsrB2KD

100%|██████████| 1663/1663 [00:18<00:00, 89.72it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,-12.734,-11.583,-13.004,unstructured,unstructured,0,0,0,0,1
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,-15.819,,-14.660,unstructured,unstructured,0,0,0,0,1
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,-15.414,-16.107,-15.836,unstructured,unstructured,0,0,0,0,1
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,-14.858,-13.434,-14.749,HELX_RH_3T_P,HELX,0,1,0,0,0
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,-17.473,,-16.633,HELX_RH_3T_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122444,Q9Y605,226,K,123,86.95,-71.221,-69.937,-68.865,-69.450,-11.627,...,56.298,57.032,54.972,HELX_RH_AL_P,HELX,0,1,0,0,0
122445,Q9Y605,226,S,124,83.09,-73.735,-72.542,-72.222,-71.354,-10.738,...,54.991,54.023,55.206,HELX_RH_AL_P,HELX,0,1,0,0,0
122446,Q9Y605,226,E,125,77.14,-75.055,-74.572,-74.062,-73.510,-13.695,...,53.193,51.953,53.687,HELX_RH_AL_P,HELX,0,1,0,0,0
122447,Q9Y605,226,S,126,64.51,-75.362,-74.601,-73.339,-74.255,-14.112,...,56.421,56.974,55.300,TURN_TY1_P,TURN,0,0,0,1,0


In [193]:
# Calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 226/226 [00:01<00:00, 114.06it/s]


In [194]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_annotation_100uM_MsrB2KD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [195]:
# Calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 226/226 [00:01<00:00, 123.57it/s]


In [196]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [197]:
# Calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 226/226 [00:01<00:00, 125.98it/s]


In [198]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [199]:
# Calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 226/226 [00:01<00:00, 124.86it/s]


In [200]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [201]:
# Calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 226/226 [00:01<00:00, 122.21it/s]


In [202]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [203]:
# Calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 226/226 [00:01<00:00, 119.61it/s]


In [204]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [205]:
# Calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 226/226 [00:01<00:00, 115.44it/s]


In [206]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [207]:
# Calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 226/226 [00:02<00:00, 112.14it/s]


In [208]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [209]:
# Calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 226/226 [00:02<00:00, 111.08it/s]


In [210]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [211]:
# Calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 226/226 [00:02<00:00, 103.57it/s]


In [212]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [213]:
# Calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 226/226 [00:01<00:00, 115.96it/s]


In [214]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [215]:
# Calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 226/226 [00:02<00:00, 111.08it/s]


In [216]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [217]:
# Calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 226/226 [00:02<00:00, 92.56it/s] 


In [218]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [219]:
# Calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 226/226 [00:02<00:00, 76.03it/s] 


In [220]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD;

In [221]:
# Calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_100uM_MsrB2KD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 226/226 [00:02<00:00, 110.31it/s]


In [222]:
alphafold_accessibility_100uM_MsrB2KD = alphafold_accessibility_100uM_MsrB2KD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_100uM_MsrB2KD

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,1,1,1,1,1,1,2,3,5,0
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,0,2,2,2,2,2,3,5,6,0
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,1,2,2,2,2,2,4,6,7,0
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,1,2,2,2,2,2,5,7,9,0
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,0,2,2,2,2,2,4,8,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122444,Q9Y605,226,K,123,86.95,-71.221,-69.937,-68.865,-69.450,-11.627,...,1,2,2,2,4,4,9,13,16,2
122445,Q9Y605,226,S,124,83.09,-73.735,-72.542,-72.222,-71.354,-10.738,...,1,2,2,2,2,3,8,12,15,2
122446,Q9Y605,226,E,125,77.14,-75.055,-74.572,-74.062,-73.510,-13.695,...,1,2,2,2,2,2,6,9,13,2
122447,Q9Y605,226,S,126,64.51,-75.362,-74.601,-73.339,-74.255,-14.112,...,1,2,2,2,2,2,5,7,12,2


In [223]:
alphafold_accessibility_100uM_MsrB2KD_smooth = get_smooth_score(
    alphafold_accessibility_100uM_MsrB2KD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_100uM_MsrB2KD_smooth;

100%|██████████| 226/226 [00:00<00:00, 626.30it/s]


In [224]:
alphafold_accessibility_100uM_MsrB2KD_smooth['IDR'] = np.where(
    alphafold_accessibility_100uM_MsrB2KD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_100uM_MsrB2KD_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,1.818182,1.818182,1.909091,1.909091,2.000000,4.272727,7.545455,10.818182,0.181818,1
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,1.833333,1.833333,1.916667,1.916667,2.000000,4.583333,8.083333,11.750000,0.333333,1
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,1.846154,1.846154,1.923077,2.000000,2.076923,4.846154,8.769231,12.615385,0.307692,1
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,1.857143,1.928571,2.071429,2.142857,2.214286,5.071429,9.285714,13.285714,0.285714,1
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,1.866667,1.933333,2.133333,2.333333,2.466667,5.533333,9.866667,14.066667,0.400000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,Q9Y605,226,K,123,86.95,-71.221,-69.937,-68.865,-69.450,-11.627,...,1.933333,3.466667,4.066667,5.600000,5.733333,9.466667,14.866667,19.200000,1.933333,1
123,Q9Y605,226,S,124,83.09,-73.735,-72.542,-72.222,-71.354,-10.738,...,1.928571,3.357143,3.928571,5.428571,5.571429,9.285714,14.571429,18.785714,1.928571,1
124,Q9Y605,226,E,125,77.14,-75.055,-74.572,-74.062,-73.510,-13.695,...,1.923077,3.153846,3.769231,5.230769,5.384615,8.923077,14.153846,18.230769,1.846154,1
125,Q9Y605,226,S,126,64.51,-75.362,-74.601,-73.339,-74.255,-14.112,...,1.916667,3.083333,3.583333,5.000000,5.166667,8.666667,13.750000,17.666667,1.750000,1


# Merge Dataframes into Full Dataset (Includes Alphafold) - 100uM MsrB2KD

In [225]:
alphafold_accessibility_100uM_MsrB2KD_smooth["position"] = alphafold_accessibility_100uM_MsrB2KD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_wa = peptides_cs.merge(
    alphafold_accessibility_100uM_MsrB2KD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,VNDAM[649.3660]NMGHTAK,VNDAM[655.3735]NMGHTAK,,,,,-3.607220,-2.901661,-3.034067,-3.036814,...,2.095238,3.428571,4.047619,5.333333,5.714286,14.047619,31.428571,55.142857,4.142857,0.0
1,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,,,-2.625611,-3.598459,-3.187213,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
2,LAEM[649.3660]LDQR,LAEM[655.3735]LDQR,,,,,-2.952328,-2.872499,,,...,2.571429,5.238095,6.000000,7.904762,7.904762,12.047619,19.238095,26.857143,2.000000,1.0
3,NEM[649.3660]GHTPLDYAR,NEM[655.3735]GHTPLDYAR,,,-2.547209,-2.423277,,,,,...,2.095238,2.761905,3.095238,4.047619,4.952381,14.714286,42.619048,79.285714,3.952381,0.0
4,ADWLSHYWM[649.3660]PK,ADWLSHYWM[655.3735]PK,-2.399117,,,,-2.364680,,,,...,2.000000,2.380952,2.809524,3.380952,3.904762,12.666667,45.904762,99.904762,3.666667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,ITTM[649.3660]QLER,ITTM[655.3735]QLER,-0.951248,0.138245,,-0.829181,0.507390,-0.877976,0.609445,-1.225696,...,2.714286,5.095238,5.904762,7.714286,7.809524,11.761905,19.142857,25.809524,1.714286,1.0
330,VHM[649.3660]WEQTVK,VHM[655.3735]WEQTVK,0.339103,0.416417,-0.180036,-0.215728,0.264689,-0.157789,0.052332,-0.228648,...,2.380952,4.571429,5.523810,7.523810,7.714286,15.476190,33.238095,49.761905,3.857143,0.0
331,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,,,,-0.095381,0.045956,0.052306,,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,0.095238,1.0
332,RM[649.3660]YEQQR,RM[655.3735]YEQQR,,-0.251258,0.618049,,-0.377629,,,,...,2.523810,5.666667,6.142857,8.333333,8.619048,20.047619,37.761905,53.000000,5.476190,0.0


In [226]:
# note: one row with missing AA steric analysis: AlphaFold seems to have a sequence that's 59 AA long, but UniProt has a sequence that's 133 AA long
display(peptides_wa[peptides_wa["Protein ID"] == "P62861"])
peptides_wa = peptides_wa[~(peptides_wa["Protein ID"] == "P62861")]
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
259,RRM[649.3660]QYNR,RRM[655.3735]QYNR,,0.076238,-0.086972,,-0.09764,-0.370949,0.018663,0.039122,...,,,,,,,,,,


Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,VNDAM[649.3660]NMGHTAK,VNDAM[655.3735]NMGHTAK,,,,,-3.607220,-2.901661,-3.034067,-3.036814,...,2.095238,3.428571,4.047619,5.333333,5.714286,14.047619,31.428571,55.142857,4.142857,0.0
1,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,,,-2.625611,-3.598459,-3.187213,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
2,LAEM[649.3660]LDQR,LAEM[655.3735]LDQR,,,,,-2.952328,-2.872499,,,...,2.571429,5.238095,6.000000,7.904762,7.904762,12.047619,19.238095,26.857143,2.000000,1.0
3,NEM[649.3660]GHTPLDYAR,NEM[655.3735]GHTPLDYAR,,,-2.547209,-2.423277,,,,,...,2.095238,2.761905,3.095238,4.047619,4.952381,14.714286,42.619048,79.285714,3.952381,0.0
4,ADWLSHYWM[649.3660]PK,ADWLSHYWM[655.3735]PK,-2.399117,,,,-2.364680,,,,...,2.000000,2.380952,2.809524,3.380952,3.904762,12.666667,45.904762,99.904762,3.666667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,ITTM[649.3660]QLER,ITTM[655.3735]QLER,-0.951248,0.138245,,-0.829181,0.507390,-0.877976,0.609445,-1.225696,...,2.714286,5.095238,5.904762,7.714286,7.809524,11.761905,19.142857,25.809524,1.714286,1.0
330,VHM[649.3660]WEQTVK,VHM[655.3735]WEQTVK,0.339103,0.416417,-0.180036,-0.215728,0.264689,-0.157789,0.052332,-0.228648,...,2.380952,4.571429,5.523810,7.523810,7.714286,15.476190,33.238095,49.761905,3.857143,0.0
331,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,,,,-0.095381,0.045956,0.052306,,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,0.095238,1.0
332,RM[649.3660]YEQQR,RM[655.3735]YEQQR,,-0.251258,0.618049,,-0.377629,,,,...,2.523810,5.666667,6.142857,8.333333,8.619048,20.047619,37.761905,53.000000,5.476190,0.0


In [227]:
#peptides_wa.to_csv(os.path.join(curr_dir_path, "100uM_MsrB2KD_with_alphafold.csv"))

In [228]:
path = os.path.join(curr_dir_path, "100uM_MsrB2KD_with_alphafold.csv")
peptides_wa = pd.read_csv(path)
peptides_wa.set_index("Unnamed: 0", inplace=True)
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,VNDAM[649.3660]NMGHTAK,VNDAM[655.3735]NMGHTAK,,,,,-3.607220,-2.901661,-3.034067,-3.036814,...,2.095238,3.428571,4.047619,5.333333,5.714286,14.047619,31.428571,55.142857,4.142857,0.0
1,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,,,-2.625611,-3.598459,-3.187213,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
2,LAEM[649.3660]LDQR,LAEM[655.3735]LDQR,,,,,-2.952328,-2.872499,,,...,2.571429,5.238095,6.000000,7.904762,7.904762,12.047619,19.238095,26.857143,2.000000,1.0
3,NEM[649.3660]GHTPLDYAR,NEM[655.3735]GHTPLDYAR,,,-2.547209,-2.423277,,,,,...,2.095238,2.761905,3.095238,4.047619,4.952381,14.714286,42.619048,79.285714,3.952381,0.0
4,ADWLSHYWM[649.3660]PK,ADWLSHYWM[655.3735]PK,-2.399117,,,,-2.364680,,,,...,2.000000,2.380952,2.809524,3.380952,3.904762,12.666667,45.904762,99.904762,3.666667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,ITTM[649.3660]QLER,ITTM[655.3735]QLER,-0.951248,0.138245,,-0.829181,0.507390,-0.877976,0.609445,-1.225696,...,2.714286,5.095238,5.904762,7.714286,7.809524,11.761905,19.142857,25.809524,1.714286,1.0
330,VHM[649.3660]WEQTVK,VHM[655.3735]WEQTVK,0.339103,0.416417,-0.180036,-0.215728,0.264689,-0.157789,0.052332,-0.228648,...,2.380952,4.571429,5.523810,7.523810,7.714286,15.476190,33.238095,49.761905,3.857143,0.0
331,GRPGPVAGHHQM[649.3660]PR,GRPGPVAGHHQM[655.3735]PR,,,,,-0.095381,0.045956,0.052306,,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.047619,7.571429,10.619048,0.095238,1.0
332,RM[649.3660]YEQQR,RM[655.3735]YEQQR,,-0.251258,0.618049,,-0.377629,,,,...,2.523810,5.666667,6.142857,8.333333,8.619048,20.047619,37.761905,53.000000,5.476190,0.0


# Load Dataset - 200uM siControl

In [229]:
# Load initial dataset
data_loc = os.path.join(curr_dir_path, "10_01_24_Met_MsrKD_Mastersheet.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="200 uM siControl")
peptides

Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,siControl_6 Log2 Ratio HL,siControl_7 Log2 Ratio HL,siControl_8 Log2 Ratio HL,siControl_9 Log2 Ratio HL,siControl_10 Log2 Ratio HL,siControl_11 Log2 Ratio HL,siControl_12 Log2 Ratio HL,pval,neglogp,avg
0,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,M26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,...,,,,,,,6.105851,2.468395e-04,3.607585,6.383453
1,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,M423,CKAP4_M423,CKAP4,Cytoskeleton-associated protein 4,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,5.401829,...,5.592437,5.579160,,,,,5.692710,1.437906e-10,9.842270,5.486751
2,sp|Q13263|TIF1B_HUMAN,Q13263,TIF1B_HUMAN,M796,TIF1B_M796,TRIM28,Transcription intermediary factor 1-beta,M[649.3660]NEAFGDTK,M[655.3735]NEAFGDTK,3.957123,...,3.862194,3.285276,4.285666,4.798653,4.374849,,,4.980290e-08,7.302745,4.070801
3,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,M643,KTN1_M643,KTN1,Kinectin,DIQNM[649.3660]NFLLK,DIQNM[655.3735]NFLLK,,...,,,,,,2.214946,,2.170714e-04,3.663397,2.243072
4,sp|P48643|TCPE_HUMAN,P48643,TCPE_HUMAN,M523,TCPE_M523,CCT5,T-complex protein 1 subunit epsilon,KQQISLATQM[649.3660]VR,KQQISLATQM[655.3735]VR,,...,,1.876733,,,,,,3.155683e-03,2.500907,2.039533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,M15,VATE1_M15,ATP6V1E1,V-type proton ATPase subunit E 1,HM[649.3660]MAFIEQEANEK,HM[655.3735]MAFIEQEANEK,-0.550263,...,,0.162762,,,,1.256694,-0.309046,4.929757e-01,0.307174,0.042072
422,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,M543,KINH_M543,KIF5B,Kinesin-1 heavy chain,LKEM[649.3660]TNHQK,LKEM[655.3735]TNHQK,,...,,,,0.105261,,,-0.156300,4.951227e-01,0.305287,0.076113
423,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,M736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,ITTM[649.3660]QLER,ITTM[655.3735]QLER,1.147766,...,-0.711419,-0.824941,-0.535697,-0.673417,1.135670,-0.812964,,4.956274e-01,0.304845,0.007873
424,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,M529,HNRPM_M529,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,M[15.9949]GLSM[649.3660]ER,M[15.9949]GLSM[655.3735]ER,0.164323,...,0.137926,0.206113,,,,-0.246257,-0.497937,4.975510e-01,0.303162,0.081522


In [230]:
# Canonicalize data - none to do here
peptides;

In [231]:
# Manual labeling of peptides
label_col_data = ["red"] * 39 + ["green"] * 147 + ["blue"] * 50 + ["gray"] * 190
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [232]:
unique_uniprotIDs = peptides["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P25786' 'Q07065' 'Q13263' 'Q86UP2' 'P48643' 'P53999' 'Q9H078' 'Q9NTZ6'
 'P60842' 'P14625' 'Q96CT7' 'P12694' 'P67870' 'Q14011' 'Q99436' 'O60664'
 'Q99615' 'O95347' 'P15374' 'Q14204' 'Q15717' 'P35579' 'Q9NZI8' 'P83731'
 'Q16181' 'P22307' 'P36542' 'P46777' 'Q8TBA6' 'Q9BQI0' 'P15311' 'Q9UQE7'
 'Q99623' 'Q13283' 'P11940' 'P61011' 'P23246' 'P09874' 'P35611' 'Q9UG63'
 'P08238' 'Q14671' 'P06733' 'Q96LB3' 'Q9UMX5' 'Q15233' 'P62258' 'Q01518'
 'Q9UMX0' 'P41236' 'P60228' 'P36578' 'P49736' 'Q02818' 'P52272' 'Q8N8S7'
 'P84098' 'P68104' 'P51003' 'Q9Y2L1' 'O14737' 'P62861' 'P31948' 'O14745'
 'P30519' 'P18621' 'Q96PK6' 'P12268' 'P07910' 'P18669' 'P12270' 'Q99733'
 'P35637' 'Q02790' 'P47914' 'Q01844' 'Q15019' 'P61978' 'Q07666' 'O14776'
 'Q15056' 'P06748' 'P40222' 'O75821' 'P14314' 'O00193' 'O43395' 'P36543'
 'Q9UQN3' 'Q9Y5A9' 'O75533' 'P52292' 'Q86V81' 'P27635' 'P68363' 'P14868'
 'P16949' 'Q8WXF1' 'P78371' 'Q9NR30' 'P22061' 'O95831' 'Q9UHV9' 'P37108'
 'Q9Y266' 'Q16543' 'P09669' 'P

In [233]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [234]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = pd.read_csv(path)
sequence_cache_df_updated.set_index("Unnamed: 0", inplace=True)
sequence_cache_df_updated.index.name = None
sequence_cache_df_updated;

In [235]:
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_cs # cs means "complete sequence"

Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,siControl_8 Log2 Ratio HL,siControl_9 Log2 Ratio HL,siControl_10 Log2 Ratio HL,siControl_11 Log2 Ratio HL,siControl_12 Log2 Ratio HL,pval,neglogp,avg,Color,Complete Sequence
0,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,M26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,...,,,,,6.105851,2.468395e-04,3.607585,6.383453,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...
1,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,M423,CKAP4_M423,CKAP4,Cytoskeleton-associated protein 4,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,5.401829,...,,,,,5.692710,1.437906e-10,9.842270,5.486751,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...
2,sp|Q13263|TIF1B_HUMAN,Q13263,TIF1B_HUMAN,M796,TIF1B_M796,TRIM28,Transcription intermediary factor 1-beta,M[649.3660]NEAFGDTK,M[655.3735]NEAFGDTK,3.957123,...,4.285666,4.798653,4.374849,,,4.980290e-08,7.302745,4.070801,red,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...
3,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,M643,KTN1_M643,KTN1,Kinectin,DIQNM[649.3660]NFLLK,DIQNM[655.3735]NFLLK,,...,,,,2.214946,,2.170714e-04,3.663397,2.243072,red,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...
4,sp|P48643|TCPE_HUMAN,P48643,TCPE_HUMAN,M523,TCPE_M523,CCT5,T-complex protein 1 subunit epsilon,KQQISLATQM[649.3660]VR,KQQISLATQM[655.3735]VR,,...,,,,,,3.155683e-03,2.500907,2.039533,red,MASMGTLAFDEYGRPFLIIKDQDRKSRLMGLEALKSHIMAAKAVAN...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,M15,VATE1_M15,ATP6V1E1,V-type proton ATPase subunit E 1,HM[649.3660]MAFIEQEANEK,HM[655.3735]MAFIEQEANEK,-0.550263,...,,,,1.256694,-0.309046,4.929757e-01,0.307174,0.042072,gray,MALSDADVQKQIKHMMAFIEQEANEKAEEIDAKAEEEFNIEKGRLV...
422,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,M543,KINH_M543,KIF5B,Kinesin-1 heavy chain,LKEM[649.3660]TNHQK,LKEM[655.3735]TNHQK,,...,,0.105261,,,-0.156300,4.951227e-01,0.305287,0.076113,gray,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...
423,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,M736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,ITTM[649.3660]QLER,ITTM[655.3735]QLER,1.147766,...,-0.535697,-0.673417,1.135670,-0.812964,,4.956274e-01,0.304845,0.007873,gray,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...
424,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,M529,HNRPM_M529,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,M[15.9949]GLSM[649.3660]ER,M[15.9949]GLSM[655.3735]ER,0.164323,...,,,,-0.246257,-0.497937,4.975510e-01,0.303162,0.081522,gray,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...


In [236]:
# Extract clean AA sequence from peptides (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_cs["Peptide Sequence"] = peptides_cs["Light Modified Peptide"].map(filtering)
peptides_cs;

In [237]:
peptides_cs["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"])])
peptides_cs;

In [238]:
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [239]:
# Sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
(temp == peptides_cs["Peptide Sequence"]).value_counts()

Peptide Sequence
True    426
Name: count, dtype: int64

In [240]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

peptides_cs["Left Prefix"] = peptides_cs["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_cs["Left Prefix"] = peptides_cs["Left Prefix"].map(filtering)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [241]:
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [242]:
# Sanity check - ensure methionine locations are correct (and match Met site numbers from initial dataset)
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
((temp.count("M") == len(temp)) & (peptides_cs["Site"].str[1:].astype(int) == peptides_cs["Methionine Location"]+1)).value_counts()

True    426
Name: count, dtype: int64

In [243]:
# Compute left/right analysis sequences based on threshold
peptides_cs[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]
peptides_cs

Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,M26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
1,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,M423,CKAP4_M423,CKAP4,Cytoskeleton-associated protein 4,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,5.401829,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
2,sp|Q13263|TIF1B_HUMAN,Q13263,TIF1B_HUMAN,M796,TIF1B_M796,TRIM28,Transcription intermediary factor 1-beta,M[649.3660]NEAFGDTK,M[655.3735]NEAFGDTK,3.957123,...,red,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,MNEAFGDTK,795,9,,0,795,TEDKADVQSIIGLQRFFETR,NEAFGDTKFSAVLVEPPPMS
3,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,M643,KTN1_M643,KTN1,Kinectin,DIQNM[649.3660]NFLLK,DIQNM[655.3735]NFLLK,,...,red,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,DIQNMNFLLK,638,10,DIQN,4,642,LASERDRLTSKEEELKDIQN,NFLLKAEVQKLQALANEQAA
4,sp|P48643|TCPE_HUMAN,P48643,TCPE_HUMAN,M523,TCPE_M523,CCT5,T-complex protein 1 subunit epsilon,KQQISLATQM[649.3660]VR,KQQISLATQM[655.3735]VR,,...,red,MASMGTLAFDEYGRPFLIIKDQDRKSRLMGLEALKSHIMAAKAVAN...,KQQISLATQMVR,513,12,KQQISLATQ,9,522,QQHVIETLIGKKQQISLATQ,VRMILKIDDIRKPGESEE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,M15,VATE1_M15,ATP6V1E1,V-type proton ATPase subunit E 1,HM[649.3660]MAFIEQEANEK,HM[655.3735]MAFIEQEANEK,-0.550263,...,gray,MALSDADVQKQIKHMMAFIEQEANEKAEEIDAKAEEEFNIEKGRLV...,HMMAFIEQEANEK,13,13,H,1,14,MALSDADVQKQIK,MAFIEQEANEKAEEIDAKAE
422,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,M543,KINH_M543,KIF5B,Kinesin-1 heavy chain,LKEM[649.3660]TNHQK,LKEM[655.3735]TNHQK,,...,gray,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,LKEMTNHQK,539,9,LKE,3,542,LNQKSATLASIDAELQKLKE,TNHQKKRAAEMMASLLKDLA
423,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,M736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,ITTM[649.3660]QLER,ITTM[655.3735]QLER,1.147766,...,gray,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,ITTMQLER,732,8,ITT,3,735,EQRIKDMDLWEQQEEERITT,QLEREKALEHKNRMSRMLED
424,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,M529,HNRPM_M529,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,M[15.9949]GLSM[649.3660]ER,M[15.9949]GLSM[655.3735]ER,0.164323,...,gray,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MGLSMER,524,7,MGLS,4,528,ERMGSGVERMGPAIERMGLS,ERMVPAGMGAGLERMGPVMD


In [244]:
# Remove invalid proteins (according to alphafold)
# 4 invalid peptides as a result -> 1 red, 1 green, 2 gray

invalid_IDs = ['Q14204', 'P78527', 'Q9NU22']
display(peptides_cs[peptides_cs["Protein ID"].isin(invalid_IDs)])
peptides_cs = peptides_cs[~peptides_cs["Protein ID"].isin(invalid_IDs)]
peptides_cs

Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
21,sp|Q14204|DYHC1_HUMAN,Q14204,DYHC1_HUMAN,M3256,DYHC1_M3256,DYNC1H1,Cytoplasmic dynein 1 heavy chain 1,KVM[649.3660]SQEIQEQLHK,KVM[655.3735]SQEIQEQLHK,0.872674,...,red,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,KVMSQEIQEQLHK,3253,13,KV,2,3255,ANDKLKKMVKDQQEAEKKKV,SQEIQEQLHKQQEVIADKQM
40,sp|Q14204|DYHC1_HUMAN,Q14204,DYHC1_HUMAN,M3199,DYHC1_M3199,DYNC1H1,Cytoplasmic dynein 1 heavy chain 1,RSELEEQQM[649.3660]HLNVGLR,RSELEEQQM[655.3735]HLNVGLR,0.434001,...,green,MSEPGGGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGG...,RSELEEQQMHLNVGLR,3190,16,RSELEEQQ,8,3198,FINHYANLFHEKRSELEEQQ,HLNVGLRKIKETVDQVEELR
261,sp|P78527|PRKDC_HUMAN,P78527,PRKDC_HUMAN,M3729,PRKDC_M3729,PRKDC,DNA-dependent protein kinase catalytic subunit,VTVM[649.3660]ASLR,VTVM[655.3735]ASLR,,...,gray,MAGSGAGVRCSLLRLQETLSAADRCGAALAGHQLIRGLGQECVLSS...,VTVMASLR,3725,8,VTV,3,3728,GKPLPEYHVRIAGFDERVTV,ASLRRPKRIIIRGHDEREHP
401,sp|Q9NU22|MDN1_HUMAN,Q9NU22,MDN1_HUMAN,M5110,MDN1_M5110,MDN1,Midasin,SM[649.3660]GDHNER,SM[655.3735]GDHNER,,...,gray,MEHFLLEVAAAPLRLIAAKNEKSRSELGRFLAKQVWTPQDRQCVLS...,SMGDHNER,5108,8,S,1,5109,TRKNTQSFKRKPGQADNERS,GDHNERVHKRLRTVDTDSHA


Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,M26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,...,red,MFRNQYDNDVTVWSPQGRIHQIEYAMEAVKQGSATVGLKSKTHAVL...,IHQIEYAMEAVK,18,12,IHQIEYA,7,25,YDNDVTVWSPQGRIHQIEYA,EAVKQGSATVGLKSKTHAVL
1,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,M423,CKAP4_M423,CKAP4,Cytoskeleton-associated protein 4,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,5.401829,...,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE
2,sp|Q13263|TIF1B_HUMAN,Q13263,TIF1B_HUMAN,M796,TIF1B_M796,TRIM28,Transcription intermediary factor 1-beta,M[649.3660]NEAFGDTK,M[655.3735]NEAFGDTK,3.957123,...,red,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,MNEAFGDTK,795,9,,0,795,TEDKADVQSIIGLQRFFETR,NEAFGDTKFSAVLVEPPPMS
3,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,M643,KTN1_M643,KTN1,Kinectin,DIQNM[649.3660]NFLLK,DIQNM[655.3735]NFLLK,,...,red,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,DIQNMNFLLK,638,10,DIQN,4,642,LASERDRLTSKEEELKDIQN,NFLLKAEVQKLQALANEQAA
4,sp|P48643|TCPE_HUMAN,P48643,TCPE_HUMAN,M523,TCPE_M523,CCT5,T-complex protein 1 subunit epsilon,KQQISLATQM[649.3660]VR,KQQISLATQM[655.3735]VR,,...,red,MASMGTLAFDEYGRPFLIIKDQDRKSRLMGLEALKSHIMAAKAVAN...,KQQISLATQMVR,513,12,KQQISLATQ,9,522,QQHVIETLIGKKQQISLATQ,VRMILKIDDIRKPGESEE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,M15,VATE1_M15,ATP6V1E1,V-type proton ATPase subunit E 1,HM[649.3660]MAFIEQEANEK,HM[655.3735]MAFIEQEANEK,-0.550263,...,gray,MALSDADVQKQIKHMMAFIEQEANEKAEEIDAKAEEEFNIEKGRLV...,HMMAFIEQEANEK,13,13,H,1,14,MALSDADVQKQIK,MAFIEQEANEKAEEIDAKAE
422,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,M543,KINH_M543,KIF5B,Kinesin-1 heavy chain,LKEM[649.3660]TNHQK,LKEM[655.3735]TNHQK,,...,gray,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,LKEMTNHQK,539,9,LKE,3,542,LNQKSATLASIDAELQKLKE,TNHQKKRAAEMMASLLKDLA
423,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,M736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,ITTM[649.3660]QLER,ITTM[655.3735]QLER,1.147766,...,gray,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,ITTMQLER,732,8,ITT,3,735,EQRIKDMDLWEQQEEERITT,QLEREKALEHKNRMSRMLED
424,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,M529,HNRPM_M529,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,M[15.9949]GLSM[649.3660]ER,M[15.9949]GLSM[655.3735]ER,0.164323,...,gray,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MGLSMER,524,7,MGLS,4,528,ERMGSGVERMGPAIERMGLS,ERMVPAGMGAGLERMGPVMD


# Download Alphafold Data - 200uM siControl

In [245]:
# Set uniprot IDs to use
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print("Unique UniProt IDs: \n" + str(unique_uniprotIDs))
print("Number of Unique UniProt IDs: " + str(unique_uniprotIDs.size))

Unique UniProt IDs: 
['P25786' 'Q07065' 'Q13263' 'Q86UP2' 'P48643' 'P53999' 'Q9H078' 'Q9NTZ6'
 'P60842' 'P14625' 'Q96CT7' 'P12694' 'P67870' 'Q14011' 'Q99436' 'O60664'
 'Q99615' 'O95347' 'P15374' 'Q15717' 'P35579' 'Q9NZI8' 'P83731' 'Q16181'
 'P22307' 'P36542' 'P46777' 'Q8TBA6' 'Q9BQI0' 'P15311' 'Q9UQE7' 'Q99623'
 'Q13283' 'P11940' 'P61011' 'P23246' 'P09874' 'P35611' 'Q9UG63' 'P08238'
 'Q14671' 'P06733' 'Q96LB3' 'Q9UMX5' 'Q15233' 'P62258' 'Q01518' 'Q9UMX0'
 'P41236' 'P60228' 'P36578' 'P49736' 'Q02818' 'P52272' 'Q8N8S7' 'P84098'
 'P68104' 'P51003' 'Q9Y2L1' 'O14737' 'P62861' 'P31948' 'O14745' 'P30519'
 'P18621' 'Q96PK6' 'P12268' 'P07910' 'P18669' 'P12270' 'Q99733' 'P35637'
 'Q02790' 'P47914' 'Q01844' 'Q15019' 'P61978' 'Q07666' 'O14776' 'Q15056'
 'P06748' 'P40222' 'O75821' 'P14314' 'O00193' 'O43395' 'P36543' 'Q9UQN3'
 'Q9Y5A9' 'O75533' 'P52292' 'Q86V81' 'P27635' 'P68363' 'P14868' 'P16949'
 'Q8WXF1' 'P78371' 'Q9NR30' 'P22061' 'O95831' 'Q9UHV9' 'P37108' 'Q9Y266'
 'Q16543' 'P09669' 'P82970' 'Q

In [246]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 280/280 [00:00<00:00, 189940.99it/s]

2024-10-17 19:10:53> Valid proteins: 0
2024-10-17 19:10:53> Invalid proteins: 0
2024-10-17 19:10:53> Existing proteins: 280





In [247]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 280/280 [00:00<00:00, 192430.79it/s]

2024-10-17 19:10:53> Valid proteins: 0
2024-10-17 19:10:53> Invalid proteins: 0
2024-10-17 19:10:53> Existing proteins: 280





# Construct Alphafold Dataframe (Calculate Accessibilities) - 200uM siControl

In [248]:
# Format alphafold data into dataframe
alphafold_annotation_200uM_siControl = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation_200uM_siControl

100%|██████████| 1663/1663 [00:22<00:00, 74.49it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,-12.734,-11.583,-13.004,unstructured,unstructured,0,0,0,0,1
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,-15.819,,-14.660,unstructured,unstructured,0,0,0,0,1
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,-15.414,-16.107,-15.836,unstructured,unstructured,0,0,0,0,1
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,-14.858,-13.434,-14.749,HELX_RH_3T_P,HELX,0,1,0,0,0
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,-17.473,,-16.633,HELX_RH_3T_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147629,Q9Y5A9,280,Q,575,75.92,19.425,18.044,18.018,16.939,-17.423,...,-13.444,-12.722,-12.965,HELX_RH_AL_P,HELX,0,1,0,0,0
147630,Q9Y5A9,280,G,576,73.73,21.009,20.842,,19.614,-20.151,...,-12.044,,-12.243,HELX_RH_AL_P,HELX,0,1,0,0,0
147631,Q9Y5A9,280,R,577,70.88,19.908,19.849,18.593,19.904,-21.479,...,-14.435,-14.081,-13.474,TURN_TY1_P,TURN,0,0,0,1,0
147632,Q9Y5A9,280,G,578,59.40,21.699,20.303,,20.084,-18.953,...,-17.485,,-16.166,TURN_TY1_P,TURN,0,0,0,1,0


In [249]:
# Calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 280/280 [00:02<00:00, 107.83it/s]


In [250]:
alphafold_accessibility_200uM_siControl = alphafold_annotation_200uM_siControl.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [251]:
# Calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 280/280 [00:02<00:00, 132.59it/s]


In [252]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [253]:
# Calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 280/280 [00:02<00:00, 131.35it/s]


In [254]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [255]:
# Calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 280/280 [00:02<00:00, 125.59it/s]


In [256]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [257]:
# Calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 280/280 [00:02<00:00, 123.24it/s]


In [258]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [259]:
# Calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 280/280 [00:02<00:00, 129.24it/s]


In [260]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [261]:
# Calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 280/280 [00:02<00:00, 125.86it/s]


In [262]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [263]:
# Calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 280/280 [00:02<00:00, 128.57it/s]


In [264]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [265]:
# Calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 280/280 [00:02<00:00, 123.60it/s]


In [266]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [267]:
# Calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 280/280 [00:02<00:00, 104.30it/s]


In [268]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [269]:
# Calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 280/280 [00:02<00:00, 124.01it/s]


In [270]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [271]:
# Calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 280/280 [00:02<00:00, 116.20it/s]


In [272]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [273]:
# Calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 280/280 [00:03<00:00, 89.77it/s] 


In [274]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [275]:
# Calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 280/280 [00:03<00:00, 77.33it/s] 


In [276]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl;

In [277]:
# Calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_200uM_siControl, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 280/280 [00:02<00:00, 116.91it/s]


In [278]:
alphafold_accessibility_200uM_siControl = alphafold_accessibility_200uM_siControl.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_200uM_siControl

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,1,1,1,1,1,1,2,3,5,0
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,0,2,2,2,2,2,3,5,6,0
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,1,2,2,2,2,2,4,6,7,0
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,1,2,2,2,2,2,5,7,9,0
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,0,2,2,2,2,2,4,8,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147629,Q9Y5A9,280,Q,575,75.92,19.425,18.044,18.018,16.939,-17.423,...,1,2,2,2,3,3,7,12,16,2
147630,Q9Y5A9,280,G,576,73.73,21.009,20.842,,19.614,-20.151,...,2,2,2,2,2,2,7,11,12,2
147631,Q9Y5A9,280,R,577,70.88,19.908,19.849,18.593,19.904,-21.479,...,1,2,2,2,2,2,6,9,11,2
147632,Q9Y5A9,280,G,578,59.40,21.699,20.303,,20.084,-18.953,...,0,2,2,2,2,2,4,7,9,1


In [279]:
alphafold_accessibility_200uM_siControl_smooth = get_smooth_score(
    alphafold_accessibility_200uM_siControl, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_200uM_siControl_smooth;

100%|██████████| 280/280 [00:00<00:00, 574.65it/s]


In [280]:
alphafold_accessibility_200uM_siControl_smooth['IDR'] = np.where(
    alphafold_accessibility_200uM_siControl_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_200uM_siControl_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,E9PRG8,1,M,1,53.66,31.653,31.058,32.072,30.756,-15.522,...,1.818182,1.818182,1.909091,1.909091,2.000000,4.272727,7.545455,10.818182,0.181818,1
1,E9PRG8,1,G,2,61.43,32.653,31.558,,30.969,-12.926,...,1.833333,1.833333,1.916667,1.916667,2.000000,4.583333,8.083333,11.750000,0.333333,1
2,E9PRG8,1,A,3,63.59,34.813,35.038,36.302,33.893,-10.809,...,1.846154,1.846154,1.923077,2.000000,2.076923,4.846154,8.769231,12.615385,0.307692,1
3,E9PRG8,1,P,4,64.28,35.599,34.793,34.939,35.147,-7.676,...,1.857143,1.928571,2.071429,2.142857,2.214286,5.071429,9.285714,13.285714,0.285714,1
4,E9PRG8,1,G,5,59.04,37.048,37.549,,36.530,-6.832,...,1.866667,1.933333,2.133333,2.333333,2.466667,5.533333,9.866667,14.066667,0.400000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574,Q9Y5A9,280,Q,575,75.92,19.425,18.044,18.018,16.939,-17.423,...,1.933333,3.000000,3.533333,4.733333,5.000000,8.733333,17.200000,30.000000,1.866667,1
575,Q9Y5A9,280,G,576,73.73,21.009,20.842,,19.614,-20.151,...,1.928571,2.857143,3.357143,4.500000,4.785714,8.500000,16.071429,27.928571,1.928571,1
576,Q9Y5A9,280,R,577,70.88,19.908,19.849,18.593,19.904,-21.479,...,1.923077,2.615385,3.153846,4.230769,4.538462,8.230769,15.538462,26.461538,1.846154,1
577,Q9Y5A9,280,G,578,59.40,21.699,20.303,,20.084,-18.953,...,1.916667,2.500000,2.916667,3.916667,4.250000,8.000000,14.333333,23.916667,1.833333,1


# Merge Dataframes into Full Dataset (Includes Alphafold) - 200uM siControl

In [281]:
alphafold_accessibility_200uM_siControl_smooth["position"] = alphafold_accessibility_200uM_siControl_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_wa = peptides_cs.merge(
    alphafold_accessibility_200uM_siControl_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,M26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0.0
1,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,M423,CKAP4_M423,CKAP4,Cytoskeleton-associated protein 4,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,5.401829,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,sp|Q13263|TIF1B_HUMAN,Q13263,TIF1B_HUMAN,M796,TIF1B_M796,TRIM28,Transcription intermediary factor 1-beta,M[649.3660]NEAFGDTK,M[655.3735]NEAFGDTK,3.957123,...,2.428571,4.095238,4.666667,5.952381,6.047619,11.952381,35.952381,67.523810,2.904762,0.0
3,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,M643,KTN1_M643,KTN1,Kinectin,DIQNM[649.3660]NFLLK,DIQNM[655.3735]NFLLK,,...,3.142857,5.285714,6.333333,7.714286,7.714286,12.000000,19.809524,26.952381,2.047619,1.0
4,sp|P48643|TCPE_HUMAN,P48643,TCPE_HUMAN,M523,TCPE_M523,CCT5,T-complex protein 1 subunit epsilon,KQQISLATQM[649.3660]VR,KQQISLATQM[655.3735]VR,,...,2.047619,4.095238,5.619048,6.904762,7.571429,20.142857,71.761905,133.047619,6.380952,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,M15,VATE1_M15,ATP6V1E1,V-type proton ATPase subunit E 1,HM[649.3660]MAFIEQEANEK,HM[655.3735]MAFIEQEANEK,-0.550263,...,2.285714,4.238095,5.190476,6.714286,6.809524,10.904762,17.428571,23.714286,1.714286,1.0
418,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,M543,KINH_M543,KIF5B,Kinesin-1 heavy chain,LKEM[649.3660]TNHQK,LKEM[655.3735]TNHQK,,...,2.190476,4.952381,5.857143,7.523810,7.523810,11.523810,19.000000,25.380952,2.095238,1.0
419,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,M736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,ITTM[649.3660]QLER,ITTM[655.3735]QLER,1.147766,...,2.714286,5.095238,5.904762,7.714286,7.809524,11.761905,19.142857,25.809524,1.714286,1.0
420,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,M529,HNRPM_M529,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,M[15.9949]GLSM[649.3660]ER,M[15.9949]GLSM[655.3735]ER,0.164323,...,1.952381,1.952381,2.000000,2.000000,2.000000,4.142857,6.761905,10.285714,0.095238,1.0


In [282]:
# note: one row with missing AA steric analysis: AlphaFold seems to have a sequence that's 59 AA long, but UniProt has a sequence that's 133 AA long
display(peptides_wa[peptides_wa["Protein ID"] == "P62861"])
peptides_wa = peptides_wa[~(peptides_wa["Protein ID"] == "P62861")]
peptides_wa

Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
70,sp|P62861|RS30_HUMAN,P62861,RS30_HUMAN,M110,RS30_M110,FAU,Ubiquitin-like FUBI-ribosomal protein eS30 fus...,RRM[649.3660]QYNR,RRM[655.3735]QYNR,0.226856,...,,,,,,,,,,


Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,M26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0.0
1,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,M423,CKAP4_M423,CKAP4,Cytoskeleton-associated protein 4,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,5.401829,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,sp|Q13263|TIF1B_HUMAN,Q13263,TIF1B_HUMAN,M796,TIF1B_M796,TRIM28,Transcription intermediary factor 1-beta,M[649.3660]NEAFGDTK,M[655.3735]NEAFGDTK,3.957123,...,2.428571,4.095238,4.666667,5.952381,6.047619,11.952381,35.952381,67.523810,2.904762,0.0
3,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,M643,KTN1_M643,KTN1,Kinectin,DIQNM[649.3660]NFLLK,DIQNM[655.3735]NFLLK,,...,3.142857,5.285714,6.333333,7.714286,7.714286,12.000000,19.809524,26.952381,2.047619,1.0
4,sp|P48643|TCPE_HUMAN,P48643,TCPE_HUMAN,M523,TCPE_M523,CCT5,T-complex protein 1 subunit epsilon,KQQISLATQM[649.3660]VR,KQQISLATQM[655.3735]VR,,...,2.047619,4.095238,5.619048,6.904762,7.571429,20.142857,71.761905,133.047619,6.380952,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,M15,VATE1_M15,ATP6V1E1,V-type proton ATPase subunit E 1,HM[649.3660]MAFIEQEANEK,HM[655.3735]MAFIEQEANEK,-0.550263,...,2.285714,4.238095,5.190476,6.714286,6.809524,10.904762,17.428571,23.714286,1.714286,1.0
418,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,M543,KINH_M543,KIF5B,Kinesin-1 heavy chain,LKEM[649.3660]TNHQK,LKEM[655.3735]TNHQK,,...,2.190476,4.952381,5.857143,7.523810,7.523810,11.523810,19.000000,25.380952,2.095238,1.0
419,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,M736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,ITTM[649.3660]QLER,ITTM[655.3735]QLER,1.147766,...,2.714286,5.095238,5.904762,7.714286,7.809524,11.761905,19.142857,25.809524,1.714286,1.0
420,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,M529,HNRPM_M529,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,M[15.9949]GLSM[649.3660]ER,M[15.9949]GLSM[655.3735]ER,0.164323,...,1.952381,1.952381,2.000000,2.000000,2.000000,4.142857,6.761905,10.285714,0.095238,1.0


In [283]:
#peptides_wa.to_csv(os.path.join(curr_dir_path, "200uM_siControl_with_alphafold.csv"))

In [284]:
path = os.path.join(curr_dir_path, "200uM_siControl_with_alphafold.csv")
peptides_wa = pd.read_csv(path)
peptides_wa.set_index("Unnamed: 0", inplace=True)
peptides_wa.index.name = None
peptides_wa

Unnamed: 0,Protein,Protein ID,Entry Name,Site,Label,Gene,Protein Description,Light Modified Peptide,Heavy Modified Peptide,siControl_1 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,sp|P25786|PSA1_HUMAN,P25786,PSA1_HUMAN,M26,PSA1_M26,PSMA1,Proteasome subunit alpha type-1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0.0
1,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,M423,CKAP4_M423,CKAP4,Cytoskeleton-associated protein 4,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,5.401829,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,sp|Q13263|TIF1B_HUMAN,Q13263,TIF1B_HUMAN,M796,TIF1B_M796,TRIM28,Transcription intermediary factor 1-beta,M[649.3660]NEAFGDTK,M[655.3735]NEAFGDTK,3.957123,...,2.428571,4.095238,4.666667,5.952381,6.047619,11.952381,35.952381,67.523810,2.904762,0.0
3,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,M643,KTN1_M643,KTN1,Kinectin,DIQNM[649.3660]NFLLK,DIQNM[655.3735]NFLLK,,...,3.142857,5.285714,6.333333,7.714286,7.714286,12.000000,19.809524,26.952381,2.047619,1.0
4,sp|P48643|TCPE_HUMAN,P48643,TCPE_HUMAN,M523,TCPE_M523,CCT5,T-complex protein 1 subunit epsilon,KQQISLATQM[649.3660]VR,KQQISLATQM[655.3735]VR,,...,2.047619,4.095238,5.619048,6.904762,7.571429,20.142857,71.761905,133.047619,6.380952,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,sp|P36543|VATE1_HUMAN,P36543,VATE1_HUMAN,M15,VATE1_M15,ATP6V1E1,V-type proton ATPase subunit E 1,HM[649.3660]MAFIEQEANEK,HM[655.3735]MAFIEQEANEK,-0.550263,...,2.285714,4.238095,5.190476,6.714286,6.809524,10.904762,17.428571,23.714286,1.714286,1.0
418,sp|P33176|KINH_HUMAN,P33176,KINH_HUMAN,M543,KINH_M543,KIF5B,Kinesin-1 heavy chain,LKEM[649.3660]TNHQK,LKEM[655.3735]TNHQK,,...,2.190476,4.952381,5.857143,7.523810,7.523810,11.523810,19.000000,25.380952,2.095238,1.0
419,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,M736,EIF3A_M736,EIF3A,Eukaryotic translation initiation factor 3 sub...,ITTM[649.3660]QLER,ITTM[655.3735]QLER,1.147766,...,2.714286,5.095238,5.904762,7.714286,7.809524,11.761905,19.142857,25.809524,1.714286,1.0
420,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,M529,HNRPM_M529,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,M[15.9949]GLSM[649.3660]ER,M[15.9949]GLSM[655.3735]ER,0.164323,...,1.952381,1.952381,2.000000,2.000000,2.000000,4.142857,6.761905,10.285714,0.095238,1.0


# The End (For Now)