## Imports

In [1]:
# General imports 
import os
import re
import warnings
from io import StringIO

import numpy as np
import pandas as pd
import requests as r
from Bio import SeqIO
from tqdm import tqdm

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides any pandas warnings raised by later cells
# (e.g. about assigning into a possibly-copied frame) - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

# Presumably configures structuremap's logging - confirm against the structuremap docs
structuremap.utils.set_logger()

In [2]:
# Set parameters of analysis
analysis_threshold = 20  # number of amino acids either side to analyze

# Mass tags exactly as they appear inside the "M[...]" brackets of a modified peptide
light_modification = "649.3660"
heavy_modification = "655.3735"

# Modifications we are looking for (plain strings; create_modifications_pattern turns them into a regex)
modifications = [light_modification, heavy_modification]

## Load Combined Dataset - 500 uM

In [3]:
# Set correct pathing
curr_dir_path_str = "./"
global_data_path_str = "../global_data"

# Resolve both locations to absolute paths (relative to the kernel's working directory)
curr_dir_path = os.path.abspath(curr_dir_path_str)
global_data_path = os.path.abspath(global_data_path_str)

print(f"Current Directory: {curr_dir_path}")
print(f"Global Data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/LFD_HFD_Met_reactivity_corrected
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Load combined dataset - 500 uM
data_loc = os.path.join(curr_dir_path, "10_01_24_LFD_HFD_statistics.xlsx")

# Only a fixed window of the "500 uM combined" sheet is read:
# skip the first 9 rows, then parse at most 209 data rows from columns A-G.
combined_sheet_window = {"sheet_name": "500 uM combined", "skiprows": 9, "nrows": 209, "usecols": 'A:G'}
peptides_combined_500uM = pd.read_excel(data_loc, **combined_sheet_window)
peptides_combined_500uM

Unnamed: 0,Entry Name,Protein ID,Light Modified Peptide,Heavy Modified Peptide,Site Number,Site,Label
0,PLIN2_MOUSE,P43883,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,265,M265,PLIN2_M265
1,STIP1_MOUSE,Q60864,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,535,M535,STIP1_M535
2,MYH9_MOUSE,Q8VDD5,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,1565,M1565,MYH9_M1565
3,1433E_MOUSE,P62259,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,221,M221,1433E_M221
4,BIP_MOUSE,P20029,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,154,M154,BIP_M154
...,...,...,...,...,...,...,...
204,G3P_MOUSE,P16858,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,173,M173,G3P_M173
205,THIL_MOUSE,Q8QZT1,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,363,M363,THIL_M363
206,GLYAL_MOUSE,Q5FW57,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,34,M34,GLYAL_M34
207,ENPL_MOUSE,P08113,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,425,M425,ENPL_M425


In [5]:
# Manual labeling of peptides - 500 uM combined dataset
# Colors are assigned purely by row position: consecutive runs of rows share one color.
label_col_data = ["red"] * 24 + ["green"] * 20 + ["white"] * 55 + ["green"] * 50 + ["blue"] * 60
label_col = pd.Series(label_col_data)

# Column assignment aligns on the index, so a length mismatch would otherwise
# silently drop labels or leave NaN colors; fail loudly instead.
if len(label_col) != len(peptides_combined_500uM):
    raise ValueError(
        "Manual color labels cover {} rows but the combined dataset has {} rows".format(
            len(label_col), len(peptides_combined_500uM)
        )
    )
peptides_combined_500uM["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides_combined)
#pd.reset_option("display.max_rows")
peptides_combined_500uM;

## Load isoDTB Dataset - 500 uM

In [6]:
# Load isoDTB dataset - the whole "500 uM isoDTB" sheet of the statistics workbook
data_loc = os.path.join(curr_dir_path, "10_01_24_LFD_HFD_statistics.xlsx")
peptides_isoDTB_500uM = pd.read_excel(data_loc, sheet_name="500 uM isoDTB")
peptides_isoDTB_500uM

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,isodtb avg,pvalue
0,STM[649.3660]GKPQR,STM[655.3735]GKPQR,0.406868,0.395987,0.374974,0.416952,0.392813,0.393463,0.403596,0.380191,,sp|P53026|RL10A_MOUSE,P53026,RL10A_MOUSE,Rpl10a,Large ribosomal subunit protein uL1,0.395606,1.110121e-11
1,TAHIVLEDGTKM[649.3660]K,TAHIVLEDGTKM[655.3735]K,-1.345951,-1.336930,-1.259091,-1.154772,-1.156377,-1.096452,-1.280766,-1.275782,-1.310493,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.246290,1.138424e-10
2,M[649.3660]STSGPR,M[655.3735]STSGPR,1.479224,1.424925,1.259457,1.580921,1.624588,1.621573,1.539448,1.439540,1.613931,sp|P11679|K2C8_MOUSE,P11679,K2C8_MOUSE,Krt8,"Keratin, type II cytoskeletal 8",1.509290,2.958754e-10
3,M[649.3660]GFPEAASSFR,M[655.3735]GFPEAASSFR,1.867041,2.169877,2.136274,1.985958,2.333332,2.021928,2.302533,2.313512,2.365057,sp|P32020|SCP2_MOUSE,P32020,SCP2_MOUSE,Scp2,Sterol carrier protein 2,2.166168,3.365295e-10
4,AM[649.3660]LSTGFK,AM[655.3735]LSTGFK,-1.291887,-1.389566,-1.339514,-1.228957,-1.187654,-1.061855,-1.230488,-1.330375,-1.093638,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.239326,7.078562e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,,,-0.445163,,,,-0.464686,-1.512033,-1.494568,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,-0.979113,4.804266e-02
292,MREHVM[649.3660]K,MREHVM[655.3735]K,,0.377009,,0.882393,0.671755,,,,,sp|Q02819|NUCB1_MOUSE,Q02819,NUCB1_MOUSE,Nucb1,Nucleobindin-1,0.643719,4.812779e-02
293,KFM[649.3660]NPFNLPNLYQK,KFM[655.3735]NPFNLPNLYQK,,,,,,,,-1.148367,-1.336717,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,-1.242542,4.815872e-02
294,IRVM[649.3660]LYPSRI,IRVM[655.3735]LYPSRI,0.364001,,,0.354543,0.293109,0.420985,-0.043213,-0.017246,-0.013869,sp|O55142|RL35A_MOUSE,O55142,RL35A_MOUSE,Rpl35a,Large ribosomal subunit protein eL33,0.194044,4.874279e-02


In [7]:
# Distinct values of the "Protein ID" column in the 500 uM isoDTB data
unique_uniprotIDs = peptides_isoDTB_500uM["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs!s}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size!s}")

Unique UniProt IDs: 
['P53026' 'Q8C196' 'P11679' 'P32020' 'P08226' 'Q3UZZ6' 'Q05421' 'P10649'
 'P41216' 'P05784' 'P16460' 'P20852' 'Q99PT1' 'Q9R0H0' 'O35490' 'Q91YI0'
 'Q05920' 'P08113' 'Q8CAY6' 'P55264' 'P24369' 'Q9ET01' 'P97872' 'Q9QXF8'
 'P40630' 'P97371' 'P33267' 'Q9JKR6' 'Q91VS7' 'Q9CQC9' 'P17182' 'P53657'
 'P16858' 'P23116' 'Q99KR3' 'O35423' 'P08003' 'Q99K67' 'P62843' 'P16546'
 'P30115' 'Q78PY7' 'P20152' 'P29758' 'P17665' 'Q8R164' 'Q91W64' 'P97351'
 'P06728' 'P70694' 'Q91Y97' 'P50247' 'Q9CQQ7' 'Q9DBG5' 'Q07417' 'Q3THE2'
 'Q8BP67' 'P24549' 'Q5FW57' 'P20029' 'O09167' 'P47915' 'Q8VCT4' 'Q60864'
 'Q9CPQ8' 'P54071' 'Q8QZT1' 'P09411' 'Q9D0F9' 'P47911' 'Q9D819' 'Q63880'
 'P51174' 'P47963' 'P43883' 'F6ZDS4' 'P62806' 'P48678' 'Q9WVL0' 'Q8VC30'
 'Q8VDD5' 'P60710' 'Q9CPQ1' 'Q99KI0' 'P52825' 'Q9DBA8' 'Q8VC12' 'O54749'
 'Q8CHT0' 'P24472' 'P97450' 'P63325' 'Q00623' 'P97372' 'P48771' 'P84099'
 'Q9CY58' 'Q8K1Z0' 'P54869' 'O08709' 'P06745' 'P37804' 'P14824' 'P19783'
 'Q64442' 'Q8CFX1' 'Q8BG05' 'Q

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID, timeout=30):
    """Fetch the complete amino-acid sequence of one UniProt entry.

    Parameters
    ----------
    cID : str
        UniProt accession, used to build the ``<accession>.fasta`` URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        Sequence of the first FASTA record in the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the response contains no FASTA record.
    """
    baseUrl = "https://rest.uniprot.org/uniprotkb/"
    currentUrl = baseUrl + cID + ".fasta"

    # Downloading an entry is a read-only request, so use GET (not POST) and
    # stop on HTTP errors instead of trying to parse an error page as FASTA.
    response = r.get(currentUrl, timeout=timeout)
    response.raise_for_status()

    pSeq = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not pSeq:
        raise ValueError("No FASTA record returned for UniProt ID {!r}".format(cID))

    return str(pSeq[0].seq)

In [9]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated

In [10]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")

# The CSV column "Unnamed: 0" becomes the index again, and the index is left unnamed
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
sequence_cache_df_updated

Unnamed: 0,Protein ID,Complete Sequence
0,Q8C196,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,Q07417,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2,Q91YI0,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3,P50247,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4,P33267,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
...,...,...
28,Q9QX47,MAADIEQVFRSFVVSKFREIQQELSSGRSEGQLNGETNPPIEGNQA...
29,P10605,MWWSLILLSCLLALTSAHDKPSFHPLSDDLINYINKQNTTWQAGRN...
30,P38060,MASVRKAFPRRLVGLTSLRAVSTSSMGTLPKQVKIVEVGPRDGLQN...
31,P11499,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...


In [11]:
# Attach the cached columns to every peptide row via a left join on "Protein ID";
# rows whose protein is not in the cache keep NaN in the cache columns.
peptides_isoDTB_500uM = peptides_isoDTB_500uM.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_isoDTB_500uM;

In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    """Build a regex that matches a methionine carrying any of the given tags.

    Each tag ``t`` contributes the alternative ``M\\[t\\]`` with ``t`` escaped
    literally, so tags without a decimal point (e.g. "16") or with other regex
    metacharacters (e.g. "+15.9949") are handled as well.

    Parameters
    ----------
    modifications : sequence of str
        Tags exactly as written inside the square brackets, e.g. "649.3660".

    Returns
    -------
    str
        Alternatives joined with "|", in the order given.

    Raises
    ------
    ValueError
        If ``modifications`` is empty (an empty pattern would match everywhere).
    """
    if len(modifications) == 0:
        raise ValueError("modifications must contain at least one modification")

    return "|".join(r"M\[{}\]".format(re.escape(mod)) for mod in modifications)

# Build the pattern once from the configured modifications and show it
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
# Filter out modification symbols to get raw protein sequence (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return ``string`` keeping only the characters that appear in ``IUPACCodes``."""
    return ''.join(char for char in string if char in IUPACCodes)


peptides_isoDTB_500uM["Peptide Sequence"] = peptides_isoDTB_500uM["Heavy Modified Peptide"].map(filtering)
peptides_isoDTB_500uM;

In [14]:
# Zero-based offset of each peptide inside its complete sequence (str.find gives -1 when absent)
peptides_isoDTB_500uM["Sequence Location"] = pd.Series(
    [
        complete_sequence.find(peptide_sequence)
        for complete_sequence, peptide_sequence in zip(
            peptides_isoDTB_500uM["Complete Sequence"],
            peptides_isoDTB_500uM["Peptide Sequence"],
        )
    ],
    dtype=int,
)
peptides_isoDTB_500uM;

In [15]:
# Number of characters in each stripped peptide sequence
peptides_isoDTB_500uM["Sequence Length"] = peptides_isoDTB_500uM["Peptide Sequence"].str.len().astype(int)
peptides_isoDTB_500uM;

In [16]:
# Sanity check - ensure sequence indexing is correct
# Slice each complete sequence at the recorded location/length and compare with the peptide itself
temp = [
    complete_sequence[start:start + length]
    for complete_sequence, start, length in zip(
        peptides_isoDTB_500uM["Complete Sequence"],
        peptides_isoDTB_500uM["Sequence Location"],
        peptides_isoDTB_500uM["Sequence Length"],
    )
]
(temp == peptides_isoDTB_500uM["Peptide Sequence"]).value_counts()

Peptide Sequence
True    296
Name: count, dtype: int64

In [17]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return ``string`` keeping only the characters that appear in ``IUPACCodes``."""
    return ''.join(char for char in string if char in IUPACCodes)


# Group 1 is everything before the tagged methionine. "(.*)" is greedy, so if a
# peptide carries several matching tags the prefix runs up to the last one.
left_prefix_pattern = "(.*)({})".format(modifications_pattern)
print(left_prefix_pattern)

heavy_peptides = peptides_isoDTB_500uM["Heavy Modified Peptide"]
peptides_isoDTB_500uM["Left Prefix"] = heavy_peptides.str.extract(left_prefix_pattern)[0]
# Strip any remaining modification symbols so the prefix length counts residues only
peptides_isoDTB_500uM["Left Prefix"] = peptides_isoDTB_500uM["Left Prefix"].map(filtering)
peptides_isoDTB_500uM["Left Prefix Length"] = peptides_isoDTB_500uM["Left Prefix"].str.len().astype(int)

peptides_isoDTB_500uM;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [18]:
# Zero-based index of the tagged methionine in the complete sequence:
# offset of the peptide plus the number of residues preceding the methionine in it
peptides_isoDTB_500uM["Methionine Location"] = peptides_isoDTB_500uM["Sequence Location"] + peptides_isoDTB_500uM["Left Prefix Length"]
peptides_isoDTB_500uM;

In [19]:
# Sanity check - ensure methionine locations are correct
# Look up the residue at every computed location; each one should be "M"
temp = [
    complete_sequence[met_index]
    for complete_sequence, met_index in zip(
        peptides_isoDTB_500uM["Complete Sequence"],
        peptides_isoDTB_500uM["Methionine Location"],
    )
]
all(residue == "M" for residue in temp)

True

In [20]:
# Keep a single row per (protein, methionine position); drop_duplicates keeps the first occurrence by default
peptides_isoDTB_500uM = peptides_isoDTB_500uM.drop_duplicates(["Protein ID", "Methionine Location"]) #TODO: ask about this - is keeping the first duplicate okay, or should we do something more complex?
peptides_isoDTB_500uM;

## Merge Combined & isoDTB Dataframes - 500 uM

In [21]:
# One-index the positions to match "Site Number" in the initial dataframe.
# NOTE: the column is overwritten, so re-running this cell shifts the positions again.
peptides_isoDTB_500uM["Methionine Location"] = peptides_isoDTB_500uM["Methionine Location"].add(1)

# Left join: keep every combined-dataset row and attach the isoDTB row with the same protein and site
peptides_combined_500uM = pd.merge(
    peptides_combined_500uM,
    peptides_isoDTB_500uM,
    how="left",
    left_on=["Protein ID", "Site Number"],
    right_on=["Protein ID", "Methionine Location"],
)
peptides_combined_500uM

Unnamed: 0,Entry Name_x,Protein ID,Light Modified Peptide_x,Heavy Modified Peptide_x,Site Number,Site,Label,Color,Light Modified Peptide_y,Heavy Modified Peptide_y,...,Protein Description,isodtb avg,pvalue,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location
0,PLIN2_MOUSE,P43883,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,265,M265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,...,Perilipin-2,-1.060568,0.001504,MAAAVVDPQQSVVMRVANLPLVSSTYDLVSSAYVSTKDQYPYLRSV...,KNMHSANQK,262,9,KN,2,265
1,STIP1_MOUSE,Q60864,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,535,M535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,...,Stress-induced-phosphoprotein 1,9.182313,0.000744,MEQVNELKEKGNKALSAGNIDDALQCYSEAIKLDPQNHVLYSNRSA...,LMDVGLIAIR,533,10,L,1,535
2,MYH9_MOUSE,Q8VDD5,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,1565,M1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,...,Myosin-9,1.331119,0.007703,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSSKNGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1565
3,1433E_MOUSE,P62259,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,221,M221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,...,14-3-3 protein epsilon,0.656393,0.013444,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,DSTLIMQLLR,215,10,DSTLI,5,221
4,BIP_MOUSE,P20029,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,154,M154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,...,Endoplasmic reticulum chaperone BiP,1.767053,0.015673,MMKFTVVAAALLLLGAVRAEEEDKKEDVGTVVGIDLGTTYSCVGVF...,MKETAEAYLGK,153,11,,0,154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,G3P_MOUSE,P16858,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,173,M173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,...,Glyceraldehyde-3-phosphate dehydrogenase,-0.382459,0.000322,MVKVGVNGFGRIGRLVTRAAICSGKVEIVAINDPFIDLNYMVYMFQ...,VIHDNFGIVEGLMTTVHAITATQK,160,24,VIHDNFGIVEGL,12,173
205,THIL_MOUSE,Q8QZT1,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,363,M363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,...,"Acetyl-CoA acetyltransferase, mitochondrial",0.307201,0.014423,MAALVALHGVVRRPLLRGLLQEVRCLERSYASKPTLNEVVIVSAIR...,MLEIDPQK,362,8,,0,363
206,GLYAL_MOUSE,Q5FW57,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,34,M34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,...,Glycine N-acyltransferase-like protein,0.504804,0.000560,MLHLRSSQMLQMLESSLRKYLPESLKVYGTVFHMNQGNPFKLKALV...,VYGTVFHMNQGNPFK,26,15,VYGTVFH,7,34
207,ENPL_MOUSE,P08113,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,425,M425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,...,Endoplasmin,0.518175,0.000037,MRVLWVLGLCCVLLTFGFVRADDEVDVDGTVEEDLGKSREGSRTDD...,RVFITDDFHDMMPK,414,14,RVFITDDFHD,10,425


In [22]:
# Sanity check - ensure all peptides in initial combined dataframe have a match in the isoDTB dataframe
# Count missing values per column; a left-join row without a match has NaN in "Methionine Location"
temp = peptides_combined_500uM.isna().sum()
(temp["Methionine Location"] == 0)

True

In [23]:
# Clean up the combined dataframe - drop & rename columns

# Replicate identifiers in column order: 1_1, 1_2, 1_3, 2_1, ..., 3_3
replicate_ids = ["{}_{}".format(group, rep) for group in range(1, 4) for rep in range(1, 4)]
replicate_cols_500uM = ["{} isoDTB Log2 HL 500uM".format(rid) for rid in replicate_ids]

# Strip the "_x" merge suffix from the combined-dataset columns and tag the ratio columns with "500uM"
cols_to_rename = {
    'Entry Name_x': 'Entry Name',
    'Light Modified Peptide_x': 'Light Modified Peptide',
    'Heavy Modified Peptide_x': 'Heavy Modified Peptide',
}
cols_to_rename.update(
    ("{} Log2 Ratio HL".format(rid), new_name)
    for rid, new_name in zip(replicate_ids, replicate_cols_500uM)
)

cols_to_keep = [
    'Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Site Number', 'Label', 'Color', 'Light Modified Peptide', 'Heavy Modified Peptide',
] + replicate_cols_500uM

peptides_combined_500uM = peptides_combined_500uM.rename(columns=cols_to_rename)[cols_to_keep]
peptides_combined_500uM

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Description,Site Number,Label,Color,Light Modified Peptide,Heavy Modified Peptide,1_1 isoDTB Log2 HL 500uM,1_2 isoDTB Log2 HL 500uM,1_3 isoDTB Log2 HL 500uM,2_1 isoDTB Log2 HL 500uM,2_2 isoDTB Log2 HL 500uM,2_3 isoDTB Log2 HL 500uM,3_1 isoDTB Log2 HL 500uM,3_2 isoDTB Log2 HL 500uM,3_3 isoDTB Log2 HL 500uM
0,sp|P43883|PLIN2_MOUSE,P43883,PLIN2_MOUSE,Plin2,Perilipin-2,265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,-1.369733,,,-0.730133,,-0.737838,-1.295412,,-1.169723
1,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,,,8.708766,,,,,9.561387,9.276784
2,sp|Q8VDD5|MYH9_MOUSE,Q8VDD5,MYH9_MOUSE,Myh9,Myosin-9,1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,1.515208,1.365596,1.112553,,,
3,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,,0.199639,0.300016,1.033391,0.209846,-0.134683,1.225565,1.172932,1.244436
4,sp|P20029|BIP_MOUSE,P20029,BIP_MOUSE,Hspa5,Endoplasmic reticulum chaperone BiP,154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,,2.664610,,0.966154,1.901734,,,1.535712,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,sp|P16858|G3P_MOUSE,P16858,G3P_MOUSE,Gapdh,Glyceraldehyde-3-phosphate dehydrogenase,173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,-0.266606,-0.360720,-0.581778,-0.021438,-0.287734,-0.389033,-0.685679,-0.467100,-0.382046
205,sp|Q8QZT1|THIL_MOUSE,Q8QZT1,THIL_MOUSE,Acat1,"Acetyl-CoA acetyltransferase, mitochondrial",363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,0.246808,0.517785,0.237241,0.519555,0.451872,0.623949,-0.033769,-0.105834,
206,sp|Q5FW57|GLYAL_MOUSE,Q5FW57,GLYAL_MOUSE,Gm4952,Glycine N-acyltransferase-like protein,34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,0.345814,0.489821,0.289352,0.763115,0.743601,1.019832,0.385364,0.260730,0.245604
207,sp|P08113|ENPL_MOUSE,P08113,ENPL_MOUSE,Hsp90b1,Endoplasmin,425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,0.561879,0.660556,0.705344,0.261344,0.543667,0.672810,0.458355,0.165973,0.633643


## Load isoDTB Dataset - 0 uM

In [24]:
# Load isoDTB dataset - the whole "0 uM isoDTB" sheet of the statistics workbook
data_loc = os.path.join(curr_dir_path, "10_01_24_LFD_HFD_statistics.xlsx")
peptides_isoDTB_0uM = pd.read_excel(data_loc, sheet_name="0 uM isoDTB")
peptides_isoDTB_0uM

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,3_3 Log2 Ratio HL,Protein,isoDTB ID,Entry Name,Gene,Protein Description,isoDTB average,pvalue
0,EVEM[649.3660]DAVGK,EVEM[655.3735]DAVGK,-1.652314,-1.579836,-1.577018,-1.487990,-1.507101,-1.574416,-1.371444,-1.559925,-1.522835,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.536987,8.008912e-12
1,TAHIVLEDGTKM[649.3660]K,TAHIVLEDGTKM[655.3735]K,-1.456215,-1.474618,-1.465335,-1.308171,-1.312756,-1.430254,-1.286013,-1.302464,-1.259681,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.366168,4.959904e-11
2,EEM[649.3660]DHSVSPFMR,EEM[655.3735]DHSVSPFMR,-1.287048,-1.209515,-1.313581,-1.306046,-1.327070,-1.281267,-1.398723,-1.378484,-1.063495,sp|Q3UZZ6|ST1D1_MOUSE,Q3UZZ6,ST1D1_MOUSE,Sult1d1,Sulfotransferase 1 family member D1,-1.285025,2.194522e-10
3,DELGLNKYM[649.3660]ESDGIK,DELGLNKYM[655.3735]ESDGIK,-1.432976,-1.490696,-1.476548,-1.388501,-1.133721,-1.260989,-1.320824,-1.476059,-1.380138,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.373383,4.989561e-10
4,AM[649.3660]LSTGFK,AM[655.3735]LSTGFK,-1.305008,-1.558018,-1.403343,-1.274761,-1.143798,-1.340259,-1.109032,-1.317659,-1.278765,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.303405,1.916621e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,TFVVQGFGNVGLHSM[649.3660]R,TFVVQGFGNVGLHSM[655.3735]R,,0.139565,,0.435170,0.701000,0.297533,,,,sp|P26443|DHE3_MOUSE,P26443,DHE3_MOUSE,Glud1,"Glutamate dehydrogenase 1, mitochondrial",0.393317,4.557378e-02
284,LESEM[649.3660]EDAYHEHQANLLR,LESEM[655.3735]EDAYHEHQANLLR,,-1.018235,,-0.331426,,-0.695029,0.006284,-0.197496,-0.207434,sp|Q8VIJ6|SFPQ_MOUSE,Q8VIJ6,SFPQ_MOUSE,Sfpq,"Splicing factor, proline- and glutamine-rich",-0.407223,4.628352e-02
285,FAGLHFFNPVPM[649.3660]MK,FAGLHFFNPVPM[655.3735]MK,0.203382,-0.075219,0.655993,0.963381,0.537992,0.395622,0.585876,-0.344842,-0.002494,sp|Q61425|HCDH_MOUSE,Q61425,HCDH_MOUSE,Hadh,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",0.324410,4.632590e-02
286,ISIEM[649.3660]NGTLEDQLSHLK,ISIEM[655.3735]NGTLEDQLSHLK,,-0.488006,,-0.450109,-0.674747,-0.617400,-0.275914,0.278649,,sp|P57780|ACTN4_MOUSE,P57780,ACTN4_MOUSE,Actn4,Alpha-actinin-4,-0.371254,4.730059e-02


In [25]:
# Canonicalize data - rename col isoDTB ID -> Protein ID
# (gives this sheet the same "Protein ID" key that the later merges join on)
peptides_isoDTB_0uM = peptides_isoDTB_0uM.rename(columns={"isoDTB ID": "Protein ID"})
peptides_isoDTB_0uM;

In [26]:
# Distinct values of the "Protein ID" column in the 0 uM isoDTB data
unique_uniprotIDs = peptides_isoDTB_0uM["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs!s}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size!s}")

Unique UniProt IDs: 
['Q8C196' 'Q3UZZ6' 'P11679' 'P08226' 'P10649' 'P05784' 'P16460' 'P33267'
 'Q05421' 'P56480' 'P20852' 'Q91YI0' 'P38647' 'O35490' 'Q03265' 'Q9R0H0'
 'P16858' 'P97872' 'P24369' 'P30115' 'P32020' 'Q78PY7' 'P20029' 'Q8VC30'
 'P17182' 'P08003' 'P97371' 'Q99K67' 'Q8CAY6' 'Q9CPQ8' 'Q63918' 'Q9ET01'
 'P54869' 'Q99KI0' 'P50247' 'P48771' 'Q91W64' 'P11725' 'Q9CQC9' 'P47962'
 'Q9JKR6' 'P47915' 'P41216' 'Q91VS7' 'P08113' 'O09167' 'Q63886' 'P17665'
 'Q9CQQ7' 'Q8VCT4' 'Q9EQ20' 'Q8CHT0' 'Q99PT1' 'P06745' 'P14824' 'Q9CPU0'
 'Q07417' 'Q9QXF8' 'P55264' 'Q99K48' 'P62843' 'P09411' 'Q921G7' 'P29758'
 'Q9DBG5' 'P40630' 'Q99JY0' 'P24456' 'Q8QZT1' 'P97372' 'P06728' 'P50136'
 'Q9CPQ1' 'P53657' 'P29341' 'P20152' 'O08601' 'P24472' 'P47911' 'P62259'
 'P54071' 'Q9CZX8' 'Q8CFX1' 'Q8BMS1' 'P63038' 'P43883' 'Q63880' 'P22315'
 'Q61425' 'P97351' 'Q9Z2I8' 'Q8R0Y6' 'P24270' 'Q9DBJ1' 'Q8BG05' 'Q01853'
 'P10126' 'Q9CY58' 'P63017' 'Q5FW57' 'Q99KR3' 'Q8K1Z0' 'Q05920' 'P53026'
 'P48678' 'Q9CZ44' 'P50518' 'P

In [27]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated

In [28]:
# Load cache df: mapping from UniProt IDs to complete AA sequence
path = os.path.join(global_data_path, "complete_sequence_cache.csv")

# The CSV column "Unnamed: 0" becomes the index again, and the index is left unnamed
sequence_cache_df_updated = pd.read_csv(path).set_index("Unnamed: 0").rename_axis(None)
sequence_cache_df_updated

Unnamed: 0,Protein ID,Complete Sequence
0,Q8C196,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,Q07417,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2,Q91YI0,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3,P50247,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4,P33267,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
...,...,...
28,Q9QX47,MAADIEQVFRSFVVSKFREIQQELSSGRSEGQLNGETNPPIEGNQA...
29,P10605,MWWSLILLSCLLALTSAHDKPSFHPLSDDLINYINKQNTTWQAGRN...
30,P38060,MASVRKAFPRRLVGLTSLRAVSTSSMGTLPKQVKIVEVGPRDGLQN...
31,P11499,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...


In [29]:
# Attach the cached columns to every peptide row via a left join on "Protein ID";
# rows whose protein is not in the cache keep NaN in the cache columns.
peptides_isoDTB_0uM = peptides_isoDTB_0uM.merge(sequence_cache_df_updated, how="left", on="Protein ID")
peptides_isoDTB_0uM;

In [30]:
# Filter out modification symbols to get raw protein sequence (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return ``string`` keeping only the characters that appear in ``IUPACCodes``."""
    return ''.join(char for char in string if char in IUPACCodes)


peptides_isoDTB_0uM["Peptide Sequence"] = peptides_isoDTB_0uM["Heavy Modified Peptide"].map(filtering)
peptides_isoDTB_0uM;

In [31]:
# Zero-based offset of each peptide inside its complete sequence (str.find gives -1 when absent)
peptides_isoDTB_0uM["Sequence Location"] = pd.Series(
    [
        complete_sequence.find(peptide_sequence)
        for complete_sequence, peptide_sequence in zip(
            peptides_isoDTB_0uM["Complete Sequence"],
            peptides_isoDTB_0uM["Peptide Sequence"],
        )
    ],
    dtype=int,
)
peptides_isoDTB_0uM;

In [32]:
# Number of characters in each stripped peptide sequence
peptides_isoDTB_0uM["Sequence Length"] = peptides_isoDTB_0uM["Peptide Sequence"].str.len().astype(int)
peptides_isoDTB_0uM;

In [33]:
# Sanity check - ensure sequence indexing is correct
# Slice each complete sequence at the recorded location/length and compare with the peptide itself
temp = [
    complete_sequence[start:start + length]
    for complete_sequence, start, length in zip(
        peptides_isoDTB_0uM["Complete Sequence"],
        peptides_isoDTB_0uM["Sequence Location"],
        peptides_isoDTB_0uM["Sequence Length"],
    )
]
(temp == peptides_isoDTB_0uM["Peptide Sequence"]).value_counts()

Peptide Sequence
True    288
Name: count, dtype: int64

In [34]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return ``string`` keeping only the characters that appear in ``IUPACCodes``."""
    return ''.join(char for char in string if char in IUPACCodes)


# Group 1 is everything before the tagged methionine. "(.*)" is greedy, so if a
# peptide carries several matching tags the prefix runs up to the last one.
left_prefix_pattern = "(.*)({})".format(modifications_pattern)
print(left_prefix_pattern)

heavy_peptides = peptides_isoDTB_0uM["Heavy Modified Peptide"]
peptides_isoDTB_0uM["Left Prefix"] = heavy_peptides.str.extract(left_prefix_pattern)[0]
# Strip any remaining modification symbols so the prefix length counts residues only
peptides_isoDTB_0uM["Left Prefix"] = peptides_isoDTB_0uM["Left Prefix"].map(filtering)
peptides_isoDTB_0uM["Left Prefix Length"] = peptides_isoDTB_0uM["Left Prefix"].str.len().astype(int)

peptides_isoDTB_0uM;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [35]:
# Zero-based index of the tagged methionine in the complete sequence:
# offset of the peptide plus the number of residues preceding the methionine in it
peptides_isoDTB_0uM["Methionine Location"] = peptides_isoDTB_0uM["Sequence Location"] + peptides_isoDTB_0uM["Left Prefix Length"]
peptides_isoDTB_0uM;

In [36]:
# Sanity check - ensure methionine locations are correct
# Look up the residue at every computed location; each one should be "M"
temp = [
    complete_sequence[met_index]
    for complete_sequence, met_index in zip(
        peptides_isoDTB_0uM["Complete Sequence"],
        peptides_isoDTB_0uM["Methionine Location"],
    )
]
all(residue == "M" for residue in temp)

True

In [37]:
# Keep a single row per (protein, methionine position); drop_duplicates keeps the first occurrence by default
peptides_isoDTB_0uM = peptides_isoDTB_0uM.drop_duplicates(["Protein ID", "Methionine Location"]) #TODO: ask about this - is keeping the first duplicate okay, or should we do something more complex?
peptides_isoDTB_0uM;

## Merge Combined & isoDTB Dataframes - 0 uM

In [38]:
# One-index the positions to match "Site Number" in the initial dataframe.
# NOTE: the column is overwritten, so re-running this cell shifts the positions again.
peptides_isoDTB_0uM["Methionine Location"] = peptides_isoDTB_0uM["Methionine Location"].add(1)

# Left join: keep every combined-dataset row and attach the 0 uM isoDTB row with the same protein and site
peptides_combined_500uM = pd.merge(
    peptides_combined_500uM,
    peptides_isoDTB_0uM,
    how="left",
    left_on=["Protein ID", "Site Number"],
    right_on=["Protein ID", "Methionine Location"],
)
peptides_combined_500uM

Unnamed: 0,Protein_x,Protein ID,Entry Name_x,Gene_x,Protein Description_x,Site Number,Label,Color,Light Modified Peptide_x,Heavy Modified Peptide_x,...,Protein Description_y,isoDTB average,pvalue,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location
0,sp|P43883|PLIN2_MOUSE,P43883,PLIN2_MOUSE,Plin2,Perilipin-2,265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,...,Perilipin-2,-1.042984,0.001549,MAAAVVDPQQSVVMRVANLPLVSSTYDLVSSAYVSTKDQYPYLRSV...,KNMHSANQK,262.0,9.0,KN,2.0,265.0
1,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,...,,,,,,,,,,
2,sp|Q8VDD5|MYH9_MOUSE,Q8VDD5,MYH9_MOUSE,Myh9,Myosin-9,1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,...,,,,,,,,,,
3,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,...,,,,,,,,,,
4,sp|P20029|BIP_MOUSE,P20029,BIP_MOUSE,Hspa5,Endoplasmic reticulum chaperone BiP,154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,sp|P16858|G3P_MOUSE,P16858,G3P_MOUSE,Gapdh,Glyceraldehyde-3-phosphate dehydrogenase,173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,...,Glyceraldehyde-3-phosphate dehydrogenase,-0.345148,0.004713,MVKVGVNGFGRIGRLVTRAAICSGKVEIVAINDPFIDLNYMVYMFQ...,VIHDNFGIVEGLMTTVHAITATQK,160.0,24.0,VIHDNFGIVEGL,12.0,173.0
205,sp|Q8QZT1|THIL_MOUSE,Q8QZT1,THIL_MOUSE,Acat1,"Acetyl-CoA acetyltransferase, mitochondrial",363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,...,"Acetyl-CoA acetyltransferase, mitochondrial",0.293440,0.000613,MAALVALHGVVRRPLLRGLLQEVRCLERSYASKPTLNEVVIVSAIR...,MLEIDPQK,362.0,8.0,,0.0,363.0
206,sp|Q5FW57|GLYAL_MOUSE,Q5FW57,GLYAL_MOUSE,Gm4952,Glycine N-acyltransferase-like protein,34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,...,Glycine N-acyltransferase-like protein,0.558688,0.003065,MLHLRSSQMLQMLESSLRKYLPESLKVYGTVFHMNQGNPFKLKALV...,VYGTVFHMNQGNPFK,26.0,15.0,VYGTVFH,7.0,34.0
207,sp|P08113|ENPL_MOUSE,P08113,ENPL_MOUSE,Hsp90b1,Endoplasmin,425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,...,Endoplasmin,0.563888,0.000089,MRVLWVLGLCCVLLTFGFVRADDEVDVDGTVEEDLGKSREGSRTDD...,RVFITDDFHDMMPK,414.0,14.0,RVFITDDFHD,10.0,425.0


In [39]:
# Sanity check - ensure all peptides in initial combined dataframe have a match in the isoDTB dataframe
# TODO: note - there's a mistake in "0 uM isoDTB" cell A47 - the heavy and light modifications are mismatched, I went by the modification described in "0 uM combined" cells C219 and D219
# Count missing values per column; a left-join row without a match has NaN in "Methionine Location",
# so a False result here means at least one combined row has no 0 uM isoDTB counterpart.
temp = peptides_combined_500uM.isna().sum()
(temp["Methionine Location"] == 0)

False

In [40]:
#TODO: deal with matches not found
# Disabled debug helper: would display, with all columns visible, the rows whose "Complete Sequence" is missing.
# NOTE(review): this refers to `peptides_combined`, while the surrounding cells operate on
# `peptides_combined_500uM` - confirm which frame is intended before re-enabling.
#pd.set_option("display.max_columns", None)
#display(peptides_combined[peptides_combined["Complete Sequence"].isna()])
#pd.reset_option("display.max_columns")

In [41]:
# Clean up the combined dataframe - drop & rename columns
#
# The column names are generated instead of typed out by hand; the resulting strings and
# their order are exactly those of the previous hand-written dict/list.

# Metadata columns that may carry the "_x" suffix pandas gives to left-hand columns on a name clash
suffixed_cols = [
    'Protein', 'Entry Name', 'Gene', 'Protein Description',
    'Light Modified Peptide', 'Heavy Modified Peptide'
]
# isoDTB run labels "<i>_<j>" for i, j in 1..3, ordered 1_1, 1_2, 1_3, 2_1, ..., 3_3
isoDTB_runs = [f'{i}_{j}' for i in range(1, 4) for j in range(1, 4)]

cols_to_rename = {f'{col}_x': col for col in suffixed_cols}
# NOTE(review): the raw "Log2 Ratio HL" columns are renamed to "0uM" names although this section
# processes the 500 uM data - confirm this mapping is intended (rename() ignores keys that are absent).
cols_to_rename.update(
    {f'{run} Log2 Ratio HL': f'{run} isoDTB Log2 HL 0uM' for run in isoDTB_runs}
)

# Columns retained, in output order; every column not listed here is dropped.
cols_to_keep = (
    ['Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Site Number', 'Label', 'Color', 'Light Modified Peptide', 'Heavy Modified Peptide']
    + [f'{run} isoDTB Log2 HL 0uM' for run in isoDTB_runs]
    + [f'{run} isoDTB Log2 HL 500uM' for run in isoDTB_runs]
)

# rename() returns a new frame, so a frame displayed by an earlier cell is not mutated in place.
peptides_combined_500uM = peptides_combined_500uM.rename(columns=cols_to_rename)[cols_to_keep]
peptides_combined_500uM

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Description,Site Number,Label,Color,Light Modified Peptide,Heavy Modified Peptide,...,3_3 isoDTB Log2 HL 0uM,1_1 isoDTB Log2 HL 500uM,1_2 isoDTB Log2 HL 500uM,1_3 isoDTB Log2 HL 500uM,2_1 isoDTB Log2 HL 500uM,2_2 isoDTB Log2 HL 500uM,2_3 isoDTB Log2 HL 500uM,3_1 isoDTB Log2 HL 500uM,3_2 isoDTB Log2 HL 500uM,3_3 isoDTB Log2 HL 500uM
0,sp|P43883|PLIN2_MOUSE,P43883,PLIN2_MOUSE,Plin2,Perilipin-2,265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,...,-1.223533,-1.369733,,,-0.730133,,-0.737838,-1.295412,,-1.169723
1,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,...,,,,8.708766,,,,,9.561387,9.276784
2,sp|Q8VDD5|MYH9_MOUSE,Q8VDD5,MYH9_MOUSE,Myh9,Myosin-9,1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,...,,,,,1.515208,1.365596,1.112553,,,
3,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,...,,,0.199639,0.300016,1.033391,0.209846,-0.134683,1.225565,1.172932,1.244436
4,sp|P20029|BIP_MOUSE,P20029,BIP_MOUSE,Hspa5,Endoplasmic reticulum chaperone BiP,154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,...,,,2.664610,,0.966154,1.901734,,,1.535712,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,sp|P16858|G3P_MOUSE,P16858,G3P_MOUSE,Gapdh,Glyceraldehyde-3-phosphate dehydrogenase,173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,...,-0.040965,-0.266606,-0.360720,-0.581778,-0.021438,-0.287734,-0.389033,-0.685679,-0.467100,-0.382046
205,sp|Q8QZT1|THIL_MOUSE,Q8QZT1,THIL_MOUSE,Acat1,"Acetyl-CoA acetyltransferase, mitochondrial",363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,...,0.171321,0.246808,0.517785,0.237241,0.519555,0.451872,0.623949,-0.033769,-0.105834,
206,sp|Q5FW57|GLYAL_MOUSE,Q5FW57,GLYAL_MOUSE,Gm4952,Glycine N-acyltransferase-like protein,34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,...,0.322265,0.345814,0.489821,0.289352,0.763115,0.743601,1.019832,0.385364,0.260730,0.245604
207,sp|P08113|ENPL_MOUSE,P08113,ENPL_MOUSE,Hsp90b1,Endoplasmin,425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,...,0.401574,0.561879,0.660556,0.705344,0.261344,0.543667,0.672810,0.458355,0.165973,0.633643


## Load ReDiMe Dataset - 0 uM

In [42]:
# Load ReDiMe dataset
# Reads the "0 uM ReDiMe" worksheet of the statistics workbook located in curr_dir_path.
data_loc = os.path.join(
    curr_dir_path,
    "10_01_24_LFD_HFD_statistics.xlsx",
)
peptides_ReDiMe_0uM = pd.read_excel(io=data_loc, sheet_name="0 uM ReDiMe")
peptides_ReDiMe_0uM

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Description,1_1 Median Log2 Ratios HL,1_2 Median Log2 Ratios HL,1_3 Median Log2 Ratios HL,1_4 Median Log2 Ratios HL,1_5 Median Log2 Ratios HL,...,3_1 Median Log2 Ratios HL,3_2 Median Log2 Ratios HL,3_3 Median Log2 Ratios HL,3_4 Median Log2 Ratios HL,3_5 Median Log2 Ratios HL,3_6 Median Log2 Ratios HL,3_7 Median Log2 Ratios HL,3_8 Median Log2 Ratios HL,average,pvalue
0,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,-0.916443,-0.977901,-0.888973,-1.037834,-0.944586,...,-0.957254,-0.962434,-1.030610,-1.074627,-0.922918,-0.967780,-0.925565,-0.966659,-0.954802,8.091328e-25
1,sp|P16460|ASSY_MOUSE,P16460,ASSY_MOUSE,Ass1,Argininosuccinate synthase,-1.052989,-1.202156,-1.129500,-1.040042,-1.040275,...,-1.029816,-0.953847,-0.998343,-0.960643,-0.924588,-0.983063,-1.011745,-0.972035,-0.977770,1.679346e-24
2,sp|P05784|K1C18_MOUSE,P05784,K1C18_MOUSE,Krt18,"Keratin, type I cytoskeletal 18",1.090436,1.191949,1.049693,1.141795,1.092737,...,1.150336,1.104929,1.220629,1.090980,1.093305,1.070371,1.262880,1.286621,1.187074,2.068161e-24
3,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.387155,-1.375819,-1.246541,-1.449857,-1.455256,...,-1.081493,-1.076408,-1.057085,-1.181103,-1.106579,-1.137529,-1.128430,-1.156916,-1.261129,9.343334e-24
4,sp|P00329|ADH1_MOUSE,P00329,ADH1_MOUSE,Adh1,Alcohol dehydrogenase 1,-0.780173,-0.731226,-0.906229,-1.040369,-0.995301,...,-1.020629,-0.914203,-0.974374,-1.005065,-0.969847,-1.146716,-1.088566,-1.048351,-0.994785,2.618574e-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295,sp|Q9JIF7|COPB_MOUSE,Q9JIF7,COPB_MOUSE,Copb1,Coatomer subunit beta,0.389774,-0.364291,0.287114,0.424588,0.169938,...,0.208066,-0.169927,,0.893145,-0.196593,-0.131993,,,0.160926,4.929294e-02
1296,sp|Q7TNG5|EMAL2_MOUSE,Q7TNG5,EMAL2_MOUSE,Eml2,Echinoderm microtubule-associated protein-like 2,,,,,0.117314,...,,,,,-0.253449,-0.573225,-0.618432,,-0.322357,4.948872e-02
1297,sp|Q8R123|FAD1_MOUSE,Q8R123,FAD1_MOUSE,Flad1,FAD synthase,,,,,,...,,,0.145824,,0.559952,0.972980,-0.055343,,0.236306,4.962217e-02
1298,sp|Q8BSF4|PISD_MOUSE,Q8BSF4,PISD_MOUSE,Pisd,"Phosphatidylserine decarboxylase proenzyme, mi...",,,,,,...,,,,,,,,,2.050492,4.980104e-02


## Merge Combined & ReDiMe Dataframes - 0 uM

In [43]:
# Attach the 0 uM ReDiMe columns to every peptide row, matching on "Protein ID".
# how="left" keeps all peptide rows; rows whose Protein ID is absent from the ReDiMe
# table receive NaN in the ReDiMe columns.
merged_ReDiMe_0uM = peptides_combined_500uM.merge(
    peptides_ReDiMe_0uM,
    how="left",
    on="Protein ID",
)

# A left merge never drops rows but duplicates a peptide row whenever its Protein ID occurs
# more than once in the right-hand frame. The later per-peptide calculations rely on one row
# per peptide, so fail loudly instead of silently inflating the table.
assert len(merged_ReDiMe_0uM) == len(peptides_combined_500uM), (
    "Merging '0 uM ReDiMe' changed the number of peptide rows "
    f"({len(peptides_combined_500uM)} -> {len(merged_ReDiMe_0uM)}); "
    "check for duplicated 'Protein ID' values in the ReDiMe sheet."
)

peptides_combined_500uM = merged_ReDiMe_0uM
peptides_combined_500uM

Unnamed: 0,Protein_x,Protein ID,Entry Name_x,Gene_x,Protein Description_x,Site Number,Label,Color,Light Modified Peptide,Heavy Modified Peptide,...,3_1 Median Log2 Ratios HL,3_2 Median Log2 Ratios HL,3_3 Median Log2 Ratios HL,3_4 Median Log2 Ratios HL,3_5 Median Log2 Ratios HL,3_6 Median Log2 Ratios HL,3_7 Median Log2 Ratios HL,3_8 Median Log2 Ratios HL,average,pvalue
0,sp|P43883|PLIN2_MOUSE,P43883,PLIN2_MOUSE,Plin2,Perilipin-2,265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,...,,-0.093658,,-0.424812,-0.990354,-0.445707,,,-0.803372,1.446247e-04
1,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,...,-0.676230,-0.151476,-0.254596,0.168084,-0.426328,,,-1.662186,-0.461575,5.231347e-04
2,sp|Q8VDD5|MYH9_MOUSE,Q8VDD5,MYH9_MOUSE,Myh9,Myosin-9,1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,...,,,,,,,,,,
3,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,...,-0.225702,-0.680299,0.044045,,,,,,-0.326781,7.275591e-05
4,sp|P20029|BIP_MOUSE,P20029,BIP_MOUSE,Hspa5,Endoplasmic reticulum chaperone BiP,154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,...,0.771290,0.815559,0.894010,0.713129,0.734807,0.884557,0.818994,0.915525,0.744040,3.861375e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,sp|P16858|G3P_MOUSE,P16858,G3P_MOUSE,Gapdh,Glyceraldehyde-3-phosphate dehydrogenase,173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,...,-0.526561,-0.571704,-0.283968,-0.300626,-0.440403,-0.440049,-0.478019,-0.946597,-0.462424,3.020104e-08
205,sp|Q8QZT1|THIL_MOUSE,Q8QZT1,THIL_MOUSE,Acat1,"Acetyl-CoA acetyltransferase, mitochondrial",363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,...,0.139581,0.216953,0.265377,0.201293,0.323118,0.200383,0.336694,0.131345,0.191219,4.738610e-06
206,sp|Q5FW57|GLYAL_MOUSE,Q5FW57,GLYAL_MOUSE,Gm4952,Glycine N-acyltransferase-like protein,34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,...,0.344145,0.165800,,,0.357807,0.083290,-0.079255,,0.288001,5.527493e-05
207,sp|P08113|ENPL_MOUSE,P08113,ENPL_MOUSE,Hsp90b1,Endoplasmin,425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,...,0.578385,0.521907,0.724153,0.465814,0.495633,0.517103,0.528513,0.373274,0.444991,5.815272e-12


In [44]:
# Sanity check - every peptide of the initial combined dataframe should have a match in the ReDiMe dataframe.
# "average" is one of the columns brought in from the ReDiMe table by the left merge; the expression is
# True only when it has no missing values (a NaN presumably means the peptide's protein was not matched).
temp = peptides_combined_500uM.isnull().sum(axis=0)  # NaN count per column
temp.loc["average"] == 0

False

In [45]:
# Clean up the combined dataframe - drop & rename columns
#
# The column names are generated instead of typed out by hand; the resulting strings and
# their order are exactly those of the previous hand-written dict/list.

# Metadata columns that carry the "_x" suffix after the merge with the ReDiMe table
suffixed_cols = ['Protein', 'Entry Name', 'Gene', 'Protein Description']
# isoDTB run labels "<i>_<j>", i and j in 1..3, ordered 1_1, 1_2, ..., 3_3
isoDTB_runs = [f'{i}_{j}' for i in range(1, 4) for j in range(1, 4)]
# ReDiMe run labels "<i>_<k>", i in 1..3 and k in 1..8, ordered 1_1, ..., 1_8, 2_1, ..., 3_8
ReDiMe_runs = [f'{i}_{k}' for i in range(1, 4) for k in range(1, 9)]

cols_to_rename = {f'{col}_x': col for col in suffixed_cols}
# Tag the freshly merged ReDiMe ratio columns with their concentration (0 uM)
cols_to_rename.update(
    {f'{run} Median Log2 Ratios HL': f'{run} ReDiMe Log2 HL 0uM' for run in ReDiMe_runs}
)

# Columns retained, in output order; everything not listed (e.g. 'average', 'pvalue') is dropped.
cols_to_keep = (
    ['Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Site Number', 'Label', 'Color', 'Light Modified Peptide', 'Heavy Modified Peptide']
    + [f'{run} isoDTB Log2 HL 0uM' for run in isoDTB_runs]
    + [f'{run} isoDTB Log2 HL 500uM' for run in isoDTB_runs]
    + [f'{run} ReDiMe Log2 HL 0uM' for run in ReDiMe_runs]
)

# rename() returns a new frame, so the merged frame displayed by the previous cell is not mutated in place.
peptides_combined_500uM = peptides_combined_500uM.rename(columns=cols_to_rename)[cols_to_keep]
peptides_combined_500uM

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Description,Site Number,Label,Color,Light Modified Peptide,Heavy Modified Peptide,...,2_7 ReDiMe Log2 HL 0uM,2_8 ReDiMe Log2 HL 0uM,3_1 ReDiMe Log2 HL 0uM,3_2 ReDiMe Log2 HL 0uM,3_3 ReDiMe Log2 HL 0uM,3_4 ReDiMe Log2 HL 0uM,3_5 ReDiMe Log2 HL 0uM,3_6 ReDiMe Log2 HL 0uM,3_7 ReDiMe Log2 HL 0uM,3_8 ReDiMe Log2 HL 0uM
0,sp|P43883|PLIN2_MOUSE,P43883,PLIN2_MOUSE,Plin2,Perilipin-2,265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,...,,,,-0.093658,,-0.424812,-0.990354,-0.445707,,
1,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,...,,,-0.676230,-0.151476,-0.254596,0.168084,-0.426328,,,-1.662186
2,sp|Q8VDD5|MYH9_MOUSE,Q8VDD5,MYH9_MOUSE,Myh9,Myosin-9,1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,...,,,,,,,,,,
3,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,...,,,-0.225702,-0.680299,0.044045,,,,,
4,sp|P20029|BIP_MOUSE,P20029,BIP_MOUSE,Hspa5,Endoplasmic reticulum chaperone BiP,154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,...,0.796865,0.630528,0.771290,0.815559,0.894010,0.713129,0.734807,0.884557,0.818994,0.915525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,sp|P16858|G3P_MOUSE,P16858,G3P_MOUSE,Gapdh,Glyceraldehyde-3-phosphate dehydrogenase,173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,...,-0.405689,-0.422377,-0.526561,-0.571704,-0.283968,-0.300626,-0.440403,-0.440049,-0.478019,-0.946597
205,sp|Q8QZT1|THIL_MOUSE,Q8QZT1,THIL_MOUSE,Acat1,"Acetyl-CoA acetyltransferase, mitochondrial",363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,...,0.229387,-0.015122,0.139581,0.216953,0.265377,0.201293,0.323118,0.200383,0.336694,0.131345
206,sp|Q5FW57|GLYAL_MOUSE,Q5FW57,GLYAL_MOUSE,Gm4952,Glycine N-acyltransferase-like protein,34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,...,0.191475,,0.344145,0.165800,,,0.357807,0.083290,-0.079255,
207,sp|P08113|ENPL_MOUSE,P08113,ENPL_MOUSE,Hsp90b1,Endoplasmin,425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,...,0.154515,0.263798,0.578385,0.521907,0.724153,0.465814,0.495633,0.517103,0.528513,0.373274


## Load ReDiMe Dataset - 500 uM

In [46]:
# Load ReDiMe dataset
# Reads the "500 uM ReDiMe" worksheet of the statistics workbook located in curr_dir_path.
data_loc = os.path.join(
    curr_dir_path,
    "10_01_24_LFD_HFD_statistics.xlsx",
)
peptides_ReDiMe_500uM = pd.read_excel(io=data_loc, sheet_name="500 uM ReDiMe")
peptides_ReDiMe_500uM

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Description,1_1 Median Log2 Ratios HL,1_2 Median Log2 Ratios HL,1_3 Median Log2 Ratios HL,1_4 Median Log2 Ratios HL,1_5 Median Log2 Ratios HL,...,3_1 Median Log2 Ratios HL,3_2 Median Log2 Ratios HL,3_3 Median Log2 Ratios HL,3_4 Median Log2 Ratios HL,3_5 Median Log2 Ratios HL,3_6 Median Log2 Ratios HL,3_7 Median Log2 Ratios HL,3_8 Median Log2 Ratios HL,average,pvalue
0,sp|P05784|K1C18_MOUSE,P05784,K1C18_MOUSE,Krt18,"Keratin, type I cytoskeletal 18",1.315108,1.172612,1.320139,1.266418,1.223918,...,1.217441,1.201347,1.320900,1.178578,1.379151,1.286345,1.201482,1.090169,1.270218,7.102002e-25
1,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",-1.315876,-1.279069,-1.210502,-1.335561,-1.433235,...,-0.962218,-0.972879,-1.050426,-1.055437,-1.062025,-1.056383,-1.011555,-1.020289,-1.165733,1.706380e-22
2,sp|P11679|K2C8_MOUSE,P11679,K2C8_MOUSE,Krt8,"Keratin, type II cytoskeletal 8",1.291752,1.404923,1.343208,1.252917,1.308743,...,1.222157,1.197785,1.447329,1.386423,1.290321,1.398027,1.302215,1.751774,1.371552,1.953495e-22
3,sp|P53657|KPYR_MOUSE,P53657,KPYR_MOUSE,Pklr,Pyruvate kinase PKLR,-2.698428,-2.421571,-2.312283,-2.459976,-2.504623,...,-1.976053,-1.878132,-1.660285,-1.760031,-1.846768,-1.857441,-1.949511,-2.076479,-2.145430,1.339003e-21
4,sp|P00342|LDHC_MOUSE,P00342,LDHC_MOUSE,Ldhc,L-lactate dehydrogenase C chain,0.764936,0.548807,0.751123,0.679905,0.770651,...,0.586859,0.439290,0.558739,0.602843,0.620296,0.575109,0.546725,0.608752,0.634538,7.712795e-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1104,sp|Q5EBG8|CA050_MOUSE,Q5EBG8,CA050_MOUSE,,Uncharacterized protein C1orf50 homolog,,,-0.478693,-2.309658,,...,,,-0.518800,-0.623301,,,,,-0.791505,4.956410e-02
1105,sp|Q9CPU2|NDUB2_MOUSE,Q9CPU2,NDUB2_MOUSE,Ndufb2,NADH dehydrogenase [ubiquinone] 1 beta subcomp...,,-0.221876,-0.302547,,,...,,0.197801,-0.346055,,-0.261360,-0.233931,-0.000593,,-0.212241,4.956685e-02
1106,sp|Q8K268|ABCF3_MOUSE,Q8K268,ABCF3_MOUSE,Abcf3,ATP-binding cassette sub-family F member 3,,,,,,...,,,,,,,,,0.315077,4.965795e-02
1107,sp|Q9R111|GUAD_MOUSE,Q9R111,GUAD_MOUSE,Gda,Guanine deaminase,,,,,,...,,,,,,0.151342,0.546226,,0.505696,4.968460e-02


## Merge Combined & ReDiMe Dataframes - 500 uM

In [47]:
# Attach the 500 uM ReDiMe columns to every peptide row, matching on "Protein ID".
# how="left" keeps all peptide rows; rows whose Protein ID is absent from the ReDiMe
# table receive NaN in the ReDiMe columns.
merged_ReDiMe_500uM = peptides_combined_500uM.merge(
    peptides_ReDiMe_500uM,
    how="left",
    on="Protein ID",
)

# A left merge never drops rows but duplicates a peptide row whenever its Protein ID occurs
# more than once in the right-hand frame. The later per-peptide calculations rely on one row
# per peptide, so fail loudly instead of silently inflating the table.
assert len(merged_ReDiMe_500uM) == len(peptides_combined_500uM), (
    "Merging '500 uM ReDiMe' changed the number of peptide rows "
    f"({len(peptides_combined_500uM)} -> {len(merged_ReDiMe_500uM)}); "
    "check for duplicated 'Protein ID' values in the ReDiMe sheet."
)

peptides_combined_500uM = merged_ReDiMe_500uM
peptides_combined_500uM

Unnamed: 0,Protein_x,Protein ID,Entry Name_x,Gene_x,Protein Description_x,Site Number,Label,Color,Light Modified Peptide,Heavy Modified Peptide,...,3_1 Median Log2 Ratios HL,3_2 Median Log2 Ratios HL,3_3 Median Log2 Ratios HL,3_4 Median Log2 Ratios HL,3_5 Median Log2 Ratios HL,3_6 Median Log2 Ratios HL,3_7 Median Log2 Ratios HL,3_8 Median Log2 Ratios HL,average,pvalue
0,sp|P43883|PLIN2_MOUSE,P43883,PLIN2_MOUSE,Plin2,Perilipin-2,265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,...,-0.667293,-0.205911,,-0.594694,-0.459627,-0.999736,,,-1.364856,6.463419e-03
1,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,...,-0.370481,-0.379932,-0.260647,-0.030374,-0.253105,,,,-0.293048,1.357806e-04
2,sp|Q8VDD5|MYH9_MOUSE,Q8VDD5,MYH9_MOUSE,Myh9,Myosin-9,1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,...,0.134443,,-0.132577,0.425950,0.048311,-0.593608,-0.255494,,0.333464,4.564357e-02
3,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,...,-0.233643,-0.575995,-0.511899,,,-0.340549,0.202194,,-0.194847,2.332816e-03
4,sp|P20029|BIP_MOUSE,P20029,BIP_MOUSE,Hspa5,Endoplasmic reticulum chaperone BiP,154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,...,0.859519,0.844707,0.943723,0.688677,1.163049,0.871037,0.916651,1.054482,0.933379,2.473924e-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,sp|P16858|G3P_MOUSE,P16858,G3P_MOUSE,Gapdh,Glyceraldehyde-3-phosphate dehydrogenase,173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,...,-0.217071,-0.367042,-0.310076,-0.208555,-0.456608,-0.479896,-0.272246,-0.504849,-0.268004,2.411873e-04
205,sp|Q8QZT1|THIL_MOUSE,Q8QZT1,THIL_MOUSE,Acat1,"Acetyl-CoA acetyltransferase, mitochondrial",363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,...,0.309189,0.152098,1.507540,0.379357,0.295864,0.216114,0.324873,0.078599,0.442323,2.252013e-06
206,sp|Q5FW57|GLYAL_MOUSE,Q5FW57,GLYAL_MOUSE,Gm4952,Glycine N-acyltransferase-like protein,34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,...,0.008672,0.509271,,,0.241554,0.308629,0.331026,,0.650951,6.238492e-04
207,sp|P08113|ENPL_MOUSE,P08113,ENPL_MOUSE,Hsp90b1,Endoplasmin,425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,...,0.711896,0.592680,0.589632,0.512428,0.598396,0.536723,0.518163,1.472806,0.686001,1.514316e-09


In [48]:
# Sanity check - every peptide of the initial combined dataframe should have a match in the ReDiMe dataframe.
# "average" is one of the columns brought in from the ReDiMe table by the left merge; the expression is
# True only when it has no missing values (a NaN presumably means the peptide's protein was not matched).
temp = peptides_combined_500uM.isnull().sum(axis=0)  # NaN count per column
temp.loc["average"] == 0

True

In [49]:
# Clean up the combined dataframe - drop & rename columns
#
# The column names are generated instead of typed out by hand; the resulting strings and
# their order are exactly those of the previous hand-written dict/list.

# Metadata columns that carry the "_x" suffix after the merge with the ReDiMe table
suffixed_cols = ['Protein', 'Entry Name', 'Gene', 'Protein Description']
# isoDTB run labels "<i>_<j>", i and j in 1..3, ordered 1_1, 1_2, ..., 3_3
isoDTB_runs = [f'{i}_{j}' for i in range(1, 4) for j in range(1, 4)]
# ReDiMe run labels "<i>_<k>", i in 1..3 and k in 1..8, ordered 1_1, ..., 1_8, 2_1, ..., 3_8
ReDiMe_runs = [f'{i}_{k}' for i in range(1, 4) for k in range(1, 9)]

cols_to_rename = {f'{col}_x': col for col in suffixed_cols}
# Tag the freshly merged ReDiMe ratio columns with their concentration (500 uM)
cols_to_rename.update(
    {f'{run} Median Log2 Ratios HL': f'{run} ReDiMe Log2 HL 500uM' for run in ReDiMe_runs}
)

# Columns retained, in output order; everything not listed (e.g. 'average', 'pvalue') is dropped.
cols_to_keep = (
    ['Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Site Number', 'Label', 'Color', 'Light Modified Peptide', 'Heavy Modified Peptide']
    + [f'{run} isoDTB Log2 HL 0uM' for run in isoDTB_runs]
    + [f'{run} isoDTB Log2 HL 500uM' for run in isoDTB_runs]
    + [f'{run} ReDiMe Log2 HL 0uM' for run in ReDiMe_runs]
    + [f'{run} ReDiMe Log2 HL 500uM' for run in ReDiMe_runs]
)

# rename() returns a new frame, so the merged frame displayed by the previous cell is not mutated.
# The explicit .copy() makes the result an independent frame: the following cells add new columns
# to it, which on a bare column subset triggers pandas' SettingWithCopyWarning (currently hidden
# by the global warnings filter).
peptides_combined_500uM = peptides_combined_500uM.rename(columns=cols_to_rename)[cols_to_keep].copy()
peptides_combined_500uM

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Description,Site Number,Label,Color,Light Modified Peptide,Heavy Modified Peptide,...,2_7 ReDiMe Log2 HL 500uM,2_8 ReDiMe Log2 HL 500uM,3_1 ReDiMe Log2 HL 500uM,3_2 ReDiMe Log2 HL 500uM,3_3 ReDiMe Log2 HL 500uM,3_4 ReDiMe Log2 HL 500uM,3_5 ReDiMe Log2 HL 500uM,3_6 ReDiMe Log2 HL 500uM,3_7 ReDiMe Log2 HL 500uM,3_8 ReDiMe Log2 HL 500uM
0,sp|P43883|PLIN2_MOUSE,P43883,PLIN2_MOUSE,Plin2,Perilipin-2,265,PLIN2_M265,red,KNM[649.3660]HSANQK,KNM[655.3735]HSANQK,...,,,-0.667293,-0.205911,,-0.594694,-0.459627,-0.999736,,
1,sp|Q60864|STIP1_MOUSE,Q60864,STIP1_MOUSE,Stip1,Stress-induced-phosphoprotein 1,535,STIP1_M535,red,LM[649.3660]DVGLIAIR,LM[655.3735]DVGLIAIR,...,,,-0.370481,-0.379932,-0.260647,-0.030374,-0.253105,,,
2,sp|Q8VDD5|MYH9_MOUSE,Q8VDD5,MYH9_MOUSE,Myh9,Myosin-9,1565,MYH9_M1565,red,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,...,0.091822,,0.134443,,-0.132577,0.425950,0.048311,-0.593608,-0.255494,
3,sp|P62259|1433E_MOUSE,P62259,1433E_MOUSE,Ywhae,14-3-3 protein epsilon,221,1433E_M221,red,DSTLIM[649.3660]QLLR,DSTLIM[655.3735]QLLR,...,0.017882,,-0.233643,-0.575995,-0.511899,,,-0.340549,0.202194,
4,sp|P20029|BIP_MOUSE,P20029,BIP_MOUSE,Hspa5,Endoplasmic reticulum chaperone BiP,154,BIP_M154,red,M[649.3660]KETAEAYLGK,M[655.3735]KETAEAYLGK,...,0.770581,2.266275,0.859519,0.844707,0.943723,0.688677,1.163049,0.871037,0.916651,1.054482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,sp|P16858|G3P_MOUSE,P16858,G3P_MOUSE,Gapdh,Glyceraldehyde-3-phosphate dehydrogenase,173,G3P_M173,blue,VIHDNFGIVEGLM[649.3660]TTVHAITATQK,VIHDNFGIVEGLM[655.3735]TTVHAITATQK,...,-0.194993,-0.316065,-0.217071,-0.367042,-0.310076,-0.208555,-0.456608,-0.479896,-0.272246,-0.504849
205,sp|Q8QZT1|THIL_MOUSE,Q8QZT1,THIL_MOUSE,Acat1,"Acetyl-CoA acetyltransferase, mitochondrial",363,THIL_M363,blue,M[649.3660]LEIDPQK,M[655.3735]LEIDPQK,...,0.305347,,0.309189,0.152098,1.507540,0.379357,0.295864,0.216114,0.324873,0.078599
206,sp|Q5FW57|GLYAL_MOUSE,Q5FW57,GLYAL_MOUSE,Gm4952,Glycine N-acyltransferase-like protein,34,GLYAL_M34,blue,VYGTVFHM[649.3660]NQGNPFK,VYGTVFHM[655.3735]NQGNPFK,...,0.397174,,0.008672,0.509271,,,0.241554,0.308629,0.331026,
207,sp|P08113|ENPL_MOUSE,P08113,ENPL_MOUSE,Hsp90b1,Endoplasmin,425,ENPL_M425,blue,RVFITDDFHDM[649.3660]MPK,RVFITDDFHDM[655.3735]MPK,...,0.498755,1.708266,0.711896,0.592680,0.589632,0.512428,0.598396,0.536723,0.518163,1.472806


## Analysis - Correcting Reactivity Calculation

In [50]:
# Average the ReDiMe expression ratios per bioreplicate, separately for each molarity.
# Bioreplicate <rep> owns the eight columns "<rep>_1" .. "<rep>_8"; their row-wise mean
# (DataFrame.mean skips NaN by default) is stored as "Rep <rep> Avg Expression ReDiMe <molarity>".

for molarity in ('0uM', '500uM'):
    for rep in (1, 2, 3):
        cols_to_avg = [f'{rep}_{k} ReDiMe Log2 HL {molarity}' for k in range(1, 9)]
        rep_mean = peptides_combined_500uM[cols_to_avg].mean(axis=1)
        peptides_combined_500uM[f'Rep {rep} Avg Expression ReDiMe {molarity}'] = rep_mean


In [51]:
# Adjusted isoDTB values: from every isoDTB column "<rep>_<run>" subtract the average ReDiMe
# expression of the same bioreplicate <rep> at the same molarity (both are Log2 HL columns).

for molarity in ('0uM', '500uM'):
    for rep in range(1, 4):
        # Same expression average is reused for the three isoDTB runs of this bioreplicate
        rep_expression = peptides_combined_500uM[f'Rep {rep} Avg Expression ReDiMe {molarity}']
        for run in range(1, 4):
            raw_ratio = peptides_combined_500uM[f'{rep}_{run} isoDTB Log2 HL {molarity}']
            peptides_combined_500uM[f'{rep}_{run} isoDTB Adjusted Log2 HL {molarity}'] = raw_ratio - rep_expression

In [52]:
# Disabled debug helper: temporarily lifts the column display limit to show the full
# peptides_combined_500uM frame, then restores the default option.
#pd.set_option("display.max_columns", None)
#display(peptides_combined_500uM)
#pd.reset_option("display.max_columns")

In [53]:
# Checkpoint: write the merged and adjusted table to a CSV file in curr_dir_path.
# The row index is written too (to_csv default), as the first, header-less column.
peptides_combined_500uM.to_csv(
    path_or_buf=os.path.join(curr_dir_path, "temp_peptides_combined_500uM.csv")
)

In [54]:
# Load dataset
# Re-read the checkpoint CSV. Its first column has no header, so pandas names it "Unnamed: 0";
# that column is turned back into the row index and the index name is cleared.
path = os.path.join(curr_dir_path, "temp_peptides_combined_500uM.csv")
peptides_combined_500uM = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides_combined_500uM;

## END