## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
analysis_threshold = 20 # number of amino acids either side to analyze

modifications = ["649.3660", "655.3735"] # which modifications we are looking for, as regex strings
heavy_modification = "655.3735" 
light_modification = "649.3660"

## Load Dataset - MsrAKD

In [3]:
# set correct pathing
curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("global_data Directory: " + global_data_path)

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/MsrKD
global_data Directory: /Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/global_data


In [None]:
data_loc = os.path.join(curr_dir_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrAKD_quant")
peptides

In [None]:
# Canonicalize data - none to do here
peptides;

In [None]:
label_col_data = ["blue"] * 157 + ["green"] * 381 + ["white"] * 9 + ["red"] * 12 + ["gray"] * 104
label_col = pd.Series(label_col_data)
peptides["Color"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [None]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [None]:
# test - get a single amino acid sequence
#first_protein_ID = peptides["Protein ID"].iloc[0]
#test_sequence = get_full_protein_seq(first_protein_ID)
#print(test_sequence[575:587])
#print(peptides["Peptide Sequence"].iloc[0])

In [None]:
unique_uniprotIDs = peptides["Protein ID"].unique()
unique_uniprotIDs, unique_uniprotIDs.size;

In [None]:
# load known completed sequences
path = os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv")
unique_IDs_to_sequence_df = pd.read_csv(path)
unique_IDs_to_sequence_df.set_index("Unnamed: 0", inplace=True)
unique_IDs_to_sequence_df.index.name = None
unique_IDs_to_sequence_df;

In [None]:
unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, unique_IDs_to_sequence_df["Protein ID"].values)
np.unique(unknown_uniprotIDs_idxs, return_counts=True)

In [None]:
unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
unknown_uniprotIDs, len(unknown_uniprotIDs);

In [None]:
unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
unknown_sequences_df;

In [None]:
# get whole amino acid sequences for unknown proteins
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_full_protein_seq)
#unknown_sequences_df

In [None]:
#unique_IDs_to_sequence_df_updated = pd.concat([unique_IDs_to_sequence_df, unknown_sequences_df])
#unique_IDs_to_sequence_df_updated

In [None]:
#unique_IDs_to_sequence_df_updated.to_csv(os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv"))

In [None]:
# load (updated) known completed sequences
path = os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv")
unique_IDs_to_sequence_df = pd.read_csv(path)
unique_IDs_to_sequence_df.set_index("Unnamed: 0", inplace=True)
unique_IDs_to_sequence_df.index.name = None
unique_IDs_to_sequence_df

In [None]:
peptides_completed_sequence = peptides.merge(unique_IDs_to_sequence_df, how="left", on="Protein ID")
peptides_completed_sequence.index = peptides.index
peptides_completed_sequence

In [None]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    split_mod = modifications[0].split(".")
    whole = split_mod[0]
    mantissa = split_mod[1]
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        split_mod = modifications[i].split(".")
        whole = split_mod[0]
        mantissa = split_mod[1]
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

In [None]:
# extract left prefix of modified methionine (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_completed_sequence["Peptide Sequence"] = peptides_completed_sequence["Light Modified Peptide"].map(filtering)
peptides_completed_sequence

In [None]:
peptides_completed_sequence["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Peptide Sequence"])])
peptides_completed_sequence

In [None]:
peptides_completed_sequence["Sequence Length"] = peptides_completed_sequence["Peptide Sequence"].str.len()
peptides_completed_sequence

In [None]:
# sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Sequence Location"], peptides_completed_sequence["Sequence Length"])]
(temp == peptides_completed_sequence["Peptide Sequence"]).value_counts()

In [None]:
# create regex pattern to identify desired modifications
left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

In [None]:
# extract left prefix of modified methionine (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_completed_sequence["Left Prefix"] = peptides_completed_sequence["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_completed_sequence["Left Prefix"] = peptides_completed_sequence["Left Prefix"].map(filtering)
peptides_completed_sequence["Left Prefix Length"] = peptides_completed_sequence["Left Prefix"].str.len()

peptides_completed_sequence

In [None]:
peptides_completed_sequence["Methionine Location"] = peptides_completed_sequence["Sequence Location"] + peptides_completed_sequence["Left Prefix Length"]
peptides_completed_sequence

In [None]:
# Compute left/right analysis sequences based on threshold
peptides_completed_sequence[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Methionine Location"])]
peptides_completed_sequence[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Methionine Location"])]
peptides_completed_sequence

In [None]:
pd.set_option('display.max_columns', None)
display(peptides_completed_sequence[~(peptides_completed_sequence["Site Number"] == peptides_completed_sequence["Methionine Location"] + 1)])
pd.reset_option('display.max_columns')

In [None]:
# remove invalid proteins (according to alphafold) - TODO: attempt to incorporate these as well
# 7 invalid peptides as a result -> 2 blue, 4 green, 1 gray
invalid_IDs = ['Q09666', 'Q14204', 'Q9Y520', 'Q14789']
peptides_completed_sequence = peptides_completed_sequence[~peptides_completed_sequence["Protein ID"].isin(invalid_IDs)]
peptides_completed_sequence

# Download Alphafold Data - MsrAKD

In [None]:
# path for alphafold protein data
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

In [None]:
# set uniprot IDs to use
uniprotIDs = peptides_completed_sequence["Protein ID"].unique()
uniprotIDs, len(uniprotIDs)

In [None]:
# download cif data for proteins
# SLOW THE FIRST TIME
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

In [None]:
# download pae data for proteins
# SLOW THE FIRST TIME
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

## Construct Alphafold Dataframe (Calculate Accessibilities) - MsrAKD

In [None]:
# format alphafold data into dataframe
alphafold_annotation_MsrAKD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=uniprotIDs)
alphafold_annotation_MsrAKD

In [None]:
# calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_annotation_MsrAKD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD;

In [None]:
# calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrAKD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

In [None]:
alphafold_accessibility_MsrAKD = alphafold_accessibility_MsrAKD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrAKD

In [None]:
alphafold_accessibility_MsrAKD.columns

In [None]:
alphafold_accessibility_MsrAKD_smooth = get_smooth_score(
    alphafold_accessibility_MsrAKD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_MsrAKD_smooth;

In [None]:
alphafold_accessibility_MsrAKD_smooth['IDR'] = np.where(
    alphafold_accessibility_MsrAKD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_MsrAKD_smooth

In [None]:
alphafold_accessibility_MsrAKD_smooth.columns

# Merge Dataframes into Full Dataset (Includes Alphafold) - MsrAKD

In [None]:
alphafold_accessibility_MsrAKD_smooth["position"] = alphafold_accessibility_MsrAKD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_with_alphafold = peptides_completed_sequence.merge(
    alphafold_accessibility_MsrAKD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_with_alphafold

In [None]:
#peptides_with_alphafold.to_csv(os.path.join(curr_dir_path, "MsrAKD_with_alphafold.csv"))

In [None]:
path = os.path.join(curr_dir_path, "MsrAKD_with_alphafold.csv")
peptides_with_alphafold = pd.read_csv(path)
peptides_with_alphafold.set_index("Unnamed: 0", inplace=True)
peptides_with_alphafold.index.name = None
peptides_with_alphafold

## Load Dataset - MsrB2KD

In [4]:
data_loc = os.path.join(curr_dir_path, "05_10_24_293T_MsrKD_data.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="293T_MsrB2KD_quant")
peptides

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,Q16836,HCDH_HUMAN,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,P23193,TCEA1_HUMAN,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,Q16181,SEPT7_HUMAN,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,P62258,1433E_HUMAN,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,Q00341,VIGLN_HUMAN,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,P00374,DYR_HUMAN,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,P35579,MYH9_HUMAN,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,P14868,SYDC_HUMAN,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478


In [5]:
# Canonicalize data - none to do here
peptides;

In [6]:
label_col_data = ["blue"] * 10 + ["white"] * 30 + ["green"] * 381 + ["red"] * 213 + ["gray"] * 120
label_col = pd.Series(label_col_data)
peptides["color"] = label_col

#pd.set_option("display.max_rows", None)
display(peptides)
#pd.reset_option("display.max_rows")

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,HCDH_HUMAN,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178,blue
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,TCEA1_HUMAN,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48,blue
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,SEPT7_HUMAN,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355,blue
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,MYH9_HUMAN,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,1433E_HUMAN,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160,blue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,VIGLN_HUMAN,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128,gray
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,DYR_HUMAN,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126,gray
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,MYH9_HUMAN,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489,gray
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,SYDC_HUMAN,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478,gray


In [7]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [8]:
# test - get a single amino acid sequence
#first_protein_ID = peptides["Protein ID"].iloc[0]
#test_sequence = get_full_protein_seq(first_protein_ID)
#print(test_sequence[575:587])
#print(peptides["Peptide Sequence"].iloc[0])

In [9]:
unique_uniprotIDs = peptides["Protein ID"].unique()
unique_uniprotIDs, unique_uniprotIDs.size;

In [10]:
# load known completed sequences
path = os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv")
unique_IDs_to_sequence_df = pd.read_csv(path)
unique_IDs_to_sequence_df.set_index("Unnamed: 0", inplace=True)
unique_IDs_to_sequence_df.index.name = None
unique_IDs_to_sequence_df;

In [11]:
unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, unique_IDs_to_sequence_df["Protein ID"].values)
np.unique(unknown_uniprotIDs_idxs, return_counts=True)

(array([False]), array([464]))

In [12]:
unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
unknown_uniprotIDs, len(unknown_uniprotIDs);

In [13]:
unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
unknown_sequences_df;

In [14]:
# get whole amino acid sequences for unknown proteins
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

tqdm.pandas()
unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_full_protein_seq)
unknown_sequences_df

0it [00:00, ?it/s]


Unnamed: 0,Protein ID,Complete Sequence


In [15]:
unique_IDs_to_sequence_df_updated = pd.concat([unique_IDs_to_sequence_df, unknown_sequences_df])
unique_IDs_to_sequence_df_updated

Unnamed: 0,Protein ID,Complete Sequence
0,Q8C196,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,Q9WUR2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
2,Q9Z1P6,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
3,Q8K370,MLVRRLFQPSTLHWAWRTTALNHPLGRHQGGLRWTHSGGRSYRAVI...
4,P48410,MPVLSTPRPSRVTTLKRTAVVLALTAYGVHKIYPLVRQCLTPARGP...
...,...,...
118,P18858,MQRSIMSFFHPKKEGKAKKPEKEASNSSRETEPPPKAALKEWNGVV...
119,P16989,MSEAGEATTTTTTTLPQAPTEAAAAAPQDPAPKSPVGSGAPQAAAP...
120,P49959,MSTADALDDENTFKILVATDIHLGFMEKDAVRGNDTFVTLDEILRL...
121,Q00341,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...


In [16]:
#unique_IDs_to_sequence_df_updated.to_csv(os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv"))

In [17]:
# load (updated) known completed sequences
path = os.path.join(global_data_path, "uniprotID_to_complete_sequence_mapping.csv")
unique_IDs_to_sequence_df = pd.read_csv(path)
unique_IDs_to_sequence_df.set_index("Unnamed: 0", inplace=True)
unique_IDs_to_sequence_df.index.name = None
unique_IDs_to_sequence_df

Unnamed: 0,Protein ID,Complete Sequence
0,Q8C196,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,Q9WUR2,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...
2,Q9Z1P6,MASATRVIQKLRNWASGQDLQAKLQLRYQEIAKRTQPPPKLPVGPS...
3,Q8K370,MLVRRLFQPSTLHWAWRTTALNHPLGRHQGGLRWTHSGGRSYRAVI...
4,P48410,MPVLSTPRPSRVTTLKRTAVVLALTAYGVHKIYPLVRQCLTPARGP...
...,...,...
118,P18858,MQRSIMSFFHPKKEGKAKKPEKEASNSSRETEPPPKAALKEWNGVV...
119,P16989,MSEAGEATTTTTTTLPQAPTEAAAAAPQDPAPKSPVGSGAPQAAAP...
120,P49959,MSTADALDDENTFKILVATDIHLGFMEKDAVRGNDTFVTLDEILRL...
121,Q00341,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...


In [18]:
peptides_completed_sequence = peptides.merge(unique_IDs_to_sequence_df, how="left", on="Protein ID")
peptides_completed_sequence.index = peptides.index
peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color,Complete Sequence
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,HADH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,TCEA1,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,SEPTIN7,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,MYH9,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,YWHAE,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,HDLBP,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,DHFR,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,MYH9,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,DARS1,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...


In [19]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    split_mod = modifications[0].split(".")
    whole = split_mod[0]
    mantissa = split_mod[1]
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        split_mod = modifications[i].split(".")
        whole = split_mod[0]
        mantissa = split_mod[1]
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [20]:
# extract left prefix of modified methionine (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_completed_sequence["Peptide Sequence"] = peptides_completed_sequence["Light Modified Peptide"].map(filtering)
peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color,Complete Sequence,Peptide Sequence
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,Transcription elongation factor A protein 1,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,Septin-7,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,Myosin-9,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,14-3-3 protein epsilon,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,Vigilin,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,Dihydrofolate reductase,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,Myosin-9,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,"Aspartate--tRNA ligase, cytoplasmic",6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR


In [21]:
peptides_completed_sequence["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Peptide Sequence"])])
peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color,Complete Sequence,Peptide Sequence,Sequence Location
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,7.317853e-14,13.135616,-4.116590,178,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,3.213832e-04,3.492977,-3.852951,48,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,2.521873e-11,10.598277,-2.655733,355,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.469697e-03,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,3.952954e-03,2.403078,-2.045838,160,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,1.332075e-01,0.875471,2.654099,128,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,5.244905e-02,1.280262,2.951996,126,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,6.250049e-02,1.204117,2.976128,1489,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,6.321136e-02,1.199205,3.046635,478,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475


In [22]:
peptides_completed_sequence["Sequence Length"] = peptides_completed_sequence["Peptide Sequence"].str.len()
peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,neglogp,Log2HL avg,Site Number,Site,Label,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,13.135616,-4.116590,178,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,3.492977,-3.852951,48,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,10.598277,-2.655733,355,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.607356,-2.118498,1565,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,2.403078,-2.045838,160,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,0.875471,2.654099,128,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,1.280262,2.951996,126,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,1.204117,2.976128,1489,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,1.199205,3.046635,478,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12


In [23]:
# sanity check - ensure sequence indexing is correct
temp = [A[B:B+C] for A, B, C in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Sequence Location"], peptides_completed_sequence["Sequence Length"])]
(temp == peptides_completed_sequence["Peptide Sequence"]).value_counts()

Peptide Sequence
True    754
Name: count, dtype: int64

In [24]:
# create regex pattern to identify desired modifications
left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [25]:
# extract left prefix of modified methionine (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_completed_sequence["Left Prefix"] = peptides_completed_sequence["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_completed_sequence["Left Prefix"] = peptides_completed_sequence["Left Prefix"].map(filtering)
peptides_completed_sequence["Left Prefix Length"] = peptides_completed_sequence["Left Prefix"].str.len()

peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Site Number,Site,Label,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,178,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,48,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,355,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,1565,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,160,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,128,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,126,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,1489,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,478,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2


In [26]:
peptides_completed_sequence["Methionine Location"] = peptides_completed_sequence["Sequence Location"] + peptides_completed_sequence["Left Prefix Length"]
peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,Site,Label,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,M178,HCDH_M178,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,M48,TCEA1_M48,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,M355,SEPT7_M355,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,M1565,MYH9_M1565,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,M160,1433E_M160,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,M128,VIGLN_M128,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,M126,DYR_M126,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,M1489,MYH9_M1489,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,M478,SYDC_M478,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477


In [27]:
# Compute left/right analysis sequences based on threshold
peptides_completed_sequence[f"Left {analysis_threshold}"] = [A[B-analysis_threshold:B]  if (B - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Methionine Location"])]
peptides_completed_sequence[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Methionine Location"])]
peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177,NATTRQDRFAGLHFFNPVPV,KLVEVIKTPMTSQKTFESLV
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47,LKELKNIPMTLELLQSTRIG,SVNAIRKQSTDEEVTSLAKS
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354,PLAQMEEERREHVAKMKKME,EMEQVFEMKVKEKVQKLKDS
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159,DRKEAAENSLVAYKAASDIA,TELPPTHPIRLGLALNFSVF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125,LANKVDMVWIVGGSSVYKEA,NHPGHLKLFVTRIMQDFESD
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488,AEAREKETKALSLARALEEA,EQKAELERLNKQFRTEMEDL
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477,SFRFGAPPHAGGGIGLERVT,LFLGLHNVRQTSMFPRDPKR


In [28]:
pd.set_option('display.max_columns', None)
display(peptides_completed_sequence[~(peptides_completed_sequence["Site Number"] == peptides_completed_sequence["Methionine Location"] + 1)])
pd.reset_option('display.max_columns')

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,MsrB2_KD_6 Log2 Ratio HL,MsrB2_KD_7 Log2 Ratio HL,MsrB2_KD_8 Log2 Ratio HL,MsrB2_KD_9 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pvalue,neglogp,Log2HL avg,Site Number,Site,Label,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
108,ISM[649.3660]PDIDLNLKGPK,ISM[655.3735]PDIDLNLKGPK,,,2.317727,1.315954,2.486824,2.297093,,2.372561,1.472162,2.508575,,1.808202,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,5.110537e-06,5.291533,2.072387,0,M0,AHNK_M0,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDIDLNLKGPK,2706,14,IS,2,2708,KLKGPKFKMPEMNIKAPKIS,PDIDLNLKGPKVKGDVDVSL
253,ISM[649.3660]PDVDLHLK,ISM[655.3735]PDVDLHLK,2.023708,,1.388773,,,,2.04048,1.42799,,,1.598495,,sp|Q09666|AHNK_HUMAN,Q09666,AHNK_HUMAN,AHNAK,Neuroblast differentiation-associated protein ...,0.0002795943,3.553472,1.695889,0,M0,AHNK_M0,green,MEKEETTRELLLPNWQGSGSHGLTIAQRDDGVFVQEVTQNSPAART...,ISMPDVDLHLK,817,11,IS,2,819,KLKGPKFKMPEMNIKVPKIS,PDVDLHLKGPNVKGEYDVTM
274,RGM[649.3660]DDDRGPR,RGM[655.3735]DDDRGPR,1.689388,1.636527,1.738867,1.672636,1.615026,1.706146,1.594494,1.616323,1.725671,1.822155,1.577698,1.45233,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,EIF3A,Eukaryotic translation initiation factor 3 sub...,3.037248e-15,14.51752,1.653938,0,M0,EIF3A_M0,green,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDRGPR,959,10,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
387,M[649.3660]GANSLER,M[655.3735]GANSLER,1.356775,1.283564,1.510369,1.369582,1.35557,1.498688,1.530735,1.459537,1.422615,1.747021,1.318855,1.088088,sp|P52272|HNRPM_HUMAN,P52272,HNRPM_HUMAN,HNRNPM,Heterogeneous nuclear ribonucleoprotein M,5.699878e-12,11.244134,1.411783,0,M0,HNRPM_M0,green,MAAGVEAAAEVAATEIKMEEESGAPGVPSGNGAPGPKGEGERPAQN...,MGANSLER,570,8,,0,570,MATGLERMGANNLERMGLER,GANSLERMGLERMGANSLER
422,RGM[649.3660]DDDR,RGM[655.3735]DDDR,,1.757216,,,1.155705,,,1.76007,1.273511,,0.885131,,sp|Q14152|EIF3A_HUMAN,Q14152,EIF3A_HUMAN,EIF3A,Eukaryotic translation initiation factor 3 sub...,0.00136283,2.865558,1.366327,0,M0,EIF3A_M0,red,MPAYFQRPENALKRANEFLEVGKKQPALDVLYDVMKSKKHRTWQKI...,RGMDDDR,959,7,RG,2,961,DDEDREPSLRPDDDRVPRRG,DDDRGPRRGPEEDRFSRRGA
547,SM[15.9949]M[649.3660]SAYER,SM[15.9949]M[655.3735]SAYER,,1.01538,1.08481,1.091951,1.049318,1.202029,1.020585,1.133366,1.097958,1.25279,1.066407,0.936811,sp|P18583|SON_HUMAN,P18583,SON_HUMAN,SON,Protein SON,1.751139e-12,11.756679,1.086491,0,M0,SON_M0,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSAYER,1031,8,SM,2,1033,ERSMMSYERSMMSPMAERSM,SAYERSMMSAYERSMMSPMA
609,SM[15.9949]M[649.3660]SSYSAADR,SM[15.9949]M[655.3735]SSYSAADR,0.795118,,0.866351,0.688615,0.901931,0.961633,,0.853249,0.981662,1.057892,,,sp|P18583|SON_HUMAN,P18583,SON_HUMAN,SON,Protein SON,1.091633e-07,6.961923,0.888306,0,M0,SON_M0,red,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSSYSAADR,1090,11,SM,2,1092,SMMSPMADRSMMSMGADRSM,SSYSAADRSMMSSYSAADRS
628,GM[649.3660]QGPPGPR,GM[655.3735]QGPPGPR,1.281578,1.361743,,,,1.625771,,,,,1.429844,,sp|Q9C0J8|WDR33_HUMAN,Q9C0J8,WDR33_HUMAN,WDR33,pre-mRNA 3' end processing protein WDR33,0.0003004325,3.522253,1.424734,784,M784,WDR33_M784,red,MATEIGSPPRFFHMPRFQHQAPRQLFYKRPDFAQQQAMQQLTFDGK...,GMQGPPGPR,738,9,G,1,739,QGPPGPQGHLGPQGPPGTQG,QGPPGPRGMQGPPHPHGIQG
652,SM[649.3660]M[15.9949]SPMAER,SM[655.3735]M[15.9949]SPMAER,,,,-0.068689,0.573475,,,-0.19599,,-0.160331,-0.696776,,sp|P18583|SON_HUMAN,P18583,SON_HUMAN,SON,Protein SON,0.6174987,0.209364,-0.109662,0,M0,SON_M0,gray,MATNIEQIFRSFVVSKFREIQQELSSGRNEGQLNGETNTPIEGNQA...,SMMSPMAER,1022,9,S,1,1023,AAERSMMSSYERSMMSYERS,MSPMAERSMMSAYERSMMSA
684,n[42.0106]MDRM[649.3660]TEDALR,n[42.0106]MDRM[655.3735]TEDALR,0.398543,,,,0.678404,,,,,,,,sp|Q8WXI9|P66B_HUMAN,Q8WXI9,P66B_HUMAN,GATAD2B,Transcriptional repressor p66-beta,0.1618548,0.790874,0.538474,1,M1,P66B_M1,gray,MDRMTEDALRLNLLKRSLDPADERDDVLAKRLKMEGHEAMERLKML...,MDRMTEDALR,0,10,MDR,3,3,MD,TEDALRLNLLKRSLDPADER


In [29]:
# remove invalid proteins (according to alphafold) - TODO: attempt to incorporate these as well
# 12 invalid peptides as a result -> 5 green, 4 red, 3 gray
invalid_IDs = ['Q14204', 'Q09666', 'Q14789', 'Q9Y520', 'P46013', 'Q9NU22']
peptides_completed_sequence = peptides_completed_sequence[~peptides_completed_sequence["Protein ID"].isin(invalid_IDs)]
peptides_completed_sequence

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,blue,MAFVTRQFMRSVSSSSTASASAKKIIVKHVTVIGGGLMGAGIAQVA...,FAGLHFFNPVPVMK,165,14,FAGLHFFNPVPV,12,177,NATTRQDRFAGLHFFNPVPV,KLVEVIKTPMTSQKTFESLV
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,blue,MEDEVVRFAKKMDKMVQKKNAAGALDLLKELKNIPMTLELLQSTRI...,IGMSVNAIR,45,9,IG,2,47,LKELKNIPMTLELLQSTRIG,SVNAIRKQSTDEEVTSLAKS
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,blue,MSVSARSAAAEERSVNSSTMVAQQKNLEGYVGFANLPNQVYRKSVK...,KMEMEMEQVFEMK,351,13,KME,3,354,PLAQMEEERREHVAKMKKME,EMEQVFEMKVKEKVQKLKDS
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,blue,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,LRLEVNLQAMK,1555,11,LRLEVNLQA,9,1564,EDELQATEDAKLRLEVNLQA,KAQFERDLQGRDEQSEEKKK
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,blue,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,AASDIAMTELPPTHPIR,153,17,AASDIA,6,159,DRKEAAENSLVAYKAASDIA,TELPPTHPIRLGLALNFSVF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,gray,MSSVAVLTQESFAEHRSGLVPQQIKVATLNSEEESDPPTYKDAFPP...,DQGLSIMVSGK,121,11,DQGLSI,6,127,MQRTGAHLELSLAKDQGLSI,VSGKLDAVMKARKDIVARLQ
750,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,gray,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,EAMNHPGHLK,123,10,EA,2,125,LANKVDMVWIVGGSSVYKEA,NHPGHLKLFVTRIMQDFESD
751,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,gray,MAQQAADKYLYVDKNFINNPLAQADWAAKKLVWVPSDKSGFEPASL...,ALEEAMEQK,1483,9,ALEEA,5,1488,AEAREKETKALSLARALEEA,EQKAELERLNKQFRTEMEDL
752,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,gray,MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLV...,VTMLFLGLHNVR,475,12,VT,2,477,SFRFGAPPHAGGGIGLERVT,LFLGLHNVRQTSMFPRDPKR


# Download Alphafold Data - MsrB2KD

In [30]:
# path for alphafold protein data
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [31]:
# set uniprot IDs to use
uniprotIDs = peptides_completed_sequence["Protein ID"].unique()
uniprotIDs, len(uniprotIDs)

(array(['Q16836', 'P23193', 'Q16181', 'P35579', 'P62258', 'P46109',
        'P55072', 'Q9Y265', 'P25205', 'P61024', 'P41227', 'P18583',
        'Q9UN37', 'O14744', 'Q86UP2', 'O14874', 'P36543', 'Q9Y2W2',
        'P14174', 'Q9Y617', 'Q8WVK2', 'P31948', 'Q9Y3U8', 'Q99729',
        'Q9UKD2', 'Q9Y3I0', 'P27144', 'Q9UHX1', 'P22307', 'Q01518',
        'Q9BWF3', 'Q9Y580', 'O43707', 'P22061', 'P52272', 'Q9HD42',
        'P50454', 'O95831', 'P18859', 'P05067', 'P60709', 'P68032',
        'P35611', 'P55196', 'Q4VCS5', 'P08243', 'P05023', 'P24539',
        'Q9NVI7', 'Q8WWM7', 'Q07812', 'Q9NYF8', 'Q9UHR4', 'P11021',
        'Q9BRK5', 'O43852', 'Q14444', 'Q96CT7', 'Q16543', 'P06493',
        'P61604', 'P10809', 'Q9UQN3', 'Q9H444', 'Q9Y3Y2', 'Q14011',
        'E9PRG8', 'Q07065', 'Q15003', 'P09669', 'P33240', 'Q9H0L4',
        'Q92841', 'Q9NR30', 'Q9BUQ8', 'P00367', 'Q08211', 'Q99615',
        'O75937', 'P55265', 'P33316', 'P55084', 'P42126', 'Q6P2E9',
        'P29692', 'Q14152', 'O75821', 'O15372', 

In [32]:
# download cif data for proteins
# SLOW THE FIRST TIME
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 458/458 [00:00<00:00, 160416.80it/s]

2024-05-21 15:37:21> Valid proteins: 0
2024-05-21 15:37:21> Invalid proteins: 0
2024-05-21 15:37:21> Existing proteins: 458





In [33]:
# download pae data for proteins
# SLOW THE FIRST TIME
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 458/458 [00:00<00:00, 152774.87it/s]

2024-05-21 15:37:21> Valid proteins: 0
2024-05-21 15:37:21> Invalid proteins: 0
2024-05-21 15:37:21> Existing proteins: 458





## Construct Alphafold Dataframe (Calculate Accessibilities) - MsrB2KD

In [34]:
# format alphafold data into dataframe
alphafold_annotation_MsrB2KD = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=uniprotIDs)
alphafold_annotation_MsrB2KD

100%|██████████| 1696/1696 [00:56<00:00, 29.84it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,-6.924,-8.038,-7.255,unstructured,unstructured,0,0,0,0,1
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,-5.364,-3.908,-5.523,unstructured,unstructured,0,0,0,0,1
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,-7.835,-8.662,-7.119,unstructured,unstructured,0,0,0,0,1
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,-5.865,-6.199,-6.761,unstructured,unstructured,0,0,0,0,1
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,-5.055,-3.843,-4.996,HELX_LH_PP_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275134,Q9Y617,458,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,15.087,16.015,14.291,HELX_RH_AL_P,HELX,0,1,0,0,0
275135,Q9Y617,458,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,17.344,18.096,16.529,HELX_RH_AL_P,HELX,0,1,0,0,0
275136,Q9Y617,458,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,14.388,13.842,15.221,HELX_RH_AL_P,HELX,0,1,0,0,0
275137,Q9Y617,458,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,12.177,11.359,13.111,HELX_RH_AL_P,HELX,0,1,0,0,0


In [35]:
# calculate full sphere exposure -> radius = 2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 458/458 [00:08<00:00, 52.76it/s] 


In [36]:
alphafold_accessibility_MsrB2KD = alphafold_annotation_MsrB2KD.merge(
    exposure_sphere_rad2, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [37]:
# calculate full sphere exposure -> radius = 3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 458/458 [00:06<00:00, 67.36it/s] 


In [38]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [39]:
# calculate full sphere exposure -> radius = 4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 458/458 [00:06<00:00, 68.76it/s] 


In [40]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [41]:
# calculate full sphere exposure -> radius = 4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 458/458 [00:06<00:00, 66.00it/s] 


In [42]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [43]:
# calculate full sphere exposure -> radius = 5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 458/458 [00:06<00:00, 65.76it/s] 


In [44]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [45]:
# calculate full sphere exposure -> radius = 5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 458/458 [00:06<00:00, 67.20it/s] 


In [46]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [47]:
# calculate full sphere exposure -> radius = 6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 458/458 [00:06<00:00, 65.81it/s] 


In [48]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [49]:
# calculate full sphere exposure -> radius = 6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 458/458 [00:06<00:00, 66.19it/s] 


In [50]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [51]:
# calculate full sphere exposure -> radius = 7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 458/458 [00:06<00:00, 65.90it/s] 


In [52]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [53]:
# calculate full sphere exposure -> radius = 7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 458/458 [00:06<00:00, 66.72it/s] 


In [54]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [55]:
# calculate full sphere exposure -> radius = 8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 458/458 [00:07<00:00, 65.01it/s] 


In [56]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [57]:
# calculate full sphere exposure -> radius = 12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 458/458 [00:07<00:00, 59.63it/s] 


In [58]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [59]:
# calculate full sphere exposure -> radius = 18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 458/458 [00:09<00:00, 50.73it/s]


In [60]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [61]:
# calculate full sphere exposure -> radius = 24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 458/458 [00:10<00:00, 41.90it/s]


In [62]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD;

In [63]:
# calculate part sphere exposure -> angle = 70, radius = 12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_MsrB2KD, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 458/458 [00:07<00:00, 60.48it/s] 


In [64]:
alphafold_accessibility_MsrB2KD = alphafold_accessibility_MsrB2KD.merge(
    exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_MsrB2KD

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,1,1,1,1,1,1,2,3,4,0
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,0,2,2,2,2,2,3,4,5,0
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,1,2,2,2,2,2,4,5,7,0
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,1,2,2,2,2,2,4,8,12,0
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,1,2,2,2,2,2,4,9,16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275134,Q9Y617,458,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2,2,3,6,7,7,12,35,70,4
275135,Q9Y617,458,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2,2,3,4,5,5,9,33,62,2
275136,Q9Y617,458,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2,2,3,4,4,5,10,29,54,4
275137,Q9Y617,458,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1,2,2,2,2,2,5,13,30,2


In [65]:
alphafold_accessibility_MsrB2KD.columns

Index(['protein_id', 'protein_number', 'AA', 'position', 'quality',
       'x_coord_c', 'x_coord_ca', 'x_coord_cb', 'x_coord_n', 'y_coord_c',
       'y_coord_ca', 'y_coord_cb', 'y_coord_n', 'z_coord_c', 'z_coord_ca',
       'z_coord_cb', 'z_coord_n', 'secondary_structure', 'structure_group',
       'BEND', 'HELX', 'STRN', 'TURN', 'unstructured', 'nAA_2_180_pae',
       'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae',
       'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae',
       'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae',
       'nAA_24_180_pae', 'nAA_12_70_pae'],
      dtype='object')

In [66]:
alphafold_accessibility_MsrB2KD_smooth = get_smooth_score(
    alphafold_accessibility_MsrB2KD, 
    np.array(['nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae','nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae']), 
    [10])
alphafold_accessibility_MsrB2KD_smooth;

100%|██████████| 458/458 [00:01<00:00, 369.51it/s]


In [67]:
alphafold_accessibility_MsrB2KD_smooth['IDR'] = np.where(
    alphafold_accessibility_MsrB2KD_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_MsrB2KD_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A8MWD9,1,M,1,47.25,6.065,4.721,3.923,4.980,25.474,...,1.909091,2.090909,2.181818,2.272727,2.727273,5.909091,15.090909,26.090909,0.636364,1
1,A8MWD9,1,S,2,59.97,7.417,7.567,7.764,6.367,22.925,...,1.916667,2.250000,2.333333,2.500000,2.916667,6.833333,16.750000,28.583333,0.916667,1
2,A8MWD9,1,K,3,62.90,8.386,8.471,9.760,8.438,20.286,...,1.923077,2.307692,2.461538,2.769231,3.230769,7.384615,18.461538,30.846154,0.923077,1
3,A8MWD9,1,A,4,64.57,8.540,7.245,5.993,7.332,17.535,...,1.928571,2.357143,2.571429,2.928571,3.500000,7.928571,19.785714,32.785714,0.857143,1
4,A8MWD9,1,H,5,69.53,10.062,10.548,11.445,9.352,15.189,...,1.933333,2.533333,2.800000,3.200000,3.733333,8.666667,21.200000,34.666667,1.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,Q9Y617,458,E,366,96.59,-31.524,-30.837,-30.319,-29.715,-7.654,...,2.000000,4.266667,5.333333,6.466667,6.733333,17.133333,52.133333,84.600000,6.000000,0
366,Q9Y617,458,M,367,95.45,-32.060,-31.279,-30.126,-30.751,-4.561,...,2.000000,4.142857,5.285714,6.357143,6.642857,16.857143,51.285714,82.857143,5.928571,0
367,Q9Y617,458,H,368,93.04,-33.455,-32.551,-31.427,-31.969,-3.978,...,2.000000,4.076923,5.230769,6.307692,6.538462,16.000000,48.769231,79.307692,5.615385,0
368,Q9Y617,458,Q,369,78.73,-35.920,-34.626,-33.960,-33.654,-6.394,...,1.916667,3.916667,5.166667,6.083333,6.250000,15.333333,46.166667,75.333333,5.333333,0


In [68]:
alphafold_accessibility_MsrB2KD_smooth.columns

Index(['protein_id', 'protein_number', 'AA', 'position', 'quality',
       'x_coord_c', 'x_coord_ca', 'x_coord_cb', 'x_coord_n', 'y_coord_c',
       'y_coord_ca', 'y_coord_cb', 'y_coord_n', 'z_coord_c', 'z_coord_ca',
       'z_coord_cb', 'z_coord_n', 'secondary_structure', 'structure_group',
       'BEND', 'HELX', 'STRN', 'TURN', 'unstructured', 'nAA_2_180_pae',
       'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae',
       'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae',
       'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae',
       'nAA_24_180_pae', 'nAA_12_70_pae', 'nAA_2_180_pae_smooth10',
       'nAA_3_180_pae_smooth10', 'nAA_4_180_pae_smooth10',
       'nAA_4.5_180_pae_smooth10', 'nAA_5_180_pae_smooth10',
       'nAA_5.5_180_pae_smooth10', 'nAA_6_180_pae_smooth10',
       'nAA_6.5_180_pae_smooth10', 'nAA_7_180_pae_smooth10',
       'nAA_7.5_180_pae_smooth10', 'nAA_8_180_pae_smooth10',
       'nAA_12_180_pae_smooth

# Merge Dataframes into Full Dataset (Includes Alphafold) - MsrB2KD

In [69]:
alphafold_accessibility_MsrB2KD_smooth["position"] = alphafold_accessibility_MsrB2KD_smooth["position"] - 1 # zero-index the positions to match initial dataframe

peptides_with_alphafold = peptides_completed_sequence.merge(
    alphafold_accessibility_MsrB2KD_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_with_alphafold

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,2.571429,3.571429,5.333333,6.571429,8.333333,26.047619,71.952381,142.619048,6.809524,0.0
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,2.047619,3.238095,3.809524,4.857143,5.714286,12.904762,38.190476,60.285714,3.523810,0.0
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,2.285714,4.809524,5.571429,7.190476,7.380952,11.857143,18.476190,25.095238,2.047619,1.0
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,2.523810,1.0
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,2.619048,4.523810,5.857143,7.428571,8.142857,23.523810,60.000000,109.238095,7.190476,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
738,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,2.333333,3.714286,5.095238,6.809524,7.857143,22.428571,67.000000,123.000000,5.809524,0.0
739,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,2.714286,5.000000,6.142857,7.523810,7.523810,11.904762,19.571429,26.333333,2.333333,1.0
740,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,2.619048,4.142857,6.047619,7.666667,8.428571,28.761905,87.285714,172.428571,8.761905,0.0


In [72]:
#peptides_with_alphafold.to_csv(os.path.join(curr_dir_path, "MsrB2KD_with_alphafold.csv"))

In [73]:
path = os.path.join(curr_dir_path, "MsrB2KD_with_alphafold.csv")
peptides_with_alphafold = pd.read_csv(path)
peptides_with_alphafold.set_index("Unnamed: 0", inplace=True)
peptides_with_alphafold.index.name = None
peptides_with_alphafold

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,2.571429,3.571429,5.333333,6.571429,8.333333,26.047619,71.952381,142.619048,6.809524,0.0
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,2.047619,3.238095,3.809524,4.857143,5.714286,12.904762,38.190476,60.285714,3.523810,0.0
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,2.285714,4.809524,5.571429,7.190476,7.380952,11.857143,18.476190,25.095238,2.047619,1.0
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,2.523810,1.0
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,2.619048,4.523810,5.857143,7.428571,8.142857,23.523810,60.000000,109.238095,7.190476,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
738,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,2.333333,3.714286,5.095238,6.809524,7.857143,22.428571,67.000000,123.000000,5.809524,0.0
739,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,2.714286,5.000000,6.142857,7.523810,7.523810,11.904762,19.571429,26.333333,2.333333,1.0
740,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,2.619048,4.142857,6.047619,7.666667,8.428571,28.761905,87.285714,172.428571,8.761905,0.0


# The End (For Now)