## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
# Number of residues to analyse on each side of the labelled methionine.
analysis_threshold = 20

# Mass tags (as they appear inside the square brackets of a modified peptide)
# for the light and heavy methionine labels.
light_modification = "649.3660"
heavy_modification = "655.3735"

# Every tag we search for. These are plain strings; they are escaped when the
# regex is built from them.
modifications = [light_modification, heavy_modification]

## Load Dataset - Labeled Methionines

In [3]:
# path for csv output data
# The folder is given relative to the notebook's working directory and then
# resolved to an absolute path, which every later os.path.join builds on.
datasets_path_str = "../datasets/"
datasets_path = os.path.abspath(datasets_path_str)
print(datasets_path)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/datasets


In [4]:
# Load the raw peptide table from sheet "Sheet2" of the Excel workbook that
# lives in the datasets folder.
data_loc = os.path.join(datasets_path, "RvsS_DataSet.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="Sheet2")
peptides;  # trailing ';' suppresses the cell output

In [5]:
# Canonicalize data - here, drop extra columns
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: a second run finds the columns
# already gone and simply does nothing instead of raising KeyError.
peptides = peptides.drop(columns=["Unnamed: 24", "Protein ID.1"], errors="ignore")
peptides;

In [6]:
# Colour label for every peptide, in row order: the first 39 rows are "green",
# the next 143 "white" and the last 21 "yellow".
label_col_data = ["green"] * 39 + ["white"] * 143 + ["yellow"] * 21
label_col = pd.Series(label_col_data)

# The counts above are hard-coded, so fail loudly if the sheet no longer has
# exactly that many rows; an index-aligned Series assignment would instead
# leave NaN labels (or drop labels) without any warning.
if len(label_col) != len(peptides):
    raise ValueError(
        "label list has {} entries but the peptide table has {} rows".format(
            len(label_col), len(peptides)
        )
    )

# Assign by position so the labels follow row order regardless of the index.
peptides["label"] = label_col.to_numpy()
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,9 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.607501,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,-1.760150,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,-1.881011,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,-1.935236,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,-2.205174,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow


In [7]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    """Download the amino-acid sequence of one UniProt entry.

    Parameters
    ----------
    cID : str
        UniProt accession, e.g. ``"Q8C196"``.

    Returns
    -------
    str
        Sequence of the first FASTA record in the response.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an error status (e.g. unknown accession).
    ValueError
        If the response contains no FASTA record.
    """
    # Fetching a FASTA file is a read-only request, so use GET rather than
    # POST, over https, against the current UniProt REST endpoint.
    currentUrl = "https://rest.uniprot.org/uniprotkb/" + cID + ".fasta"
    # A timeout prevents one stalled request from hanging a long batch run.
    response = r.get(currentUrl, timeout=30)
    # Surface HTTP errors here instead of trying to parse an error page.
    response.raise_for_status()

    pSeq = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not pSeq:
        raise ValueError("No FASTA record returned for UniProt ID {!r}".format(cID))

    return str(pSeq[0].seq)

In [8]:
# test - get a single amino acid sequence
#first_protein_ID = peptides["Protein ID"].iloc[0]
#test_sequence = get_full_protein_seq(first_protein_ID)
#print(test_sequence[575:587])
#print(peptides["Peptide Sequence"].iloc[0])

In [9]:
# get whole amino acid sequences for methionine peptides
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#tqdm.pandas()
#peptides_completed_sequence = peptides.copy()
#peptides_completed_sequence["Complete Sequence"] = peptides_completed_sequence["Protein ID"].progress_apply(get_full_protein_seq)
#peptides_completed_sequence

# NOTE: WE CAN SPEED THIS UP BY ONLY GETTING UNIQUE PROTEINS (lots of repeats in the dataset)

In [10]:
#peptides_completed_sequence.to_csv(os.path.join(datasets_path, "RvsS_peptides_completed_sequence.csv"))

In [11]:
# Reload the peptide table (including the "Complete Sequence" column) from its
# cached CSV. The first CSV column holds the saved row index, so it is turned
# back into an unnamed index.
path = os.path.join(datasets_path, "RvsS_peptides_completed_sequence.csv")
peptides_completed_sequence = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label,Complete Sequence
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...


In [12]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    """Build a regex that matches a methionine carrying any of the given tags.

    Each tag ``t`` contributes the alternative ``M\\[t\\]`` (a literal ``M``
    followed by the tag in literal square brackets); the alternatives are
    joined with ``|``.

    Parameters
    ----------
    modifications : sequence of str
        Modification tags exactly as they appear between the brackets,
        e.g. ``["649.3660", "655.3735"]``.

    Returns
    -------
    str
        The regex source, e.g. ``M\\[649\\.3660\\]|M\\[655\\.3735\\]``.

    Raises
    ------
    ValueError
        If ``modifications`` is empty (an empty pattern would match anything).
    """
    if len(modifications) == 0:
        raise ValueError("modifications must contain at least one tag")

    # re.escape handles any regex metacharacter in the tag, so tags with no
    # dot or with several dots are treated as literally as "649.3660" is.
    return "|".join(r"M\[{}\]".format(re.escape(mod)) for mod in modifications)

# Build the pattern once from the configured modification tags and show it,
# so the exact regex used by the following cells is visible.
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
# 0-based offset of each peptide inside its protein's complete sequence
# (str.find returns -1 when the peptide does not occur in the sequence).
# The list is assigned directly, i.e. by row position; wrapping it in a
# pd.Series would align on index labels instead and could silently misplace
# values or insert NaN if the frame's index were not exactly 0..n-1.
sequence_locations = [
    complete_seq.find(peptide_seq)
    for complete_seq, peptide_seq in zip(
        peptides_completed_sequence["Complete Sequence"],
        peptides_completed_sequence["Peptide Sequence"],
    )
]
peptides_completed_sequence["Sequence Location"] = sequence_locations
peptides_completed_sequence;

In [14]:
# Character count of each "Peptide Sequence" string, i.e. the peptide length.
peptides_completed_sequence["Sequence Length"] = peptides_completed_sequence["Peptide Sequence"].str.len()
peptides_completed_sequence;

In [15]:
# sanity check - ensure sequence indexing is correct
# Cut [location, location + length) out of every complete sequence and compare
# it with the peptide itself; value_counts() should report True for all rows.
# NOTE(review): a peptide that was not found (location -1) should generally
# give a non-matching slice and therefore show up as False here - confirm.
temp = [A[B:B+C] for A, B, C in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Sequence Location"], peptides_completed_sequence["Sequence Length"])]
(temp == peptides_completed_sequence["Peptide Sequence"]).value_counts()

Peptide Sequence
True    203
Name: count, dtype: int64

In [16]:
# Two-group regex: group 1 captures everything in front of a modified
# methionine, group 2 captures the modified methionine itself.
left_prefix_pattern = f"(.*)({modifications_pattern})"
print(left_prefix_pattern)

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [17]:
# Work out which residues precede the modified methionine in each peptide
# (needed to locate the methionine inside the full protein sequence).
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` keeping only the characters listed in IUPACCodes."""
    return ''.join(filter(IUPACCodes.__contains__, string))


# Group 0 of the pattern is the text in front of the modified methionine;
# filtering then strips everything that is not an amino-acid letter.
left_prefix_raw = peptides_completed_sequence["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_completed_sequence["Left Prefix"] = left_prefix_raw
peptides_completed_sequence["Left Prefix"] = peptides_completed_sequence["Left Prefix"].map(filtering)
peptides_completed_sequence["Left Prefix Length"] = peptides_completed_sequence["Left Prefix"].str.len()

peptides_completed_sequence;

In [18]:
# 0-based index of the modified methionine within the complete protein
# sequence: where the peptide starts plus the number of residues that come
# before the methionine inside the peptide.
peptides_completed_sequence["Methionine Location"] = peptides_completed_sequence["Sequence Location"] + peptides_completed_sequence["Left Prefix Length"]
peptides_completed_sequence;

In [19]:
# Compute left/right analysis sequences based on threshold
# Left flank:  up to `analysis_threshold` residues immediately before the
#              methionine, clamped at the start of the protein.
# Right flank: up to `analysis_threshold` residues immediately after it
#              (slicing clamps at the end of the protein by itself).
# The clamp replaces the former fallback A[0:B-1], which dropped the residue
# directly left of the methionine and, for a methionine at index 0, returned
# almost the entire protein (A[0:-1]) instead of an empty flank.
left_flanks = []
right_flanks = []
for full_seq, met_idx in zip(
    peptides_completed_sequence["Complete Sequence"],
    peptides_completed_sequence["Methionine Location"],
):
    left_start = max(met_idx - analysis_threshold, 0)
    left_flanks.append(full_seq[left_start:met_idx])
    right_flanks.append(full_seq[met_idx + 1:met_idx + 1 + analysis_threshold])

peptides_completed_sequence[f"Left {analysis_threshold}"] = left_flanks
peptides_completed_sequence[f"Right {analysis_threshold}"] = right_flanks
peptides_completed_sequence;

In [20]:
#peptides_completed_sequence.to_csv(os.path.join(datasets_path, "RvsS_peptides_completed_sequence_with_thresholds.csv"))

In [21]:
# Reload the peptide table with flanking sequences from its cached CSV; the
# first CSV column is the saved row index and becomes an unnamed index again.
# NOTE(review): this replaces the in-memory frame computed by the cells above,
# so the CSV must be regenerated whenever that computation changes.
path = os.path.join(datasets_path, "RvsS_peptides_completed_sequence_with_thresholds.csv")
peptides_completed_sequence = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,FAVESMEDALKAADTIGYPV,IRSAYALGGLGSGICPNKET
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2,264,DCRIPKENLLGEPGMGFKIA,QTLDMGRIGIASQALGIAQA
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8,20,MASESGKLWGGRFVGAVDPI,EKFNSSISYDRHLWNVDVQG
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6,418,LGKLNVKLTKLTEKQAQYLG,PINGPFKPDHYRY
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6,364,SMPYTDAVIHEVQRFADVIP,NLPHRVTRDTPFRGFLIPKG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2,215,KSDLTKDITTSVLTVNNKAH,VTLDYTVQVPGTGRDGSPGF
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8,49,SAVAATYKYVNKKEQESEVD,KSATDNAARILMWTELIRGL
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3,144,KEGFEKISKGANPVEIRRGV,LAVDAVIAELKKQSKPVTTP
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0,505,AVSKRPEKVIGMHYFSPVDK,QLLEIITTDKTSKDTTASAV


## Download AlphaFold Data - Labeled Methionines

In [22]:
# Folders that hold the AlphaFold downloads: one root, resolved to an absolute
# path, with a "cif" and a "pae" sub-folder.
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)
cif_dir, pae_dir = (os.path.join(alphafold_path, sub) for sub in ("cif", "pae"))

for location in (alphafold_path, cif_dir, pae_dir):
    print(location)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [23]:
# set uniprot IDs to use
# unique() removes repeats: several peptides can belong to the same protein,
# and each protein only needs to be downloaded and annotated once.
uniprotIDs = peptides_completed_sequence["Protein ID"].unique()
uniprotIDs, len(uniprotIDs)

(array(['Q8C196', 'Q07417', 'Q91YI0', 'P50247', 'P33267', 'P97872',
        'Q9DBT9', 'P32020', 'P50136', 'P26443', 'Q91VS7', 'Q920A5',
        'O55125', 'Q05920', 'Q91VR2', 'P51881', 'P56480', 'Q61425',
        'P47962', 'P53395', 'Q921G7', 'Q9CQR4', 'P63017', 'Q62425',
        'P20029', 'O35423', 'Q61102', 'Q9WUR2', 'Q9CQQ7', 'P55096',
        'P01942', 'O35490', 'P38647', 'Q60991', 'Q9Z2I8', 'Q9EQ20',
        'Q9Z1J3', 'Q5U458', 'O35129', 'Q99JY0', 'Q9WTP6', 'Q99LC5',
        'P16460', 'Q9CZW5', 'P08226', 'Q03265', 'P63038', 'Q8CAQ8',
        'O35386', 'Q9Z2I0', 'P54869', 'P97742', 'Q9DCN1', 'Q9WUM5',
        'P60710', 'P68033', 'Q9CQ69', 'Q9DBJ1', 'Q9CR62', 'P19783',
        'Q8QZY2', 'O70579', 'Q9CQN1', 'Q8BH95', 'P51660', 'Q8BGY7',
        'Q8CC88', 'Q9CRB9', 'Q9Z0X1', 'Q9QXD1', 'Q2TPA8', 'P24270',
        'Q9CPQ8', 'Q80XN0', 'Q925I1', 'Q8CIM7', 'Q9Z1P6', 'Q8VC30',
        'Q9R0H0', 'P51658', 'P16332', 'Q8VDN2', 'Q8BJ64', 'P42125',
        'Q99MR8', 'P29758', 'Q5FW57', 'P62821', 

In [24]:
# download cif data for proteins
# SLOW THE FIRST TIME
# Files are written to cif_dir. The call returns three values, bound here to
# "valid", "invalid" and "existing" - presumably the proteins fetched now,
# those that could not be fetched, and those already on disk (TODO confirm
# against the structuremap documentation).
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 113/113 [00:00<00:00, 101794.75it/s]

2024-05-04 12:55:36> Valid proteins: 0
2024-05-04 12:55:36> Invalid proteins: 0
2024-05-04 12:55:36> Existing proteins: 113





In [25]:
# download pae data for proteins
# SLOW THE FIRST TIME
# Files are written to pae_dir. The call returns three values, bound here to
# "valid", "invalid" and "existing" - presumably the proteins fetched now,
# those that could not be fetched, and those already on disk (TODO confirm
# against the structuremap documentation).
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 113/113 [00:00<00:00, 90122.90it/s]

2024-05-04 12:55:36> Valid proteins: 0
2024-05-04 12:55:36> Invalid proteins: 0
2024-05-04 12:55:36> Existing proteins: 113





## Construct AlphaFold DataFrame (Calculate Accessibilities) - Labeled Methionines

In [26]:
# format alphafold data into dataframe
# Reads the downloaded files in cif_dir, restricted to the proteins in
# uniprotIDs. Every later accessibility calculation takes this frame as input.
alphafold_annotation = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=uniprotIDs)
alphafold_annotation

100%|██████████| 1617/1617 [00:08<00:00, 195.63it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,51.060,49.845,51.772,unstructured,unstructured,0,0,0,0,1
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,50.385,50.224,50.740,TURN_TY1_P,TURN,0,0,0,1,0
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,53.616,55.022,52.617,HELX_RH_AL_P,HELX,0,1,0,0,0
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,52.434,52.182,52.692,HELX_RH_AL_P,HELX,0,1,0,0,0
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,49.354,48.180,50.434,HELX_RH_AL_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-18.565,-18.413,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-18.909,-17.814,-18.401,BEND,BEND,1,0,0,0,0
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-21.302,-22.660,-20.673,unstructured,unstructured,0,0,0,0,1
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-19.459,-19.112,-20.119,unstructured,unstructured,0,0,0,0,1


In [27]:
# calculate full sphere exposure -> radius = 2
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=2, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad2;

100%|██████████| 113/113 [00:01<00:00, 59.61it/s] 


In [28]:
# Start the accessibility table: the structural annotation plus the radius-2
# exposure columns, matched residue by residue.
residue_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = alphafold_annotation.merge(
    exposure_sphere_rad2,
    on=residue_keys,
    how='left',
)
alphafold_accessibility;

In [29]:
# calculate full sphere exposure -> radius = 3
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=3, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad3;

100%|██████████| 113/113 [00:00<00:00, 180.30it/s]


In [30]:
# Add the radius-3 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad3.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad3, how='left', on=merge_keys)
)
alphafold_accessibility;

In [31]:
# calculate full sphere exposure -> radius = 4
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=4, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4;

100%|██████████| 113/113 [00:00<00:00, 192.95it/s]


In [32]:
# Add the radius-4 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad4.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad4, how='left', on=merge_keys)
)
alphafold_accessibility;

In [33]:
# calculate full sphere exposure -> radius = 4.5
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=4.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad4_5;

100%|██████████| 113/113 [00:00<00:00, 143.60it/s]


In [34]:
# Add the radius-4.5 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad4_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad4_5, how='left', on=merge_keys)
)
alphafold_accessibility;

In [35]:
# calculate full sphere exposure -> radius = 5
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5;

100%|██████████| 113/113 [00:00<00:00, 181.27it/s]


In [36]:
# Add the radius-5 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad5, how='left', on=merge_keys)
)
alphafold_accessibility;

In [37]:
# calculate full sphere exposure -> radius = 5.5
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=5.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad5_5;

100%|██████████| 113/113 [00:00<00:00, 180.91it/s]


In [38]:
# Add the radius-5.5 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad5_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad5_5, how='left', on=merge_keys)
)
alphafold_accessibility;

In [39]:
# calculate full sphere exposure -> radius = 6
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6;

100%|██████████| 113/113 [00:00<00:00, 169.10it/s]


In [40]:
# Add the radius-6 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad6.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad6, how='left', on=merge_keys)
)
alphafold_accessibility;

In [41]:
# calculate full sphere exposure -> radius = 6.5
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=6.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad6_5;

100%|██████████| 113/113 [00:00<00:00, 174.49it/s]


In [42]:
# Add the radius-6.5 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad6_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad6_5, how='left', on=merge_keys)
)
alphafold_accessibility;

In [43]:
# calculate full sphere exposure -> radius = 7
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=7, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7;

100%|██████████| 113/113 [00:00<00:00, 172.39it/s]


In [44]:
# Add the radius-7 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad7.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad7, how='left', on=merge_keys)
)
alphafold_accessibility;

In [45]:
# calculate full sphere exposure -> radius = 7.5
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=7.5, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad7_5;

100%|██████████| 113/113 [00:00<00:00, 164.69it/s]


In [46]:
# Add the radius-7.5 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad7_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad7_5, how='left', on=merge_keys)
)
alphafold_accessibility;

In [47]:
# calculate full sphere exposure -> radius = 8
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=8, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad8;

100%|██████████| 113/113 [00:00<00:00, 162.36it/s]


In [48]:
# Add the radius-8 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad8.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad8, how='left', on=merge_keys)
)
alphafold_accessibility;

In [49]:
# calculate full sphere exposure -> radius = 12
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad12;

100%|██████████| 113/113 [00:00<00:00, 139.57it/s]


In [50]:
# Add the radius-12 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad12.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad12, how='left', on=merge_keys)
)
alphafold_accessibility;

In [51]:
# calculate full sphere exposure -> radius = 18
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad18;

100%|██████████| 113/113 [00:01<00:00, 97.31it/s]


In [52]:
# Add the radius-18 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad18.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad18, how='left', on=merge_keys)
)
alphafold_accessibility;

In [53]:
# calculate full sphere exposure -> radius = 24
# max_dist is the sphere radius; max_angle=180 is the "full sphere" setting
# (no angular restriction); error_dir points at the downloaded PAE files.
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
exposure_sphere_rad24;

100%|██████████| 113/113 [00:01<00:00, 66.70it/s]


In [54]:
# Add the radius-24 exposure columns to the accessibility table, matched
# residue by residue. Columns left over from an earlier run of this cell are
# dropped first, so re-running it cannot create duplicated `_x`/`_y` columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_sphere_rad24.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad24, how='left', on=merge_keys)
)
alphafold_accessibility;

In [55]:
# calculate part sphere exposure -> angle = 70, radius = 12
# Same call as the full-sphere cells, but max_angle=70 restricts the search to
# part of the sphere; error_dir points at the downloaded PAE files.
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
exposure_ang70_rad12;

100%|██████████| 113/113 [00:00<00:00, 135.92it/s]


In [56]:
# Add the angle-70 / radius-12 exposure columns to the accessibility table,
# matched residue by residue. Columns left over from an earlier run of this
# cell are dropped first, so re-running it cannot create duplicated `_x`/`_y`
# columns.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility = (
    alphafold_accessibility
    .drop(columns=exposure_ang70_rad12.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_ang70_rad12, how='left', on=merge_keys)
)
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,1,1,1,1,1,1,3,5,7,0
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,0,2,2,2,2,2,4,6,9,0
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,1,2,2,2,2,2,5,7,10,0
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,0,2,2,2,2,2,6,9,11,1
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,2,2,2,2,2,2,8,9,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,2,2,4,4,5,5,18,53,91,12
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,2,2,2,5,5,7,15,44,78,4
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,2,2,2,2,2,2,5,15,36,0
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,2,2,2,2,2,2,3,7,19,0


In [57]:
# List the columns to check that every exposure column was merged in exactly
# once (duplicates would show up with `_x` / `_y` suffixes).
alphafold_accessibility.columns

Index(['protein_id', 'protein_number', 'AA', 'position', 'quality',
       'x_coord_c', 'x_coord_ca', 'x_coord_cb', 'x_coord_n', 'y_coord_c',
       'y_coord_ca', 'y_coord_cb', 'y_coord_n', 'z_coord_c', 'z_coord_ca',
       'z_coord_cb', 'z_coord_n', 'secondary_structure', 'structure_group',
       'BEND', 'HELX', 'STRN', 'TURN', 'unstructured', 'nAA_2_180_pae',
       'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae',
       'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae',
       'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae',
       'nAA_24_180_pae', 'nAA_12_70_pae'],
      dtype='object')

In [58]:
# Smooth every exposure column. The trailing [10] is the list of smoothing
# settings; it produces one `<column>_smooth10` column per input column.
exposure_columns = [
    'nAA_2_180_pae',
    'nAA_3_180_pae',
    'nAA_4_180_pae',
    'nAA_4.5_180_pae',
    'nAA_5_180_pae',
    'nAA_5.5_180_pae',
    'nAA_6_180_pae',
    'nAA_6.5_180_pae',
    'nAA_7_180_pae',
    'nAA_7.5_180_pae',
    'nAA_8_180_pae',
    'nAA_12_180_pae',
    'nAA_18_180_pae',
    'nAA_24_180_pae',
    'nAA_12_70_pae',
]
alphafold_accessibility_smooth = get_smooth_score(
    alphafold_accessibility,
    np.array(exposure_columns),
    [10],
)
alphafold_accessibility_smooth;

100%|██████████| 113/113 [00:00<00:00, 357.10it/s]


In [59]:
# Flag residues with IDR = 1 when their smoothed radius-24 exposure value is
# at or below 34.27, and IDR = 0 otherwise.
# NOTE(review): 34.27 is an unexplained cut-off - record where it comes from.
is_idr = alphafold_accessibility_smooth['nAA_24_180_pae_smooth10'] <= 34.27
alphafold_accessibility_smooth['IDR'] = np.where(is_idr, 1, 0)
alphafold_accessibility_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,1.909091,1.909091,1.909091,1.909091,1.909091,6.000000,8.727273,11.181818,1.000000,1
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,1.916667,1.916667,1.916667,1.916667,1.916667,5.916667,8.666667,11.416667,1.000000,1
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,1.923077,1.923077,1.923077,1.923077,1.923077,5.769231,8.538462,11.538462,0.923077,1
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,1.857143,1.857143,1.928571,1.928571,1.928571,5.642857,8.500000,11.571429,0.857143,1
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,1.800000,1.800000,1.933333,1.933333,1.933333,5.466667,8.466667,11.733333,0.800000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,2.133333,3.533333,5.133333,6.333333,7.000000,20.666667,47.266667,85.200000,6.133333,0
429,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,2.071429,3.571429,5.214286,6.428571,7.142857,20.285714,46.357143,83.642857,6.142857,0
430,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,2.076923,3.538462,5.230769,6.461538,7.153846,20.307692,46.153846,83.307692,6.615385,0
431,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,2.083333,3.500000,5.083333,6.250000,7.000000,20.333333,45.583333,81.916667,6.833333,0


In [60]:
# List the columns to confirm the `_smooth10` columns and the IDR flag exist.
alphafold_accessibility_smooth.columns

Index(['protein_id', 'protein_number', 'AA', 'position', 'quality',
       'x_coord_c', 'x_coord_ca', 'x_coord_cb', 'x_coord_n', 'y_coord_c',
       'y_coord_ca', 'y_coord_cb', 'y_coord_n', 'z_coord_c', 'z_coord_ca',
       'z_coord_cb', 'z_coord_n', 'secondary_structure', 'structure_group',
       'BEND', 'HELX', 'STRN', 'TURN', 'unstructured', 'nAA_2_180_pae',
       'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae',
       'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae',
       'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae',
       'nAA_24_180_pae', 'nAA_12_70_pae', 'nAA_2_180_pae_smooth10',
       'nAA_3_180_pae_smooth10', 'nAA_4_180_pae_smooth10',
       'nAA_4.5_180_pae_smooth10', 'nAA_5_180_pae_smooth10',
       'nAA_5.5_180_pae_smooth10', 'nAA_6_180_pae_smooth10',
       'nAA_6.5_180_pae_smooth10', 'nAA_7_180_pae_smooth10',
       'nAA_7.5_180_pae_smooth10', 'nAA_8_180_pae_smooth10',
       'nAA_12_180_pae_smooth

## Merge DataFrames into Full Dataset (Includes AlphaFold) - Labeled Methionines

In [61]:
# The AlphaFold table numbers residues from 1, whereas "Methionine Location"
# is a 0-based string index, so the positions are shifted down by one before
# joining. The guard makes the shift happen only once: an unconditional
# in-place `position - 1` would shift again on every re-run of this cell and
# silently join each methionine to the wrong residue.
# (Assumes the un-shifted table contains a residue at position 1.)
if alphafold_accessibility_smooth["position"].min() >= 1:
    alphafold_accessibility_smooth["position"] = alphafold_accessibility_smooth["position"] - 1 # zero-index the positions to match initial dataframe

# Left join: every peptide row is kept and receives the structural columns of
# the residue at its methionine position (NaN where no residue matches).
peptides_with_alphafold = peptides_completed_sequence.merge(
    alphafold_accessibility_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_with_alphafold

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,2.380952,3.666667,4.857143,6.190476,7.857143,22.190476,59.952381,120.857143,6.285714,0
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.761905,4.857143,6.285714,8.047619,9.047619,30.571429,90.857143,176.571429,8.285714,0
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.238095,3.285714,4.380952,5.238095,5.523810,11.238095,29.571429,61.952381,2.380952,0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.095238,3.238095,4.428571,5.380952,5.952381,13.380952,27.047619,48.190476,3.047619,0
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.857143,4.238095,6.095238,7.380952,8.904762,27.523810,85.285714,171.619048,7.809524,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,2.523810,3.380952,4.428571,6.095238,7.666667,16.476190,42.238095,79.904762,2.380952,0
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,2.000000,2.904762,3.238095,4.000000,4.238095,7.190476,12.523810,19.428571,0.857143,1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,2.857143,5.285714,6.238095,8.476190,8.952381,26.095238,62.571429,115.380952,7.571429,0
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,2.571429,3.428571,4.904762,6.761905,8.190476,25.952381,80.380952,173.428571,7.190476,0


In [62]:
#peptides_with_alphafold.to_csv(os.path.join(datasets_path, "RvsS_peptides_with_alphafold.csv"))

In [63]:
# Reload the cached peptide + AlphaFold table from CSV. The saved row index comes back
# as the "Unnamed: 0" column, so use it as the index and clear that placeholder name.
path = os.path.join(datasets_path, "RvsS_peptides_with_alphafold.csv")
peptides_with_alphafold = pd.read_csv(path, index_col="Unnamed: 0")
peptides_with_alphafold.index.name = None
peptides_with_alphafold

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,2.380952,3.666667,4.857143,6.190476,7.857143,22.190476,59.952381,120.857143,6.285714,0
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.761905,4.857143,6.285714,8.047619,9.047619,30.571429,90.857143,176.571429,8.285714,0
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.238095,3.285714,4.380952,5.238095,5.523810,11.238095,29.571429,61.952381,2.380952,0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.095238,3.238095,4.428571,5.380952,5.952381,13.380952,27.047619,48.190476,3.047619,0
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.857143,4.238095,6.095238,7.380952,8.904762,27.523810,85.285714,171.619048,7.809524,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,2.523810,3.380952,4.428571,6.095238,7.666667,16.476190,42.238095,79.904762,2.380952,0
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,2.000000,2.904762,3.238095,4.000000,4.238095,7.190476,12.523810,19.428571,0.857143,1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,2.857143,5.285714,6.238095,8.476190,8.952381,26.095238,62.571429,115.380952,7.571429,0
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,2.571429,3.428571,4.904762,6.761905,8.190476,25.952381,80.380952,173.428571,7.190476,0


# Load Dataset (MitoCarta3.0) - Full Mitochondrial Proteome

In [64]:
# Read the "A Mouse MitoCarta3.0" sheet of the MitoCarta3.0 workbook from the datasets folder.
data_loc = os.path.join(datasets_path, "Mouse.MitoCarta3.0.xls")
mitocarta3_0 = pd.read_excel(io=data_loc, sheet_name="A Mouse MitoCarta3.0")
mitocarta3_0

Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,liver_total_peak_intensity_log10,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability)
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.0,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported)
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported)
2,66043,513.0,Atp5d,0610008F14Rik|1500000I11Rik|AA960090|AI876556|...,"ATP synthase, H+ transporting, mitochondrial F...",MitoCarta3.0,"literature, APEX_matrix, targetP signal, yeast...",MIM,OXPHOS > Complex V > CV subunits | OXPHOS > OX...,Tmito,...,9.8,9.8,9.5,9.8,10.1,10.0,9.8,9.8,9.6,
3,74316,122961.0,Isca2,0710001C05Rik|5730594E03Rik|Hbld|Hbld1,iron-sulfur cluster assembly 2,MitoCarta3.0,"APEX_matrix, targetP signal+, yeast mito homol...",Matrix,Metabolism > Metals and cofactors > Fe-S clust...,Tmito,...,8.2,7.9,7.2,8.7,8.5,8.4,8.0,8.5,8.5,
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,71703,51566.0,Armcx3,1200004E24Rik|AI450003|ALEX3,"armadillo repeat containing, X-linked 3",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tpossible_mito,...,,,,,,,,,,Nucleoplasm (Approved)
1136,67474,9342.0,Snap29,1300018G05Rik|AI891940|AU020222|BB131856|Gs32,synaptosomal-associated protein 29,MitoCarta3.0,APEX_IMS,IMS,Mitochondrial dynamics and surveillance > Auto...,Tmito,...,,,,,,,,,,Cytosol (Supported)
1137,18970,5423.0,Polb,A430088C08Rik,"polymerase (DNA directed), beta",MitoCarta3.0,literature,Matrix,Mitochondrial central dogma > mtDNA maintenanc...,Tnon_mito,...,,,,,,,,,,Vesicles (Uncertain)
1138,109729085,109703458.0,Htd2,-,hydroxyacyl-thioester dehydratase type 2,MitoCarta3.0,literature,Matrix,Metabolism > Lipid metabolism > Type II fatty ...,NA - newly added to NCBI Entrez Gene,...,,,,,,,,,,Mitochondria (Supported)


In [65]:
# Count the proteins in the mitochondrial proteome: flag MitoCarta rows whose HPA
# main-location annotation contains "mitoch" (case-insensitive).
# Some rows have no HPA annotation at all; `na=False` turns those into plain False so the
# mask is a true boolean Series instead of an object Series with NaN holes.
is_mitochondrial = mitocarta3_0["HPA_Main_Location_2020 (Reliability)"].str.contains(
    "mitoch", case=False, na=False
)
# Tally matches vs. non-matches (display suppressed by the trailing semicolon).
(is_mitochondrial == True).value_counts(dropna=False);

In [66]:
# Sanity check for the "mitoch" substring match: lift the row-display cap, (optionally)
# list every matched HPA location string, then put the display option back to its default.
pd.options.display.max_rows = None
#display(mitocarta3_0[is_mitochondrial == True]["HPA_Main_Location_2020 (Reliability)"].value_counts())
pd.reset_option('display.max_rows')

In [67]:
# Keep only the MitoCarta3.0 rows that the HPA-location match flagged as mitochondrial.
mitocarta3_0_mitochondrial = mitocarta3_0.loc[is_mitochondrial.eq(True)]
mitocarta3_0_mitochondrial;

In [68]:
# How many mitochondrial rows lack a UniProt accession? (display suppressed)
pd.isna(mitocarta3_0_mitochondrial["UniProt"]).value_counts();

In [69]:
# Remove the rows that have no UniProt accession (just one).
mitocarta3_0_mitochondrial = mitocarta3_0_mitochondrial.loc[
    mitocarta3_0_mitochondrial["UniProt"].notna()
]
mitocarta3_0_mitochondrial;

In [70]:
# get whole amino acid sequences for mitochondrial proteome
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#tqdm.pandas()
#mitocarta3_0_mitochondrial_completed_sequence = mitocarta3_0_mitochondrial.copy()
#mitocarta3_0_mitochondrial_completed_sequence["Complete Sequence"] = mitocarta3_0_mitochondrial_completed_sequence["UniProt"].progress_apply(get_full_protein_seq)
#mitocarta3_0_mitochondrial_completed_sequence

In [71]:
#mitocarta3_0_mitochondrial_completed_sequence.to_csv(os.path.join(datasets_path, "RvsS_full_mitochondrial_completed_sequence.csv"))

In [72]:
# Reload the cached mitochondrial table that already carries the "Complete Sequence" column.
# The saved row index comes back as "Unnamed: 0"; use it as the index and clear that name.
path = os.path.join(datasets_path, "RvsS_full_mitochondrial_completed_sequence.csv")
mitocarta3_0_mitochondrial_completed_sequence = pd.read_csv(path, index_col="Unnamed: 0")
mitocarta3_0_mitochondrial_completed_sequence.index.name = None
mitocarta3_0_mitochondrial_completed_sequence

Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability),Complete Sequence
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported),MAAAAASLRRTVLGPRGVGLPGASAPGLLGGARSRQLPLRTPQAVS...
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported),MRKMLAAVSRVLAGSAQKPASRVLVASRNFANDATFEIKKCDLHRL...
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported),MAVVAGLVRGPLRQASGLLKRRFHRSAPAAVQLTVREAINQGMDEE...
5,22273,7384.0,Uqcrc1,1110032G10Rik,ubiquinol-cytochrome c reductase core protein 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",MIM,"Protein import, sorting and homeostasis > Prot...",Tmito,...,10.8,10.3,10.1,10.6,10.2,10.3,10.2,9.9,Mitochondria (Supported),MAASAVCRAACSGTQVLLRTRRSPALLRLPALRGTATFAQALQSVP...
7,56282,6182.0,Mrpl12,0610034O11Rik|1500031N16Rik|L12mt|MRP-|MRP-L12...,mitochondrial ribosomal protein L12,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Mitochondrial central dogma > mtRNA metabolism...,Tmito,...,8.9,8.6,9.3,9.2,9.8,9.0,9.4,9.0,Mitochondria (Supported),MLPVAASRCLWGPRLGLRGAALRLARQQMPSVCAARQLRSSSHRRS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,67759,55848.0,Plgrkt,1110007H22Rik|5033414D02Rik|AI852040|Plg-|Plg-...,"plasminogen receptor, C-terminal lysine transm...",MitoCarta3.0,GFP,Membrane,0,Tmito,...,,6.9,8.6,9.3,9.5,8.1,9.5,8.7,Mitochondria (Uncertain),MGFIFSKSMNENMKNQQEFMVTHARLQLERHLTMQNEMRERQMAMQ...
1129,78248,51309.0,Armcx1,3010033I09Rik|ALEX1,"armadillo repeat containing, X-linked 1",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tnon_mito,...,,,,,,,,,Mitochondria;Nucleoplasm (Approved),MGRTREAGCVAAGMVIGAGACYCVYRLTWGKDENEKLWDEEEEEEE...
1130,67416,9823.0,Armcx2,3230401N03Rik|AI043003|ALEX2,"armadillo repeat containing, X-linked 2",MitoCarta3.0,literature,Membrane,0,Tnon_mito,...,,,,,,,,,Mitochondria (Approved),MSRARDAGCVAAGIVIGASAWYCVYKYTRGKDQKKKRLTKPKNRAS...
1132,269642,339983.0,Nat8l,1110038O08Rik|Sh|Shati,N-acetyltransferase 8-like,MitoCarta3.0,literature,Membrane,Metabolism > Amino acid metabolism > Glutamate...,Tpossible_mito,...,,,,,,,,,Mitochondria (Approved),MHCGPPDMVCETKIVATEDHEALPGAKKDALLVAAGAMWPPLPAAP...


In [73]:
# NOTE: the ProteinLength recorded in the database does not always equal the length of the
# sequence retrieved by UniProt ID - unexpected. Tabulate the differences (display suppressed).
mitocarta3_0_mitochondrial_completed_sequence["ProteinLength"].sub(
    mitocarta3_0_mitochondrial_completed_sequence["Complete Sequence"].str.len()
).value_counts();

# Download Alphafold Data - Full Mitochondrial Proteome

In [74]:
# Locations for AlphaFold protein data: one root folder with "cif" and "pae" subfolders.
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir, pae_dir = (os.path.join(alphafold_path, sub) for sub in ("cif", "pae"))

# Echo the three resolved paths, one per line.
print(alphafold_path, cif_dir, pae_dir, sep="\n")

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [75]:
# NOTE: the AlphaFold download rejected these accessions:
#   invalid_proteins_cif -> ['F7C846', 'Q9JLT4', 'Q3UW66', 'Q91XR9']
# They are secondary UniProt IDs - fine for querying UniProt, but not AlphaFold -
# so map them to their primary IDs by hand.
replace_dict = {
    'F7C846': 'Q8R5C0',
    'Q3UW66': 'Q99J99',
    'Q91XR9': 'O70325',
}
# Two odd cases that end up excluded instead:
#   Q9JLT4          - seems to be a primary ID already, but is not in AlphaFold
#   Q91XR9 / O70325 - O70325 exists in UniProt, but not in AlphaFold

mitocarta3_0_mitochondrial_completed_sequence["UniProt-Primary"] = (
    mitocarta3_0_mitochondrial_completed_sequence["UniProt"].replace(to_replace=replace_dict)
)

# Rows for the two accessions without an AlphaFold entry are removed by index label.
no_alphafold_model = (
    mitocarta3_0_mitochondrial_completed_sequence["UniProt"].eq("Q9JLT4")
    | mitocarta3_0_mitochondrial_completed_sequence["UniProt-Primary"].eq("O70325")
)
mitocarta3_0_mitochondrial_completed_sequence = mitocarta3_0_mitochondrial_completed_sequence.drop(
    index=mitocarta3_0_mitochondrial_completed_sequence.index[no_alphafold_model]
)
mitocarta3_0_mitochondrial_completed_sequence;

In [76]:
# Distinct primary UniProt accessions to request from AlphaFold.
uniprotIDs_fullproteome = pd.unique(
    mitocarta3_0_mitochondrial_completed_sequence["UniProt-Primary"]
)
uniprotIDs_fullproteome, len(uniprotIDs_fullproteome);

In [77]:
# Fetch the AlphaFold CIF files for every accession into cif_dir.
# SLOW THE FIRST TIME
(
    valid_proteins_cif,
    invalid_proteins_cif,
    existing_proteins_cif,
) = download_alphafold_cif(out_folder=cif_dir, proteins=uniprotIDs_fullproteome)

100%|██████████| 528/528 [00:00<00:00, 184426.43it/s]

2024-05-04 12:55:59> Valid proteins: 0
2024-05-04 12:55:59> Invalid proteins: 0
2024-05-04 12:55:59> Existing proteins: 528





In [78]:
# Fetch the AlphaFold PAE files for every accession into pae_dir.
# SLOW THE FIRST TIME
(
    valid_proteins_pae,
    invalid_proteins_pae,
    existing_proteins_pae,
) = download_alphafold_pae(out_folder=pae_dir, proteins=uniprotIDs_fullproteome)

100%|██████████| 528/528 [00:00<00:00, 193701.79it/s]

2024-05-04 12:55:59> Valid proteins: 0
2024-05-04 12:55:59> Invalid proteins: 0
2024-05-04 12:55:59> Existing proteins: 528





# Construct Alphafold Dataframe (Calculate Accessibilities) - Full Mitochondrial Proteome

In [79]:
# Parse the downloaded CIF files into a single dataframe covering the requested proteins.
alphafold_annotation_full = format_alphafold_data(
    protein_ids=uniprotIDs_fullproteome,
    directory=cif_dir,
)
alphafold_annotation_full

100%|██████████| 1617/1617 [00:30<00:00, 53.53it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,38.096,37.222,39.345,unstructured,unstructured,0,0,0,0,1
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,38.724,39.162,38.463,unstructured,unstructured,0,0,0,0,1
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,36.289,36.675,37.473,unstructured,unstructured,0,0,0,0,1
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,33.884,32.424,34.597,unstructured,unstructured,0,0,0,0,1
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,34.618,35.575,34.658,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,-27.747,-27.920,-26.354,HELX_RH_AL_P,HELX,0,1,0,0,0
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,-27.713,-26.580,-27.423,HELX_RH_AL_P,HELX,0,1,0,0,0
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,-27.292,-26.038,-27.216,HELX_RH_AL_P,HELX,0,1,0,0,0
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,-30.321,-30.226,-29.116,HELX_RH_AL_P,HELX,0,1,0,0,0


In [80]:
# Full-sphere exposure (max_angle=180), radius = 2; error_dir is the PAE download folder.
exposure_sphere_rad2 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=2,
    df=alphafold_annotation_full,
)
exposure_sphere_rad2;

100%|██████████| 528/528 [00:02<00:00, 244.69it/s]


In [81]:
# Start the accessibility table: the base annotation plus the radius-2 exposure,
# matched per residue on (protein_id, AA, position).
alphafold_accessibility_full = pd.merge(
    alphafold_annotation_full,
    exposure_sphere_rad2,
    on=['protein_id', 'AA', 'position'],
    how='left',
)
alphafold_accessibility_full;

In [82]:
# Full-sphere exposure (max_angle=180), radius = 3; error_dir is the PAE download folder.
exposure_sphere_rad3 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=3,
    df=alphafold_annotation_full,
)
exposure_sphere_rad3;

100%|██████████| 528/528 [00:01<00:00, 270.66it/s]


In [83]:
# Add the radius-3 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad3.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad3, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [84]:
# Full-sphere exposure (max_angle=180), radius = 4; error_dir is the PAE download folder.
exposure_sphere_rad4 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=4,
    df=alphafold_annotation_full,
)
exposure_sphere_rad4;

100%|██████████| 528/528 [00:01<00:00, 266.23it/s]


In [85]:
# Add the radius-4 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad4.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad4, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [86]:
# Full-sphere exposure (max_angle=180), radius = 4.5; error_dir is the PAE download folder.
exposure_sphere_rad4_5 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=4.5,
    df=alphafold_annotation_full,
)
exposure_sphere_rad4_5;

100%|██████████| 528/528 [00:01<00:00, 266.65it/s]


In [87]:
# Add the radius-4.5 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad4_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad4_5, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [88]:
# Full-sphere exposure (max_angle=180), radius = 5; error_dir is the PAE download folder.
exposure_sphere_rad5 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=5,
    df=alphafold_annotation_full,
)
exposure_sphere_rad5;

100%|██████████| 528/528 [00:02<00:00, 261.16it/s]


In [89]:
# Add the radius-5 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad5, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [90]:
# Full-sphere exposure (max_angle=180), radius = 5.5; error_dir is the PAE download folder.
exposure_sphere_rad5_5 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=5.5,
    df=alphafold_annotation_full,
)
exposure_sphere_rad5_5;

100%|██████████| 528/528 [00:02<00:00, 259.99it/s]


In [91]:
# Add the radius-5.5 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad5_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad5_5, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [92]:
# Full-sphere exposure (max_angle=180), radius = 6; error_dir is the PAE download folder.
exposure_sphere_rad6 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=6,
    df=alphafold_annotation_full,
)
exposure_sphere_rad6;

100%|██████████| 528/528 [00:02<00:00, 255.90it/s]


In [93]:
# Add the radius-6 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad6.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad6, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [94]:
# Full-sphere exposure (max_angle=180), radius = 6.5; error_dir is the PAE download folder.
exposure_sphere_rad6_5 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=6.5,
    df=alphafold_annotation_full,
)
exposure_sphere_rad6_5;

100%|██████████| 528/528 [00:02<00:00, 248.77it/s]


In [95]:
# Add the radius-6.5 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad6_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad6_5, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [96]:
# Full-sphere exposure (max_angle=180), radius = 7; error_dir is the PAE download folder.
exposure_sphere_rad7 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=7,
    df=alphafold_annotation_full,
)
exposure_sphere_rad7;

100%|██████████| 528/528 [00:02<00:00, 244.58it/s]


In [97]:
# Add the radius-7 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad7.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad7, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [98]:
# Full-sphere exposure (max_angle=180), radius = 7.5; error_dir is the PAE download folder.
exposure_sphere_rad7_5 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=7.5,
    df=alphafold_annotation_full,
)
exposure_sphere_rad7_5;

100%|██████████| 528/528 [00:02<00:00, 230.23it/s]


In [99]:
# Add the radius-7.5 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad7_5.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad7_5, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [100]:
# Full-sphere exposure (max_angle=180), radius = 8; error_dir is the PAE download folder.
exposure_sphere_rad8 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=8,
    df=alphafold_annotation_full,
)
exposure_sphere_rad8;

100%|██████████| 528/528 [00:02<00:00, 240.07it/s]


In [101]:
# Add the radius-8 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad8.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad8, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [102]:
# Full-sphere exposure (max_angle=180), radius = 12; error_dir is the PAE download folder.
exposure_sphere_rad12 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=12,
    df=alphafold_annotation_full,
)
exposure_sphere_rad12;

100%|██████████| 528/528 [00:02<00:00, 204.94it/s]


In [103]:
# Add the radius-12 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad12.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad12, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [104]:
# Full-sphere exposure (max_angle=180), radius = 18; error_dir is the PAE download folder.
exposure_sphere_rad18 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=18,
    df=alphafold_annotation_full,
)
exposure_sphere_rad18;

100%|██████████| 528/528 [00:03<00:00, 142.28it/s]


In [105]:
# Add the radius-18 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad18.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad18, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [106]:
# Full-sphere exposure (max_angle=180), radius = 24; error_dir is the PAE download folder.
exposure_sphere_rad24 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=180,
    max_dist=24,
    df=alphafold_annotation_full,
)
exposure_sphere_rad24;

100%|██████████| 528/528 [00:05<00:00, 99.85it/s] 


In [107]:
# Add the radius-24 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad24.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_sphere_rad24, how='left', on=merge_keys)
)
alphafold_accessibility_full;

In [108]:
# Part-sphere exposure: angle = 70, radius = 12; error_dir is the PAE download folder.
exposure_ang70_rad12 = annotate_accessibility(
    error_dir=pae_dir,
    max_angle=70,
    max_dist=12,
    df=alphafold_annotation_full,
)
exposure_ang70_rad12;

100%|██████████| 528/528 [00:02<00:00, 198.83it/s]


In [109]:
# Add the angle-70 / radius-12 exposure column(s) to the running accessibility table.
# This cell rebinds its own input, so running it twice would merge the same column again and
# leave pandas-suffixed duplicates (`..._x` / `..._y`). Dropping any copy left by an earlier
# run first keeps the cell safe to re-run; on a fresh run nothing is dropped.
merge_keys = ['protein_id', 'AA', 'position']
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_ang70_rad12.columns.difference(merge_keys), errors='ignore')
    .merge(exposure_ang70_rad12, how='left', on=merge_keys)
)
alphafold_accessibility_full

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1,1,1,1,1,1,2,3,4,0
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,1,2,2,2,2,2,3,4,6,0
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,2,2,2,2,2,2,4,5,6,0
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,1,2,2,2,2,2,4,6,8,0
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,1,2,2,2,2,2,4,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,2,2,2,2,3,3,6,9,15,0
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,2,2,2,2,2,3,6,8,13,1
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,2,2,2,2,2,3,5,8,11,1
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,2,2,2,2,2,2,5,6,9,2


In [110]:
# List the columns of the merged table to confirm every exposure column was added.
alphafold_accessibility_full.columns

Index(['protein_id', 'protein_number', 'AA', 'position', 'quality',
       'x_coord_c', 'x_coord_ca', 'x_coord_cb', 'x_coord_n', 'y_coord_c',
       'y_coord_ca', 'y_coord_cb', 'y_coord_n', 'z_coord_c', 'z_coord_ca',
       'z_coord_cb', 'z_coord_n', 'secondary_structure', 'structure_group',
       'BEND', 'HELX', 'STRN', 'TURN', 'unstructured', 'nAA_2_180_pae',
       'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae',
       'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae',
       'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae',
       'nAA_24_180_pae', 'nAA_12_70_pae'],
      dtype='object')

In [111]:
# Smooth every exposure column; the window list [10] is what gives the resulting
# columns their `_smooth10` suffix.
alphafold_accessibility_full_smooth = get_smooth_score(
    alphafold_accessibility_full,
    np.array([
        'nAA_2_180_pae',
        'nAA_3_180_pae',
        'nAA_4_180_pae',
        'nAA_4.5_180_pae',
        'nAA_5_180_pae',
        'nAA_5.5_180_pae',
        'nAA_6_180_pae',
        'nAA_6.5_180_pae',
        'nAA_7_180_pae',
        'nAA_7.5_180_pae',
        'nAA_8_180_pae',
        'nAA_12_180_pae',
        'nAA_18_180_pae',
        'nAA_24_180_pae',
        'nAA_12_70_pae',
    ]),
    [10],
)
alphafold_accessibility_full_smooth;

100%|██████████| 528/528 [00:00<00:00, 676.14it/s]


In [112]:
# Flag residues as IDR (1) when the smoothed radius-24 full-sphere exposure is at most 34.27,
# otherwise 0. NOTE(review): 34.27 is a hard-coded cutoff - presumably taken from the
# structuremap tutorial; confirm its provenance.
alphafold_accessibility_full_smooth['IDR'] = (
    alphafold_accessibility_full_smooth['nAA_24_180_pae_smooth10']
    .le(34.27)
    .to_numpy()
    .astype(int)
)
alphafold_accessibility_full_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,1.909091,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,1.916667,1.916667,1.916667,1.916667,1.916667,3.750000,5.666667,8.666667,0.000000,1
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,1.923077,1.923077,1.923077,1.923077,1.923077,3.769231,5.769231,8.769231,0.000000,1
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,1.928571,1.928571,1.928571,1.928571,1.928571,3.857143,5.857143,8.714286,0.000000,1
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,1.933333,1.933333,1.933333,1.933333,1.933333,3.866667,5.866667,8.733333,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,2.066667,2.533333,3.333333,3.866667,4.333333,10.133333,20.333333,38.200000,1.666667,0
227,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,2.071429,2.571429,3.214286,3.714286,4.071429,9.500000,19.357143,35.785714,1.500000,0
228,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,2.076923,2.461538,3.076923,3.615385,4.000000,9.153846,18.615385,34.000000,1.615385,1
229,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,2.083333,2.500000,3.000000,3.583333,4.000000,9.250000,18.250000,33.250000,1.750000,1


In [113]:
# List the columns again: the `_smooth10` columns and `IDR` should now be present.
alphafold_accessibility_full_smooth.columns

Index(['protein_id', 'protein_number', 'AA', 'position', 'quality',
       'x_coord_c', 'x_coord_ca', 'x_coord_cb', 'x_coord_n', 'y_coord_c',
       'y_coord_ca', 'y_coord_cb', 'y_coord_n', 'z_coord_c', 'z_coord_ca',
       'z_coord_cb', 'z_coord_n', 'secondary_structure', 'structure_group',
       'BEND', 'HELX', 'STRN', 'TURN', 'unstructured', 'nAA_2_180_pae',
       'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae', 'nAA_5_180_pae',
       'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae',
       'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae',
       'nAA_24_180_pae', 'nAA_12_70_pae', 'nAA_2_180_pae_smooth10',
       'nAA_3_180_pae_smooth10', 'nAA_4_180_pae_smooth10',
       'nAA_4.5_180_pae_smooth10', 'nAA_5_180_pae_smooth10',
       'nAA_5.5_180_pae_smooth10', 'nAA_6_180_pae_smooth10',
       'nAA_6.5_180_pae_smooth10', 'nAA_7_180_pae_smooth10',
       'nAA_7.5_180_pae_smooth10', 'nAA_8_180_pae_smooth10',
       'nAA_12_180_pae_smooth

# Merge Dataframes into Full Dataset (Includes Alphafold) - Full Mitochondrial Proteome (TODO: review this section)

In [114]:
# Keep only the methionine residues (AA == "M").
# `.copy()` makes this an independent frame: the following cell assigns to its "position"
# column, and writing into a filtered slice triggers pandas' SettingWithCopy behaviour
# (hidden here because warnings are globally ignored).
mitocarta3_0_methionines = alphafold_accessibility_full_smooth.loc[
    alphafold_accessibility_full_smooth["AA"] == "M"
].copy()
mitocarta3_0_methionines

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,1.909091,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1
7,A2ADF7,1,M,8,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,1.944444,1.944444,1.944444,1.944444,1.944444,4.000000,6.555556,10.500000,0.111111,1
11,A2ADF7,1,M,12,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,2.000000,2.190476,2.333333,2.476190,2.476190,5.000000,9.142857,17.666667,0.285714,1
16,A2ADF7,1,M,17,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,2.142857,2.904762,3.285714,3.857143,3.904762,7.666667,17.238095,32.380952,1.047619,1
96,A2ADF7,1,M,97,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,2.428571,4.809524,6.000000,7.619048,8.142857,19.047619,51.666667,104.095238,5.095238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,S4R2K0,528,M,91,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,2.285714,4.380952,6.000000,7.523810,8.571429,25.428571,66.619048,119.285714,7.761905,0
132,S4R2K0,528,M,133,98.22,14.774,14.306,13.644,15.422,-7.821,...,2.428571,3.761905,5.142857,6.380952,7.666667,18.952381,49.142857,89.476190,4.380952,0
203,S4R2K0,528,M,204,98.79,0.508,1.302,2.674,1.468,3.585,...,2.333333,5.000000,6.333333,7.619048,8.809524,28.714286,82.714286,134.619048,8.571429,0
215,S4R2K0,528,M,216,98.03,3.940,3.358,2.382,2.708,12.422,...,2.333333,3.904762,5.000000,5.761905,6.714286,18.619048,46.714286,87.380952,4.428571,0


In [115]:
# Build the methionine table with zero-indexed positions (to match the initial dataframe)
# and attach each protein's UniProt IDs and complete sequence.
#
# The positions are derived from the 1-based source table rather than decremented in place:
# an in-place `position = position - 1` on this cell's own input shifts the positions again
# on every re-run and writes into a filtered slice. Re-deriving gives the same result on a
# first run and the cell stays correct when re-executed.
mitocarta3_0_methionines = (
    alphafold_accessibility_full_smooth
    .loc[alphafold_accessibility_full_smooth["AA"] == "M"]
    .assign(position=lambda residues: residues["position"] - 1)
)

# NOTE(review): if "UniProt-Primary" is not unique in the sequence table, this left merge
# repeats methionine rows - confirm before relying on row counts.
mitocarta3_0_methionines_with_alphafold = mitocarta3_0_methionines.merge(
    mitocarta3_0_mitochondrial_completed_sequence[["UniProt", "UniProt-Primary", "Complete Sequence"]],
    how="left",
    left_on="protein_id",
    right_on="UniProt-Primary"
)
mitocarta3_0_methionines_with_alphafold

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,UniProt,UniProt-Primary,Complete Sequence
0,A2ADF7,1,M,0,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
1,A2ADF7,1,M,7,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,1.944444,1.944444,4.000000,6.555556,10.500000,0.111111,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
2,A2ADF7,1,M,11,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,2.476190,2.476190,5.000000,9.142857,17.666667,0.285714,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
3,A2ADF7,1,M,16,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,3.857143,3.904762,7.666667,17.238095,32.380952,1.047619,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
4,A2ADF7,1,M,96,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,7.619048,8.142857,19.047619,51.666667,104.095238,5.095238,0,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,S4R2K0,528,M,90,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,7.523810,8.571429,25.428571,66.619048,119.285714,7.761905,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4844,S4R2K0,528,M,132,98.22,14.774,14.306,13.644,15.422,-7.821,...,6.380952,7.666667,18.952381,49.142857,89.476190,4.380952,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4845,S4R2K0,528,M,203,98.79,0.508,1.302,2.674,1.468,3.585,...,7.619048,8.809524,28.714286,82.714286,134.619048,8.571429,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4846,S4R2K0,528,M,215,98.03,3.940,3.358,2.382,2.708,12.422,...,5.761905,6.714286,18.619048,46.714286,87.380952,4.428571,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...


In [116]:
# Disabled export: if uncommented, this writes mitocarta3_0_methionines_with_alphafold
# to "RvsS_full_mitochondrial_with_alphafold.csv" inside datasets_path. The following
# cell reads that same file back from disk.
# NOTE(review): presumably left commented out so that re-running the notebook does not
# overwrite the previously saved CSV — confirm before re-enabling.
#mitocarta3_0_methionines_with_alphafold.to_csv(os.path.join(datasets_path, "RvsS_full_mitochondrial_with_alphafold.csv"))

In [117]:
# Reload the saved AlphaFold-annotated table from the datasets directory.
path = os.path.join(datasets_path, "RvsS_full_mitochondrial_with_alphafold.csv")

# The CSV's first column is read back under the header "Unnamed: 0"; promote it
# to the row index and clear the index name so the frame displays without an
# index label.
mitocarta3_0_methionines_with_alphafold = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
mitocarta3_0_methionines_with_alphafold

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,UniProt,UniProt-Primary,Complete Sequence
0,A2ADF7,1,M,0,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
1,A2ADF7,1,M,7,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,1.944444,1.944444,4.000000,6.555556,10.500000,0.111111,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
2,A2ADF7,1,M,11,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,2.476190,2.476190,5.000000,9.142857,17.666667,0.285714,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
3,A2ADF7,1,M,16,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,3.857143,3.904762,7.666667,17.238095,32.380952,1.047619,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
4,A2ADF7,1,M,96,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,7.619048,8.142857,19.047619,51.666667,104.095238,5.095238,0,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,S4R2K0,528,M,90,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,7.523810,8.571429,25.428571,66.619048,119.285714,7.761905,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4844,S4R2K0,528,M,132,98.22,14.774,14.306,13.644,15.422,-7.821,...,6.380952,7.666667,18.952381,49.142857,89.476190,4.380952,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4845,S4R2K0,528,M,203,98.79,0.508,1.302,2.674,1.468,3.585,...,7.619048,8.809524,28.714286,82.714286,134.619048,8.571429,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4846,S4R2K0,528,M,215,98.03,3.940,3.358,2.382,2.708,12.422,...,5.761905,6.714286,18.619048,46.714286,87.380952,4.428571,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...


# End of Data Generation (For Now)