## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
from tqdm import tqdm
import tempfile
import csv
import requests as r
from Bio import SeqIO
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import scipy


# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

## Set Parameters of Analysis

In [2]:
analysis_threshold = 20 # number of amino acids either side to analyze

modifications = ["649.3660", "655.3735"] # which modifications we are looking for, as regex strings
heavy_modification = "655.3735" 
light_modification = "649.3660"

## Load Dataset - Labeled Methionines

In [3]:
# path for csv output data
datasets_path_str = "../datasets/"
datasets_path = os.path.abspath(datasets_path_str)
print(datasets_path)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/datasets


In [4]:
data_loc = os.path.join(datasets_path, "RvsS_DataSet.xlsx")
peptides = pd.read_excel(data_loc, sheet_name="Sheet2")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,Unnamed: 24,Protein ID.1
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,,Q8C196
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,,Q07417
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,,Q91YI0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,,P50247
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,,P33267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,,Q9QXF8
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,,Q8K3J1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,,P63038
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,,Q8BMS1


In [5]:
# Canonicalize data
#peptides["Assigned Modifications"] = peptides["Assigned Modifications"].str.replace("(", "[", regex=False)
#peptides["Assigned Modifications"] = peptides["Assigned Modifications"].str.replace(")", "]", regex=False)

peptides.drop(columns=["Unnamed: 24", "Protein ID.1"], axis=1, inplace=True)
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,8 Log2 Ratio HL,9 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,,,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,,,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,,,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.839492,2.607501,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,,,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,-1.613004,-1.760150,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,-1.891596,-1.881011,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,-1.829820,-1.935236,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,-2.326194,-2.205174,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072


In [6]:
label_col_data = ["green"] * 39 + ["white"] * 143 + ["yellow"] * 21
label_col = pd.Series(label_col_data)
peptides["label"] = label_col
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,9 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.607501,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,-1.760150,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,-1.881011,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,-1.935236,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,-2.205174,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow


In [7]:
# helper function to get full amino acid sequence for a protein
def get_full_protein_seq(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [8]:
# test - get a single amino acid sequence
#first_protein_ID = peptides["Protein ID"].iloc[0]
#test_sequence = get_full_protein_seq(first_protein_ID)
#print(test_sequence[575:587])
#print(peptides["Peptide Sequence"].iloc[0])

In [9]:
# get whole amino acid sequences for methionine peptides
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#tqdm.pandas()
#peptides_completed_sequence = peptides.copy()
#peptides_completed_sequence["Complete Sequence"] = peptides_completed_sequence["Protein ID"].progress_apply(get_full_protein_seq)
#peptides_completed_sequence

# NOTE: WE CAN SPEED THIS UP BY ONLY GETTING UNIQUE PROTEINS (lots of repeats in the dataset)

In [10]:
#peptides_completed_sequence.to_csv(os.path.join(datasets_path, "RvsS_peptides_completed_sequence.csv"))

In [11]:
path = os.path.join(datasets_path, "RvsS_peptides_completed_sequence.csv")
peptides_completed_sequence = pd.read_csv(path)
peptides_completed_sequence.set_index("Unnamed: 0", inplace=True)
peptides_completed_sequence.index.name = None
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label,Complete Sequence
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...


In [12]:
# create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):

    split_mod = modifications[0].split(".")
    whole = split_mod[0]
    mantissa = split_mod[1]
    pattern = r"M\[{}\.{}\]".format(whole, mantissa)

    for i in range(1, len(modifications)):
        split_mod = modifications[i].split(".")
        whole = split_mod[0]
        mantissa = split_mod[1]
        pattern += r"|M\[{}\.{}\]".format(whole, mantissa)
    
    return pattern

modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
peptides_completed_sequence["Sequence Location"] = pd.Series([a.find(b) for a, b in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Peptide Sequence"])])
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label,Complete Sequence,Sequence Location
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505


In [14]:
peptides_completed_sequence["Sequence Length"] = peptides_completed_sequence["Peptide Sequence"].str.len()
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11


In [15]:
temp = [A[B:B+C] for A, B, C in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Sequence Location"], peptides_completed_sequence["Sequence Length"])]
(temp == peptides_completed_sequence["Peptide Sequence"]).value_counts()

Peptide Sequence
True    203
Name: count, dtype: int64

In [16]:
# create regex pattern to identify desired modifications
left_prefix_pattern = "(.*)(" + modifications_pattern + ")"
print(left_prefix_pattern)

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [17]:
# extract left prefix of modified methionine (for indexing purposes)
IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"
filtering = lambda string: ''.join([char for char in string if char in IUPACCodes])

peptides_completed_sequence["Left Prefix"] = peptides_completed_sequence["Light Modified Peptide"].str.extract(left_prefix_pattern)[0]
peptides_completed_sequence["Left Prefix"] = peptides_completed_sequence["Left Prefix"].map(filtering)
peptides_completed_sequence["Left Prefix Length"] = peptides_completed_sequence["Left Prefix"].str.len()

peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Protein Description,p-value,neglogp,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0


In [18]:
peptides_completed_sequence["Methionine Location"] = peptides_completed_sequence["Sequence Location"] + peptides_completed_sequence["Left Prefix Length"]
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,p-value,neglogp,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2,264
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8,20
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6,418
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6,364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2,215
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8,49
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3,144
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0,505


In [19]:
# Compute left/right analysis sequences based on threshold
peptides_completed_sequence[f"Left {analysis_threshold}"] = [A[B-1-analysis_threshold:B-1]  if (B - 1 - analysis_threshold >= 0) else A[0:B-1] for A, B in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Methionine Location"])]
peptides_completed_sequence[f"Right {analysis_threshold}"] = [A[B+1:B+1+analysis_threshold] for A, B in zip(peptides_completed_sequence["Complete Sequence"], peptides_completed_sequence["Methionine Location"])]
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,SFAVESMEDALKAADTIGYP,IRSAYALGGLGSGICPNKET
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2,264,EDCRIPKENLLGEPGMGFKI,QTLDMGRIGIASQALGIAQA
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8,20,MASESGKLWGGRFVGAVDP,EKFNSSISYDRHLWNVDVQG
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6,418,HLGKLNVKLTKLTEKQAQYL,PINGPFKPDHYRY
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6,364,TSMPYTDAVIHEVQRFADVI,NLPHRVTRDTPFRGFLIPKG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2,215,YKSDLTKDITTSVLTVNNKA,VTLDYTVQVPGTGRDGSPGF
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8,49,SSAVAATYKYVNKKEQESEV,KSATDNAARILMWTELIRGL
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3,144,AKEGFEKISKGANPVEIRRG,LAVDAVIAELKKQSKPVTTP
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0,505,AAVSKRPEKVIGMHYFSPVD,QLLEIITTDKTSKDTTASAV


In [20]:
#peptides_completed_sequence.to_csv(os.path.join(datasets_path, "RvsS_peptides_completed_sequence_with_thresholds.csv"))

In [21]:
path = os.path.join(datasets_path, "RvsS_peptides_completed_sequence_with_thresholds.csv")
peptides_completed_sequence = pd.read_csv(path)
peptides_completed_sequence.set_index("Unnamed: 0", inplace=True)
peptides_completed_sequence.index.name = None
peptides_completed_sequence

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,SFAVESMEDALKAADTIGYP,IRSAYALGGLGSGICPNKET
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2,264,EDCRIPKENLLGEPGMGFKI,QTLDMGRIGIASQALGIAQA
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8,20,MASESGKLWGGRFVGAVDP,EKFNSSISYDRHLWNVDVQG
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6,418,HLGKLNVKLTKLTEKQAQYL,PINGPFKPDHYRY
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6,364,TSMPYTDAVIHEVQRFADVI,NLPHRVTRDTPFRGFLIPKG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2,215,YKSDLTKDITTSVLTVNNKA,VTLDYTVQVPGTGRDGSPGF
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8,49,SSAVAATYKYVNKKEQESEV,KSATDNAARILMWTELIRGL
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3,144,AKEGFEKISKGANPVEIRRG,LAVDAVIAELKKQSKPVTTP
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0,505,AAVSKRPEKVIGMHYFSPVD,QLLEIITTDKTSKDTTASAV


# Download Alphafold Data - Labeled Methionines

In [22]:
# path for alphafold protein data
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [23]:
# set uniprot IDs to use
uniprotIDs = peptides_completed_sequence["Protein ID"].values
uniprotIDs

array(['Q8C196', 'Q07417', 'Q91YI0', 'P50247', 'P33267', 'P97872',
       'Q9DBT9', 'P32020', 'P32020', 'P50136', 'P26443', 'Q91VS7',
       'Q920A5', 'O55125', 'Q8C196', 'Q05920', 'Q91VR2', 'Q91VR2',
       'P51881', 'P56480', 'P97872', 'Q61425', 'P47962', 'P53395',
       'Q921G7', 'Q9CQR4', 'Q921G7', 'P63017', 'Q62425', 'P20029',
       'O35423', 'Q61102', 'Q8C196', 'Q9WUR2', 'Q9CQQ7', 'P55096',
       'P01942', 'O35490', 'P38647', 'Q60991', 'P56480', 'Q9Z2I8',
       'Q9EQ20', 'Q9Z1J3', 'Q5U458', 'O35129', 'Q05920', 'Q99JY0',
       'P55096', 'Q91VS7', 'Q99JY0', 'Q9WTP6', 'Q99LC5', 'P16460',
       'Q9CZW5', 'P08226', 'P51881', 'P08226', 'Q03265', 'P55096',
       'P16460', 'Q99LC5', 'P63038', 'Q8CAQ8', 'O35386', 'Q9Z2I0',
       'P54869', 'P56480', 'Q9Z2I8', 'Q8C196', 'P97742', 'P55096',
       'P63038', 'Q9DCN1', 'P56480', 'Q05920', 'Q9WUM5', 'P63038',
       'Q05920', 'Q91VS7', 'P56480', 'P60710', 'Q9Z2I8', 'Q8C196',
       'P68033', 'Q9CQ69', 'Q9DBJ1', 'Q9CR62', 'P19783', 'P630

In [24]:
# download cif data for proteins
# SLOW THE FIRST TIME
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 203/203 [00:00<00:00, 42246.88it/s]

2024-04-25 21:21:02> Valid proteins: 0
2024-04-25 21:21:02> Invalid proteins: 0
2024-04-25 21:21:02> Existing proteins: 203





In [25]:
# download pae data for proteins
# SLOW THE FIRST TIME
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 203/203 [00:00<00:00, 98786.83it/s]

2024-04-25 21:21:03> Valid proteins: 0
2024-04-25 21:21:03> Invalid proteins: 0
2024-04-25 21:21:03> Existing proteins: 203





## Construct Alphafold Dataframe (Calculate Accessibilities) - Labeled Methionines

In [26]:
# format alphafold data into dataframe
alphafold_annotation = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=uniprotIDs)
alphafold_annotation

  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='i

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,51.060,49.845,51.772,unstructured,unstructured,0,0,0,0,1
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,50.385,50.224,50.740,TURN_TY1_P,TURN,0,0,0,1,0
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,53.616,55.022,52.617,HELX_RH_AL_P,HELX,0,1,0,0,0
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,52.434,52.182,52.692,HELX_RH_AL_P,HELX,0,1,0,0,0
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,49.354,48.180,50.434,HELX_RH_AL_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-18.565,-18.413,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-18.909,-17.814,-18.401,BEND,BEND,1,0,0,0,0
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-21.302,-22.660,-20.673,unstructured,unstructured,0,0,0,0,1
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-19.459,-19.112,-20.119,unstructured,unstructured,0,0,0,0,1


In [27]:
# calculate full sphere exposure -> radius = 24
full_sphere_exposure_24 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_24

100%|██████████| 113/113 [00:03<00:00, 34.72it/s]


Unnamed: 0,protein_id,AA,position,nAA_24_180_pae
0,O35129,M,1,7
1,O35129,A,2,9
2,O35129,Q,3,10
3,O35129,N,4,11
4,O35129,L,5,11
...,...,...,...,...
52926,Q9Z2I8,S,429,91
52927,Q9Z2I8,V,430,78
52928,Q9Z2I8,A,431,36
52929,Q9Z2I8,K,432,19


In [28]:
alphafold_accessibility = alphafold_annotation.merge(
    full_sphere_exposure_24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,49.845,51.772,unstructured,unstructured,0,0,0,0,1,7
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,50.224,50.740,TURN_TY1_P,TURN,0,0,0,1,0,9
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,55.022,52.617,HELX_RH_AL_P,HELX,0,1,0,0,0,10
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,52.182,52.692,HELX_RH_AL_P,HELX,0,1,0,0,0,11
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,48.180,50.434,HELX_RH_AL_P,HELX,0,1,0,0,0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-18.413,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0,91
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-17.814,-18.401,BEND,BEND,1,0,0,0,0,78
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-22.660,-20.673,unstructured,unstructured,0,0,0,0,1,36
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-19.112,-20.119,unstructured,unstructured,0,0,0,0,1,19


In [29]:
# calculate full sphere exposure -> angle = 70, radius = 12
part_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
part_sphere_exposure

100%|██████████| 113/113 [00:00<00:00, 123.50it/s]


Unnamed: 0,protein_id,AA,position,nAA_12_70_pae
0,O35129,M,1,0
1,O35129,A,2,0
2,O35129,Q,3,0
3,O35129,N,4,1
4,O35129,L,5,1
...,...,...,...,...
52926,Q9Z2I8,S,429,12
52927,Q9Z2I8,V,430,4
52928,Q9Z2I8,A,431,0
52929,Q9Z2I8,K,432,0


In [30]:
alphafold_accessibility = alphafold_accessibility.merge(
    part_sphere_exposure, how='left', on=['protein_id','AA','position'])
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,51.772,unstructured,unstructured,0,0,0,0,1,7,0
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,50.740,TURN_TY1_P,TURN,0,0,0,1,0,9,0
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,52.617,HELX_RH_AL_P,HELX,0,1,0,0,0,10,0
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,52.692,HELX_RH_AL_P,HELX,0,1,0,0,0,11,1
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,50.434,HELX_RH_AL_P,HELX,0,1,0,0,0,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0,91,12
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-18.401,BEND,BEND,1,0,0,0,0,78,4
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-20.673,unstructured,unstructured,0,0,0,0,1,36,0
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-20.119,unstructured,unstructured,0,0,0,0,1,19,0


In [31]:
# calculate full sphere exposure -> radius = 6
full_sphere_exposure_6 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_6

100%|██████████| 113/113 [00:00<00:00, 175.37it/s]


Unnamed: 0,protein_id,AA,position,nAA_6_180_pae
0,O35129,M,1,1
1,O35129,A,2,2
2,O35129,Q,3,2
3,O35129,N,4,2
4,O35129,L,5,2
...,...,...,...,...
52926,Q9Z2I8,S,429,2
52927,Q9Z2I8,V,430,2
52928,Q9Z2I8,A,431,2
52929,Q9Z2I8,K,432,2


In [32]:
alphafold_accessibility = alphafold_accessibility.merge(
    full_sphere_exposure_6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,unstructured,unstructured,0,0,0,0,1,7,0,1
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,TURN_TY1_P,TURN,0,0,0,1,0,9,0,2
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,HELX_RH_AL_P,HELX,0,1,0,0,0,10,0,2
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,HELX_RH_AL_P,HELX,0,1,0,0,0,11,1,2
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,HELX_RH_AL_P,HELX,0,1,0,0,0,11,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,HELX_RH_AL_P,HELX,0,1,0,0,0,91,12,2
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,BEND,BEND,1,0,0,0,0,78,4,2
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,unstructured,unstructured,0,0,0,0,1,36,0,2
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,unstructured,unstructured,0,0,0,0,1,19,0,2


In [33]:
# calculate full sphere exposure -> radius = 12
full_sphere_exposure_12 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_12

100%|██████████| 113/113 [00:00<00:00, 140.01it/s]


Unnamed: 0,protein_id,AA,position,nAA_12_180_pae
0,O35129,M,1,3
1,O35129,A,2,4
2,O35129,Q,3,5
3,O35129,N,4,6
4,O35129,L,5,8
...,...,...,...,...
52926,Q9Z2I8,S,429,18
52927,Q9Z2I8,V,430,15
52928,Q9Z2I8,A,431,5
52929,Q9Z2I8,K,432,3


In [34]:
alphafold_accessibility = alphafold_accessibility.merge(
    full_sphere_exposure_12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,unstructured,0,0,0,0,1,7,0,1,3
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,TURN,0,0,0,1,0,9,0,2,4
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,HELX,0,1,0,0,0,10,0,2,5
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,HELX,0,1,0,0,0,11,1,2,6
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,HELX,0,1,0,0,0,11,1,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,HELX,0,1,0,0,0,91,12,2,18
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,BEND,1,0,0,0,0,78,4,2,15
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,unstructured,0,0,0,0,1,36,0,2,5
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,unstructured,0,0,0,0,1,19,0,2,3


In [35]:
# calculate full sphere exposure -> radius = 18
full_sphere_exposure_18 = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_18

100%|██████████| 113/113 [00:01<00:00, 99.36it/s]


Unnamed: 0,protein_id,AA,position,nAA_18_180_pae
0,O35129,M,1,5
1,O35129,A,2,6
2,O35129,Q,3,7
3,O35129,N,4,9
4,O35129,L,5,9
...,...,...,...,...
52926,Q9Z2I8,S,429,53
52927,Q9Z2I8,V,430,44
52928,Q9Z2I8,A,431,15
52929,Q9Z2I8,K,432,7


In [36]:
alphafold_accessibility = alphafold_accessibility.merge(
    full_sphere_exposure_18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,0,0,0,0,1,7,0,1,3,5
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,0,0,0,1,0,9,0,2,4,6
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,0,1,0,0,0,10,0,2,5,7
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,0,1,0,0,0,11,1,2,6,9
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,0,1,0,0,0,11,1,2,8,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,0,1,0,0,0,91,12,2,18,53
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,1,0,0,0,0,78,4,2,15,44
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,0,0,0,0,1,36,0,2,5,15
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,0,0,0,0,1,19,0,2,3,7


In [37]:
alphafold_accessibility_smooth = get_smooth_score(
    alphafold_accessibility, 
    np.array(['nAA_6_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae']), 
    [10])
alphafold_accessibility_smooth

100%|██████████| 113/113 [00:00<00:00, 693.63it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,1,7,0,1,3,5,1.909091,6.000000,8.727273,11.181818
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,0,9,0,2,4,6,1.916667,5.916667,8.666667,11.416667
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,0,10,0,2,5,7,1.923077,5.769231,8.538462,11.538462
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,0,11,1,2,6,9,1.857143,5.642857,8.500000,11.571429
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,0,11,1,2,8,9,1.800000,5.466667,8.466667,11.733333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,0,91,12,2,18,53,2.133333,20.666667,47.266667,85.200000
429,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,0,78,4,2,15,44,2.071429,20.285714,46.357143,83.642857
430,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,1,36,0,2,5,15,2.076923,20.307692,46.153846,83.307692
431,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,1,19,0,2,3,7,2.083333,20.333333,45.583333,81.916667


In [38]:
alphafold_accessibility_smooth['IDR'] = np.where(
    alphafold_accessibility_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,7,0,1,3,5,1.909091,6.000000,8.727273,11.181818,1
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,9,0,2,4,6,1.916667,5.916667,8.666667,11.416667,1
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,10,0,2,5,7,1.923077,5.769231,8.538462,11.538462,1
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,11,1,2,6,9,1.857143,5.642857,8.500000,11.571429,1
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,11,1,2,8,9,1.800000,5.466667,8.466667,11.733333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,91,12,2,18,53,2.133333,20.666667,47.266667,85.200000,0
429,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,78,4,2,15,44,2.071429,20.285714,46.357143,83.642857,0
430,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,36,0,2,5,15,2.076923,20.307692,46.153846,83.307692,0
431,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,19,0,2,3,7,2.083333,20.333333,45.583333,81.916667,0


# Merge Dataframes into Full Dataset (Includes Alphafold) - Labeled Methionines

In [39]:
alphafold_accessibility_smooth["position"] -= 1 # zero-index the positions to match initial dataframe

peptides_with_alphafold = peptides_completed_sequence.merge(
    alphafold_accessibility_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_with_alphafold

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,144,5,3,31,85,2.380952,22.190476,59.952381,120.857143,0
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,173,1,2,19,90,2.761905,30.571429,90.857143,176.571429,0
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,52,2,3,9,24,2.238095,11.238095,29.571429,61.952381,0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,39,5,2,15,24,2.095238,13.380952,27.047619,48.190476,0
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,207,6,3,27,104,2.857143,27.523810,85.285714,171.619048,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,104,2,3,25,60,2.523810,16.476190,42.238095,79.904762,0
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,19,0,2,7,11,2.000000,7.190476,12.523810,19.428571,1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,128,2,3,26,71,2.857143,26.095238,62.571429,115.380952,0
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,158,8,2,15,55,2.571429,25.952381,80.380952,173.428571,0


In [40]:
#peptides_with_alphafold.to_csv(os.path.join(datasets_path, "RvsS_peptides_with_alphafold.csv"))

In [41]:
path = os.path.join(datasets_path, "RvsS_peptides_with_alphafold.csv")
peptides_with_alphafold = pd.read_csv(path)
peptides_with_alphafold.set_index("Unnamed: 0", inplace=True)
peptides_with_alphafold.index.name = None
peptides_with_alphafold

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,144,5,3,31,85,2.380952,22.190476,59.952381,120.857143,0
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,173,1,2,19,90,2.761905,30.571429,90.857143,176.571429,0
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,52,2,3,9,24,2.238095,11.238095,29.571429,61.952381,0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,39,5,2,15,24,2.095238,13.380952,27.047619,48.190476,0
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,207,6,3,27,104,2.857143,27.523810,85.285714,171.619048,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,104,2,3,25,60,2.523810,16.476190,42.238095,79.904762,0
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,19,0,2,7,11,2.000000,7.190476,12.523810,19.428571,1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,128,2,3,26,71,2.857143,26.095238,62.571429,115.380952,0
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,158,8,2,15,55,2.571429,25.952381,80.380952,173.428571,0


# Load Dataset (MitoCarta3.0) - Full Mitochondrial Proteome

In [42]:
data_loc = os.path.join(datasets_path, "Mouse.MitoCarta3.0.xls")
mitocarta3_0 = pd.read_excel(data_loc, sheet_name="A Mouse MitoCarta3.0")
mitocarta3_0

Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,liver_total_peak_intensity_log10,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability)
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.0,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported)
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported)
2,66043,513.0,Atp5d,0610008F14Rik|1500000I11Rik|AA960090|AI876556|...,"ATP synthase, H+ transporting, mitochondrial F...",MitoCarta3.0,"literature, APEX_matrix, targetP signal, yeast...",MIM,OXPHOS > Complex V > CV subunits | OXPHOS > OX...,Tmito,...,9.8,9.8,9.5,9.8,10.1,10.0,9.8,9.8,9.6,
3,74316,122961.0,Isca2,0710001C05Rik|5730594E03Rik|Hbld|Hbld1,iron-sulfur cluster assembly 2,MitoCarta3.0,"APEX_matrix, targetP signal+, yeast mito homol...",Matrix,Metabolism > Metals and cofactors > Fe-S clust...,Tmito,...,8.2,7.9,7.2,8.7,8.5,8.4,8.0,8.5,8.5,
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,71703,51566.0,Armcx3,1200004E24Rik|AI450003|ALEX3,"armadillo repeat containing, X-linked 3",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tpossible_mito,...,,,,,,,,,,Nucleoplasm (Approved)
1136,67474,9342.0,Snap29,1300018G05Rik|AI891940|AU020222|BB131856|Gs32,synaptosomal-associated protein 29,MitoCarta3.0,APEX_IMS,IMS,Mitochondrial dynamics and surveillance > Auto...,Tmito,...,,,,,,,,,,Cytosol (Supported)
1137,18970,5423.0,Polb,A430088C08Rik,"polymerase (DNA directed), beta",MitoCarta3.0,literature,Matrix,Mitochondrial central dogma > mtDNA maintenanc...,Tnon_mito,...,,,,,,,,,,Vesicles (Uncertain)
1138,109729085,109703458.0,Htd2,-,hydroxyacyl-thioester dehydratase type 2,MitoCarta3.0,literature,Matrix,Metabolism > Lipid metabolism > Type II fatty ...,NA - newly added to NCBI Entrez Gene,...,,,,,,,,,,Mitochondria (Supported)


In [43]:
# calculate number of proteins in the mitochondrial proteome
is_mitochondrial = (mitocarta3_0["HPA_Main_Location_2020 (Reliability)"].str.contains("mitoch", case=False))
(is_mitochondrial == True).value_counts(dropna=False)

HPA_Main_Location_2020 (Reliability)
False    609
True     531
Name: count, dtype: int64

In [44]:
# ensure protein split was done correctly (correct mitoch... string matching)
pd.set_option('display.max_rows', None)
display(mitocarta3_0[is_mitochondrial == True]["HPA_Main_Location_2020 (Reliability)"].value_counts())
pd.reset_option('display.max_rows')

HPA_Main_Location_2020 (Reliability)
Mitochondria (Supported)                               266
Mitochondria (Enhanced)                                108
Mitochondria (Approved)                                 87
Mitochondria;Nucleoplasm (Supported)                    17
Mitochondria;Nucleoplasm (Approved)                     16
Cytosol;Mitochondria (Approved)                          6
Cytosol;Mitochondria (Supported)                         5
Mitochondria;Nucleoli (Approved)                         3
Mitochondria (Uncertain)                                 3
Mitochondria;Nuclear bodies (Supported)                  2
Mitochondria;Plasma membrane (Supported)                 2
Cytosol;Mitochondria (Enhanced)                          2
Mitochondria;Nuclear bodies (Approved)                   1
Mitochondria;Nucleoplasm;Vesicles (Approved)             1
Mitochondria;Nucleoli fibrillar center (Approved)        1
Cytosol;Mitochondria;Nucleoplasm (Approved)              1
Mitochondria;Peroxi

In [45]:
# filter MitoCarta3.0 dataset to only include mitochondrial proteins
mitocarta3_0_mitochondrial = mitocarta3_0[is_mitochondrial == True]
mitocarta3_0_mitochondrial

Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,liver_total_peak_intensity_log10,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability)
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.0,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported)
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported)
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported)
5,22273,7384.0,Uqcrc1,1110032G10Rik,ubiquinol-cytochrome c reductase core protein 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",MIM,"Protein import, sorting and homeostasis > Prot...",Tmito,...,10.0,10.8,10.3,10.1,10.6,10.2,10.3,10.2,9.9,Mitochondria (Supported)
7,56282,6182.0,Mrpl12,0610034O11Rik|1500031N16Rik|L12mt|MRP-|MRP-L12...,mitochondrial ribosomal protein L12,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Mitochondrial central dogma > mtRNA metabolism...,Tmito,...,9.3,8.9,8.6,9.3,9.2,9.8,9.0,9.4,9.0,Mitochondria (Supported)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129,78248,51309.0,Armcx1,3010033I09Rik|ALEX1,"armadillo repeat containing, X-linked 1",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tnon_mito,...,,,,,,,,,,Mitochondria;Nucleoplasm (Approved)
1130,67416,9823.0,Armcx2,3230401N03Rik|AI043003|ALEX2,"armadillo repeat containing, X-linked 2",MitoCarta3.0,literature,Membrane,0,Tnon_mito,...,,,,,,,,,,Mitochondria (Approved)
1132,269642,339983.0,Nat8l,1110038O08Rik|Sh|Shati,N-acetyltransferase 8-like,MitoCarta3.0,literature,Membrane,Metabolism > Amino acid metabolism > Glutamate...,Tpossible_mito,...,,,,,,,,,,Mitochondria (Approved)
1134,76781,64863.0,Mettl4,2410198H06Rik|A730091E08Rik|AV296509|HsT66|HsT661,methyltransferase like 4,MitoCarta3.0,literature,Matrix,Mitochondrial central dogma > mtDNA maintenanc...,Tnon_mito,...,,,,,,,,,,Mitochondria (Approved)


In [46]:
mitocarta3_0_mitochondrial["UniProt"].isna().value_counts()

UniProt
False    530
True       1
Name: count, dtype: int64

In [47]:
# drop rows with NaN UniProt IDs (just one)
mitocarta3_0_mitochondrial.dropna(subset=["UniProt"], inplace=True)
mitocarta3_0_mitochondrial

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mitocarta3_0_mitochondrial.dropna(subset=["UniProt"], inplace=True)


Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,liver_total_peak_intensity_log10,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability)
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.0,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported)
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported)
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported)
5,22273,7384.0,Uqcrc1,1110032G10Rik,ubiquinol-cytochrome c reductase core protein 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",MIM,"Protein import, sorting and homeostasis > Prot...",Tmito,...,10.0,10.8,10.3,10.1,10.6,10.2,10.3,10.2,9.9,Mitochondria (Supported)
7,56282,6182.0,Mrpl12,0610034O11Rik|1500031N16Rik|L12mt|MRP-|MRP-L12...,mitochondrial ribosomal protein L12,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Mitochondrial central dogma > mtRNA metabolism...,Tmito,...,9.3,8.9,8.6,9.3,9.2,9.8,9.0,9.4,9.0,Mitochondria (Supported)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,67759,55848.0,Plgrkt,1110007H22Rik|5033414D02Rik|AI852040|Plg-|Plg-...,"plasminogen receptor, C-terminal lysine transm...",MitoCarta3.0,GFP,Membrane,0,Tmito,...,,,6.9,8.6,9.3,9.5,8.1,9.5,8.7,Mitochondria (Uncertain)
1129,78248,51309.0,Armcx1,3010033I09Rik|ALEX1,"armadillo repeat containing, X-linked 1",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tnon_mito,...,,,,,,,,,,Mitochondria;Nucleoplasm (Approved)
1130,67416,9823.0,Armcx2,3230401N03Rik|AI043003|ALEX2,"armadillo repeat containing, X-linked 2",MitoCarta3.0,literature,Membrane,0,Tnon_mito,...,,,,,,,,,,Mitochondria (Approved)
1132,269642,339983.0,Nat8l,1110038O08Rik|Sh|Shati,N-acetyltransferase 8-like,MitoCarta3.0,literature,Membrane,Metabolism > Amino acid metabolism > Glutamate...,Tpossible_mito,...,,,,,,,,,,Mitochondria (Approved)


In [48]:
# get whole amino acid sequences for mitochondrial proteome
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#tqdm.pandas()
#mitocarta3_0_mitochondrial_completed_sequence = mitocarta3_0_mitochondrial.copy()
#mitocarta3_0_mitochondrial_completed_sequence["Complete Sequence"] = mitocarta3_0_mitochondrial_completed_sequence["UniProt"].progress_apply(get_full_protein_seq)
#mitocarta3_0_mitochondrial_completed_sequence

In [49]:
#mitocarta3_0_mitochondrial_completed_sequence.to_csv(os.path.join(datasets_path, "RvsS_full_mitochondrial_completed_sequence.csv"))

In [50]:
path = os.path.join(datasets_path, "RvsS_full_mitochondrial_completed_sequence.csv")
mitocarta3_0_mitochondrial_completed_sequence = pd.read_csv(path)
mitocarta3_0_mitochondrial_completed_sequence.set_index("Unnamed: 0", inplace=True)
mitocarta3_0_mitochondrial_completed_sequence.index.name = None
mitocarta3_0_mitochondrial_completed_sequence

Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability),Complete Sequence
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported),MAAAAASLRRTVLGPRGVGLPGASAPGLLGGARSRQLPLRTPQAVS...
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported),MRKMLAAVSRVLAGSAQKPASRVLVASRNFANDATFEIKKCDLHRL...
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported),MAVVAGLVRGPLRQASGLLKRRFHRSAPAAVQLTVREAINQGMDEE...
5,22273,7384.0,Uqcrc1,1110032G10Rik,ubiquinol-cytochrome c reductase core protein 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",MIM,"Protein import, sorting and homeostasis > Prot...",Tmito,...,10.8,10.3,10.1,10.6,10.2,10.3,10.2,9.9,Mitochondria (Supported),MAASAVCRAACSGTQVLLRTRRSPALLRLPALRGTATFAQALQSVP...
7,56282,6182.0,Mrpl12,0610034O11Rik|1500031N16Rik|L12mt|MRP-|MRP-L12...,mitochondrial ribosomal protein L12,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Mitochondrial central dogma > mtRNA metabolism...,Tmito,...,8.9,8.6,9.3,9.2,9.8,9.0,9.4,9.0,Mitochondria (Supported),MLPVAASRCLWGPRLGLRGAALRLARQQMPSVCAARQLRSSSHRRS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,67759,55848.0,Plgrkt,1110007H22Rik|5033414D02Rik|AI852040|Plg-|Plg-...,"plasminogen receptor, C-terminal lysine transm...",MitoCarta3.0,GFP,Membrane,0,Tmito,...,,6.9,8.6,9.3,9.5,8.1,9.5,8.7,Mitochondria (Uncertain),MGFIFSKSMNENMKNQQEFMVTHARLQLERHLTMQNEMRERQMAMQ...
1129,78248,51309.0,Armcx1,3010033I09Rik|ALEX1,"armadillo repeat containing, X-linked 1",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tnon_mito,...,,,,,,,,,Mitochondria;Nucleoplasm (Approved),MGRTREAGCVAAGMVIGAGACYCVYRLTWGKDENEKLWDEEEEEEE...
1130,67416,9823.0,Armcx2,3230401N03Rik|AI043003|ALEX2,"armadillo repeat containing, X-linked 2",MitoCarta3.0,literature,Membrane,0,Tnon_mito,...,,,,,,,,,Mitochondria (Approved),MSRARDAGCVAAGIVIGASAWYCVYKYTRGKDQKKKRLTKPKNRAS...
1132,269642,339983.0,Nat8l,1110038O08Rik|Sh|Shati,N-acetyltransferase 8-like,MitoCarta3.0,literature,Membrane,Metabolism > Amino acid metabolism > Glutamate...,Tpossible_mito,...,,,,,,,,,Mitochondria (Approved),MHCGPPDMVCETKIVATEDHEALPGAKKDALLVAAGAMWPPLPAAP...


In [51]:
# NOTE: sequence length from database doesn't exactly match up with length of sequence as determined by UniProtID - weird
(mitocarta3_0_mitochondrial_completed_sequence["ProteinLength"] - mitocarta3_0_mitochondrial_completed_sequence["Complete Sequence"].str.len()).value_counts()

 0      491
 1        6
 2        3
 6        3
 18       2
 19       2
 27       1
 56       1
-7        1
-1        1
 23       1
 68       1
 36       1
 10       1
 24       1
 14       1
-29       1
-107      1
 15       1
 45       1
 3        1
 4        1
 144      1
 9        1
 364      1
 7        1
 5        1
-68       1
-114      1
Name: count, dtype: int64

# Download Alphafold Data - Full Mitochondrial Proteome

In [52]:
# path for alphafold protein data
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir = os.path.join(alphafold_path, "cif")
pae_dir = os.path.join(alphafold_path, "pae")

print(alphafold_path)
print(cif_dir)
print(pae_dir)

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [53]:
# NOTE: these IDs are invalid from anAalphafold perspective - they are the secondary UniProtIDs, which was fine for querying UniProt, but not Alphafold
# so, manually impute these IDs with their primary ones
# invalid_proteins_cif -> ['F7C846', 'Q9JLT4', 'Q3UW66', 'Q91XR9']

replace_dict = {'F7C846': 'Q8R5C0', 'Q3UW66': 'Q99J99', 'Q91XR9': 'O70325'} 
# Q9JLT4 is weird - seems to be primary ID, but not in Alphafold
# Q91XR9 / O70325 is weird - O70325 exists in UniProt, but not Alphafold

mitocarta3_0_mitochondrial_completed_sequence["UniProt-Primary"] = mitocarta3_0_mitochondrial_completed_sequence["UniProt"].replace(replace_dict)
mitocarta3_0_mitochondrial_completed_sequence = mitocarta3_0_mitochondrial_completed_sequence.drop(mitocarta3_0_mitochondrial_completed_sequence[mitocarta3_0_mitochondrial_completed_sequence["UniProt"] == "Q9JLT4"].index)
mitocarta3_0_mitochondrial_completed_sequence = mitocarta3_0_mitochondrial_completed_sequence.drop(mitocarta3_0_mitochondrial_completed_sequence[mitocarta3_0_mitochondrial_completed_sequence["UniProt-Primary"] == "O70325"].index)

In [54]:
# set uniprot IDs to use
uniprotIDs_fullproteome = mitocarta3_0_mitochondrial_completed_sequence["UniProt-Primary"].values
uniprotIDs_fullproteome

array(['Q9D0M3', 'P35486', 'Q9D051', 'Q9CZ13', 'Q9DB15', 'Q8BJZ4',
       'Q9CR68', 'Q8K2B3', 'Q9CQA3', 'Q03265', 'Q8BTE0', 'Q9CXT8',
       'Q9DB77', 'Q60597', 'Q9CQ69', 'P08249', 'P53395', 'Q8BMF4',
       'Q6PB66', 'Q9D1H8', 'Q8K0D5', 'Q99LP6', 'Q8BMS4', 'Q8VEM8',
       'Q9DC71', 'P56480', 'Q8VE22', 'Q99LC5', 'P97807', 'Q8BK72',
       'Q9D6R2', 'Q91VD9', 'Q8JZQ2', 'Q9DC61', 'Q9Z2I9', 'Q8VDC0',
       'Q9DB20', 'Q9EQI8', 'Q8BJ03', 'Q9CQJ1', 'P62073', 'Q8BH95',
       'Q8BMS1', 'Q9WUM5', 'Q9CZU6', 'P43024', 'Q07417', 'Q9D6J6',
       'Q8BIJ6', 'Q06185', 'Q91VA7', 'Q9Z2I0', 'Q5RL20', 'Q8K4F5',
       'Q9D880', 'Q9D1B9', 'Q8C6I2', 'Q9D6J5', 'O08749', 'Q91WD5',
       'Q91YT0', 'P58281', 'Q8CAQ8', 'Q78IK4', 'Q8CGK3', 'Q924L1',
       'Q91YJ5', 'Q8K3J1', 'Q9D773', 'Q8R2Q4', 'Q9WV98', 'Q8K411',
       'P45952', 'Q80YD1', 'Q99N96', 'Q99KI0', 'Q8BGH2', 'Q924T2',
       'Q9CY73', 'P67778', 'Q8C3X4', 'Q8BKF1', 'Q91VC9', 'P62075',
       'P52825', 'Q3TBW2', 'P97450', 'Q8CAK1', 'Q9Z2Q5', 'Q8QZ

In [55]:
# download cif data for proteins
# SLOW THE FIRST TIME
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprotIDs_fullproteome,
    out_folder=cif_dir
)

100%|██████████| 528/528 [00:00<00:00, 190600.96it/s]

2024-04-25 21:21:19> Valid proteins: 0
2024-04-25 21:21:19> Invalid proteins: 0
2024-04-25 21:21:19> Existing proteins: 528





In [56]:
# download pae data for proteins
# SLOW THE FIRST TIME
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprotIDs_fullproteome,
    out_folder=pae_dir, 
)

100%|██████████| 528/528 [00:00<00:00, 182270.99it/s]

2024-04-25 21:21:19> Valid proteins: 0
2024-04-25 21:21:19> Invalid proteins: 0
2024-04-25 21:21:19> Existing proteins: 528





# Construct Alphafold Dataframe (Calculate Accessibilities) - Full Mitochondrial Proteome

In [57]:
# format alphafold data into dataframe
alphafold_annotation_full = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=uniprotIDs_fullproteome)
alphafold_annotation_full

  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='i

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,38.096,37.222,39.345,unstructured,unstructured,0,0,0,0,1
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,38.724,39.162,38.463,unstructured,unstructured,0,0,0,0,1
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,36.289,36.675,37.473,unstructured,unstructured,0,0,0,0,1
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,33.884,32.424,34.597,unstructured,unstructured,0,0,0,0,1
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,34.618,35.575,34.658,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,-27.747,-27.920,-26.354,HELX_RH_AL_P,HELX,0,1,0,0,0
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,-27.713,-26.580,-27.423,HELX_RH_AL_P,HELX,0,1,0,0,0
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,-27.292,-26.038,-27.216,HELX_RH_AL_P,HELX,0,1,0,0,0
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,-30.321,-30.226,-29.116,HELX_RH_AL_P,HELX,0,1,0,0,0


In [58]:
# calculate full sphere exposure -> radius = 24
full_sphere_exposure_24 = annotate_accessibility(
    df=alphafold_annotation_full, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_24

100%|██████████| 528/528 [00:05<00:00, 101.72it/s]


Unnamed: 0,protein_id,AA,position,nAA_24_180_pae
0,A2ADF7,M,1,4
1,A2ADF7,K,2,6
2,A2ADF7,P,3,6
3,A2ADF7,T,4,8
4,A2ADF7,Q,5,9
...,...,...,...,...
197263,S4R2K0,M,227,15
197264,S4R2K0,E,228,13
197265,S4R2K0,V,229,11
197266,S4R2K0,N,230,9


In [59]:
alphafold_accessibility_full = alphafold_annotation_full.merge(
    full_sphere_exposure_24, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_full

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,37.222,39.345,unstructured,unstructured,0,0,0,0,1,4
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,39.162,38.463,unstructured,unstructured,0,0,0,0,1,6
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,36.675,37.473,unstructured,unstructured,0,0,0,0,1,6
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,32.424,34.597,unstructured,unstructured,0,0,0,0,1,8
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,35.575,34.658,unstructured,unstructured,0,0,0,0,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,-27.920,-26.354,HELX_RH_AL_P,HELX,0,1,0,0,0,15
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,-26.580,-27.423,HELX_RH_AL_P,HELX,0,1,0,0,0,13
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,-26.038,-27.216,HELX_RH_AL_P,HELX,0,1,0,0,0,11
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,-30.226,-29.116,HELX_RH_AL_P,HELX,0,1,0,0,0,9


In [60]:
# calculate full sphere exposure -> angle = 70, radius = 12
part_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation_full, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)
part_sphere_exposure

  0%|          | 0/528 [00:00<?, ?it/s]

100%|██████████| 528/528 [00:02<00:00, 198.35it/s]


Unnamed: 0,protein_id,AA,position,nAA_12_70_pae
0,A2ADF7,M,1,0
1,A2ADF7,K,2,0
2,A2ADF7,P,3,0
3,A2ADF7,T,4,0
4,A2ADF7,Q,5,0
...,...,...,...,...
197263,S4R2K0,M,227,0
197264,S4R2K0,E,228,1
197265,S4R2K0,V,229,1
197266,S4R2K0,N,230,2


In [61]:
alphafold_accessibility_full = alphafold_accessibility_full.merge(
    part_sphere_exposure, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_full

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,39.345,unstructured,unstructured,0,0,0,0,1,4,0
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,38.463,unstructured,unstructured,0,0,0,0,1,6,0
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,37.473,unstructured,unstructured,0,0,0,0,1,6,0
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,34.597,unstructured,unstructured,0,0,0,0,1,8,0
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,34.658,unstructured,unstructured,0,0,0,0,1,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,-26.354,HELX_RH_AL_P,HELX,0,1,0,0,0,15,0
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,-27.423,HELX_RH_AL_P,HELX,0,1,0,0,0,13,1
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,-27.216,HELX_RH_AL_P,HELX,0,1,0,0,0,11,1
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,-29.116,HELX_RH_AL_P,HELX,0,1,0,0,0,9,2


In [62]:
# calculate full sphere exposure -> radius = 6
full_sphere_exposure_6 = annotate_accessibility(
    df=alphafold_annotation_full, 
    max_dist=6, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_6

  0%|          | 0/528 [00:00<?, ?it/s]

100%|██████████| 528/528 [00:02<00:00, 257.23it/s]


Unnamed: 0,protein_id,AA,position,nAA_6_180_pae
0,A2ADF7,M,1,1
1,A2ADF7,K,2,2
2,A2ADF7,P,3,2
3,A2ADF7,T,4,2
4,A2ADF7,Q,5,2
...,...,...,...,...
197263,S4R2K0,M,227,2
197264,S4R2K0,E,228,2
197265,S4R2K0,V,229,2
197266,S4R2K0,N,230,2


In [63]:
alphafold_accessibility_full = alphafold_accessibility_full.merge(
    full_sphere_exposure_6, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_full

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,unstructured,unstructured,0,0,0,0,1,4,0,1
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,unstructured,unstructured,0,0,0,0,1,6,0,2
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,unstructured,unstructured,0,0,0,0,1,6,0,2
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,unstructured,unstructured,0,0,0,0,1,8,0,2
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,unstructured,unstructured,0,0,0,0,1,9,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,HELX_RH_AL_P,HELX,0,1,0,0,0,15,0,2
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,HELX_RH_AL_P,HELX,0,1,0,0,0,13,1,2
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,HELX_RH_AL_P,HELX,0,1,0,0,0,11,1,2
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,HELX_RH_AL_P,HELX,0,1,0,0,0,9,2,2


In [64]:
# calculate full sphere exposure -> radius = 12
full_sphere_exposure_12 = annotate_accessibility(
    df=alphafold_annotation_full, 
    max_dist=12, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_12

100%|██████████| 528/528 [00:02<00:00, 198.32it/s]


Unnamed: 0,protein_id,AA,position,nAA_12_180_pae
0,A2ADF7,M,1,2
1,A2ADF7,K,2,3
2,A2ADF7,P,3,4
3,A2ADF7,T,4,4
4,A2ADF7,Q,5,4
...,...,...,...,...
197263,S4R2K0,M,227,6
197264,S4R2K0,E,228,6
197265,S4R2K0,V,229,5
197266,S4R2K0,N,230,5


In [65]:
alphafold_accessibility_full = alphafold_accessibility_full.merge(
    full_sphere_exposure_12, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_full

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,unstructured,0,0,0,0,1,4,0,1,2
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,unstructured,0,0,0,0,1,6,0,2,3
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,unstructured,0,0,0,0,1,6,0,2,4
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,unstructured,0,0,0,0,1,8,0,2,4
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,unstructured,0,0,0,0,1,9,0,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,HELX,0,1,0,0,0,15,0,2,6
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,HELX,0,1,0,0,0,13,1,2,6
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,HELX,0,1,0,0,0,11,1,2,5
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,HELX,0,1,0,0,0,9,2,2,5


In [66]:
# calculate full sphere exposure -> radius = 18
full_sphere_exposure_18 = annotate_accessibility(
    df=alphafold_annotation_full, 
    max_dist=18, 
    max_angle=180, 
    error_dir=pae_dir)
full_sphere_exposure_18

100%|██████████| 528/528 [00:03<00:00, 140.63it/s]


Unnamed: 0,protein_id,AA,position,nAA_18_180_pae
0,A2ADF7,M,1,3
1,A2ADF7,K,2,4
2,A2ADF7,P,3,5
3,A2ADF7,T,4,6
4,A2ADF7,Q,5,6
...,...,...,...,...
197263,S4R2K0,M,227,9
197264,S4R2K0,E,228,8
197265,S4R2K0,V,229,8
197266,S4R2K0,N,230,6


In [67]:
alphafold_accessibility_full = alphafold_accessibility_full.merge(
    full_sphere_exposure_18, how='left', on=['protein_id','AA','position'])
alphafold_accessibility_full

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,0,0,0,0,1,4,0,1,2,3
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,0,0,0,0,1,6,0,2,3,4
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,0,0,0,0,1,6,0,2,4,5
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,0,0,0,0,1,8,0,2,4,6
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,0,0,0,0,1,9,0,2,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,0,1,0,0,0,15,0,2,6,9
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,0,1,0,0,0,13,1,2,6,8
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,0,1,0,0,0,11,1,2,5,8
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,0,1,0,0,0,9,2,2,5,6


In [68]:
alphafold_accessibility_full_smooth = get_smooth_score(
    alphafold_accessibility_full, 
    np.array(['nAA_6_180_pae', 'nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae']), 
    [10])
alphafold_accessibility_full_smooth

100%|██████████| 528/528 [00:00<00:00, 1494.96it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,unstructured,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1,4,0,1,2,3,1.909091,3.727273,5.545455,8.454545
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,1,6,0,2,3,4,1.916667,3.750000,5.666667,8.666667
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,1,6,0,2,4,5,1.923077,3.769231,5.769231,8.769231
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,1,8,0,2,4,6,1.928571,3.857143,5.857143,8.714286
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,1,9,0,2,4,6,1.933333,3.866667,5.866667,8.733333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,0,15,0,2,6,9,2.066667,10.133333,20.333333,38.200000
227,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,0,13,1,2,6,8,2.071429,9.500000,19.357143,35.785714
228,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,0,11,1,2,5,8,2.076923,9.153846,18.615385,34.000000
229,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,0,9,2,2,5,6,2.083333,9.250000,18.250000,33.250000


In [69]:
alphafold_accessibility_full_smooth['IDR'] = np.where(
    alphafold_accessibility_full_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
alphafold_accessibility_full_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,4,0,1,2,3,1.909091,3.727273,5.545455,8.454545,1
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,6,0,2,3,4,1.916667,3.750000,5.666667,8.666667,1
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,6,0,2,4,5,1.923077,3.769231,5.769231,8.769231,1
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,8,0,2,4,6,1.928571,3.857143,5.857143,8.714286,1
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,9,0,2,4,6,1.933333,3.866667,5.866667,8.733333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,15,0,2,6,9,2.066667,10.133333,20.333333,38.200000,0
227,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,13,1,2,6,8,2.071429,9.500000,19.357143,35.785714,0
228,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,11,1,2,5,8,2.076923,9.153846,18.615385,34.000000,1
229,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,9,2,2,5,6,2.083333,9.250000,18.250000,33.250000,1


# ????? Merge Dataframes into Full Dataset (Includes Alphafold) - Full Mitochondrial Proteome

In [70]:
# filter out table to only include methionines
mitocarta3_0_methionines = alphafold_accessibility_full_smooth[alphafold_accessibility_full_smooth["AA"] == "M"]
mitocarta3_0_methionines

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_24_180_pae,nAA_12_70_pae,nAA_6_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,4,0,1,2,3,1.909091,3.727273,5.545455,8.454545,1
7,A2ADF7,1,M,8,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,10,0,2,4,7,1.944444,4.000000,6.555556,10.500000,1
11,A2ADF7,1,M,12,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,11,0,2,4,7,2.000000,5.000000,9.142857,17.666667,1
16,A2ADF7,1,M,17,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,15,0,2,4,9,2.142857,7.666667,17.238095,32.380952,1
96,A2ADF7,1,M,97,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,121,5,2,23,54,2.428571,19.047619,51.666667,104.095238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,S4R2K0,528,M,91,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,135,14,2,28,66,2.285714,25.428571,66.619048,119.285714,0
132,S4R2K0,528,M,133,98.22,14.774,14.306,13.644,15.422,-7.821,...,92,7,3,23,51,2.428571,18.952381,49.142857,89.476190,0
203,S4R2K0,528,M,204,98.79,0.508,1.302,2.674,1.468,3.585,...,154,7,2,32,98,2.333333,28.714286,82.714286,134.619048,0
215,S4R2K0,528,M,216,98.03,3.940,3.358,2.382,2.708,12.422,...,93,9,3,20,46,2.333333,18.619048,46.714286,87.380952,0


In [71]:
mitocarta3_0_methionines["position"] = mitocarta3_0_methionines["position"] - 1 # zero-index the positions to match initial dataframe

mitocarta3_0_methionines_with_alphafold = mitocarta3_0_methionines.merge(
    mitocarta3_0_mitochondrial_completed_sequence[["UniProt", "UniProt-Primary", "Complete Sequence"]], 
    how="left", 
    left_on="protein_id", 
    right_on="UniProt-Primary"
)
mitocarta3_0_methionines_with_alphafold

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mitocarta3_0_methionines["position"] = mitocarta3_0_methionines["position"] - 1 # zero-index the positions to match initial dataframe


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR,UniProt,UniProt-Primary,Complete Sequence
0,A2ADF7,1,M,0,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,2,3,1.909091,3.727273,5.545455,8.454545,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
1,A2ADF7,1,M,7,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,4,7,1.944444,4.000000,6.555556,10.500000,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
2,A2ADF7,1,M,11,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,4,7,2.000000,5.000000,9.142857,17.666667,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
3,A2ADF7,1,M,16,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,4,9,2.142857,7.666667,17.238095,32.380952,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
4,A2ADF7,1,M,96,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,23,54,2.428571,19.047619,51.666667,104.095238,0,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,S4R2K0,528,M,90,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,28,66,2.285714,25.428571,66.619048,119.285714,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4844,S4R2K0,528,M,132,98.22,14.774,14.306,13.644,15.422,-7.821,...,23,51,2.428571,18.952381,49.142857,89.476190,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4845,S4R2K0,528,M,203,98.79,0.508,1.302,2.674,1.468,3.585,...,32,98,2.333333,28.714286,82.714286,134.619048,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4846,S4R2K0,528,M,215,98.03,3.940,3.358,2.382,2.708,12.422,...,20,46,2.333333,18.619048,46.714286,87.380952,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...


In [72]:
#mitocarta3_0_methionines_with_alphafold.to_csv(os.path.join(datasets_path, "RvsS_full_mitochondrial_with_alphafold.csv"))

In [73]:
path = os.path.join(datasets_path, "RvsS_full_mitochondrial_with_alphafold.csv")
mitocarta3_0_methionines_with_alphafold = pd.read_csv(path)
mitocarta3_0_methionines_with_alphafold.set_index("Unnamed: 0", inplace=True)
mitocarta3_0_methionines_with_alphafold.index.name = None
mitocarta3_0_methionines_with_alphafold

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_12_180_pae,nAA_18_180_pae,nAA_6_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,IDR,UniProt,UniProt-Primary,Complete Sequence
0,A2ADF7,1,M,0,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,2,3,1.909091,3.727273,5.545455,8.454545,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
1,A2ADF7,1,M,7,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,4,7,1.944444,4.000000,6.555556,10.500000,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
2,A2ADF7,1,M,11,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,4,7,2.000000,5.000000,9.142857,17.666667,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
3,A2ADF7,1,M,16,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,4,9,2.142857,7.666667,17.238095,32.380952,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
4,A2ADF7,1,M,96,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,23,54,2.428571,19.047619,51.666667,104.095238,0,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,S4R2K0,528,M,90,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,28,66,2.285714,25.428571,66.619048,119.285714,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4844,S4R2K0,528,M,132,98.22,14.774,14.306,13.644,15.422,-7.821,...,23,51,2.428571,18.952381,49.142857,89.476190,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4845,S4R2K0,528,M,203,98.79,0.508,1.302,2.674,1.468,3.585,...,32,98,2.333333,28.714286,82.714286,134.619048,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4846,S4R2K0,528,M,215,98.03,3.940,3.358,2.382,2.708,12.422,...,20,46,2.333333,18.619048,46.714286,87.380952,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
