## Imports

In [1]:
# General imports 
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import requests as r
from Bio import SeqIO
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

# Import structuremap functions
import structuremap.utils
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score

structuremap.utils.set_logger()

In [2]:
# Analysis parameters
# Mass tags exactly as they appear inside the square brackets of a modified peptide string
light_modification = "649.3660"
heavy_modification = "655.3735"

# Tags to search for; they are turned into a regex alternation further down
modifications = [light_modification, heavy_modification]

# Number of amino acids to analyze on either side of the methionine
analysis_threshold = 20

## Load Dataset - Labeled Methionines

In [3]:
# Resolve the notebook directory and the shared data directory to absolute paths
curr_dir_path_str = "./"
global_data_path_str = "../global_data"

curr_dir_path, global_data_path = (
    os.path.abspath(relative) for relative in (curr_dir_path_str, global_data_path_str)
)

print(f"Current Directory: {curr_dir_path}")
print(f"Global Data Directory: {global_data_path}")

Current Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/Mitochondria
Global Data Directory: /Users/ritwiksrinivas/Desktop/Projects/MetML/global_data


In [4]:
# Read the peptide table from worksheet "Sheet2" of the workbook stored in the notebook directory
data_loc = os.path.join(curr_dir_path, "Mitochondria_DataSet.xlsx")
peptides = pd.read_excel(io=data_loc, sheet_name="Sheet2")
peptides

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,Unnamed: 24,Protein ID.1
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,,Q8C196
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,,Q07417
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,,Q91YI0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,,P50247
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,,P33267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,,Q9QXF8
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,,Q8K3J1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,,P63038
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,,Q8BMS1


In [5]:
# Canonicalize data - drop the two surplus columns ("Unnamed: 24" and the duplicate "Protein ID.1").
# Rebinding the result with errors="ignore" (instead of an inplace drop) keeps this cell
# idempotent: running it a second time no longer raises KeyError for already-removed columns.
peptides = peptides.drop(columns=["Unnamed: 24", "Protein ID.1"], errors="ignore")
peptides;

In [6]:
# Manual labeling of peptides: the first 39 rows are "green", the next 143 "white",
# and the final 21 "yellow" (labels are assigned purely by row order).
label_col_data = ["green"] * 39 + ["white"] * 143 + ["yellow"] * 21

# Fail fast if the hard-coded counts no longer match the table. Without this check the
# index-aligned assignment below would silently leave NaN labels or discard surplus ones.
assert len(label_col_data) == len(peptides), (
    f"{len(label_col_data)} manual labels for {len(peptides)} peptide rows"
)

# Attach the labels positionally by reusing the table's own index
label_col = pd.Series(label_col_data, index=peptides.index)
peptides["label"] = label_col

#pd.set_option("display.max_rows", None)
#display(peptides)
#pd.reset_option("display.max_rows")
peptides;

In [7]:
# Distinct values of the "Protein ID" column, in order of first appearance
unique_uniprotIDs = peptides["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['Q8C196' 'Q07417' 'Q91YI0' 'P50247' 'P33267' 'P97872' 'Q9DBT9' 'P32020'
 'P50136' 'P26443' 'Q91VS7' 'Q920A5' 'O55125' 'Q05920' 'Q91VR2' 'P51881'
 'P56480' 'Q61425' 'P47962' 'P53395' 'Q921G7' 'Q9CQR4' 'P63017' 'Q62425'
 'P20029' 'O35423' 'Q61102' 'Q9WUR2' 'Q9CQQ7' 'P55096' 'P01942' 'O35490'
 'P38647' 'Q60991' 'Q9Z2I8' 'Q9EQ20' 'Q9Z1J3' 'Q5U458' 'O35129' 'Q99JY0'
 'Q9WTP6' 'Q99LC5' 'P16460' 'Q9CZW5' 'P08226' 'Q03265' 'P63038' 'Q8CAQ8'
 'O35386' 'Q9Z2I0' 'P54869' 'P97742' 'Q9DCN1' 'Q9WUM5' 'P60710' 'P68033'
 'Q9CQ69' 'Q9DBJ1' 'Q9CR62' 'P19783' 'Q8QZY2' 'O70579' 'Q9CQN1' 'Q8BH95'
 'P51660' 'Q8BGY7' 'Q8CC88' 'Q9CRB9' 'Q9Z0X1' 'Q9QXD1' 'Q2TPA8' 'P24270'
 'Q9CPQ8' 'Q80XN0' 'Q925I1' 'Q8CIM7' 'Q9Z1P6' 'Q8VC30' 'Q9R0H0' 'P51658'
 'P16332' 'Q8VDN2' 'Q8BJ64' 'P42125' 'Q99MR8' 'P29758' 'Q5FW57' 'P62821'
 'P52825' 'Q9JKR6' 'Q99KR3' 'Q61733' 'Q61335' 'Q7TNG8' 'Q9DCM2' 'Q8CFA2'
 'Q64433' 'P45952' 'P08249' 'O35459' 'P97450' 'Q9CYR0' 'Q64310' 'Q8K1Z0'
 'Q9CPU0' 'Q80XL6' 'P97807' 'P

In [8]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    """Download the FASTA record of a UniProt entry and return its sequence as a string.

    Parameters
    ----------
    cID : str
        UniProt accession, e.g. "Q8C196".

    Returns
    -------
    str
        Sequence of the first record in the returned FASTA text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. unknown accession).
    ValueError
        If the response contains no FASTA record.
    """
    # Current UniProt REST endpoint, fetched over HTTPS
    currentUrl = "https://rest.uniprot.org/uniprotkb/" + cID + ".fasta"

    # Retrieving an entry is a read, so use GET; the timeout keeps a stalled
    # connection from hanging a long batch of lookups.
    response = r.get(currentUrl, timeout=30)
    response.raise_for_status()

    pSeq = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not pSeq:
        raise ValueError("No FASTA record returned for UniProt ID " + repr(cID))

    return str(pSeq[0].seq)

In [9]:
# Initialize and update sequence cache df: get complete AA sequences for relevant proteins
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE

#unknown_sequences_df = pd.DataFrame({"Protein ID": unique_uniprotIDs})
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = unknown_sequences_df
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [10]:
# Load the cached UniProt ID -> complete AA sequence table.
# The CSV's "Unnamed: 0" column becomes the (unnamed) index.
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
sequence_cache_df_updated

Unnamed: 0,Protein ID,Complete Sequence
0,Q8C196,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,Q07417,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2,Q91YI0,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3,P50247,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4,P33267,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
...,...,...
28,Q9QX47,MAADIEQVFRSFVVSKFREIQQELSSGRSEGQLNGETNPPIEGNQA...
29,P10605,MWWSLILLSCLLALTSAHDKPSFHPLSDDLINYINKQNTTWQAGRN...
30,P38060,MASVRKAFPRRLVGLTSLRAVSTSSMGTLPKQVKIVEVGPRDGLQN...
31,P11499,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...


In [11]:
# Attach the complete protein sequence to every peptide row via its "Protein ID"
peptides_cs = peptides.merge(sequence_cache_df_updated, how="left", on="Protein ID")

# A left merge never fails on its own: IDs missing from the cache produce NaN sequences and
# duplicated cache entries multiply peptide rows. Stop here with a clear message instead of
# letting later positional steps break or misalign.
missing_sequence_ids = peptides_cs.loc[peptides_cs["Complete Sequence"].isna(), "Protein ID"].unique()
assert len(missing_sequence_ids) == 0, (
    f"No cached sequence for Protein ID(s): {list(missing_sequence_ids)}"
)
assert len(peptides_cs) == len(peptides), (
    "Sequence cache contains duplicate Protein IDs; the merge changed the number of peptide rows"
)

peptides_cs # cs means "complete sequence"

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label,Complete Sequence
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...


In [12]:
# Create regex pattern to identify desired modifications
def create_modifications_pattern(modifications):
    """Build a regex alternation matching a methionine tagged with any of the given masses.

    Each entry becomes ``M\\[<tag>\\]`` with every literal "." in the tag escaped, and the
    pieces are joined with "|". For example ``["649.3660", "655.3735"]`` gives
    ``M\\[649\\.3660\\]|M\\[655\\.3735\\]``.

    Parameters
    ----------
    modifications : sequence of str
        Mass tags as written inside the square brackets of a modified peptide.
        Tags without a decimal point (e.g. "16") are accepted as well.

    Returns
    -------
    str
        The combined regex pattern.

    Raises
    ------
    ValueError
        If ``modifications`` is empty (an empty pattern would match every string).
    """
    if len(modifications) == 0:
        raise ValueError("modifications must contain at least one mass tag")

    # Only the decimal point is escaped, so any other characters in a tag are kept as given
    return "|".join(
        r"M\[{}\]".format(tag.replace(".", r"\.")) for tag in modifications
    )

# Build the pattern once from the configured mass tags and print it for inspection
modifications_pattern = create_modifications_pattern(modifications)
print(modifications_pattern)

M\[649\.3660\]|M\[655\.3735\]


In [13]:
# Start index of each peptide inside its complete protein sequence
# (str.find is 0-based and yields -1 when the peptide does not occur)
peptides_cs["Sequence Location"] = pd.Series([
    full_sequence.find(peptide)
    for full_sequence, peptide in zip(
        peptides_cs["Complete Sequence"], peptides_cs["Peptide Sequence"]
    )
])
peptides_cs;

In [14]:
# Number of characters in each unmodified peptide sequence
peptides_cs["Sequence Length"] = peptides_cs["Peptide Sequence"].str.len()
peptides_cs;

In [15]:
# Sanity check - ensure sequence indexing is correct: slicing the complete sequence at
# "Sequence Location" for "Sequence Length" characters must reproduce the peptide.
temp = [A[B:B+C] for A, B, C in zip(peptides_cs["Complete Sequence"], peptides_cs["Sequence Location"], peptides_cs["Sequence Length"])]
sequence_index_ok = (temp == peptides_cs["Peptide Sequence"])

# Stop the notebook on any mismatch (e.g. a -1 location from a peptide that was not found)
# rather than relying on someone reading the counts below.
assert sequence_index_ok.all(), (
    f"{int((~sequence_index_ok).sum())} peptide(s) are not located correctly in their protein sequence"
)
sequence_index_ok.value_counts()

Peptide Sequence
True    203
Name: count, dtype: int64

In [16]:
# Extract left prefix of modified methionine (for indexing purposes)

IUPACCodes = "ACDEFGHIKLMNPQRSTVWY"


def filtering(string):
    """Return `string` reduced to the characters that occur in IUPACCodes."""
    return ''.join(char for char in string if char in IUPACCodes)


# Group 1 captures everything before a tagged methionine. `.*` is greedy, so when the
# pattern occurs more than once the capture extends up to the last occurrence.
left_prefix_pattern = f"(.*)({modifications_pattern})"
print(left_prefix_pattern)

# Take the captured prefix of the light-labelled peptide and strip non-residue characters
peptides_cs["Left Prefix"] = (
    peptides_cs["Light Modified Peptide"]
    .str.extract(left_prefix_pattern)[0]
    .map(filtering)
)
peptides_cs["Left Prefix Length"] = peptides_cs["Left Prefix"].str.len()

peptides_cs;

(.*)(M\[649\.3660\]|M\[655\.3735\])


In [17]:
# Index of the tagged methionine within the complete sequence:
# peptide start index plus the number of residues that precede the methionine in the peptide
peptides_cs["Methionine Location"] = peptides_cs["Sequence Location"] + peptides_cs["Left Prefix Length"]
peptides_cs;

In [18]:
# Sanity check - ensure methionine locations are correct: the residue at every
# "Methionine Location" of the complete sequence must be "M".
temp = [A[B] for A, B in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])]

# Stop the notebook on a mismatch instead of only displaying a boolean
assert temp.count("M") == len(temp), (
    f"{len(temp) - temp.count('M')} computed methionine location(s) do not point at an 'M'"
)
temp.count("M") == len(temp)

True

In [19]:
# Compute left/right analysis sequences based on threshold.
# Left window: up to `analysis_threshold` residues ending directly before the methionine.
# The start is clamped at 0, so a methionine close to the N-terminus gets all residues in
# front of it. The previous `A[0:B-1]` branch dropped the neighbouring residue and, for a
# methionine at index 0, returned nearly the whole protein through the -1 slice end.
peptides_cs[f"Left {analysis_threshold}"] = [
    sequence[max(0, met_index - analysis_threshold):met_index]
    for sequence, met_index in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
# Right window: up to `analysis_threshold` residues starting directly after the methionine
# (slicing past the end of the string simply yields a shorter window).
peptides_cs[f"Right {analysis_threshold}"] = [
    sequence[met_index + 1:met_index + 1 + analysis_threshold]
    for sequence, met_index in zip(peptides_cs["Complete Sequence"], peptides_cs["Methionine Location"])
]
peptides_cs

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,FAVESMEDALKAADTIGYPV,IRSAYALGGLGSGICPNKET
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2,264,DCRIPKENLLGEPGMGFKIA,QTLDMGRIGIASQALGIAQA
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8,20,MASESGKLWGGRFVGAVDPI,EKFNSSISYDRHLWNVDVQG
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6,418,LGKLNVKLTKLTEKQAQYLG,PINGPFKPDHYRY
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6,364,SMPYTDAVIHEVQRFADVIP,NLPHRVTRDTPFRGFLIPKG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2,215,KSDLTKDITTSVLTVNNKAH,VTLDYTVQVPGTGRDGSPGF
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8,49,SAVAATYKYVNKKEQESEVD,KSATDNAARILMWTELIRGL
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3,144,KEGFEKISKGANPVEIRRGV,LAVDAVIAELKKQSKPVTTP
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0,505,AVSKRPEKVIGMHYFSPVDK,QLLEIITTDKTSKDTTASAV


## Download Alphafold Data - Labeled Methionines

In [20]:
# Absolute locations of the alphafold data folder and its "cif" / "pae" subfolders
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir, pae_dir = (os.path.join(alphafold_path, subfolder) for subfolder in ("cif", "pae"))

print(alphafold_path, cif_dir, pae_dir, sep="\n")

/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


In [21]:
# UniProt IDs to fetch structures for: the distinct "Protein ID" values of the merged table
unique_uniprotIDs = peptides_cs["Protein ID"].unique()
print(f"Unique UniProt IDs: \n{unique_uniprotIDs}")
print(f"Number of Unique UniProt IDs: {unique_uniprotIDs.size}")

Unique UniProt IDs: 
['Q8C196' 'Q07417' 'Q91YI0' 'P50247' 'P33267' 'P97872' 'Q9DBT9' 'P32020'
 'P50136' 'P26443' 'Q91VS7' 'Q920A5' 'O55125' 'Q05920' 'Q91VR2' 'P51881'
 'P56480' 'Q61425' 'P47962' 'P53395' 'Q921G7' 'Q9CQR4' 'P63017' 'Q62425'
 'P20029' 'O35423' 'Q61102' 'Q9WUR2' 'Q9CQQ7' 'P55096' 'P01942' 'O35490'
 'P38647' 'Q60991' 'Q9Z2I8' 'Q9EQ20' 'Q9Z1J3' 'Q5U458' 'O35129' 'Q99JY0'
 'Q9WTP6' 'Q99LC5' 'P16460' 'Q9CZW5' 'P08226' 'Q03265' 'P63038' 'Q8CAQ8'
 'O35386' 'Q9Z2I0' 'P54869' 'P97742' 'Q9DCN1' 'Q9WUM5' 'P60710' 'P68033'
 'Q9CQ69' 'Q9DBJ1' 'Q9CR62' 'P19783' 'Q8QZY2' 'O70579' 'Q9CQN1' 'Q8BH95'
 'P51660' 'Q8BGY7' 'Q8CC88' 'Q9CRB9' 'Q9Z0X1' 'Q9QXD1' 'Q2TPA8' 'P24270'
 'Q9CPQ8' 'Q80XN0' 'Q925I1' 'Q8CIM7' 'Q9Z1P6' 'Q8VC30' 'Q9R0H0' 'P51658'
 'P16332' 'Q8VDN2' 'Q8BJ64' 'P42125' 'Q99MR8' 'P29758' 'Q5FW57' 'P62821'
 'P52825' 'Q9JKR6' 'Q99KR3' 'Q61733' 'Q61335' 'Q7TNG8' 'Q9DCM2' 'Q8CFA2'
 'Q64433' 'P45952' 'P08249' 'O35459' 'P97450' 'Q9CYR0' 'Q64310' 'Q8K1Z0'
 'Q9CPU0' 'Q80XL6' 'P97807' 'P

In [22]:
# Download cif data for proteins
# SLOW THE FIRST TIME - caches the relevant cif data
# The call returns three values, bound here as valid / invalid / existing.
# NOTE(review): presumably newly downloaded, unavailable, and already-on-disk proteins
# respectively - confirm against the structuremap documentation.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=unique_uniprotIDs,
    out_folder=cif_dir
)

100%|██████████| 113/113 [00:00<00:00, 35919.39it/s]

2024-10-17 15:34:35> Valid proteins: 0
2024-10-17 15:34:35> Invalid proteins: 0
2024-10-17 15:34:35> Existing proteins: 113





In [23]:
# Download pae data for proteins
# SLOW THE FIRST TIME - caches the relevant pae data
# The call returns three values, bound here as valid / invalid / existing.
# NOTE(review): presumably newly downloaded, unavailable, and already-on-disk proteins
# respectively - confirm against the structuremap documentation.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=unique_uniprotIDs,
    out_folder=pae_dir, 
)

100%|██████████| 113/113 [00:00<00:00, 51623.61it/s]

2024-10-17 15:34:35> Valid proteins: 0
2024-10-17 15:34:35> Invalid proteins: 0
2024-10-17 15:34:35> Existing proteins: 113





## Construct Alphafold Dataframe (Calculate Accessibilities) - Labeled Methionines

In [24]:
# Format alphafold data into dataframe
# Reads from the cif folder for the selected UniProt IDs; the displayed result has one row
# per residue (protein_id, AA, position, quality, coordinates, secondary-structure flags).
alphafold_annotation = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=unique_uniprotIDs)
alphafold_annotation

100%|██████████| 1630/1630 [00:08<00:00, 200.45it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,51.060,49.845,51.772,unstructured,unstructured,0,0,0,0,1
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,50.385,50.224,50.740,TURN_TY1_P,TURN,0,0,0,1,0
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,53.616,55.022,52.617,HELX_RH_AL_P,HELX,0,1,0,0,0
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,52.434,52.182,52.692,HELX_RH_AL_P,HELX,0,1,0,0,0
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,49.354,48.180,50.434,HELX_RH_AL_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,-18.565,-18.413,-17.274,HELX_RH_AL_P,HELX,0,1,0,0,0
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,-18.909,-17.814,-18.401,BEND,BEND,1,0,0,0,0
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,-21.302,-22.660,-20.673,unstructured,unstructured,0,0,0,0,1
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,-19.459,-19.112,-20.119,unstructured,unstructured,0,0,0,0,1


In [25]:
# Neighbour-count settings as (max_dist, max_angle) pairs, listed in the order in which
# their result columns are appended: full sphere exposure (max_angle=180) for a range of
# radii, followed by one part sphere setting (angle = 70, radius = 12).
# Whole-number radii stay ints so the generated column names keep their form
# (e.g. nAA_2_180_pae next to nAA_4.5_180_pae).
exposure_settings = [
    (2, 180), (3, 180), (4, 180), (4.5, 180), (5, 180),
    (5.5, 180), (6, 180), (6.5, 180), (7, 180), (7.5, 180),
    (8, 180), (12, 180), (18, 180), (24, 180),
    (12, 70),
]


def build_accessibility_table(annotation, settings, error_dir):
    """Run annotate_accessibility once per setting and left-merge each result onto `annotation`.

    Parameters
    ----------
    annotation : pandas.DataFrame
        Per-residue table passed to every annotate_accessibility call; it is not modified.
    settings : iterable of (max_dist, max_angle)
        Arguments for the individual annotate_accessibility calls, in output column order.
    error_dir : str
        Folder passed through as `error_dir` (the pae folder in this notebook).

    Returns
    -------
    pandas.DataFrame
        `annotation` with the columns of every result joined on
        ['protein_id', 'AA', 'position'].
    """
    accessibility = annotation
    for max_dist, max_angle in settings:
        exposure = annotate_accessibility(
            df=annotation,
            max_dist=max_dist,
            max_angle=max_angle,
            error_dir=error_dir)
        accessibility = accessibility.merge(
            exposure, how='left', on=['protein_id', 'AA', 'position'])
    return accessibility


# The table is rebuilt from `alphafold_annotation` on every run, so re-executing this cell
# gives the same columns instead of stacking suffixed duplicates (which the former chain of
# self-merging cells did). The per-setting frames are no longer kept as separate globals.
alphafold_accessibility = build_accessibility_table(
    alphafold_annotation, exposure_settings, pae_dir)
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,1,1,1,1,1,1,3,5,7,0
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,0,2,2,2,2,2,4,6,9,0
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,1,2,2,2,2,2,5,7,10,0
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,0,2,2,2,2,2,6,9,11,1
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,2,2,2,2,2,2,8,9,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,2,2,4,4,5,5,18,53,91,12
52927,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,2,2,2,5,5,7,15,44,78,4
52928,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,2,2,2,2,2,2,5,15,36,0
52929,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,2,2,2,2,2,2,3,7,19,0


In [55]:
# Neighbour-count columns to smooth; each gains a "<name>_smooth10" counterpart in the result.
# NOTE(review): [10] is presumably the smoothing window size - confirm against structuremap.
accessibility_columns = np.array([
    'nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae', 'nAA_4.5_180_pae',
    'nAA_5_180_pae', 'nAA_5.5_180_pae', 'nAA_6_180_pae', 'nAA_6.5_180_pae',
    'nAA_7_180_pae', 'nAA_7.5_180_pae', 'nAA_8_180_pae',
    'nAA_12_180_pae', 'nAA_18_180_pae', 'nAA_24_180_pae',
    'nAA_12_70_pae',
])
alphafold_accessibility_smooth = get_smooth_score(
    alphafold_accessibility,
    accessibility_columns,
    [10])
alphafold_accessibility_smooth;

100%|██████████| 113/113 [00:00<00:00, 346.21it/s]


In [56]:
# Flag residues whose smoothed 24-radius full-sphere neighbour count is at most 34.27
# (1 = flagged, 0 = not flagged).
# NOTE(review): the 34.27 cut-off is taken as given here - record where it comes from.
low_neighbour_count = alphafold_accessibility_smooth['nAA_24_180_pae_smooth10'] <= 34.27
alphafold_accessibility_smooth['IDR'] = np.where(low_neighbour_count, 1, 0)
alphafold_accessibility_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,O35129,1,M,1,45.38,-35.172,-33.707,-32.901,-33.031,3.159,...,1.909091,1.909091,1.909091,1.909091,1.909091,6.000000,8.727273,11.181818,1.000000,1
1,O35129,1,A,2,50.78,-38.162,-37.117,-37.283,-35.709,2.229,...,1.916667,1.916667,1.916667,1.916667,1.916667,5.916667,8.666667,11.416667,1.000000,1
2,O35129,1,Q,3,52.40,-38.996,-38.671,-38.089,-37.769,4.596,...,1.923077,1.923077,1.923077,1.923077,1.923077,5.769231,8.538462,11.538462,0.923077,1
3,O35129,1,N,4,51.23,-39.258,-38.274,-36.904,-38.080,7.104,...,1.857143,1.857143,1.928571,1.928571,1.928571,5.642857,8.500000,11.571429,0.857143,1
4,O35129,1,L,5,50.02,-42.031,-40.574,-40.297,-39.604,6.273,...,1.800000,1.800000,1.933333,1.933333,1.933333,5.466667,8.466667,11.733333,0.800000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Q9Z2I8,113,S,429,94.38,26.176,27.466,27.350,27.774,14.096,...,2.133333,3.533333,5.133333,6.333333,7.000000,20.666667,47.266667,85.200000,6.133333,0
429,Q9Z2I8,113,V,430,91.54,24.398,24.087,23.021,25.313,16.707,...,2.071429,3.571429,5.214286,6.428571,7.142857,20.285714,46.357143,83.642857,6.142857,0
430,Q9Z2I8,113,A,431,73.79,23.429,23.961,23.247,23.808,19.412,...,2.076923,3.538462,5.230769,6.461538,7.153846,20.307692,46.153846,83.307692,6.615385,0
431,Q9Z2I8,113,K,432,62.38,22.744,23.748,24.902,24.238,22.331,...,2.083333,3.500000,5.083333,6.250000,7.000000,20.333333,45.583333,81.916667,6.833333,0


## Merge Dataframes into Full Dataset (Includes Alphafold) - Labeled Methionines

In [57]:
# The alphafold "position" column starts at 1, while "Methionine Location" is a 0-based
# index into the sequence string, so the positions are shifted down by one before merging.
# The shift is still applied in place, but only while the column is 1-based: re-running this
# cell can no longer shift the positions a second time and silently mis-join residues.
# NOTE(review): assumes the smallest unshifted position is 1 - confirm for every input.
if alphafold_accessibility_smooth["position"].min() >= 1:
    alphafold_accessibility_smooth["position"] = alphafold_accessibility_smooth["position"] - 1 # zero-index the positions to match initial dataframe

# Attach the per-residue alphafold columns of the tagged methionine to each peptide row
peptides_wa = peptides_cs.merge(
    alphafold_accessibility_smooth, 
    how="left", 
    left_on=["Protein ID", "Methionine Location"], 
    right_on=["protein_id", "position"]
)
peptides_wa # wa means "with alphafold"

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,2.380952,3.666667,4.857143,6.190476,7.857143,22.190476,59.952381,120.857143,6.285714,0
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.761905,4.857143,6.285714,8.047619,9.047619,30.571429,90.857143,176.571429,8.285714,0
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.238095,3.285714,4.380952,5.238095,5.523810,11.238095,29.571429,61.952381,2.380952,0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.095238,3.238095,4.428571,5.380952,5.952381,13.380952,27.047619,48.190476,3.047619,0
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.857143,4.238095,6.095238,7.380952,8.904762,27.523810,85.285714,171.619048,7.809524,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,2.523810,3.380952,4.428571,6.095238,7.666667,16.476190,42.238095,79.904762,2.380952,0
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,2.000000,2.904762,3.238095,4.000000,4.238095,7.190476,12.523810,19.428571,0.857143,1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,2.857143,5.285714,6.238095,8.476190,8.952381,26.095238,62.571429,115.380952,7.571429,0
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,2.571429,3.428571,4.904762,6.761905,8.190476,25.952381,80.380952,173.428571,7.190476,0


In [58]:
# One-off export of the peptide table with AlphaFold features; the CSV written here is the
# file the following cell reads back. Uncomment to regenerate it.
#peptides_wa.to_csv(os.path.join(curr_dir_path, "Mitochondria_peptides_with_alphafold.csv"))

In [59]:
# Reload the cached peptide/AlphaFold table. The saved row labels come back as the
# "Unnamed: 0" column, so restore them as an unnamed index.
path = os.path.join(curr_dir_path, "Mitochondria_peptides_with_alphafold.csv")
peptides_wa = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
peptides_wa

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,...,2.380952,3.666667,4.857143,6.190476,7.857143,22.190476,59.952381,120.857143,6.285714,0
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,...,2.761905,4.857143,6.285714,8.047619,9.047619,30.571429,90.857143,176.571429,8.285714,0
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,...,2.238095,3.285714,4.380952,5.238095,5.523810,11.238095,29.571429,61.952381,2.380952,0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,...,2.095238,3.238095,4.428571,5.380952,5.952381,13.380952,27.047619,48.190476,3.047619,0
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,...,2.857143,4.238095,6.095238,7.380952,8.904762,27.523810,85.285714,171.619048,7.809524,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,...,2.523810,3.380952,4.428571,6.095238,7.666667,16.476190,42.238095,79.904762,2.380952,0
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,...,2.000000,2.904762,3.238095,4.000000,4.238095,7.190476,12.523810,19.428571,0.857143,1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,...,2.857143,5.285714,6.238095,8.476190,8.952381,26.095238,62.571429,115.380952,7.571429,0
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,...,2.571429,3.428571,4.904762,6.761905,8.190476,25.952381,80.380952,173.428571,7.190476,0


# Load Dataset (MitoCarta3.0) - Full Mitochondrial Proteome

In [60]:
# Read the "A Mouse MitoCarta3.0" sheet of the MitoCarta workbook located in the current directory
data_loc = os.path.join(curr_dir_path, "Mouse.MitoCarta3.0.xls")
mitocarta3_0 = pd.read_excel(io=data_loc, sheet_name="A Mouse MitoCarta3.0")
mitocarta3_0

Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,liver_total_peak_intensity_log10,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability)
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.0,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported)
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported)
2,66043,513.0,Atp5d,0610008F14Rik|1500000I11Rik|AA960090|AI876556|...,"ATP synthase, H+ transporting, mitochondrial F...",MitoCarta3.0,"literature, APEX_matrix, targetP signal, yeast...",MIM,OXPHOS > Complex V > CV subunits | OXPHOS > OX...,Tmito,...,9.8,9.8,9.5,9.8,10.1,10.0,9.8,9.8,9.6,
3,74316,122961.0,Isca2,0710001C05Rik|5730594E03Rik|Hbld|Hbld1,iron-sulfur cluster assembly 2,MitoCarta3.0,"APEX_matrix, targetP signal+, yeast mito homol...",Matrix,Metabolism > Metals and cofactors > Fe-S clust...,Tmito,...,8.2,7.9,7.2,8.7,8.5,8.4,8.0,8.5,8.5,
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,9.6,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,71703,51566.0,Armcx3,1200004E24Rik|AI450003|ALEX3,"armadillo repeat containing, X-linked 3",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tpossible_mito,...,,,,,,,,,,Nucleoplasm (Approved)
1136,67474,9342.0,Snap29,1300018G05Rik|AI891940|AU020222|BB131856|Gs32,synaptosomal-associated protein 29,MitoCarta3.0,APEX_IMS,IMS,Mitochondrial dynamics and surveillance > Auto...,Tmito,...,,,,,,,,,,Cytosol (Supported)
1137,18970,5423.0,Polb,A430088C08Rik,"polymerase (DNA directed), beta",MitoCarta3.0,literature,Matrix,Mitochondrial central dogma > mtDNA maintenanc...,Tnon_mito,...,,,,,,,,,,Vesicles (Uncertain)
1138,109729085,109703458.0,Htd2,-,hydroxyacyl-thioester dehydratase type 2,MitoCarta3.0,literature,Matrix,Metabolism > Lipid metabolism > Type II fatty ...,NA - newly added to NCBI Entrez Gene,...,,,,,,,,,,Mitochondria (Supported)


In [61]:
# Calculate number of proteins in the mitochondrial proteome.
# A row counts as mitochondrial when its HPA main-location annotation contains
# "mitoch" (case-insensitive). Rows without an HPA annotation are NaN in that
# column; na=False maps them to False so the mask is strictly boolean and can be
# used directly for indexing instead of carrying NaN entries.
is_mitochondrial = mitocarta3_0["HPA_Main_Location_2020 (Reliability)"].str.contains(
    "mitoch", case=False, na=False)
is_mitochondrial.value_counts(dropna=False)

HPA_Main_Location_2020 (Reliability)
False    609
True     531
Name: count, dtype: int64

In [62]:
# Sanity Check - ensure protein split was done correctly (correct mitoch... string matching)
# Uncomment the lines below to list every distinct HPA location string kept by the filter,
# with the row limit lifted for the display and restored afterwards.

#pd.set_option('display.max_rows', None)
##display(mitocarta3_0[is_mitochondrial == True]["HPA_Main_Location_2020 (Reliability)"].value_counts())
#pd.reset_option('display.max_rows')

In [63]:
# Keep only the MitoCarta3.0 rows flagged as mitochondrial by the HPA-location mask
mitocarta3_0_mitochondrial = mitocarta3_0.loc[is_mitochondrial.eq(True)]
mitocarta3_0_mitochondrial;

In [64]:
# Report how many rows have no UniProt ID (just one), then keep only rows that have one
print(mitocarta3_0_mitochondrial["UniProt"].isnull().value_counts())
mitocarta3_0_mitochondrial = mitocarta3_0_mitochondrial.loc[
    mitocarta3_0_mitochondrial["UniProt"].notna()
]
mitocarta3_0_mitochondrial;

UniProt
False    530
True       1
Name: count, dtype: int64


In [65]:
# Distinct UniProt IDs among the mitochondrial rows, printed together with their count
unique_uniprotIDs = pd.unique(mitocarta3_0_mitochondrial["UniProt"])
print("Unique UniProt IDs: \n", unique_uniprotIDs, sep="")
print("Number of Unique UniProt IDs: ", unique_uniprotIDs.size, sep="")

Unique UniProt IDs: 
['Q9D0M3' 'P35486' 'Q9D051' 'Q9CZ13' 'Q9DB15' 'Q8BJZ4' 'Q9CR68' 'Q8K2B3'
 'Q9CQA3' 'Q03265' 'Q8BTE0' 'Q9CXT8' 'Q9DB77' 'Q60597' 'Q9CQ69' 'P08249'
 'P53395' 'Q8BMF4' 'Q6PB66' 'Q9D1H8' 'Q8K0D5' 'Q99LP6' 'Q8BMS4' 'Q8VEM8'
 'Q9DC71' 'P56480' 'Q8VE22' 'Q99LC5' 'P97807' 'Q8BK72' 'Q9D6R2' 'Q91VD9'
 'Q8JZQ2' 'Q9DC61' 'Q9Z2I9' 'Q8VDC0' 'Q9DB20' 'Q9EQI8' 'Q8BJ03' 'Q9CQJ1'
 'P62073' 'Q8BH95' 'Q8BMS1' 'Q9WUM5' 'Q9CZU6' 'P43024' 'Q07417' 'Q9D6J6'
 'Q8BIJ6' 'Q06185' 'Q91VA7' 'Q9Z2I0' 'Q5RL20' 'Q8K4F5' 'Q9D880' 'Q9D1B9'
 'Q8C6I2' 'Q9D6J5' 'O08749' 'Q91WD5' 'Q91YT0' 'P58281' 'Q8CAQ8' 'Q78IK4'
 'Q8CGK3' 'Q924L1' 'Q91YJ5' 'Q8K3J1' 'Q9D773' 'Q8R2Q4' 'Q9WV98' 'Q8K411'
 'P45952' 'Q80YD1' 'Q99N96' 'Q99KI0' 'Q8BGH2' 'Q924T2' 'Q9CY73' 'P67778'
 'Q8C3X4' 'Q8BKF1' 'Q91VC9' 'P62075' 'P52825' 'Q3TBW2' 'P97450' 'Q8CAK1'
 'Q9Z2Q5' 'Q8QZS1' 'Q9D0K2' 'Q9D172' 'Q9CQJ8' 'Q9CPQ1' 'P09671' 'Q9CQH3'
 'Q61425' 'Q99M04' 'P46656' 'Q99J25' 'Q9CWV0' 'Q99JT1' 'Q8BYL4' 'O35943'
 'P70404' 'Q9CY16' 'O35857' 'Q

In [66]:
# Load and update sequence cache df: mapping from UniProt IDs to complete AA sequence
# SLOW - ONLY DO THIS ONCE - CONVERT TO CSV FILE, THEN RE-LOAD FROM THAT FILE
# The whole cell is intentionally commented out. When enabled it (1) loads the existing cache,
# (2) finds the UniProt IDs in unique_uniprotIDs that are not cached yet, (3) retrieves their
# sequences, and (4) writes the extended cache back to complete_sequence_cache.csv.
# NOTE(review): get_complete_sequence is not defined in this part of the notebook - presumably
# it comes from an earlier cell; confirm before uncommenting.

#path = os.path.join(global_data_path, "complete_sequence_cache.csv")
#sequence_cache_df = pd.read_csv(path)
#sequence_cache_df.set_index("Unnamed: 0", inplace=True)
#sequence_cache_df.index.name = None
#display(sequence_cache_df)
#
## Determine unknown sequences
#
#unknown_uniprotIDs_idxs = ~np.isin(unique_uniprotIDs, sequence_cache_df["Protein ID"].values)
#unknown_uniprotIDs = unique_uniprotIDs[unknown_uniprotIDs_idxs]
#unknown_sequences_df = pd.DataFrame({"Protein ID": unknown_uniprotIDs})
#display(unknown_sequences_df)
#
## Retrieve unknown sequences
#
#tqdm.pandas()
#unknown_sequences_df["Complete Sequence"] = unknown_sequences_df["Protein ID"].progress_apply(get_complete_sequence)
#display(unknown_sequences_df)
#
#sequence_cache_df_updated = pd.concat([sequence_cache_df, unknown_sequences_df])
#sequence_cache_df_updated.to_csv(os.path.join(global_data_path, "complete_sequence_cache.csv"))
#sequence_cache_df_updated;

In [67]:
# Read the sequence cache (UniProt ID -> complete AA sequence) from the shared data folder.
# The saved row labels come back as the "Unnamed: 0" column; restore them as an unnamed index.
path = os.path.join(global_data_path, "complete_sequence_cache.csv")
sequence_cache_df_updated = (
    pd.read_csv(path)
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
sequence_cache_df_updated;

In [68]:
# Attach each protein's complete sequence from the cache (UniProt matched against "Protein ID"),
# put the original MitoCarta row labels back on the merged frame, and remove the
# now-redundant join column.
mitocarta3_0_mitochondrial_cs = (
    mitocarta3_0_mitochondrial
    .merge(sequence_cache_df_updated, how="left", left_on="UniProt", right_on="Protein ID")
    .set_axis(mitocarta3_0_mitochondrial.index, axis=0)
    .drop(columns=["Protein ID"])
)
mitocarta3_0_mitochondrial_cs

Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability),Complete Sequence
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported),MAAAAASLRRTVLGPRGVGLPGASAPGLLGGARSRQLPLRTPQAVS...
1,18597,5160.0,Pdha1,Pdha|Pdha-1,pyruvate dehydrogenase E1 alpha 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,10.5,10.0,10.1,9.9,9.6,10.0,9.9,8.9,Mitochondria (Supported),MRKMLAAVSRVLAGSAQKPASRVLVASRNFANDATFEIKKCDLHRL...
4,68263,5162.0,Pdhb,2610103L06Rik|AL024199|C81408,pyruvate dehydrogenase (lipoamide) beta,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Metabolism > Carbohydrate metabolism > Pyruvat...,Tmito,...,10.5,9.8,10.4,9.8,9.9,9.9,10.0,10.5,Mitochondria (Supported),MAVVAGLVRGPLRQASGLLKRRFHRSAPAAVQLTVREAINQGMDEE...
5,22273,7384.0,Uqcrc1,1110032G10Rik,ubiquinol-cytochrome c reductase core protein 1,MitoCarta3.0,"literature, APEX_matrix, targetP signal+, yeas...",MIM,"Protein import, sorting and homeostasis > Prot...",Tmito,...,10.8,10.3,10.1,10.6,10.2,10.3,10.2,9.9,Mitochondria (Supported),MAASAVCRAACSGTQVLLRTRRSPALLRLPALRGTATFAQALQSVP...
7,56282,6182.0,Mrpl12,0610034O11Rik|1500031N16Rik|L12mt|MRP-|MRP-L12...,mitochondrial ribosomal protein L12,MitoCarta3.0,"literature, targetP signal+, yeast mito homolo...",Matrix,Mitochondrial central dogma > mtRNA metabolism...,Tmito,...,8.9,8.6,9.3,9.2,9.8,9.0,9.4,9.0,Mitochondria (Supported),MLPVAASRCLWGPRLGLRGAALRLARQQMPSVCAARQLRSSSHRRS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,67759,55848.0,Plgrkt,1110007H22Rik|5033414D02Rik|AI852040|Plg-|Plg-...,"plasminogen receptor, C-terminal lysine transm...",MitoCarta3.0,GFP,Membrane,0,Tmito,...,,6.9,8.6,9.3,9.5,8.1,9.5,8.7,Mitochondria (Uncertain),MGFIFSKSMNENMKNQQEFMVTHARLQLERHLTMQNEMRERQMAMQ...
1129,78248,51309.0,Armcx1,3010033I09Rik|ALEX1,"armadillo repeat containing, X-linked 1",MitoCarta3.0,literature,MOM,Mitochondrial dynamics and surveillance > Traf...,Tnon_mito,...,,,,,,,,,Mitochondria;Nucleoplasm (Approved),MGRTREAGCVAAGMVIGAGACYCVYRLTWGKDENEKLWDEEEEEEE...
1130,67416,9823.0,Armcx2,3230401N03Rik|AI043003|ALEX2,"armadillo repeat containing, X-linked 2",MitoCarta3.0,literature,Membrane,0,Tnon_mito,...,,,,,,,,,Mitochondria (Approved),MSRARDAGCVAAGIVIGASAWYCVYKYTRGKDQKKKRLTKPKNRAS...
1132,269642,339983.0,Nat8l,1110038O08Rik|Sh|Shati,N-acetyltransferase 8-like,MitoCarta3.0,literature,Membrane,Metabolism > Amino acid metabolism > Glutamate...,Tpossible_mito,...,,,,,,,,,Mitochondria (Approved),MHCGPPDMVCETKIVATEDHEALPGAKKDALLVAAGAMWPPLPAAP...


In [69]:
# NOTE: the dataset's ProteinLength does not always equal the length of the sequence retrieved via the UniProt ID.
# The tally of differences is suppressed by the trailing ';' - remove it to inspect the counts.
mitocarta3_0_mitochondrial_cs["ProteinLength"].sub(
    mitocarta3_0_mitochondrial_cs["Complete Sequence"].str.len()
).value_counts();

# Download Alphafold Data - Full Mitochondrial Proteome

In [70]:
# Folders holding the AlphaFold protein data: a root directory with "cif" and "pae" subfolders
alphafold_path_str = "../alphafold_data/"
alphafold_path = os.path.abspath(alphafold_path_str)

cif_dir, pae_dir = (os.path.join(alphafold_path, sub) for sub in ("cif", "pae"))

# One resolved path per line
print(alphafold_path, cif_dir, pae_dir, sep="\n")

/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/MetML/alphafold_data/pae


In [71]:
# NOTE: four IDs are not usable for AlphaFold: invalid_proteins_cif -> ['F7C846', 'Q9JLT4', 'Q3UW66', 'Q91XR9']
# They are secondary UniProt IDs, which was fine for querying UniProt but not AlphaFold,
# so they are mapped to their primary IDs by hand.
replace_dict = {'F7C846': 'Q8R5C0', 'Q3UW66': 'Q99J99', 'Q91XR9': 'O70325'}
# Q9JLT4 seems to be a primary ID already, yet it is not in AlphaFold -> its rows are removed
# Q91XR9 / O70325: O70325 exists in UniProt but not in AlphaFold -> its rows are removed too

mitocarta3_0_mitochondrial_cs = mitocarta3_0_mitochondrial_cs.assign(
    **{"UniProt-Primary": mitocarta3_0_mitochondrial_cs["UniProt"].replace(replace_dict)}
)
mitocarta3_0_mitochondrial_cs = mitocarta3_0_mitochondrial_cs.loc[
    mitocarta3_0_mitochondrial_cs["UniProt"].ne("Q9JLT4")
    & mitocarta3_0_mitochondrial_cs["UniProt-Primary"].ne("O70325")
]
mitocarta3_0_mitochondrial_cs;

In [72]:
# Primary UniProt IDs used for every AlphaFold step below, printed with their count

uniprotIDs_fullproteome = pd.unique(mitocarta3_0_mitochondrial_cs["UniProt-Primary"])
print("Unique UniProt IDs: \n", uniprotIDs_fullproteome, sep="")
print("Number of Unique UniProt IDs: ", uniprotIDs_fullproteome.size, sep="")

Unique UniProt IDs: 
['Q9D0M3' 'P35486' 'Q9D051' 'Q9CZ13' 'Q9DB15' 'Q8BJZ4' 'Q9CR68' 'Q8K2B3'
 'Q9CQA3' 'Q03265' 'Q8BTE0' 'Q9CXT8' 'Q9DB77' 'Q60597' 'Q9CQ69' 'P08249'
 'P53395' 'Q8BMF4' 'Q6PB66' 'Q9D1H8' 'Q8K0D5' 'Q99LP6' 'Q8BMS4' 'Q8VEM8'
 'Q9DC71' 'P56480' 'Q8VE22' 'Q99LC5' 'P97807' 'Q8BK72' 'Q9D6R2' 'Q91VD9'
 'Q8JZQ2' 'Q9DC61' 'Q9Z2I9' 'Q8VDC0' 'Q9DB20' 'Q9EQI8' 'Q8BJ03' 'Q9CQJ1'
 'P62073' 'Q8BH95' 'Q8BMS1' 'Q9WUM5' 'Q9CZU6' 'P43024' 'Q07417' 'Q9D6J6'
 'Q8BIJ6' 'Q06185' 'Q91VA7' 'Q9Z2I0' 'Q5RL20' 'Q8K4F5' 'Q9D880' 'Q9D1B9'
 'Q8C6I2' 'Q9D6J5' 'O08749' 'Q91WD5' 'Q91YT0' 'P58281' 'Q8CAQ8' 'Q78IK4'
 'Q8CGK3' 'Q924L1' 'Q91YJ5' 'Q8K3J1' 'Q9D773' 'Q8R2Q4' 'Q9WV98' 'Q8K411'
 'P45952' 'Q80YD1' 'Q99N96' 'Q99KI0' 'Q8BGH2' 'Q924T2' 'Q9CY73' 'P67778'
 'Q8C3X4' 'Q8BKF1' 'Q91VC9' 'P62075' 'P52825' 'Q3TBW2' 'P97450' 'Q8CAK1'
 'Q9Z2Q5' 'Q8QZS1' 'Q9D0K2' 'Q9D172' 'Q9CQJ8' 'Q9CPQ1' 'P09671' 'Q9CQH3'
 'Q61425' 'Q99M04' 'P46656' 'Q99J25' 'Q9CWV0' 'Q99JT1' 'Q8BYL4' 'O35943'
 'P70404' 'Q9CY16' 'O35857' 'Q

In [73]:
# Fetch the AlphaFold cif file of every protein into cif_dir.
# SLOW THE FIRST TIME - downloaded files are kept in cif_dir and reported as "existing" on later runs.
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    out_folder=cif_dir, proteins=uniprotIDs_fullproteome)

100%|██████████| 528/528 [00:00<00:00, 192858.36it/s]

2024-10-17 15:34:59> Valid proteins: 0
2024-10-17 15:34:59> Invalid proteins: 0
2024-10-17 15:34:59> Existing proteins: 528





In [74]:
# Fetch the AlphaFold pae file of every protein into pae_dir.
# SLOW THE FIRST TIME - downloaded files are kept in pae_dir and reported as "existing" on later runs.
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    out_folder=pae_dir, proteins=uniprotIDs_fullproteome)

100%|██████████| 528/528 [00:00<00:00, 209022.42it/s]

2024-10-17 15:34:59> Valid proteins: 0
2024-10-17 15:34:59> Invalid proteins: 0
2024-10-17 15:34:59> Existing proteins: 528





# Construct Alphafold Dataframe (Calculate Accessibilities) - Full Mitochondrial Proteome

In [75]:
# Build one per-residue dataframe from the cached cif files of the selected proteins

alphafold_annotation_full = format_alphafold_data(
    protein_ids=uniprotIDs_fullproteome, directory=cif_dir
)
alphafold_annotation_full

100%|██████████| 1630/1630 [00:31<00:00, 52.26it/s] 


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,38.096,37.222,39.345,unstructured,unstructured,0,0,0,0,1
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,38.724,39.162,38.463,unstructured,unstructured,0,0,0,0,1
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,36.289,36.675,37.473,unstructured,unstructured,0,0,0,0,1
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,33.884,32.424,34.597,unstructured,unstructured,0,0,0,0,1
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,34.618,35.575,34.658,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,-27.747,-27.920,-26.354,HELX_RH_AL_P,HELX,0,1,0,0,0
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,-27.713,-26.580,-27.423,HELX_RH_AL_P,HELX,0,1,0,0,0
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,-27.292,-26.038,-27.216,HELX_RH_AL_P,HELX,0,1,0,0,0
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,-30.321,-30.226,-29.116,HELX_RH_AL_P,HELX,0,1,0,0,0


In [76]:
# Full-sphere exposure (max_angle=180) with radius max_dist=2
exposure_sphere_rad2 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=2
)
exposure_sphere_rad2;

100%|██████████| 528/528 [00:02<00:00, 211.84it/s]


In [77]:
# Start the combined accessibility table: structural annotation plus the radius-2 exposure
alphafold_accessibility_full = pd.merge(
    alphafold_annotation_full,
    exposure_sphere_rad2,
    on=['protein_id','AA','position'],
    how='left',
)
alphafold_accessibility_full;

In [78]:
# Full-sphere exposure (max_angle=180) with radius max_dist=3
exposure_sphere_rad3 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=3
)
exposure_sphere_rad3;

100%|██████████| 528/528 [00:02<00:00, 261.40it/s]


In [79]:
# Add the radius-3 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad3.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad3, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [80]:
# Full-sphere exposure (max_angle=180) with radius max_dist=4
exposure_sphere_rad4 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=4
)
exposure_sphere_rad4;

100%|██████████| 528/528 [00:02<00:00, 214.44it/s]


In [81]:
# Add the radius-4 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad4.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad4, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [82]:
# Full-sphere exposure (max_angle=180) with radius max_dist=4.5
exposure_sphere_rad4_5 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=4.5
)
exposure_sphere_rad4_5;

100%|██████████| 528/528 [00:02<00:00, 239.07it/s]


In [83]:
# Add the radius-4.5 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad4_5.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad4_5, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [84]:
# Full-sphere exposure (max_angle=180) with radius max_dist=5
exposure_sphere_rad5 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=5
)
exposure_sphere_rad5;

100%|██████████| 528/528 [00:02<00:00, 251.21it/s]


In [85]:
# Add the radius-5 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad5.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad5, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [86]:
# Full-sphere exposure (max_angle=180) with radius max_dist=5.5
exposure_sphere_rad5_5 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=5.5
)
exposure_sphere_rad5_5;

100%|██████████| 528/528 [00:02<00:00, 228.28it/s]


In [87]:
# Add the radius-5.5 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad5_5.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad5_5, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [88]:
# Full-sphere exposure (max_angle=180) with radius max_dist=6
exposure_sphere_rad6 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=6
)
exposure_sphere_rad6;

100%|██████████| 528/528 [00:02<00:00, 220.06it/s]


In [89]:
# Add the radius-6 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad6.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad6, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [90]:
# Full-sphere exposure (max_angle=180) with radius max_dist=6.5
exposure_sphere_rad6_5 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=6.5
)
exposure_sphere_rad6_5;

100%|██████████| 528/528 [00:02<00:00, 240.28it/s]


In [91]:
# Add the radius-6.5 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad6_5.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad6_5, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [92]:
# Full-sphere exposure (max_angle=180) with radius max_dist=7
exposure_sphere_rad7 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=7
)
exposure_sphere_rad7;

100%|██████████| 528/528 [00:02<00:00, 217.47it/s]


In [93]:
# Add the radius-7 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad7.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad7, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [94]:
# Full-sphere exposure (max_angle=180) with radius max_dist=7.5
exposure_sphere_rad7_5 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=7.5
)
exposure_sphere_rad7_5;

100%|██████████| 528/528 [00:02<00:00, 235.28it/s]


In [95]:
# Add the radius-7.5 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad7_5.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad7_5, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [96]:
# Full-sphere exposure (max_angle=180) with radius max_dist=8
exposure_sphere_rad8 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=8
)
exposure_sphere_rad8;

100%|██████████| 528/528 [00:02<00:00, 233.72it/s]


In [97]:
# Add the radius-8 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad8.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad8, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [98]:
# Full-sphere exposure (max_angle=180) with radius max_dist=12
exposure_sphere_rad12 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=12
)
exposure_sphere_rad12;

100%|██████████| 528/528 [00:02<00:00, 197.88it/s]


In [99]:
# Add the radius-12 full-sphere exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad12.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad12, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [100]:
# Full-sphere exposure (max_angle=180) with radius max_dist=18
exposure_sphere_rad18 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=18
)
exposure_sphere_rad18;

100%|██████████| 528/528 [00:04<00:00, 128.60it/s]


In [101]:
# Add the radius-18 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad18.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad18, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [102]:
# Full-sphere exposure (max_angle=180) with radius max_dist=24
exposure_sphere_rad24 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=180, max_dist=24
)
exposure_sphere_rad24;

100%|██████████| 528/528 [00:05<00:00, 93.27it/s] 


In [103]:
# Add the radius-24 exposure results to the running accessibility table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_sphere_rad24.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_sphere_rad24, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full;

In [104]:
# Part-sphere exposure: angle limited to max_angle=70, radius max_dist=12
exposure_ang70_rad12 = annotate_accessibility(
    df=alphafold_annotation_full, error_dir=pae_dir, max_angle=70, max_dist=12
)
exposure_ang70_rad12;

100%|██████████| 528/528 [00:02<00:00, 181.71it/s]


In [105]:
# Add the part-sphere (angle 70, radius 12) exposure results to the running accessibility
# table and display the completed table.
# This cell reassigns its own input, so columns left over from an earlier run of the
# cell are dropped first; without that, re-running would yield "_x"/"_y" suffixed
# duplicates instead of a single refreshed column. The first run is unaffected.
alphafold_accessibility_full = (
    alphafold_accessibility_full
    .drop(columns=exposure_ang70_rad12.columns.difference(['protein_id','AA','position']),
          errors='ignore')
    .merge(exposure_ang70_rad12, how='left', on=['protein_id','AA','position'])
)
alphafold_accessibility_full

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1,1,1,1,1,1,2,3,4,0
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,1,2,2,2,2,2,3,4,6,0
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,2,2,2,2,2,2,4,5,6,0
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,1,2,2,2,2,2,4,6,8,0
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,1,2,2,2,2,2,4,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197263,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,2,2,2,2,3,3,6,9,15,0
197264,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,2,2,2,2,2,3,6,8,13,1
197265,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,2,2,2,2,2,3,5,8,11,1
197266,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,2,2,2,2,2,2,5,6,9,2


In [106]:
# Smooth every exposure column with get_smooth_score. The [10] argument is the window
# setting handed to the library; the resulting columns carry a "_smooth10" suffix.
alphafold_accessibility_full_smooth = get_smooth_score(
    alphafold_accessibility_full,
    np.array([
        'nAA_2_180_pae', 'nAA_3_180_pae', 'nAA_4_180_pae',
        'nAA_4.5_180_pae', 'nAA_5_180_pae', 'nAA_5.5_180_pae',
        'nAA_6_180_pae', 'nAA_6.5_180_pae', 'nAA_7_180_pae',
        'nAA_7.5_180_pae', 'nAA_8_180_pae', 'nAA_12_180_pae',
        'nAA_18_180_pae', 'nAA_24_180_pae', 'nAA_12_70_pae',
    ]),
    [10],
)
alphafold_accessibility_full_smooth;

100%|██████████| 528/528 [00:00<00:00, 659.14it/s]


In [107]:
# Binary IDR flag: 1 where the smoothed radius-24 full-sphere value is <= 34.27, otherwise 0.
# NOTE(review): 34.27 is a hard-coded cutoff with no source given in this notebook section - confirm its origin.
alphafold_accessibility_full_smooth['IDR'] = np.where(
    alphafold_accessibility_full_smooth['nAA_24_180_pae_smooth10'].le(34.27),
    1,
    0,
)
alphafold_accessibility_full_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,A2ADF7,1,M,1,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,1.909091,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1
1,A2ADF7,1,K,2,27.13,-16.032,-15.462,-14.173,-15.203,-26.991,...,1.916667,1.916667,1.916667,1.916667,1.916667,3.750000,5.666667,8.666667,0.000000,1
2,A2ADF7,1,P,3,32.83,-16.958,-17.790,-19.230,-17.200,-24.520,...,1.923077,1.923077,1.923077,1.923077,1.923077,3.769231,5.769231,8.769231,0.000000,1
3,A2ADF7,1,T,4,33.64,-16.686,-15.841,-15.615,-16.546,-22.219,...,1.928571,1.928571,1.928571,1.928571,1.928571,3.857143,5.857143,8.714286,0.000000,1
4,A2ADF7,1,Q,5,28.86,-16.698,-16.784,-16.026,-16.209,-19.372,...,1.933333,1.933333,1.933333,1.933333,1.933333,3.866667,5.866667,8.733333,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,S4R2K0,528,M,227,93.01,-4.021,-2.734,-1.530,-2.851,2.349,...,2.066667,2.533333,3.333333,3.866667,4.333333,10.133333,20.333333,38.200000,1.666667,0
227,S4R2K0,528,E,228,93.81,-7.039,-5.845,-6.164,-4.573,1.709,...,2.071429,2.571429,3.214286,3.714286,4.071429,9.500000,19.357143,35.785714,1.500000,0
228,S4R2K0,528,V,229,88.77,-7.847,-7.972,-7.895,-6.944,4.796,...,2.076923,2.461538,3.076923,3.615385,4.000000,9.153846,18.615385,34.000000,1.615385,1
229,S4R2K0,528,N,230,90.23,-6.518,-6.363,-4.940,-6.637,4.882,...,2.083333,2.500000,3.000000,3.583333,4.000000,9.250000,18.250000,33.250000,1.750000,1


# Merge Dataframes into Full Dataset (Includes Alphafold) - Full Mitochondrial Proteome

In [108]:
# Remove (likely) cleaved N-terminal methionines: an M at position 1 whose following
# residue is one of the listed cleavage partners.
n_terminal_met_cleave_partners = np.array(["G", "A", "S", "T", "C", "P", "V"])
# Residue found on the next row; the final row has no successor and gets "X"
nextAA = alphafold_accessibility_full_smooth["AA"].shift(-1, fill_value="X")
n_terminal_met_cond = (
    alphafold_accessibility_full_smooth["AA"].eq("M")
    & alphafold_accessibility_full_smooth["position"].eq(1)
    & nextAA.isin(n_terminal_met_cleave_partners)
)
mitocarta3_0_methionines = alphafold_accessibility_full_smooth.loc[~n_terminal_met_cond]
mitocarta3_0_methionines;

In [109]:
# Of the remaining residues, keep methionine rows only
mitocarta3_0_methionines = mitocarta3_0_methionines.loc[
    mitocarta3_0_methionines["AA"].eq("M")
]
mitocarta3_0_methionines;

In [110]:
# Shift positions from 1-based to 0-based so they match the initial dataframe.
# NOTE: this rewrites the existing column, so running this cell again without first
# re-running the methionine filter subtracts 1 a second time.
mitocarta3_0_methionines = mitocarta3_0_methionines.assign(
    position=mitocarta3_0_methionines["position"].sub(1)
)

# Attach the original UniProt ID, the primary ID and the complete sequence to every methionine row
mitocarta3_0_methionines_wa = pd.merge(
    mitocarta3_0_methionines,
    mitocarta3_0_mitochondrial_cs[["UniProt", "UniProt-Primary", "Complete Sequence"]],
    left_on="protein_id",
    right_on="UniProt-Primary",
    how="left",
)
mitocarta3_0_methionines_wa

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,UniProt,UniProt-Primary,Complete Sequence
0,A2ADF7,1,M,0,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
1,A2ADF7,1,M,7,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,1.944444,1.944444,4.000000,6.555556,10.500000,0.111111,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
2,A2ADF7,1,M,11,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,2.476190,2.476190,5.000000,9.142857,17.666667,0.285714,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
3,A2ADF7,1,M,16,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,3.857143,3.904762,7.666667,17.238095,32.380952,1.047619,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
4,A2ADF7,1,M,96,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,7.619048,8.142857,19.047619,51.666667,104.095238,5.095238,0,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4492,S4R2K0,528,M,90,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,7.523810,8.571429,25.428571,66.619048,119.285714,7.761905,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4493,S4R2K0,528,M,132,98.22,14.774,14.306,13.644,15.422,-7.821,...,6.380952,7.666667,18.952381,49.142857,89.476190,4.380952,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4494,S4R2K0,528,M,203,98.79,0.508,1.302,2.674,1.468,3.585,...,7.619048,8.809524,28.714286,82.714286,134.619048,8.571429,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4495,S4R2K0,528,M,215,98.03,3.940,3.358,2.382,2.708,12.422,...,5.761905,6.714286,18.619048,46.714286,87.380952,4.428571,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...


In [111]:
# Export step, currently commented out: when enabled it writes
# `mitocarta3_0_methionines_wa` to "Mitochondria_mitocarta_with_alphafold.csv"
# inside `curr_dir_path`, i.e. the same file the next cell loads with pd.read_csv.
#mitocarta3_0_methionines_wa.to_csv(os.path.join(curr_dir_path, "Mitochondria_mitocarta_with_alphafold.csv"))

In [112]:
# Load the table stored in "Mitochondria_mitocarta_with_alphafold.csv" and
# restore its row index in one chained expression (no in-place mutation).
path = os.path.join(curr_dir_path, "Mitochondria_mitocarta_with_alphafold.csv")
mitocarta3_0_methionines_wa = (
    pd.read_csv(path)
    # "Unnamed: 0" is presumably the row index that to_csv wrote without a header.
    .set_index("Unnamed: 0")
    # Clear the placeholder label so the index is unnamed.
    .rename_axis(None)
)
mitocarta3_0_methionines_wa

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,UniProt,UniProt-Primary,Complete Sequence
0,A2ADF7,1,M,0,34.90,-16.170,-15.807,-16.898,-15.637,-29.957,...,1.909091,1.909091,3.727273,5.545455,8.454545,0.000000,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
1,A2ADF7,1,M,7,25.00,-17.959,-18.801,-19.503,-17.880,-11.797,...,1.944444,1.944444,4.000000,6.555556,10.500000,0.111111,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
2,A2ADF7,1,M,11,31.62,-22.014,-20.790,-20.768,-20.882,-2.522,...,2.476190,2.476190,5.000000,9.142857,17.666667,0.285714,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
3,A2ADF7,1,M,16,60.19,-24.992,-24.872,-26.170,-24.382,3.164,...,3.857143,3.904762,7.666667,17.238095,32.380952,1.047619,1,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
4,A2ADF7,1,M,96,86.39,-12.515,-11.617,-10.128,-11.907,1.475,...,7.619048,8.142857,19.047619,51.666667,104.095238,5.095238,0,A2ADF7,A2ADF7,MKPTQAQMAPAMDSREMVSPAVDLVLGASACCLACVFTNPLEVVKT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4492,S4R2K0,528,M,90,98.40,-3.270,-3.856,-3.011,-5.229,-11.159,...,7.523810,8.571429,25.428571,66.619048,119.285714,7.761905,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4493,S4R2K0,528,M,132,98.22,14.774,14.306,13.644,15.422,-7.821,...,6.380952,7.666667,18.952381,49.142857,89.476190,4.380952,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4494,S4R2K0,528,M,203,98.79,0.508,1.302,2.674,1.468,3.585,...,7.619048,8.809524,28.714286,82.714286,134.619048,8.571429,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...
4495,S4R2K0,528,M,215,98.03,3.940,3.358,2.382,2.708,12.422,...,5.761905,6.714286,18.619048,46.714286,87.380952,4.428571,0,S4R2K0,S4R2K0,MLLLGRLPRPAWVPGSRAQRCSSLAALEGPARTRSYWRYLRRLVCG...


# End