In [1]:
import pandas as pd
import os
from tqdm.auto import tqdm
from pandarallel import pandarallel
from rdkit import Chem
from tqdm import tqdm as top_tqdm

In [2]:
def get_structure_sequence(pdb_file):
    """Return the amino-acid sequence RDKit derives from a PDB file.

    Args:
        pdb_file: Path to the PDB file to parse.

    Returns:
        The sequence string produced by ``Chem.MolToSequence``, or ``''`` when
        the file cannot be read or parsed. The empty string is the sentinel the
        caller compares against the expected sequence, so a failed parse simply
        counts as "not valid".
    """
    try:
        mol = Chem.MolFromPDBFile(pdb_file)
        if mol is None:
            # No molecule was produced; treat it as a parse failure explicitly
            # instead of relying on MolToSequence to raise.
            return ''
        return Chem.MolToSequence(mol)
    except Exception:
        # Best-effort on purpose: one broken structure must not abort the whole
        # (possibly parallel) apply. `Exception` rather than a bare `except`
        # so KeyboardInterrupt / SystemExit still propagate.
        return ''

def multiprocess_structure_check(df, nb_workers, pdb_file_path):
    """Flag rows whose structure file yields the same sequence as ``aa_sequence``.

    Adds three columns to ``df`` (the frame is modified in place and also
    returned):

    * ``pdb_files`` - ``<pdb_file_path>/AF-<alphafolddb-id>-F1-model_v4.pdb``
    * ``aa_sequence_calculated`` - result of ``get_structure_sequence`` on that path
    * ``is_valid`` - whether the calculated sequence equals ``aa_sequence``

    Args:
        df: DataFrame with ``alphafolddb-id`` and ``aa_sequence`` columns.
        nb_workers: ``0`` runs serially with a tqdm progress bar; any other
            value is passed to ``pandarallel.initialize`` as the worker count.
        pdb_file_path: Directory that holds the structure files.

    Returns:
        The same ``df`` object with the three columns added.
    """
    # Building the path is plain string formatting, so it is done in-process;
    # only the structure parsing is worth distributing.
    df['pdb_files'] = df['alphafolddb-id'].map(
        lambda af_id: os.path.join(pdb_file_path, f'AF-{af_id}-F1-model_v4.pdb'))

    # Pick the apply backend once; the rest of the pipeline is shared.
    if nb_workers != 0:
        pandarallel.initialize(nb_workers=nb_workers, progress_bar=True)
        apply_with_progress = df['pdb_files'].parallel_apply
    else:
        top_tqdm.pandas(desc='pandas bar')
        apply_with_progress = df['pdb_files'].progress_apply

    df['aa_sequence_calculated'] = apply_with_progress(get_structure_sequence)
    df['is_valid'] = (df['aa_sequence_calculated'] == df['aa_sequence'])

    return df


def get_blast_database(dir, fasta_path):
    """Merge every CSV in a directory into one table and write it as FASTA.

    Each file in ``dir`` is read with ``pd.read_csv`` and reduced to the
    columns ``alphafolddb-id``, ``aa_sequence``, ``site_labels`` and
    ``site_types``. ``;`` characters are stripped from the ids, exact duplicate
    rows are dropped, and one FASTA record (``>id`` / sequence) is written per
    remaining row.

    Args:
        dir: Directory whose entries are all read as CSV files. (The name
            shadows the builtin but is kept for keyword-argument callers.)
        fasta_path: Destination FASTA file; overwritten if it exists.

    Returns:
        The merged, de-duplicated DataFrame with a fresh 0..n-1 index. An empty
        directory gives an empty frame with the four columns and an empty FASTA.
    """
    columns = ['alphafolddb-id', 'aa_sequence', 'site_labels', 'site_types']

    # Sorted so the row / FASTA order does not depend on filesystem ordering.
    csv_fnames = sorted(os.listdir(dir))

    # Collect the pieces and concatenate once; concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    frames = [
        pd.read_csv(os.path.join(dir, fname))[columns]
        for fname in tqdm(csv_fnames, total=len(csv_fnames))
    ]
    if frames:
        database_df = pd.concat(frames)
    else:
        database_df = pd.DataFrame(columns=columns)

    # Strip ';' before de-duplicating (same order as get_query_database) so
    # that 'P12345;' and 'P12345' collapse into a single record.
    database_df['alphafolddb-id'] = database_df['alphafolddb-id'].str.replace(';', '', regex=False)
    database_df = database_df.drop_duplicates(subset=columns).reset_index(drop=True)

    with open(fasta_path, 'w', encoding='utf-8') as f:
        records = zip(database_df['alphafolddb-id'], database_df['aa_sequence'])
        for af_id, sequence in tqdm(records, total=len(database_df)):
            f.write('>{}\n'.format(af_id))
            f.write('{}\n'.format(sequence))
    return database_df

def get_query_database(path, fasta_path, pdb_file_path):
    """Load a query CSV and write its unique entries as a FASTA file.

    Args:
        path: CSV file containing at least the columns ``alphafolddb-id``,
            ``aa_sequence``, ``site_labels`` and ``site_types``.
        fasta_path: Destination FASTA file; overwritten if it exists.
        pdb_file_path: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        The four-column DataFrame with ``;`` stripped from the ids. Note that
        the returned frame is NOT de-duplicated; only the FASTA output is.
    """
    columns = ['alphafolddb-id', 'aa_sequence', 'site_labels', 'site_types']

    # Explicit copy: the column subset is assigned into below (and gets more
    # columns added by later cells), which otherwise can raise pandas'
    # SettingWithCopyWarning.
    database_df = pd.read_csv(path)[columns].copy()
    database_df['alphafolddb-id'] = database_df['alphafolddb-id'].str.replace(';', '', regex=False)

    # Only the FASTA gets unique records; the returned frame keeps every row.
    write_database_df = database_df.drop_duplicates(subset=columns).reset_index(drop=True)

    with open(fasta_path, 'w', encoding='utf-8') as f:
        records = zip(write_database_df['alphafolddb-id'], write_database_df['aa_sequence'])
        for af_id, sequence in tqdm(records, total=len(write_database_df)):
            f.write('>{}\n'.format(af_id))
            f.write('{}\n'.format(sequence))
    return database_df



           

In [3]:
# --- Dataset locations (relative to this notebook) ---
test_dataset_path = '../../dataset/mcsa_fine_tune/normal_mcsa'
dataset_path = '../../dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100'
blast_database_path = '../../dataset/raw_dataset/uniprot/uniprot_sprot.fasta'

# Annotation table used later as the lookup database for alignment hits.
# NOTE: unpickling can execute arbitrary code - only load pickles you produced
# yourself or otherwise trust.
blast_database_df = pd.read_pickle('../../dataset/raw_dataset/ec_datasets/split_ec_dataset/train_ec_uniprot_dataset_cluster_sample.pkl')

# Normalise column names to the ones the rest of the notebook expects.
# Vectorised string op instead of a per-row lambda; ids look like 'A1L3X0;'.
blast_database_df['alphafolddb-id'] = blast_database_df['AlphaFoldDB'].str.replace(';', '', regex=False)
# Plain column copy - an identity `.apply` adds nothing but a Python-level loop.
blast_database_df['aa_sequence'] = blast_database_df['Sequence']
blast_database_df


Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence,...,Binding site reformate,Site reformate,Site labeled,Sequence_in_rxnaamapper_test,cluster,All site reformate,site_labels,site_types,alphafolddb-id,aa_sequence
0,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,...,"[{'function': 'BINDING', 'position': '124', 'l...",,True,False,Cluster 58969,"[{'function': 'BINDING', 'position': '124', 'l...","[[124], [137], [139], [142], [147], [187], [20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...
1,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...,...,"[{'function': 'BINDING', 'position': '106', 'l...",,True,False,Cluster 49687,"[{'function': 'BINDING', 'position': '106', 'l...","[[106], [160], [162], [166], [175], [235]]","[0, 0, 0, 0, 0, 0]",A2RUC4,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...
2,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...,...,,"[{'function': 'SITE', 'position': '95', 'note'...",True,False,Cluster 61425,"[{'function': 'SITE', 'position': '95', 'note'...","[[95], [120], [121], [186], [190], [214], [217...","[2, 2, 2, 2, 2, 2, 2, 2]",A5PLL7,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.-,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,...,"[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'...",True,False,Cluster 49416,"[{'function': 'BINDING', 'position': '20..22',...","[[20, 22], [44], [111], [160, 161], [184], [21...","[0, 0, 0, 0, 0, 0, 0, 1, 2]",C9JRZ8,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
4,C9JRZ8,Homo sapiens (Human),316,1.1.1.216,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,...,"[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'...",True,False,Cluster 49416,"[{'function': 'BINDING', 'position': '20..22',...","[[20, 22], [44], [111], [160, 161], [184], [21...","[0, 0, 0, 0, 0, 0, 0, 1, 2]",C9JRZ8,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139464,Q49WE4,Staphylococcus saprophyticus subsp. saprophyti...,169,3.1.-.-,Q49WE4;,"ACT_SITE 34; /note=""Proton donor""; /evidence=""...",,,,MILGLALIPSKAFQDEVNAYRKRYDAHYATIMPHITIKGQFKINDG...,...,,,True,False,Cluster 84924,"[{'function': 'ACT_SITE', 'position': '34', 'n...","[[34], [115]]","[1, 1]",Q49WE4,MILGLALIPSKAFQDEVNAYRKRYDAHYATIMPHITIKGQFKINDG...
139465,Q5HQE9,Staphylococcus epidermidis (strain ATCC 35984 ...,169,3.1.-.-,Q5HQE9;,"ACT_SITE 34; /note=""Proton donor""; /evidence=""...",,,,MILGLALVPSKSFQDEVNAYRKRYDNHYAQIMPHITIKPQFEIDDH...,...,,,True,False,Cluster 84921,"[{'function': 'ACT_SITE', 'position': '34', 'n...","[[34], [115]]","[1, 1]",Q5HQE9,MILGLALVPSKSFQDEVNAYRKRYDNHYAQIMPHITIKPQFEIDDH...
139466,Q8CNG6,Staphylococcus epidermidis (strain ATCC 12228 ...,395,2.7.7.-,Q8CNG6;,,"BINDING 99..102; /ligand=""UTP""; /ligand_id=""Ch...",,,MLDKNQLEKYNQEHLYEYEKLMSSNEKNALDEKVDQLNLAEIQDLY...,...,"[{'function': 'BINDING', 'position': '99..102'...",,True,False,Cluster 29613,"[{'function': 'BINDING', 'position': '99..102'...","[[99, 102], [113], [178], [204], [235], [344]]","[0, 0, 0, 0, 0, 0]",Q8CNG6,MLDKNQLEKYNQEHLYEYEKLMSSNEKNALDEKVDQLNLAEIQDLY...
139467,Q97U03,Saccharolobus solfataricus (strain ATCC 35092 ...,272,4.2.1.-,Q97U03;,"ACT_SITE 185; /note=""Schiff-base intermediate ...",,,,MIGSEIRMAKLFDKGRALVVALDHGLVMGPLKGIENPVEVVAKIAK...,...,,,True,False,Cluster 61416,"[{'function': 'ACT_SITE', 'position': '185', '...",[[185]],[1],Q97U03,MIGSEIRMAKLFDKGRALVVALDHGLVMGPLKGIENPVEVVAKIAK...


In [4]:
# Load the M-CSA test split and write its unique sequences to the query FASTA
# that blastp reads later.
test_dataset = get_query_database(
    os.path.join(test_dataset_path, 'test_dataset', 'mcsa_test.csv'),
    fasta_path=os.path.join(test_dataset_path, 'test_dataset.fasta'),
    pdb_file_path=os.path.join(
        os.path.dirname(test_dataset_path), 'structures', 'alphafolddb_download'
    ),
)


  0%|          | 0/95 [00:00<?, ?it/s]

In [5]:
# Compare each entry's sequence with the one read from its structure file
# (10 pandarallel workers), then keep only the rows where the two agree.
test_dataset = multiprocess_structure_check(
    test_dataset,
    10,
    pdb_file_path='../../dataset/mcsa_fine_tune/structures/alphafolddb_download',
)
test_dataset = test_dataset[test_dataset['is_valid']].reset_index(drop=True)
test_dataset

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10), Label(value='0 / 10'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10), Label(value='0 / 10'))), HBox…

Unnamed: 0,alphafolddb-id,aa_sequence,site_labels,site_types,pdb_files,aa_sequence_calculated,is_valid
0,P07598,MSRTVMERIEYEMHTPDPKADPDKLHFVQIDEAKCIGCDTCSQYCP...,"[[156], [159], [178], [198], [237], [240], [24...",,../../dataset/mcsa_fine_tune/structures/alphaf...,MSRTVMERIEYEMHTPDPKADPDKLHFVQIDEAKCIGCDTCSQYCP...,True
1,P00436,MPIELLPETPSQTAGPYVHIGLALEAAGNPTRDQEIWNRLAKPDAP...,"[[109], [148], [158], [161], [163]]",,../../dataset/mcsa_fine_tune/structures/alphaf...,MPIELLPETPSQTAGPYVHIGLALEAAGNPTRDQEIWNRLAKPDAP...,True
2,Q55389,MTSSDTQNNKTLAAMKNFAEQYAKRTDTYFCSDLSVTAVVIEGLAR...,"[[56], [58], [75], [77], [86], [87], [88]]",,../../dataset/mcsa_fine_tune/structures/alphaf...,MTSSDTQNNKTLAAMKNFAEQYAKRTDTYFCSDLSVTAVVIEGLAR...,True
3,P68688,MQTVIFGRSGCPYCVRAKDLAEKLSNERDDFQYQYVDIRAEGITKE...,"[[8], [10], [11], [13], [14], [18], [72]]",,../../dataset/mcsa_fine_tune/structures/alphaf...,MQTVIFGRSGCPYCVRAKDLAEKLSNERDDFQYQYVDIRAEGITKE...,True
4,P0A006,MDKKTIYFICTGNSCRSQMAEGWGKEILGEGWNVYSAGIETHGVNP...,"[[10], [11], [12], [13], [14], [15], [16], [17...",,../../dataset/mcsa_fine_tune/structures/alphaf...,MDKKTIYFICTGNSCRSQMAEGWGKEILGEGWNVYSAGIETHGVNP...,True
...,...,...,...,...,...,...,...
77,P42126,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...,"[[108], [153], [177], [178]]",,../../dataset/mcsa_fine_tune/structures/alphaf...,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...,True
78,P27001,MLEEALAAIQNARDLEELKALKARYLGKKGLLTQEMKGLSALPLEE...,"[[149], [178], [204], [218], [261], [314]]",,../../dataset/mcsa_fine_tune/structures/alphaf...,MLEEALAAIQNARDLEELKALKARYLGKKGLLTQEMKGLSALPLEE...,True
79,P22106,MCSIFGVFDIKTDAVELRKKALELSRLMRHRGPDWSGIYASDNAIL...,"[[2], [51], [75], [76], [322], [325]]",,../../dataset/mcsa_fine_tune/structures/alphaf...,MCSIFGVFDIKTDAVELRKKALELSRLMRHRGPDWSGIYASDNAIL...,True
80,P46849,MKRMIALDGAQGEGGGQILRSALSLSMITGQPFTITSIRAGRAKPG...,"[[13], [308]]",,../../dataset/mcsa_fine_tune/structures/alphaf...,MKRMIALDGAQGEGGGQILRSALSLSMITGQPFTITSIRAGRAKPG...,True


In [6]:
import subprocess

# Build a protein BLAST database from the FASTA stored next to the dataset.
# NOTE(review): the cells visible in this notebook never write
# blast_database.fasta, so it is presumably produced elsewhere - confirm it
# exists before running.
database_fasta = os.path.join(dataset_path, 'blast_database.fasta')
database = os.path.join(dataset_path, 'blast_database')

# Argument list instead of a shell string: paths containing spaces or shell
# metacharacters are passed through verbatim, and no shell is spawned.
command = ['makeblastdb', '-in', database_fasta, '-dbtype', 'prot', '-out', database]

# check=True turns a failed build into an exception here, rather than letting
# the following blastp cell run against a missing or stale database.
subprocess.run(command, check=True)



Building a new DB, current time: 05/23/2024 17:57:21
New DB name:   /home/xiaoruiwang/data/ubuntu_work_beta/single_step_work/EasIFA_v2/dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100/blast_database
New DB title:  ../../dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100/blast_database.fasta
Sequence type: Protein
Deleted existing Protein BLAST database named /home/xiaoruiwang/data/ubuntu_work_beta/single_step_work/EasIFA_v2/dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100/blast_database
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 44793 sequences in 0.458338 seconds.


CompletedProcess(args='makeblastdb -in ../../dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100/blast_database.fasta -dbtype prot -out ../../dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100/blast_database', returncode=0)

In [7]:
# Align every test sequence against the database built above.
query_file = os.path.join(test_dataset_path, 'test_dataset.fasta')
output_file = os.path.join(test_dataset_path, 'blast_results_sprot.txt')

# Argument list (no shell) so paths are passed verbatim; -outfmt 6 is the
# tabular format that read_blast_results parses.
command = [
    'blastp',
    '-query', query_file,
    '-db', database,
    '-out', output_file,
    '-evalue', '0.001',
    '-outfmt', '6',
]

# Cached: an existing results file is reused as-is, even if the query FASTA or
# the database changed since. Delete the file to force a fresh search.
if not os.path.exists(output_file):
    subprocess.run(command, check=True)


In [8]:
def read_blast_results(path):
    """Read a headerless, tab-separated BLAST result file into a DataFrame.

    The file is expected to have twelve columns per row, which are labelled
    with the names listed below (the layout produced by ``-outfmt 6``).

    Args:
        path: Path of the tab-separated results file.

    Returns:
        DataFrame with one row per hit and the twelve named columns. A
        completely empty file (a search that produced no hits) yields an empty
        DataFrame with the same columns instead of raising.
    """
    column_headers = [
        "Query ID",
        "Subject ID",
        "% Identity",
        "Alignment Length",
        "Mismatches",
        "Gap Opens",
        "Query Start",
        "Query End",
        "Subject Start",
        "Subject End",
        "E-value",
        "Bit Score",
    ]
    try:
        results_df = pd.read_csv(path, sep='\t', header=None, names=column_headers)
    except pd.errors.EmptyDataError:
        # Zero-byte file: nothing to parse, but callers still expect the columns.
        results_df = pd.DataFrame(columns=column_headers)
    return results_df



In [9]:
# Parse the tabular blastp output written by the previous step.
blast_p_results = read_blast_results(output_file)
blast_p_results

Unnamed: 0,Query ID,Subject ID,% Identity,Alignment Length,Mismatches,Gap Opens,Query Start,Query End,Subject Start,Subject End,E-value,Bit Score
0,P07598,Q46508,45.802,393,195,5,31,407,149,539,2.020000e-100,310.0
1,P07598,P80491,34.783,69,41,2,25,93,179,243,7.930000e-05,44.3
2,P07598,H6LC27,40.385,52,31,0,30,81,274,325,3.340000e-04,42.7
3,P07598,Q60340,41.667,48,26,1,30,77,173,218,5.500000e-04,41.6
4,Q5SJ80,P98004,26.042,288,180,8,219,498,221,483,3.020000e-11,65.9
...,...,...,...,...,...,...,...,...,...,...,...,...
2443,P01096,P29685,24.430,307,211,5,120,417,146,440,2.110000e-14,76.3
2444,P01096,P19366,24.706,340,212,7,125,443,85,401,3.700000e-14,75.1
2445,P01096,A5FLS1,22.327,318,202,5,134,417,77,383,1.300000e-12,70.1
2446,P01096,Q971B7,26.482,253,155,6,175,409,192,431,1.540000e-12,70.1


In [10]:
# Max, min and mean percent identity over all hits, one value per line.
print(
    blast_p_results['% Identity'].max(),
    blast_p_results['% Identity'].min(),
    blast_p_results['% Identity'].mean(),
    sep='\n',
)

79.336
18.788
31.9657892156863


In [11]:
import sys
sys.path.append('../../')
from dataset_preprocess.pdb_preprocess_utils import map_active_site_for_one
from utils import predict_activate_site_with_sequence_alignment, predict_activate_site_type_with_sequence_alignment



In [12]:


# Alignment-based site prediction using the top 5 BLAST hits per query.
(
    predicted_activate_sites,
    overlap_scores,
    false_positive_rates,
) = predict_activate_site_with_sequence_alignment(
    test_dataset,
    database=blast_database_df,
    blastp_results=blast_p_results,
    top_n=5,
)

  0%|          | 0/82 [00:00<?, ?it/s]

Get 82 results
Accuracy: 0.9709, Precision: 0.2442, Specificity: 0.9838, Overlap Score: 0.3133, False Positive Rate: 0.0162, F1: 0.2227, MCC: 0.2394


In [13]:
import ast

# Overwrite site_types with one label `1` per annotated site, stored as the
# string form of a list (same representation as site_labels).
# site_labels holds list literals read from CSV, so parse them with
# ast.literal_eval: it only accepts Python literals, whereas eval would
# execute any expression that happened to be in the file.
test_dataset['site_types'] = test_dataset['site_labels'].apply(
    lambda labels: str([1] * len(ast.literal_eval(labels)))
)
test_dataset

Unnamed: 0,alphafolddb-id,aa_sequence,site_labels,site_types,pdb_files,aa_sequence_calculated,is_valid
0,P07598,MSRTVMERIEYEMHTPDPKADPDKLHFVQIDEAKCIGCDTCSQYCP...,"[[156], [159], [178], [198], [237], [240], [24...","[1, 1, 1, 1, 1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MSRTVMERIEYEMHTPDPKADPDKLHFVQIDEAKCIGCDTCSQYCP...,True
1,P00436,MPIELLPETPSQTAGPYVHIGLALEAAGNPTRDQEIWNRLAKPDAP...,"[[109], [148], [158], [161], [163]]","[1, 1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MPIELLPETPSQTAGPYVHIGLALEAAGNPTRDQEIWNRLAKPDAP...,True
2,Q55389,MTSSDTQNNKTLAAMKNFAEQYAKRTDTYFCSDLSVTAVVIEGLAR...,"[[56], [58], [75], [77], [86], [87], [88]]","[1, 1, 1, 1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MTSSDTQNNKTLAAMKNFAEQYAKRTDTYFCSDLSVTAVVIEGLAR...,True
3,P68688,MQTVIFGRSGCPYCVRAKDLAEKLSNERDDFQYQYVDIRAEGITKE...,"[[8], [10], [11], [13], [14], [18], [72]]","[1, 1, 1, 1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MQTVIFGRSGCPYCVRAKDLAEKLSNERDDFQYQYVDIRAEGITKE...,True
4,P0A006,MDKKTIYFICTGNSCRSQMAEGWGKEILGEGWNVYSAGIETHGVNP...,"[[10], [11], [12], [13], [14], [15], [16], [17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MDKKTIYFICTGNSCRSQMAEGWGKEILGEGWNVYSAGIETHGVNP...,True
...,...,...,...,...,...,...,...
77,P42126,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...,"[[108], [153], [177], [178]]","[1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...,True
78,P27001,MLEEALAAIQNARDLEELKALKARYLGKKGLLTQEMKGLSALPLEE...,"[[149], [178], [204], [218], [261], [314]]","[1, 1, 1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MLEEALAAIQNARDLEELKALKARYLGKKGLLTQEMKGLSALPLEE...,True
79,P22106,MCSIFGVFDIKTDAVELRKKALELSRLMRHRGPDWSGIYASDNAIL...,"[[2], [51], [75], [76], [322], [325]]","[1, 1, 1, 1, 1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MCSIFGVFDIKTDAVELRKKALELSRLMRHRGPDWSGIYASDNAIL...,True
80,P46849,MKRMIALDGAQGEGGGQILRSALSLSMITGQPFTITSIRAGRAKPG...,"[[13], [308]]","[1, 1]",../../dataset/mcsa_fine_tune/structures/alphaf...,MKRMIALDGAQGEGGGQILRSALSLSMITGQPFTITSIRAGRAKPG...,True


In [14]:
# Peek at the raw site_labels value of the first row (index label 0).
test_dataset.loc[0, 'site_labels']

'[[156], [159], [178], [198], [237], [240], [245], [382]]'

In [15]:
# Display the annotation table again before it is passed as `database` below.
blast_database_df

Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence,...,Binding site reformate,Site reformate,Site labeled,Sequence_in_rxnaamapper_test,cluster,All site reformate,site_labels,site_types,alphafolddb-id,aa_sequence
0,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,...,"[{'function': 'BINDING', 'position': '124', 'l...",,True,False,Cluster 58969,"[{'function': 'BINDING', 'position': '124', 'l...","[[124], [137], [139], [142], [147], [187], [20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...
1,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...,...,"[{'function': 'BINDING', 'position': '106', 'l...",,True,False,Cluster 49687,"[{'function': 'BINDING', 'position': '106', 'l...","[[106], [160], [162], [166], [175], [235]]","[0, 0, 0, 0, 0, 0]",A2RUC4,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...
2,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...,...,,"[{'function': 'SITE', 'position': '95', 'note'...",True,False,Cluster 61425,"[{'function': 'SITE', 'position': '95', 'note'...","[[95], [120], [121], [186], [190], [214], [217...","[2, 2, 2, 2, 2, 2, 2, 2]",A5PLL7,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.-,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,...,"[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'...",True,False,Cluster 49416,"[{'function': 'BINDING', 'position': '20..22',...","[[20, 22], [44], [111], [160, 161], [184], [21...","[0, 0, 0, 0, 0, 0, 0, 1, 2]",C9JRZ8,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
4,C9JRZ8,Homo sapiens (Human),316,1.1.1.216,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,...,"[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'...",True,False,Cluster 49416,"[{'function': 'BINDING', 'position': '20..22',...","[[20, 22], [44], [111], [160, 161], [184], [21...","[0, 0, 0, 0, 0, 0, 0, 1, 2]",C9JRZ8,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139464,Q49WE4,Staphylococcus saprophyticus subsp. saprophyti...,169,3.1.-.-,Q49WE4;,"ACT_SITE 34; /note=""Proton donor""; /evidence=""...",,,,MILGLALIPSKAFQDEVNAYRKRYDAHYATIMPHITIKGQFKINDG...,...,,,True,False,Cluster 84924,"[{'function': 'ACT_SITE', 'position': '34', 'n...","[[34], [115]]","[1, 1]",Q49WE4,MILGLALIPSKAFQDEVNAYRKRYDAHYATIMPHITIKGQFKINDG...
139465,Q5HQE9,Staphylococcus epidermidis (strain ATCC 35984 ...,169,3.1.-.-,Q5HQE9;,"ACT_SITE 34; /note=""Proton donor""; /evidence=""...",,,,MILGLALVPSKSFQDEVNAYRKRYDNHYAQIMPHITIKPQFEIDDH...,...,,,True,False,Cluster 84921,"[{'function': 'ACT_SITE', 'position': '34', 'n...","[[34], [115]]","[1, 1]",Q5HQE9,MILGLALVPSKSFQDEVNAYRKRYDNHYAQIMPHITIKPQFEIDDH...
139466,Q8CNG6,Staphylococcus epidermidis (strain ATCC 12228 ...,395,2.7.7.-,Q8CNG6;,,"BINDING 99..102; /ligand=""UTP""; /ligand_id=""Ch...",,,MLDKNQLEKYNQEHLYEYEKLMSSNEKNALDEKVDQLNLAEIQDLY...,...,"[{'function': 'BINDING', 'position': '99..102'...",,True,False,Cluster 29613,"[{'function': 'BINDING', 'position': '99..102'...","[[99, 102], [113], [178], [204], [235], [344]]","[0, 0, 0, 0, 0, 0]",Q8CNG6,MLDKNQLEKYNQEHLYEYEKLMSSNEKNALDEKVDQLNLAEIQDLY...
139467,Q97U03,Saccharolobus solfataricus (strain ATCC 35092 ...,272,4.2.1.-,Q97U03;,"ACT_SITE 185; /note=""Schiff-base intermediate ...",,,,MIGSEIRMAKLFDKGRALVVALDHGLVMGPLKGIENPVEVVAKIAK...,...,,,True,False,Cluster 61416,"[{'function': 'ACT_SITE', 'position': '185', '...",[[185]],[1],Q97U03,MIGSEIRMAKLFDKGRALVVALDHGLVMGPLKGIENPVEVVAKIAK...


In [16]:
# Same alignment-based prediction, this time including site types
# (top 5 BLAST hits per query).
(
    predicted_activate_sites,
    predicted_activate_sites_vec,
    overlap_scores_list,
    false_positive_rates_list,
) = predict_activate_site_type_with_sequence_alignment(
    test_dataset,
    database=blast_database_df,
    blastp_results=blast_p_results,
    top_n=5,
)

  0%|          | 0/82 [00:00<?, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Get 82 results
Accuracy: 0.9709, Precision: 0.2442, Specificity: 0.9838, Overlap Score: 0.3133, False Positive Rate: 0.0162, F1: 0.2227, MCC: 0.2394
Multiclassfication Metrics:
recall_cls_0: 0.9838, recall_cls_1: 0.0000, recall_cls_2: 0.1009, recall_cls_3: 0.0000, fpr_cls_0: 0.6867, fpr_cls_1: 0.0188, fpr_cls_2: 0.0004, fpr_cls_3: 0.0006, multi-class mcc: 0.1648


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
