In [1]:
import pandas as pd
import os
from tqdm.auto import tqdm
from pandarallel import pandarallel
from rdkit import Chem
from tqdm import tqdm as top_tqdm

In [2]:
def get_structure_sequence(pdb_file):
    """Return the amino-acid sequence read from a PDB file, or '' on failure.

    The structure is parsed with ``Chem.MolFromPDBFile`` and converted to a
    sequence string with ``Chem.MolToSequence``.

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        The sequence string, or an empty string when the file cannot be read
        or parsed. Callers compare the result against the annotated sequence,
        so '' simply marks the structure as not matching.
    """
    try:
        mol = Chem.MolFromPDBFile(pdb_file)
        if mol is None:
            # The parser reports an unusable structure by returning None
            # rather than raising; handle it explicitly instead of relying on
            # the follow-up call to blow up.
            return ''
        return Chem.MolToSequence(mol)
    except Exception:
        # Deliberate best-effort: any read/parse error yields ''. Catching
        # Exception (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return ''

def multiprocess_structure_check(df, nb_workers, pdb_file_path):
    """Check each row's annotated sequence against its structure file.

    Adds three columns to ``df`` (the frame is modified in place and also
    returned):

    * ``pdb_files`` - ``<pdb_file_path>/AF-<alphafolddb-id>-F1-model_v4.pdb``
    * ``aa_sequence_calculated`` - result of ``get_structure_sequence`` on
      that file ('' when the file cannot be read or parsed)
    * ``is_valid`` - True where the calculated sequence equals ``aa_sequence``

    Args:
        df: DataFrame with ``alphafolddb-id`` and ``aa_sequence`` columns.
        nb_workers: Number of pandarallel workers; ``0`` runs serially with a
            tqdm progress bar instead.
        pdb_file_path: Directory containing the structure files.

    Returns:
        The same ``df`` object with the columns above added.
    """
    # Building a path is a cheap string join, so it is done serially in both
    # modes instead of being shipped to a worker pool.
    df['pdb_files'] = df['alphafolddb-id'].map(
        lambda af_id: os.path.join(pdb_file_path, f'AF-{af_id}-F1-model_v4.pdb'))

    # Only the file parsing is expensive; pick the apply flavour once so the
    # serial and parallel modes share a single code path.
    if nb_workers != 0:
        pandarallel.initialize(nb_workers=nb_workers, progress_bar=True)
        apply_with_progress = df['pdb_files'].parallel_apply
    else:
        top_tqdm.pandas(desc='pandas bar')
        apply_with_progress = df['pdb_files'].progress_apply

    df['aa_sequence_calculated'] = apply_with_progress(get_structure_sequence)
    df['is_valid'] = (df['aa_sequence_calculated'] == df['aa_sequence'])

    return df
def get_blast_database(dir, fasta_path):
    """Merge every CSV in a directory into one table and dump it as FASTA.

    Each file in ``dir`` is read with ``pd.read_csv`` and reduced to the
    columns ``alphafolddb-id``, ``aa_sequence``, ``site_labels`` and
    ``site_types``. Semicolons are stripped from the IDs, exact duplicate rows
    are dropped, and one FASTA record (``>id`` / sequence) is written per
    remaining row.

    Args:
        dir: Directory whose entries are all read as CSV files.
        fasta_path: Destination of the FASTA file (overwritten).

    Returns:
        The merged, de-duplicated DataFrame with a fresh 0..n-1 index. An
        empty directory yields an empty frame and an empty FASTA file.
    """
    columns = ['alphafolddb-id', 'aa_sequence', 'site_labels', 'site_types']

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs and machines.
    csv_fnames = sorted(os.listdir(dir))

    # Collect the frames and concatenate once: concatenating inside the loop
    # re-copies everything read so far on every iteration.
    frames = [
        pd.read_csv(os.path.join(dir, fname))[columns]
        for fname in tqdm(csv_fnames, total=len(csv_fnames))
    ]
    if frames:
        database_df = pd.concat(frames, ignore_index=True)
    else:
        database_df = pd.DataFrame(columns=columns)

    # Normalise the IDs *before* de-duplicating (as get_query_database does),
    # so 'X;' and 'X' rows that are otherwise identical collapse into one.
    database_df['alphafolddb-id'] = database_df['alphafolddb-id'].apply(
        lambda x: x.replace(';', ''))
    database_df = database_df.drop_duplicates(subset=columns).reset_index(drop=True)

    with open(fasta_path, 'w', encoding='utf-8') as f:
        records = zip(database_df['alphafolddb-id'], database_df['aa_sequence'])
        for af_id, sequence in tqdm(records, total=len(database_df)):
            f.write('>{}\n'.format(af_id))
            f.write('{}\n'.format(sequence))
    return database_df

def get_query_database(path, fasta_path, pdb_file_path, nb_workers=12):
    """Load the query table, validate it against structures, and write FASTA.

    The CSV at ``path`` is reduced to ``alphafolddb-id``, ``aa_sequence``,
    ``site_labels`` and ``site_types``; semicolons are stripped from the IDs;
    ``multiprocess_structure_check`` then adds its structure-validation
    columns. A FASTA file with one record per *unique* row is written to
    ``fasta_path``.

    Args:
        path: CSV file holding the query entries.
        fasta_path: Destination of the FASTA file (overwritten).
        pdb_file_path: Directory with the structure files, forwarded to
            ``multiprocess_structure_check``.
        nb_workers: Worker count forwarded to ``multiprocess_structure_check``
            (``0`` means serial). Defaults to 12, the value that used to be
            hard-coded.

    Returns:
        The checked DataFrame. Note that it is NOT de-duplicated; only the
        rows written to the FASTA file are.
    """
    columns = ['alphafolddb-id', 'aa_sequence', 'site_labels', 'site_types']

    # .copy() gives an independent frame, so the column assignments below (and
    # inside the structure check) do not write into a slice of the raw CSV
    # frame and trigger SettingWithCopyWarning.
    database_df = pd.read_csv(path)[columns].copy()
    database_df['alphafolddb-id'] = database_df['alphafolddb-id'].apply(
        lambda x: x.replace(';', ''))

    database_df = multiprocess_structure_check(
        database_df, nb_workers=nb_workers, pdb_file_path=pdb_file_path)

    write_database_df = database_df.drop_duplicates(subset=columns).reset_index(drop=True)

    with open(fasta_path, 'w', encoding='utf-8') as f:
        records = zip(write_database_df['alphafolddb-id'], write_database_df['aa_sequence'])
        for af_id, sequence in tqdm(records, total=len(write_database_df)):
            f.write('>{}\n'.format(af_id))
            f.write('{}\n'.format(sequence))
    return database_df

    

           

In [3]:
# Input locations, all relative to the notebook's working directory.
dataset_path = '../../dataset/ec_site_dataset/uniprot_ecreact_cluster_split_merge_dataset_limit_100'
blast_database_path = '../../dataset/raw_dataset/uniprot/uniprot_sprot.fasta'

# Pickled training table; later passed as the `database` argument of the
# alignment-based predictor.
train_database_df = pd.read_pickle(
    '../../dataset/raw_dataset/ec_datasets/split_ec_dataset/train_ec_uniprot_dataset_cluster_sample.pkl'
)

# Build the query set: reads the test CSV, checks every entry against its
# structure file, and writes test_dataset.fasta next to the dataset.
test_dataset = get_query_database(
    os.path.join(dataset_path, 'test_dataset', 'uniprot_ecreact_merge.csv'),
    fasta_path=os.path.join(dataset_path, 'test_dataset.fasta'),
    pdb_file_path=os.path.join(os.path.dirname(dataset_path), 'structures', 'alphafolddb_download'),
)



INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=75), Label(value='0 / 75'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=75), Label(value='0 / 75'))), HBox…

  0%|          | 0/853 [00:00<?, ?it/s]

In [4]:
# Expose the training table under the same column names the query set uses
# ('alphafolddb-id', 'aa_sequence').
# The ID is the 'AlphaFoldDB' value with its ';' characters removed; the
# vectorised string method avoids a per-row Python lambda and leaves missing
# values as NaN instead of raising.
train_database_df['alphafolddb-id'] = train_database_df['AlphaFoldDB'].str.replace(';', '', regex=False)
# Plain column alias: the previous `.apply(lambda x: x)` was an identity
# mapping evaluated row by row.
train_database_df['aa_sequence'] = train_database_df['Sequence']
train_database_df

Unnamed: 0,Entry,Organism,Length,EC number,AlphaFoldDB,Active site,Binding site,Site,PDB,Sequence,...,Binding site reformate,Site reformate,Site labeled,Sequence_in_rxnaamapper_test,cluster,All site reformate,site_labels,site_types,alphafolddb-id,aa_sequence
0,A1L3X0,Homo sapiens (Human),281,2.3.1.199,A1L3X0;,"ACT_SITE 150; /note=""Nucleophile""; /evidence=""...","BINDING 124; /ligand=""3-oxoeicosanoyl-CoA""; /l...",,6Y7F;,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,...,"[{'function': 'BINDING', 'position': '124', 'l...",,True,False,Cluster 58969,"[{'function': 'BINDING', 'position': '124', 'l...","[[124], [137], [139], [142], [147], [187], [20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...
1,A2RUC4,Homo sapiens (Human),315,1.14.11.42,A2RUC4;,,"BINDING 106; /ligand=""2-oxoglutarate""; /ligand...",,3AL5;3AL6;,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...,...,"[{'function': 'BINDING', 'position': '106', 'l...",,True,False,Cluster 49687,"[{'function': 'BINDING', 'position': '106', 'l...","[[106], [160], [162], [166], [175], [235]]","[0, 0, 0, 0, 0, 0]",A2RUC4,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...
2,A5PLL7,Homo sapiens (Human),270,1.14.19.77,A5PLL7;,,,"SITE 95; /note=""Essential for catalytic activi...",,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...,...,,"[{'function': 'SITE', 'position': '95', 'note'...",True,False,Cluster 61425,"[{'function': 'SITE', 'position': '95', 'note'...","[[95], [120], [121], [186], [190], [214], [217...","[2, 2, 2, 2, 2, 2, 2, 2]",A5PLL7,MAGAENWPGQQLELDEDEASCCRWGAQHAGARELAALYSPGKRLQE...
3,C9JRZ8,Homo sapiens (Human),316,1.1.1.-,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,...,"[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'...",True,False,Cluster 49416,"[{'function': 'BINDING', 'position': '20..22',...","[[20, 22], [44], [111], [160, 161], [184], [21...","[0, 0, 0, 0, 0, 0, 0, 1, 2]",C9JRZ8,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
4,C9JRZ8,Homo sapiens (Human),316,1.1.1.216,C9JRZ8;,"ACT_SITE 49; /note=""Proton donor""; /evidence=""...","BINDING 20..22; /ligand=""NADP(+)""; /ligand_id=...","SITE 78; /note=""Lowers pKa of active site Tyr""...",,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...,...,"[{'function': 'BINDING', 'position': '20..22',...","[{'function': 'SITE', 'position': '78', 'note'...",True,False,Cluster 49416,"[{'function': 'BINDING', 'position': '20..22',...","[[20, 22], [44], [111], [160, 161], [184], [21...","[0, 0, 0, 0, 0, 0, 0, 1, 2]",C9JRZ8,MATFVELSTKAKMPIVGLGTWRSLLGKVKEAVKVAIDAEYRHIDCA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139464,Q49WE4,Staphylococcus saprophyticus subsp. saprophyti...,169,3.1.-.-,Q49WE4;,"ACT_SITE 34; /note=""Proton donor""; /evidence=""...",,,,MILGLALIPSKAFQDEVNAYRKRYDAHYATIMPHITIKGQFKINDG...,...,,,True,False,Cluster 84924,"[{'function': 'ACT_SITE', 'position': '34', 'n...","[[34], [115]]","[1, 1]",Q49WE4,MILGLALIPSKAFQDEVNAYRKRYDAHYATIMPHITIKGQFKINDG...
139465,Q5HQE9,Staphylococcus epidermidis (strain ATCC 35984 ...,169,3.1.-.-,Q5HQE9;,"ACT_SITE 34; /note=""Proton donor""; /evidence=""...",,,,MILGLALVPSKSFQDEVNAYRKRYDNHYAQIMPHITIKPQFEIDDH...,...,,,True,False,Cluster 84921,"[{'function': 'ACT_SITE', 'position': '34', 'n...","[[34], [115]]","[1, 1]",Q5HQE9,MILGLALVPSKSFQDEVNAYRKRYDNHYAQIMPHITIKPQFEIDDH...
139466,Q8CNG6,Staphylococcus epidermidis (strain ATCC 12228 ...,395,2.7.7.-,Q8CNG6;,,"BINDING 99..102; /ligand=""UTP""; /ligand_id=""Ch...",,,MLDKNQLEKYNQEHLYEYEKLMSSNEKNALDEKVDQLNLAEIQDLY...,...,"[{'function': 'BINDING', 'position': '99..102'...",,True,False,Cluster 29613,"[{'function': 'BINDING', 'position': '99..102'...","[[99, 102], [113], [178], [204], [235], [344]]","[0, 0, 0, 0, 0, 0]",Q8CNG6,MLDKNQLEKYNQEHLYEYEKLMSSNEKNALDEKVDQLNLAEIQDLY...
139467,Q97U03,Saccharolobus solfataricus (strain ATCC 35092 ...,272,4.2.1.-,Q97U03;,"ACT_SITE 185; /note=""Schiff-base intermediate ...",,,,MIGSEIRMAKLFDKGRALVVALDHGLVMGPLKGIENPVEVVAKIAK...,...,,,True,False,Cluster 61416,"[{'function': 'ACT_SITE', 'position': '185', '...",[[185]],[1],Q97U03,MIGSEIRMAKLFDKGRALVVALDHGLVMGPLKGIENPVEVVAKIAK...


In [5]:
# Keep only the rows flagged `is_valid` (structure-derived sequence equals the
# annotated sequence) and renumber them from 0.
is_valid_mask = test_dataset['is_valid']
test_dataset = test_dataset[is_valid_mask].reset_index(drop=True)
test_dataset

Unnamed: 0,alphafolddb-id,aa_sequence,site_labels,site_types,pdb_files,aa_sequence_calculated,is_valid
0,A0A1S7LCW6,MKLKGTTIVALGMLVVAIMVLASMIDLPGSDMSATPAPPDTPRGAP...,"[[206], [212], [215], [216], [246], [252], [25...","[0, 0, 0, 0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MKLKGTTIVALGMLVVAIMVLASMIDLPGSDMSATPAPPDTPRGAP...,True
1,Q9F0J6,MQATKIIDGFHLVGAIDWNSRDFHGYTLSPMGTTYNAYLVEDEKTT...,"[[79], [81], [83], [146], [165], [165], [226]]","[0, 0, 0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MQATKIIDGFHLVGAIDWNSRDFHGYTLSPMGTTYNAYLVEDEKTT...,True
2,Q5BEJ7,MADHEQEQEPLSIAIIGGGIIGLMTALGLLHRNIGKVTIYERASAW...,"[[41, 42], [245, 247], [320], [330, 334]]","[0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MADHEQEQEPLSIAIIGGGIIGLMTALGLLHRNIGKVTIYERASAW...,True
3,Q9HUH4,MPQALSTDILIVGGGIAGLWLNARLRRAGYATVLVESASLGGGQSV...,"[[17], [36], [44, 45], [49, 51], [346, 347]]","[0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MPQALSTDILIVGGGIAGLWLNARLRRAGYATVLVESASLGGGQSV...,True
4,P96692,MAEFTHLVNERRSASNFLSGHPITKEDLNEMFELVALAPSAFNLQH...,"[[11, 13], [68, 70], [157, 158], [193], [196]]","[0, 0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MAEFTHLVNERRSASNFLSGHPITKEDLNEMFELVALAPSAFNLQH...,True
...,...,...,...,...,...,...,...
887,O30144,MFLKVRAEKRLGNFRLNVDFEMGRDYCVLLGPTGAGKSVFLELIAG...,"[[31, 38]]",[0],../../dataset/ec_site_dataset/structures/alpha...,MFLKVRAEKRLGNFRLNVDFEMGRDYCVLLGPTGAGKSVFLELIAG...,True
888,P28737,MSRKFDLKTITDLSVLVGTGISLYYLVSRLLNDVESGPLSGKSRES...,"[[133, 140]]",[0],../../dataset/ec_site_dataset/structures/alpha...,MSRKFDLKTITDLSVLVGTGISLYYLVSRLLNDVESGPLSGKSRES...,True
889,P37093,MTEMVISPAERQSIRRLPFSFANRFKLVLDWNEDFSQASIYYLAPL...,"[[397], [400], [430], [433]]","[0, 0, 0, 0]",../../dataset/ec_site_dataset/structures/alpha...,MTEMVISPAERQSIRRLPFSFANRFKLVLDWNEDFSQASIYYLAPL...,True
890,P94360,MAELRMEHIYKFYDQKEPAVDDFNLHIADKEFIVFVGPSGCGKSTT...,"[[37, 44]]",[0],../../dataset/ec_site_dataset/structures/alpha...,MAELRMEHIYKFYDQKEPAVDDFNLHIADKEFIVFVGPSGCGKSTT...,True


In [6]:
import subprocess

# Build a protein BLAST database named 'uniprot_sprot' next to the FASTA file.
database_fasta = blast_database_path
database = os.path.join(os.path.dirname(database_fasta), 'uniprot_sprot')

# Pass the arguments as a list (no shell): paths containing spaces or shell
# metacharacters are then handed to makeblastdb verbatim.
command = ['makeblastdb', '-in', database_fasta, '-dbtype', 'prot', '-out', database]

# check=True raises CalledProcessError on a non-zero exit status, so a failed
# build stops the notebook here instead of surfacing later as a blastp error.
subprocess.run(command, check=True)



Building a new DB, current time: 05/23/2024 20:38:40
New DB name:   /home/xiaoruiwang/data/ubuntu_work_beta/single_step_work/EasIFA_v2/dataset/raw_dataset/uniprot/uniprot_sprot
New DB title:  ../../dataset/raw_dataset/uniprot/uniprot_sprot.fasta
Sequence type: Protein
Deleted existing Protein BLAST database named /home/xiaoruiwang/data/ubuntu_work_beta/single_step_work/EasIFA_v2/dataset/raw_dataset/uniprot/uniprot_sprot
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 569516 sequences in 6.57521 seconds.


CompletedProcess(args='makeblastdb -in ../../dataset/raw_dataset/uniprot/uniprot_sprot.fasta -dbtype prot -out ../../dataset/raw_dataset/uniprot/uniprot_sprot', returncode=0)

In [7]:
# Align the query FASTA against the BLAST database and store the hits in
# tabular format (-outfmt 6), keeping hits with e-value <= 0.001.
query_file = os.path.join(dataset_path, 'test_dataset.fasta')
output_file = os.path.join(dataset_path, 'blast_results_use_uniprot_sprot.txt')

# Argument list instead of a shell string, so paths are passed through
# unmodified even when they contain spaces or shell metacharacters.
command = [
    'blastp',
    '-query', query_file,
    '-db', database,
    '-out', output_file,
    '-evalue', '0.001',
    '-outfmt', '6',
    '-num_threads', '16',
]

# The result file doubles as a cache: delete it to force a fresh search.
if not os.path.exists(output_file):
    try:
        subprocess.run(command, check=True)
    except BaseException:
        # A failed or interrupted run may leave a partial file behind, which
        # the existence check above would otherwise accept as a finished
        # result on the next run. Remove it, then re-raise unchanged.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise

In [8]:
def read_blast_results(path):
    """Load a 12-column tab-separated BLAST hit table into a DataFrame.

    The file is expected to have no header row and exactly the twelve columns
    named below. Subject IDs of the form ``db|ACCESSION|NAME`` are reduced to
    ``ACCESSION``; IDs without a ``|`` are kept unchanged. Hits where the
    subject is the query itself are removed.

    Args:
        path: Path of the tab-separated hit table.

    Returns:
        DataFrame with the named columns. Rows keep their original line index
        (the index is not reset after filtering). An empty file yields an
        empty DataFrame with the same columns.
    """
    column_headers = [
        "Query ID",
        "Subject ID",
        "% Identity",
        "Alignment Length",
        "Mismatches",
        "Gap Opens",
        "Query Start",
        "Query End",
        "Subject Start",
        "Subject End",
        "E-value",
        "Bit Score",
    ]
    try:
        # Read both ID columns as text: purely numeric or zero-padded IDs
        # would otherwise be parsed as integers and no longer compare equal
        # to their string counterpart in the self-hit filter below.
        results_df = pd.read_csv(path, sep='\t', header=None, dtype={0: str, 1: str})
    except pd.errors.EmptyDataError:
        # A search without any hit produces an empty file; report that as an
        # empty table rather than an exception.
        return pd.DataFrame(columns=column_headers)
    results_df.columns = column_headers
    results_df['Subject ID'] = results_df['Subject ID'].map(
        lambda subject: subject.split('|')[1] if '|' in subject else subject)
    # Drop hits where the query matched itself, so test entries cannot leak
    # into their own predictions.
    results_df = results_df.loc[results_df['Query ID'] != results_df['Subject ID']]
    return results_df


In [9]:
# Parse the blastp hit table written to `output_file`; the reader already
# drops hits where query and subject are the same entry.
blast_p_results = read_blast_results(output_file)
blast_p_results

Unnamed: 0,Query ID,Subject ID,% Identity,Alignment Length,Mismatches,Gap Opens,Query Start,Query End,Subject Start,Subject End,E-value,Bit Score
1,A0A1S7LCW6,Q93DZ0,46.316,190,89,3,82,258,80,269,4.430000e-47,160.0
2,A0A1S7LCW6,Q2W8Q1,45.226,199,96,3,73,258,76,274,3.140000e-44,153.0
4,Q9F0J6,Q9FDN7,41.878,394,225,3,2,393,3,394,3.710000e-104,316.0
5,Q9F0J6,Q58142,41.962,367,200,6,1,364,3,359,1.080000e-96,297.0
6,Q9F0J6,A8GG94,35.695,367,230,4,3,365,2,366,7.840000e-77,249.0
...,...,...,...,...,...,...,...,...,...,...,...,...
143303,O07550,Q7A470,33.913,230,139,5,337,560,8,230,5.150000e-26,110.0
143304,O07550,Q99S47,33.913,230,139,5,337,560,8,230,5.150000e-26,110.0
143305,O07550,Q2YYM4,33.913,230,139,5,337,560,8,230,8.690000e-26,110.0
143306,O07550,Q8NVB5,33.913,230,139,5,337,560,8,230,9.200000e-26,110.0


In [10]:
# Print the maximum, minimum and mean percent identity over all retained hits,
# in that order, one value per line.
identity = blast_p_results['% Identity']
for summarise in (identity.max, identity.min, identity.mean):
    print(summarise())

100.0
17.702
34.82454090063533


In [11]:
import sys
# Add the directory two levels up to the module search path. The imports
# below come after this line and presumably rely on it to resolve the
# project packages (`dataset_preprocess`, `common`) — TODO confirm, and keep
# this statement ahead of them.
sys.path.append('../../')
from dataset_preprocess.pdb_preprocess_utils import map_active_site_for_one
# NOTE(review): `utils` and `common.utils` are two different modules; which
# `utils` is picked up depends on the working directory and sys.path order.
from utils import predict_activate_site_with_sequence_alignment, predict_activate_site_type_with_sequence_alignment
from common.utils import merge_similarity_index



In [12]:
# Load the similarity-index table stored alongside the dataset and combine it
# with the filtered test set through `merge_similarity_index`.
similarity_index_csv = os.path.join(dataset_path, 'test_dataset_with_similarity_idx.csv')
test_dataset_with_similarity_index = pd.read_csv(similarity_index_csv)
test_dataset = merge_similarity_index(test_dataset, test_dataset_with_similarity_index)

In [13]:
# test_dataset_with_results = predict_activate_site_with_sequence_alignment(test_dataset, database=train_database_df, blastp_results=blast_p_results, top_n=5, output_results=True)

In [14]:
# Run the alignment-based site-type baseline: the training table is the
# annotation database, the blastp hits link each query to it, and `top_n=5`
# is forwarded to the predictor.
test_dataset_with_results: pd.DataFrame = predict_activate_site_type_with_sequence_alignment(
    test_dataset,
    database=train_database_df,
    blastp_results=blast_p_results,
    top_n=5,
    output_results=True,
)

# Persist the per-entry results as both CSV and JSON.
results_dir = 'baseline_results'
os.makedirs(results_dir, exist_ok=True)
test_dataset_with_results.to_csv(os.path.join(results_dir, 'uniprotblastp_alignment.csv'), index=False)
test_dataset_with_results.to_json(os.path.join(results_dir, 'uniprotblastp_alignment.json'))

  0%|          | 0/892 [00:00<?, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Get 892 results
Accuracy: 0.9822, Precision: 0.7257, Specificity: 0.9924, Overlap Score: 0.7326, False Positive Rate: 0.0076, F1: 0.7041, MCC: 0.7089
Multiclassfication Metrics:
recall_cls_0: 0.9924, recall_cls_1: 0.5930, recall_cls_2: 0.4612, recall_cls_3: 0.0828, fpr_cls_0: 0.2674, fpr_cls_1: 0.0071, fpr_cls_2: 0.0004, fpr_cls_3: 0.0002, multi-class mcc: 0.7073
