In [1]:
import csv
import pandas as pd
import json
# Show complete frames when inspecting intermediate results interactively.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# NOTE(review): absolute, machine-specific paths; adjust them before running elsewhere.
INPUT_PATH = '/home/vit/Projects/cryptobench/data/E-add-noncryptic-pockets/ahoj-v2/cryptobench'
CRYPTIC_DATASET_PATH = f'{INPUT_PATH}/dataset.json'
PREFILTERED_CSV = '/home/vit/Projects/cryptobench/data/B-create-dataset/ahoj-v2/dataset_dataframe.csv'

# Mapping: apo PDB id -> list of holo-structure dicts.
with open(CRYPTIC_DATASET_PATH) as f:
    cryptic_dataset = json.load(f)

# NOTE(review): if this call emits a DtypeWarning, columns with mixed dtypes may make
# the equality filters below miss rows; consider passing explicit dtypes. TODO confirm.
dataset_base_pd = pd.read_csv(PREFILTERED_CSV)

# For every apo/holo pair, look up the pocket RMSD ('apo_pocket_rms') in the
# prefiltered table, store it under 'pRMSD', and flag exactly one holo structure
# per apo (the first one reaching the largest pRMSD) as the main holo structure.
for apo_pdb_id, holo_structures in cryptic_dataset.items():
    largest_pRMSD = -1
    for idx, holo_structure in enumerate(holo_structures):
        holo_pdb_id = holo_structure['holo_pdb_id']
        holo_chain_id = holo_structure['holo_chain']
        apo_chain_id = holo_structure['apo_chain']
        ligand = holo_structure['ligand']
        ligand_index = int(holo_structure['ligand_index'])
        ligand_chain = holo_structure['ligand_chain']
        record = dataset_base_pd[
            (dataset_base_pd['apo_structure'] == apo_pdb_id) &
            (dataset_base_pd['holo_structure'] == holo_pdb_id) &
            (dataset_base_pd['apo_chains3'] == apo_chain_id) &
            (dataset_base_pd['holo_chains3'] == holo_chain_id) &
            (dataset_base_pd['ligand'] == ligand) &
            (dataset_base_pd['ligand_index'] == ligand_index) &
            (dataset_base_pd['ligand_chain'] == ligand_chain)
        ]

        pair_description = (
            f'apo {apo_pdb_id}/{apo_chain_id}, holo {holo_pdb_id}/{holo_chain_id}, '
            f'ligand {ligand} {ligand_index} (chain {ligand_chain})'
        )
        assert len(record) > 0, f'no row in {PREFILTERED_CSV} matches {pair_description}'
        # Several matching rows are acceptable only if they agree on the pocket RMSD.
        assert len(record['apo_pocket_rms'].unique()) == 1, \
            f'ambiguous apo_pocket_rms for {pair_description}'

        # Cast to a builtin float so the value is JSON-serialisable whatever
        # dtype pandas inferred for the column.
        pRMSD = float(record['apo_pocket_rms'].iloc[0])

        largest_pRMSD = max(largest_pRMSD, pRMSD)

        cryptic_dataset[apo_pdb_id][idx]['pRMSD'] = pRMSD

    # Only the first holo structure that reaches the maximum is flagged, so ties
    # never produce more than one main holo structure per apo.
    already_assigned_main_holo = False
    for holo_structure in holo_structures:
        is_main = (not already_assigned_main_holo) and bool(holo_structure['pRMSD'] == largest_pRMSD)
        holo_structure['is_main_holo_structure'] = is_main
        already_assigned_main_holo = already_assigned_main_holo or is_main



  dataset_base_pd = pd.read_csv(PREFILTERED_CSV)


In [43]:
# Invariant: each apo structure has exactly one holo structure flagged as main.
for apo_pdb_id, holo_structures in cryptic_dataset.items():
    main_holo_count = sum(1 for holo in holo_structures if holo['is_main_holo_structure'])
    assert main_holo_count == 1

Create the test set and training folds, then assemble the final dataset directory

In [11]:
import os
import shutil
SPLITS_PATH = f'{INPUT_PATH}/splits.json'
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
OUTPUT_PATH = '/home/vit/Projects/cryptobench/data/I-final/ahoj-v2/cryptobench-dataset'
with open(SPLITS_PATH) as f:
    splits = json.load(f)

# open(..., 'w') does not create missing parent directories, so make sure the
# output tree exists before any fold file is written.
os.makedirs(f'{OUTPUT_PATH}/folds', exist_ok=True)

# Key in splits.json -> name of the fold file written under {OUTPUT_PATH}/folds.
FOLD_FILENAMES = {
    'test': 'test.json',
    'train-0': 'train-fold-0.json',
    'train-1': 'train-fold-1.json',
    'train-2': 'train-fold-2.json',
    'train-3': 'train-fold-3.json',
}

# Write one JSON file per fold, restricted to the apo structures listed in that fold.
for fold, filename in FOLD_FILENAMES.items():
    subset = {apo_pdb_id: cryptic_dataset[apo_pdb_id] for apo_pdb_id in splits[fold]}
    with open(f'{OUTPUT_PATH}/folds/{filename}', 'w', encoding='utf-8') as f:
        json.dump(subset, f, ensure_ascii=False, indent=4)

# Write the complete annotated dataset.
with open(f'{OUTPUT_PATH}/dataset.json', 'w', encoding='utf-8') as f:
    json.dump(cryptic_dataset, f, ensure_ascii=False, indent=4)

# Sanity check: every fold file of the input directory must have a counterpart in
# the output that contains the same apo structures with the same number of holo
# structures.
SANITY_CHECK_PATH = f'{INPUT_PATH}'
for filename in os.listdir(f'{SANITY_CHECK_PATH}/folds'):
    with open(f'{SANITY_CHECK_PATH}/folds/{filename}') as f:
        original = json.load(f)
    with open(f'{OUTPUT_PATH}/folds/{filename}') as f:
        new = json.load(f)

    for apo, holos in original.items():
        assert apo in new, f'{filename}: apo structure {apo} is missing from the new fold'
        assert len(new[apo]) == len(holos), \
            f'{filename}: apo structure {apo} has {len(new[apo])} holo structures, expected {len(holos)}'

shutil.copytree(f'{INPUT_PATH}/auxiliary-data', f'{OUTPUT_PATH}/auxiliary-data', dirs_exist_ok=True)
shutil.copy('/home/vit/Projects/cryptobench/src/README.md', OUTPUT_PATH)
# The splits definition is published as folds.json; copy it straight to its final
# name instead of copying and renaming, which also keeps the cell re-runnable.
shutil.copy(SPLITS_PATH, f'{OUTPUT_PATH}/folds.json')

# Select the holo structure with the largest pRMSD for each apo structure

In [52]:
def _select_main_holos(dataset):
    """Map each apo PDB id to its holo structure flagged 'is_main_holo_structure'.

    Args:
        dataset: dict mapping an apo PDB id to a list of holo-structure dicts,
            each carrying an 'is_main_holo_structure' entry.

    Returns:
        dict mapping every apo PDB id to the first holo-structure dict whose
        'is_main_holo_structure' entry is truthy.

    Raises:
        AssertionError: if an apo structure has no holo structure flagged as main.
    """
    main_holos = {}
    for apo_pdb_id, holo_structures in dataset.items():
        main_holo = next(
            (holo for holo in holo_structures if holo['is_main_holo_structure']),
            None,
        )
        assert main_holo is not None, f'no main holo structure for apo {apo_pdb_id}'
        main_holos[apo_pdb_id] = main_holo
    return main_holos


cryptic_dataset_main_holos = _select_main_holos(cryptic_dataset)

# Rebuild the test subset explicitly from the splits so this cell does not rely on
# a variable left over from an earlier kernel session.
test_subset = {apo_pdb_id: cryptic_dataset[apo_pdb_id] for apo_pdb_id in splits['test']}
test_subset_main_holos = _select_main_holos(test_subset)

with open('../other/data/single-pair-dataset.json', 'w', encoding='utf-8') as f:
    json.dump(cryptic_dataset_main_holos, f, ensure_ascii=False, indent=4)

with open('../other/data/single-pair-test.json', 'w', encoding='utf-8') as f:
    json.dump(test_subset_main_holos, f, ensure_ascii=False, indent=4)
